import gzip
import heapq
import json
import multiprocessing
import os
import random
import re
import sys
import threading
import time
import traceback
from collections import Counter, deque
from dataclasses import dataclass, field
from typing import List
from urllib import request

# NOTE: project-level helpers (LoggerFactory, StrConverter, ExperimentConfig,
# GeneSegment, GeneDataPartType, GeneDataLineType, OrderType, MatchAlgorithm,
# MatchCandidate, MatchPattern, GeneLocationAnalysis, GeneTUInfo, the HTML
# parsers, DataDownloadTool, HasReturnThread, the similarity helpers and
# CandidateClearSize) are assumed importable from this repository's packages.


class GeneFileReader:
    """Parses a GenBank-style flat file into gene segments plus the DNA code."""

    def __init__(self, file_path, ignore_gene=False, enable_debug_info=False):
        self.ignore_gene = ignore_gene
        self.gene_segments = []
        self.dna_code = []
        self.gene_name_segment_map = {}
        self.source = None
        self.enable_debug_info = enable_debug_info
        self.file_path = file_path
        self.logger = LoggerFactory(1)

    def build_information(self):
        """Reads the file once, filling gene_segments, dna_code and the
        gene-name index. Returns False if no DNA part was reached."""
        part_status = GeneDataPartType.HeaderPart
        data = []
        line_type = None
        with open(self.file_path, 'r', encoding='utf8') as fr:
            for line_index, line in enumerate(fr):
                line_type = self.check_line_type(line, part_status)
                if line_type == GeneDataLineType.SourceLine:
                    self.source = ' '.join(re.split(r'\s+', line)[1:])
                elif line_type == GeneDataLineType.GeneSegmentStart:
                    part_status = GeneDataPartType.GeneSegmentPart
                    self.parse_gene_segment(data)
                elif line_type == GeneDataLineType.DNAStart:
                    part_status = GeneDataPartType.DNAPart
                    self.parse_gene_segment(data)
                elif line_type == GeneDataLineType.DNAEnd:
                    break
                if part_status == GeneDataPartType.GeneSegmentPart:
                    data.append(line)
                elif part_status == GeneDataPartType.DNAPart \
                        and line_type == GeneDataLineType.Other:
                    items = re.split(r'\s+', line.strip())
                    for val in items[1:]:
                        self.dna_code.append(val)
                self.logger.time_distance = 10
                if self.enable_debug_info:
                    self.logger.info_per_time(
                        "LineNo = %d, Added Gene Num = %d, Last Sample = %s" % (
                            line_index, len(self.gene_segments),
                            self.gene_segments[-1].__dict__
                            if len(self.gene_segments) > 0 else ""))
        if part_status != GeneDataPartType.DNAPart \
                and line_type != GeneDataLineType.DNAEnd:
            return False
        self.dna_code = ''.join(self.dna_code)
        check_order = None
        warning_num = 0
        for idx, gene_segment in enumerate(self.gene_segments):
            name = gene_segment.gene
            if check_order is not None and check_order > min(gene_segment.cds):
                warning_num += 1
            check_order = max(gene_segment.cds)
            if name not in self.gene_name_segment_map:
                self.gene_name_segment_map[name] = []
            self.gene_name_segment_map[name].append(idx)
        self.logger.info(
            "Total Gene Segment Number = %d, Total Gene Name Count = %d" % (
                len(self.gene_segments), len(self.gene_name_segment_map)))
        return True

    def parse_gene_segment(self, data):
        """Converts the buffered lines of one gene record into a GeneSegment."""
        if data is None or len(data) == 0 or self.ignore_gene:
            return
        gene_segment = GeneSegment()
        last_line = ''
        success = True
        complement = None
        for line in data:
            try:
                line_type = self.check_line_type(
                    line, GeneDataPartType.GeneSegmentPart)
                line = line.strip()
                if line_type == GeneDataLineType.GeneSegmentStart:
                    tag, complement = re.split(r'\s+', line)
                    inter = list(map(
                        lambda arg: int(arg.strip('<>')),
                        complement.lstrip('complement(').rstrip(')').split('..')))
                    gene_segment.cds = inter
                    assert inter[0] < inter[1]
                else:
                    if line[0] == '/':
                        last_line = line
                    else:
                        last_line += ' ' + line
                    gene_segment.extract_attribute(last_line)
            except Exception:
                self.logger.info(line)
                # join(...) locations are expected to fail the simple interval
                # parse above; anything else is a real error.
                if not complement or (
                        not complement.startswith('join')
                        and not complement.startswith('complement(join')):
                    traceback.print_exc()
                    success = False
                    break
        if success:
            self.gene_segments.append(gene_segment)
        data.clear()

    @staticmethod
    def check_line_type(line: str, part_status):
        strip_line = line.strip()
        if part_status == GeneDataPartType.HeaderPart:
            if strip_line.startswith(ExperimentConfig.VALUE_SOURCE_START):
                return GeneDataLineType.SourceLine
            elif strip_line.startswith(ExperimentConfig.VALUE_GENE_START) \
                    or strip_line.startswith(
                        ExperimentConfig.VALUE_REPEAT_REGION_START):
                return GeneDataLineType.GeneSegmentStart
        elif part_status == GeneDataPartType.GeneSegmentPart:
            if strip_line.startswith(ExperimentConfig.VALUE_GENE_START) \
                    or strip_line.startswith(
                        ExperimentConfig.VALUE_REPEAT_REGION_START):
                return GeneDataLineType.GeneSegmentStart
            elif line[0] != ' ':
                return GeneDataLineType.DNAStart
        elif part_status == GeneDataPartType.DNAPart:
            if strip_line.startswith(ExperimentConfig.VALUE_DNA_PART_END):
                return GeneDataLineType.DNAEnd
        return GeneDataLineType.Other
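# Usage sketch (added for illustration; not part of the original source). The
# file path and gene name below are hypothetical placeholders.
def _demo_gene_file_reader():
    """Parse a GenBank-style flat file and look up segments by gene name."""
    reader = GeneFileReader('downloads/NC_000913.txt')
    if reader.build_information():
        for idx in reader.gene_name_segment_map.get('thrL', []):
            segment = reader.gene_segments[idx]
            print(segment.gene, segment.cds)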
@dataclass
class GeneSimilarityMatch:
    """Scans a genome (both strands) for the best-matching windows of each
    query gene under up to five weighted similarity algorithms."""
    gene_path: str
    data_path: str
    output_directory: str
    top_k: int = 20
    candidate_distance: int = 5
    batch_size: int = 5
    patience: int = 0
    weighted: List[int] = field(default_factory=list)
    conditions: dict = None
    continuous_mismatch_limit: int = None
    order_type: OrderType = OrderType.Decrement
    dna_code = None
    rev_dna_code = None
    gene_name_filter = None

    def __post_init__(self):
        self.data_name = os.path.basename(self.data_path)
        file_name = os.path.basename(self.gene_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.result_path = os.path.join(self.output_directory,
                                        '%s_match_result.txt' % file_prefix)
        self.gene_reader = GeneFileReader(self.data_path)
        self.dna_code = None
        self.rev_dna_code = None
        self.logger = LoggerFactory()
        self.lock = threading.Lock()
        self.solved = 0
        self.total = 0
        self.weighted_sum = sum(self.weighted)
        assert self.weighted_sum > 0 and len(self.weighted) == 5

    def run(self, gene_name_filter: GeneLocationAnalysis = None):
        self.gene_name_filter = gene_name_filter
        self.gene_reader.build_information()
        self.dna_code = self.gene_reader.dna_code
        self.rev_dna_code = get_opposite_dna(self.gene_reader.dna_code[::-1])
        with open(self.result_path, 'w', encoding='utf8') as fw:
            with open(self.gene_path, 'r', encoding='utf8') as fr:
                gene_sequences = fr.readlines()[1:]
            self.solved = 0
            self.total = len(self.gene_reader.dna_code) * len(gene_sequences) * 2
            self.logger.info_with_expire_time(
                'Doing Similarity Matching: %d/%d(%.2f%%)' % (
                    self.solved, self.total, self.solved * 100.0 / self.total),
                self.solved, self.total)
            pending_tasks = deque()
            running_tasks = []
            for gene_sequence in gene_sequences:
                items = gene_sequence.strip().split('\t')
                name, gene = items[0], items[1].lower()
                t = threading.Thread(target=self.find_candidate_for_gene,
                                     args=(name, gene, fw,))
                pending_tasks.append(t)
            # keep at most batch_size matcher threads alive at once
            while len(pending_tasks) > 0:
                running_tasks = [t for t in running_tasks if t.is_alive()]
                while len(running_tasks) < self.batch_size \
                        and len(pending_tasks) > 0:
                    t = pending_tasks.popleft()
                    t.start()
                    running_tasks.append(t)
                time.sleep(10)
            for t in running_tasks:
                t.join()

    def find_candidate_for_gene(self, name, gene, fw):
        # match the forward and reverse strands in parallel
        t1 = HasReturnThread(func=self.match_gene,
                             args=(name, gene, self.dna_code, False,))
        t1.start()
        t2 = HasReturnThread(func=self.match_gene,
                             args=(name, gene, self.rev_dna_code, True,))
        t2.start()
        t1.join()
        t2.join()
        candidates = t1.get_result() + t2.get_result()
        candidates = list(candidates)
        candidates.sort(key=lambda arg: -arg.weighted_similarity)
        if self.order_type == OrderType.Increment:
            for candidate in candidates:
                candidate.weighted_similarity = -candidate.weighted_similarity
        results = self.render_similarity_for_candidates(
            gene, candidates[:self.top_k])
        self.lock.acquire()
        # result numbering; the original reused 'idx' for this counter, which
        # the enumerate loops below clobbered
        result_idx = 1
        headers = ['name', 'direction', 'weighted_similarity']
        for idx, similarity_name in enumerate(
                ['text_distance_similarity', 'direct_match_similarity',
                 'consistency_similarity', 'pattern_similarity',
                 'blat_similarity']):
            if self.weighted[idx] > 0:
                headers.append(similarity_name)
        headers.append('original :')
        sequence_headers = ['gene_format :', 'target_format :',
                            'match_format :']
        for candidate_result in results:
            candidate = candidate_result[0]
            fw.write('(%d)\n' % result_idx)
            attribute = {
                'name': name,
                'direction': '-' if candidate.is_reverse else '+',
                'weighted_similarity': '%.2f' % candidate.weighted_similarity,
                'original :': gene
            }
            for idx, similarity_name in enumerate(
                    ['text_distance_similarity', 'direct_match_similarity',
                     'consistency_similarity', 'pattern_similarity',
                     'blat_similarity']):
                if self.weighted[idx] > 0:
                    attribute[similarity_name] = '%.2f' % \
                        candidate.similarity_dict[
                            MatchAlgorithm.get_match_algorithm_by_name(
                                similarity_name)]
            sequence_content = []
            offset = 1
            for idx, match_algorithm in enumerate(
                    MatchAlgorithm.get_all_items()):
                if self.weighted[idx] > 0:
                    for sequence_header, value in zip(
                            sequence_headers,
                            candidate_result[offset:offset + 3]):
                        value = ''.join(value)
                        sequence_content.append(
                            match_algorithm.name + "_" + sequence_header
                            + '=' + value)
                    offset += 3
            fw.write('>%s/%s-%s\t%s,%s\n' % (
                self.data_name.replace(".txt", ''),
                candidate.start,
                candidate.end,
                ','.join(['%s=%s' % (key, attribute[key])
                          for key in headers if key in attribute]),
                ','.join(sequence_content)
            ))
            fw.write('\n')
            result_idx += 1
        self.lock.release()

    def match_gene(self, name, gene, database, is_reverse):
        candidates: List[MatchCandidate] = []
        gene_length = len(gene)
        min_weighted_similarity_in_candidates = 0.0
        database_length = len(database)
        limitation = database_length - gene_length + 1
        new_solved = 0
        similarity_heap = []
        buff = deque()
        match_pattern = MatchPattern(gene, self.conditions) \
            if self.conditions else None
        for start in range(limitation):
            weighted_similarity, similarity_dict = count_similarity(
                weighted=self.weighted,
                gene=gene,
                database=database,
                offset=start,
                is_reverse=is_reverse,
                max_patience=self.patience,
                match_pattern=match_pattern,
                continuous_mismatch_limit=self.continuous_mismatch_limit,
                gene_name_filter=self.gene_name_filter)
            if self.order_type == OrderType.Increment:
                weighted_similarity = -weighted_similarity
            new_candidate = MatchCandidate(
                left=start,
                right=start + gene_length - 1,
                is_reverse=is_reverse,
                database_length=database_length,
                weighted_similarity=weighted_similarity,
                similarity_dict=similarity_dict)
            added_flag = update_candidate_list(new_candidate, buff, candidates,
                                               self.candidate_distance)
            if added_flag:
                heapq.heappush(similarity_heap, candidates[-1])
                if len(similarity_heap) > self.top_k:
                    heapq.heappop(similarity_heap)
                    top = similarity_heap[0]
                    min_weighted_similarity_in_candidates = max(
                        min_weighted_similarity_in_candidates,
                        top.weighted_similarity)
            new_solved += 1
            if random.random() * 1000 < 1:
                # flush progress roughly once per thousand positions
                self.lock.acquire()
                self.solved += new_solved
                self.logger.info_with_expire_time(
                    'Doing Similarity Matching for %s[%s]: %d/%d(%.2f%%) '
                    '--top_k=%d '
                    '--top_similarity_info=[%s] '
                    '--gene_length=%d '
                    '--candidates_num=%d' % (
                        name, '-' if is_reverse else '+', self.solved,
                        self.total, self.solved * 100.0 / self.total,
                        self.top_k,
                        similarity_heap[0].get_similarity_str()
                        if len(similarity_heap) > 0 else 'None',
                        gene_length, len(candidates)),
                    self.solved, self.total)
                self.lock.release()
                new_solved = 0
                if len(candidates) > CandidateClearSize:
                    candidates.sort(key=lambda arg: -arg.weighted_similarity)
                    candidates = candidates[:self.top_k]
        while len(buff) > 0:
            update_candidate_list(None, buff, candidates, 1)
        self.lock.acquire()
        self.solved += new_solved + gene_length - 1
        self.lock.release()
        return candidates

    def render_similarity_for_candidates(self, gene, candidates):
        result = []
        for candidate in candidates:
            database = self.rev_dna_code if candidate.is_reverse \
                else self.dna_code
            candidate_result = [candidate]
            for idx, match_algorithm in enumerate(
                    MatchAlgorithm.get_all_items()):
                if self.weighted[idx] > 0:
                    candidate_result.extend(
                        self.render_target_dna_sequence(
                            match_algorithm, gene, database,
                            candidate.original_match_left))
            result.append(candidate_result)
        return result

    def render_target_dna_sequence(self, match_algorithm: MatchAlgorithm,
                                   gene, database, offset):
        sequence_gene = []
        sequence_target = []
        sequence = []
        tot = len(gene)
        if match_algorithm == MatchAlgorithm.text_distance:
            # walk the edit-distance DP table backwards to align the strings
            score, dp = compute_text_distance_similarity(gene, database, offset)
            i, j = tot, tot
            while i > 0 or j > 0:
                gene_a = gene[i - 1] if i > 0 else '.'
                gene_b = database[j + offset - 1] if j > 0 else '.'
                if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + \
                        should_change(gene[i - 1], database[j + offset - 1]):
                    sequence_gene.append(gene_a)
                    sequence_target.append(gene_b)
                    sequence.append(
                        '*' if should_change(
                            gene[i - 1], database[j + offset - 1]) == 0
                        else '.')
                    i, j = i - 1, j - 1
                elif dp[i][j] == dp[i - 1][j] + 1:
                    sequence_gene.append(gene_a)
                    sequence_target.append('.')
                    sequence.append('.')
                    i -= 1
                elif dp[i][j] == dp[i][j - 1] + 1:
                    sequence_gene.append('.')
                    sequence_target.append(gene_b)
                    sequence.append('.')
                    j -= 1
                else:
                    raise ValueError('Should not go here!')
            sequence_gene.reverse()
            sequence_target.reverse()
            sequence.reverse()
        elif match_algorithm == MatchAlgorithm.direct_match:
            for i in range(tot):
                sequence_gene.append(gene[i])
                sequence_target.append(database[i + offset])
                if not should_change(gene[i], database[i + offset]):
                    sequence.append('*')
                else:
                    sequence.append('.')
        elif match_algorithm == MatchAlgorithm.consistency:
            score, score_queue, score_merge_idx = \
                compute_consistency_similarity(gene, database, offset,
                                               self.patience)
            sequence_gene.extend(gene[:])
            sequence_target.extend(database[offset:offset + tot])
            cur_pos = 0
            for idx, (same_cnt, same_end) in enumerate(score_queue):
                same_start = same_end - same_cnt
                while cur_pos < same_start:
                    if score_merge_idx[0] < idx <= score_merge_idx[1]:
                        sequence.append('-')
                    else:
                        sequence.append('.')
                    cur_pos += 1
                while cur_pos < same_end:
                    sequence.append('*')
                    cur_pos += 1
            while cur_pos < tot:
                sequence.append('.')
                cur_pos += 1
        elif match_algorithm == MatchAlgorithm.pattern:
            for i in range(tot):
                sequence_gene.append(gene[i])
                sequence_target.append(database[i + offset])
                if not should_change(gene[i], database[i + offset]):
                    sequence.append('*')
                else:
                    sequence.append('.')
        elif match_algorithm == MatchAlgorithm.blat:
            # align a fixed 4-base prefix (query positions 0-3) and 4-base
            # suffix (positions 4-7), bridging the middle with gaps
            flag, pos_data_end = compute_blat_similarity(gene, database, offset)
            pos_data = offset
            pos_gene = 0
            while pos_gene < 4:
                if should_change(gene[pos_gene], database[pos_data]) > 0:
                    sequence_gene.append('-')
                    sequence_target.append(database[pos_data])
                    sequence.append('.')
                    pos_data += 1
                else:
                    sequence_gene.append(gene[pos_gene])
                    sequence_target.append(database[pos_data])
                    sequence.append('*')
                    pos_gene += 1
                    pos_data += 1
            rev_pos_gene = 7
            rev_pos_data = pos_data_end - 1
            rev_sequence_gene = []
            rev_sequence_target = []
            rev_sequence = []
            while rev_pos_gene > 3:
                if should_change(gene[rev_pos_gene],
                                 database[rev_pos_data]) > 0:
                    rev_sequence_gene.append('-')
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('.')
                    rev_pos_data -= 1
                else:
                    rev_sequence_gene.append(gene[rev_pos_gene])
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('*')
                    rev_pos_gene -= 1
                    rev_pos_data -= 1
            while pos_data <= rev_pos_data:
                sequence_gene.append('-')
                sequence_target.append(database[pos_data])
                sequence.append('.')
                pos_data += 1
            sequence_gene.extend(rev_sequence_gene[::-1])
            sequence_target.extend(rev_sequence_target[::-1])
            sequence.extend(rev_sequence[::-1])
        return sequence_gene, sequence_target, sequence
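# Usage sketch (added for illustration; paths are hypothetical). The five
# entries of `weighted` appear to weight the five similarity algorithms in
# the order listed in find_candidate_for_gene; the assertion in __post_init__
# requires exactly five entries with a positive sum.
def _demo_gene_similarity_match():
    matcher = GeneSimilarityMatch(
        gene_path='input/query_genes.txt',    # TSV: name <tab> sequence
        data_path='downloads/NC_000913.txt',  # genome flat file
        output_directory='output',
        top_k=10,
        weighted=[1, 1, 1, 0, 0])
    matcher.run()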
@dataclass
class KeggAnalysis:
    """Maps gene names (or KEGG ids) to KEGG pathways by scraping kegg.jp."""
    input_path: str
    download_directory: str
    output_directory: str
    is_gene: bool = True

    def __post_init__(self):
        os.makedirs(self.download_directory, exist_ok=True)
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.kegg_result_path = os.path.join(
            self.output_directory, '%s_kegg_result.txt' % file_prefix)
        self.kegg_error_path = os.path.join(
            self.output_directory, '%s_kegg_error.txt' % file_prefix)
        self.logger = LoggerFactory()

    def run(self):
        fstd = open(self.kegg_result_path, 'w')
        ferr = open(self.kegg_error_path, 'w')
        solved = 0
        failed = 0
        with open(self.input_path) as f:
            datas = [data.strip() for data in f.readlines()]
        total = len(datas)
        with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
            func = self.work_for_gene if self.is_gene else self.work_for_kegg
            for flag, outputs in p.imap(func, datas):
                if flag:
                    fstd.write('\n'.join(outputs) + '\n')
                    fstd.flush()
                    solved += 1
                else:
                    ferr.write('%s\n' % outputs)
                    ferr.flush()
                    failed += 1
                self.logger.info_with_expire_time(
                    "Completed %d/%d, success rate %d/%d=%.2f%%" % (
                        solved + failed, total, solved, solved + failed,
                        solved * 100.0 / (solved + failed)),
                    solved + failed, total)
        fstd.close()
        ferr.close()

    def work_for_gene(self, gene):
        try:
            outputs = []
            for kegg_id in self.get_kegg_id(gene):
                # work_for_kegg returns (flag, [output lines]); the original
                # iterated the tuple itself, which could not unpack
                flag, kegg_outputs = self.work_for_kegg(kegg_id)
                if not flag:
                    return False, gene
                for kegg_pathway in kegg_outputs:
                    outputs.append('%s\t%s' % (gene, kegg_pathway))
            return True, outputs
        except Exception:
            traceback.print_exc()
            return False, gene

    def work_for_kegg(self, kegg_id):
        try:
            names, pathways = self.get_pathway(kegg_id)
            if self.is_gene:
                return True, ['%s\t%s' % (kegg_id, '; '.join(pathways))]
            else:
                outputs = [
                    '%s\t%s\t%s' % (kegg_id, name, '; '.join(pathways))
                    for name in names
                ]
                return True, outputs
        except Exception:
            return False, kegg_id

    def get_kegg_id(self, gene):
        target_path = os.path.join(self.download_directory,
                                   'get_kegg_id_%s.html' % gene)
        if not os.path.exists(target_path):
            url = ("https://www.kegg.jp/dbget-bin/www_bfind_sub?mode=bfind"
                   "&max_hit=1000&dbkey=kegg&keywords=") + gene
            for retry_time in range(3):
                try:
                    req = request.Request(url=url)
                    x = request.urlopen(req, timeout=30)
                    body = x.read()
                    if x.headers.get('Content-Encoding', '').lower() == 'gzip':
                        body = gzip.decompress(body)
                    body = body.decode('utf-8')
                    with open(target_path, 'w', encoding='utf8') as fw:
                        fw.write(body)
                    break
                except Exception:
                    traceback.print_exc()
                    self.logger.info("Retry %d for %s" % (retry_time, gene))
        if not os.path.exists(target_path):
            raise ValueError("Gene not found from web: %s" % gene)
        kegg_ids = self.extract_kegg_id(target_path)
        if len(kegg_ids) == 0:
            os.remove(target_path)
            raise ValueError("Gene extract failed: %s" % gene)
        return kegg_ids

    def extract_kegg_id(self, file_path):
        with open(file_path, 'r') as f:
            body = f.read()
        parser = KeggIdHTMLParser()
        parser.feed(body)
        return parser.kegg_id_map.keys()

    def get_pathway(self, kegg_id):
        target_path = os.path.join(self.download_directory,
                                   'get_pathway_%s.html' % kegg_id)
        if not os.path.exists(target_path):
            url = "https://www.kegg.jp/dbget-bin/www_bget?ko:" + kegg_id
            for retry_time in range(3):
                try:
                    req = request.Request(url=url)
                    x = request.urlopen(req, timeout=30)
                    body = x.read()
                    if x.headers.get('Content-Encoding', '').lower() == 'gzip':
                        body = gzip.decompress(body)
                    body = body.decode('utf-8')
                    with open(target_path, 'w', encoding='utf8') as fw:
                        fw.write(body)
                    break
                except Exception:
                    traceback.print_exc()
                    self.logger.info("Retry %d for %s" % (retry_time, kegg_id))
        if not os.path.exists(target_path):
            raise ValueError("Kegg not found from web: %s" % kegg_id)
        names, pathways = self.extract_name_pathway(target_path)
        if len(pathways) == 0 or len(names) == 0:
            os.remove(target_path)
        if len(pathways) == 0:
            pathways = ["No Pathway"]
        if len(names) == 0:
            names = ["Not Found"]
        return names, pathways

    def extract_name_pathway(self, file_path):
        with open(file_path, 'r') as f:
            body = f.read()
        parser = KeggPathwayHTMLParser()
        parser.feed(body)
        return parser.names, parser.pathways
@dataclass
class GeneStreamAnalysis:
    """Extracts a gene's sequence plus its up/downstream context, or the
    nearest genes around an interval, from a parsed genome file."""
    data_path: str
    input_path: str
    output_directory: str
    mode: str = 'rna'
    limit: int = 200

    def __post_init__(self):
        self.inter_path = self.input_path if self.mode == 'inter' else None
        self.rna_path = self.input_path if self.mode == 'rna' else None
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        suffix = 'stream_%d' % self.limit if self.mode == 'rna' else 'gene'
        self.result_path = os.path.join(
            self.output_directory, '%s_%s_result.txt' % (file_prefix, suffix))
        self.gene_reader = GeneFileReader(self.data_path)
        self.logger = LoggerFactory()
        self.headers = {}
        self.inv_headers = []

    def get_utr_between(self, first, second):
        left = self.gene_reader.gene_segments[first].cds[1]
        right = self.gene_reader.gene_segments[second].cds[0] - 1
        return self.gene_reader.dna_code[left:right]

    def work_for_gene_index(self, index, start, end):
        gene_segment = self.gene_reader.gene_segments[index]
        assert gene_segment.cds[0] == min(start, end)
        assert gene_segment.cds[1] == max(start, end)
        seq = self.gene_reader.dna_code[gene_segment.cds[0] - 1:
                                        gene_segment.cds[1]]
        upstream = self.gene_reader.dna_code[
            max(gene_segment.cds[0] - self.limit - 1, 0):
            gene_segment.cds[0] - 1]
        downstream = self.gene_reader.dna_code[
            gene_segment.cds[1]:gene_segment.cds[1] + self.limit]
        if start > end:
            # reverse strand: complement everything and swap the flanks
            seq = get_opposite_dna(seq[::-1])
            upstream, downstream = get_opposite_dna(
                downstream[::-1]), get_opposite_dna(upstream[::-1])
        return seq, upstream, downstream

    def work_for_gene(self, gene_idx, gene_name, start, end, fw):
        if gene_name.find('->') >= 0:
            gene_name = gene_name[:gene_name.index('->')]
        if gene_name not in self.gene_reader.gene_name_segment_map:
            self.logger.info("%s not found in data" % gene_name)
            return
        cnt = 1
        fw.write('%d. %s\n' % (gene_idx, gene_name))
        for idx in self.gene_reader.gene_name_segment_map[gene_name]:
            seq, up, down = self.work_for_gene_index(idx, start, end)
            fw.write('%d)\n' % cnt)
            fw.write('position\t%d %s %d\n' % (
                self.gene_reader.gene_segments[idx].cds[0],
                '->' if start < end else '<-',
                self.gene_reader.gene_segments[idx].cds[1]))
            fw.write('product\t%s\n' %
                     self.gene_reader.gene_segments[idx].product)
            fw.write('GeneID\t%s\n' %
                     self.gene_reader.gene_segments[idx].gene_id)
            fw.write('stream\t%s\n' % seq)
            if up:
                fw.write('upstream\t%s\n' % up)
            if down:
                fw.write('downstream\t%s\n' % down)
            fw.write('\n')
            cnt += 1

    def check_inter(self, fw):
        for line in open(self.inter_path, 'r', encoding='utf8'):
            line = line.strip()
            if line == '':
                continue
            left, right = map(int, line.split(','))
            up, down = None, None
            for gene_segment in self.gene_reader.gene_segments:
                if max(gene_segment.cds) < left:
                    if not up or max(up.cds) < max(gene_segment.cds):
                        up = gene_segment
                if min(gene_segment.cds) > right:
                    if not down or min(down.cds) > min(gene_segment.cds):
                        down = gene_segment
            fw.write('%s:\n' % line)
            if up:
                fw.write('up-gene\t%s\nup-position\t%s\nup-product\t%s\n' % (
                    up.gene, '-'.join(map(str, up.cds)), up.product))
            if down:
                fw.write(
                    'down-gene\t%s\ndown-position\t%s\ndown-product\t%s\n' % (
                        down.gene, '-'.join(map(str, down.cds)),
                        down.product))
            fw.write('\n')

    def generate_header(self, items):
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)

    def run(self):
        self.gene_reader.build_information()
        with open(self.result_path, 'w', encoding='utf8') as fw:
            if self.mode == 'rna':
                with open(self.rna_path, 'r', encoding='utf8') as fr:
                    lines = fr.readlines()
                self.generate_header(lines[0])
                for gene_idx, line in enumerate(lines[1:]):
                    items = line.split('\t')
                    gene_name = items[self.headers['gene']]
                    start = int(items[self.headers['map_start_pos']])
                    end = int(items[self.headers['map_end_pos']])
                    self.work_for_gene(gene_idx, gene_name.strip(), start,
                                       end, fw)
            elif self.mode == 'inter':
                self.check_inter(fw)
            else:
                raise ValueError(self.mode)
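# Usage sketch (added for illustration; paths are hypothetical). In 'rna'
# mode input_path is a TSV with 'gene', 'map_start_pos' and 'map_end_pos'
# columns; in 'inter' mode each line is a 'left,right' position pair.
def _demo_gene_stream_analysis():
    analysis = GeneStreamAnalysis(data_path='downloads/NC_000913.txt',
                                  input_path='input/rna_list.txt',
                                  output_directory='output',
                                  mode='rna',
                                  limit=200)
    analysis.run()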
@dataclass
class ClusterMatcher:
    """Clusters alignment records whose middle sequence is identical and
    writes per-cluster samples plus FASTA-style exports."""
    rna_tag: str
    input_path: str
    output_directory: str = None

    def __post_init__(self):
        self.logger = LoggerFactory(1)
        file_prefix = StrConverter.extract_file_name(self.rna_tag)
        self.cluster_result_path = os.path.join(
            self.output_directory, '%s_cluster_result.txt' % file_prefix)
        self.sample_result_path = os.path.join(
            self.output_directory, '%s_sample_result.txt' % file_prefix)
        self.all_result_path = os.path.join(
            self.output_directory, '%s_all_result.txt' % file_prefix)
        self.only_result_path = os.path.join(
            self.output_directory, '%s_only_result.txt' % file_prefix)

    def run(self):
        if not os.path.exists(self.output_directory):
            os.makedirs(self.output_directory)
        data = self.read_data()
        self.logger.info('data size = %d' % len(data))
        self.compare_data(data)

    def format_data(self, index, lines):
        ig, gene_no = self.should_ignore(index, lines[3])
        if ig and not gene_no:
            return None
        dic = {'ignored': ig, 'index': index, 'geneNo': gene_no}
        if dic['ignored']:
            return dic
        data = [{}, {}, {}]
        action = 0
        others = lines[:4]
        for line in lines[4:]:
            if line.strip() == '':
                continue
            if line.strip().startswith(self.rna_tag):
                action = 1
                self.update_sequence(index, data[0], line)
            elif action == 1:
                action = 2
                self.update_sequence(index, data[1], line)
            elif action == 2:
                action = 0
                self.update_sequence(index, data[2], line)
            else:
                action = 0
        dic['data'] = data
        dic['others'] = others
        return dic

    def read_data(self):
        data = []
        buff = []
        index = 0
        for line in open(self.input_path, 'r'):
            if line.startswith('>>'):
                if len(buff) > 0:
                    index += 1
                    dic = self.format_data(index, buff)
                    if dic:
                        data.append(dic)
                    buff = []
            buff.append(line)
        if len(buff) > 0:
            index += 1
            data.append(self.format_data(index, buff))
        return data

    def compare_data(self, data):
        same = {}
        cluster = {}
        si = len(data)
        sample_cluster = {}
        for i in range(si):
            if i in same or data[i]['ignored']:
                continue
            same[i] = i
            cluster[i] = [data[i]['geneNo']]
            sample_cluster[i] = [data[i]]
            for j in range(i + 1, si):
                if j in same or data[j]['ignored']:
                    continue
                if data[i]['data'][1]['seq'].upper() == \
                        data[j]['data'][1]['seq'].upper():
                    same[j] = i
                    cluster[i].append(data[j]['geneNo'])
                    sample_cluster[i].append(data[j])
        fw = open(self.cluster_result_path, 'w')
        for _ in cluster:
            fw.write('%d\t%s\n' % (len(cluster[_]),
                                   ','.join(map(str, cluster[_]))))
        fw.close()
        fw = open(self.sample_result_path, 'w')
        for _ in sample_cluster:
            sample = sample_cluster[_][0]
            fw.write(''.join(sample['others']))
            fw.write('\n')
            for elem in sample['data']:
                fw.write('%19s %8s %131s %8s\n' % (
                    elem.get('name', ''), elem.get('start', ''),
                    elem.get('seq', ''), elem.get('end', '')))
            fw.write('\n')
        fw.close()
        fw_all = open(self.all_result_path, 'w')
        fw_only = open(self.only_result_path, 'w')
        other = set()
        for _ in sample_cluster:
            for item in sample_cluster[_]:
                elem = item['data'][-1]
                flag = True
                for x in elem['seq'].strip():
                    if x.upper() in set('AUCG'):
                        continue
                    other.add(x.upper())
                    flag = False
                fw_all.write('>%s/%s-%s\n%s\n' % (
                    elem['name'], elem['start'], elem['end'],
                    elem['seq'].upper()))
                fw_all.write('\n')
                if not flag:
                    continue
                fw_only.write('>%s/%s-%s\n%s\n' % (
                    elem['name'], elem['start'], elem['end'],
                    elem['seq'].upper()))
                fw_only.write('\n')
            fw_all.write('\n')
            fw_only.write('\n')
        fw_all.close()
        fw_only.close()
        self.logger.info('\n'.join(list(other)))

    def should_ignore(self, index, line):
        info = re.split(r'\s+', line.strip())
        gene_no = info[0].strip('()')
        # both '?' and '!' markers keep the record (the original had two
        # identical branches)
        if info[1] in ('?', '!'):
            return False, gene_no
        self.logger.info('ignore check failed: %d' % index)
        return True, None

    def update_sequence(self, index, elem, line):
        # continuation lines (not ending in a position number) extend the
        # sequence directly
        if line.strip()[-1] not in ExperimentConfig.SET_NUMBER_RANGE10:
            elem['seq'] = elem.get('seq', '') + line.strip()
            return
        try:
            info = re.split(r'\s+', line.strip())
            name = info[0]
            start = int(info[1])
            end = int(info[-1])
            seq = ' '.join(info[2:-1])
        except Exception:
            self.logger.info('value num is not ok: %d, %s' % (index, line))
            sys.exit(1)
        if elem.get('name', name) != name:
            self.logger.info('name is not equal: %d' % index)
            sys.exit(1)
        elem['name'] = name
        if 'start' not in elem:
            elem['start'] = start
        elem['end'] = end
        elem['seq'] = elem.get('seq', '') + seq
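# Usage sketch (added for illustration; the tag and paths are hypothetical).
# Records whose middle sequence matches case-insensitively end up in one
# cluster; four result files are written to the output directory.
def _demo_cluster_matcher():
    matcher = ClusterMatcher(rna_tag='my_rna',
                             input_path='input/alignment_blocks.txt',
                             output_directory='output')
    matcher.run()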
@dataclass
class EcocycAnalysis:
    """Fetches gene/TU/promoter/GO pages from Ecocyc (and BioCyc) and merges
    the parsed attributes into a tab-separated report."""
    input_path: str
    download_directory: str
    output_directory: str
    ecocyc_params: dict
    cookie: str = None

    def __post_init__(self):
        self.from_gene_names = self.ecocyc_params['from_gene_names']
        self.output_best_promoter = self.ecocyc_params['output_best_promoter']
        self.output_detail_information = self.ecocyc_params[
            'output_detail_information']
        self.analysis_promoter = self.ecocyc_params['analysis_promoter']
        self.if_get_summary = self.ecocyc_params['if_get_summary']
        self.if_get_go_table = self.ecocyc_params['if_get_go_table']
        self.sequence_start_idx = None
        self.sequence_end_idx = None
        self.headers = {}
        self.inv_headers = []
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.ecocyc_result_path = os.path.join(
            self.output_directory, '%s_ecocyc_result.txt' % file_prefix)
        self.ecocyc_error_path = os.path.join(
            self.output_directory, '%s_ecocyc_error.txt' % file_prefix)
        self.logger = LoggerFactory()

    def run(self):
        if self.from_gene_names:
            self.work_from_gene_list_file()
        else:
            self.work_from_url_list_file()

    def generate_header(self, items):
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)
        self.sequence_end_idx = self.headers.get('gene_start_pos')
        self.sequence_start_idx = self.headers.get('promoter_pos')

    def work_from_gene_list_file(self):
        solve_cnt = 0
        succ_cnt = 0
        fail_cnt = 0
        fail_json_cnt = 0
        fw_error = open(self.ecocyc_error_path, 'w', encoding='utf8')
        fw_result = open(self.ecocyc_result_path, 'w', encoding='utf8')
        gene_items = list(
            filter(lambda arg: arg.strip() != '',
                   open(self.input_path, 'r', encoding='utf8').readlines()))
        fw_result.write(gene_items[0])
        self.generate_header(gene_items[0])
        total_cnt = len(gene_items) - 1
        self.logger.info_with_expire_time(
            'Ecocyc analysis %d/%d=%.2f%%' % (
                solve_cnt, total_cnt, solve_cnt * 100.0 / total_cnt),
            solve_cnt, total_cnt)
        for line in gene_items[1:]:
            try:
                result = {}
                infos = line.strip().split('\t')
                for idx, info in enumerate(infos):
                    result[self.inv_headers[idx]] = info
                gene_name = result['gene']
                if gene_name.find('->') > 0:
                    gene_name, result['gene'] = result['gene'].split('->')
                self.write_body(gene_name=gene_name)
                ecocyc_id = self.get_ecocyc_id(prefix='gene_',
                                               gene_name=gene_name)
                result['ecocyc_id'] = ecocyc_id
                self.write_body(ecocyc_id=ecocyc_id, page_type="tu")
                self.analysis_xml(prefix='tu_', ecocyc_id=ecocyc_id,
                                  result=result)
                if self.if_get_summary:
                    self.write_body(ecocyc_id=ecocyc_id, page_type="summary")
                    self.analysis_xml(prefix='summary_', ecocyc_id=ecocyc_id,
                                      result=result)
                if self.analysis_promoter:
                    flag_json = self.write_body(ecocyc_id=ecocyc_id,
                                                page_type="promoter")
                    if flag_json:
                        self.analysis_json(prefix='promoter_',
                                           ecocyc_id=ecocyc_id,
                                           result=result,
                                           gene_name=result['gene'])
                    if not flag_json:
                        fail_json_cnt += 1
                if self.if_get_go_table:
                    self.write_body(ecocyc_id=ecocyc_id, page_type='go')
                    self.analysis_xml(prefix='go_', ecocyc_id=ecocyc_id,
                                      result=result)
                if result['gene'] != gene_name:
                    result['gene'] = gene_name + '->' + result['gene']
                fw_result.write(self.extract_output(result) + '\n')
                fw_result.flush()
                succ_cnt += 1
            except Exception:
                fw_result.write('%s\tNot Found\n' % result['gene'])
                traceback.print_exc()
                fw_error.write(gene_name + '\n')
                fw_error.flush()
                fail_cnt += 1
            solve_cnt += 1
            self.logger.info_with_expire_time(
                'Ecocyc analysis %d/%d=%.2f%%, success_cnt=%d, fail_cnt=%d, '
                'json_download_fail=%d' % (
                    solve_cnt, total_cnt, solve_cnt * 100.0 / total_cnt,
                    succ_cnt, fail_cnt, fail_json_cnt),
                solve_cnt, total_cnt)
        fw_error.close()
        fw_result.close()

    def extract_output(self, result):
        output = []
        for name in self.inv_headers:
            if name == 'product_type':
                for key in ['enzyme', 'rna', 'protein', 'polypeptide',
                            'function when intact', 'transporter']:
                    if result.get(key, '') != '':
                        result['product_type'] = key
                        result['product'] = result[key]
            elif result.get(name, '') in ['', 'Not Found']:
                try:
                    if name in ['status', 'promoter_name', 'promoter_pos',
                                'gene_start_pos']:
                        if result['table_unites'][0] == 'Not Found':
                            if name == 'status':
                                result['status'] = 'Not Found'
                        else:
                            promoter = result['table_unites'][1]
                            result['status'] = 'Found'
                            result['gene_start_pos'] = result['table_unites'][0]
                            result['promoter_name'] = \
                                promoter.get_promoter_name()
                            result['promoter_pos'] = \
                                promoter.get_promoter_start_site(int_pos=True)
                except Exception:
                    pass
            output.append(str(result.get(name, '')))
        return '\t'.join(output)

    def work_from_url_list_file(self):
        solve_cnt = 0
        succ_cnt = 0
        fail_cnt = 0
        fail_json_cnt = 0
        # widest row written so far; the original took max() over this
        # without ever initializing it
        max_col = 0
        buff = []
        fw_error = open(self.ecocyc_error_path, 'w', encoding='utf8')
        fw_result = open(self.ecocyc_result_path, 'w', encoding='utf8')
        items = self.extract_urls_from_file()
        total_cnt = len(items)
        self.logger.info_with_expire_time(
            'Ecocyc analysis %d/%d=%.2f%%' % (
                solve_cnt, total_cnt, solve_cnt * 100.0 / total_cnt),
            solve_cnt, total_cnt)
        for url, mock_name, title in items:
            try:
                result = {}
                self.write_body(url=url, mock_name=mock_name)
                ecocyc_id = self.analysis_xml(prefix='url_',
                                              gene_name=mock_name,
                                              result=result)
                flag_json = False
                if ecocyc_id is not None:
                    result['ecocyc_id'] = ecocyc_id
                    flag_json = self.write_body(ecocyc_id=ecocyc_id,
                                                page_type="promoter")
                if flag_json:
                    self.analysis_json(prefix='promoter_',
                                       ecocyc_id=ecocyc_id, result=result,
                                       gene_name=result['gene'])
                if not flag_json:
                    fail_json_cnt += 1
                temp = self.format_result_json(result, fw_error)
                if temp.strip() == '':
                    raise ValueError('No result found')
                buff.append(temp)
                fw_result.write(buff[-1])
                max_col = max(max_col, len(buff[-1].split('\t')))
                fw_result.flush()
                succ_cnt += 1
            except Exception:
                traceback.print_exc()
                fw_error.write(url + '\t' + mock_name + '\t' + title + '\n')
                fw_error.flush()
                fail_cnt += 1
            solve_cnt += 1
            self.logger.info_with_expire_time(
                'Ecocyc analysis %d/%d=%.2f%%, success_cnt=%d, fail_cnt=%d, '
                'json_download_fail=%d' % (
                    solve_cnt, total_cnt, solve_cnt * 100.0 / total_cnt,
                    succ_cnt, fail_cnt, fail_json_cnt),
                solve_cnt, total_cnt)
        fw_error.close()
        fw_result.close()

    def extract_urls_from_file(self):
        with open(self.input_path, 'r', encoding='utf8') as fr:
            body = ''.join(fr.readlines())
        parser = UrlHTMLParser()
        parser.feed(body)
        return parser.ecocycs

    def write_body(self, url=None, mock_name=None, ecocyc_id=None,
                   gene_name=None, page_type="tu"):
        """Downloads (or reuses a cached copy of) one Ecocyc page; returns
        True when the target file exists afterwards."""
        if url is not None:
            urls = [url]
            origin_path = os.path.join(self.download_directory,
                                       mock_name + '.html')
            file_path = os.path.join(self.download_directory,
                                     'url_' + mock_name + '.html')
            self.transform_file(origin_path, file_path)
        elif gene_name is not None:
            urls = [
                'http://ecocyc.org/ECOLI/substring-search?type=GENE'
                '&object=%s&geneSearch=Gene+Search' % gene_name
            ]
            origin_path = os.path.join(self.download_directory,
                                       gene_name + '.html')
            file_path = os.path.join(self.download_directory,
                                     'gene_' + gene_name + '.html')
            self.transform_file(origin_path, file_path)
        elif ecocyc_id is not None:
            if page_type == "tu":
                urls = [
                    'https://ecocyc.org/gene?orgid=ECOLI&id=%s#tab=TU'
                    % ecocyc_id
                ]
                origin_path = os.path.join(self.download_directory,
                                           ecocyc_id + '.html')
                file_path = os.path.join(self.download_directory,
                                         'tu_' + ecocyc_id + '.html')
                self.transform_file(origin_path, file_path)
            elif page_type == "promoter":
                urls = [
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=-1_NO-PLOC_%s.wg' % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=-1_NO-INDEX_NO-PLOC_%s.wg' % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=1_NO-INDEX_NO-PLOC_%s.wg' % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=1_NO-PLOC_%s.wg' % ecocyc_id
                ]
                origin_path = os.path.join(self.download_directory,
                                           ecocyc_id + '.json')
                file_path = os.path.join(self.download_directory,
                                         'promoter_' + ecocyc_id + '.json')
                self.transform_file(origin_path, file_path)
            elif page_type == "summary":
                urls = [
                    'https://biocyc.org/gene-tab?id=%s&orgid=ECOLI&tab=SUMMARY'
                    % ecocyc_id
                ]
                file_path = os.path.join(self.download_directory,
                                         'summary_' + ecocyc_id + '.html')
            elif page_type == "go":
                urls = [
                    'https://biocyc.org/gene-tab?id=%s&orgid=ECOLI&tab=GO'
                    % ecocyc_id
                ]
                file_path = os.path.join(self.download_directory,
                                         'go_' + ecocyc_id + '.html')
            else:
                raise ValueError('Parameter not correct')
        if os.path.exists(file_path):
            return True
        headers = {
            "Host": "ecocyc.org",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/78.0.3904.108 Safari/537.36",
            "Accept": "*/*",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            'Accept-Encoding': "gzip, deflate, br",
            'Connection': "Keep-Alive",
            'Cookie': self.cookie
        }
        flag = False
        for retry_time in range(3):
            flag = False
            for url in urls:
                try:
                    if retry_time == 0:
                        x = request.urlopen(url, timeout=30)
                        body = x.read().decode('utf8')
                        with open(file_path, 'w', encoding='utf8') as fw:
                            fw.write(body)
                        flag = True
                        break
                    elif retry_time == 1:
                        url = ("https://biocyc.org/tmp/ptools-images/ECOLI/"
                               "%s_REG-SUMMARY.wg") % ecocyc_id
                        req = request.Request(url=url, headers=headers)
                        x = request.urlopen(req, timeout=30)
                        body = x.read()
                        break
                    else:
                        req = request.Request(url=url, headers=headers)
                        x = request.urlopen(req, timeout=30)
                        body = x.read()
                        if x.headers.get('Content-Encoding',
                                         '').lower() == 'gzip':
                            body = gzip.decompress(body)
                        body = body.decode('utf-8')
                        with open(file_path, 'w', encoding='utf8') as fw:
                            fw.write(body)
                        flag = True
                        break
                except Exception:
                    continue
            if flag:
                break
        return flag

    def analysis_xml(self, prefix, result, ecocyc_id=None, gene_name=None):
        """Parses a downloaded page. The cached file is keyed by ecocyc_id in
        gene-list mode and by the mock name in url-list mode; the original
        signature accepted only ecocyc_id yet was also called with
        gene_name."""
        page_key = ecocyc_id if ecocyc_id is not None else gene_name
        xml_path = os.path.join(self.download_directory,
                                prefix + page_key + '.html')
        with open(xml_path, 'r', encoding='utf8') as fr:
            body = ''.join(fr.readlines())
        if prefix == 'summary_':
            parser = EcocycHTMLParser(do_extract_summary=True)
            parser.feed(''.join(body))
            result['summary'] = parser.extract_attr['summary']
        elif prefix == 'go_':
            parser = GoHTMLParser()
            parser.feed(''.join(body))
            result['go'] = ';'.join(
                ['%s=%s' % (k, v) for k, v in parser.go_table])
        else:
            parser = EcocycHTMLParser()
            parser.feed(''.join(body))
            for k, v in parser.extract_attr.items():
                if k == 'map position':
                    result['map_start_pos'] = v[0]
                    result['map_end_pos'] = v[1]
                elif v is not None:
                    result[k] = v.strip('__#####__')
            return parser.ecocyc_id
        return None

    def analysis_json(self, prefix, ecocyc_id, result, gene_name=None):
        json_path = os.path.join(self.download_directory,
                                 prefix + ecocyc_id + '.json')
        with open(json_path, 'r') as fr:
            body = ''.join(fr.readlines())
        body = json.loads(body)
        data = []
        target_gene = None
        for link in body['links']:
            gene_tu = GeneTUInfo(link)
            if self.output_best_promoter and gene_name is not None:
                if gene_tu.is_gene(gene_name):
                    target_gene = gene_tu
            data.append(gene_tu)
        if self.output_best_promoter:
            flag = False
            if target_gene is not None:
                target_promoter, near_gene_pos = get_target_promoter(
                    target_gene, data)
                if target_promoter is not None:
                    data = [near_gene_pos, target_promoter]
                    flag = True
            if not flag:
                data = ['Not Found']
        else:
            data = get_all_promoters(data, True)
        result['table_unites'] = data

    def get_ecocyc_id(self, prefix, gene_name):
        xml_path = os.path.join(self.download_directory,
                                prefix + gene_name + '.html')
        with open(xml_path, 'r') as fr:
            body = ''.join(fr.readlines())
        parser = EcocycHTMLParser(do_extract_id=True, gene_name=gene_name)
        parser.feed(''.join(body))
        if parser.ecocyc_id is None:
            raise RuntimeError('Ecocyc id is None, parse error for %s'
                               % gene_name)
        return parser.ecocyc_id

    @staticmethod
    def transform_file(original_path, new_path):
        if not os.path.exists(new_path) and os.path.exists(original_path):
            os.rename(original_path, new_path)

    def format_result_json(self, result, fw_error=None):
        keys = ['gene', 'cluster']
        info = [result.get(key, '') for key in keys]
        product_type = ''
        product = ''
        for key in ['rna', 'protein', 'polypeptide', 'enzyme',
                    'function when intact', 'transporter']:
            val = result.get(key, '')
            if val is None or val == '':
                continue
            product_type = key
            product = val
        info.extend([product_type, product])
        for key in ['location']:  # , 'reaction']:
            val = result.get(key, '')
            if val is None:
                val = ''
            info.append(val)
        table_unites = result.get('table_unites', [])
        if self.output_best_promoter and len(table_unites) == 2 \
                and type(table_unites[0]) is int:
            near_gene_pos, promoter = table_unites
            info.extend(['Gene Start Position', near_gene_pos])
            info.extend([
                promoter.get_promoter_name(),
                promoter.get_promoter_start_site(int_pos=True)
            ])
            info = list(map(str, info))
        else:
            for promoter in table_unites:
                info.extend([
                    promoter.get_promoter_name(),
                    promoter.get_promoter_start_site()
                ])
            info = list(map(str, info))
        if self.output_best_promoter and fw_error is not None:
            fw_error.write('\t'.join(info) + '\n')
        return '\t'.join(info) + '\n'
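# Usage sketch (added for illustration; paths and values are hypothetical).
# The params dict mirrors exactly the keys read in __post_init__.
def _demo_ecocyc_analysis():
    params = {
        'from_gene_names': True,
        'output_best_promoter': True,
        'output_detail_information': False,
        'analysis_promoter': True,
        'if_get_summary': False,
        'if_get_go_table': False,
    }
    analysis = EcocycAnalysis(input_path='input/gene_table.txt',
                              download_directory='downloads/ecocyc',
                              output_directory='output',
                              ecocyc_params=params,
                              cookie=None)
    analysis.run()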
@dataclass
class NeighborAnalysis:
    """Downloads genome records for hit intervals and reports the nearest
    neighboring genes plus source/gene frequency counts."""
    input_path: str
    download_directory: str
    output_directory: str
    keep_prefix_num: int = 1

    def __post_init__(self):
        self.logger = LoggerFactory(3)
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.neighbor_result_path = os.path.join(
            self.output_directory, '%s_neighbor_result.txt' % file_prefix)
        self.next_gene_result_path = os.path.join(
            self.output_directory,
            '%s_next_neighbor_result.txt' % file_prefix)
        self.source_count_path = os.path.join(
            self.output_directory, '%s_source_count_result.txt' % file_prefix)
        self.gene_count_path = os.path.join(
            self.output_directory, '%s_gene_count_result.txt' % file_prefix)
        error_directory = os.path.join(self.output_directory, 'error')
        if not os.path.exists(error_directory):
            os.makedirs(error_directory)
        self.error_result_path_prefix = os.path.join(
            error_directory, '%s_error_result' % file_prefix)

    def download_and_analysis(self, key, inter, file_path):
        self.logger.info('\nstart working for ' + key)
        try:
            if not os.path.exists(file_path):
                if not DataDownloadTool.download_data(key, file_path):
                    return False, None
            flag, data = self.analysis_download_file(file_path, inter)
            if flag:
                return True, data
        except Exception:
            traceback.print_exc()
            return False, None
        return False, None

    def find_neighbor_batch(self, datas, iteration_time):
        fw = open(self.neighbor_result_path, 'a')
        solve_cnt, success_cnt, total_cnt = 0, 0, len(datas)
        logger = LoggerFactory(1)
        logger.info_with_expire_time(
            '[Iteration %d]completed %d/%d=%.2f%%' % (
                iteration_time, solve_cnt, total_cnt,
                solve_cnt * 100.0 / total_cnt),
            solve_cnt, total_cnt)
        fe = open(self.error_result_path_prefix + ".iter-%d.txt"
                  % iteration_time, 'w')
        fail_datas = []
        for key, inter, additional in datas:
            solve_cnt += 1
            file_path = os.path.join(self.download_directory, key + '.txt')
            flag, data = self.download_and_analysis(key, inter, file_path)
            if flag:
                success_cnt += 1
                direction = '+' if (inter[0] < inter[1]) else '-'
                fw.write('>%s/%s-%s(%s)\n' % (key, inter[0], inter[1],
                                              direction))
                if additional != '':
                    for kv in additional.split(','):
                        k, v = kv.split('=')
                        fw.write('%s\t%s\n' % (k, v))
                fw.write('SOURCE\t%s\n' % (data.get('source', 'UNKNOWN')))
                for elem in data['data']:
                    fw.write('%s\n' % elem)
                fw.write('sequence\t%s\n' % (data.get('sequence', '')))
                fw.write('\n')
                fw.flush()
            else:
                # drop the cached download so the next iteration retries it
                if os.path.exists(file_path):
                    os.remove(file_path)
                fe.write('>%s/%s-%s\n' % (key, inter[0], inter[1]))
                fe.flush()
                # keep the full triple: the next iteration unpacks
                # (key, inter, additional) again (the original dropped
                # 'additional' here)
                fail_datas.append([key, inter, additional])
            self.logger.info_with_expire_time(
                '[Iteration %d]completed %d/%d=%.2f%%, success %d/%d=%.2f%%'
                % (iteration_time, solve_cnt, total_cnt,
                   solve_cnt * 100.0 / total_cnt, success_cnt, solve_cnt,
                   success_cnt * 100.0 / solve_cnt),
                solve_cnt, total_cnt)
            time.sleep(random.random())
        self.logger.info('[Iteration %d]done .' % iteration_time)
        fw.close()
        return fail_datas

    def extract_data(self, buff):
        name = buff[0][1:].strip()
        name, inter = name.split('/')
        direction = inter[-2]
        left, right = map(int, inter[:-3].split('-'))
        _, source = buff[1].strip().split('\t')
        source = self.get_prefix(source)
        target = None
        for line in buff[2:]:
            try:
                gene = self.read_gene(line)
                if self.check_gene(left, right, direction, gene, target):
                    target = gene
            except Exception:
                continue
        return {
            'name': name,
            'direction': direction,
            'left': left,
            'right': right,
            'source': source,
            'gene': target
        }

    def get_prefix(self, source):
        if self.keep_prefix_num > 0:
            return ' '.join(re.split(r'\s+', source)[:self.keep_prefix_num])
        return source

    def source_gene_distribution_analysis(self):
        self.logger.info('Start source_gene_distribution_analysis')
        datas = []
        buff = []
        for line in open(self.neighbor_result_path, 'r'):
            if len(line.strip()) == 0:
                if len(buff) > 0:
                    datas.append(self.extract_data(buff))
                    buff = []
            else:
                buff.append(line.strip())
        if len(buff) > 0:
            datas.append(self.extract_data(buff))
        source_counter = Counter()
        gene_counter = Counter()
        with open(self.next_gene_result_path, 'w') as fw:
            for data in datas:
                if data['gene'] is None:
                    continue
                fw.write('>%s/%s-%s(%s)\n' % (data['name'], data['left'],
                                              data['right'],
                                              data['direction']))
                fw.write('SOURCE\t%s\n' % (data['source']))
                fw.write('%s-%s\t%s\n\n' % (data['gene']['left'],
                                            data['gene']['right'],
                                            data['gene']['gene']))
                source_counter[data['source']] += 1
                gene_counter[data['gene']['gene']] += 1
        total = len(datas)
        for file_path, counter in [(self.source_count_path, source_counter),
                                   (self.gene_count_path, gene_counter)]:
            with open(file_path, 'w') as fw:
                for k, v in counter.most_common():
                    fw.write('%s\t%d\t%.4f%%\n' % (k, v, v * 100.0 / total))
        self.logger.info('End source_gene_distribution_analysis')

    def run(self):
        open(self.neighbor_result_path, 'w').close()
        with open(self.input_path, 'r') as f:
            unsolved_datas = filter(
                lambda arg: arg[0] is not None,
                [DataDownloadTool.format_data(line)
                 for line in filter(lambda arg: arg.startswith('>'),
                                    f.readlines())])
            unsolved_datas = list(unsolved_datas)
        for iteration_time in range(1,
                                    ExperimentConfig.MAX_ITERATION_TIME + 1):
            unsolved_datas = self.find_neighbor_batch(unsolved_datas,
                                                      iteration_time)
            if len(unsolved_datas) == 0:
                break
        print("Failed data:" + str(len(unsolved_datas)) + ","
              + str(unsolved_datas))
        self.source_gene_distribution_analysis()

    @staticmethod
    def analysis_download_file(download_file_path, inter):
        left = min(inter)
        right = max(inter)
        gene_info = GeneFileReader(download_file_path)
        if not gene_info.build_information():
            return False, None
        near_small = None
        near_big = None
        res_set = set()
        for idx, gene_segment in enumerate(gene_info.gene_segments):
            if gene_segment.cds[1] <= left:
                if not near_small or near_small.cds[1] < gene_segment.cds[1]:
                    near_small = gene_segment
            if gene_segment.cds[0] >= right:
                if not near_big or near_big.cds[0] > gene_segment.cds[0]:
                    near_big = gene_segment
            if gene_segment.cds[0] <= left <= gene_segment.cds[1]:
                res_set.add(str(gene_segment))
            if gene_segment.cds[0] <= right <= gene_segment.cds[1]:
                res_set.add(str(gene_segment))
        # store the rendered form so the set stays homogeneous (the original
        # added the raw GeneSegment objects here)
        if near_small:
            res_set.add(str(near_small))
        if near_big:
            res_set.add(str(near_big))
        sequence = gene_info.dna_code[left - 1:right]
        if inter[0] > inter[1]:
            sequence = get_opposite_dna(sequence[::-1])
        return True, {
            'source': gene_info.source,
            'data': list(res_set),
            'sequence': sequence
        }

    @staticmethod
    def check_gene(left, right, direction, gene, target):
        if direction == '-':
            peer = min(left, right)
            gene_peer = max(gene['left'], gene['right'])
            if peer > gene_peer:
                return target is None or \
                    max(target['left'], target['right']) < gene_peer
        elif direction == '+':
            peer = max(left, right)
            gene_peer = min(gene['left'], gene['right'])
            if peer < gene_peer:
                return target is None or \
                    min(target['left'], target['right']) > gene_peer
        else:
            raise ValueError('Direction should be - or +')
        return False

    @staticmethod
    def read_gene(line):
        inter, gene = line.strip().split('\t')
        left, right = map(int, inter.split('-'))
        return {'gene': gene, 'left': left, 'right': right}
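# Usage sketch (added for illustration; paths are hypothetical). input_path
# is a FASTA-like file whose '>' headers carry an accession and an interval,
# as parsed by DataDownloadTool.format_data.
def _demo_neighbor_analysis():
    analysis = NeighborAnalysis(input_path='input/hits.txt',
                                download_directory='downloads/genomes',
                                output_directory='output')
    analysis.run()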