Esempio n. 1
0
 def __post_init__(self):
     """Create the download directory and derive the KEGG output paths.

     Both output files are placed in ``output_directory`` and share the
     prefix that ``StrConverter.extract_file_name`` returns for the base
     name of ``input_path``.
     """
     os.makedirs(self.download_directory, exist_ok=True)

     prefix = StrConverter.extract_file_name(
         os.path.basename(self.input_path))
     out_dir = self.output_directory
     self.kegg_result_path = os.path.join(
         out_dir, '%s_kegg_result.txt' % prefix)
     self.kegg_error_path = os.path.join(
         out_dir, '%s_kegg_error.txt' % prefix)

     self.logger = LoggerFactory()
Esempio n. 2
0
    def __init__(self, file_path, ignore_gene=False, enable_debug_info=False):
        """Store the reader options and start with empty parsed state.

        Args:
            file_path: stored on the instance as ``file_path``.
            ignore_gene: stored on the instance as ``ignore_gene``.
            enable_debug_info: stored on the instance as
                ``enable_debug_info``.
        """
        # Options supplied by the caller.
        self.file_path = file_path
        self.ignore_gene = ignore_gene
        self.enable_debug_info = enable_debug_info

        # Parsed state; everything starts out empty.
        self.source = None
        self.dna_code = []
        self.gene_segments = []
        self.gene_name_segment_map = {}

        self.logger = LoggerFactory(1)
Esempio n. 3
0
 def __post_init__(self):
     """Create the logger and derive the four result-file paths.

     Every path lives in ``output_directory`` and starts with the prefix
     that ``StrConverter.extract_file_name`` returns for ``rna_tag``.
     """
     self.logger = LoggerFactory(1)
     prefix = StrConverter.extract_file_name(self.rna_tag)

     def in_output_directory(name_template):
         # Fill the shared prefix into the template and anchor the
         # resulting file name in the output directory.
         return os.path.join(self.output_directory, name_template % prefix)

     self.cluster_result_path = in_output_directory('%s_cluster_result.txt')
     self.sample_result_path = in_output_directory('%s_sample_result.txt')
     self.all_result_path = in_output_directory('%s_all_result.txt')
     self.only_result_path = in_output_directory('%s_only_result.txt')
Esempio n. 4
0
 def __post_init__(self):
     """Derive mode-dependent paths and create the helper objects.

     ``input_path`` is exposed as ``rna_path`` when ``mode`` is ``'rna'``
     and as ``inter_path`` when ``mode`` is ``'inter'``; the other
     attribute is ``None``. The result file name embeds ``limit`` in
     ``'rna'`` mode and the word ``gene`` otherwise.
     """
     if self.mode == 'inter':
         self.inter_path = self.input_path
     else:
         self.inter_path = None

     rna_mode = self.mode == 'rna'
     if rna_mode:
         self.rna_path = self.input_path
     else:
         self.rna_path = None

     prefix = StrConverter.extract_file_name(
         os.path.basename(self.input_path))
     if rna_mode:
         suffix = 'stream_%d' % self.limit
     else:
         suffix = 'gene'
     result_name = '%s_%s_result.txt' % (prefix, suffix)
     self.result_path = os.path.join(self.output_directory, result_name)

     self.gene_reader = GeneFileReader(self.data_path)
     self.logger = LoggerFactory()
     self.headers = {}
     self.inv_headers = []
Esempio n. 5
0
    def __post_init__(self):
        """Derive the result path and reset the matching state.

        The result file is placed in ``output_directory`` and named after
        the prefix that ``StrConverter.extract_file_name`` returns for the
        base name of ``gene_path``. Asserts that ``weighted`` holds exactly
        five entries whose sum is positive.
        """
        self.data_name = os.path.basename(self.data_path)

        prefix = StrConverter.extract_file_name(
            os.path.basename(self.gene_path))
        result_name = '%s_match_result.txt' % (prefix)
        self.result_path = os.path.join(self.output_directory, result_name)

        self.gene_reader = GeneFileReader(self.data_path)
        # No sequence is loaded at construction time.
        self.dna_code = self.rev_dna_code = None
        self.logger = LoggerFactory()

        # Lock plus the two progress counters, both starting at zero.
        self.lock = threading.Lock()
        self.solved = self.total = 0

        self.weighted_sum = sum(self.weighted)
        assert self.weighted_sum > 0 and len(self.weighted) == 5
 def find_neighbor_batch(self, datas, iteration_time):
     """Download and analyse one batch of intervals.

     Successful entries are appended to ``self.neighbor_result_path``;
     failed ones are listed in a per-iteration error file (its name is
     ``self.error_result_path_prefix`` plus ``.iter-<n>.txt``) and
     returned to the caller. Any partially downloaded file of a failed
     entry is removed.

     Args:
         datas: sequence of ``(key, inter, additional)`` items. ``inter``
             is read as ``inter[0]``/``inter[1]``; ``additional`` is
             either ``''`` or a comma-separated list of ``k=v`` pairs.
         iteration_time: iteration number, used in the log messages and
             in the error file name.

     Returns:
         list: one ``[key, inter]`` pair per failed entry.
     """
     solve_cnt, success_cnt, total_cnt = 0, 0, len(datas)
     logger = LoggerFactory(1)
     # Nothing has been processed yet, so the percentage is 0 by
     # definition; computing it as 0 * 100 / total_cnt would divide by
     # zero for an empty batch.
     # NOTE(review): info_with_expire_time still receives total_cnt == 0
     # for an empty batch -- confirm the logger tolerates that.
     logger.info_with_expire_time(
         '[Iteration %d]completed %d/%d=%.2f%%' %
         (iteration_time, solve_cnt, total_cnt, 0.0), solve_cnt, total_cnt)
     fail_datas = []
     error_path = (self.error_result_path_prefix +
                   ".iter-%d.txt" % iteration_time)
     # Context managers close both files even when a download or a write
     # raises; previously the error file was never closed at all.
     with open(self.neighbor_result_path, 'a') as fw, \
             open(error_path, 'w') as fe:
         for key, inter, additional in datas:
             solve_cnt += 1
             file_path = os.path.join(self.download_directory, key + '.txt')
             flag, data = self.download_and_analysis(key, inter, file_path)
             if flag:
                 success_cnt += 1
                 direction = '+' if (inter[0] < inter[1]) else '-'
                 fw.write('>%s/%s-%s(%s)\n' %
                          (key, inter[0], inter[1], direction))
                 if additional != '':
                     for kv in additional.split(','):
                         k, v = kv.split('=')
                         fw.write('%s\t%s\n' % (k, v))
                 fw.write('SOURCE\t%s\n' % (data.get('source', 'UNKNOWN')))
                 for elem in data['data']:
                     fw.write('%s\n' % elem)
                 fw.write('sequence\t%s\n' % (data.get('sequence', '')))
                 fw.write('\n')
                 fw.flush()
             else:
                 # Drop whatever was downloaded so a later attempt starts
                 # from a clean state.
                 if os.path.exists(file_path):
                     os.remove(file_path)
                 fe.write('>%s/%s-%s\n' % (key, inter[0], inter[1]))
                 fe.flush()
                 # NOTE(review): the failed entry is returned without its
                 # ``additional`` field, so this list cannot be fed back
                 # into this method unchanged -- confirm with the caller.
                 fail_datas.append([key, inter])
             self.logger.info_with_expire_time(
                 '[Iteration %d]completed %d/%d=%.2f%%, success %d/%d=%.2f%%' %
                 (iteration_time, solve_cnt, total_cnt,
                  solve_cnt * 100.0 / total_cnt, success_cnt, solve_cnt,
                  success_cnt * 100.0 / solve_cnt), solve_cnt, total_cnt)
             # Random pause of up to one second between entries.
             time.sleep(random.random())
     self.logger.info('[Iteration %d]done .' % iteration_time)
     return fail_datas
    def __post_init__(self):
        """Create the logger, derive the result paths and the error directory.

        All result files live in ``output_directory`` and share the prefix
        that ``StrConverter.extract_file_name`` returns for the base name
        of ``input_path``. Error files go into an ``error`` sub-directory,
        which is created if it is missing.
        """
        self.logger = LoggerFactory(3)

        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.neighbor_result_path = os.path.join(
            self.output_directory, '%s_neighbor_result.txt' % file_prefix)
        self.next_gene_result_path = os.path.join(
            self.output_directory, '%s_next_neighbor_result.txt' % file_prefix)
        self.source_count_path = os.path.join(
            self.output_directory, '%s_source_count_result.txt' % file_prefix)
        self.gene_count_path = os.path.join(
            self.output_directory, '%s_gene_count_result.txt' % file_prefix)

        error_directory = os.path.join(self.output_directory, 'error')
        # exist_ok=True replaces the former exists()-then-makedirs() pair,
        # which could raise if the directory appeared between the two calls.
        os.makedirs(error_directory, exist_ok=True)
        self.error_result_path_prefix = os.path.join(
            error_directory, '%s_error_result' % file_prefix)
Esempio n. 8
0
    def __post_init__(self):
        """Unpack the ecocyc options and derive the output paths.

        Each option is copied from ``ecocyc_params`` onto an attribute of
        the same name. The result and error files live in
        ``output_directory`` and share the prefix that
        ``StrConverter.extract_file_name`` returns for the base name of
        ``input_path``.
        """
        params = self.ecocyc_params
        for option in ('from_gene_names',
                       'output_best_promoter',
                       'output_detail_information',
                       'analysis_promoter',
                       'if_get_summary',
                       'if_get_go_table'):
            setattr(self, option, params[option])

        self.sequence_start_idx = None
        self.sequence_end_idx = None
        self.headers = {}
        self.inv_headers = []

        prefix = StrConverter.extract_file_name(
            os.path.basename(self.input_path))
        out_dir = self.output_directory
        self.ecocyc_result_path = os.path.join(
            out_dir, '%s_ecocyc_result.txt' % prefix)
        self.ecocyc_error_path = os.path.join(
            out_dir, '%s_ecocyc_error.txt' % prefix)
        self.logger = LoggerFactory()
Esempio n. 9
0
class GeneFileReader:
    def __init__(self, file_path, ignore_gene=False, enable_debug_info=False):
        """Store the reader options and start with empty parsed state.

        Args:
            file_path: path of the file that ``build_information`` opens.
            ignore_gene: when true, ``parse_gene_segment`` records no
                gene segments.
            enable_debug_info: when true, ``build_information`` emits a
                progress message for the lines it reads.
        """
        # Options supplied by the caller.
        self.file_path = file_path
        self.ignore_gene = ignore_gene
        self.enable_debug_info = enable_debug_info

        # Parsed state; filled in by build_information().
        self.source = None
        self.dna_code = []
        self.gene_segments = []
        self.gene_name_segment_map = {}

        self.logger = LoggerFactory(1)

    def build_information(self):
        """Parse ``self.file_path`` into source, gene segments and DNA code.

        The file is read line by line. Each line is classified with
        ``check_line_type`` according to the part of the file currently
        being read (header, gene segments, DNA). Lines of a gene segment
        are buffered and handed to ``parse_gene_segment`` when the next
        segment or the DNA part starts. Reading stops at the DNA end
        marker.

        After a successful parse ``self.dna_code`` is a single string and
        ``self.gene_name_segment_map`` maps every gene name to the list of
        indices of its segments in ``self.gene_segments``.

        Returns:
            bool: ``False`` when the DNA part was never reached, ``True``
            otherwise.
        """
        part_status = GeneDataPartType.HeaderPart
        data = []
        line_type = None
        # The context manager guarantees the file handle is closed, also
        # when parsing raises or stops early at the DNA end marker.
        with open(self.file_path, 'r', encoding='utf8') as gene_file:
            for line_index, line in enumerate(gene_file):
                line_type = self.check_line_type(line, part_status)
                if line_type == GeneDataLineType.SourceLine:
                    self.source = ' '.join(re.split(r'\s+', line)[1:])
                elif line_type == GeneDataLineType.GeneSegmentStart:
                    part_status = GeneDataPartType.GeneSegmentPart
                    # Flush the lines buffered for the previous segment.
                    self.parse_gene_segment(data)
                elif line_type == GeneDataLineType.DNAStart:
                    part_status = GeneDataPartType.DNAPart
                    # Flush the last buffered segment.
                    self.parse_gene_segment(data)
                elif line_type == GeneDataLineType.DNAEnd:
                    break

                if part_status == GeneDataPartType.GeneSegmentPart:
                    data.append(line)
                elif part_status == GeneDataPartType.DNAPart and line_type == GeneDataLineType.Other:
                    # The first whitespace-separated item of a DNA line is
                    # skipped; the remaining items are sequence chunks.
                    items = re.split(r'\s+', line.strip())
                    for val in items[1:]:
                        self.dna_code.append(val)
                    self.logger.time_distance = 10
                if self.enable_debug_info:
                    self.logger.info_per_time("LineNo = %d, Added Gene Num = %d, Last Sample = %s" % (
                        line_index, len(self.gene_segments),
                        self.gene_segments[-1].__dict__ if len(self.gene_segments) > 0 else ""))
        if part_status != GeneDataPartType.DNAPart and line_type != GeneDataLineType.DNAEnd:
            return False
        self.dna_code = ''.join(self.dna_code)
        check_order = None
        # NOTE(review): warning_num counts segments that start before the
        # end of their predecessor, but the count is never reported.
        warning_num = 0
        for idx, gene_segment in enumerate(self.gene_segments):
            name = gene_segment.gene
            if check_order is not None and check_order > min(gene_segment.cds):
                warning_num += 1
            check_order = max(gene_segment.cds)
            self.gene_name_segment_map.setdefault(name, []).append(idx)
        self.logger.info("Total Gene Segment Number = %d, Total Gene Name Count = %d" % (
            len(self.gene_segments), len(self.gene_name_segment_map)))
        return True

    def parse_gene_segment(self, data):
        """Turn the buffered lines of one gene segment into a ``GeneSegment``.

        The segment-start line must consist of a tag and a location such
        as ``a..b`` or ``complement(a..b)``; the two bounds become
        ``gene_segment.cds``. Every other line is treated as attribute
        text: a line starting with ``/`` begins a new attribute, any other
        line continues the previous one, and the accumulated text is passed
        to ``GeneSegment.extract_attribute``.

        If any line fails to parse, the segment is dropped. The offending
        line is logged, and a traceback is printed unless the location
        starts with ``join`` or ``complement(join``.

        Args:
            data: list of raw lines. It is emptied before returning, except
                when it is ``None`` or already empty.
        """
        if not data:
            return
        if self.ignore_gene:
            # Nothing ever reads the buffer in this mode; empty it so it
            # does not keep growing with every gene-segment line.
            data.clear()
            return
        gene_segment = GeneSegment()
        last_line = ''
        success = True
        complement = None
        for line in data:
            try:
                line_type = self.check_line_type(line, GeneDataPartType.GeneSegmentPart)
                line = line.strip()
                if line_type == GeneDataLineType.GeneSegmentStart:
                    tag, complement = re.split(r'\s+', line)
                    # lstrip() removes a *set* of characters, not a prefix;
                    # that is harmless here because the bounds start with a
                    # digit or '<', neither of which is in the set.
                    inter = list(map(lambda arg: int(arg.strip('<>')),
                                     complement.lstrip('complement(').rstrip(')').split('..')))
                    gene_segment.cds = inter
                    # Explicit check instead of ``assert`` so the validation
                    # also runs under ``python -O``.
                    if not inter[0] < inter[1]:
                        raise ValueError('Invalid gene segment interval: %s' % complement)
                else:
                    if line[0] == '/':
                        last_line = line
                    else:
                        last_line += ' ' + line
                    gene_segment.extract_attribute(last_line)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                self.logger.info(line)
                if not complement or (
                        not complement.startswith('join') and not complement.startswith('complement(join')):
                    traceback.print_exc()
                success = False
                break
        if success:
            self.gene_segments.append(gene_segment)
        data.clear()

    @staticmethod
    def check_line_type(line: str, part_status):
        """Classify ``line`` given the part of the file being read.

        Args:
            line: the raw line, including its leading whitespace.
            part_status: the ``GeneDataPartType`` currently being read.

        Returns:
            The matching ``GeneDataLineType``; ``Other`` when no rule for
            the current part applies.
        """
        stripped = line.strip()

        def begins_segment():
            # A gene or repeat-region marker opens a new segment.
            return (stripped.startswith(ExperimentConfig.VALUE_GENE_START)
                    or stripped.startswith(
                        ExperimentConfig.VALUE_REPEAT_REGION_START))

        if part_status == GeneDataPartType.HeaderPart:
            if stripped.startswith(ExperimentConfig.VALUE_SOURCE_START):
                return GeneDataLineType.SourceLine
            if begins_segment():
                return GeneDataLineType.GeneSegmentStart
            return GeneDataLineType.Other

        if part_status == GeneDataPartType.GeneSegmentPart:
            if begins_segment():
                return GeneDataLineType.GeneSegmentStart
            # A line that does not start with a space ends the segment part.
            if line[0] != ' ':
                return GeneDataLineType.DNAStart
            return GeneDataLineType.Other

        if part_status == GeneDataPartType.DNAPart:
            if stripped.startswith(ExperimentConfig.VALUE_DNA_PART_END):
                return GeneDataLineType.DNAEnd
        return GeneDataLineType.Other
Esempio n. 10
0
class GeneSimilarityMatch:
    # Searches a DNA sequence (and its reverse strand) for the windows that
    # are most similar to each query gene and writes the best ones to a
    # result file.
    # NOTE(review): the use of ``field(...)`` and ``__post_init__`` suggests
    # a ``@dataclass`` decorator that is not visible here -- confirm.

    # File with the query genes; ``run`` skips its first line and reads
    # tab-separated ``name`` and ``sequence`` columns from the others.
    gene_path: str
    # File handed to ``GeneFileReader``; its DNA code is the search target.
    data_path: str
    # Directory that receives the ``<prefix>_match_result.txt`` file.
    output_directory: str
    # Number of best candidates rendered per gene.
    top_k: int = 20
    # Forwarded to ``update_candidate_list`` while scanning.
    candidate_distance: int = 5
    # Upper bound on the number of gene threads running at the same time.
    batch_size: int = 5
    # Forwarded to ``count_similarity`` (as ``max_patience``) and to
    # ``compute_consistency_similarity``.
    patience: int = 0
    # One weight per similarity algorithm; ``__post_init__`` asserts there
    # are exactly five with a positive sum. Algorithms whose weight is not
    # positive are left out of the output.
    weighted: List[int] = field(default_factory=list)
    # When truthy, used to build a ``MatchPattern`` for each gene.
    conditions: dict = None
    # Forwarded to ``count_similarity``.
    continuous_mismatch_limit: int = None
    # With ``OrderType.Increment`` the scores are negated while scanning
    # and negated back before the results are written.
    order_type: OrderType = OrderType.Decrement
    # Unannotated class attributes; ``run`` assigns all three.
    dna_code = None
    rev_dna_code = None
    gene_name_filter = None

    def __post_init__(self):
        """Derive the result path and reset the matching state.

        The result file is placed in ``output_directory`` and named after
        the prefix that ``StrConverter.extract_file_name`` returns for the
        base name of ``gene_path``. Asserts that ``weighted`` holds exactly
        five entries whose sum is positive.
        """
        self.data_name = os.path.basename(self.data_path)

        prefix = StrConverter.extract_file_name(
            os.path.basename(self.gene_path))
        result_name = '%s_match_result.txt' % (prefix)
        self.result_path = os.path.join(self.output_directory, result_name)

        self.gene_reader = GeneFileReader(self.data_path)
        # The sequences are only loaded by run().
        self.dna_code = self.rev_dna_code = None
        self.logger = LoggerFactory()

        # Progress counters shared by the worker threads, plus their lock.
        self.lock = threading.Lock()
        self.solved = self.total = 0

        self.weighted_sum = sum(self.weighted)
        assert self.weighted_sum > 0 and len(self.weighted) == 5

    def run(self, gene_name_filter: GeneLocationAnalysis = None):
        """Match every gene of ``gene_path`` against the DNA sequence.

        Loads the DNA code through ``self.gene_reader``, derives the
        reverse strand, then runs ``find_candidate_for_gene`` in one thread
        per gene with at most ``batch_size`` threads alive at a time. All
        threads write into the same result file.

        Args:
            gene_name_filter: stored on the instance and forwarded to
                ``count_similarity`` by ``match_gene``.
        """
        self.gene_name_filter = gene_name_filter
        # NOTE(review): the boolean result of build_information() is
        # ignored -- confirm that a failed parse should not abort here.
        self.gene_reader.build_information()
        self.dna_code = self.gene_reader.dna_code
        self.rev_dna_code = get_opposite_dna(self.gene_reader.dna_code[::-1])
        with open(self.result_path, 'w', encoding='utf8') as fw:
            # Close the gene file right after reading it; the first line is
            # skipped.
            with open(self.gene_path, 'r', encoding='utf8') as gene_file:
                gene_sequences = gene_file.readlines()[1:]
            self.solved = 0
            self.total = len(self.gene_reader.dna_code) * len(gene_sequences) * 2
            # Guard against an empty gene list or an empty DNA code, which
            # would otherwise divide by zero.
            percent = self.solved * 100.0 / self.total if self.total else 0.0
            self.logger.info_with_expire_time(
                'Doing Similarity Matching: %d/%d(%.2f%%)' % (
                    self.solved, self.total, percent), self.solved, self.total)
            pending_tasks = deque()
            running_tasks = []
            for gene_sequence in gene_sequences:
                items = gene_sequence.strip().split('\t')
                name, gene = items[0], items[1].lower()
                t = threading.Thread(target=self.find_candidate_for_gene, args=(name, gene, fw,))
                pending_tasks.append(t)
            while len(pending_tasks) > 0:
                # Thread.isAlive() was removed in Python 3.9; is_alive() is
                # the supported spelling.
                running_tasks = [t for t in running_tasks if t.is_alive()]
                while len(running_tasks) < self.batch_size and len(pending_tasks) > 0:
                    t = pending_tasks.popleft()
                    t.start()
                    running_tasks.append(t)
                # Only poll again while work is still queued; once every
                # task has been started the joins below do the waiting.
                if len(pending_tasks) > 0:
                    time.sleep(10)
            for t in running_tasks:
                t.join()

    def find_candidate_for_gene(self, name, gene, fw):
        """Find the best matches for one gene on both strands and write them.

        The forward and the reverse DNA code are scanned in two parallel
        ``HasReturnThread`` workers. Their candidates are merged and
        sorted, and the first ``top_k`` are rendered and written to ``fw``
        as numbered records ``(1)``, ``(2)``, ... while holding
        ``self.lock``.

        Args:
            name: gene name, written into every record.
            gene: gene sequence to search for.
            fw: open, writable text file shared with other threads.
        """
        t1 = HasReturnThread(func=self.match_gene,
                             args=(name, gene, self.dna_code, False,))
        t1.start()
        t2 = HasReturnThread(func=self.match_gene,
                             args=(name, gene, self.rev_dna_code, True,))
        t2.start()
        t1.join()
        t2.join()

        candidates = t1.get_result() + t2.get_result()
        candidates = list(candidates)
        candidates.sort(key=lambda arg: -arg.weighted_similarity)
        if self.order_type == OrderType.Increment:
            # match_gene stored negated scores for this order type; restore
            # the real values now that sorting is done.
            for candidate in candidates:
                candidate.weighted_similarity = -candidate.weighted_similarity
        results = self.render_similarity_for_candidates(gene, candidates[:self.top_k])

        # Similarity columns, index-aligned with ``self.weighted``; only
        # those with a positive weight are reported.
        similarity_names = ['text_distance_similarity', 'direct_match_similarity', 'consistency_similarity',
                            'pattern_similarity', 'blat_similarity']
        enabled_similarity_names = [similarity_name
                                    for weight_idx, similarity_name in enumerate(similarity_names)
                                    if self.weighted[weight_idx] > 0]
        headers = ['name', 'direction', 'weighted_similarity']
        headers.extend(enabled_similarity_names)
        headers.append('original      :')
        sequence_headers = [
            'gene_format   :',
            'target_format :',
            'match_format  :']
        # ``with`` releases the lock even if a write fails, so other gene
        # threads cannot be left blocked forever.
        with self.lock:
            # ``rank`` is a dedicated counter: the previous code reused one
            # variable for the record number and for the enumerate() loops
            # below, so the written numbers were wrong.
            for rank, candidate_result in enumerate(results, start=1):
                candidate = candidate_result[0]
                fw.write('(%d)\n' % rank)
                attribute = {
                    'name': name,
                    'direction': '-' if candidate.is_reverse else '+',
                    'weighted_similarity': '%.2f' % candidate.weighted_similarity,
                    'original      :': gene
                }
                for similarity_name in enabled_similarity_names:
                    attribute[similarity_name] = '%.2f' % candidate.similarity_dict[
                        MatchAlgorithm.get_match_algorithm_by_name(similarity_name)]
                sequence_content = []
                # candidate_result[0] is the candidate itself; after it come
                # three rendered sequences per enabled algorithm.
                offset = 1
                for weight_idx, match_algorithm in enumerate(MatchAlgorithm.get_all_items()):
                    if self.weighted[weight_idx] > 0:
                        for sequence_header, value in zip(sequence_headers, candidate_result[offset:offset + 3]):
                            value = ''.join(value)
                            sequence_content.append(match_algorithm.name + "_" + sequence_header + '=' + value)
                        offset += 3

                fw.write('>%s/%s-%s\t%s,%s\n' % (
                    self.data_name.replace(".txt", ''),
                    candidate.start,
                    candidate.end,
                    ','.join(['%s=%s' % (key, attribute[key]) for key in headers if key in attribute]),
                    ','.join(sequence_content)
                ))
                fw.write('\n')

    def match_gene(self, name, gene, database, is_reverse):
        """Slide ``gene`` over ``database`` and collect match candidates.

        Every window of ``len(gene)`` characters is scored with
        ``count_similarity`` and offered to ``update_candidate_list``.
        Progress is added to ``self.solved`` and logged at random
        intervals (about once per thousand windows).

        Args:
            name: gene name, used in progress messages only.
            gene: gene sequence to search for.
            database: sequence to scan.
            is_reverse: whether ``database`` is the reverse strand; stored
                on every candidate and forwarded to ``count_similarity``.

        Returns:
            list: the collected ``MatchCandidate`` objects. With
            ``OrderType.Increment`` their ``weighted_similarity`` is
            negated.
        """
        candidates: List[MatchCandidate] = []
        gene_length = len(gene)
        database_length = len(database)
        limitation = database_length - gene_length + 1
        new_solved = 0
        # Holds at most ``top_k`` accepted candidates; its smallest entry
        # is reported in the progress message.
        similarity_heap = []
        buff = deque()
        match_pattern = MatchPattern(gene, self.conditions) if self.conditions else None
        for start in range(limitation):
            weighted_similarity, similarity_dict = count_similarity(weighted=self.weighted,
                                                                    gene=gene,
                                                                    database=database,
                                                                    offset=start,
                                                                    is_reverse=is_reverse,
                                                                    max_patience=self.patience,
                                                                    match_pattern=match_pattern,
                                                                    continuous_mismatch_limit=self.continuous_mismatch_limit,
                                                                    gene_name_filter=self.gene_name_filter)
            if self.order_type == OrderType.Increment:
                weighted_similarity = -weighted_similarity
            new_candidate = MatchCandidate(
                left=start,
                right=start + gene_length - 1,
                is_reverse=is_reverse,
                database_length=database_length,
                weighted_similarity=weighted_similarity,
                similarity_dict=similarity_dict)

            added_flag = update_candidate_list(new_candidate,
                                               buff,
                                               candidates,
                                               self.candidate_distance)
            if added_flag:
                heapq.heappush(similarity_heap, candidates[-1])
                if len(similarity_heap) > self.top_k:
                    # The value formerly read from the heap top here was
                    # never used, and the read itself failed on an empty
                    # heap when top_k == 0.
                    heapq.heappop(similarity_heap)

            new_solved += 1
            if random.random() * 1000 < 1:
                # ``with`` releases the lock even if logging raises.
                with self.lock:
                    self.solved += new_solved
                    self.logger.info_with_expire_time(
                        'Doing Similarity Matching for %s[%s]: %d/%d(%.2f%%) '
                        '--top_k=%d '
                        '--top_similarity_info=[%s] '
                        '--gene_length=%d '
                        '--candidates_num=%d' % (
                            name,
                            '-' if is_reverse else '+',
                            self.solved,
                            self.total,
                            self.solved * 100.0 / self.total,
                            self.top_k,
                            similarity_heap[0].get_similarity_str() if len(similarity_heap) > 0 else 'None',
                            gene_length,
                            len(candidates)
                        ),
                        self.solved,
                        self.total)
                new_solved = 0

            # Keep the candidate list bounded: once it grows past
            # CandidateClearSize only the best ``top_k`` are retained.
            if len(candidates) > CandidateClearSize:
                candidates.sort(key=lambda arg: -arg.weighted_similarity)
                candidates = candidates[:self.top_k]
        # Drain whatever is still buffered.
        while len(buff) > 0:
            update_candidate_list(None, buff, candidates, 1)
        with self.lock:
            # ``run`` budgets len(database) positions per strand while only
            # ``limitation`` windows exist; add the remaining
            # gene_length - 1 so the progress total is reached.
            self.solved += new_solved + gene_length - 1
        return candidates

    def render_similarity_for_candidates(self, gene, candidates):
        """Render the alignment sequences of every candidate.

        Args:
            gene: gene sequence the candidates were matched against.
            candidates: candidates to render.

        Returns:
            list: one list per candidate, starting with the candidate
            itself and followed by the values returned by
            ``render_target_dna_sequence`` for each algorithm whose weight
            is positive.
        """
        rendered = []
        for candidate in candidates:
            # Pick the strand the candidate was found on.
            if candidate.is_reverse:
                database = self.rev_dna_code
            else:
                database = self.dna_code

            row = [candidate]
            for position, algorithm in enumerate(MatchAlgorithm.get_all_items()):
                if not self.weighted[position] > 0:
                    continue
                sequences = self.render_target_dna_sequence(
                    algorithm, gene, database, candidate.original_match_left)
                row.extend(sequences)
            rendered.append(row)
        return rendered

    def render_target_dna_sequence(self, match_algorithm: MatchAlgorithm, gene, database, offset):
        """Render how ``gene`` lines up with ``database`` at ``offset``.

        Args:
            match_algorithm: algorithm whose alignment should be rendered.
            gene: gene sequence.
            database: sequence the gene was matched against.
            offset: index in ``database`` where the match window starts.

        Returns:
            tuple: ``(sequence_gene, sequence_target, sequence)`` -- three
            lists of single characters holding the gene row, the target
            row and the marker row (``'*'`` for a match, ``'.'`` or ``'-'``
            otherwise). All three are empty for an unknown algorithm.

        Raises:
            ValueError: if the text-distance table cannot be traced back.
        """
        sequence_gene = []
        sequence_target = []
        sequence = []
        tot = len(gene)
        if match_algorithm == MatchAlgorithm.text_distance:
            _, dp = compute_text_distance_similarity(gene, database, offset)
            # Walk the table back from the bottom-right corner, collecting
            # the alignment in reverse order.
            i, j = tot, tot
            while i > 0 or j > 0:
                gene_a, gene_b = gene[i - 1] if i > 0 else '.', database[j + offset - 1] if j > 0 else '.'
                if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + should_change(gene[i - 1],
                                                                                    database[j + offset - 1]):
                    sequence_gene.append(gene_a)
                    sequence_target.append(gene_b)
                    sequence.append('*' if should_change(gene[i - 1], database[j + offset - 1]) == 0 else '.')
                    i, j = i - 1, j - 1
                # The ``i > 0`` / ``j > 0`` guards keep the trace inside
                # the table: without them ``dp[i - 1]`` (or ``dp[i][j - 1]``)
                # wraps around to the last row (or column) once an index
                # has reached 0, which can send the walk to negative
                # indices.
                elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
                    sequence_gene.append(gene_a)
                    sequence_target.append('.')
                    sequence.append('.')
                    i -= 1
                elif j > 0 and dp[i][j] == dp[i][j - 1] + 1:
                    sequence_gene.append('.')
                    sequence_target.append(gene_b)
                    sequence.append('.')
                    j -= 1
                else:
                    raise ValueError('Should not go here!')
            sequence_gene.reverse()
            sequence_target.reverse()
            sequence.reverse()
        elif match_algorithm == MatchAlgorithm.direct_match:
            # Position-by-position comparison, no gaps.
            for i in range(tot):
                sequence_gene.append(gene[i])
                sequence_target.append(database[i + offset])
                if not should_change(gene[i], database[i + offset]):
                    sequence.append('*')
                else:
                    sequence.append('.')
        elif match_algorithm == MatchAlgorithm.consistency:
            _, score_queue, score_merge_idx = compute_consistency_similarity(gene, database, offset,
                                                                             self.patience)
            sequence_gene.extend(gene[:])
            sequence_target.extend(database[offset:offset + tot])
            cur_pos = 0
            # Each queue entry is (length of a matching run, end position
            # of that run). Gaps before runs whose index lies in
            # (score_merge_idx[0], score_merge_idx[1]] are marked '-',
            # all other gaps '.'.
            for idx, (same_cnt, same_end) in enumerate(score_queue):
                same_start = same_end - same_cnt
                while cur_pos < same_start:
                    if score_merge_idx[0] < idx <= score_merge_idx[1]:
                        sequence.append('-')
                    else:
                        sequence.append('.')
                    cur_pos += 1
                while cur_pos < same_end:
                    sequence.append('*')
                    cur_pos += 1
            while cur_pos < tot:
                sequence.append('.')
                cur_pos += 1
        elif match_algorithm == MatchAlgorithm.pattern:
            # Rendered exactly like a direct match.
            for i in range(tot):
                sequence_gene.append(gene[i])
                sequence_target.append(database[i + offset])
                if not should_change(gene[i], database[i + offset]):
                    sequence.append('*')
                else:
                    sequence.append('.')
        elif match_algorithm == MatchAlgorithm.blat:
            _, pos_data_end = compute_blat_similarity(gene, database, offset)
            pos_data = offset
            pos_gene = 0
            # NOTE(review): the constants 4, 7 and 3 below presumably assume
            # an 8-character gene split into two 4-character halves --
            # confirm against compute_blat_similarity.
            # Forward pass: place gene[0:4], inserting '-' in the gene row
            # wherever the target does not match.
            while pos_gene < 4:
                if should_change(gene[pos_gene], database[pos_data]) > 0:
                    sequence_gene.append('-')
                    sequence_target.append(database[pos_data])
                    sequence.append('.')
                    pos_data += 1
                else:
                    sequence_gene.append(gene[pos_gene])
                    sequence_target.append(database[pos_data])
                    sequence.append('*')
                    pos_gene += 1
                    pos_data += 1
            # Backward pass: place gene[7] down to gene[4], walking the
            # target leftwards from pos_data_end - 1.
            rev_pos_gene = 7
            rev_pos_data = pos_data_end - 1
            rev_sequence_gene = []
            rev_sequence_target = []
            rev_sequence = []
            while rev_pos_gene > 3:
                if should_change(gene[rev_pos_gene], database[rev_pos_data]) > 0:
                    rev_sequence_gene.append('-')
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('.')
                    rev_pos_data -= 1
                else:
                    rev_sequence_gene.append(gene[rev_pos_gene])
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('*')
                    rev_pos_gene -= 1
                    rev_pos_data -= 1
            # Whatever target text lies between the two halves is a gap.
            while pos_data <= rev_pos_data:
                sequence_gene.append('-')
                sequence_target.append(database[pos_data])
                sequence.append('.')
                pos_data += 1
            sequence_gene.extend(rev_sequence_gene[::-1])
            sequence_target.extend(rev_sequence_target[::-1])
            sequence.extend(rev_sequence[::-1])
        return sequence_gene, sequence_target, sequence
Esempio n. 11
0
class KeggAnalysis:
    # Looks up KEGG pathways for the entries of an input file, caching the
    # downloaded HTML pages on disk.

    # Text file with one entry per line (a gene name or a KEGG id,
    # depending on ``is_gene``).
    input_path: str
    # Directory holding the cached HTML pages; created if missing.
    download_directory: str
    # Directory that receives the result file and the error file.
    output_directory: str
    # True: entries are handled by ``work_for_gene``; False: entries are
    # handled by ``work_for_kegg``.
    is_gene: bool = True

    def __post_init__(self):
        """Create the download directory and derive the output paths.

        Both output files are placed in ``output_directory`` and share the
        prefix that ``StrConverter.extract_file_name`` returns for the base
        name of ``input_path``.
        """
        os.makedirs(self.download_directory, exist_ok=True)

        prefix = StrConverter.extract_file_name(
            os.path.basename(self.input_path))
        out_dir = self.output_directory
        self.kegg_result_path = os.path.join(
            out_dir, '%s_kegg_result.txt' % prefix)
        self.kegg_error_path = os.path.join(
            out_dir, '%s_kegg_error.txt' % prefix)

        self.logger = LoggerFactory()

    def run(self):
        """Process every line of ``input_path`` in a process pool.

        Each stripped line is handled by ``work_for_gene`` (when
        ``is_gene`` is true) or ``work_for_kegg``. The output lines of
        successful entries are written to ``kegg_result_path``; failed
        entries are written to ``kegg_error_path``. Progress is logged
        after every entry.
        """
        solved = 0
        failed = 0
        # Context managers close both output files even if a worker or a
        # write raises.
        with open(self.kegg_result_path, 'w') as fstd, \
                open(self.kegg_error_path, 'w') as ferr:
            with open(self.input_path) as f:
                datas = [data.strip() for data in f.readlines()]
            total = len(datas)

            with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
                func = self.work_for_gene if self.is_gene else self.work_for_kegg
                # imap yields the results in input order.
                for flag, outputs in p.imap(func, datas):
                    if flag:
                        fstd.write('\n'.join(outputs) + '\n')
                        fstd.flush()
                        solved += 1
                    else:
                        # On failure ``outputs`` is the entry itself.
                        ferr.write('%s\n' % outputs)
                        ferr.flush()
                        failed += 1
                    self.logger.info_with_expire_time(
                        "Completed %d/%d, success rate %d/%d=%.2f%%" %
                        (solved + failed, total, solved, solved + failed,
                         solved * 100.0 / (solved + failed)), solved + failed,
                        total)

    def work_for_gene(self, gene):
        """Collect the pathway lines of every KEGG id found for ``gene``.

        Args:
            gene: gene name to look up.

        Returns:
            tuple: ``(True, lines)`` where each line is ``gene``, a tab and
            one output line of ``work_for_kegg``; or ``(False, gene)`` when
            the id lookup or any pathway lookup fails.
        """
        try:
            outputs = []
            for kegg_id in self.get_kegg_id(gene):
                # work_for_kegg returns a single (flag, payload) pair.
                # Unpack it once; iterating over the pair itself tried to
                # unpack the boolean flag and always raised.
                flag, kegg_pathways = self.work_for_kegg(kegg_id)
                if not flag:
                    return False, gene
                for kegg_pathway in kegg_pathways:
                    outputs.append('%s\t%s' % (gene, kegg_pathway))
            return True, outputs
        except Exception:
            traceback.print_exc()
            return False, gene

    def work_for_kegg(self, kegg_id):
        """Build the output lines for one KEGG id.

        Args:
            kegg_id: KEGG id to look up with ``get_pathway``.

        Returns:
            tuple: ``(True, lines)`` on success. In gene mode there is a
            single line ``<id>\\t<pathways>``; otherwise there is one line
            ``<id>\\t<name>\\t<pathways>`` per name. Pathways are joined
            with ``'; '``. Any failure yields ``(False, kegg_id)``.
        """
        try:
            names, pathways = self.get_pathway(kegg_id)
            if self.is_gene:
                lines = ['%s\t%s' % (kegg_id, '; '.join(pathways))]
            else:
                lines = []
                for name in names:
                    lines.append(
                        '%s\t%s\t%s' % (kegg_id, name, '; '.join(pathways)))
            return True, lines
        except:
            return False, kegg_id

    def get_kegg_id(self, gene):
        """Return the KEGG ids found for ``gene``.

        The search page is downloaded (up to three attempts) into
        ``download_directory`` unless a cached copy already exists, then
        parsed with ``extract_kegg_id``. A cached page that yields no ids
        is deleted so it is fetched again next time.

        Args:
            gene: gene name; used verbatim in the URL and in the cache
                file name.

        Returns:
            The non-empty collection of ids returned by
            ``extract_kegg_id``.

        Raises:
            ValueError: if the page could not be downloaded or contains no
                KEGG id.
        """
        target_path = os.path.join(self.download_directory,
                                   'get_kegg_id_%s.html' % gene)
        if not os.path.exists(target_path):
            # NOTE(review): ``gene`` is not URL-quoted; names containing
            # spaces or reserved characters presumably need quoting --
            # confirm the expected input.
            url = "https://www.kegg.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&dbkey=kegg&keywords=" + gene
            for retry_time in range(3):
                try:
                    req = request.Request(url=url)
                    # The context manager closes the HTTP response, which
                    # was previously left open.
                    with request.urlopen(req, timeout=30) as response:
                        body = response.read()
                        response_headers = response.headers.items()
                    for header_name, header_value in response_headers:
                        if header_name.lower() == 'content-encoding' \
                                and header_value.lower() == 'gzip':
                            body = gzip.decompress(body)
                    body = body.decode('utf-8')
                    with open(target_path, 'w', encoding='utf8') as fw:
                        fw.write(body)
                    break
                except Exception:
                    traceback.print_exc()
                    self.logger.info("Retry %d for %s" % (retry_time, gene))
        if not os.path.exists(target_path):
            raise ValueError("Gene not found from web: %s" % gene)
        kegg_ids = self.extract_kegg_id(target_path)
        if len(kegg_ids) == 0:
            os.remove(target_path)
            raise ValueError("Gene extract failed: %s" % gene)
        return kegg_ids

    def extract_kegg_id(self, file_path):
        """Parse a cached search page and return the KEGG ids it contains.

        Args:
            file_path: path of the cached HTML page.

        Returns:
            The keys of the ``kegg_id_map`` built by ``KeggIdHTMLParser``.
        """
        # The cache is written as UTF-8, so it must be read as UTF-8 too;
        # relying on the platform default breaks on non-UTF-8 locales.
        with open(file_path, 'r', encoding='utf8') as f:
            body = f.read()
        parser = KeggIdHTMLParser()
        parser.feed(body)
        return parser.kegg_id_map.keys()

    def get_pathway(self, kegg_id):
        """Return the names and pathways listed for ``kegg_id``.

        The entry page is downloaded (up to three attempts) into
        ``download_directory`` unless a cached copy already exists, then
        parsed with ``extract_name_pathway``. When the page yields no
        names or no pathways the cached copy is deleted and placeholder
        values are returned instead.

        Args:
            kegg_id: KEGG id; used verbatim in the URL and in the cache
                file name.

        Returns:
            tuple: ``(names, pathways)``. An empty ``names`` is replaced by
            ``["Not Found"]`` and an empty ``pathways`` by
            ``["No Pathway"]``.

        Raises:
            ValueError: if the page could not be downloaded.
        """
        target_path = os.path.join(self.download_directory,
                                   'get_pathway_%s.html' % kegg_id)
        if not os.path.exists(target_path):
            url = "https://www.kegg.jp/dbget-bin/www_bget?ko:" + kegg_id
            for retry_time in range(3):
                try:
                    req = request.Request(url=url)
                    # The context manager closes the HTTP response, which
                    # was previously left open.
                    with request.urlopen(req, timeout=30) as response:
                        body = response.read()
                        response_headers = response.headers.items()
                    for header_name, header_value in response_headers:
                        if header_name.lower() == 'content-encoding' \
                                and header_value.lower() == 'gzip':
                            body = gzip.decompress(body)
                    body = body.decode('utf-8')
                    with open(target_path, 'w', encoding='utf8') as fw:
                        fw.write(body)
                    break
                except Exception:
                    traceback.print_exc()
                    self.logger.info("Retry %d for %s" % (retry_time, kegg_id))
        if not os.path.exists(target_path):
            raise ValueError("Kegg not found from web: %s" % kegg_id)
        names, pathways = self.extract_name_pathway(target_path)
        if len(pathways) == 0 or len(names) == 0:
            # Drop the incomplete page so it is fetched again next time.
            os.remove(target_path)
            if len(pathways) == 0:
                pathways = ["No Pathway"]
            if len(names) == 0:
                names = ["Not Found"]
        return names, pathways

    def extract_name_pathway(self, file_path):
        """Parse a cached entry page and return its names and pathways.

        Args:
            file_path: path of the cached HTML page.

        Returns:
            tuple: the ``names`` and ``pathways`` collected by
            ``KeggPathwayHTMLParser``.
        """
        # The cache is written as UTF-8, so it must be read as UTF-8 too;
        # relying on the platform default breaks on non-UTF-8 locales.
        with open(file_path, 'r', encoding='utf8') as f:
            body = f.read()
        parser = KeggPathwayHTMLParser()
        parser.feed(body)
        return parser.names, parser.pathways
Esempio n. 12
0
class GeneStreamAnalysis:
    """Report gene sequences with flanking regions, or genes around intervals.

    In ``'rna'`` mode every row of the tab-separated input file names a gene
    and its mapped start/end; the gene sequence plus up to ``limit`` bases on
    each side is written to the result file.  In ``'inter'`` mode every input
    line is a ``left,right`` interval and the closest gene segment on each
    side is written instead.

    NOTE(review): the class relies on ``__post_init__`` and annotated fields,
    which suggests a ``@dataclass`` decorator that is not visible here --
    confirm it is present in the real file.
    """
    # Path handed to GeneFileReader.
    data_path: str
    # RNA table ('rna' mode) or interval list ('inter' mode).
    input_path: str
    # Directory that receives the result file.
    output_directory: str
    # Either 'rna' or 'inter'; anything else makes run() raise ValueError.
    mode: str = 'rna'
    # Number of flanking bases reported on each side of a gene.
    limit: int = 200

    def __post_init__(self):
        """Derive the input/result paths and create the reader and logger."""
        self.inter_path = self.input_path if self.mode == 'inter' else None
        self.rna_path = self.input_path if self.mode == 'rna' else None
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        suffix = 'stream_%d' % self.limit if self.mode == 'rna' else 'gene'
        self.result_path = os.path.join(
            self.output_directory, '%s_%s_result.txt' % (file_prefix, suffix))
        self.gene_reader = GeneFileReader(self.data_path)
        self.logger = LoggerFactory()
        # Column name -> column index, and the inverse list, of the RNA table.
        self.headers = {}
        self.inv_headers = []

    def get_utr_between(self, first, second):
        """Return the DNA between segment ``first`` and segment ``second``.

        The slice runs from ``cds[1]`` of the first segment up to (not
        including) index ``cds[0] - 1`` of the second one.
        """
        left = self.gene_reader.gene_segments[first].cds[1]
        right = self.gene_reader.gene_segments[second].cds[0] - 1
        return self.gene_reader.dna_code[left:right]

    def work_for_gene_index(self, index, start, end):
        """Return ``(sequence, upstream, downstream)`` for one gene segment.

        Args:
            index: position of the segment in ``gene_reader.gene_segments``.
            start: mapped start position; ``start > end`` selects the
                reverse strand.
            end: mapped end position.

        Returns:
            The gene sequence and up to ``self.limit`` bases on each side.
            For ``start > end`` all three are reversed and passed through
            ``get_opposite_dna`` and the two flanks are swapped.

        Raises:
            AssertionError: if the segment's ``cds`` does not match
                ``start``/``end``.
        """
        gene_segment = self.gene_reader.gene_segments[index]
        cds_start, cds_end = gene_segment.cds[0], gene_segment.cds[1]
        assert cds_start == min(start, end)
        assert cds_end == max(start, end)
        dna = self.gene_reader.dna_code
        # cds looks 1-based and inclusive (hence the "- 1" on the left edge).
        seq = dna[cds_start - 1:cds_end]
        # max(..., 0) keeps the window inside the sequence near its start.
        upstream = dna[max(cds_start - self.limit - 1, 0):cds_start - 1]
        downstream = dna[cds_end:cds_end + self.limit]
        if start > end:
            seq = get_opposite_dna(seq[::-1])
            upstream, downstream = get_opposite_dna(
                downstream[::-1]), get_opposite_dna(upstream[::-1])
        return seq, upstream, downstream

    def work_for_gene(self, gene_idx, gene_name, start, end, fw):
        """Write the report of every segment registered for ``gene_name``.

        Args:
            gene_idx: number printed in front of the gene name.
            gene_name: gene name; anything from the first ``'->'`` on is
                dropped before the lookup.
            start: mapped start position.
            end: mapped end position.
            fw: writable text file receiving the report.
        """
        # Same result as the find()/index() pair: keep the part before '->'.
        gene_name = gene_name.split('->', 1)[0]
        if gene_name not in self.gene_reader.gene_name_segment_map:
            self.logger.info("%s not found in data" % gene_name)
            return
        fw.write('%d. %s\n' % (gene_idx, gene_name))
        segment_ids = self.gene_reader.gene_name_segment_map[gene_name]
        for cnt, idx in enumerate(segment_ids, start=1):
            segment = self.gene_reader.gene_segments[idx]
            seq, up, down = self.work_for_gene_index(idx, start, end)
            fw.write('%d)\n' % cnt)
            fw.write('position\t%d %s %d\n' %
                     (segment.cds[0], '->' if start < end else '<-',
                      segment.cds[1]))
            fw.write('product\t%s\n' % segment.product)
            fw.write('GeneID\t%s\n' % segment.gene_id)
            fw.write('stream\t%s\n' % seq)
            if up:
                fw.write('upstream\t%s\n' % up)
            if down:
                fw.write('downstream\t%s\n' % down)
            fw.write('\n')

    def check_inter(self, fw):
        """Write, for every ``left,right`` line, the nearest gene on each side.

        "up" is the segment with the largest ``max(cds)`` below ``left`` and
        "down" the one with the smallest ``min(cds)`` above ``right``.

        Args:
            fw: writable text file receiving the report.
        """
        # The context manager closes the interval file, which the former
        # bare open() never did.
        with open(self.inter_path, 'r', encoding='utf8') as fr:
            for line in fr:
                line = line.strip()
                if line == '':
                    continue
                left, right = map(int, line.split(','))
                up, down = None, None
                for gene_segment in self.gene_reader.gene_segments:
                    if max(gene_segment.cds) < left:
                        if not up or max(up.cds) < max(gene_segment.cds):
                            up = gene_segment
                    if min(gene_segment.cds) > right:
                        if not down or min(down.cds) > min(gene_segment.cds):
                            down = gene_segment
                fw.write('%s:\n' % line)
                if up:
                    fw.write(
                        'up-gene\t%s\nup-position\t%s\nup-product\t%s\n' %
                        (up.gene, '-'.join(map(str, up.cds)), up.product))
                if down:
                    fw.write(
                        'down-gene\t%s\ndown-position\t%s\ndown-product\t%s\n'
                        % (down.gene, '-'.join(map(str, down.cds)),
                           down.product))
                fw.write('\n')

    def generate_header(self, items):
        """Register the tab-separated column names of the header line."""
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)

    def run(self):
        """Build the gene information and write the result file.

        Raises:
            ValueError: if ``self.mode`` is neither ``'rna'`` nor ``'inter'``.
        """
        self.gene_reader.build_information()
        with open(self.result_path, 'w', encoding='utf8') as fw:
            if self.mode == 'rna':
                with open(self.rna_path, 'r', encoding='utf8') as fr:
                    lines = fr.readlines()
                if not lines:
                    # Nothing to do: not even a header line.
                    self.logger.info("%s is empty" % self.rna_path)
                    return
                self.generate_header(lines[0])
                for gene_idx, line in enumerate(lines[1:]):
                    # Blank lines (typically a trailing newline) used to
                    # abort the whole run with an IndexError.
                    if line.strip() == '':
                        continue
                    items = line.split('\t')
                    gene_name = items[self.headers['gene']]
                    start = int(items[self.headers['map_start_pos']])
                    end = int(items[self.headers['map_end_pos']])
                    self.work_for_gene(gene_idx, gene_name.strip(), start, end,
                                       fw)
            elif self.mode == 'inter':
                self.check_inter(fw)
            else:
                raise ValueError(self.mode)
Esempio n. 13
0
class ClusterMatcher:
    """Cluster the records of an alignment-style text file by sequence.

    Records start with a line beginning with ``'>>'``.  Records whose middle
    sequence is identical (case-insensitively) end up in the same cluster;
    four result files are written by ``compare_data``.

    NOTE(review): the class relies on ``__post_init__`` and annotated fields,
    which suggests a ``@dataclass`` decorator that is not visible here --
    confirm it is present in the real file.
    """
    # Prefix that marks the first sequence line of each alignment group;
    # also used to derive the result file names.
    rna_tag: str
    # File containing the '>>' records.
    input_path: str
    # Directory for the result files.  NOTE(review): the default None makes
    # os.path.join in __post_init__ fail -- callers must pass a directory.
    output_directory: str = None

    def __post_init__(self):
        """Create the logger and derive the four result paths."""
        self.logger = LoggerFactory(1)
        file_prefix = StrConverter.extract_file_name(self.rna_tag)
        self.cluster_result_path = os.path.join(
            self.output_directory, '%s_cluster_result.txt' % file_prefix)
        self.sample_result_path = os.path.join(
            self.output_directory, '%s_sample_result.txt' % file_prefix)
        self.all_result_path = os.path.join(self.output_directory,
                                            '%s_all_result.txt' % file_prefix)
        self.only_result_path = os.path.join(
            self.output_directory, '%s_only_result.txt' % file_prefix)

    def run(self):
        """Read the input records and write the cluster result files."""
        os.makedirs(self.output_directory, exist_ok=True)
        data = self.read_data()
        self.logger.info('data size = %d' % len(data))
        self.compare_data(data)

    def format_data(self, index, lines):
        """Turn the raw lines of one record into a dictionary.

        Args:
            index: 1-based number of the record, used in log messages.
            lines: all lines of the record; the fourth one is handed to
                ``should_ignore``.

        Returns:
            ``None`` for a record that is ignored without a gene number,
            otherwise a dict with the keys ``ignored``, ``index``, ``geneNo``
            and, for kept records, ``data`` (three sequence dicts) and
            ``others`` (the first four lines).
        """
        ig, gene_no = self.should_ignore(index, lines[3])
        if ig and not gene_no:
            return None
        dic = {'ignored': ig, 'index': index, 'geneNo': gene_no}
        if dic['ignored']:
            return dic

        data = [{}, {}, {}]
        # Small state machine: a line starting with rna_tag fills data[0],
        # the next non-blank line data[1], the one after that data[2].
        action = 0
        others = lines[:4]
        for line in lines[4:]:
            if line.strip() == '':
                continue
            if line.strip().startswith(self.rna_tag):
                action = 1
                self.update_sequence(index, data[0], line)
            elif action == 1:
                action = 2
                self.update_sequence(index, data[1], line)
            elif action == 2:
                action = 0
                self.update_sequence(index, data[2], line)
            else:
                action = 0
        dic['data'] = data
        dic['others'] = others
        return dic

    def read_data(self):
        """Split the input file into records and format each of them.

        Returns:
            The list of dictionaries returned by ``format_data``; records for
            which it returned ``None`` are left out.
        """
        data = []
        buff = []
        index = 0
        with open(self.input_path, 'r') as fr:
            for line in fr:
                if line.startswith('>>'):
                    if len(buff) > 0:
                        index += 1
                        dic = self.format_data(index, buff)
                        if dic:
                            data.append(dic)
                    buff = []
                buff.append(line)
        if len(buff) > 0:
            index += 1
            dic = self.format_data(index, buff)
            # The last record needs the same filter as the others: a None
            # entry would make compare_data fail on data[i]['ignored'].
            if dic:
                data.append(dic)
        return data

    def compare_data(self, data):
        """Cluster ``data`` by middle sequence and write the result files.

        Args:
            data: list of dictionaries as returned by ``read_data``.
        """
        same = {}
        cluster = {}
        si = len(data)
        sample_cluster = {}
        for i in range(si):
            if i in same or data[i]['ignored']:
                continue
            same[i] = i
            cluster[i] = [data[i]['geneNo']]
            sample_cluster[i] = [data[i]]
            for j in range(i + 1, si):
                if j in same or data[j]['ignored']:
                    continue
                if data[i]['data'][1]['seq'].upper(
                ) == data[j]['data'][1]['seq'].upper():
                    same[j] = i
                    cluster[i].append(data[j]['geneNo'])
                    sample_cluster[i].append(data[j])
        # One line per cluster: its size and the gene numbers it contains.
        with open(self.cluster_result_path, 'w') as fw:
            for gene_numbers in cluster.values():
                fw.write('%d\t%s\n' %
                         (len(gene_numbers), ','.join(map(str, gene_numbers))))
        # The first record of every cluster serves as its sample.
        with open(self.sample_result_path, 'w') as fw:
            for members in sample_cluster.values():
                sample = members[0]
                fw.write(''.join(sample['others']))
                fw.write('\n')
                for elem in sample['data']:
                    fw.write('%19s %8s %131s %8s\n' %
                             (elem.get('name', ''), elem.get('start', ''),
                              elem.get('seq', ''), elem.get('end', '')))
                fw.write('\n')
        plain_bases = set('AUCG')
        other = set()
        with open(self.all_result_path, 'w') as fw_all, \
                open(self.only_result_path, 'w') as fw_only:
            for members in sample_cluster.values():
                for item in members:
                    elem = item['data'][-1]
                    # flag stays True only when the sequence consists of
                    # A/U/C/G exclusively; other characters are collected.
                    flag = True
                    for x in elem['seq'].strip():
                        if x.upper() in plain_bases:
                            continue
                        other.add(x.upper())
                        flag = False
                    entry = '>%s/%s-%s\n%s\n' % (elem['name'], elem['start'],
                                                 elem['end'],
                                                 elem['seq'].upper())
                    fw_all.write(entry)
                    fw_all.write('\n')
                    if not flag:
                        continue
                    fw_only.write(entry)
                    fw_only.write('\n')
                fw_all.write('\n')
                fw_only.write('\n')
        self.logger.info('\n'.join(list(other)))

    def should_ignore(self, index, line):
        """Decide from a record's status line whether the record is skipped.

        Args:
            index: number of the record, used in the log message.
            line: whitespace-separated status line; the first token is the
                gene number in parentheses, the second the status mark.

        Returns:
            ``(False, gene_no)`` when the mark is ``'?'`` or ``'!'``,
            otherwise ``(True, None)``.
        """
        info = re.split(r'\s+', line.strip())
        gene_no = info[0].strip('()')
        # A line without a second token has no mark and is ignored as well
        # (it used to raise IndexError).
        if len(info) > 1 and info[1] in ('?', '!'):
            return False, gene_no
        self.logger.info('ignore check failed: %d' % index)
        return True, None

    def update_sequence(self, index, elem, line):
        """Merge one alignment line into the sequence dictionary ``elem``.

        A line whose last character is not in
        ``ExperimentConfig.SET_NUMBER_RANGE10`` is appended to ``elem['seq']``
        as is.  Otherwise the line is split into
        ``name start <sequence...> end``; ``start`` is kept from the first
        such line, ``end`` from the last.

        Args:
            index: number of the record, used in log messages.
            elem: dictionary updated in place.
            line: non-blank line of the record.

        The process exits with status 1 when the line cannot be parsed or
        when the name differs from the one already stored in ``elem``.
        """
        stripped = line.strip()
        if stripped[-1] not in ExperimentConfig.SET_NUMBER_RANGE10:
            elem['seq'] = elem.get('seq', '') + stripped
            return
        try:
            info = re.split(r'\s+', stripped)
            name = info[0]
            start = int(info[1])
            end = int(info[-1])
            seq = ' '.join(info[2:-1])
        except (IndexError, ValueError):
            # Too few tokens, or positions that are not integers.
            self.logger.info('value num is not ok: %d, %s' % (index, line))
            sys.exit(1)
        if elem.get('name', name) != name:
            self.logger.info('name is not equal: %d' % index)
            sys.exit(1)
        elem['name'] = name
        if 'start' not in elem:
            elem['start'] = start
        elem['end'] = end
        elem['seq'] = elem.get('seq', '') + seq
Esempio n. 14
0
class EcocycAnalysis:
    """Download EcoCyc/BioCyc pages for genes and tabulate what they contain.

    The input is either a tab-separated gene table (``from_gene_names``) or
    an HTML file with links to gene pages.  Downloaded pages are cached in
    ``download_directory``; results and failures are written to two text
    files in ``output_directory``.

    NOTE(review): the class relies on ``__post_init__`` and annotated fields,
    which suggests a ``@dataclass`` decorator that is not visible here --
    confirm it is present in the real file.
    """
    # Gene table or HTML link list, depending on ecocyc_params.
    input_path: str
    # Cache directory for downloaded pages.
    download_directory: str
    # Directory for the result and error files.
    output_directory: str
    # Options; the keys read in __post_init__ are mandatory.
    ecocyc_params: dict
    # Optional Cookie header value sent with the retry requests.
    cookie: str = None

    def __post_init__(self):
        """Unpack ``ecocyc_params`` and derive the result/error paths."""
        self.from_gene_names = self.ecocyc_params['from_gene_names']
        self.output_best_promoter = self.ecocyc_params['output_best_promoter']
        self.output_detail_information = self.ecocyc_params[
            'output_detail_information']
        self.analysis_promoter = self.ecocyc_params['analysis_promoter']
        self.if_get_summary = self.ecocyc_params['if_get_summary']
        self.if_get_go_table = self.ecocyc_params['if_get_go_table']
        self.sequence_start_idx = None
        self.sequence_end_idx = None
        # Column name -> column index, and the inverse list, of the input.
        self.headers = {}
        self.inv_headers = []

        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.ecocyc_result_path = os.path.join(
            self.output_directory, '%s_ecocyc_result.txt' % file_prefix)
        self.ecocyc_error_path = os.path.join(
            self.output_directory, '%s_ecocyc_error.txt' % file_prefix)
        self.logger = LoggerFactory()

    def run(self):
        """Run the analysis selected by ``from_gene_names``."""
        if self.from_gene_names:
            self.work_from_gene_list_file()
        else:
            self.work_from_url_list_file()

    def generate_header(self, items):
        """Register the tab-separated column names of the header line."""
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)
        self.sequence_end_idx = self.headers.get('gene_start_pos')
        self.sequence_start_idx = self.headers.get('promoter_pos')

    @staticmethod
    def _percent(done, total):
        """Return ``done`` as a percentage of ``total`` (0.0 if it is 0)."""
        return done * 100.0 / total if total else 0.0

    def work_from_gene_list_file(self):
        """Analyse every gene of the tab-separated input table.

        The first non-blank line is the header; it is copied to the result
        file.  Each following row produces one result line; a row whose
        processing fails produces ``<gene>\\tNot Found`` and an entry in the
        error file instead.
        """
        solve_cnt = 0
        succ_cnt = 0
        fail_cnt = 0
        fail_json_cnt = 0
        with open(self.ecocyc_error_path, 'w', encoding='utf8') as fw_error, \
                open(self.ecocyc_result_path, 'w',
                     encoding='utf8') as fw_result:
            with open(self.input_path, 'r', encoding='utf8') as fr:
                gene_items = [row for row in fr if row.strip() != '']
            if not gene_items:
                # Not even a header line: nothing to analyse.
                self.logger.info('No gene found in %s' % self.input_path)
                return
            fw_result.write(gene_items[0])
            self.generate_header(gene_items[0])
            total_cnt = len(gene_items) - 1
            self.logger.info_with_expire_time(
                'Ecocyc analysis %d/%d=%.2f%%' %
                (solve_cnt, total_cnt, self._percent(solve_cnt, total_cnt)),
                solve_cnt, total_cnt)
            for line in gene_items[1:]:
                # Reset per row so that the failure handler never sees the
                # values of a previous row.
                result = {}
                gene_name = None
                try:
                    infos = line.strip().split('\t')
                    for idx, info in enumerate(infos):
                        result[self.inv_headers[idx]] = info
                    gene_name = result['gene']
                    if gene_name.find('->') > 0:
                        # "a->b": look up "a", keep "b" as the listed gene.
                        gene_name, result['gene'] = result['gene'].split('->')
                    self.write_body(gene_name=gene_name)
                    ecocyc_id = self.get_ecocyc_id(prefix='gene_',
                                                   gene_name=gene_name)
                    result['ecocyc_id'] = ecocyc_id
                    self.write_body(ecocyc_id=ecocyc_id, page_type="tu")
                    self.analysis_xml(prefix='tu_',
                                      ecocyc_id=ecocyc_id,
                                      result=result)
                    if self.if_get_summary:
                        self.write_body(ecocyc_id=ecocyc_id,
                                        page_type="summary")
                        self.analysis_xml(prefix='summary_',
                                          ecocyc_id=ecocyc_id,
                                          result=result)
                    if self.analysis_promoter:
                        flag_json = self.write_body(ecocyc_id=ecocyc_id,
                                                    page_type="promoter")
                        if flag_json:
                            self.analysis_json(prefix='promoter_',
                                               ecocyc_id=ecocyc_id,
                                               result=result,
                                               gene_name=result['gene'])
                        if not flag_json:
                            fail_json_cnt += 1
                    if self.if_get_go_table:
                        self.write_body(ecocyc_id=ecocyc_id, page_type='go')
                        self.analysis_xml(prefix='go_',
                                          ecocyc_id=ecocyc_id,
                                          result=result)
                    if result['gene'] != gene_name:
                        result['gene'] = gene_name + '->' + result['gene']
                    fw_result.write(self.extract_output(result) + '\n')
                    fw_result.flush()
                    succ_cnt += 1
                except Exception:
                    # Fall back to the raw row when the gene column could not
                    # even be read, instead of failing inside the handler.
                    fw_result.write('%s\tNot Found\n' %
                                    result.get('gene', line.strip()))
                    traceback.print_exc()
                    failed_name = (gene_name
                                   if gene_name is not None else line.strip())
                    fw_error.write(failed_name + '\n')
                    fw_error.flush()
                    fail_cnt += 1
                solve_cnt += 1
                self.logger.info_with_expire_time(
                    'Ecocyc analysis %d/%d=%.2f%%, success_cnt=%d, fail_cnt=%d, json_download_fail=%d'
                    % (solve_cnt, total_cnt,
                       self._percent(solve_cnt, total_cnt), succ_cnt, fail_cnt,
                       fail_json_cnt), solve_cnt, total_cnt)

    def extract_output(self, result):
        """Return the tab-separated result line for one gene.

        The columns follow ``self.inv_headers``.  Missing promoter columns
        are filled from ``result['table_unites']`` when possible; ``result``
        is updated in place while doing so.
        """
        output = []
        for name in self.inv_headers:
            if name == 'product_type':
                # The last non-empty product kind wins.
                for key in [
                        'enzyme', 'rna', 'protein', 'polypeptide',
                        'function when intact', 'transporter'
                ]:
                    if result.get(key, '') != '':
                        result['product_type'] = key
                        result['product'] = result[key]
            elif result.get(name, '') in ['', 'Not Found']:
                try:
                    if name in [
                            'status', 'promoter_name', 'promoter_pos',
                            'gene_start_pos'
                    ]:
                        if result['table_unites'][0] == 'Not Found':
                            if name == 'status':
                                result['status'] = 'Not Found'
                        else:
                            promoter = result['table_unites'][1]
                            result['status'] = 'Found'
                            result['gene_start_pos'] = result['table_unites'][
                                0]
                            result[
                                'promoter_name'] = promoter.get_promoter_name(
                                )
                            result[
                                'promoter_pos'] = promoter.get_promoter_start_site(
                                    int_pos=True)
                except Exception:
                    # Best effort: without usable promoter data the column
                    # simply stays empty.
                    pass
            output.append(str(result.get(name, '')))
        return '\t'.join(output)

    def work_from_url_list_file(self):
        """Analyse every gene page linked from the HTML input file.

        Each successfully processed link appends the line built by
        ``format_result_json`` to the result file; a failing link is written
        to the error file as ``url\\tmock_name\\ttitle``.
        """
        solve_cnt = 0
        succ_cnt = 0
        fail_cnt = 0
        fail_json_cnt = 0
        with open(self.ecocyc_error_path, 'w', encoding='utf8') as fw_error, \
                open(self.ecocyc_result_path, 'w',
                     encoding='utf8') as fw_result:
            items = self.extract_urls_from_file()
            total_cnt = len(items)
            self.logger.info_with_expire_time(
                'Ecocyc analysis %d/%d=%.2f%%' %
                (solve_cnt, total_cnt, self._percent(solve_cnt, total_cnt)),
                solve_cnt, total_cnt)
            for url, mock_name, title in items:
                try:
                    result = {}
                    self.write_body(url=url, mock_name=mock_name)
                    # write_body stored the page as url_<mock_name>.html, so
                    # mock_name is the file stem analysis_xml has to open.
                    ecocyc_id = self.analysis_xml(prefix='url_',
                                                  ecocyc_id=mock_name,
                                                  result=result)
                    flag_json = False
                    if ecocyc_id is not None:
                        result['ecocyc_id'] = ecocyc_id
                        flag_json = self.write_body(ecocyc_id=ecocyc_id,
                                                    page_type="promoter")
                        if flag_json:
                            self.analysis_json(prefix='promoter_',
                                               ecocyc_id=ecocyc_id,
                                               result=result,
                                               gene_name=result['gene'])
                    if not flag_json:
                        fail_json_cnt += 1
                    temp = self.format_result_json(result, fw_error)
                    if temp.strip() == '':
                        raise ValueError('No result found')
                    fw_result.write(temp)
                    fw_result.flush()
                    succ_cnt += 1
                except Exception:
                    traceback.print_exc()
                    fw_error.write(url + '\t' + mock_name + '\t' + title +
                                   '\n')
                    fw_error.flush()
                    fail_cnt += 1
                solve_cnt += 1
                self.logger.info_with_expire_time(
                    'Ecocyc analysis %d/%d=%.2f%%, success_cnt=%d, fail_cnt=%d, json_download_fail=%d'
                    % (solve_cnt, total_cnt,
                       self._percent(solve_cnt, total_cnt), succ_cnt, fail_cnt,
                       fail_json_cnt), solve_cnt, total_cnt)

    def extract_urls_from_file(self):
        """Return the ``ecocycs`` collected by ``UrlHTMLParser`` from the input."""
        with open(self.input_path, 'r', encoding='utf8') as fr:
            body = ''.join(fr.readlines())
        parser = UrlHTMLParser()
        parser.feed(body)
        return parser.ecocycs

    def write_body(self,
                   url=None,
                   mock_name=None,
                   ecocyc_id=None,
                   gene_name=None,
                   page_type="tu"):
        """Make sure a page is present in the download cache.

        Exactly one of ``url`` (with ``mock_name``), ``gene_name`` or
        ``ecocyc_id`` (with ``page_type``) selects the page; they are checked
        in that order.

        Args:
            url: address of a gene page to download.
            mock_name: file stem used for the page given by ``url``.
            ecocyc_id: EcoCyc identifier of the gene.
            gene_name: gene name for the substring search page.
            page_type: ``"tu"``, ``"promoter"``, ``"summary"`` or ``"go"``;
                only used together with ``ecocyc_id``.

        Returns:
            ``True`` when the cached file exists or was downloaded, ``False``
            when every attempt failed.

        Raises:
            ValueError: if no selector is given or ``page_type`` is unknown.
        """
        if url is not None:
            urls = [url]
            origin_path = os.path.join(self.download_directory,
                                       mock_name + '.html')
            file_path = os.path.join(self.download_directory,
                                     'url_' + mock_name + '.html')
            # Files cached under the old, unprefixed name are renamed.
            self.transform_file(origin_path, file_path)
        elif gene_name is not None:
            urls = [
                'http://ecocyc.org/ECOLI/substring-search?type=GENE&object=%s&geneSearch=Gene+Search'
                % gene_name
            ]
            origin_path = os.path.join(self.download_directory,
                                       gene_name + '.html')
            file_path = os.path.join(self.download_directory,
                                     'gene_' + gene_name + '.html')
            self.transform_file(origin_path, file_path)
        elif ecocyc_id is not None:
            if page_type == "tu":
                urls = [
                    'https://ecocyc.org/gene?orgid=ECOLI&id=%s#tab=TU' %
                    ecocyc_id
                ]
                origin_path = os.path.join(self.download_directory,
                                           ecocyc_id + '.html')
                file_path = os.path.join(self.download_directory,
                                         'tu_' + ecocyc_id + '.html')
                self.transform_file(origin_path, file_path)
            elif page_type == "promoter":
                urls = [
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=-1_NO-PLOC_%s.wg'
                    % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=-1_NO-INDEX_NO-PLOC_%s.wg'
                    % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=1_NO-INDEX_NO-PLOC_%s.wg'
                    % ecocyc_id,
                    'https://ecocyc.org/tmp/ptools-images/ECOLI/TU_dir=1_topdir=1_NO-PLOC_%s.wg'
                    % ecocyc_id
                ]
                origin_path = os.path.join(self.download_directory,
                                           ecocyc_id + '.json')
                file_path = os.path.join(self.download_directory,
                                         'promoter_' + ecocyc_id + '.json')
                self.transform_file(origin_path, file_path)
            elif page_type == "summary":
                urls = [
                    'https://biocyc.org/gene-tab?id=%s&orgid=ECOLI&tab=SUMMARY'
                    % ecocyc_id
                ]
                file_path = os.path.join(self.download_directory,
                                         'summary_' + ecocyc_id + '.html')
            elif page_type == "go":
                urls = [
                    'https://biocyc.org/gene-tab?id=%s&orgid=ECOLI&tab=GO' %
                    ecocyc_id
                ]
                file_path = os.path.join(self.download_directory,
                                         'go_' + ecocyc_id + '.html')
            else:
                # Used to surface as an UnboundLocalError further down.
                raise ValueError('Unknown page_type: %s' % page_type)
        else:
            raise ValueError('Parameter not correct')
        if os.path.exists(file_path):
            return True
        # NOTE(review): "Host" is fixed to ecocyc.org although some of the
        # URLs above point to biocyc.org -- confirm the servers accept this.
        headers = {
            "Host": "ecocyc.org",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            "Accept": "*/*",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            'Accept-Encoding': "gzip, deflate, br",
            'Connection': "Keep-Alive",
        }
        if self.cookie is not None:
            # A None header value cannot be sent and made every request that
            # uses these headers fail.
            headers['Cookie'] = self.cookie
        for retry_time in range(3):
            flag = False
            for url in urls:
                try:
                    if retry_time == 0:
                        # First round: plain request without extra headers.
                        with request.urlopen(url, timeout=30) as x:
                            body = x.read().decode('utf8')
                        with open(file_path, 'w', encoding='utf8') as fw:
                            fw.write(body)
                        flag = True
                        break
                    elif retry_time == 1:
                        # Second round: one request for the REG-SUMMARY
                        # resource whose answer is discarded.
                        # NOTE(review): presumably this makes the server
                        # generate the temporary files fetched in the third
                        # round; ecocyc_id may be None here -- confirm.
                        url = "https://biocyc.org/tmp/ptools-images/ECOLI/%s_REG-SUMMARY.wg" % ecocyc_id
                        req = request.Request(url=url, headers=headers)
                        with request.urlopen(req, timeout=30) as x:
                            body = x.read()
                        break
                    else:
                        # Third round: browser-like headers, gzip aware.
                        req = request.Request(url=url, headers=headers)
                        with request.urlopen(req, timeout=30) as x:
                            body = x.read()
                            content_encoding = x.headers.get(
                                'Content-Encoding') or ''
                        if content_encoding.lower() == 'gzip':
                            body = gzip.decompress(body)
                        body = body.decode('utf-8')
                        with open(file_path, 'w', encoding='utf8') as fw:
                            fw.write(body)
                        flag = True
                        break
                except Exception:
                    # Try the next candidate URL.
                    continue
            if flag:
                break
        return flag

    def analysis_xml(self, prefix, ecocyc_id, result):
        """Parse a cached HTML page and store what it contains in ``result``.

        Args:
            prefix: cache file prefix; ``'summary_'`` and ``'go_'`` select
                the summary and GO-table parsers, any other value the
                general gene page parser.
            ecocyc_id: file stem following the prefix.
            result: dictionary updated in place.

        Returns:
            The ``ecocyc_id`` found by the general parser, or ``None`` for
            the summary and GO pages.
        """
        xml_path = os.path.join(self.download_directory,
                                prefix + ecocyc_id + '.html')
        with open(xml_path, 'r', encoding='utf8') as fr:
            body = fr.read()
        if prefix == 'summary_':
            parser = EcocycHTMLParser(do_extract_summary=True)
            parser.feed(body)
            result['summary'] = parser.extract_attr['summary']
            return None
        if prefix == 'go_':
            parser = GoHTMLParser()
            parser.feed(body)
            result['go'] = ';'.join(
                ['%s=%s' % (k, v) for k, v in parser.go_table])
            return None
        parser = EcocycHTMLParser()
        parser.feed(body)
        for k, v in parser.extract_attr.items():
            if k == 'map position':
                result['map_start_pos'] = v[0]
                result['map_end_pos'] = v[1]
            elif v is not None:
                # strip() removes any leading/trailing '_' and '#' characters.
                result[k] = v.strip('__#####__')
        return parser.ecocyc_id

    def analysis_json(self, prefix, ecocyc_id, result, gene_name=None):
        """Read a cached promoter JSON file into ``result['table_unites']``.

        With ``output_best_promoter`` the stored value is
        ``[near_gene_pos, target_promoter]`` for the link matching
        ``gene_name``, or ``['Not Found']``; otherwise it is whatever
        ``get_all_promoters`` returns for all links.

        Args:
            prefix: cache file prefix.
            ecocyc_id: file stem following the prefix.
            result: dictionary updated in place.
            gene_name: gene whose promoter is looked for.
        """
        json_path = os.path.join(self.download_directory,
                                 prefix + ecocyc_id + '.json')
        # write_body stores the file as UTF-8.
        with open(json_path, 'r', encoding='utf8') as fr:
            body = json.load(fr)
        data = []
        target_gene = None
        for link in body['links']:
            gene_tu = GeneTUInfo(link)
            if self.output_best_promoter and gene_name is not None:
                if gene_tu.is_gene(gene_name):
                    target_gene = gene_tu
            data.append(gene_tu)
        if self.output_best_promoter:
            flag = False
            if target_gene is not None:
                target_promoter, near_gene_pos = get_target_promoter(
                    target_gene, data)
                if target_promoter is not None:
                    data = [near_gene_pos, target_promoter]
                    flag = True
            if not flag:
                data = ['Not Found']
        else:
            data = get_all_promoters(data, True)
        result['table_unites'] = data

    def get_ecocyc_id(self, prefix, gene_name):
        """Return the EcoCyc id found in the cached search page of a gene.

        Args:
            prefix: cache file prefix.
            gene_name: gene name, also the file stem following the prefix.

        Raises:
            RuntimeError: if the parser found no id.
        """
        xml_path = os.path.join(self.download_directory,
                                prefix + gene_name + '.html')
        # write_body stores the file as UTF-8.
        with open(xml_path, 'r', encoding='utf8') as fr:
            body = fr.read()

        parser = EcocycHTMLParser(do_extract_id=True, gene_name=gene_name)
        parser.feed(body)
        if parser.ecocyc_id is None:
            raise RuntimeError('Ecocyc id is None, parse error for %s' %
                               gene_name)
        return parser.ecocyc_id

    @staticmethod
    def transform_file(original_path, new_path):
        """Rename ``original_path`` to ``new_path`` unless the latter exists."""
        if not os.path.exists(new_path) and os.path.exists(original_path):
            os.rename(original_path, new_path)

    def format_result_json(self, result, fw_error=None):
        """Return the tab-separated, newline-terminated line for ``result``.

        The columns are gene, cluster, product type, product, location and
        the promoter information taken from ``result['table_unites']``.

        Args:
            result: dictionary filled by the ``analysis_*`` methods.
            fw_error: optional writable file; with ``output_best_promoter``
                it receives the line whenever no best promoter is available.
        """
        keys = ['gene', 'cluster']
        info = [result.get(key, '') for key in keys]
        product_type = ''
        product = ''
        # The last non-empty product kind wins.
        for key in [
                'rna', 'protein', 'polypeptide', 'enzyme',
                'function when intact', 'transporter'
        ]:
            val = result.get(key, '')
            if val is None or val == '':
                continue
            product_type = key
            product = val
        info.extend([product_type, product])
        for key in ['location']:  # , 'reaction']:
            val = result.get(key, '')
            if val is None:
                val = ''
            info.append(val)
        table_unites = result.get('table_unites', [])
        if self.output_best_promoter and len(table_unites) == 2 and type(
                table_unites[0]) is int:
            near_gene_pos, promoter = table_unites
            info.extend(['Gene Start Position', near_gene_pos])
            info.extend([
                promoter.get_promoter_name(),
                promoter.get_promoter_start_site(int_pos=True)
            ])
            info = list(map(str, info))
        else:
            for promoter in table_unites:
                info.extend([
                    promoter.get_promoter_name(),
                    promoter.get_promoter_start_site()
                ])
            info = list(map(str, info))
            if self.output_best_promoter and fw_error is not None:
                fw_error.write('\t'.join(info) + '\n')
        return '\t'.join(info) + '\n'
Esempio n. 15
0
class NeighborAnalysis:
    """Download gene data for a set of intervals and report neighbouring genes.

    ``run`` reads the work items from ``input_path``, writes one record per
    item to the neighbour result file, then summarises the nearest gene and
    the source of every record into count files.

    NOTE(review): the fields are declared dataclass-style and the setup lives
    in ``__post_init__``, but no ``@dataclass`` decorator is visible here --
    confirm it is present in the original file.
    """
    # Input file; run() turns every line starting with '>' into a work item,
    # and its base name is used as the prefix of all output file names.
    input_path: str
    # Directory holding the per-key downloaded files ('<key>.txt').
    download_directory: str
    # Directory receiving the result files and the 'error' sub-directory.
    output_directory: str
    # Number of whitespace-separated words of a source name kept by
    # get_prefix(); values <= 0 keep the source unchanged.
    keep_prefix_num: int = 1

    def __post_init__(self):
        """Create the logger, derive all output paths and the error directory.

        Every output file name is prefixed with the name extracted from the
        base name of ``input_path``. The ``error`` sub-directory of
        ``output_directory`` is created if it does not exist yet.
        """
        self.logger = LoggerFactory(3)

        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.neighbor_result_path = os.path.join(
            self.output_directory, '%s_neighbor_result.txt' % file_prefix)
        self.next_gene_result_path = os.path.join(
            self.output_directory, '%s_next_neighbor_result.txt' % file_prefix)
        self.source_count_path = os.path.join(
            self.output_directory, '%s_source_count_result.txt' % file_prefix)
        self.gene_count_path = os.path.join(
            self.output_directory, '%s_gene_count_result.txt' % file_prefix)

        error_directory = os.path.join(self.output_directory, 'error')
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test when several runs share the directory.
        os.makedirs(error_directory, exist_ok=True)
        self.error_result_path_prefix = os.path.join(
            error_directory, '%s_error_result' % file_prefix)

    def download_and_analysis(self, key, inter, file_path):
        self.logger.info('\nstart working for ' + key)
        try:
            if not os.path.exists(file_path):
                if not DataDownloadTool.download_data(key, file_path):
                    return False, None
            flag, data = self.analysis_download_file(file_path, inter)
            if flag:
                return True, data
        except:
            traceback.print_exc()
            return False, None
        return False, None

    def find_neighbor_batch(self, datas, iteration_time):
        """Process every work item once and return the ones that failed.

        Successful items are appended as a record to
        ``self.neighbor_result_path``; failed items are listed in the error
        file of this iteration and their downloaded file is removed so that
        a later iteration downloads it again.

        Args:
            datas: sequence of ``(key, inter, additional)`` items, where
                ``inter`` is a ``(start, end)`` pair and ``additional`` is
                either ``''`` or a comma-separated ``name=value`` list.
            iteration_time: number of the current iteration, used in log
                messages and in the error file name.

        Returns:
            List of ``[key, inter, additional]`` items that failed, in the
            same shape as the input so it can be passed back in for a retry.
        """
        total_cnt = len(datas)
        solve_cnt = success_cnt = 0
        fail_datas = []
        error_path = (self.error_result_path_prefix +
                      ".iter-%d.txt" % iteration_time)
        # Context managers close both files even if an item raises.
        with open(self.neighbor_result_path, 'a') as fw, \
                open(error_path, 'w') as fe:
            if total_cnt > 0:
                # Skipped for an empty batch: the percentage would divide
                # by zero.
                logger = LoggerFactory(1)
                logger.info_with_expire_time(
                    '[Iteration %d]completed %d/%d=%.2f%%' %
                    (iteration_time, solve_cnt, total_cnt,
                     solve_cnt * 100.0 / total_cnt), solve_cnt, total_cnt)
            for key, inter, additional in datas:
                solve_cnt += 1
                file_path = os.path.join(self.download_directory,
                                         key + '.txt')
                flag, data = self.download_and_analysis(key, inter, file_path)
                if flag:
                    success_cnt += 1
                    direction = '+' if (inter[0] < inter[1]) else '-'
                    fw.write('>%s/%s-%s(%s)\n' %
                             (key, inter[0], inter[1], direction))
                    if additional != '':
                        for kv in additional.split(','):
                            # Split on the first '=' only so values may
                            # themselves contain '='.
                            k, v = kv.split('=', 1)
                            fw.write('%s\t%s\n' % (k, v))
                    fw.write('SOURCE\t%s\n' % (data.get('source', 'UNKNOWN')))
                    for elem in data['data']:
                        fw.write('%s\n' % elem)
                    fw.write('sequence\t%s\n' % (data.get('sequence', '')))
                    fw.write('\n')
                    fw.flush()
                else:
                    # Drop a possibly corrupt download so it is fetched again.
                    if os.path.exists(file_path):
                        os.remove(file_path)
                    fe.write('>%s/%s-%s\n' % (key, inter[0], inter[1]))
                    fe.flush()
                    # Keep `additional`: the next iteration unpacks three
                    # values per item.
                    fail_datas.append([key, inter, additional])
                self.logger.info_with_expire_time(
                    '[Iteration %d]completed %d/%d=%.2f%%, '
                    'success %d/%d=%.2f%%' %
                    (iteration_time, solve_cnt, total_cnt,
                     solve_cnt * 100.0 / total_cnt, success_cnt, solve_cnt,
                     success_cnt * 100.0 / solve_cnt), solve_cnt, total_cnt)
                # Random pause between items (presumably to throttle
                # download requests).
                time.sleep(random.random())
        self.logger.info('[Iteration %d]done .' % iteration_time)
        return fail_datas

    def extract_data(self, buff):
        name = buff[0][1:].strip()
        name, inter = name.split('/')
        direction = inter[-2]
        left, right = map(int, inter[:-3].split('-'))
        _, source = buff[1].strip().split('\t')
        source = self.get_prefix(source)
        target = None
        for line in buff[2:]:
            try:
                gene = self.read_gene(line)
                if self.check_gene(left, right, direction, gene, target):
                    target = gene
            except:
                continue
        return {
            'name': name,
            'direction': direction,
            'left': left,
            'right': right,
            'source': source,
            'gene': target
        }

    def get_prefix(self, source):
        if self.keep_prefix_num > 0:
            return ' '.join(re.split('\s+', source)[:self.keep_prefix_num])
        return source

    def source_gene_distribution_analysis(self):
        self.logger.info('Start source_gene_distribution_analysis')
        datas = []
        buff = []
        for line in open(self.neighbor_result_path, 'r'):
            if len(line.strip()) == 0:
                if len(buff) > 0:
                    datas.append(self.extract_data(buff))
                    buff = []
            else:
                buff.append(line.strip())
        if len(buff) > 0:
            datas.append(self.extract_data(buff))
        source_counter = Counter()
        gene_counter = Counter()
        with open(self.next_gene_result_path, 'w') as fw:
            for data in datas:
                if data['gene'] is None:
                    continue
                fw.write('>%s/%s-%s(%s)\n' %
                         (data['name'], data['left'], data['right'],
                          data['direction']))
                fw.write('SOURCE\t%s\n' % (data['source']))
                fw.write('%s-%s\t%s\n\n' %
                         (data['gene']['left'], data['gene']['right'],
                          data['gene']['gene']))
                source_counter[data['source']] += 1
                gene_counter[data['gene']['gene']] += 1
        total = len(datas)
        for file_path, counter in [(self.source_count_path, source_counter),
                                   (self.gene_count_path, gene_counter)]:
            with open(file_path, 'w') as fw:
                for k, v in counter.most_common():
                    fw.write('%s\t%d\t%.4f%%\n' % (k, v, v * 100.0 / total))

        self.logger.info('End source_gene_distribution_analysis')

    def run(self):
        """Process the whole input file and write all result files.

        The neighbour result file is emptied first. Every input line that
        starts with ``>`` is parsed with ``DataDownloadTool.format_data``;
        items whose first element is ``None`` are dropped. The remaining
        items are processed in up to ``ExperimentConfig.MAX_ITERATION_TIME``
        rounds, each round retrying what the previous one could not solve.
        Finally the leftover failures are printed and the summary analysis
        runs.
        """
        # Start from an empty result file; the batches append to it.
        with open(self.neighbor_result_path, 'w'):
            pass

        with open(self.input_path, 'r') as f:
            parsed = [
                DataDownloadTool.format_data(line) for line in f
                if line.startswith('>')
            ]
        unsolved_datas = [item for item in parsed if item[0] is not None]

        last_iteration = ExperimentConfig.MAX_ITERATION_TIME
        for iteration_time in range(1, last_iteration + 1):
            unsolved_datas = self.find_neighbor_batch(unsolved_datas,
                                                      iteration_time)
            if not unsolved_datas:
                break

        failed_count = str(len(unsolved_datas))
        print("Failed data:" + failed_count + "," + str(unsolved_datas))
        self.source_gene_distribution_analysis()

    @staticmethod
    def analysis_download_file(download_file_path, inter):
        """Collect the genes around an interval from a downloaded gene file.

        Args:
            download_file_path: file handed to ``GeneFileReader``.
            inter: pair of positions; its order encodes the direction
                (first > second means the reverse direction).

        Returns:
            ``(False, None)`` when the file cannot be parsed, otherwise
            ``(True, info)`` where ``info`` holds:
              * ``source``: the source reported by the reader,
              * ``data``: string forms of the segments that cover either end
                of the interval, followed by the closest segment ending at
                or before the interval and the closest one starting at or
                after it, each listed once,
              * ``sequence``: ``dna_code[left - 1:right]``, reversed and
                passed through ``get_opposite_dna`` for the reverse
                direction.
        """
        left, right = min(inter), max(inter)
        gene_info = GeneFileReader(download_file_path)
        if not gene_info.build_information():
            return False, None

        near_small = None
        near_big = None
        # A dict used as an insertion-ordered set of string forms: storing
        # only strings keeps a segment that is both "covering" and "nearest"
        # from being listed twice, and the output order is reproducible.
        found = {}
        for gene_segment in gene_info.gene_segments:
            start, end = gene_segment.cds[0], gene_segment.cds[1]
            if end <= left:
                if near_small is None or near_small.cds[1] < end:
                    near_small = gene_segment
            if start >= right:
                if near_big is None or near_big.cds[0] > start:
                    near_big = gene_segment
            if start <= left <= end or start <= right <= end:
                found[str(gene_segment)] = None
        for nearest in (near_small, near_big):
            if nearest is not None:
                found[str(nearest)] = None

        sequence = gene_info.dna_code[left - 1:right]
        if inter[0] > inter[1]:
            sequence = get_opposite_dna(sequence[::-1])
        return True, {
            'source': gene_info.source,
            'data': list(found),
            'sequence': sequence
        }

    @staticmethod
    def check_gene(left, right, direction, gene, target):
        if direction == '-':
            peer = min(left, right)
            gene_peer = max(gene['left'], gene['right'])
            if peer > gene_peer:
                return target is None or max(target['left'],
                                             target['right']) < gene_peer
        elif direction == '+':
            peer = max(left, right)
            gene_peer = min(gene['left'], gene['right'])
            if peer < gene_peer:
                return target is None or min(target['left'],
                                             target['right']) > gene_peer
        else:
            raise ValueError('Direction should be - or +')

    @staticmethod
    def read_gene(line):
        inter, gene = line.strip().split('\t')
        left, right = map(int, inter.split('-'))
        return {'gene': gene, 'left': left, 'right': right}