def parse_blastxml(input_path, augustus_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write feature-table entries for the Augustus genes of one BLAST report.

    A '>Feature <name>' header is written first, where <name> is the file
    name of ``input_path`` without directory and extension.  Every BLASTX
    record in the XML report is then looked up in ``augustus_mapping`` and
    written as a gene with CDS and mRNA sections.  If at least one HSP passes
    both thresholds, the text built for the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' entry is written.

    Parsing stops at the first record whose application is not "BLASTX".

    Args:
        input_path: Path of the BLAST XML report.
        augustus_mapping: Mapping from query id (first whitespace-separated
            word of the query title) to an object with ``mRNA`` (having
            ``seq_type``, ``start``, ``stop``) and ``exons`` (a sequence of
            regions having ``start``, ``stop`` and ``seq_type``).
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count of written
            hit-based entries that lack a 'hypothetical protein' product.
        gene_counter: Running gene number; incremented once per record.
        locus_tag: Prefix used for locus tags, protein ids and transcript ids.
        min_coverage: Factor applied to the query length; the HSP query span
            must be at least this large.
        min_ident: Percent identity an HSP has to exceed.

    Returns:
        Tuple ``(gene_counter, annotation_count_with_putative_function)``
        with the updated counters.

    Raises:
        KeyError: If a query id is missing from ``augustus_mapping``.
        AssertionError: If the mapped mRNA is not of type 'gene' or, for a
            record without accepted hit, the first exon is not of type 'CDS'.
    """
    # The sequence name is the report file name without directory/extension.
    seq_name = os.path.splitext(os.path.split(input_path)[-1])[0]
    feature_table.write('>Feature %s\n' % seq_name)

    hypothetical = '\t\t\tproduct\thypothetical protein\n'

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break

            # The former guard compared the type of ``length / 3`` with float,
            # which never matched under Python 2 integer division (and always
            # matched under true division).  Such records have therefore
            # always been annotated; keep that output and only warn.
            if entry.query_length % 3 != 0:
                print("Query length is not a multiple of three")
            # NOTE(review): this is the length in codons, while the HSP query
            # coordinates are presumably nucleotide positions for BLASTX --
            # confirm that the coverage threshold below is intended.
            query_length = entry.query_length // 3

            query_info = augustus_mapping[entry.query.split()[0]]
            assert query_info.mRNA.seq_type == 'gene'
            gene_start = query_info.mRNA.start
            gene_end = query_info.mRNA.stop
            cds = query_info.exons

            gene_counter += 1

            gene_text = ''.join([
                '%i\t%i\tgene\n' % (gene_start, gene_end),
                '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter),
            ])
            id_text = ''.join([
                '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (
                    locus_tag, gene_counter),
                '\t\t\ttranscript_id\tgnl|PBUF|%smrna%04d\n' % (
                    locus_tag, gene_counter),
            ])

            # Feature-table text per accepted HSP, keyed by bit score.  An
            # HSP with the same bit score replaces the earlier one.
            hit_texts = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(
                        hsp.align_length)
                    # Coverage: 'c8-c7+1 >= min_coverage*c23'
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition looks like
                    # 'RecName: Full=Erythronolide synthase, modules 3 and 4;
                    #  Short=PKS; AltName: Full=...; AltName: Full=ORF 2'
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        # Python 2 unicode -> UTF-8 byte string
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields
                                 if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first field
                    # instead of failing with an IndexError.
                    accession = (rec_names or fields)[0].split('Full=')[-1]
                    accession = change_according_reviewer(
                        accession, note_line=False)

                    text = gene_text
                    if check_short_introns(cds):
                        # NOTE(review): this text has no product line, so it
                        # is counted as 'putative function' below when it
                        # carries the best bit score -- confirm intended.
                        text += '\t\t\tpseudo\n'
                        text += ('\t\t\tnote\tnonfunctional; similar to %s\n'
                                 % accession)
                        hit_texts[hsp.bits] = text
                        continue

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein'))
                            or accession in ('', 'protein')):
                        product = hypothetical
                    else:
                        product = '\t\t\tproduct\tputative %s\n' % (accession)

                    exon_rows = ''.join(
                        '%i\t%i\n' % (region.start, region.stop)
                        for region in cds[1:])
                    qualifiers = product + id_text

                    # The CDS section is followed by the mRNA section; the
                    # inference and note qualifiers end up on the mRNA.
                    text += ''.join([
                        '%i\t%i\tCDS\n' % (cds[0].start, cds[0].stop),
                        exon_rows,
                        qualifiers,
                        '%i\t%i\tmRNA\n' % (cds[0].start, cds[0].stop),
                        exon_rows,
                        qualifiers,
                        '\t\t\tinference\tab initio prediction:Augustus:2.5.5\n',
                        '\t\t\tnote\t%s\n' % (
                            'similar to UniProtKB/Swiss-Prot Entry: '
                            '%(hit_accession)s' % {
                                'hit_accession': alignment.accession}),
                    ])
                    hit_texts[hsp.bits] = text

            if not hit_texts:
                # No HSP passed the thresholds: write a hypothetical protein.
                feature_table.write(gene_text)
                assert cds[0].seq_type == 'CDS'
                if check_short_introns(cds):
                    feature_table.write('\t\t\tpseudo\n')
                    feature_table.write('\t\t\tnote\tnonfunctional\n')

                exon_rows = ''.join(
                    '%i\t%i\n' % (region.start, region.stop)
                    for region in cds[1:])
                qualifiers = hypothetical + id_text
                feature_table.write(''.join([
                    # CDS section for the 'no-annotation' case
                    '%i\t%i\tCDS\n' % (cds[0].start, cds[0].stop),
                    exon_rows,
                    qualifiers,
                    '\t\t\tnote\tpredicted with Augustus 2.5.5\n',
                    # mRNA section for the 'no-annotation' case
                    '%i\t%i\tmRNA\n' % (cds[0].start, cds[0].stop),
                    exon_rows,
                    qualifiers,
                ]))
            else:
                best_text = hit_texts[max(hit_texts)]
                feature_table.write(best_text)
                if hypothetical not in best_text:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
def parse_blastxml(input_path, glimmer_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write feature-table entries for the Glimmer genes of one BLAST report.

    A '>Feature Seq<number>' header is written first, where <number> is the
    part of ``input_path`` between the last 'Seq' and '.blastxml'.  Every
    BLASTX record in the XML report is then looked up in ``glimmer_mapping``
    and written as a gene with a CDS section.  If at least one HSP passes
    both thresholds, the text built for the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' entry is written.

    Parsing stops at the first record whose application is not "BLASTX".

    Args:
        input_path: Path of the BLAST XML report.
        glimmer_mapping: Mapping from query id (first whitespace-separated
            word of the query title) to an indexable whose items 0 and 1 are
            converted with ``int`` to the gene start and end.
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count of written
            hit-based entries that lack a 'hypothetical protein' product.
        gene_counter: Running gene number; incremented once per record.
        locus_tag: Prefix used for locus tags and protein ids.
        min_coverage: Factor applied to the query length; the HSP query span
            must be at least this large.
        min_ident: Percent identity an HSP has to exceed.

    Returns:
        Tuple ``(gene_counter, annotation_count_with_putative_function)``
        with the updated counters.

    Raises:
        KeyError: If a query id is missing from ``glimmer_mapping``.
        AssertionError: If the product-name clean-up helper fails its
            built-in sanity checks.
    """
    # Sanity checks of the product-name clean-up helper.  Their inputs are
    # constants, so they run once per call instead of once per accepted HSP
    # (assumes the helper is deterministic -- TODO confirm).
    assert change_according_reviewer(
        'Pimelyl-[acyl-carrier protein] methyl ester esterase',
        note_line=False
    ) == 'Pimelyl-[acyl-carrier protein] methyl ester esterase'
    assert change_according_reviewer(
        'putative D-malate dehydrogenase [decarboxylating] '
        '[gnl|PBUF|STVIR_0046:1-352] '
        '[gnl|PBUF|STVIR_0046: raw, aa len= 352]',
        note_line=False) == 'D-malate dehydrogenase'

    # extract the sequence number
    seq_number = input_path.split('Seq')[-1].split('.blastxml')[0]
    feature_table.write('>Feature Seq%s\n' % seq_number)

    hypothetical = '\t\t\tproduct\thypothetical protein\n'

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break

            query_length = entry.query_length
            # The former guard compared the type of the length with float and
            # could never match an integer length.  Such records have always
            # been annotated; keep that output and only warn.
            if query_length % 3 != 0:
                print("Query length is not a multiple of three")

            query_info = glimmer_mapping[entry.query.split()[0]]
            query_start = int(query_info[0])
            query_end = int(query_info[1])

            gene_counter += 1

            # Lines shared by the hit-based and the fallback entry.
            gene_text = ''.join([
                '%i\t%i\tgene\n' % (query_start, query_end),
                '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter),
                '%i\t%i\tCDS\n' % (query_start, query_end),
            ])
            protein_id = '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (
                locus_tag, gene_counter)

            # Feature-table text per accepted HSP, keyed by bit score.  An
            # HSP with the same bit score replaces the earlier one.
            hit_texts = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(
                        hsp.align_length)
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition looks like
                    # 'RecName: Full=Erythronolide synthase, modules 3 and 4;
                    #  Short=PKS; AltName: Full=...; AltName: Full=ORF 2'
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        # Python 2 unicode -> UTF-8 byte string
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields
                                 if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first field
                    # instead of failing with an IndexError.
                    accession = (rec_names or fields)[0].split('Full=')[-1]
                    accession = change_according_reviewer(
                        accession, note_line=False)

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein'))
                            or accession in ('', 'protein')):
                        product = hypothetical
                    else:
                        product = '\t\t\tproduct\tputative %s\n' % (accession)

                    hit_texts[hsp.bits] = ''.join([
                        gene_text,
                        product,
                        protein_id,
                        '\t\t\tinference\tab initio prediction:Glimmer:3\n',
                        '\t\t\tnote\t%s\n' % (
                            'similar to UniProtKB/Swiss-Prot Entry: '
                            '%(hit_accession)s' % {
                                'hit_accession': alignment.accession}),
                    ])

            if not hit_texts:
                # No HSP reached the requested identity and coverage:
                # write a hypothetical protein.
                feature_table.write(''.join([
                    gene_text,
                    hypothetical,
                    protein_id,
                    '\t\t\tnote\tab initio prediction:Glimmer3\n',
                ]))
            else:
                best_text = hit_texts[max(hit_texts)]
                feature_table.write(best_text)
                if hypothetical not in best_text:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
def parse_blastxml(input_path, augustus_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write feature-table entries for the Augustus genes of one BLAST report.

    A '>Feature <name>' header is written first, where <name> is the file
    name of ``input_path`` without directory and extension.  Every BLASTX
    record in the XML report is then looked up in ``augustus_mapping`` and
    written as a gene with CDS and mRNA sections.  If at least one HSP passes
    both thresholds, the text built for the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' entry is written.

    Parsing stops at the first record whose application is not "BLASTX".

    Args:
        input_path: Path of the BLAST XML report.
        augustus_mapping: Mapping from query id (first whitespace-separated
            word of the query title) to an object with ``mRNA`` (having
            ``seq_type``, ``start``, ``stop``) and ``exons`` (a sequence of
            regions having ``start``, ``stop`` and ``seq_type``).
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count of written
            hit-based entries that lack a 'hypothetical protein' product.
        gene_counter: Running gene number; incremented once per record.
        locus_tag: Prefix used for locus tags, protein ids and transcript ids.
        min_coverage: Factor applied to the query length; the HSP query span
            must be at least this large.
        min_ident: Percent identity an HSP has to exceed.

    Returns:
        Tuple ``(gene_counter, annotation_count_with_putative_function)``
        with the updated counters.

    Raises:
        KeyError: If a query id is missing from ``augustus_mapping``.
        AssertionError: If the mapped mRNA is not of type 'gene' or, for a
            record without accepted hit, the first exon is not of type 'CDS'.
    """
    # The sequence name is the report file name without directory/extension.
    seq_name = os.path.splitext(os.path.split(input_path)[-1])[0]
    feature_table.write('>Feature %s\n' % seq_name)

    hypothetical = '\t\t\tproduct\thypothetical protein\n'

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break

            # The former guard compared the type of ``length / 3`` with float,
            # which never matched under Python 2 integer division (and always
            # matched under true division).  Such records have therefore
            # always been annotated; keep that output and only warn.
            if entry.query_length % 3 != 0:
                print("Query length is not a multiple of three")
            # NOTE(review): this is the length in codons, while the HSP query
            # coordinates are presumably nucleotide positions for BLASTX --
            # confirm that the coverage threshold below is intended.
            query_length = entry.query_length // 3

            query_info = augustus_mapping[entry.query.split()[0]]
            assert query_info.mRNA.seq_type == 'gene'
            gene_start = query_info.mRNA.start
            gene_end = query_info.mRNA.stop
            cds = query_info.exons

            gene_counter += 1

            gene_text = ''.join([
                '%i\t%i\tgene\n' % (gene_start, gene_end),
                '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter),
            ])
            id_text = ''.join([
                '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (
                    locus_tag, gene_counter),
                '\t\t\ttranscript_id\tgnl|PBUF|%smrna%04d\n' % (
                    locus_tag, gene_counter),
            ])

            # Feature-table text per accepted HSP, keyed by bit score.  An
            # HSP with the same bit score replaces the earlier one.
            hit_texts = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(
                        hsp.align_length)
                    # Coverage: 'c8-c7+1 >= min_coverage*c23'
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition looks like
                    # 'RecName: Full=Erythronolide synthase, modules 3 and 4;
                    #  Short=PKS; AltName: Full=...; AltName: Full=ORF 2'
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        # Python 2 unicode -> UTF-8 byte string
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields
                                 if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first field
                    # instead of failing with an IndexError.
                    accession = (rec_names or fields)[0].split('Full=')[-1]
                    accession = change_according_reviewer(
                        accession, note_line=False)

                    text = gene_text
                    if check_short_introns(cds):
                        # NOTE(review): this text has no product line, so it
                        # is counted as 'putative function' below when it
                        # carries the best bit score -- confirm intended.
                        text += '\t\t\tpseudo\n'
                        text += ('\t\t\tnote\tnonfunctional; similar to %s\n'
                                 % accession)
                        hit_texts[hsp.bits] = text
                        continue

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein'))
                            or accession in ('', 'protein')):
                        product = hypothetical
                    else:
                        product = '\t\t\tproduct\tputative %s\n' % (accession)

                    exon_rows = ''.join(
                        '%i\t%i\n' % (region.start, region.stop)
                        for region in cds[1:])
                    qualifiers = product + id_text

                    # The CDS section is followed by the mRNA section; the
                    # inference and note qualifiers end up on the mRNA.
                    text += ''.join([
                        '%i\t%i\tCDS\n' % (cds[0].start, cds[0].stop),
                        exon_rows,
                        qualifiers,
                        '%i\t%i\tmRNA\n' % (cds[0].start, cds[0].stop),
                        exon_rows,
                        qualifiers,
                        '\t\t\tinference\tab initio prediction:Augustus:2.5.5\n',
                        '\t\t\tnote\t%s\n' % (
                            'similar to UniProtKB/Swiss-Prot Entry: '
                            '%(hit_accession)s' % {
                                'hit_accession': alignment.accession}),
                    ])
                    hit_texts[hsp.bits] = text

            if not hit_texts:
                # No HSP passed the thresholds: write a hypothetical protein.
                feature_table.write(gene_text)
                assert cds[0].seq_type == 'CDS'
                if check_short_introns(cds):
                    feature_table.write('\t\t\tpseudo\n')
                    feature_table.write('\t\t\tnote\tnonfunctional\n')

                exon_rows = ''.join(
                    '%i\t%i\n' % (region.start, region.stop)
                    for region in cds[1:])
                qualifiers = hypothetical + id_text
                feature_table.write(''.join([
                    # CDS section for the 'no-annotation' case
                    '%i\t%i\tCDS\n' % (cds[0].start, cds[0].stop),
                    exon_rows,
                    qualifiers,
                    '\t\t\tnote\tpredicted with Augustus 2.5.5\n',
                    # mRNA section for the 'no-annotation' case
                    '%i\t%i\tmRNA\n' % (cds[0].start, cds[0].stop),
                    exon_rows,
                    qualifiers,
                ]))
            else:
                best_text = hit_texts[max(hit_texts)]
                feature_table.write(best_text)
                if hypothetical not in best_text:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
def parse_blastxml(input_path, glimmer_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write feature-table entries for the Glimmer genes of one BLAST report.

    A '>Feature Seq<number>' header is written first, where <number> is the
    part of ``input_path`` between the last 'Seq' and '.blastxml'.  Every
    BLASTX record in the XML report is then looked up in ``glimmer_mapping``
    and written as a gene with a CDS section.  If at least one HSP passes
    both thresholds, the text built for the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' entry is written.  The
    product name of a hit is the part of its definition before 'OS='.

    Parsing stops at the first record whose application is not "BLASTX".

    Args:
        input_path: Path of the BLAST XML report.
        glimmer_mapping: Mapping from query id (first whitespace-separated
            word of the query title) to an indexable whose items 0 and 1 are
            converted with ``int`` to the gene start and end.
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count of written
            hit-based entries that lack a 'hypothetical protein' product.
        gene_counter: Running gene number; incremented once per record.
        locus_tag: Prefix used for locus tags and protein ids.
        min_coverage: Factor applied to the query length; the HSP query span
            must be at least this large.
        min_ident: Percent identity an HSP has to exceed.

    Returns:
        Tuple ``(gene_counter, annotation_count_with_putative_function)``
        with the updated counters.

    Raises:
        KeyError: If a query id is missing from ``glimmer_mapping``.
        AssertionError: If the product-name clean-up helper fails its
            built-in sanity checks.
    """
    # Sanity checks of the product-name clean-up helper.  Their inputs are
    # constants, so they run once per call instead of once per accepted HSP
    # (assumes the helper is deterministic -- TODO confirm).
    assert change_according_reviewer(
        'Pimelyl-[acyl-carrier protein] methyl ester esterase',
        note_line=False
    ) == 'Pimelyl-[acyl-carrier protein] methyl ester esterase'
    assert change_according_reviewer(
        'putative D-malate dehydrogenase [decarboxylating] '
        '[gnl|PBUF|STVIR_0046:1-352] '
        '[gnl|PBUF|STVIR_0046: raw, aa len= 352]',
        note_line=False) == 'D-malate dehydrogenase'

    # extract the sequence number
    seq_number = input_path.split('Seq')[-1].split('.blastxml')[0]
    feature_table.write('>Feature Seq%s\n' % seq_number)

    hypothetical = '\t\t\tproduct\thypothetical protein\n'

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break

            query_length = entry.query_length
            # The former guard compared the type of the length with float and
            # could never match an integer length.  Such records have always
            # been annotated; keep that output and only warn.
            if query_length % 3 != 0:
                print("Query length is not a multiple of three")

            query_info = glimmer_mapping[entry.query.split()[0]]
            query_start = int(query_info[0])
            query_end = int(query_info[1])

            gene_counter += 1

            # Lines shared by the hit-based and the fallback entry.
            gene_text = ''.join([
                '%i\t%i\tgene\n' % (query_start, query_end),
                '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter),
                '%i\t%i\tCDS\n' % (query_start, query_end),
            ])
            protein_id = '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (
                locus_tag, gene_counter)

            # Feature-table text per accepted HSP, keyed by bit score.  An
            # HSP with the same bit score replaces the earlier one.
            hit_texts = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(
                        hsp.align_length)
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # Product name: everything before the 'OS=' field.
                    accession = alignment.hit_def.split('OS=')[0].strip()
                    accession = change_according_reviewer(
                        accession, note_line=False)

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein'))
                            or accession in ('', 'protein')):
                        product = hypothetical
                    else:
                        product = '\t\t\tproduct\tputative %s\n' % (accession)

                    hit_texts[hsp.bits] = ''.join([
                        gene_text,
                        product,
                        protein_id,
                        '\t\t\tinference\tab initio prediction:Glimmer:3\n',
                        '\t\t\tnote\t%s\n' % (
                            'similar to UniProtKB/Swiss-Prot Entry: '
                            '%(hit_accession)s' % {
                                'hit_accession': alignment.accession}),
                    ])

            if not hit_texts:
                # No HSP reached the requested identity and coverage:
                # write a hypothetical protein.
                feature_table.write(''.join([
                    gene_text,
                    hypothetical,
                    protein_id,
                    '\t\t\tnote\tab initio prediction:Glimmer3\n',
                ]))
            else:
                best_text = hit_texts[max(hit_texts)]
                feature_table.write(best_text)
                if hypothetical not in best_text:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)