def _producer_blast(self, genome_pair):
    """Run BLASTP in both directions between the genes of two genomes.

    Parameters
    ----------
    genome_pair : sequence of two str
        Gene files of the two genomes to compare. Each file is searched
        against the '<id>.db' database of the other genome, where <id> is
        the value remove_extension() returns for that genome's gene file
        and the database path is rooted at self.output_dir.

    Returns
    -------
    bool
        Always True.
    """

    searcher = Blast(cpus=self.producer_cpus)

    gene_file_a, gene_file_b = genome_pair
    id_a = remove_extension(gene_file_a)
    id_b = remove_extension(gene_file_b)

    db_a = os.path.join(self.output_dir, id_a + '.db')
    db_b = os.path.join(self.output_dir, id_b + '.db')

    # (query file, query id, target id, target database) for A->B, then B->A
    directions = (
        (gene_file_a, id_a, id_b, db_b),
        (gene_file_b, id_b, id_a, db_a),
    )

    for query_file, query_id, target_id, target_db in directions:
        hits_file = os.path.join(self.output_dir,
                                 query_id + '-' + target_id + '.blastp.tsv')
        searcher.blastp(query_file, target_db, hits_file, self.evalue)

    return True
def classify(self, seq_file, db, taxonomy_file, evalue_threshold, output_dir):
    """Classify rRNA genes using the first BLASTN hit reported per query.

    Writes '<rna_name>.blastn.tsv' (BLAST table) and
    '<rna_name>.taxonomy.tsv' (classification table) to the output
    directory, where <rna_name> is self.rna_name.

    Parameters
    ----------
    seq_file : str
        Name of fasta file containing rRNA sequences.
    db : str
        BLAST database of rRNA genes.
    taxonomy_file : str
        Taxonomy file for genes in the rRNA database.
    evalue_threshold : float
        E-value threshold for defining valid hits.
    output_dir : str
        Output directory.
    """

    # blast sequences against rRNA database
    blast = Blast(self.cpus)
    blast_file = os.path.join(output_dir, '%s.blastn.tsv' % self.rna_name)
    blast.blastn(seq_file,
                 db,
                 blast_file,
                 evalue=evalue_threshold,
                 max_matches=5,
                 output_fmt='custom')

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # write out classification file; context managers guarantee both
    # handles are closed even if a malformed row raises part-way through
    classification_file = os.path.join(output_dir,
                                       '%s.taxonomy.tsv' % self.rna_name)
    with open(classification_file, 'w') as fout:
        fout.write(
            'query_id\ttaxonomy\tlength\tblast_subject_id\tblast_evalue\tblast_bitscore\tblast_align_len\tblast_perc_identity\n')

        processed_query_ids = set()
        with open(blast_file) as blast_table:
            for line in blast_table:
                line_split = [x.strip() for x in line.split('\t')]
                query_id = line_split[0]

                if query_id in processed_query_ids:
                    # A query may have multiple hits to different genes or
                    # sections of a gene. Blast results are organized by
                    # bitscore so only the first hit is considered.
                    continue

                processed_query_ids.add(query_id)

                # column positions read from the 'custom' table format
                query_len = int(line_split[1])
                subject_id = line_split[2]
                align_len = line_split[5]
                perc_identity = line_split[6]
                evalue = line_split[7]
                bitscore = line_split[8]

                taxonomy_str = ';'.join(taxonomy[subject_id])

                fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                           (query_id, taxonomy_str, query_len, subject_id,
                            evalue, bitscore, align_len, perc_identity))
def _run_self_blastp(self, query_gene_file, evalue, per_identity,
                     per_aln_len, max_hits, tmp_dir, output_dir):
    """Perform similarity search of query genes against themselves.

    The sorted hit table is written to 'hits_sorted.tsv' in the output
    directory.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
        Accepted for interface compatibility; not used by this method.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
        Accepted for interface compatibility; not used by this method.
    max_hits : int
        Maximum number of hits to report per query sequences.
    tmp_dir : str
        Directory to store temporary files. A false value (None or '')
        selects the default temporary directory.
    output_dir : str
        Directory to store blast results.
    """

    # create a BLASTP database from the query genes
    self.logger.info('Creating BLASTP database (be patient!).')
    blast = Blast(self.cpus, silent=True)
    blast.create_blastp_db(query_gene_file)

    # reserve a named temporary file for the unsorted hits; it is closed
    # immediately so that BLAST can write to it by name
    tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                                 dir=tmp_dir or None,
                                                 delete=False)
    tmp_hits_table.close()

    # blast all genes against the database
    # NOTE(review): the query file path is also passed as the database
    # argument -- presumably create_blastp_db() names the database after
    # its input file; confirm against Blast.
    self.logger.info('Performing sequence similarity search between all query genomes (be patient!).')
    blast.blastp(query_gene_file,
                 query_gene_file,
                 tmp_hits_table.name,
                 evalue,
                 max_hits,
                 task='blastp-fast')

    # sort hit table
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(tmp_hits_table.name, hits_table_file)
def _tax_filter(self, ssu_output_file, taxonomy, output_dir):
    """Identify sequence to filter based on taxonomy of best BLAST hit.

    Sequences are searched against themselves with BLASTN. A query is
    flagged when a non-self hit with >= 82% identity over more than 800
    aligned positions belongs to a genome assigned to a different order.
    Flagged hits are also written to 'tax_filter/filtered_seqs.tsv' under
    the output directory.

    Parameters
    ----------
    ssu_output_file : str
        Fasta file of sequences; used both as BLASTN query and database.
    taxonomy : d[genome_id] -> list of taxa
        Taxonomy of each genome, indexed by rank as in
        Taxonomy.rank_labels. Genome ids are the part of a sequence id
        before the first '~'.
    output_dir : str
        Output directory.

    Returns
    -------
    set
        Ids of query sequences with an incongruent taxonomy.
    """

    tax_filter_dir = os.path.join(output_dir, 'tax_filter')
    if not os.path.exists(tax_filter_dir):
        os.makedirs(tax_filter_dir)

    blast = Blast(self.cpus)
    self.logger.info('Creating BLASTN database.')
    blast.create_blastn_db(ssu_output_file)

    self.logger.info(
        'Performing reciprocal BLAST to identify sequences with incongruent taxonomies.'
    )
    blast_table = os.path.join(tax_filter_dir, 'blastn.tsv')
    blast.blastn(ssu_output_file,
                 ssu_output_file,
                 blast_table,
                 evalue=1e-10,
                 max_matches=2,
                 output_fmt='custom',
                 task='megablast')

    filtered_ids = set()
    order_index = Taxonomy.rank_labels.index('order')

    with open(os.path.join(tax_filter_dir, 'filtered_seqs.tsv'), 'w') as fout:
        fout.write(
            'Seq Id\tQuery Taxonomy\tSubject Taxonomy\tPerc. Identity\tAlign. Length\n'
        )

        for hit in blast.read_hit(blast_table, table_fmt='custom'):
            if hit.query_id == hit.subject_id:  # ignore self hits
                continue

            # require a (very lenient) percent identity of 82%
            # (threshold from Yarza et al., 2014)
            if hit.perc_identity >= 82 and hit.alignment_len > 800:
                # there is a close hit in the database so verify it has
                # the expected taxonomic order
                query_genome_id = hit.query_id.split('~', 1)[0]
                subject_genome_id = hit.subject_id.split('~', 1)[0]

                # drop the 3-character rank prefix before comparing names
                order_of_query = taxonomy[query_genome_id][order_index][3:].strip()
                order_of_subject = taxonomy[subject_genome_id][order_index][3:].strip()

                # unnamed orders (empty after the prefix) are never
                # treated as a conflict
                if (order_of_query and order_of_subject
                        and order_of_query != order_of_subject):
                    filtered_ids.add(hit.query_id)
                    # every field is tab separated so each row has the
                    # same five columns as the header
                    fout.write('%s\t%s\t%s\t%.2f\t%d\n' %
                               (hit.query_id,
                                ';'.join(taxonomy[query_genome_id]),
                                ';'.join(taxonomy[subject_genome_id]),
                                hit.perc_identity,
                                hit.alignment_len))

    return filtered_ids
def _run_self_blastp(self, query_gene_file, evalue, per_identity,
                     per_aln_len, max_hits, tmp_dir, output_dir):
    """Perform similarity search of query genes against themselves.

    A BLASTP database named 'query_genes' is created in the output
    directory, and the sorted hit table is written to 'hits_sorted.tsv'
    in the same directory.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
        Accepted for interface compatibility; not used by this method.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
        Accepted for interface compatibility; not used by this method.
    max_hits : int
        Maximum number of hits to report per query sequences.
    tmp_dir : str
        Directory to store temporary files. A false value (None or '')
        selects the default temporary directory.
    output_dir : str
        Directory to store blast results.
    """

    # create a single BLASTP database from the query genes
    self.logger.info('Creating BLASTP database (be patient!).')
    blastp_db = os.path.join(output_dir, 'query_genes')
    blast = Blast(self.cpus, silent=True)
    blast.create_blastp_db(query_gene_file, blastp_db)

    # reserve a named temporary file for the unsorted hits; it is closed
    # immediately so that BLAST can write to it by name
    tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                                 dir=tmp_dir or None,
                                                 delete=False)
    tmp_hits_table.close()

    # blast all genes against the database
    self.logger.info(
        'Performing sequence similarity search between all query genomes (be patient!).'
    )
    blast.blastp(query_gene_file,
                 blastp_db,
                 tmp_hits_table.name,
                 evalue,
                 max_hits,
                 task='blastp-fast')

    # sort hit table
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(tmp_hits_table.name, hits_table_file)
def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
        custom_taxonomy_file, evalue, per_identity, per_aln_len,
        max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
        use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
        skip_rooting, output_dir):
    """Infer a gene tree for homologs genes identified by blast.

    Workflow for inferring a gene tree from sequences identified as being
    homologs to a set of query proteins. Homologs are identified using
    BLASTP (or DIAMOND when homology_search is 'diamond') and a set of
    user-defined parameters.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    db_file : str
        BLAST database of reference proteins.
    custom_db_file : str
        Custom database of proteins.
    taxonomy_file : str
        Taxonomic assignment of each reference genomes.
    custom_taxonomy_file : str
        Taxonomic assignment of genomes in custom database.
    evalue : float
        E-value threshold used to define homolog.
    per_identity : float
        Percent identity threshold used to define a homolog.
    per_aln_len : float
        Alignment length threshold used to define a homolog.
    max_matches : int
        Maximum matches per query protein.
    homology_search : str
        Type of homology search to perform. 'diamond' selects DIAMOND;
        any other value is passed to BLASTP as its task.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    restrict_taxon : str
        Restrict alignment to specific taxonomic group (e.g., k__Archaea).
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        Skip midpoint rooting if True.
    output_dir : str
        Directory to store results.
    """

    # validate query sequence names for use with GeneTreeTk
    validate_seq_ids(query_proteins)

    # read taxonomy file
    self.logger.info('Reading taxonomy file.')
    taxonomy = Taxonomy().read(taxonomy_file)

    if custom_taxonomy_file:
        custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
        taxonomy.update(custom_taxonomy)

    # report distribution of query genes
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
        query_proteins)
    self.logger.info(
        'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
        % (min_len, mean_len, max_len, p10, p50, p90))

    # identify homologs using BLASTP
    self.logger.info('Identifying homologs using %s.' % homology_search)
    blast = Blast(self.cpus)
    blast_output = os.path.join(output_dir, 'reference_hits.tsv')
    if homology_search == 'diamond':
        diamond = Diamond(self.cpus)
        diamond.blastp(query_proteins,
                       db_file,
                       evalue,
                       per_identity,
                       per_aln_len,
                       max_matches,
                       blast_output,
                       output_fmt='custom')
    else:
        blast.blastp(query_proteins,
                     db_file,
                     blast_output,
                     evalue,
                     max_matches,
                     output_fmt='custom',
                     task=homology_search)
    homologs = blast.identify_homologs(blast_output, evalue, per_identity,
                                       per_aln_len)
    self.logger.info('Identified %d homologs in reference database.' %
                     len(homologs))

    custom_homologs = None
    if custom_db_file:
        custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins,
                           custom_db_file,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_matches,
                           custom_blast_output,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins,
                         custom_db_file,
                         custom_blast_output,
                         evalue,
                         max_matches,
                         output_fmt='custom',
                         task=homology_search)
        custom_homologs = blast.identify_homologs(custom_blast_output,
                                                  evalue, per_identity,
                                                  per_aln_len)
        self.logger.info('Identified %d homologs in custom database.' %
                         len(custom_homologs))

    # restrict homologs to specific taxonomic group
    # (only reference database homologs are restricted; custom homologs
    # are merged in unfiltered further below)
    if restrict_taxon:
        self.logger.info('Restricting homologs to %s.' % restrict_taxon)
        restricted_homologs = {}
        # items() rather than iteritems() so the loop runs on Python 2 and 3
        for query_id, hit in homologs.items():
            genome_id = hit.subject_id.split('~')[0]
            if restrict_taxon in taxonomy[genome_id]:
                restricted_homologs[query_id] = hit

        self.logger.info(
            '%d of %d homologs in reference database are from the specified group.'
            % (len(restricted_homologs), len(homologs)))
        homologs = restricted_homologs

    if len(homologs) == 0:
        self.logger.error(
            'Too few homologs were identified. Gene tree cannot be inferred.'
        )
        # NOTE(review): sys.exit() without an argument terminates with
        # exit status 0, so a calling shell sees success -- confirm
        # whether a non-zero status is wanted here.
        sys.exit()

    # extract homologs
    self.logger.info(
        'Extracting homologs and determining local gene context.')
    db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
    gene_precontext, gene_postcontext = self.extract_homologs_and_context(
        homologs.keys(), db_file, db_homologs_tmp)

    # report gene length distribution of homologs
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
        db_homologs_tmp)
    self.logger.info(
        'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
        % (min_len, mean_len, max_len, p10, p50, p90))

    # concatenate homologs with initial query genes
    homolog_ouput_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
    if custom_homologs:
        custom_db_homologs_tmp = os.path.join(output_dir,
                                              'custom_homologs_db.tmp')
        custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
            custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
        gene_precontext.update(custom_gene_precontext)
        gene_postcontext.update(custom_gene_postcontext)
        homologs.update(custom_homologs)
        concatenate_files(
            [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
            homolog_ouput_tmp)
        os.remove(custom_db_homologs_tmp)
    else:
        concatenate_files([query_proteins, db_homologs_tmp],
                          homolog_ouput_tmp)

    os.remove(db_homologs_tmp)

    # remove stop codons
    homolog_ouput = os.path.join(output_dir, 'homologs.faa')
    self._remove_stop_codons(homolog_ouput_tmp, homolog_ouput)
    os.remove(homolog_ouput_tmp)

    # infer multiple sequence alignment
    msa = MsaWorkflow(self.cpus)
    trimmed_msa_output = msa.run(homolog_ouput, min_per_taxa, consensus,
                                 min_per_bp, use_trimAl, msa_program,
                                 output_dir)

    # infer tree
    tw = TreeWorkflow(self.cpus)
    tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                         skip_rooting, output_dir)

    # create tax2tree consensus map and decorate tree
    self.logger.info('Decorating internal tree nodes with tax2tree.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for homolog_id in homologs:
            genome_id = homolog_id.split('~')[0]
            # homologs without a known taxonomy are left out of the map
            t = taxonomy.get(genome_id, None)
            if t:
                fout.write(homolog_id + '\t' + ';'.join(t) + '\n')

    t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
    # NOTE(review): paths are interpolated unquoted into a shell command
    # and the exit status of os.system() is ignored -- paths containing
    # spaces or shell metacharacters will break this call.
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output, t2t_tree)
    os.system(cmd)

    # create tree with leaf nodes given as genome accessions
    tree = dendropy.Tree.get_from_path(t2t_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf.taxon.label = leaf.taxon.label.split('~')[0]

    genome_tree = os.path.join(output_dir,
                               'homologs.tax2tree.genome_accessions.tree')
    tree.write_to_path(genome_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # setup metadata for ARB file; the version is read from the VERSION
    # file located next to this source file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        genetreetk_version = version_file.read().strip()

    metadata = {}
    metadata['genetreetk_version'] = genetreetk_version

    metadata['genetreetk_query_proteins'] = query_proteins
    metadata['genetreetk_db_file'] = db_file
    metadata['genetreetk_taxonomy_file'] = taxonomy_file
    metadata['genetreetk_blast_evalue'] = str(evalue)
    metadata['genetreetk_blast_per_identity'] = str(per_identity)
    metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
    metadata['genetreetk_blast_max_matches'] = str(max_matches)
    metadata['genetreetk_homology_search'] = homology_search

    metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
    metadata['genetreetk_msa_consensus'] = str(consensus)
    metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
    metadata['genetreetk_msa_program'] = msa_program

    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                             metadata, gene_precontext, gene_postcontext,
                             arb_metadata_file)
def _tax_filter(self, ssu_output_file, taxonomy, output_dir):
    """Identify sequence to filter based on taxonomy of best BLAST hit.

    Sequences are searched against themselves with BLASTN. A query is
    flagged the first time a non-self hit with more than 800 aligned
    positions and >= 82% identity belongs to a genome assigned to a
    different order. Flagged hits are also written to
    'tax_filter/filtered_seqs.tsv' under the output directory.

    Only the order rank is checked. The rank-specific identity thresholds
    that accompanied it in permanently disabled code were: phylum 75,
    class 78.5, order 82, family 86.5, genus 94.5, species 99.

    Parameters
    ----------
    ssu_output_file : str
        Fasta file of sequences; used both as BLASTN query and database.
    taxonomy : d[genome_id] -> list of taxa
        Taxonomy of each genome, indexed by rank as in
        Taxonomy.rank_labels. Genome ids are the part of a sequence id
        before the first '~'.
    output_dir : str
        Output directory.

    Returns
    -------
    set
        Ids of query sequences with an incongruent taxonomy.
    """

    extant_taxa = Taxonomy().extant_taxa(taxonomy)

    tax_filter_dir = os.path.join(output_dir, 'tax_filter')
    if not os.path.exists(tax_filter_dir):
        os.makedirs(tax_filter_dir)

    blast = Blast(self.cpus)
    self.logger.info('Creating BLASTN database.')
    blast.create_blastn_db(ssu_output_file)

    self.logger.info(
        'Performing reciprocal BLAST to identify sequences with incongruent taxonomies.'
    )
    blast_table = os.path.join(tax_filter_dir, 'blastn.tsv')
    blast.blastn(ssu_output_file,
                 ssu_output_file,
                 blast_table,
                 evalue=1e-10,
                 max_matches=2,
                 output_fmt='custom',
                 task='blastn')

    filtered_ids = set()
    rank_index = Taxonomy.rank_labels.index('order')

    with open(os.path.join(tax_filter_dir, 'filtered_seqs.tsv'), 'w') as fout:
        fout.write(
            'Query ID\tQuery Taxonomy\tSubject ID\tSubject Taxonomy\tPerc. Identity\tAlign. Length\tMismatch Rank\tNo. Query Genomes\tNo. Subject Genomes\n'
        )

        for hit in blast.read_hit(blast_table, table_fmt='custom'):
            if hit.query_id == hit.subject_id:  # ignore self hits
                continue

            if hit.alignment_len <= 800:
                continue

            # require a (very lenient) percent identity of threshold
            # from Yarza et al., 2014; each query is reported only once
            if hit.query_id in filtered_ids or hit.perc_identity < 82:
                continue

            query_genome_id = hit.query_id.split('~', 1)[0]
            subject_genome_id = hit.subject_id.split('~', 1)[0]

            # drop the 3-character rank prefix before comparing names
            query_taxa = taxonomy[query_genome_id][rank_index][3:].strip()
            subject_taxa = taxonomy[subject_genome_id][rank_index][3:].strip()

            # unnamed orders (empty after the prefix) are never treated
            # as a conflict
            if query_taxa and subject_taxa and query_taxa != subject_taxa:
                filtered_ids.add(hit.query_id)
                fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%s\t%d\t%d\n' %
                           (hit.query_id,
                            ';'.join(taxonomy[query_genome_id]),
                            hit.subject_id,
                            ';'.join(taxonomy[subject_genome_id]),
                            hit.perc_identity,
                            hit.alignment_len,
                            'Order',
                            len(extant_taxa['o__' + query_taxa]),
                            len(extant_taxa['o__' + subject_taxa])))

    return filtered_ids
def classify(self, seq_files, ssu_db, ssu_taxonomy_file, evalue_threshold,
             output_dir):
    """Classify 16S rRNA genes.

    For every genome, 'ssu.blastn.tsv' (BLAST table) and
    'ssu.taxonomy.tsv' (classification table) are written to the
    '<output_dir>/<genome_id>' directory.

    Parameters
    ----------
    seq_files : d[genome_id] -> fasta file
        Fasta file containing 16S rRNA sequences for each genome.
    ssu_db : str
        BLAST database of 16S rRNA genes.
    ssu_taxonomy_file : str
        Taxonomy file for genes in the 16S rRNA database.
    evalue_threshold : float
        E-value threshold for defining valid hits.
    output_dir : str
        Output directory.

    Returns
    -------
    d[genome_id][query_id] -> list
        [taxonomy string, query length, e-value, alignment length,
        percent identity] of the first hit of each query sequence. The
        query length is an int; the remaining values are strings as read
        from the BLAST table.
    """

    blast = Blast(self.cpus)

    self.logger.info('Classifying SSU rRNA genes.')

    # the taxonomy is the same for every genome, so read it only once
    taxonomy = Taxonomy().read(ssu_taxonomy_file)

    classifications = defaultdict(dict)
    # items() rather than iteritems() so the loop runs on Python 2 and 3
    for genome_id, seq_file in seq_files.items():
        genome_dir = os.path.join(output_dir, genome_id)

        # blast sequences against 16S database
        blast_file = os.path.join(genome_dir, 'ssu.blastn.tsv')
        blast.blastn(seq_file,
                     ssu_db,
                     blast_file,
                     evalue=evalue_threshold,
                     max_matches=1,
                     output_fmt='custom')

        # write out classification file; context managers guarantee both
        # handles are closed even if a malformed row raises part-way
        classification_file = os.path.join(genome_dir, 'ssu.taxonomy.tsv')
        with open(classification_file, 'w') as fout:
            fout.write(
                'query_id\tssu_taxonomy\tssu_length\tssu_blast_subject_id\tssu_blast_evalue\tssu_blast_bitscore\tssu_blast_align_len\tssu_blast_perc_identity\n'
            )

            processed_query_ids = set()
            with open(blast_file) as blast_table:
                for line in blast_table:
                    line_split = [x.strip() for x in line.split('\t')]
                    query_id = line_split[0]

                    if query_id in processed_query_ids:
                        # A query may have multiple hits to different
                        # sections of a gene. Blast results are organized
                        # by e-value so only the first hit is considered.
                        # The subject gene is the same in all cases so
                        # the taxonomy string will be identical.
                        continue

                    processed_query_ids.add(query_id)

                    # column positions read from the 'custom' table format
                    query_len = int(line_split[1])
                    subject_id = line_split[2]
                    align_len = line_split[5]
                    perc_identity = line_split[6]
                    evalue = line_split[7]
                    bitscore = line_split[8]

                    taxonomy_str = ';'.join(taxonomy[subject_id])

                    classifications[genome_id][query_id] = [
                        taxonomy_str, query_len, evalue, align_len,
                        perc_identity
                    ]
                    fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                               (query_id, taxonomy_str, query_len,
                                subject_id, evalue, bitscore, align_len,
                                perc_identity))

    return classifications