def run(self, aa_gene_files, evalue, per_identity, output_dir):
    """Run reciprocal DIAMOND blast across every pair of genomes.

    Parameters
    ----------
    aa_gene_files : list of str
        Amino acid fasta files to process via reciprocal blast.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    output_dir : str
        Directory to store blast results.
    """

    # pool all gene files into a single fasta and index it as one DIAMOND database
    self.logger.info(' Creating diamond database (be patient!).')
    pooled_genes = os.path.join(output_dir, 'all_genes.faa')
    concatenate_files(aa_gene_files, pooled_genes)

    searcher = Diamond(self.cpus)
    db_path = os.path.join(output_dir, 'all_genes')
    searcher.make_database(pooled_genes, db_path)

    # search the pooled genes against themselves; allow 10 hits per genome
    # so that hits across all pairs of genomes can be retained
    self.logger.info('')
    self.logger.info(' Identifying hits between all pairs of genomes (be patient!).')
    daa_out = os.path.join(output_dir, 'all_hits')
    searcher.blastp(pooled_genes, db_path, evalue, per_identity, len(aa_gene_files) * 10, daa_out)

    # convert the binary DIAMOND output into a flat tab-separated table
    self.logger.info(' Creating table with hits.')
    table_out = os.path.join(output_dir, 'all_hits.tsv')
    searcher.view(daa_out + '.daa', table_out)
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold of valid hits.
    per_identity : float
        Percent identity threshold of valid hits [0,100].
    per_aln_len : float
        Percent query coverage of valid hits [0, 100].

    Returns
    -------
    str
        Path to the per-scaffold reference hit summary table.
    """

    # read statistics file
    self.logger.info('Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('Creating DIAMOND database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.create_db(ref_gene_file, ref_diamond_db)

    self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

    self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; gene ids are assumed to be of
    # the form <scaffold_id>_<gene_num>
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            # renamed from 'evalue'/'aln_len'/etc. so the hit statistics no
            # longer shadow the method's evalue parameter
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            # report subject genomes ordered by decreasing number of hits
            sorted_subject_bin_ids = sorted(subject_bin_ids.items(),
                                            key=operator.itemgetter(1),
                                            reverse=True)
            subject_bin_id_str = []
            for bin_id, num_hits in sorted_subject_bin_ids:
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            # report subject scaffolds ordered by decreasing number of hits
            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(),
                                                 key=operator.itemgetter(1),
                                                 reverse=True)
            subject_scaffold_id_str = []
            for subject_id, num_hits in sorted_subject_scaffold_ids:
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_bin_id_str,
                subject_scaffold_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity: float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info(' Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed sizes windows
    self.logger.info('')
    self.logger.info(' Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file,
                               window_size,
                               step_size,
                               self.profiles[genome_id],
                               fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # close the fragment file so all buffered fragments are flushed to
    # disk before DIAMOND reads it (the original left the handle open)
    fragment_out.close()

    # run diamond
    self.logger.info('')
    self.logger.info(' Running diamond blastx with %d processes (be patient!)' % self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info(' Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info(' Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    # Python 3 compatible iteration (was iteritems/xrange)
    for genome_id, profile in self.profiles.items():
        seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.items():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.items():
            taxa = []
            for r in range(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)
def _run_reciprocal_diamond(self, query_gene_file,
                            target_gene_file,
                            evalue, per_identity, per_aln_len,
                            max_hits,
                            sensitive,
                            high_mem,
                            tmp_dir,
                            output_dir):
    """Perform similarity search of query genes against target genes, and reciprocal hits.

    Parameters
    ----------
    query_gene_file : str
        File with all query proteins.
    target_gene_file : str
        File with all target proteins.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : boolean
        Run DIAMOND in sensitive mode.
    high_mem : boolean
        Run DIAMOND with settings that use more memory.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info(
        'Creating DIAMOND database of query proteins (be patient!).')
    diamond = Diamond(self.cpus)
    query_diamond_db = os.path.join(output_dir, 'query_genes')
    diamond.create_db(query_gene_file, query_diamond_db)

    self.logger.info(
        'Creating DIAMOND database of target proteins (be patient!).')
    target_diamond_db = os.path.join(output_dir, 'target_genes')
    diamond.create_db(target_gene_file, target_diamond_db)

    # blast query genes against target proteins
    self.logger.info(
        'Performing similarity sequence between query and target proteins (be patient!).'
    )

    # hits table is created as a named file (delete=False) so DIAMOND can
    # write to it after the handle is closed
    if tmp_dir:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(
            prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(
            prefix='comparem_hits_', delete=False)
    tmp_query_hits_table.close()

    if high_mem:
        diamond.blastp(query_gene_file, target_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_query_hits_table.name, 'standard', tmp_dir,
                       chunk_size=1, block_size=8)
    else:
        diamond.blastp(query_gene_file, target_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_query_hits_table.name, 'standard', tmp_dir)

    # get target genes hit by one or more query proteins
    self.logger.info(
        'Creating file with target proteins with similarity to query proteins.'
    )
    target_hit = set()
    with open(tmp_query_hits_table.name) as hits_table:
        for line in hits_table:
            line_split = line.split('\t')
            target_hit.add(line_split[1])

    target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
    with open(target_genes_hits, 'w') as fout:
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')

    self.logger.info(
        'Identified %d target proteins to be used in reciprocal search.' %
        len(target_hit))

    # perform reciprocal blast
    self.logger.info(
        'Performing reciprocal similarity sequence between target and query proteins (be patient!).'
    )

    if tmp_dir:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(
            prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(
            prefix='comparem_hits_', delete=False)
    tmp_target_hits_table.close()

    if high_mem:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_target_hits_table.name, 'standard', tmp_dir,
                       chunk_size=1, block_size=8)
    else:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_target_hits_table.name, 'standard', tmp_dir)

    # combine hit tables and sort
    os.system('cat %s >> %s' %
              (tmp_target_hits_table.name, tmp_query_hits_table.name))
    os.remove(tmp_target_hits_table.name)
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
def _run_self_diamond(self, query_gene_file, evalue, per_identity,
                      per_aln_len, max_hits, sensitive, high_mem,
                      tmp_dir, output_dir):
    """Search query genes against themselves with DIAMOND.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : boolean
        Run DIAMOND in sensitive mode.
    high_mem : boolean
        Run DIAMOND with high-memory settings.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database (be patient!).')
    db_path = os.path.join(output_dir, 'query_genes')
    aligner = Diamond(self.cpus)
    aligner.create_db(query_gene_file, db_path)

    # scratch file (kept on disk) that will receive the flat hits table
    tmp_kwargs = {'prefix': 'comparem_hits_', 'delete': False}
    if tmp_dir:
        tmp_kwargs['dir'] = tmp_dir
    scratch_table = tempfile.NamedTemporaryFile(**tmp_kwargs)
    scratch_table.close()

    # search all genes against the database
    self.logger.info(
        'Performing self similarity sequence between genomes (be patient!).'
    )
    extra_args = {'chunk_size': 1, 'block_size': 8} if high_mem else {}
    aligner.blastp(query_gene_file, db_path, evalue, per_identity,
                   per_aln_len, max_hits, sensitive,
                   scratch_table.name, 'standard', tmp_dir,
                   **extra_args)

    # sort hit table
    sorted_table = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(scratch_table.name, sorted_table)
def _run_reciprocal_diamond(self, query_gene_file,
                            target_gene_file,
                            evalue, per_identity, per_aln_len,
                            max_hits,
                            sensitive,
                            high_mem,
                            tmp_dir,
                            output_dir):
    """Perform similarity search of query genes against target genes, and reciprocal hits.

    Parameters
    ----------
    query_gene_file : str
        File with all query proteins.
    target_gene_file : str
        File with all target proteins.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : boolean
        Run DIAMOND in sensitive mode.
    high_mem : boolean
        Run DIAMOND with settings that use more memory.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
    diamond = Diamond(self.cpus)
    query_diamond_db = os.path.join(output_dir, 'query_genes')
    diamond.make_database(query_gene_file, query_diamond_db)

    self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
    target_diamond_db = os.path.join(output_dir, 'target_genes')
    diamond.make_database(target_gene_file, target_diamond_db)

    # blast query genes against target proteins
    self.logger.info('Performing similarity sequence between query and target proteins (be patient!).')

    # hits table is created as a named file (delete=False) so DIAMOND can
    # write to it after the handle is closed
    if tmp_dir:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
    tmp_query_hits_table.close()

    if high_mem:
        diamond.blastp(query_gene_file, target_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_query_hits_table.name, 'standard', tmp_dir,
                       chunk_size=1, block_size=8)
    else:
        diamond.blastp(query_gene_file, target_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_query_hits_table.name, 'standard', tmp_dir)

    # get target genes hit by one or more query proteins
    self.logger.info('Creating file with target proteins with similarity to query proteins.')
    target_hit = set()
    with open(tmp_query_hits_table.name) as hits_table:
        for line in hits_table:
            line_split = line.split('\t')
            target_hit.add(line_split[1])

    target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
    with open(target_genes_hits, 'w') as fout:
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')

    self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))

    # perform reciprocal blast
    self.logger.info('Performing reciprocal similarity sequence between target and query proteins (be patient!).')

    if tmp_dir:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
    tmp_target_hits_table.close()

    if high_mem:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_target_hits_table.name, 'standard', tmp_dir,
                       chunk_size=1, block_size=8)
    else:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                       per_identity, per_aln_len, max_hits, sensitive,
                       tmp_target_hits_table.name, 'standard', tmp_dir)

    # combine hit tables and sort
    os.system('cat %s >> %s' % (tmp_target_hits_table.name, tmp_query_hits_table.name))
    os.remove(tmp_target_hits_table.name)
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
def _run_self_diamond(self, query_gene_file, evalue, per_identity,
                      per_aln_len, max_hits, sensitive, high_mem,
                      tmp_dir, output_dir):
    """Search query genes against themselves with DIAMOND.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : boolean
        Run DIAMOND in sensitive mode.
    high_mem : boolean
        Run DIAMOND with high-memory settings.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database (be patient!).')
    database_path = os.path.join(output_dir, 'query_genes')
    search = Diamond(self.cpus)
    search.make_database(query_gene_file, database_path)

    # on-disk scratch file that will receive the flat hits table
    if tmp_dir:
        hits_tmp = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                               dir=tmp_dir,
                                               delete=False)
    else:
        hits_tmp = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                               delete=False)
    hits_tmp.close()

    # search all genes against the database
    self.logger.info('Performing self similarity sequence between genomes (be patient!).')
    common_args = (query_gene_file, database_path, evalue, per_identity,
                   per_aln_len, max_hits, sensitive, hits_tmp.name,
                   'standard', tmp_dir)
    if high_mem:
        search.blastp(*common_args, chunk_size=1, block_size=8)
    else:
        search.blastp(*common_args)

    # sort hit table
    sorted_hits = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(hits_tmp.name, sorted_hits)
def run(self, query_proteins,
        db_file, custom_db_file,
        taxonomy_file, custom_taxonomy_file,
        evalue, per_identity, per_aln_len, max_matches, homology_search,
        min_per_taxa, consensus, min_per_bp, use_trimAl, restrict_taxon,
        msa_program,
        tree_program, prot_model, skip_rooting,
        output_dir):
    """Infer a gene tree for homologs genes identified by blast.

    Workflow for inferring a gene tree from sequences identified as
    being homologs to a set of query proteins. Homologs are identified
    using BLASTP and a set of user-defined parameters.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    db_file : str
        BLAST database of reference proteins.
    custom_db_file : str
        Custom database of proteins.
    taxonomy_file : str
        Taxonomic assignment of each reference genomes.
    custom_taxonomy_file : str
        Taxonomic assignment of genomes in custom database.
    evalue : float
        E-value threshold used to define homolog.
    per_identity : float
        Percent identity threshold used to define a homolog.
    per_aln_len : float
        Alignment length threshold used to define a homolog.
    max_matches : int
        Maximum matches per query protein.
    homology_search : str
        Type of homology search to perform.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    restrict_taxon : str
        Restrict alignment to specific taxonomic group (e.g., k__Archaea).
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        Skip midpoint rooting if True.
    output_dir : str
        Directory to store results.
    """

    # validate query sequence names for use with GeneTreeTk
    validate_seq_ids(query_proteins)

    # read taxonomy file
    self.logger.info('Reading taxonomy file.')
    taxonomy = Taxonomy().read(taxonomy_file)

    if custom_taxonomy_file:
        custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
        taxonomy.update(custom_taxonomy)

    # report distribution of query genes
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
        query_proteins)
    self.logger.info(
        'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
        % (min_len, mean_len, max_len, p10, p50, p90))

    # identify homologs using BLASTP
    self.logger.info('Identifying homologs using %s.' % homology_search)
    blast = Blast(self.cpus)
    blast_output = os.path.join(output_dir, 'reference_hits.tsv')
    if homology_search == 'diamond':
        diamond = Diamond(self.cpus)
        diamond.blastp(query_proteins, db_file, evalue, per_identity,
                       per_aln_len, max_matches, blast_output,
                       output_fmt='custom')
    else:
        blast.blastp(query_proteins, db_file, blast_output, evalue,
                     max_matches, output_fmt='custom',
                     task=homology_search)
    homologs = blast.identify_homologs(blast_output, evalue, per_identity,
                                       per_aln_len)
    self.logger.info('Identified %d homologs in reference database.' %
                     len(homologs))

    custom_homologs = None
    if custom_db_file:
        custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins, custom_db_file, evalue,
                           per_identity, per_aln_len, max_matches,
                           custom_blast_output, output_fmt='custom')
        else:
            blast.blastp(query_proteins, custom_db_file,
                         custom_blast_output, evalue, max_matches,
                         output_fmt='custom', task=homology_search)
        custom_homologs = blast.identify_homologs(custom_blast_output,
                                                  evalue, per_identity,
                                                  per_aln_len)
        self.logger.info('Identified %d homologs in custom database.' %
                         len(custom_homologs))

    # restrict homologs to specific taxonomic group
    if restrict_taxon:
        self.logger.info('Restricting homologs to %s.' % restrict_taxon)
        restricted_homologs = {}
        # Python 3 compatible iteration (was dict.iteritems)
        for query_id, hit in homologs.items():
            genome_id = hit.subject_id.split('~')[0]
            if restrict_taxon in taxonomy[genome_id]:
                restricted_homologs[query_id] = hit

        self.logger.info(
            '%d of %d homologs in reference database are from the specified group.'
            % (len(restricted_homologs), len(homologs)))
        homologs = restricted_homologs

    if len(homologs) == 0:
        self.logger.error(
            'Too few homologs were identified. Gene tree cannot be inferred.'
        )
        sys.exit()

    # extract homologs
    self.logger.info(
        'Extracting homologs and determining local gene context.')
    db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
    gene_precontext, gene_postcontext = self.extract_homologs_and_context(
        homologs.keys(), db_file, db_homologs_tmp)

    # report gene length distribution of homologs
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
        db_homologs_tmp)
    self.logger.info(
        'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
        % (min_len, mean_len, max_len, p10, p50, p90))

    # concatenate homologs with initial query genes
    homolog_ouput_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
    if custom_homologs:
        custom_db_homologs_tmp = os.path.join(output_dir,
                                              'custom_homologs_db.tmp')
        custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
            custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
        gene_precontext.update(custom_gene_precontext)
        gene_postcontext.update(custom_gene_postcontext)
        homologs.update(custom_homologs)
        concatenate_files(
            [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
            homolog_ouput_tmp)
        os.remove(custom_db_homologs_tmp)
    else:
        concatenate_files([query_proteins, db_homologs_tmp],
                          homolog_ouput_tmp)
    os.remove(db_homologs_tmp)

    # remove stop codons
    homolog_ouput = os.path.join(output_dir, 'homologs.faa')
    self._remove_stop_codons(homolog_ouput_tmp, homolog_ouput)
    os.remove(homolog_ouput_tmp)

    # infer multiple sequence alignment
    msa = MsaWorkflow(self.cpus)
    trimmed_msa_output = msa.run(homolog_ouput, min_per_taxa, consensus,
                                 min_per_bp, use_trimAl, msa_program,
                                 output_dir)

    # infer tree
    tw = TreeWorkflow(self.cpus)
    tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                         skip_rooting, output_dir)

    # create tax2tree consensus map and decorate tree
    self.logger.info('Decorating internal tree nodes with tax2tree.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for homolog_id in homologs.keys():
            genome_id = homolog_id.split('~')[0]
            t = taxonomy.get(genome_id, None)
            if t:
                fout.write(homolog_id + '\t' + ';'.join(t) + '\n')

    t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output, t2t_tree)
    os.system(cmd)

    # create tree with leaf nodes given as genome accessions
    tree = dendropy.Tree.get_from_path(t2t_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    for leaf in tree.leaf_node_iter():
        leaf.taxon.label = leaf.taxon.label.split('~')[0]
    genome_tree = os.path.join(output_dir,
                               'homologs.tax2tree.genome_accessions.tree')
    tree.write_to_path(genome_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # setup metadata for ARB file; close the VERSION file handle
    # (the original left it open)
    src_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        version = version_file.read().strip()

    metadata = {}
    metadata['genetreetk_version'] = version
    metadata['genetreetk_query_proteins'] = query_proteins
    metadata['genetreetk_db_file'] = db_file
    metadata['genetreetk_taxonomy_file'] = taxonomy_file
    metadata['genetreetk_blast_evalue'] = str(evalue)
    metadata['genetreetk_blast_per_identity'] = str(per_identity)
    metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
    metadata['genetreetk_blast_max_matches'] = str(max_matches)
    metadata['genetreetk_homology_search'] = homology_search
    metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
    metadata['genetreetk_msa_consensus'] = str(consensus)
    metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
    metadata['genetreetk_msa_program'] = msa_program
    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                             metadata, gene_precontext, gene_postcontext,
                             arb_metadata_file)
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold used by blast.
    per_identity: float
        Percent identity threshold used by blast.

    Returns
    -------
    str
        Path to the per-scaffold reference hit summary table.
    """

    # read statistics file
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('')
    self.logger.info(' Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    self.logger.info(' Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

    self.logger.info(' Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; gene ids are assumed to be of
    # the form <scaffold_id>_<gene_num>
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold (Python 3 compatible iteration; was iteritems)
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            # renamed from 'evalue'/'aln_len'/etc. so the hit statistics no
            # longer shadow the method's evalue parameter
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                # NOTE(review): this unpack order (scaffold part first, bin id
                # second) is reversed relative to the newer implementation of
                # this method; confirm the subject_id format used here
                subject_id, subject_bin_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_id[0:subject_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            subject_scaffold_id_str = []
            for subject_id, num_hits in subject_scaffold_ids.items():
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            subject_bin_id_str = []
            for bin_id, num_hits in subject_bin_ids.items():
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_scaffold_id_str,
                subject_bin_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity: float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info(
        ' Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed sizes windows
    self.logger.info('')
    self.logger.info(' Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file,
                               window_size,
                               step_size,
                               self.profiles[genome_id],
                               fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # close the fragment file so all buffered fragments are flushed to
    # disk before DIAMOND reads it (the original left the handle open)
    fragment_out.close()

    # run diamond
    self.logger.info('')
    self.logger.info(
        ' Running diamond blastx with %d processes (be patient!)' %
        self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1,
                   diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir,
                                     'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info(' Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy,
                             contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info(' Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    # Python 3 compatible iteration (was iteritems/xrange)
    for genome_id, profile in self.profiles.items():
        seq_summary_out = os.path.join(report_dir,
                                       genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir,
                                          genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir,
                                      'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.items():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.items():
            taxa = []
            for r in range(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(
                taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir,
                                     'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)