def run(self, aa_gene_files, evalue, per_identity, output_dir):
    """Search the pooled genes of all genomes against themselves with DIAMOND.

    All gene files are concatenated into a single fasta file, a DIAMOND
    database is built from that file, the pooled genes are searched
    against the database, and the result is converted to a flat hits
    table ('all_hits.tsv') in the output directory.

    Parameters
    ----------
    aa_gene_files : list of str
        Amino acid fasta files to process via reciprocal blast.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    output_dir : str
        Directory to store blast results.
    """

    # pool every gene file into one fasta file and index it with diamond
    self.logger.info(' Creating diamond database (be patient!).')
    pooled_genes = os.path.join(output_dir, 'all_genes.faa')
    concatenate_files(aa_gene_files, pooled_genes)

    db_prefix = os.path.join(output_dir, 'all_genes')
    searcher = Diamond(self.cpus)
    searcher.make_database(pooled_genes, db_prefix)

    # search the pooled genes against the database built from them
    self.logger.info('')
    self.logger.info(' Identifying hits between all pairs of genomes (be patient!).')
    daa_prefix = os.path.join(output_dir, 'all_hits')
    # presumably the maximum number of hits reported per query; scales
    # with the number of genomes -- confirm against Diamond.blastp
    hit_limit = len(aa_gene_files) * 10
    searcher.blastp(pooled_genes,
                    db_prefix,
                    evalue,
                    per_identity,
                    hit_limit,
                    daa_prefix)

    # convert the search output ('<prefix>.daa') into a flat hits table
    self.logger.info(' Creating table with hits.')
    flat_table = os.path.join(output_dir, 'all_hits.tsv')
    searcher.view(daa_prefix + '.daa', flat_table)
def _run_reciprocal_diamond(self, query_gene_file, target_gene_file, evalue, per_identity, per_aln_len, max_hits, sensitive, high_mem, tmp_dir, output_dir):
    """Perform similarity search of query genes against target genes, and reciprocal hits.

    Query proteins are searched against the target proteins. Every target
    protein hit at least once is written to 'target_genes_hit.faa' and
    searched back against the query proteins. Both hit tables are then
    concatenated and handed to self._sort_hit_table, which is given
    'hits_sorted.tsv' in the output directory as its destination.

    Parameters
    ----------
    query_gene_file : str
        File with all query proteins.
    target_gene_file : str
        File with all target proteins.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : bool
        Passed unchanged to Diamond.blastp (presumably selects DIAMOND's
        sensitive mode -- confirm against the Diamond wrapper).
    high_mem : bool
        If true, blastp is called with chunk_size=1 and block_size=8.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """
    import shutil

    def new_hits_table():
        # reserve an empty temporary file for a hit table and return its
        # path; a falsy tmp_dir falls back to the system default location
        handle = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                             dir=tmp_dir or None,
                                             delete=False)
        handle.close()
        return handle.name

    # extra blastp settings used only in high memory mode
    blast_opts = {'chunk_size': 1, 'block_size': 8} if high_mem else {}

    self.logger.info(
        'Creating DIAMOND database of query proteins (be patient!).')
    diamond = Diamond(self.cpus)
    query_diamond_db = os.path.join(output_dir, 'query_genes')
    diamond.make_database(query_gene_file, query_diamond_db)

    self.logger.info(
        'Creating DIAMOND database of target proteins (be patient!).')
    target_diamond_db = os.path.join(output_dir, 'target_genes')
    diamond.make_database(target_gene_file, target_diamond_db)

    # blast query genes against target proteins
    self.logger.info(
        'Performing similarity sequence between query and target proteins (be patient!).'
    )
    query_hits_table = new_hits_table()
    diamond.blastp(query_gene_file, target_diamond_db, evalue,
                   per_identity, per_aln_len, max_hits, sensitive,
                   query_hits_table, 'standard', tmp_dir, **blast_opts)

    # get target genes hit by one or more query proteins
    self.logger.info(
        'Creating file with target proteins with similarity to query proteins.'
    )
    target_hit = set()
    with open(query_hits_table) as hits:
        for line in hits:
            # second column of the hit table is the subject (target) id
            target_hit.add(line.split('\t')[1])

    target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
    with open(target_genes_hits, 'w') as fout:
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')

    self.logger.info(
        'Identified %d target proteins to be used in reciprocal search.'
        % len(target_hit))

    # perform reciprocal blast
    self.logger.info(
        'Performing reciprocal similarity sequence between target and query proteins (be patient!).'
    )
    target_hits_table = new_hits_table()
    diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                   per_identity, per_aln_len, max_hits, sensitive,
                   target_hits_table, 'standard', tmp_dir, **blast_opts)

    # combine hit tables and sort; the reciprocal table is appended
    # in-process rather than through a shell command so that paths with
    # spaces work and I/O failures raise instead of passing silently
    with open(query_hits_table, 'ab') as combined:
        with open(target_hits_table, 'rb') as reciprocal:
            shutil.copyfileobj(reciprocal, combined)
    os.remove(target_hits_table)

    # NOTE(review): the combined temporary table is not removed here;
    # confirm whether _sort_hit_table cleans it up
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(query_hits_table, hits_table_file)
def _run_self_diamond(self, query_gene_file, evalue, per_identity, per_aln_len, max_hits, sensitive, high_mem, tmp_dir, output_dir):
    """Search the query genes against a DIAMOND database built from themselves.

    The raw hit table is written to a temporary file and then handed to
    self._sort_hit_table together with the destination 'hits_sorted.tsv'
    in the output directory.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : bool
        Passed unchanged to Diamond.blastp (presumably selects DIAMOND's
        sensitive mode -- confirm against the Diamond wrapper).
    high_mem : bool
        If true, blastp is called with chunk_size=1 and block_size=8.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database (be patient!).')
    db_prefix = os.path.join(output_dir, 'query_genes')
    searcher = Diamond(self.cpus)
    searcher.make_database(query_gene_file, db_prefix)

    # reserve a temporary file for the unsorted hit table; a falsy
    # tmp_dir falls back to the system default location
    raw_hits = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                           dir=tmp_dir or None,
                                           delete=False)
    raw_hits.close()

    # search every gene against the database built from the same genes
    self.logger.info(
        'Performing self similarity sequence between genomes (be patient!).'
    )
    extra_opts = {}
    if high_mem:
        extra_opts['chunk_size'] = 1
        extra_opts['block_size'] = 8
    searcher.blastp(query_gene_file, db_prefix, evalue, per_identity,
                    per_aln_len, max_hits, sensitive, raw_hits.name,
                    'standard', tmp_dir, **extra_opts)

    # produce the sorted hit table in the output directory
    sorted_hits = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(raw_hits.name, sorted_hits)
def _run_reciprocal_diamond(self, query_gene_file, target_gene_file, evalue, per_identity, per_aln_len, max_hits, sensitive, high_mem, tmp_dir, output_dir):
    """Perform similarity search of query genes against target genes, and reciprocal hits.

    Query proteins are searched against the target proteins. Every target
    protein hit at least once is written to 'target_genes_hit.faa' and
    searched back against the query proteins. Both hit tables are then
    concatenated and handed to self._sort_hit_table, which is given
    'hits_sorted.tsv' in the output directory as its destination.

    Parameters
    ----------
    query_gene_file : str
        File with all query proteins.
    target_gene_file : str
        File with all target proteins.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : bool
        Passed unchanged to Diamond.blastp (presumably selects DIAMOND's
        sensitive mode -- confirm against the Diamond wrapper).
    high_mem : bool
        If true, blastp is called with chunk_size=1 and block_size=8.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """
    import shutil

    def reserve_hits_table():
        # create an empty temporary hit table and return its path; a
        # falsy tmp_dir falls back to the system default location
        handle = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                             dir=tmp_dir or None,
                                             delete=False)
        handle.close()
        return handle.name

    # extra blastp settings used only in high memory mode
    blast_opts = {'chunk_size': 1, 'block_size': 8} if high_mem else {}

    self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
    diamond = Diamond(self.cpus)
    query_diamond_db = os.path.join(output_dir, 'query_genes')
    diamond.make_database(query_gene_file, query_diamond_db)

    self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
    target_diamond_db = os.path.join(output_dir, 'target_genes')
    diamond.make_database(target_gene_file, target_diamond_db)

    # blast query genes against target proteins
    self.logger.info('Performing similarity sequence between query and target proteins (be patient!).')
    query_hits_table = reserve_hits_table()
    diamond.blastp(query_gene_file, target_diamond_db, evalue,
                   per_identity, per_aln_len, max_hits, sensitive,
                   query_hits_table, 'standard', tmp_dir, **blast_opts)

    # get target genes hit by one or more query proteins
    self.logger.info('Creating file with target proteins with similarity to query proteins.')
    target_hit = set()
    with open(query_hits_table) as hits:
        for line in hits:
            # second column of the hit table is the subject (target) id
            target_hit.add(line.split('\t')[1])

    target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
    with open(target_genes_hits, 'w') as fout:
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')

    self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))

    # perform reciprocal blast
    self.logger.info('Performing reciprocal similarity sequence between target and query proteins (be patient!).')
    target_hits_table = reserve_hits_table()
    diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                   per_identity, per_aln_len, max_hits, sensitive,
                   target_hits_table, 'standard', tmp_dir, **blast_opts)

    # combine hit tables and sort; the reciprocal table is appended
    # in-process rather than through a shell command so that paths with
    # spaces work and I/O failures raise instead of passing silently
    with open(query_hits_table, 'ab') as combined:
        with open(target_hits_table, 'rb') as reciprocal:
            shutil.copyfileobj(reciprocal, combined)
    os.remove(target_hits_table)

    # NOTE(review): the combined temporary table is not removed here;
    # confirm whether _sort_hit_table cleans it up
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(query_hits_table, hits_table_file)
def _run_self_diamond(self, query_gene_file, evalue, per_identity, per_aln_len, max_hits, sensitive, high_mem, tmp_dir, output_dir):
    """Run a DIAMOND search of the query genes against themselves.

    A database is built from the query genes, the same genes are searched
    against it into a temporary hit table, and that table is passed to
    self._sort_hit_table with 'hits_sorted.tsv' in the output directory
    as the destination.

    Parameters
    ----------
    query_gene_file : str
        File with all query sequences.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequences.
    sensitive : bool
        Passed unchanged to Diamond.blastp (presumably selects DIAMOND's
        sensitive mode -- confirm against the Diamond wrapper).
    high_mem : bool
        If true, blastp is called with chunk_size=1 and block_size=8.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database (be patient!).')
    gene_db = os.path.join(output_dir, 'query_genes')
    aligner = Diamond(self.cpus)
    aligner.make_database(query_gene_file, gene_db)

    # temporary file that receives the unsorted hits; only pass a
    # directory to tempfile when one was supplied
    tmp_settings = {'prefix': 'comparem_hits_', 'delete': False}
    if tmp_dir:
        tmp_settings['dir'] = tmp_dir
    unsorted_hits = tempfile.NamedTemporaryFile(**tmp_settings)
    unsorted_hits.close()

    # search all genes against the database
    self.logger.info('Performing self similarity sequence between genomes (be patient!).')
    search_args = (query_gene_file, gene_db, evalue, per_identity,
                   per_aln_len, max_hits, sensitive, unsorted_hits.name,
                   'standard', tmp_dir)
    if not high_mem:
        aligner.blastp(*search_args)
    else:
        aligner.blastp(*search_args, chunk_size=1, block_size=8)

    # sort hit table
    destination = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(unsorted_hits.name, destination)
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
    """Create taxonomic profiles for a set of genomes.

    Scaffold genes are searched against the reference genomes of interest
    and against a database of competing reference genes. Genes reported
    by self._top_hits_to_reference are grouped by scaffold and summary
    statistics for each scaffold are written to 'references.tsv' in
    self.output_dir.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold of valid hits.
    per_identity : float
        Percent identity threshold of valid hits [0,100].
    per_aln_len : float
        Percent query coverage of valid hits [0, 100].

    Returns
    -------
    str
        Path to the table with summary statistics for each scaffold.
    """

    def count_summary(counts):
        # comma-separated 'id:count' entries, most frequent first
        ranked = sorted(counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        return ','.join(item_id + ':' + str(num_hits)
                        for item_id, num_hits in ranked)

    # read statistics file
    self.logger.info('Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, hits_ref_genomes)

    self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; the scaffold id is the gene
    # id with everything from its last underscore removed
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold; query ids carry a '~'-separated suffix
    # that is stripped before the scaffold id is derived
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.iteritems():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.iteritems():
            # per-hit values are collected under plural names so the
            # evalue threshold parameter is not shadowed
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                # subject ids have the form '<bin id>~<gene id>'
                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            subject_bin_id_str = count_summary(subject_bin_ids)
            subject_scaffold_id_str = count_summary(subject_scaffold_ids)

            # NOTE(review): print_stats is assumed to supply the
            # 'Genome ID', 'Length (bp)' and 'GC' columns as one
            # tab-separated string -- confirm against ScaffoldStats
            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_bin_id_str,
                subject_scaffold_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
    """Create taxonomic profiles for a set of genomes.

    Scaffold genes are searched against the reference genomes of interest
    and against a database of competing reference genes. Genes reported
    by self._top_hits_to_reference are grouped by scaffold and summary
    statistics for each scaffold are written to 'references.tsv' in
    self.output_dir.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold used by blast.
    per_identity: float
        Percent identity threshold used by blast.

    Returns
    -------
    str
        Path to the table with summary statistics for each scaffold.
    """

    # read statistics file
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('')
    self.logger.info(' Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    # each search output ('<prefix>.daa') is converted to a flat table
    # with diamond.view
    self.logger.info(' Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

    self.logger.info(' Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; the scaffold id is the gene
    # id with everything from its last underscore removed
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold; query ids carry a '~'-separated suffix
    # that is stripped before the scaffold id is derived
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.iteritems():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.iteritems():
            # per-hit values are collected under plural names so the
            # evalue threshold parameter is not shadowed
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                # subject ids have the form '<gene id>~<bin id>'
                subject_gene_id, subject_bin_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            # 'id:count' entries in dictionary iteration order
            subject_scaffold_id_str = ','.join(
                [ref_scaffold_id + ':' + str(num_hits)
                 for ref_scaffold_id, num_hits in subject_scaffold_ids.iteritems()])

            subject_bin_id_str = ','.join(
                [bin_id + ':' + str(num_hits)
                 for bin_id, num_hits in subject_bin_ids.iteritems()])

            # NOTE(review): print_stats is assumed to supply the
            # 'Genome id', 'Length (bp)' and 'GC' columns as one
            # tab-separated string -- confirm against ScaffoldStats
            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_scaffold_id_str,
                subject_bin_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out