def aai(self, options):
    """Run the AAI command on a pre-computed, sorted hit table.

    Checks that the sorted hit table exists, makes sure the output
    directory is present, delegates the calculation to ``AAICalculator``
    and logs where the resulting files were written.

    Parameters
    ----------
    options : object
        Parsed command-line options. The attributes read here are
        ``sorted_hit_table``, ``output_dir``, ``cpus``, ``query_gene_file``,
        ``evalue``, ``per_identity``, ``per_aln_len`` and ``keep_rbhs``.
    """
    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    calculator = AAICalculator(options.cpus)
    written_files = calculator.run(
        options.query_gene_file,
        None,  # second positional argument is deliberately left unset
        options.sorted_hit_table,
        options.evalue,
        options.per_identity,
        options.per_aln_len,
        options.keep_rbhs,
        options.output_dir,
    )
    aai_path, rbh_path = written_files

    # The RBH path is only reported when the calculator returned a truthy one.
    if rbh_path:
        self.logger.info('Identified reciprocal best hits written to: %s' % rbh_path)
    self.logger.info('AAI between genomes written to: %s' % aai_path)
def aai(self, options):
    """AAI command: compute AAI from a sorted hit table and log the outputs.

    Parameters
    ----------
    options : object
        Parsed command-line options providing ``sorted_hit_table``,
        ``output_dir``, ``cpus``, ``query_gene_file``, ``evalue``,
        ``per_identity``, ``per_aln_len`` and ``keep_rbhs``.
    """
    # Validate the input and prepare the output location before any work.
    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    engine = AAICalculator(options.cpus)
    aai_file, rbh_file = engine.run(
        options.query_gene_file,
        None,
        options.sorted_hit_table,
        options.evalue,
        options.per_identity,
        options.per_aln_len,
        options.keep_rbhs,
        options.output_dir,
    )

    log = self.logger
    if rbh_file:
        # Only mentioned when the calculator handed back a truthy RBH path.
        log.info('Identified reciprocal best hits written to: %s' % rbh_file)
    log.info('AAI between genomes written to: %s' % aai_file)
def aai(self, options):
    """AAI command: calculate the AAI between homologs in genome pairs.

    Collects genome identifiers from the ``.faa`` files found in the
    ``genes`` sub-directory of ``options.rblast_dir``, runs
    ``AAICalculator`` over them and logs where the shared-gene results
    were written.

    Parameters
    ----------
    options : object
        Parsed command-line options. The attributes read here are
        ``rblast_dir``, ``output_dir``, ``cpus``, ``per_identity``,
        ``per_aln_len`` and ``write_shared_genes``.

    Raises
    ------
    SystemExit
        With status 1 when no ``.faa`` gene files are found.
    """
    banner = '*******************************************************************************'
    self.logger.info('')
    self.logger.info(banner)
    self.logger.info(' [CompareM - aai] Calculating the AAI between homologs in genome pairs.')
    self.logger.info(banner)
    self.logger.info('')

    check_dir_exists(options.rblast_dir)
    make_sure_path_exists(options.output_dir)

    # A genome is identified by the name of its gene file minus '.faa'.
    protein_dir = os.path.join(options.rblast_dir, 'genes')
    genome_ids = [remove_extension(f, '.faa')
                  for f in os.listdir(protein_dir)
                  if f.endswith('.faa')]

    if not genome_ids:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        # Abort with a non-zero status so callers and pipelines can tell that
        # nothing was computed; a bare sys.exit() would report success.
        sys.exit(1)

    aai_calculator = AAICalculator(options.cpus)
    aai_calculator.run(genome_ids,
                       protein_dir,
                       options.rblast_dir,
                       options.per_identity,
                       options.per_aln_len,
                       options.write_shared_genes,
                       options.output_dir)

    shared_genes_dir = os.path.join(options.output_dir, aai_calculator.shared_genes)
    self.logger.info('')
    self.logger.info(' Identified homologs between genome pairs written to: %s' % shared_genes_dir)

    self.time_keeper.print_time_stamp()
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        num_top_targets, taxonomy_file, keep_rbhs, output_dir):
    """Classify genomes based on AAI to reference genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str
        File with all target genes in FASTA format.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    num_top_targets : int
        Number of top scoring target genomes to report per query genome.
    taxonomy_file : str
        File indicating taxonomic identification of all target genomes
        (one tab-separated ``genome_id``/``taxonomy`` pair per line).
        May be empty or None, in which case no taxonomy is reported.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    str
        Path to the classification table, ``classify.tsv`` in ``output_dir``.
    """
    # read taxonomic identification of each genome
    taxonomy = {}
    if taxonomy_file:
        with open(taxonomy_file) as tax_in:
            for line in tax_in:
                genome_id, taxa_str = line.rstrip().split('\t')
                taxonomy[genome_id] = taxa_str

    # calculate AAI between query and target genomes
    aai_output_dir = os.path.join(output_dir, 'aai')
    make_sure_path_exists(aai_output_dir)

    aai_calculator = AAICalculator(self.cpus)
    aai_calculator.run(query_gene_file,
                       target_gene_file,
                       sorted_hit_table,
                       evalue_threshold,
                       per_iden_threshold,
                       per_aln_len_threshold,
                       keep_rbhs,
                       aai_output_dir)

    # determine matches to each query genome; the summary is read from a
    # fixed file name inside the AAI output directory
    aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
    hits = defaultdict(list)
    with open(aai_results_file) as aai_in:
        aai_in.readline()  # skip the header line
        for line in aai_in:
            line_split = line.rstrip().split('\t')
            query_id = line_split[0]
            target_id = line_split[2]
            aai = float(line_split[5])
            of = float(line_split[7])
            hits[query_id].append([target_id, aai, of])

    # report top matches
    results_file = os.path.join(output_dir, 'classify.tsv')
    # Clamp at zero so a negative count reports nothing instead of slicing
    # from the end of the list.
    num_to_report = max(num_top_targets, 0)
    with open(results_file, 'w') as fout:
        fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
        if taxonomy:
            fout.write('\tTarget Taxonomy')
        fout.write('\n')

        for query_id, cur_hits in hits.items():
            # rank targets by AAI, highest first
            cur_hits.sort(key=lambda x: x[1], reverse=True)
            for target_id, aai, of in cur_hits[:num_to_report]:
                fout.write('%s\t%s\t%.2f\t%.2f' % (query_id, target_id, aai, of))
                fout.write('\t%.2f' % (aai + of))

                # NOTE(review): targets absent from the taxonomy file get no
                # taxonomy field, so those rows are one column shorter than
                # the header.
                if target_id in taxonomy:
                    fout.write('\t%s' % taxonomy[target_id])

                fout.write('\n')

    return results_file
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        num_top_targets, taxonomy_file, keep_rbhs, output_dir):
    """Classify genomes based on AAI to reference genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str
        File with all target genes in FASTA format.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    num_top_targets : int
        Number of top scoring target genomes to report per query genome.
    taxonomy_file : str
        File indicating taxonomic identification of all target genomes
        (one tab-separated ``genome_id``/``taxonomy`` pair per line).
        May be empty or None, in which case no taxonomy is reported.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    str
        Path to the classification table, ``classify.tsv`` in ``output_dir``.
    """
    # read taxonomic identification of each genome
    taxonomy = {}
    if taxonomy_file:
        with open(taxonomy_file) as taxonomy_handle:
            for line in taxonomy_handle:
                genome_id, taxa_str = line.rstrip().split('\t')
                taxonomy[genome_id] = taxa_str

    # calculate AAI between query and target genomes
    aai_output_dir = os.path.join(output_dir, 'aai')
    make_sure_path_exists(aai_output_dir)

    aai_calculator = AAICalculator(self.cpus)
    aai_calculator.run(query_gene_file,
                       target_gene_file,
                       sorted_hit_table,
                       evalue_threshold,
                       per_iden_threshold,
                       per_aln_len_threshold,
                       keep_rbhs,
                       aai_output_dir)

    # determine matches to each query genome; the summary is read from a
    # fixed file name inside the AAI output directory
    aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
    hits = defaultdict(list)
    with open(aai_results_file) as summary_handle:
        summary_handle.readline()  # skip the header line
        for line in summary_handle:
            line_split = line.rstrip().split('\t')
            query_id = line_split[0]
            target_id = line_split[2]
            aai = float(line_split[5])
            of = float(line_split[7])
            hits[query_id].append([target_id, aai, of])

    # report top matches
    results_file = os.path.join(output_dir, 'classify.tsv')
    # Clamp at zero so a negative count reports nothing instead of slicing
    # from the end of the list.
    report_limit = max(num_top_targets, 0)
    with open(results_file, 'w') as fout:
        fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
        if taxonomy:
            fout.write('\tTarget Taxonomy')
        fout.write('\n')

        for query_id, cur_hits in hits.items():
            # rank targets by AAI, highest first
            cur_hits.sort(key=lambda x: x[1], reverse=True)
            for target_id, aai, of in cur_hits[:report_limit]:
                fout.write('%s\t%s\t%.2f\t%.2f' % (query_id, target_id, aai, of))
                fout.write('\t%.2f' % (aai + of))

                # NOTE(review): targets absent from the taxonomy file get no
                # taxonomy field, so those rows are one column shorter than
                # the header.
                if target_id in taxonomy:
                    fout.write('\t%s' % taxonomy[target_id])

                fout.write('\n')

    return results_file