def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
    """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.

    Sequences from every file in `gene_files` are written, in order, to a
    single FASTA file. The genome identifier is the value returned by
    `remove_extension` for the gene file, with a trailing '_genes'
    removed when present.

    Parameters
    ----------
    gene_files : list of str
        Genes in fasta files to modify.
    keep_headers : boolean
        If True, indicates FASTA headers already have the format
        <genome_id>~<gene_id> and sequence identifiers are written as-is.
    output_file : str
        Name of FASTA file to contain modified genes.
    """
    suffix = '_genes'

    # the context manager guarantees the output file is closed even if
    # reading one of the gene files raises part-way through
    with open(output_file, 'w') as fout:
        for gf in gene_files:
            genome_id = remove_extension(gf)
            if genome_id.endswith(suffix):
                genome_id = genome_id[:-len(suffix)]

            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                if keep_headers:
                    header_id = seq_id
                else:
                    header_id = genome_id + '~' + seq_id

                fout.write('>' + header_id + ' ' + annotation + '\n')
                fout.write(seq + '\n')
def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
    """Write genes to output stream.

    Genomes whose gene file (<gene_dir>/<genome_id>.faa) is missing or
    empty are reported on standard output and skipped.

    Parameters
    ----------
    gene_out : stream
        Output stream.
    gene_dir : str
        Directory containing called genes in amino acid space.
    genome_list : iterable
        Genomes to process.
    taxonomy : object
        Not used by this method; kept so existing callers keep working.
    genes_to_ignore : set
        Genes which should not be written to file.

    Returns
    -------
    int
        Number of genes written to the output stream.
    """
    genes_kept = 0
    for genome_id in genome_list:
        genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')

        # print(...) with a single argument behaves identically under
        # Python 2 and Python 3, unlike the print statement
        if not os.path.exists(genome_gene_file):
            print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
            continue

        if os.stat(genome_gene_file).st_size == 0:
            print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
            continue

        for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
            if gene_id in genes_to_ignore:
                continue

            gene_out.write('>' + gene_id + ' ' + annotation + '\n')
            gene_out.write(seq + '\n')
            genes_kept += 1

    return genes_kept
def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
    """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.

    All input files are concatenated into `output_file`. For each input
    file the genome identifier is obtained from `remove_extension`, and a
    trailing '_genes' is dropped from it if present.

    Parameters
    ----------
    gene_files : list of str
        Genes in fasta files to modify.
    keep_headers : boolean
        If True, indicates FASTA headers already have the format
        <genome_id>~<gene_id>; identifiers are then copied unchanged.
    output_file : str
        Name of FASTA file to contain modified genes.
    """
    genes_suffix = '_genes'

    # 'with' closes the output file on every exit path, including errors
    # raised while parsing an input file
    with open(output_file, 'w') as fout:
        for gf in gene_files:
            genome_id = remove_extension(gf)
            if genome_id.endswith(genes_suffix):
                genome_id = genome_id[:-len(genes_suffix)]

            # prefix is empty when the headers already carry the genome id
            prefix = '' if keep_headers else genome_id + '~'

            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + prefix + seq_id + ' ' + annotation + '\n')
                fout.write(seq + '\n')
def amend_gene_identifies(self, gene_dir, output_dir):
    """Modify gene ids to include source genome id.

    The following format is used:
      <genome_id>~<gene_id>

    One output file, <output_dir>/<genome_id>.faa, is written for every
    entry in `gene_dir`. A single trailing asterisk is removed from each
    sequence.

    Parameters
    ----------
    gene_dir : str
        Directory with fasta files containing protein sequences.
    output_dir : str
        Directory to contain modified fasta files.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for f in os.listdir(gene_dir):
        gf = os.path.join(gene_dir, f)
        genome_id = remove_extension(gf)

        aa_file = os.path.join(output_dir, genome_id + '.faa')
        # 'with' closes the file even if the input cannot be parsed
        with open(aa_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>%s~%s %s\n' % (genome_id, seq_id, annotation))

                # endswith() is safe for an empty sequence, unlike seq[-1]
                if seq.endswith('*'):
                    seq = seq[:-1]

                fout.write(seq + '\n')
def amend_gene_identifies(self, gene_dir, output_dir):
    """Modify gene ids to include source genome id.

    The following format is used:
      <gene_id>~<genome_id>

    Every entry in `gene_dir` is rewritten to
    <output_dir>/<genome_id>.faa, where the genome id is the value
    returned by `remove_extension`. A single trailing asterisk is removed
    from each sequence.

    Parameters
    ----------
    gene_dir : str
        Directory with fasta files containing protein sequences.
    output_dir : str
        Directory to contain modified fasta files.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for f in os.listdir(gene_dir):
        gf = os.path.join(gene_dir, f)
        genome_id = remove_extension(gf)

        aa_file = os.path.join(output_dir, genome_id + '.faa')
        # the context manager releases the handle on every exit path
        with open(aa_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')

                # endswith() does not raise on an empty sequence
                if seq.endswith('*'):
                    seq = seq[:-1]

                fout.write(seq + '\n')
def img_gene_id_to_scaffold_id(self, genome_dir, genome_id, output_dir):
    """Modify IMG gene ids to format which explicitly gives scaffold names.

    For downstream processing it is often necessary to know which scaffold
    a gene is contained on. IMG uses unique identifiers for genes. As such,
    these are changed to the following format:
      <scaffold_id>_<gene #> <annotation> [IMG gene id]

    Gene numbers are assigned per scaffold, in the order features with a
    non-empty attributes column appear in <genome_dir>/<genome_id>.gff.
    Genes are read from <genome_dir>/<genome_id>.genes.faa and written to
    <output_dir>/<genome_id>.faa.

    Parameters
    ----------
    genome_dir : str
        Directory with files for genome.
    genome_id : str
        Unique identifier of genome.
    output_dir : str
        Directory to contain modified fasta files.

    Raises
    ------
    KeyError
        If a gene in the FASTA file has no corresponding entry in the GFF file.
    """
    # determine source scaffold for each gene
    gene_id_to_scaffold_id = {}
    gene_number = defaultdict(int)
    with open(os.path.join(genome_dir, genome_id + '.gff')) as gff:
        for line in gff:
            if not line.strip() or line[0] == '#':
                continue

            # strip the line ending first; otherwise the last column is never
            # the empty string and identifiers can carry a trailing newline
            line_split = line.rstrip('\r\n').split('\t')
            scaffold_id = line_split[0]
            info = line_split[8]
            if info != '':  # this will be empty for non-protein coding genes
                gene_id = info.split(';')[0].replace('ID=', '')

                gene_number[scaffold_id] += 1
                gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

    # write out gene file with modified identifiers
    genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

    with open(os.path.join(output_dir, genome_id + '.faa'), 'w') as fout:
        for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
            # remove additional gene id from annotation
            annotation = annotation[annotation.find(' ') + 1:]
            # append IMG gene id for future reference
            annotation += ' [IMG Gene ID: ' + gene_id + ']'

            fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
            fout.write(seq + '\n')
def reformat_gene_id_to_scaffold_id(self, gene_file, gff_file, taxonomy, output_file):
    """Reformat gene ids to format which explicitly gives scaffold names.

    Each FASTA header is written as:
      <scaffold_id>_<gene_#> [taxonomy] [none] [annotation]

    where the taxonomy field is the semicolon-joined entry of `taxonomy`
    for the genome ('none' if the genome is absent) and the third field is
    always the literal string 'none'. Gene numbers are assigned per
    scaffold, in the order features with a non-empty attributes column
    appear in the GFF file.

    Parameters
    ----------
    gene_file : str
        Gene file for genome.
    gff_file : str
        General feature file (GFF) for genome.
    taxonomy : dict
        d[genome_id] -> list of taxon names; the genome id is the value
        returned by `remove_extension` for `gene_file`.
    output_file : str
        File to contain modified gene fasta file.

    Raises
    ------
    KeyError
        If a gene in the FASTA file has no corresponding entry in the GFF file.
    """
    # determine source scaffold for each gene
    gene_id_to_scaffold_id = {}
    gene_number = defaultdict(int)
    with open(gff_file) as gff:
        for line in gff:
            if line.startswith('##FASTA'):
                # start of FASTA section with individual sequences
                break

            if not line.strip() or line[0] == '#':
                continue

            # strip the line ending first; otherwise the last column is never
            # the empty string and identifiers can carry a trailing newline
            line_split = line.rstrip('\r\n').split('\t')
            scaffold_id = line_split[0]
            info = line_split[8]
            if info != '':  # this will be empty for non-protein coding genes
                gene_id = info.split(';')[0].replace('ID=', '')

                gene_number[scaffold_id] += 1
                gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

    # the genome id and its taxonomy string are the same for every gene
    genome_id = remove_extension(gene_file)
    taxonomy_str = ';'.join(taxonomy.get(genome_id, ['none']))

    # write out gene file with modified identifiers
    with open(output_file, 'w') as fout:
        for gene_id, seq, annotation in seq_io.read_fasta_seq(gene_file, keep_annotation=True):
            fout.write('>%s [%s] [%s] [%s]\n' % (gene_id_to_scaffold_id[gene_id],
                                                 taxonomy_str,
                                                 'none',
                                                 annotation))
            fout.write(seq + '\n')
def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
    """Write genes to output stream.

    Genomes whose gene file (<gene_dir>/<genome_id>.faa) is missing or
    empty are reported on standard output and skipped. Annotations are
    restricted to characters in `string.printable`, and a single leading
    hyphen is removed from each sequence.

    Parameters
    ----------
    gene_out : stream
        Output stream.
    gene_dir : str
        Directory containing called genes in amino acid space.
    genome_list : iterable
        Genomes to process.
    taxonomy : object
        Not used by this method; kept so existing callers keep working.
    genes_to_ignore : set
        Genes which should not be written to file.

    Returns
    -------
    int
        Number of genes written to the output stream.
    """
    printable = set(string.printable)

    genes_kept = 0
    for genome_id in genome_list:
        genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')

        # print(...) with a single argument behaves identically under
        # Python 2 and Python 3, unlike the print statement
        if not os.path.exists(genome_gene_file):
            print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
            continue

        if os.stat(genome_gene_file).st_size == 0:
            print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
            continue

        for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
            if gene_id in genes_to_ignore:
                continue

            # IMG headers sometimes contain non-ascii characters which cause
            # problems with BLAST and DIAMOND so they are explicitly filtered
            # out; join() is used because filter() returns an iterator, not a
            # string, under Python 3
            annotation = ''.join(ch for ch in annotation if ch in printable)

            # a few IMG genomes contain protein sequences which start with a
            # hyphen; startswith() is safe for an empty sequence
            if seq.startswith('-'):
                seq = seq[1:]

            gene_out.write('>' + gene_id + ' ' + annotation + '\n')
            gene_out.write(seq + '\n')
            genes_kept += 1

    return genes_kept
def rblast(self, options):
    """Reciprocal blast command.

    Collects the protein files in `options.protein_dir` that end with
    `options.protein_ext`, rewrites them with gene identifiers of the form
    <gene_id>~<genome_id> under <output_dir>/genes, and runs a reciprocal
    search with blastp (when `options.blastp` is set) or diamond.

    Parameters
    ----------
    options : object
        Parsed command-line options. The attributes read here are
        protein_dir, protein_ext, output_dir, blastp, cpus, evalue and,
        for diamond, per_identity.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
    self.logger.info('*******************************************************************************')

    check_dir_exists(options.protein_dir)
    make_sure_path_exists(options.output_dir)

    aa_gene_files = [os.path.join(options.protein_dir, f)
                     for f in os.listdir(options.protein_dir)
                     if f.endswith(options.protein_ext)]

    if not aa_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    # modify gene ids to include genome ids in order to ensure
    # all gene identifiers are unique across the set of genomes,
    # also removes the trailing asterisk used to identify the stop
    # codon
    self.logger.info('')
    self.logger.info(' Appending genome identifiers to all gene identifiers.')
    gene_out_dir = os.path.join(options.output_dir, 'genes')
    make_sure_path_exists(gene_out_dir)
    modified_aa_gene_files = []
    for gf in aa_gene_files:
        genome_id = remove_extension(gf)

        aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
        # 'with' closes the file even if the input cannot be parsed
        with open(aa_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')

                # endswith() is safe for an empty sequence, unlike seq[-1]
                if seq.endswith('*'):
                    seq = seq[:-1]

                fout.write(seq + '\n')

        modified_aa_gene_files.append(aa_file)

    # perform the reciprocal blast with blastp or diamond
    self.logger.info('')
    if options.blastp:
        rblast = ReciprocalBlast(options.cpus)
        rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

        # concatenate all blast tables to mimic output of diamond, all hits
        # for a given genome MUST be in consecutive order to fully mimic
        # the expected results from diamond
        self.logger.info('')
        self.logger.info(' Creating single file with all blast hits (be patient!).')
        blast_files = sorted([f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv')])
        hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
        concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
    else:
        rdiamond = ReciprocalDiamond(options.cpus)
        rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

    self.logger.info('')
    self.logger.info(' Reciprocal blast hits written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Gene identifiers are expected to have the form <genome_id>~<gene_id>;
    the genome id is taken as the text before the first '~'.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or
        None if performing a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    (str, str or None) or None
        Path of the AAI summary file and path of the concatenated RBH
        file (None unless `keep_rbhs` is set). Nothing is returned when
        no genome pairs are identified.
    """
    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}

    def tally_genes(gene_file, gene_count):
        """Record gene lengths and per-genome gene counts; return genome ids seen."""
        genome_ids = set()
        for seq_id, seq in seq_io.read_fasta_seq(gene_file):
            # a trailing asterisk is not counted towards the gene length;
            # endswith() is safe for an empty sequence, unlike seq[-1]
            if seq.endswith('*'):
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            genome_id = seq_id[0:seq_id.find('~')]
            gene_count[genome_id] += 1
            genome_ids.add(genome_id)

        return genome_ids

    self.query_gene_count = defaultdict(int)
    query_genomes = tally_genes(query_gene_file, self.query_gene_count)

    self.target_gene_count = defaultdict(int)
    target_genomes = set()
    if target_gene_file:
        target_genomes = tally_genes(target_gene_file, self.target_gene_count)
    else:
        self.target_gene_count = self.query_gene_count

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info(
            'Calculating AAI between %d query and %d target genomes:' %
            (len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between query genomes; floor division
        # keeps the pair count an integer on both Python 2 and Python 3
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info(
            'Calculating AAI between all %d pairs of genomes:' %
            self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    genome_id_lists = []
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    for i, genome_idI in enumerate(query_genomes):
        if target_genomes:
            genome_id_list = target_genomes
        else:
            # each unordered pair of query genomes is visited exactly once
            genome_id_list = query_genomes[i + 1:]

        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)
    progress_func = self._progress
    if self.logger.is_silent:
        progress_func = None
    consumer_data = parallel.run(self._producer, self._consumer,
                                 genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    # 'with' closes the summary file even if a row fails to format
    with open(aai_summay_file, 'w') as fout:
        fout.write(
            'Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n'
        )
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                     for genome_id in query_genomes]

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        for f in rbh_files:
            os.remove(f)

    return aai_summay_file, rbh_output_file
def extract_homologs_and_context(self, homologs, db_file, output_file):
    """Extract homologs sequences from database file, and local gene context.

    This function extract sequences information for each
    homolog and writes this to file for downstream processing.
    In addition, it determines the local gene context for each
    gene. Specifically, it saves the annotations for the
    3 genes prior to and after a given gene.

    This function assumes the database is sorted according
    to the order genes are identified on each contig.

    Homologs with fewer than 3 preceding or following database records
    have the missing positions filled with a placeholder entry before
    the contexts are passed to `_filter_gene_context`.

    Parameters
    ----------
    homologs : iterable
        Unique identifiers of sequences to extract
    db_file : str
        Fasta file with sequences.
    output_file : str
        File to write homologs.

    Returns
    -------
    dict
        d[seq_id] -> list of annotations for pre-context genes
    dict
        d[seq_id] -> list of annotations for post-context genes
    """
    gene_precontext = {}
    gene_postcontext = {}

    if not isinstance(homologs, set):
        homologs = set(homologs)

    if not homologs:
        return gene_precontext, gene_postcontext

    context_size = 3
    placeholder = ('unknown~unknown_x', None)

    # sliding window over the most recently read genes
    local_context = [placeholder] * context_size

    # homologs still waiting for their following genes to be read
    post_context_counter = {}

    # 'with' closes the output file even if the database cannot be parsed
    with open(output_file, 'w') as fout:
        for seq_id, seq, annotation in seq_io.read_fasta_seq(db_file, keep_annotation=True):
            if seq_id in homologs:
                fout.write('>' + seq_id + ' ' + annotation + '\n')
                fout.write(seq + '\n')

                gene_precontext[seq_id] = list(local_context)
                post_context_counter[seq_id] = context_size

            # record 3 precontext genes
            local_context = local_context[1:] + [(seq_id, annotation)]

            # record 3 postcontext genes; iterate over a copy of the keys
            # since completed entries are removed inside the loop
            for pending_id in list(post_context_counter):
                post_context_counter[pending_id] -= 1
                if post_context_counter[pending_id] == -1:
                    gene_postcontext[pending_id] = list(local_context)
                    del post_context_counter[pending_id]

    # homologs within the last 3 records never reach a count of -1, so
    # report the genes that did follow them, padded with placeholders
    for pending_id, remaining in post_context_counter.items():
        num_seen = context_size - 1 - remaining
        following = local_context[context_size - num_seen:]
        gene_postcontext[pending_id] = following + [placeholder] * (context_size - num_seen)

    # filter gene context to contain only genes on the same scaffold
    gene_precontext = self._filter_gene_context(gene_precontext)
    gene_postcontext = self._filter_gene_context(gene_postcontext)

    return gene_precontext, gene_postcontext
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Gene identifiers are expected to have the form <genome_id>~<gene_id>;
    the genome id is taken as the text before the first '~'.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or
        None if performing a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    (str, str or None) or None
        Path of the AAI summary file and path of the concatenated RBH
        file (None unless `keep_rbhs` is set). Nothing is returned when
        no genome pairs are identified.
    """
    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}

    def read_genes(gene_file, gene_count):
        """Store gene lengths, update per-genome counts, and return the genome ids found."""
        genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(gene_file):
            gene_len = len(seq)
            # a trailing asterisk is not counted towards the gene length;
            # endswith() does not raise on an empty sequence
            if seq.endswith('*'):
                gene_len -= 1
            self.gene_lengths[seq_id] = gene_len

            genome_id = seq_id[0:seq_id.find('~')]
            gene_count[genome_id] += 1
            genomes.add(genome_id)

        return genomes

    self.query_gene_count = defaultdict(int)
    query_genomes = read_genes(query_gene_file, self.query_gene_count)

    self.target_gene_count = defaultdict(int)
    target_genomes = set()
    if target_gene_file:
        target_genomes = read_genes(target_gene_file, self.target_gene_count)
    else:
        self.target_gene_count = self.query_gene_count

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info('Calculating AAI between %d query and %d target genomes:' % (len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between query genomes; floor division
        # keeps the pair count an integer rather than a float
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    genome_id_lists = []
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    for i, genome_idI in enumerate(query_genomes):
        if target_genomes:
            genome_id_list = target_genomes
        else:
            # pair each query genome only with those after it so every
            # unordered pair is processed once
            genome_id_list = query_genomes[i + 1:]

        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)
    progress_func = self._progress
    if self.logger.is_silent:
        progress_func = None
    consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    # the context manager closes the summary file on every exit path
    with open(aai_summay_file, 'w') as fout:
        fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                     for genome_id in query_genomes]

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        for f in rbh_files:
            os.remove(f)

    return aai_summay_file, rbh_output_file