def _producer_blast(self, genome_pair):
    """Apply reciprocal blast to a pair of genomes.

    Parameters
    ----------
    genome_pair : list
        Amino acid gene files of the two genomes to process.
    """

    blast = Blast(cpus=self.producer_cpus)

    aa_gene_fileA, aa_gene_fileB = genome_pair

    genome_idA = remove_extension(aa_gene_fileA)
    genome_idB = remove_extension(aa_gene_fileB)

    dbA = os.path.join(self.output_dir, genome_idA + '.db')
    dbB = os.path.join(self.output_dir, genome_idB + '.db')

    output_fileAB = os.path.join(self.output_dir, genome_idA + '-' + genome_idB + '.blastp.tsv')
    blast.blastp(aa_gene_fileA, dbB, output_fileAB, self.evalue)

    output_fileBA = os.path.join(self.output_dir, genome_idB + '-' + genome_idA + '.blastp.tsv')
    blast.blastp(aa_gene_fileB, dbA, output_fileBA, self.evalue)

    return True
def run(self, bam_files, out_file, all_reads, min_align_per, max_edit_dist_per):
    """Calculate coverage of sequences for each BAM file."""

    # make sure all BAM files are sorted and indexed
    for bam_file in bam_files:
        if not os.path.exists(bam_file + '.bai'):
            self.logger.error(' [Error] BAM file is not sorted or is missing an index: ' + bam_file + '\n')
            sys.exit()

    # calculate coverage of each BAM file
    coverage_info = {}
    for i, bam_file in enumerate(bam_files):
        self.logger.info('')
        self.logger.info(' Calculating coverage profile for %s (%d of %d):' % (ntpath.basename(bam_file), i + 1, len(bam_files)))

        coverage_info[bam_file] = mp.Manager().dict()
        coverage_info[bam_file] = self._process_bam(bam_file, all_reads, min_align_per, max_edit_dist_per, coverage_info[bam_file])

    fout = open(out_file, 'w')
    header = 'Scaffold Id\tLength (bp)'
    for bam_file in bam_files:
        bam_id = remove_extension(bam_file)
        header += '\t' + bam_id
    fout.write(header + '\n')

    first_bam = bam_files[0]
    for seq_id in coverage_info[first_bam].keys():
        row_str = seq_id + '\t' + str(coverage_info[first_bam][seq_id].seq_len)
        for bam_file in bam_files:
            row_str += '\t' + str(coverage_info[bam_file][seq_id].coverage)
        fout.write(row_str + '\n')

    fout.close()
def run(self, bam_files, out_file, all_reads, min_align_per, max_edit_dist_per):
    """Calculate coverage of sequences for each BAM file."""

    # make sure all BAM files are indexed
    for bam_file in bam_files:
        if not os.path.exists(bam_file + '.bai'):
            self.logger.error('BAM index file is missing: ' + bam_file + '.bai\n')
            sys.exit()

    # calculate coverage of each BAM file
    coverage_info = {}
    for i, bam_file in enumerate(bam_files):
        self.logger.info('Calculating coverage profile for %s (%d of %d):' % (ntpath.basename(bam_file), i + 1, len(bam_files)))

        coverage_info[bam_file] = mp.Manager().dict()
        coverage_info[bam_file] = self._process_bam(bam_file, all_reads, min_align_per, max_edit_dist_per, coverage_info[bam_file])

    fout = open(out_file, 'w')
    header = 'Scaffold Id\tLength (bp)'
    for bam_file in bam_files:
        bam_id = remove_extension(bam_file)
        header += '\t' + bam_id
    fout.write(header + '\n')

    first_bam = bam_files[0]
    for seq_id in coverage_info[first_bam].keys():
        row_str = seq_id + '\t' + str(coverage_info[first_bam][seq_id].seq_len)
        for bam_file in bam_files:
            if seq_id in coverage_info[bam_file]:
                row_str += '\t' + str(coverage_info[bam_file][seq_id].coverage)
            else:
                row_str += '\t0.0'
        fout.write(row_str + '\n')

    fout.close()
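# A minimal sketch of the coverage table layout written by run() above;
# the scaffold ids, sample names, and values below are hypothetical.
header = 'Scaffold Id\tLength (bp)\tsample1\tsample2'
rows = [('scaffold_1', 152003, [12.7, 8.3]),
        ('scaffold_2', 48210, [3.1, 0.0])]
print(header)
for seq_id, seq_len, cov in rows:
    print('\t'.join([seq_id, str(seq_len)] + [str(c) for c in cov]))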
def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = set()
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Median genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Scaffold coverage')
        genome_cov_index = headers.index('Median genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            if bin_id == cur_bin_id:
                scaffold_ids.add(scaffold_id)

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in scaffold_ids:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
    """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.

    Parameters
    ----------
    gene_files : list of str
        Genes in fasta files to modify.
    keep_headers : boolean
        If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
    output_file : str
        Name of FASTA file to contain modified genes.
    """

    fout = open(output_file, 'w')
    for gf in gene_files:
        genome_id = remove_extension(gf)
        if genome_id.endswith('_genes'):
            genome_id = genome_id[0:genome_id.rfind('_genes')]

        for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
            if keep_headers:
                fout.write('>' + seq_id + ' ' + annotation + '\n')
            else:
                fout.write('>' + genome_id + '~' + seq_id + ' ' + annotation + '\n')
            fout.write(seq + '\n')
    fout.close()
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + seq_id + '~' + genome_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
def amend_gene_identifies(self, gene_dir, output_dir):
    """Modify gene ids to include source genome id.

    The following format is used:
        <gene_id>~<genome_id>

    Parameters
    ----------
    gene_dir : str
        Directory with fasta files containing protein sequences.
    output_dir : str
        Directory to contain modified fasta files.
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for f in os.listdir(gene_dir):
        gf = os.path.join(gene_dir, f)
        genome_id = remove_extension(gf)

        aa_file = os.path.join(output_dir, genome_id + '.faa')
        fout = open(aa_file, 'w')
        for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
            fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')
        fout.close()
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + genome_id + '~' + seq_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
def _parse_fastani_results(self, fastout_file, list_leaf):
    """Parse the fastANI output file.

    Parameters
    ----------
    fastout_file : str
        fastANI output file.
    list_leaf : list
        Leaf labels of interest (not used by this method).

    Returns
    -------
    dictionary
        dict_results[user_g] = {"ref_genome": ref_genome, "ani": ani}
    """
    dict_results = {}
    with open(fastout_file) as fastfile:
        for line in fastfile:
            info = line.strip().split(" ")
            ref_genome = os.path.basename(info[1]).replace(Config.FASTANI_GENOMES_EXT, "")
            user_g = remove_extension(os.path.basename(info[0]))
            ani = float(info[2])
            if user_g in dict_results:
                print("it should not happen! (if user_g in dict_results)")
            else:
                dict_results[user_g] = {"ref_genome": ref_genome, "ani": ani}

    return dict_results
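# A self-contained sketch of the per-line parsing performed by
# _parse_fastani_results() above; the file names, extension default, and ANI
# value are hypothetical. fastANI reports query, reference, ANI, and fragment
# counts as whitespace-separated columns.
import os

def parse_fastani_line(line, genomes_ext='_genomic.fna.gz'):
    info = line.strip().split()
    ref_genome = os.path.basename(info[1]).replace(genomes_ext, '')
    user_g = os.path.splitext(os.path.basename(info[0]))[0]
    return user_g, {'ref_genome': ref_genome, 'ani': float(info[2])}

print(parse_fastani_line('bins/bin1.fna GCF_000005845.2_genomic.fna.gz 97.66 980 1020'))
# -> ('bin1', {'ref_genome': 'GCF_000005845.2', 'ani': 97.66})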
def amend_gene_identifies(self, gene_dir, output_dir):
    """Modify gene ids to include source genome id.

    The following format is used:
        <genome_id>~<gene_id>

    Parameters
    ----------
    gene_dir : str
        Directory with fasta files containing protein sequences.
    output_dir : str
        Directory to contain modified fasta files.
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for f in os.listdir(gene_dir):
        gf = os.path.join(gene_dir, f)
        genome_id = remove_extension(gf)

        aa_file = os.path.join(output_dir, genome_id + '.faa')
        fout = open(aa_file, 'w')
        for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
            fout.write('>%s~%s %s\n' % (genome_id, seq_id, annotation))
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')
        fout.close()
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine scaffolds compatible with genome
    scaffold_ids = []
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_ids.append(scaffold_id)
            bin_ids[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.iteritems():
        if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified %d compatible scaffolds.' % len(compatible_scaffolds))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added %d scaffolds meeting length criterion.' % added_seqs)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def identify(self, genome_files, evalue_threshold, concatenate_threshold, output_dir):
    """Identify 16S rRNA genes.

    Parameters
    ----------
    genome_files : iterable
        Path to genome files to process.
    evalue_threshold : float
        E-value threshold for defining valid hits.
    concatenate_threshold : int
        Concatenate hits within the specified number of base pairs.
    output_dir : str
        Output directory.

    Returns
    -------
    dict : d[genome_id][seq_id] -> information about best hit
        Information about best hits for each genome.
    """

    self.logger.info('Identifying SSU rRNA genes.')
    best_hits = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        genome_dir = os.path.join(output_dir, genome_id)
        make_sure_path_exists(genome_dir)

        # identify 16S reads from contigs/scaffolds
        self._hmm_search(genome_file, evalue_threshold, genome_dir)

        # read HMM hits
        hits_per_domain = {}
        for domain in ['archaea', 'bacteria', 'euk']:
            seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)

            hits = {}
            if len(seq_info) > 0:
                for seq_id, seq_hits in seq_info.iteritems():
                    for hit in seq_hits:
                        self._add_hit(hits, seq_id, hit, concatenate_threshold)

            hits_per_domain[domain] = hits

        # find best domain hit for each sequence
        best_hits[genome_id] = {}
        for _, hits in hits_per_domain.iteritems():
            for seq_id, info in hits.iteritems():
                if '-#' in seq_id:
                    seq_id = seq_id[0:seq_id.rfind('-#')]

                self._add_domain_hit(best_hits[genome_id], seq_id, info)

    return best_hits
def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
    """Jackknife taxa.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    outgroup_file : str
        File indicating labels of outgroup taxa.
    perc_taxa_to_keep : float
        Percentage of taxa to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    output_dir : str
        Output directory for replicate trees.
    """

    assert (model in ['wag', 'jtt'])

    self.perc_taxa_to_keep = perc_taxa_to_keep
    self.model = model

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read outgroup taxa
    self.outgroup_ids = set()
    if outgroup_file:
        for line in open(outgroup_file):
            self.outgroup_ids.add(line.strip())

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # calculate replicates
    #***self.logger.info('Calculating jackknife taxa replicates:')
    #***parallel = Parallel(self.cpus)
    #***parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

    tree_support = TreeSupport()
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
    tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

    return output_tree
def extract(self, genome_files, best_hits, output_dir):
    """Extract 16S rRNA genes.

    Parameters
    ----------
    genome_files : iterable
        Path to genome files to process.
    best_hits : d[genome_id][seq_id] -> information about best hit
        Information about best hits for each genome.
    output_dir : str
        Output directory.

    Returns
    -------
    d[genome_id] -> str
        Fasta file containing SSU sequences for each genome.
    """

    self.logger.info('Extracting SSU rRNA genes.')
    ssu_seq_files = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        genome_dir = os.path.join(output_dir, genome_id)

        if len(best_hits[genome_id]) == 0:
            continue

        # write summary file and putative SSU rRNAs to file
        summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
        summary_out = open(summary_file, 'w')
        summary_out.write('Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n')

        ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
        seq_out = open(ssu_seq_files[genome_id], 'w')

        seqs = seq_io.read(genome_file)

        for seq_id in best_hits[genome_id]:
            orig_seq_id = seq_id
            if '-#' in seq_id:
                seq_id = seq_id[0:seq_id.rfind('-#')]

            seq_info = [orig_seq_id] + best_hits[genome_id][orig_seq_id]
            seq = seqs[seq_id]
            summary_out.write('\t'.join(seq_info) + '\n')

            seq_out.write('>' + seq_info[0] + '\n')
            seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] + '\n')

        summary_out.close()
        seq_out.close()

    return ssu_seq_files
def run(self, input_tree, msa_file, num_replicates, model, base_type, frac, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    output_dir : str
        Directory for bootstrap trees.
    """

    assert (model in ['wag', 'lg', 'jtt'])
    assert (base_type in ['nt', 'prot'])

    self.model = model
    self.base_type = base_type
    self.frac = frac

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # calculate replicates
    self.logger.info('Calculating bootstrap replicates:')
    parallel = Parallel(self.cpus)
    parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support values
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine scaffolds compatible with genome
    scaffold_ids = []
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_ids.append(scaffold_id)
            bin_ids[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.iteritems():
        if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def _producer_db(self, aa_gene_file):
    """Create blast database.

    Parameters
    ----------
    aa_gene_file : str
        Fasta file with genes in amino acid space.
    """

    genome_id = remove_extension(aa_gene_file)

    blast_DB = os.path.join(self.output_dir, genome_id + '.db')
    log_file = os.path.join(self.output_dir, genome_id + '.log')

    cmd = 'makeblastdb -dbtype prot -in %s -out %s -logfile %s' % (aa_gene_file, blast_DB, log_file)
    os.system(cmd)

    return True
def unique(genome_files):
    """Check if sequences are assigned to multiple bins.

    Parameters
    ----------
    genome_files : iterable
        Path to genome fasta files.

    Returns
    -------
    dict : d[genome_id][genome_id] -> [shared sequences]
        List of any sequences within a genome observed multiple times.
    """

    # read sequence IDs from all genomes,
    # while checking for duplicate sequences within a genome
    duplicates = defaultdict(lambda: defaultdict(list))

    genome_seqs = {}
    for f in genome_files:
        genome_id = remove_extension(f)

        seq_ids = set()
        for seq_id, _seq in seq_io.read_seq(f):
            if seq_id in seq_ids:
                duplicates[genome_id][genome_id].append(seq_id)

            seq_ids.add(seq_id)

        genome_seqs[genome_id] = seq_ids

    # check for sequences assigned to multiple bins
    genome_ids = genome_seqs.keys()
    for i in xrange(0, len(genome_ids)):
        seq_idsI = genome_seqs[genome_ids[i]]

        for j in xrange(i + 1, len(genome_ids)):
            seq_idsJ = genome_seqs[genome_ids[j]]

            seq_intersection = seq_idsI.intersection(seq_idsJ)

            if len(seq_intersection) > 0:
                duplicates[genome_ids[i]][genome_ids[j]] = seq_intersection
                duplicates[genome_ids[j]][genome_ids[i]] = seq_intersection

    return duplicates
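# A hypothetical usage sketch for unique() above; the bin file names and any
# reported sequence ids are made up for illustration.
duplicates = unique(['bins/bin1.fna', 'bins/bin2.fna'])
for genome_idA, hits in duplicates.items():
    for genome_idB, seq_ids in hits.items():
        if genome_idA == genome_idB:
            print('%s contains duplicate sequences: %s' % (genome_idA, list(seq_ids)))
        else:
            print('%s and %s share sequences: %s' % (genome_idA, genome_idB, list(seq_ids)))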
def reformat_gene_id_to_scaffold_id(self, gene_file, gff_file, taxonomy, output_file):
    """Reformat gene ids to format which explicitly gives scaffold names.

    <genome_id>~<scaffold_id>_<gene_#> [gtdb_taxonomy] [NCBI organism name] [annotation]

    Parameters
    ----------
    gene_file : str
        Gene file for genome.
    gff_file : str
        General feature file (GFF) for genome.
    taxonomy : d[genome_id] -> taxonomy list
        Taxonomic assignment of each genome.
    output_file : str
        File to contain modified gene fasta file.
    """

    # determine source scaffold for each gene
    gene_id_to_scaffold_id = {}
    gene_number = defaultdict(int)
    for line in open(gff_file):
        if line.startswith('##FASTA'):
            # start of FASTA section with individual sequences
            break

        if line[0] == '#':
            continue

        line_split = line.split('\t')
        scaffold_id = line_split[0]
        info = line_split[8]
        if info != '':  # this will be empty for non-protein coding genes
            gene_id = info.split(';')[0].replace('ID=', '')

            gene_number[scaffold_id] += 1
            gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

    # write out gene file with modified identifiers
    genome_id = remove_extension(gene_file)
    fout = open(output_file, 'w')
    for gene_id, seq, annotation in seq_io.read_fasta_seq(gene_file, keep_annotation=True):
        fout.write('>%s [%s] [%s] [%s]\n' % (gene_id_to_scaffold_id[gene_id],
                                             ';'.join(taxonomy.get(genome_id, ['none'])),
                                             'none',
                                             annotation))
        fout.write(seq + '\n')
    fout.close()
def filter_bins(self, options):
    """Filter bins command"""

    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    outliers = Outliers()
    for genome_file in genome_files:
        gf = remove_extension(genome_file) + '.filtered.' + options.genome_ext
        out_genome = os.path.join(options.output_dir, gf)
        outliers.remove_outliers(genome_file, options.filter_file, out_genome, options.modified_only)

    self.logger.info('Modified genomes written to: ' + options.output_dir)
def _genome_seqs(self, genome_files):
    """Get unique id of sequences in each genome.

    Parameters
    ----------
    genome_files : iterable
        Genome files in fasta format.

    Returns
    -------
    dict : d[genome_id] -> set(seq_id1, ..., seq_idN)
        Ids of sequences in each genome.
    """

    genome_seqs = defaultdict(set)
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        for seq_id, _seq in seq_io.read_seq(genome_file):
            genome_seqs[genome_id].add(seq_id)

    return genome_seqs
def manual(self, options):
    """Manual command"""

    check_file_exists(options.cluster_file)
    check_file_exists(options.genome_file)
    make_sure_path_exists(options.output_dir)

    genome_id = remove_extension(options.genome_file)

    seqs = seq_io.read(options.genome_file)
    fout = {}
    with open(options.cluster_file) as f:
        f.readline()

        for line in f:
            line_split = line.rstrip().split('\t')

            scaffold_id = line_split[0]
            cluster_id = int(line_split[1])

            if cluster_id < 0:
                # negative values indicate scaffolds that should
                # not be placed in a cluster
                continue

            if cluster_id not in fout:
                fout[cluster_id] = open(os.path.join(options.output_dir, genome_id + '_c%d.fna' % cluster_id), 'w')

            out = fout[cluster_id]
            out.write('>' + scaffold_id + '\n')
            out.write(seqs[scaffold_id] + '\n')

    for out in fout.values():
        out.close()

    self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def aai(self, options):
    """AAI command"""
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - aai] Calculating the AAI between homologs in genome pairs.')
    self.logger.info('*******************************************************************************')
    self.logger.info('')

    check_dir_exists(options.rblast_dir)
    make_sure_path_exists(options.output_dir)

    genome_ids = []
    protein_dir = os.path.join(options.rblast_dir, 'genes')
    for f in os.listdir(protein_dir):
        if f.endswith('.faa'):
            genome_id = remove_extension(f, '.faa')
            genome_ids.append(genome_id)

    if not genome_ids:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    aai_calculator = AAICalculator(options.cpus)
    aai_calculator.run(genome_ids, protein_dir, options.rblast_dir,
                       options.per_identity, options.per_aln_len,
                       options.write_shared_genes, options.output_dir)

    shared_genes_dir = os.path.join(options.output_dir, aai_calculator.shared_genes)
    self.logger.info('')
    self.logger.info(' Identified homologs between genome pairs written to: %s' % shared_genes_dir)

    self.time_keeper.print_time_stamp()
def run(self, input_tree, msa_file, num_replicates, model, gamma, base_type, frac, boot_dir, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    gamma : boolean
        Flag indicating if gamma-distributed rates should be used.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    boot_dir : str
        Directory with previously computed bootstrap trees, or None to calculate replicates.
    output_dir : str
        Directory for bootstrap trees.
    """

    assert(model in ['wag', 'lg', 'jtt'])
    assert(base_type in ['nt', 'prot'])

    self.model = model
    self.gamma = gamma
    self.base_type = base_type
    self.frac = frac

    rep_tree_files = []
    if not boot_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, xrange(num_replicates), self._progress)

        for rep_index in xrange(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))
    else:
        for f in os.listdir(boot_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(boot_dir, f))
        self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

    # calculate support values
    self.logger.info('Calculating bootstrap support values.')
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def features(self, options):
    """Make BAM features matrix."""
    make_sure_path_exists(options.output_dir)

    reads_abundance = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[0])
    reads_normalised = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[1])
    reads_relative = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[2])
    base_abundance = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[3])
    base_normalised = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[4])
    base_relative = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[5])
    reads_count = os.path.join(options.output_dir, 'features_reads_raw_count.tsv')
    tpm_count = os.path.join(options.output_dir, 'TPM.tsv')

    features_size = {}
    raw_counts = {}
    rpk = {}
    counts = {}
    counts_base = {}

    reference = remove_extension(options.faidx, options.faidx_extension)

    self.logger.info('Get features and initialise matrix')
    with open(options.faidx) as f:
        for line in f:
            if not line.startswith('#'):
                line_list = line.rstrip().split('\t')
                features = line_list[0]
                if options.merge:
                    features = options.separator.join(features.split(options.separator)[:-1])
                if options.genome:
                    features = reference
                try:
                    features_size[features] = features_size[features] + int(line_list[1])
                except KeyError:
                    features_size[features] = int(line_list[1])
                counts[features] = 0
                counts_base[features] = 0
                raw_counts[features] = 0
                rpk[features] = 0

    counts_raw_all = []
    counts_tpm_all = []
    counts_all = []
    counts_all_normalised = []
    counts_all_relative = []
    counts_base_all = []
    counts_base_all_normalised = []
    counts_base_all_relative = []
    header = ["Features", "Features_size"]

    self.logger.info('Browse alignment file(s)')
    samtoolsexec = findEx('samtools')
    samtoolsthreads = '-@ ' + options.threads
    samtoolsminqual = '-q ' + options.mapQ
    with open(options.bam_list, 'r') as b:
        for bam in b:
            if bam.startswith('#'):
                continue
            i = 0
            alignementfile, librarysize = bam.rstrip('\n').split('\t')
            if librarysize == '' or librarysize == 0 or options.discard_library_size_normalisation:
                librarysize = 1
            samplename = remove_extension(os.path.basename(alignementfile), options.extension)
            header.append(samplename)
            self.logger.info('\t' + samplename)

            cmd = [samtoolsexec, 'view', samtoolsthreads, samtoolsminqual, alignementfile]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
            for line in p:
                try:
                    line = line.decode(sys.getdefaultencoding()).rstrip()
                except UnicodeDecodeError:
                    self.logger.error('Unable to decode alignment record: %r' % line)
                    sys.exit()

                if i > 0 and i % 1000000 == 0:
                    self.logger.info("Alignment record %s processed" % i)
                i += 1

                line_list = line.split('\t')
                features = line_list[2]
                if options.merge:
                    features = options.separator.join(features.split(options.separator)[:-1])
                if options.genome:
                    features = reference

                cigar = line_list[5]
                base_mapped = 0
                match = re.findall(r'(\d+)M', cigar)
                read_len = len(line_list[9])  # SEQ is the tenth SAM column
                for base_match in match:
                    base_mapped += int(base_match)
                if read_len == 0:
                    self.logger.info(line_list)
                    continue
                if base_mapped / read_len < float(options.id_cutoff):
                    continue

                raw_counts[features] += 1
                rpk[features] += (1 / int(features_size[features])) * 1000
                if options.discard_feature_length_normalisation:
                    counts_base[features] += base_mapped
                    counts[features] += 1
                else:
                    counts_base[features] += (base_mapped / int(features_size[features])) * options.feature_size_normalisation
                    counts[features] += (1 / int(features_size[features])) * options.feature_size_normalisation

            if options.library_size_normalisation == 'aligned':
                librarysize = sum(counts.values())
            if librarysize == 0:
                librarysize = 1

            # raw read counts (without feature-length normalisation)
            counts_raw_all.append(raw_counts.copy())

            # TPM: scale reads-per-kilobase so each sample sums to one million
            count_tmp = {}
            try:
                count_tmp = {k: v * 1000000 / total
                             for total in (sum(rpk.values()),)
                             for k, v in rpk.items()}
            except ZeroDivisionError:
                count_tmp = {k: v for k, v in counts.items()}
            counts_tpm_all.append(count_tmp.copy())

            # raw reads count
            counts_all.append(counts.copy())

            # normalised reads count
            count_tmp = {k: (v / int(librarysize)) * options.feature_normalisation
                         for k, v in counts.items()}
            counts_all_normalised.append(count_tmp.copy())

            # relative reads count
            try:
                count_tmp = {k: v / total
                             for total in (sum(counts.values()),)
                             for k, v in counts.items()}
            except ZeroDivisionError:
                count_tmp = {k: v for k, v in counts.items()}
            counts_all_relative.append(count_tmp.copy())

            # raw bases count
            counts_base_all.append(counts_base.copy())

            # normalised bases count
            count_tmp = {k: (v / int(librarysize)) * options.feature_normalisation
                         for k, v in counts_base.items()}
            counts_base_all_normalised.append(count_tmp.copy())

            # relative bases count
            try:
                count_tmp = {k: v / total
                             for total in (sum(counts_base.values()),)
                             for k, v in counts_base.items()}
            except ZeroDivisionError:
                count_tmp = {k: v for k, v in counts_base.items()}
            counts_base_all_relative.append(count_tmp.copy())

            # reset per-sample counters
            for fn in counts:
                raw_counts[fn] = 0
                counts[fn] = 0
                counts_base[fn] = 0

    self.logger.info('Print matrices')

    self.logger.info('Print raw reads count matrix in %s' % reads_count)
    output_handle = open(reads_count, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_raw_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_raw_all]) + '\n')
    output_handle.close()

    self.logger.info('Print TPM matrix in %s' % tpm_count)
    output_handle = open(tpm_count, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_tpm_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_tpm_all]) + '\n')
    output_handle.close()

    self.logger.info('Print raw reads abundance matrix in %s' % reads_abundance)
    output_handle = open(reads_abundance, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_all]) + '\n')
    output_handle.close()

    self.logger.info('Print normalised reads abundance matrix in %s' % reads_normalised)
    output_handle = open(reads_normalised, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all_normalised]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_all_normalised]) + '\n')
    output_handle.close()

    self.logger.info('Print relative reads abundance matrix in %s' % reads_relative)
    output_handle = open(reads_relative, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all_relative]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_all_relative]) + '\n')
    output_handle.close()

    self.logger.info('Print raw base abundance matrix in %s' % base_abundance)
    output_handle = open(base_abundance, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_base_all]) + '\n')
    output_handle.close()

    self.logger.info('Print normalised base abundance matrix in %s' % base_normalised)
    output_handle = open(base_normalised, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all_normalised]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_base_all_normalised]) + '\n')
    output_handle.close()

    self.logger.info('Print relative base abundance matrix in %s' % base_relative)
    output_handle = open(base_relative, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all_relative]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [str(features_size[fn])] + [str(c[fn]) for c in counts_base_all_relative]) + '\n')
    output_handle.close()

    self.logger.info('Matrices printed')
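# A self-contained sketch of the TPM calculation used by features() above:
# read counts are length-normalised to reads per kilobase (RPK) and then
# scaled so each sample sums to one million. The feature names, sizes, and
# counts below are hypothetical.
def tpm(read_counts, feature_sizes_bp):
    rpk = {f: read_counts[f] / (feature_sizes_bp[f] / 1000.0) for f in read_counts}
    total = sum(rpk.values())
    return {f: v * 1e6 / total for f, v in rpk.items()} if total else rpk

print(tpm({'geneA': 100, 'geneB': 50}, {'geneA': 2000, 'geneB': 500}))
# geneA: RPK 50, geneB: RPK 100 -> {'geneA': 333333.3..., 'geneB': 666666.6...}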
def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Calculate statistics for genomes.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature.
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations of clustering to perform.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """

    # get GC and mean coverage for each scaffold in genome
    self.logger.info('')
    self.logger.info(' Determining mean coverage and genomic signatures.')
    signatures = GenomicSignature(K)
    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.iteritems():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            signature_matrix.append(stats.signature)
        else:
            sig = signatures.seq_signature(seq)
            total_kmers = sum(sig)
            for i in xrange(0, len(sig)):
                sig[i] = float(sig[i]) / total_kmers
            signature_matrix.append(sig)

    # calculate PCA of tetranucleotide signatures
    if K != 0:
        if not no_pca:
            self.logger.info(' Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info(' First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info(' Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        self.logger.info(' Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster
    self.logger.info(' Partitioning genome into %d clusters.' % num_clusters)

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info(' Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

    # write out clusters
    genome_id = remove_extension(genome_file)
    for k in range(num_clusters):
        fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
        for i in np_where(labels == k)[0]:
            seq_id = seqs.keys()[i]
            fout.write('>' + seq_id + '\n')
            fout.write(seqs[seq_id] + '\n')
        fout.close()
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info(' Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed-size windows
    self.logger.info('')
    self.logger.info(' Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file, window_size, step_size, self.profiles[genome_id], fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # run diamond
    self.logger.info('')
    self.logger.info(' Running diamond blastx with %d processes (be patient!)' % self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info(' Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info(' Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    for genome_id, profile in self.profiles.iteritems():
        seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.iteritems():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.iteritems():
            taxa = []
            for r in xrange(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)
def kmeans(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Cluster genome with k-means.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature.
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations to perform during clustering.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """

    # get GC and mean coverage for each scaffold in genome
    self.logger.info('Determining mean coverage and genomic signatures.')
    signatures = GenomicSignature(K)
    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            signature_matrix.append(stats.signature)
        else:
            sig = signatures.seq_signature(seq)
            total_kmers = sum(sig)
            for i in range(0, len(sig)):
                sig[i] = float(sig[i]) / total_kmers
            signature_matrix.append(sig)

    # calculate PCA of signatures
    if K != 0:
        if not no_pca:
            self.logger.info('Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info('First {:,} PCs capture {:.1f}% of the variance.'.format(
                num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info('Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        self.logger.info('Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster
    self.logger.info('Partitioning genome into {:,} clusters.'.format(num_clusters))

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(sum(labels == k), (k + 1)))

    # write out clusters
    genome_id = remove_extension(genome_file)
    seq_ids = list(seqs.keys())  # dict views are not indexable in Python 3
    for k in range(num_clusters):
        fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
        for i in np_where(labels == k)[0]:
            seq_id = seq_ids[i]
            fout.write('>' + seq_id + '\n')
            fout.write(seqs[seq_id] + '\n')
        fout.close()
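# A minimal, self-contained illustration of the scipy calls used by kmeans()
# above: whiten() scales each feature to unit variance and kmeans2() with
# minit='points' initialises centroids from randomly chosen data points.
# The toy data below is made up.
import numpy as np
from scipy.cluster.vq import whiten, kmeans2

toy = np.vstack([np.random.normal(0, 1, (20, 3)),
                 np.random.normal(5, 1, (20, 3))])
toy = whiten(toy)
centroids, labels = kmeans2(toy, 2, iter=10, minit='points')
print(np.bincount(labels))  # number of rows assigned to each cluster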
def run(self, input_tree, msa_file, marker_info_file, mask_file, perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
        <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    jk_dir : str
        Directory with previously computed jackknife trees, or None to calculate replicates.
    output_dir : str
        Output directory for jackknife trees.
    """

    assert(model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    # determine length of each marker gene in alignment
    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask
        mask = open(mask_file).readline().strip()

        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        if len(self.msa.values()[0]) != total_mask_len:
            self.logger.error('Length of MSA does not match length of mask.')
            sys.exit()

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, xrange(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in xrange(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Median genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Scaffold coverage')
        genome_cov_index = headers.index('Median genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.items():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
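# A toy illustration of the rule enforced by add_compatible_closest() above:
# a scaffold is added only when the same bin is nearest in GC, tetranucleotide,
# and coverage distance. The bin names and distances below are hypothetical.
bin_stats = {'bin1': [0.01, 0.12, 2.5],   # [gc_dist, td_dist, cov_dist]
             'bin2': [0.03, 0.20, 1.9]}
best = [min(bin_stats, key=lambda b: bin_stats[b][i]) for i in range(3)]
closest_bin = best[0] if len(set(best)) == 1 else None
print(best, closest_bin)  # ['bin1', 'bin1', 'bin2'] -> None (no single closest bin)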
def rblast(self, options):
    """Reciprocal blast command"""
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
    self.logger.info('*******************************************************************************')

    check_dir_exists(options.protein_dir)
    make_sure_path_exists(options.output_dir)

    aa_gene_files = []
    for f in os.listdir(options.protein_dir):
        if f.endswith(options.protein_ext):
            aa_gene_files.append(os.path.join(options.protein_dir, f))

    if not aa_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    # modify gene ids to include genome ids in order to ensure
    # all gene identifiers are unique across the set of genomes;
    # also remove the trailing asterisk used to identify the stop codon
    self.logger.info('')
    self.logger.info(' Appending genome identifiers to all gene identifiers.')
    gene_out_dir = os.path.join(options.output_dir, 'genes')
    make_sure_path_exists(gene_out_dir)

    modified_aa_gene_files = []
    for gf in aa_gene_files:
        genome_id = remove_extension(gf)

        aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
        fout = open(aa_file, 'w')
        for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
            fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')
        fout.close()

        modified_aa_gene_files.append(aa_file)

    # perform the reciprocal blast with blastp or diamond
    self.logger.info('')
    if options.blastp:
        rblast = ReciprocalBlast(options.cpus)
        rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

        # concatenate all blast tables to mimic output of diamond; all hits
        # for a given genome MUST be in consecutive order to fully mimic
        # the expected results from diamond
        self.logger.info('')
        self.logger.info(' Creating single file with all blast hits (be patient!).')
        blast_files = sorted([f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv')])
        hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
        concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
    else:
        rdiamond = ReciprocalDiamond(options.cpus)
        rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

    self.logger.info('')
    self.logger.info(' Reciprocal blast hits written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def _producer(self, genome_file):
    """Apply prodigal to genome with most suitable translation table.

    Parameters
    ----------
    genome_file : str
        Fasta file for genome.
    """

    genome_id = remove_extension(genome_file)

    aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa')
    nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna')
    gff_file = os.path.join(self.output_dir, genome_id + '.gff')

    best_translation_table = -1
    table_coding_density = {4: -1, 11: -1}
    if self.called_genes:
        os.system('cp %s %s' % (os.path.abspath(genome_file), aa_gene_file))
    else:
        tmp_dir = tempfile.mkdtemp()

        seqs = read_fasta(genome_file)

        # determine number of bases
        total_bases = 0
        for seq in seqs.values():
            total_bases += len(seq)

        # call genes under different translation tables
        if self.translation_table:
            translation_tables = [self.translation_table]
        else:
            translation_tables = [4, 11]

        for translation_table in translation_tables:
            os.makedirs(os.path.join(tmp_dir, str(translation_table)))
            aa_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.faa')
            nt_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.fna')
            gff_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '.gff')

            # check if there are sufficient bases to calculate prodigal parameters
            if total_bases < 100000 or self.meta:
                proc_str = 'meta'  # use best pre-calculated parameters
            else:
                proc_str = 'single'  # estimate parameters from data

            args = '-m'
            if self.closed_ends:
                args += ' -c'

            cmd = 'prodigal %s -p %s -q -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (
                args, proc_str, translation_table,
                aa_gene_file_tmp, nt_gene_file_tmp,
                genome_file, gff_file_tmp)
            os.system(cmd)

            # determine coding density
            prodigalParser = ProdigalGeneFeatureParser(gff_file_tmp)

            codingBases = 0
            for seq_id, _seq in seqs.items():
                codingBases += prodigalParser.coding_bases(seq_id)

            codingDensity = float(codingBases) / total_bases
            table_coding_density[translation_table] = codingDensity

        # determine best translation table
        if not self.translation_table:
            best_translation_table = 11
            if (table_coding_density[4] - table_coding_density[11] > 0.05) and table_coding_density[4] > 0.7:
                best_translation_table = 4
        else:
            best_translation_table = self.translation_table

        shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.faa'), aa_gene_file)
        shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.fna'), nt_gene_file)
        shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '.gff'), gff_file)

        # clean up temporary files
        shutil.rmtree(tmp_dir)

    return (genome_id, aa_gene_file, nt_gene_file, gff_file,
            best_translation_table, table_coding_density[4], table_coding_density[11])
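# A small sketch of the translation-table selection rule applied by
# _producer() above: table 4 is preferred over the default table 11 only if
# its coding density is at least 0.05 higher and exceeds 0.7. The densities
# below are made up.
def select_translation_table(density, margin=0.05, min_density=0.7):
    if density[4] - density[11] > margin and density[4] > min_density:
        return 4
    return 11

print(select_translation_table({4: 0.92, 11: 0.81}))  # -> 4
print(select_translation_table({4: 0.72, 11: 0.70}))  # -> 11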
def run(self, taxonomy_file, type_strains_file, genome_prot_dir, extension,
        max_taxa, rank, per_identity, per_aln_len, genomes_to_process,
        keep_all_genes, no_reformat_gene_ids, output_dir):
    """Create dereplicated set of genes.

    Taxonomy file should have the following format:
        <genome_id>\t<taxonomy_str>

        where taxonomy_str is in GreenGenes format:
            d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

    Type strain file should have the following format:
        <genome_id>\t<genome name>

    Parameters
    ----------
    taxonomy_file : str
        File indicating taxonomy string for all genomes of interest.
    type_strains_file : str
        File indicating type strains.
    genome_prot_dir : str
        Directory containing amino acid genes for each genome.
    extension : str
        Extension of files with called genes.
    max_taxa : int
        Maximum taxa to retain in a named group.
    rank : int
        Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
    per_identity : float
        Percent identity for subsampling similar genes.
    per_aln_len : float
        Percent alignment length for subsampling similar genes.
    genomes_to_process : str
        File with list of genomes to retain instead of performing taxon subsampling.
    keep_all_genes : boolean
        Flag indicating that no gene subsampling should be performed.
    no_reformat_gene_ids : boolean
        Flag indicating that gene ids should not be reformatted to include scaffold names given by the GFF file.
    output_dir : str
        Desired output directory for storing results.
    """

    make_sure_path_exists(output_dir)

    self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

    # get taxonomy string for each genome
    taxonomy = {}
    if taxonomy_file:
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)
        self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

    # get type strains; genomes which should never be dereplicated
    type_strains = set()
    if type_strains_file:
        self.logger.info('Reading type strain file.')
        type_strains = self.read_type_strain(type_strains_file)
        self.logger.info('There are %d type strains.' % len(type_strains))

    # get specific list of genomes to process
    genomes_to_retain = set()
    if genomes_to_process:
        self.logger.info('Reading genomes to retain.')
        for line in open(genomes_to_process):
            line_split = line.split()
            genomes_to_retain.add(line_split[0])
        self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))

    # make sure extension filter starts with a '.'
    if not extension.startswith('.'):
        extension = '.' + extension

    # identify unique genes in each named group
    fout = open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w')
    rank_genomes = defaultdict(list)
    genome_files = os.listdir(genome_prot_dir)
    underclassified_genomes = 0
    genomes_with_missing_data = 0
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file, extension)

        if not genome_file.endswith(extension):
            continue

        if genomes_to_process and genome_id not in genomes_to_retain:
            continue

        genome_file = os.path.join(genome_prot_dir, genome_file)
        if not os.path.exists(genome_file):
            genomes_with_missing_data += 1
            fout.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')
            continue

        t = taxonomy.get(genome_id, self.rank_prefixes)
        taxa = t[rank]
        if taxa[3:] == '':
            underclassified_genomes += 1
            rank_genomes[self.underclassified].append(genome_id)
        else:
            rank_genomes[taxa].append(genome_id)

        validate_seq_ids(genome_file)

    fout.close()

    total_genomes_to_process = sum([len(genome_list) for genome_list in rank_genomes.values()])
    if total_genomes_to_process == 0:
        self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
        sys.exit(-1)

    self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
    self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
    self.logger.info('Total named groups: %d' % len(rank_genomes))
    self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

    # process each named group
    gene_file = os.path.join(output_dir, 'custom_db.faa')
    gene_out = open(gene_file, 'w')

    taxonomy_out = open(os.path.join(output_dir, 'custom_taxonomy.tsv'), 'w')

    tmp_dir = tempfile.mkdtemp()
    total_genes_removed = 0
    total_genes_kept = 0
    total_genomes_kept = 0
    processed_genomes = 0
    for taxa, genome_list in rank_genomes.iteritems():
        processed_genomes += len(genome_list)

        print '-------------------------------------------------------------------------------'
        self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.'
                         % (taxa, processed_genomes, total_genomes_to_process,
                            processed_genomes * 100.0 / total_genomes_to_process))

        # create directory with selected genomes
        taxon_dir = os.path.join(tmp_dir, 'taxon')
        os.mkdir(taxon_dir)

        reduced_genome_list = genome_list
        if not genomes_to_process and taxa != self.underclassified:
            # perform taxon subsampling
            reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
        total_genomes_kept += len(reduced_genome_list)

        gene_dir = os.path.join(taxon_dir, 'genes')
        os.mkdir(gene_dir)
        for genome_id in reduced_genome_list:
            taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

            genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
            gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
            output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not no_reformat_gene_ids:
                self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file, taxonomy, output_gene_file)
            else:
                os.system('cp %s %s' % (genome_gene_file, output_gene_file))

        # filter genes based on amino acid identity
        genes_to_remove = []
        amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
        if keep_all_genes or taxa == self.underclassified:
            # modify gene identifiers to include genome ids
            self.amend_gene_identifies(gene_dir, amended_gene_dir)
        else:
            # filter genes on AAI
            genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir,
                                              per_identity, per_aln_len, self.cpus)

        self.logger.info('Writing unique genes from genomes in %s.' % taxa)
        genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list,
                                          taxonomy, genes_to_remove)

        self.logger.info('Retained %d of %d genomes.' % (len(reduced_genome_list), len(genome_list)))
        self.logger.info('Genes to keep: %d' % genes_kept)
        self.logger.info('Genes removed: %d' % len(genes_to_remove))

        total_genes_kept += genes_kept
        total_genes_removed += len(genes_to_remove)

        shutil.rmtree(taxon_dir)

    taxonomy_out.close()
    gene_out.close()

    self.logger.info('Retained %d of %d (%.1f%%) genomes.' % (total_genomes_kept,
                                                              total_genomes_to_process,
                                                              total_genomes_kept * 100.0 / total_genomes_to_process))
    self.logger.info('Total genes kept: %d' % total_genes_kept)
    self.logger.info('Total genes removed: %d (%.1f%%)' % (total_genes_removed,
                                                           total_genes_removed * 100.0 / (total_genes_kept + total_genes_removed)))

    self.logger.info('Creating BLAST database.')
    os.system('makeblastdb -dbtype prot -in %s' % gene_file)

    shutil.rmtree(tmp_dir)
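# Hedged sketch of parsing the two-column taxonomy file described in the
# docstring above (<genome_id>\t<taxonomy_str>, GreenGenes-style ranks).
# Taxonomy().read performs the equivalent job in the pipeline; this
# stand-alone version uses only the standard library.
def read_taxonomy_file(path):
    taxonomy = {}
    with open(path) as f:
        for line in f:
            genome_id, tax_str = line.rstrip('\n').split('\t')
            taxonomy[genome_id] = [t.strip() for t in tax_str.split(';')]
    return taxonomy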
def run(self, input_dir, tmp_dir, threads):
    # get path to all unprocessed genome files
    print 'Reading genomes.'
    genome_files = []
    for genome_dir in os.listdir(input_dir):
        cur_genome_dir = os.path.join(input_dir, genome_dir)
        if not os.path.isdir(cur_genome_dir):
            continue

        for assembly_id in os.listdir(cur_genome_dir):
            assembly_dir = os.path.join(cur_genome_dir, assembly_id)
            genome_id = assembly_id[0:assembly_id.find('_', 4)]

            # check if prodigal has already been called
            aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
            if os.path.exists(aa_gene_file):
                # verify checksum
                checksum_file = aa_gene_file + '.sha256'
                if os.path.exists(checksum_file):
                    checksum = sha256(aa_gene_file)
                    cur_checksum = open(checksum_file).readline().strip()
                    if checksum == cur_checksum:
                        continue

            genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    print '[Warning] Genome file appears to be empty: %s' % genome_file
                else:
                    genome_files.append(genome_file)

    print '  Number of unprocessed genomes: %d' % len(genome_files)

    # run prodigal on each genome
    print 'Running prodigal.'
    prodigal = Prodigal(cpus=threads)
    summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

    # move results into individual genome directories
    print 'Moving files and calculating checksums.'
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)

        aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)
        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
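# Hedged sketch of the sha256 sidecar convention used above: the digest of
# the gene file is written next to it so unchanged genomes can be skipped on
# reruns. The biolib sha256() helper is assumed to behave like this
# hashlib-based equivalent.
import hashlib

def sha256_file(path, block_size=2 ** 16):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            h.update(block)
    return h.hexdigest()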
def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
    """Calculate statistics for scaffolds.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds.
    genome_files : list of str
        Fasta files with binned scaffolds.
    tetra_file : str
        Tetranucleotide signatures for scaffolds.
    coverage_file : str
        Coverage profiles for scaffolds.
    output_file : str
        Output file for scaffold statistics.
    """

    tetra = Tetranucleotide(self.cpus)
    signatures = tetra.read(tetra_file)

    cov_profiles = None
    if coverage_file:
        coverage = Coverage(self.cpus)
        cov_profiles, _ = coverage.read(coverage_file)

    # determine bin assignment for each scaffold
    self.logger.info('Determining scaffold statistics.')

    scaffold_id_genome_id = {}
    for gf in genome_files:
        genome_id = remove_extension(gf)
        for scaffold_id, _seq in seq_io.read_seq(gf):
            scaffold_id_genome_id[scaffold_id] = genome_id

    # write out scaffold statistics
    fout = open(output_file, 'w')
    fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

    if cov_profiles:
        first_key = list(cov_profiles.keys())[0]
        bam_ids = sorted(cov_profiles[first_key].keys())
        for bam_id in bam_ids:
            fout.write('\t' + bam_id)

    for kmer in tetra.canonical_order():
        fout.write('\t' + kmer)
    fout.write('\n')

    for scaffold_id, seq in seq_io.read_seq(scaffold_file):
        fout.write(scaffold_id)
        fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
        fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
        fout.write('\t%d' % len(seq))

        if cov_profiles:
            for bam_id in bam_ids:
                fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

        fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
        fout.write('\n')

    fout.close()
def split(self, scaffold_stats, criteria1, criteria2, genome_file, output_dir):
    """Split genome into two based on a genomic feature.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    criteria1 : str
        First criterion used for splitting genome.
    criteria2 : str
        Second criterion used for splitting genome.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """

    seqs = seq_io.read(genome_file)

    # calculate PCA if necessary
    if 'pc' in criteria1 or 'pc' in criteria2:
        self.logger.info('Performing PCA.')

        signature_matrix = []
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]
            signature_matrix.append(stats.signature)

        pc, _variance = self.pca(signature_matrix)
        for i, seq_id in enumerate(seqs):
            scaffold_stats.stats[seq_id].pc1 = pc[i][0]
            scaffold_stats.stats[seq_id].pc2 = pc[i][1]
            scaffold_stats.stats[seq_id].pc3 = pc[i][2]

    # split bin
    genome_id = remove_extension(genome_file)
    fout1 = open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w')
    fout2 = open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w')

    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        meet_criteria = True
        for criteria in [criteria1, criteria2]:
            if 'gc' in criteria:
                v = eval(criteria.replace('gc', str(stats.gc)), {"__builtins__": {}})
            elif 'coverage' in criteria:
                v = eval(criteria.replace('coverage', str(stats.coverage)), {"__builtins__": {}})
            elif 'pc1' in criteria:
                v = eval(criteria.replace('pc1', str(stats.pc1)), {"__builtins__": {}})
            elif 'pc2' in criteria:
                v = eval(criteria.replace('pc2', str(stats.pc2)), {"__builtins__": {}})
            elif 'pc3' in criteria:
                v = eval(criteria.replace('pc3', str(stats.pc3)), {"__builtins__": {}})

            meet_criteria = meet_criteria and v

        if meet_criteria:
            fout1.write('>' + seq_id + '\n')
            fout1.write(seqs[seq_id] + '\n')
        else:
            fout2.write('>' + seq_id + '\n')
            fout2.write(seqs[seq_id] + '\n')

    fout1.close()
    fout2.close()
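# Hedged usage sketch: criteria1/criteria2 are Python expressions over the
# per-scaffold statistics, evaluated with builtins stripped as split() does.
# The criterion string and GC value below are invented to show the textual
# substitution that split() performs before evaluation.
criteria = 'gc < 45'
gc_value = 41.3
v = eval(criteria.replace('gc', str(gc_value)), {"__builtins__": {}})
assert v is True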
def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
    """Calculate statistics for scaffolds.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds.
    genome_files : list of str
        Fasta files with binned scaffolds.
    tetra_file : str
        Tetranucleotide signatures for scaffolds.
    coverage_file : str
        Coverage profiles for scaffolds.
    output_file : str
        Output file for scaffold statistics.
    """

    tetra = Tetranucleotide(self.cpus)
    signatures = tetra.read(tetra_file)

    cov_profiles = None
    if coverage_file:
        coverage = Coverage(self.cpus)
        cov_profiles, _ = coverage.read(coverage_file)

    # determine bin assignment for each scaffold
    self.logger.info('')
    self.logger.info('  Determining scaffold statistics.')

    scaffold_id_genome_id = {}
    for gf in genome_files:
        genome_id = remove_extension(gf)
        for scaffold_id, _seq in seq_io.read_seq(gf):
            scaffold_id_genome_id[scaffold_id] = genome_id

    # write out scaffold statistics
    fout = open(output_file, 'w')
    fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

    if cov_profiles:
        bam_ids = sorted(cov_profiles[cov_profiles.keys()[0]].keys())
        for bam_id in bam_ids:
            fout.write('\t' + bam_id)

    for kmer in tetra.canonical_order():
        fout.write('\t' + kmer)
    fout.write('\n')

    for scaffold_id, seq in seq_io.read_seq(scaffold_file):
        fout.write(scaffold_id)
        fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
        fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
        fout.write('\t%d' % len(seq))

        if cov_profiles:
            for bam_id in bam_ids:
                fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

        fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
        fout.write('\n')

    fout.close()
def run(self, input_tree, msa_file, marker_info_file, mask_file,
        perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
        <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    jk_dir : str
        Directory with pre-computed jackknife replicate trees; if not given, replicates are calculated.
    output_dir : str
        Output directory for jackknife trees.
    """

    assert (model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    # determine length of each marker gene in alignment
    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask
        mask = open(mask_file).readline().strip()

        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        if len(self.msa.values()[0]) != total_mask_len:
            self.logger.error('Length of MSA does not match length of mask.')
            sys.exit()

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, xrange(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in xrange(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir,
                                               'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
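# Hedged sketch of per-replicate marker subsampling consistent with
# perc_markers_to_keep above: each replicate keeps a random subset of the
# markers, whose filtered alignment columns are then concatenated into the
# replicate MSA. Names here are illustrative, not the original _producer.
import random

def sample_marker_indices(num_markers, perc_markers_to_keep):
    keep = int(round(perc_markers_to_keep * num_markers))
    return sorted(random.sample(range(num_markers), keep))

assert len(sample_marker_indices(120, 0.8)) == 96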
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes.
    batchfile : str
        File describing genomes.
    extension : str
        Extension of files to process.

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.
    """
    genomic_files = {}
    if genome_dir:
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f)
                genomic_files[genome_id] = os.path.join(genome_dir, f)
    elif batchfile:
        for line_no, line in enumerate(open(batchfile, "rb")):
            line_split = line.strip().split("\t")
            if line_split[0] == '':
                continue  # blank line

            if len(line_split) != 2:
                self.logger.error('Batch file must contain exactly 2 columns.')
                sys.exit(-1)

            genome_file, genome_id = line_split
            self._verify_genome_id(genome_id)

            if genome_file is None or genome_file == '':
                self.logger.error('Missing genome file on line %d.' % (line_no + 1))
                sys.exit(-1)
            elif genome_id is None or genome_id == '':
                self.logger.error('Missing genome ID on line %d.' % (line_no + 1))
                sys.exit(-1)
            elif genome_id in genomic_files:
                self.logger.error('Genome ID %s appears multiple times.' % genome_id)
                sys.exit(-1)

            genomic_files[genome_id] = genome_file

    for genome_key in genomic_files.iterkeys():
        if genome_key.startswith("RS_") or genome_key.startswith("GB_") or genome_key.startswith("UBA"):
            self.logger.error("Submitted genomes start with the same prefix (RS_, GB_, UBA) as reference genomes in GTDB-Tk. This will cause issues for downstream analysis.")
            sys.exit(-1)

    if len(genomic_files) == 0:
        if genome_dir:
            self.logger.warning('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_dir)
        else:
            self.logger.warning('No genomes found in batch file: %s. Please check the format of this file.' % batchfile)
        sys.exit(-1)

    return genomic_files
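# Hedged example of the two-column batch file consumed above; the columns
# are tab-separated and the path/identifier values are invented:
#
#   /data/genomes/assembly_1.fna<TAB>genome_1
#   /data/genomes/assembly_2.fna<TAB>genome_2
#
# A minimal stand-alone parse of one such line:
genome_file, genome_id = '/data/genomes/assembly_1.fna\tgenome_1'.strip().split('\t')
assert genome_id == 'genome_1'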
def _run_prodigal(self, genome_paths):
    """Run Prodigal on genomes."""

    # get genome path and translation table for each file
    self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths))
    genome_files = []
    translation_table = {}
    for gid, gpath in genome_paths.items():
        assembly_id = os.path.basename(os.path.normpath(gpath))
        canonical_gid = assembly_id[0:assembly_id.find('_', 4)]

        genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
        if os.path.exists(genome_file):
            if os.stat(genome_file).st_size == 0:
                self.logger.warning('Genomic file appears to be empty: %s' % genome_file)
                continue

            genome_files.append(genome_file)
        else:
            self.logger.warning('Genomic file appears to be missing: %s' % genome_file)

        gff_file = os.path.join(gpath, assembly_id + '_genomic.gff')
        if os.path.exists(gff_file):
            if os.stat(gff_file).st_size == 0:
                self.logger.warning('GFF appears to be empty: %s' % gff_file)
                continue

            tt = self._parse_translation_table(gff_file)
            if tt:
                translation_table[canonical_gid] = tt
            else:
                translation_table[canonical_gid] = None
                self.logger.warning('Unable to determine translation table for: %s' % gff_file)
                sys.exit(-1)
        else:
            self.logger.warning('GFF appears to be missing: %s' % gff_file)
            sys.exit(-1)

    # run Prodigal on each genome
    self.logger.info('Running Prodigal on %d genomes.' % len(genome_paths))
    prodigal = Prodigal(cpus=self.cpus)
    summary_stats = prodigal.run(genome_files,
                                 translation_table=translation_table,
                                 output_dir=self.tmp_dir)

    # move results into individual genome directories
    self.logger.info('Moving files and calculating checksums.')
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)
        canonical_gid = genome_id[0:genome_id.find('_', 4)]

        aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(self.tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)
        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        if translation_table[canonical_gid]:
            fout.write('%s\t%d\t%s\n' % ('best_translation_table',
                                         summary_stats[genome_id].best_translation_table,
                                         'used table specified by NCBI'))
        else:
            fout.write('%s\t%d\n' % ('best_translation_table',
                                     summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
def matrix_maker(faidx, bam_list, extension, threads, mapq, id_cutoff,
                 abundance_file, normalised_file, relative_file,
                 base_abundance_file, base_normalised_file, base_relative_file,
                 feature_normalisation, discard_gene_length_normalisation, removed):
    import subprocess
    import re

    logger = logging.getLogger('timestamp')

    features_size = {}
    counts = {}
    counts_base = {}

    logger.info('Get features and initialise matrix')
    with open(faidx) as f:
        for line in f:
            if not line.startswith('#'):
                LINE = line.rstrip().split('\t')
                features = LINE[0]
                features_size[features] = LINE[1]
                counts[features] = 0
                counts_base[features] = 0

    counts_all = []
    counts_all_normalised = []
    counts_all_relative = []
    counts_base_all = []
    counts_base_all_normalised = []
    counts_base_all_relative = []
    header = ["Features", "Features_size"]

    logger.info('Browse alignment file(s)')
    samtoolsexec = find_ex('samtools')
    samtoolsthreads = '-@ ' + threads
    samtoolsminqual = '-q ' + mapq

    with open(bam_list, 'r') as b:
        for bam in b:
            if bam.startswith('#'):
                continue

            i = 0
            alignmentfile, librarysize = bam.rstrip().split(',')
            if librarysize == '' or librarysize == '0':
                librarysize = 1

            samplename = remove_extension(os.path.basename(alignmentfile), extension)
            header.append(samplename)
            logger.info('\t' + samplename)

            cmd = [samtoolsexec, 'view', samtoolsthreads, samtoolsminqual, alignmentfile]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
            for line in p:
                line = line.decode(sys.getdefaultencoding()).rstrip()
                if i > 0 and i % 10000 == 0:
                    logger.info("Alignment record %s processed" % i)
                i += 1

                LINE = line.split('\t')
                features = LINE[2]
                cigar = LINE[5]

                base_mapped = 0
                match = re.findall(r'(\d+)M', cigar)
                read_len = len(LINE[9])  # column 10 of a SAM record is the read sequence
                for base_match in match:
                    base_mapped += int(base_match)
                if read_len == 0:
                    logger.info(LINE)
                    continue  # skip records without a stored sequence to avoid dividing by zero
                if base_mapped / read_len < float(id_cutoff):
                    continue

                counts[features] += 1
                if discard_gene_length_normalisation:
                    counts_base[features] += base_mapped
                else:
                    counts_base[features] += base_mapped / int(features_size[features])

            if abundance_file:
                counts_all.append(counts.copy())
            if normalised_file:
                count_tmp = {k: (v / int(librarysize)) * feature_normalisation
                             for k, v in counts.items()}
                counts_all_normalised.append(count_tmp.copy())
            if relative_file:
                count_tmp = {k: v / total
                             for total in (sum(counts.values()),)
                             for k, v in counts.items()}
                counts_all_relative.append(count_tmp.copy())
            if base_abundance_file:
                counts_base_all.append(counts_base.copy())
            if base_normalised_file:
                count_tmp = {k: (v / int(librarysize)) * feature_normalisation
                             for k, v in counts_base.items()}
                counts_base_all_normalised.append(count_tmp.copy())
            if base_relative_file:
                count_tmp = {k: v / total
                             for total in (sum(counts_base.values()),)
                             for k, v in counts_base.items()}
                counts_base_all_relative.append(count_tmp.copy())

            # reset per-sample counters before processing the next BAM file
            for fn in counts:
                counts[fn] = 0
                counts_base[fn] = 0

    logger.info('Print matrix')
    if abundance_file:
        output_handle = open(abundance_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_all]) + '\n')
        output_handle.close()

    if normalised_file:
        output_handle = open(normalised_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all_normalised]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_all_normalised]) + '\n')
        output_handle.close()

    if relative_file:
        output_handle = open(relative_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all_relative]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_all_relative]) + '\n')
        output_handle.close()

    if base_abundance_file:
        output_handle = open(base_abundance_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            # check the base count matrix, not the read count matrix
            if sum([c[fn] for c in counts_base_all]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_base_all]) + '\n')
        output_handle.close()

    if base_normalised_file:
        output_handle = open(base_normalised_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_normalised]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_base_all_normalised]) + '\n')
        output_handle.close()

    if base_relative_file:
        output_handle = open(base_relative_file, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_relative]) == 0 and removed:
                continue
            output_handle.write('\t'.join([fn] + [features_size[fn]] + [str(c[fn]) for c in counts_base_all_relative]) + '\n')
        output_handle.close()
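# Hedged sketch of the per-sample scaling applied above: raw counts are
# divided by library size and multiplied by feature_normalisation (e.g. 1e6
# for a CPM-style value). Variable names mirror matrix_maker; the helper
# itself is illustrative.
def normalise_count(count, librarysize, feature_normalisation=10 ** 6):
    return (float(count) / librarysize) * feature_normalisation

assert normalise_count(500, 1000000) == 500.0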
def run(self, rna_name, gtdb_metadata_file, rna_file, min_rna_length,
        min_scaffold_length, min_quality, max_contigs, min_N50, tax_filter,
        genome_list, output_dir, align_method='ssu_align'):
    """Infer rRNA gene tree spanning select GTDB genomes.

    Parameters
    ----------
    rna_name : str
        Name of rRNA gene.
    gtdb_metadata_file : str
        File specifying GTDB metadata for each genome.
    rna_file : str
        File with rRNA gene sequences in FASTA format.
    min_rna_length : int
        Minimum required length of rRNA gene sequences.
    min_scaffold_length : int
        Minimum required length of scaffold containing rRNA gene sequence.
    min_quality : float [0, 100]
        Minimum genome quality for a genome to be included in tree.
    max_contigs : int
        Maximum number of contigs for a genome to be included in tree.
    min_N50 : int
        Minimum N50 for a genome to be included in tree.
    tax_filter : boolean
        Filter sequences based on incongruent taxonomy classification.
    genome_list : str
        Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
    output_dir : str
        Directory to store results.
    """

    if rna_name not in ['ssu', 'lsu']:
        self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
        sys.exit(-1)

    genome_metadata = read_gtdb_metadata(gtdb_metadata_file, ['checkm_completeness',
                                                              'checkm_contamination',
                                                              'scaffold_count',
                                                              'n50_scaffolds',
                                                              'organism_name',
                                                              'gtdb_representative'])
    gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

    user_genomes = set()
    uba_genomes = set()
    ncbi_genomes = set()
    rep_genomes = set()
    for genome_id in genome_metadata:
        org_name = str(genome_metadata[genome_id][4])
        if genome_id.startswith('U_'):
            if '(UBA' in org_name:
                uba_genomes.add(genome_id)
            else:
                user_genomes.add(genome_id)
        elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
            ncbi_genomes.add(genome_id)
        else:
            self.logger.warning('Unrecognized genome prefix: %s' % genome_id)

        rep = genome_metadata[genome_id][5] == 't'
        if rep:
            rep_genomes.add(genome_id)

    self.logger.info('Initially considering %d genomes (%d NCBI, %d UBA, %d User).'
                     % (len(genome_metadata), len(ncbi_genomes), len(uba_genomes), len(user_genomes)))
    self.logger.info('Identified %d representative genomes.' % len(rep_genomes))

    # get genomes specified in genome list by user
    genomes_to_consider = set()
    if genome_list:
        for line in open(genome_list):
            gid = line.rstrip().split('\t')[0]
            if gid.startswith('RS_') or gid.startswith('GB_') or gid.startswith('U_'):
                genomes_to_consider.add(gid)
        self.logger.info('Restricting genomes to the %d in the genome list.' % len(genomes_to_consider))
    else:
        # filter genomes based on quality and database source
        self.logger.info('Filtering genomes based on specified criteria.')
        self.logger.info('Filtering genomes with quality < %d.' % min_quality)
        self.logger.info('Filtering genomes with > %d contigs.' % max_contigs)
        self.logger.info('Filtering genomes with scaffold N50 < %d.' % min_N50)

        new_genomes_to_consider = []
        filtered_genomes = 0
        gt = 0
        gq = 0
        sc = 0
        n50 = 0
        for genome_id in genome_metadata:
            if genome_id not in rep_genomes:
                gt += 1
                filtered_genomes += 1
                continue

            if genome_id not in ncbi_genomes and genome_id not in uba_genomes:
                gt += 1
                filtered_genomes += 1
                continue

            comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[genome_id]
            q = float(comp) - 5 * float(cont)
            if q < min_quality or int(scaffold_count) > max_contigs or int(n50_contigs) < min_N50:
                if q < min_quality:
                    gq += 1
                if int(scaffold_count) > max_contigs:
                    sc += 1
                if int(n50_contigs) < min_N50:
                    n50 += 1
                filtered_genomes += 1
                continue

            new_genomes_to_consider.append(genome_id)

        genomes_to_consider = new_genomes_to_consider
        self.logger.info('Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
                         % (filtered_genomes, gt, gq, sc, n50))
        self.logger.info('Considering %d genomes after filtering.' % len(genomes_to_consider))

    # limit taxonomy to genomes being considered
    cur_gtdb_taxonomy = {}
    for gid in genomes_to_consider:
        cur_gtdb_taxonomy[gid] = gtdb_taxonomy[gid]

    # get rRNA gene sequences for each genome
    rna_output_file = self._get_rna_seqs(rna_name, rna_file, min_rna_length,
                                         min_scaffold_length, cur_gtdb_taxonomy,
                                         genomes_to_consider, output_dir)

    # identify erroneous rRNA gene sequences
    if tax_filter:
        self.logger.info('Filtering sequences with incongruent taxonomy strings.')
        filtered_seqs = self._tax_filter(rna_output_file, cur_gtdb_taxonomy, output_dir)
        self.logger.info('Filtered %d sequences.' % len(filtered_seqs))

        if len(filtered_seqs) > 0:
            rna_filtered_output = os.path.join(output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
            fout = open(rna_filtered_output, 'w')
            for seq_id, seq, annotation in seq_io.read_seq(rna_output_file, keep_annotation=True):
                if seq_id not in filtered_seqs:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                    fout.write(seq + '\n')
            fout.close()
            rna_output_file = rna_filtered_output

    # align sequences with ssu-align or mothur
    if rna_name == 'ssu':
        if align_method == 'ssu_align':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            os.system('ssu-align --dna %s %s' % (rna_output_file, align_dir))
            os.system('ssu-mask --afa %s' % align_dir)
        elif align_method == 'mothur':
            self.logger.info('Aligning sequences with mothur.')
            align_dir = os.path.join(output_dir, 'mothur')
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)

            mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
            mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (rna_output_file, self.cpus)
            input_prefix = remove_extension(rna_output_file)
            align_file = os.path.join(align_dir, input_prefix + '.align')
            mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (align_file, self.cpus)
            os.system(mothur_cmd)
            input_msa = os.path.join(align_dir, input_prefix + '.filter.fasta')
    elif rna_name == 'lsu':
        self.logger.info('Aligning sequences with ssu-align.')
        align_dir = os.path.join(output_dir, '%s_align' % rna_name)
        if not os.path.exists(align_dir):
            os.makedirs(align_dir)

        os.system('esl-sfetch --index %s' % rna_output_file)

        # search for sequences using domain-specific LSU HMMs
        cm_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'cm_files')
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            self.logger.info('Matching LSU rRNA genes to %s-specific HMM.' % domain)
            table_out = os.path.join(align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
            log_file = os.path.join(align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
            os.system('cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                      % (self.cpus, table_out, cm_file, rna_output_file, log_file))

        # identify top hits for each domain
        self.logger.info('Identifying best domain-specific HMM for each LSU rRNA gene.')
        top_hits = {}
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            table_out = os.path.join(align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            for line in open(table_out):
                if line[0] == '#':
                    continue

                line_split = line.split()
                seq_id = line_split[0]
                start_seq = int(line_split[7])
                end_seq = int(line_split[8])
                bitscore = float(line_split[14])

                prev_bitscore = top_hits.get(seq_id, [None, 0, 0, 0, 0])[4]
                if bitscore > prev_bitscore:
                    top_hits[seq_id] = [domain, seq_id, start_seq, end_seq, bitscore]

        # create MSA for bacteria and archaea
        for domain in ['archaea', 'bacteria']:
            # create file of top hits
            top_hits_out = os.path.join(align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
            fout = open(top_hits_out, 'w')
            num_hits = 0
            for top_domain, seq_id, start_seq, end_seq, bitscore in top_hits.values():
                if top_domain == domain:
                    fout.write('%s\t%d\t%d\t%f\n' % (seq_id, start_seq, end_seq, bitscore))
                    num_hits += 1
            fout.close()

            # align top hits
            self.logger.info('Creating MSA for %s LSU rRNA genes (%d sequences).' % (domain, num_hits))
            if num_hits > 0:
                seq_file = os.path.join(align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                os.system("grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                          % (top_hits_out, rna_output_file, seq_file))

                # use the covariance model matching the current domain
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                align_file = os.path.join(align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                os.system('cmalign --dnaout --outformat Pfam %s %s > %s' % (cm_file, seq_file, align_file))

                masked_file = os.path.join(align_dir, 'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                os.system('esl-alimask -p --outformat AFA %s > %s' % (align_file, masked_file))

    # trim sequences and infer tree
    if align_method == 'ssu_align':
        for domain in ['archaea', 'bacteria']:
            if rna_name == 'ssu':
                input_msa = os.path.join(align_dir, 'ssu_align.' + domain + '.mask.afa')
            elif rna_name == 'lsu':
                input_msa = os.path.join(align_dir, 'cmalign.%s.%s.mask.afa' % (rna_name, domain))

            if not os.path.exists(input_msa):
                continue

            trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % domain)
            output_tree = os.path.join(output_dir, domain + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' % (trimmed_msa, output_tree))
    elif align_method == 'mothur':
        trimmed_msa = os.path.join(output_dir, input_prefix + '.trimmed.fna')
        self._trim_seqs(input_msa, trimmed_msa)

        # infer tree
        self.logger.info('Inferring tree for %s genes.' % input_prefix)
        output_tree = os.path.join(output_dir, input_prefix + '.tree')
        os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' % (trimmed_msa, output_tree))
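# Hedged note: the genome quality used in the filtering above follows the
# common completeness - 5 * contamination convention from CheckM-style QC.
def genome_quality(completeness, contamination):
    return float(completeness) - 5 * float(contamination)

assert genome_quality(95.0, 2.0) == 85.0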
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Mean genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Mean scaffold coverage')
        genome_cov_index = headers.index('Mean genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.iteritems():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.iteritems():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
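# Hedged illustration of the "closest in all three spaces" rule above: a
# scaffold is assigned only when one bin simultaneously minimises the GC,
# tetranucleotide, and coverage distances. The numbers below are invented.
bin_stats = {'bin_1': [0.5, 0.01, 2.0],
             'bin_2': [1.5, 0.02, 9.0]}
closest = [min(bin_stats, key=lambda b: bin_stats[b][i]) for i in range(3)]
assert closest[0] == closest[1] == closest[2] == 'bin_1'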
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info('  Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed-size windows
    self.logger.info('')
    self.logger.info('  Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file, window_size, step_size,
                               self.profiles[genome_id], fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # run diamond
    self.logger.info('')
    self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info('  Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info('  Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    for genome_id, profile in self.profiles.iteritems():
        seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.iteritems():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.iteritems():
            taxa = []
            for r in xrange(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)
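# Hedged sketch of the fixed-size windowing performed by _fragment_genomes
# above (the real method also writes fragments to file and records profile
# statistics). The window/step semantics are assumed from the parameters.
def fragments(seq, window_size, step_size):
    for start in range(0, max(len(seq) - window_size, 0) + 1, step_size):
        yield seq[start:start + window_size]

assert list(fragments('ACGTACGT', 4, 2)) == ['ACGT', 'GTAC', 'ACGT']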
def _producer(self, genome_pair):
    """Identify reciprocal best blast hits between pairs of genomes.

    Parameters
    ----------
    genome_pair : list
        Identifier of genomes to process.
    """

    blast_stream = open(self.blast_table, 'rb', 32 * (10 ** 6))

    genome_fileA, genome_fileB = genome_pair

    # count number of genes in each genome
    genes_in_genomeA = seq_io.read_fasta(genome_fileA)
    genes_in_genomeB = seq_io.read_fasta(genome_fileB)

    genome_idA = remove_extension(genome_fileA)
    genome_idB = remove_extension(genome_fileB)

    # find blast hits between genome A and B, and vice versa
    hitsAB = self._valid_hits(blast_stream, self.offset_table,
                              self.per_identity_threshold, self.per_aln_len_threshold,
                              genome_idA, genome_idB)
    hitsBA = self._valid_hits(blast_stream, self.offset_table,
                              self.per_identity_threshold, self.per_aln_len_threshold,
                              genome_idB, genome_idA)

    # report reciprocal best blast hits
    if self.write_shared_genes:
        fout_seqs = open(os.path.join(self.shared_genes_dir,
                                      genome_idA + '-' + genome_idB + '.shared_genes.faa'), 'w')

    fout_stats = open(os.path.join(self.shared_genes_dir,
                                   genome_idA + '-' + genome_idB + '.rbb_hits.tsv'), 'w')
    fout_stats.write(genome_idA + '\t' + genome_idB + '\tPercent Identity\tPercent Alignment Length\te-value\tbitscore\n')

    per_identity_hits = []
    for query_id, hit_stats in hitsAB.iteritems():
        subject_id, per_identA, per_aln_lenA, evalueA, bitscoreA = hit_stats
        if subject_id in hitsBA and query_id == hitsBA[subject_id][0]:
            _subject_id, per_identB, per_aln_lenB, evalueB, bitscoreB = hitsBA[subject_id]

            # take average of statistics in both blast directions as
            # the results will be similar, but not identical
            per_ident = 0.5 * (per_identA + per_identB)
            per_identity_hits.append(per_ident)

            per_aln_len = 0.5 * (per_aln_lenA + per_aln_lenB)
            evalue = 0.5 * (evalueA + evalueB)
            bitscore = 0.5 * (bitscoreA + bitscoreB)

            fout_stats.write('%s\t%s\t%.2f\t%.2f\t%.2g\t%.2f\n'
                             % (query_id, subject_id, per_ident, per_aln_len, evalue, bitscore))

            # write out shared genes
            if self.write_shared_genes:
                fout_seqs.write('>' + query_id + '\n')
                fout_seqs.write(genes_in_genomeA[query_id] + '\n')

                fout_seqs.write('>' + subject_id + '\n')
                fout_seqs.write(genes_in_genomeB[subject_id] + '\n')

    if self.write_shared_genes:
        fout_seqs.close()
    fout_stats.close()

    mean_per_identity_hits = 0
    if len(per_identity_hits) > 0:
        mean_per_identity_hits = mean(per_identity_hits)

    std_per_identity_hits = 0
    if len(per_identity_hits) >= 2:
        std_per_identity_hits = std(per_identity_hits)

    return (genome_idA, len(genes_in_genomeA),
            genome_idB, len(genes_in_genomeB),
            len(per_identity_hits), mean_per_identity_hits, std_per_identity_hits)
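# Hedged sketch of the statistic averaging above: each reciprocal best hit
# reports the mean of the A->B and B->A values, since the two search
# directions give similar but not identical numbers. The helper name is
# illustrative.
def average_hit_stats(statsAB, statsBA):
    return tuple(0.5 * (a + b) for a, b in zip(statsAB, statsBA))

assert average_hit_stats((90.0, 80.0), (92.0, 84.0)) == (91.0, 82.0)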
def run(self, input_dir, tmp_dir, threads):
    # get path to all unprocessed genome files
    print 'Reading genomes.'
    genome_files = []
    for genome_dir in os.listdir(input_dir):
        cur_genome_dir = os.path.join(input_dir, genome_dir)
        if not os.path.isdir(cur_genome_dir):
            continue

        for assembly_id in os.listdir(cur_genome_dir):
            assembly_dir = os.path.join(cur_genome_dir, assembly_id)
            genome_id = assembly_id[0:assembly_id.find('_', 4)]

            # check if prodigal has already been called
            if False:
                # for safety, genes are currently being recalled for all genomes,
                # even though skipping previously processed genomes is far more efficient
                aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

            genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    print '[Warning] Genome file appears to be empty: %s' % genome_file
                else:
                    genome_files.append(genome_file)

    print '  Number of unprocessed genomes: %d' % len(genome_files)

    # run prodigal on each genome
    print 'Running prodigal.'
    prodigal = Prodigal(cpus=threads)
    summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

    # move results into individual genome directories
    print 'Moving files and calculating checksums.'
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)

        aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)
        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()