def _generate(self):
    """Generate a new Mash sketch file at ``self.path``.

    The genome paths in ``self.genomes`` are written, one per line, to a
    temporary list file which is handed to ``mash sketch -l``. The progress
    bar advances once for every stderr line starting with ``Sketching``.

    Raises
    ------
    GTDBTkExit
        If mash exits with a non-zero return code, or if ``self.path`` is
        not a file once mash has finished.
    """
    with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
        path_genomes = os.path.join(dir_tmp, 'genomes.txt')
        with open(path_genomes, 'w') as fh:
            fh.writelines(f'{path}\n' for path in self.genomes.values())

        args = ['mash', 'sketch', '-l', '-p', self.cpus, path_genomes,
                '-o', self.path, '-k', self.k, '-s', self.s]
        args = [str(arg) for arg in args]

        # stderr is consumed line-by-line to drive the progress bar, so it
        # is at EOF by the time the return code is known. Keep every
        # non-progress line so that the error message below has content.
        stderr_lines = []

        # stdout is never read: discard it rather than piping it, so a
        # full pipe buffer can never block mash. The context manager closes
        # the stderr pipe and waits for the process to exit.
        with subprocess.Popen(args,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE,
                              encoding='utf-8') as proc:
            with tqdm_log(total=len(self.genomes), unit='genome') as p_bar:
                for line in iter(proc.stderr.readline, ''):
                    if line.startswith('Sketching'):
                        p_bar.update()
                    else:
                        stderr_lines.append(line)

        if proc.returncode != 0 or not os.path.isfile(self.path):
            stderr_text = ''.join(stderr_lines)
            raise GTDBTkExit(f'Error generating Mash sketch: {stderr_text}')
def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
    """Apply canonical mask to MSA file.

    Parameters
    ----------
    gtdb_msa : dict
        Reference alignment, dict[seq id] = aligned sequence.
    user_msa : dict
        User alignment, dict[seq id] = aligned sequence. Only sequences
        whose id is in this dictionary can be pruned.
    msa_mask : str or file
        Mask readable by ``numpy.fromfile`` as single bytes; columns whose
        byte is ``b'1'`` are retained.
    min_perc_aa : float
        Threshold multiplied by the masked alignment length; a user
        sequence with fewer non-gap ('.' or '-') characters than this
        product is pruned.

    Returns
    -------
    Tuple[Dict[str, str], Dict[str, str]]
        (retained sequences, pruned sequences), both indexed by sequence
        id and containing the masked sequence.

    Raises
    ------
    MSAMaskLengthMismatch
        If a sequence length differs from the mask length.
    """
    aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
    list_mask = np.fromfile(msa_mask, dtype='S1') == b'1'
    mask_len = list_mask.shape[0]

    output_seqs, pruned_seqs = dict(), dict()
    for seq_id, seq in tqdm_log(aligned_genomes.items(), unit='sequence'):
        list_seq = np.fromiter(seq, dtype='S1')
        if mask_len != list_seq.shape[0]:
            raise MSAMaskLengthMismatch(
                f'Mask ({mask_len}) and alignment ({list_seq.shape[0]}) length do not match.'
            )

        list_masked_seq = list_seq[list_mask]

        # ndarray.tostring() is a deprecated alias that has been removed
        # from recent NumPy releases; tobytes() returns the same bytes.
        masked_seq = list_masked_seq.tobytes().decode('utf-8')

        # Count gap characters directly instead of tallying every symbol.
        n_masked = list_masked_seq.shape[0]
        n_gaps = np.count_nonzero(
            (list_masked_seq == b'.') | (list_masked_seq == b'-'))
        valid_bases = n_masked - n_gaps

        # Reference sequences are always kept; only user sequences are
        # subject to the minimum-content filter.
        if seq_id in user_msa and valid_bases < n_masked * min_perc_aa:
            pruned_seqs[seq_id] = masked_seq
            continue

        output_seqs[seq_id] = masked_seq

    return output_seqs, pruned_seqs
def align_marker_set(gid_dict, marker_info_file: MarkerInfoFile,
                     copy_number_file: CopyNumberFile, cpus):
    """Build the concatenated marker alignment for one domain.

    Parameters
    ----------
    gid_dict : dict
        Per-genome information, indexed by genome id.
    marker_info_file : MarkerInfoFile
        Domain-specific marker info file; its ``markers`` mapping supplies
        the 'size' and 'path' of each marker.
    copy_number_file : CopyNumberFile
        Domain-specific copy number file.
    cpus : int
        Number of worker processes used by the sub-process pools.

    Returns
    -------
    Dict[str, str]
        dict[gid] = sequence
    """
    logger = logging.getLogger('timestamp')
    logger.log(LOG_TASK, 'Generating concatenated alignment for each marker.')
    single_copy_hits = get_single_copy_hits(gid_dict, copy_number_file, cpus)

    def _negative_size(path_item):
        # Sort key: largest markers first.
        return -marker_info_file.markers[path_item[0]]['size']

    with tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_') as dir_tmp:

        # One FASTA file per marker, holding every genome's hit.
        marker_paths = {}
        for marker_id, hits_by_gid in single_copy_hits.items():
            fasta_path = os.path.join(dir_tmp, f'{marker_id}.fa')
            marker_paths[marker_id] = fasta_path
            with open(fasta_path, 'w') as fh:
                for gid, seq in hits_by_gid.items():
                    fh.write(f'>{gid}\n{seq}\n')

        hmmer_v = HmmAligner.get_version()
        logger.log(
            LOG_TASK,
            f'Aligning {len(marker_paths)} identified markers using hmmalign {hmmer_v}.'
        )

        # Job tuples: (marker id, marker HMM path, FASTA path, genome ids).
        ordered_paths = sorted(marker_paths.items(), key=_negative_size)
        queue = [
            (marker_id,
             marker_info_file.markers[marker_id]['path'],
             fasta_path,
             frozenset(single_copy_hits[marker_id]))
            for marker_id, fasta_path in ordered_paths
        ]

        with mp.get_context('spawn').Pool(processes=cpus) as pool:
            aligned = pool.imap_unordered(run_hmm_align_worker, queue)
            results = list(tqdm_log(aligned, total=len(queue), unit='marker'))

        # Concatenate while the temporary directory still exists.
        return create_concat_alignment(results, marker_info_file)
def _writer(self, q_writer, n_genomes):
    """Report worker progress by draining the writer queue.

    Parameters
    ----------
    q_writer : multiprocessing.Queue
        Queue that receives one item per processed genome, followed by a
        ``None`` sentinel when processing is finished.
    n_genomes : int
        Total number of genomes expected, used as the progress bar total.
    """
    # Blocks on q_writer.get() until the None sentinel is received.
    processed = iter(q_writer.get, None)
    with tqdm_log(total=n_genomes, unit='genome') as p_bar:
        for _item in processed:
            p_bar.update()
def _writer(self, q_writer, n_total):
    """Report worker progress by draining the writer queue.

    Parameters
    ----------
    q_writer : mp.Queue
        Queue that receives one item per completed unit of work, followed
        by a ``None`` sentinel when processing is finished.
    n_total : int
        Total number of items expected, used as the progress bar total.
    """
    # Blocks on q_writer.get() until the None sentinel is received.
    completed = iter(q_writer.get, None)
    with tqdm_log(unit='comparison', total=n_total) as p_bar:
        for _item in completed:
            p_bar.update()
def get_single_copy_hits(gid_dict: dict, copy_number_file, cpus):
    """Gather the single copy marker hits of every genome.

    Parameters
    ----------
    gid_dict : dict
        Per-genome information, indexed by genome id. Each entry must
        provide an 'aa_gene_path' key.
    copy_number_file : CopyNumberFile
        Domain-specific copy number file, passed to every worker.
    cpus : int
        Number of worker processes in the pool.

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """
    # One job per genome.
    queue = [(gid, gid_info['aa_gene_path'], copy_number_file)
             for gid, gid_info in gid_dict.items()]

    # Fan the jobs out over a pool of spawned worker processes.
    with mp.get_context('spawn').Pool(processes=cpus) as pool:
        per_genome = pool.imap_unordered(get_single_copy_hits_worker, queue)
        results = list(tqdm_log(per_genome, total=len(queue), unit='genome'))

    # Merge the per-genome results into a single marker-indexed mapping.
    out = defaultdict(dict)
    for result in results:
        for marker_id, marker_d in result.items():
            # Skip empty mappings so that no empty marker entry is created.
            if marker_d:
                out[marker_id].update(marker_d)
    return out
def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                    write_single_copy_genes):
    """Report statistics for identified marker genes.

    Parameters
    ----------
    gene_dict : dict
        Per-genome information, indexed by genome id. The
        'aa_gene_path' and 'best_translation_table' entries are read.
    outdir : str
        Output directory handed to the summary files and used as the root
        for the marker gene and FASTA directories.
    prefix : str
        Prefix handed to the summary files and used to format the failed
        genomes path.
    write_single_copy_genes : bool
        If true, also write one unaligned FASTA file per marker, per
        domain, containing the single copy hit of each genome.
    """

    # Summarise the copy number of each AR53 and BAC120 markers.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # The marker gene directory is the same for every genome.
    cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

    # Process each genome.
    for db_genome_id, info in tqdm_log(sorted(gene_dict.items()), unit='genome'):
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        aa_gene_path = info.get("aa_gene_path")
        ar53_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                         pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                           pfam_tophit_file, tigr_tophit_file)

        # Record the best translation table for this genome.
        tln_summary_file.add_genome(db_genome_id,
                                    info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar53_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Create a symlink to store the summary files in the root.
    # (The marker and translation table summary symlinks are disabled.)
    # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
    path_fails = PATH_FAILS.format(prefix=prefix)
    symlink_f(path_fails, os.path.join(outdir, os.path.basename(path_fails)))

    # Write the single copy AR53/BAC120 FASTA files to disk.
    if not write_single_copy_genes:
        return

    fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
    self.logger.info(f'Writing unaligned single-copy genes to: {fasta_dir}')

    # (marker file names, copy number file, domain sub-directory name).
    marker_doms = [
        (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
         ar53_copy_number_file, 'ar53'),
        (Config.BAC120_MARKERS['PFAM'] + Config.BAC120_MARKERS['TIGRFAM'],
         bac120_copy_number_file, 'bac120'),
    ]

    # The genome order is identical for every marker; sort it once.
    sorted_genome_ids = sorted(gene_dict)

    # Iterate over each domain.
    for marker_names, marker_file, marker_d in marker_doms:

        # Create the domain-specific subdirectory.
        fasta_d_dir = os.path.join(fasta_dir, marker_d)
        make_sure_path_exists(fasta_d_dir)

        # Iterate over each marker.
        for marker_name in marker_names:
            # Drop a trailing '.hmm' / '.HMM' extension. str.rstrip() takes
            # a *set of characters*, not a pattern, so using it here could
            # also remove trailing h/m/. characters of the name itself.
            if marker_name.lower().endswith('.hmm'):
                marker_name = marker_name[:-len('.hmm')]
            marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

            to_write = list()
            for genome_id in sorted_genome_ids:
                unq_hits = marker_file.get_single_copy_hits(genome_id)
                if marker_name in unq_hits:
                    to_write.append(f'>{genome_id}')
                    to_write.append(unq_hits[marker_name]['seq'])

            # Only create a file for markers with at least one hit.
            if to_write:
                with open(marker_path, 'w') as fh:
                    fh.write('\n'.join(to_write))
def _writerThread(self, numDataItems, writerQueue):
    """Track the progress of the worker threads from a single thread.

    Parameters
    ----------
    numDataItems : int
        Total number of items expected, used as the progress bar total.
    writerQueue : queue
        Queue that receives one item per processed genome, followed by a
        ``None`` sentinel when processing is finished.
    """
    # Blocks on writerQueue.get() until the None sentinel is received.
    finished_items = iter(writerQueue.get, None)
    with tqdm_log(total=numDataItems, unit='genome') as p_bar:
        for _item in finished_items:
            p_bar.update()