def run(self, gene_files):
    """Annotate genes with TIGRFAM HMMs.

    Parameters
    ----------
    gene_files : iterable
        Gene files in FASTA format to process.
    """
    if len(gene_files) == 0:
        raise GTDBTkExit('There are no genomes to process.')
    self.cpus_per_genome = max(1, int(self.threads / len(gene_files)))

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()
    n_skipped = mp.Value('i', 0)

    for f in gene_files:
        workerQueue.put(f)

    for _ in range(self.threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self._workerThread,
                                 args=(workerQueue, writerQueue, n_skipped))
                      for _ in range(self.threads)]
        writeProc = mp.Process(target=self._writerThread,
                               args=(len(gene_files), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

        for proc in workerProc:
            if proc.exitcode != 0:
                raise GTDBTkExit('An error was encountered while running hmmsearch.')
    except Exception:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
        raise

    if n_skipped.value > 0:
        genome_s = 'genome' if n_skipped.value == 1 else 'genomes'
        self.logger.warning(f'TIGRFAM skipped {n_skipped.value:,} {genome_s} '
                            f'due to pre-existing data, see warnings.log')

def _find_ingroup_red(self, ingroup_node, ingroup_domain, tree):
    """Find RED of the ingroup taxon."""
    red_file = MRCA_RED_BAC120
    if ingroup_domain == 'd__Archaea':
        red_file = MRCA_RED_AR53

    # create map from leaf labels to tree nodes
    leaf_node_map = {}
    for leaf in tree.leaf_node_iter():
        leaf_node_map[leaf.taxon.label] = leaf

    # find RED value of ingroup node
    with open(red_file) as rf:
        for line in rf:
            label_ids, red = line.strip().split('\t')
            labels = label_ids.split('|')
            if len(labels) == 2:
                taxa = [leaf_node_map[label].taxon for label in labels]
                node = tree.mrca(taxa=taxa)
                if node == ingroup_node:
                    return float(red)

    raise GTDBTkExit(f'Could not determine RED of ingroup taxon {ingroup_node}.')

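# A minimal sketch (hypothetical labels and RED value) of the RED file format
# parsed above: each line holds pipe-separated leaf labels, a tab, and the
# RED value of the node defined by their MRCA.
line = 'G000005825|G000006175\t0.342\n'
label_ids, red = line.strip().split('\t')
labels = label_ids.split('|')
assert labels == ['G000005825', 'G000006175'] and float(red) == 0.342
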
def read(self):
    """Reads the marker names from disk. No sequence information!"""
    with open(self.path) as fh:
        fh.readline()
        for line in fh.readlines():
            genome_id, n_unq, n_mul, n_muq, n_mis, unq, mul, muq, mis = line.split('\t')
            n_unq, n_mul, n_muq, n_mis = int(n_unq), int(n_mul), int(n_muq), int(n_mis)
            self.genomes[genome_id] = {
                'unq': {x: None for x in unq.strip().split(',') if len(x) > 0},
                'mul': {x: None for x in mul.strip().split(',') if len(x) > 0},
                'muq': {x: None for x in muq.strip().split(',') if len(x) > 0},
                'mis': {x: None for x in mis.strip().split(',') if len(x) > 0}
            }
            cur_dict = self.genomes[genome_id]
            if len(cur_dict['unq']) != n_unq or len(cur_dict['mul']) != n_mul or \
                    len(cur_dict['muq']) != n_muq or len(cur_dict['mis']) != n_mis:
                raise GTDBTkExit(f'The marker file is inconsistent: {self.path}')

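# A minimal sketch (hypothetical marker IDs) of one data row parsed above:
# genome ID, the four counts, then the four comma-separated marker lists
# (unique, multiple, multiple-unique, missing).
line = 'genome_a\t2\t1\t0\t1\tPF01000.1,TIGR00001\tPF01001.1\t\tTIGR00002\n'
genome_id, n_unq, n_mul, n_muq, n_mis, unq, mul, muq, mis = line.split('\t')
unq_markers = {x: None for x in unq.strip().split(',') if len(x) > 0}
assert len(unq_markers) == int(n_unq) == 2
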
def run_hmm_align_worker(job):
    """A worker process for running hmmalign.

    Parameters
    ----------
    job : Tuple[str, str, str, frozenset]
        The marker id, hmm marker path, called genes path, expected gids.

    Returns
    -------
    List[Tuple[str, str, str]]
        A list containing the (genome id, marker id, sequence).
    """
    marker_id, marker_path, marker_fa, expected_gids = job

    # Run the process and capture stdout.
    args = ["hmmalign", "--outformat", "Pfam", marker_path, marker_fa]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()

    # Exit if an error was raised.
    if proc.returncode != 0:
        arg_str = ' '.join(args)
        raise GTDBTkExit(f'hmmalign returned a non-zero exit code: {arg_str}')

    # Process the output and return the sequences.
    seqs = read_hmmalign_output(stdout, expected_gids)
    return [(gid, marker_id, seq) for gid, seq in seqs.items()]

def __init__(self, genomes, root, prefix, cpus, k, s, mash_db=None):
    """Create a query file for a given set of genomes.

    Parameters
    ----------
    genomes : dict[str, str]
        The genomes to create a sketch file from (genome_id, fasta_path).
    root : str
        The directory where the sketch file will be saved.
    prefix : str
        The prefix to use for this file.
    cpus : int
        The maximum number of CPUs available for Mash.
    k : int
        The k-mer size.
    s : int
        Maximum number of non-redundant hashes.
    mash_db : Optional[str]
        The path to read/write the pre-computed Mash reference sketch database.
    """
    if mash_db is not None:
        export_msh = mash_db.rstrip('\\')
        if not export_msh.endswith(".msh"):
            export_msh = export_msh + ".msh"
        if os.path.isdir(export_msh):
            raise GTDBTkExit(f"{export_msh} is a directory")
        make_sure_path_exists(os.path.dirname(export_msh))
        path = export_msh
    else:
        path = os.path.join(root, f'{prefix}.{self.name}')
    super().__init__(genomes, path, cpus, k, s)

def _generate(self):
    """Generate a new sketch file."""
    with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
        path_genomes = os.path.join(dir_tmp, 'genomes.txt')
        with open(path_genomes, 'w') as fh:
            for path in self.genomes.values():
                fh.write(f'{path}\n')

        args = ['mash', 'sketch', '-l', '-p', self.cpus, path_genomes,
                '-o', self.path, '-k', self.k, '-s', self.s]
        args = list(map(str, args))
        proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, encoding='utf-8')

        # stderr is consumed line-by-line for the progress bar, so buffer it
        # here in case it is needed for error reporting.
        stderr_lines = []
        with tqdm_log(total=len(self.genomes), unit='genome') as p_bar:
            for line in iter(proc.stderr.readline, ''):
                stderr_lines.append(line)
                if line.startswith('Sketching'):
                    p_bar.update()
        proc.wait()

        if proc.returncode != 0 or not os.path.isfile(self.path):
            raise GTDBTkExit(f'Error generating Mash sketch: {"".join(stderr_lines)}')

def read_hmmalign_output(output, expected_gids):
    """Reads the output of hmmalign and extracts the alignment.

    Parameters
    ----------
    output : str
        The string output from hmmalign.
    expected_gids : frozenset
        A set containing all of the gids expected in the output.

    Returns
    -------
    Dict[str, str]
        dict[genome id] = sequence
    """
    # In case gids have spaces.
    exp_mapping = {x.split(' ', 1)[0]: x for x in expected_gids}

    # Get the sequences and the mask.
    unmasked = dict()
    mask = None
    for line in output.splitlines():
        splitline = line.split(' ', 1)

        # Sequence
        if splitline[0] in exp_mapping:
            rsplitline = line.rsplit(' ', 1)
            hit_seq = rsplitline[-1]
            unmasked[exp_mapping[splitline[0]]] = hit_seq

        # Mask
        elif line.startswith('#=GC RF'):
            mask = [x == 'x' for x in line.rsplit(' ', 1)[-1]]

    # Sanity check.
    if mask is None:
        raise GTDBTkExit(f'Unable to get mask from hmmalign result file: {output}')
    if len(unmasked) != len(expected_gids):
        raise GTDBTkExit(f'Not all genomes could be aligned: {output}')

    # Mask each of the sequences and return them.
    out = dict()
    for gid, seq in unmasked.items():
        out[gid] = ''.join([s for s, m in zip(seq, mask) if m is True])
    return out

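# A minimal sketch (hypothetical alignment) of the masking performed above:
# only columns flagged 'x' in the '#=GC RF' reference annotation are kept.
seq = 'M-KL.A'
mask = [x == 'x' for x in 'x.xx.x']
assert ''.join(s for s, m in zip(seq, mask) if m is True) == 'MKLA'
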
def read(path):
    logger = logging.getLogger('timestamp')
    genomes, tln_tables = dict(), dict()
    seen_paths = set()
    warnings = list()
    with open(path) as fh:
        for line_no, line in enumerate(fh):
            line_split = line.strip().split("\t")
            if line_split[0] == '':
                continue  # blank line

            if len(line_split) not in {2, 3}:
                raise GTDBTkExit('Batch file must contain either 2 '
                                 'columns (detect translation table), '
                                 'or 3 (specify translation table).')

            if len(line_split) == 2:
                genome_file, genome_id = line_split
            elif len(line_split) == 3:
                genome_file, genome_id, tln_table = line_split
                if tln_table not in {'4', '11'}:
                    raise GTDBTkExit('Specified translation table must '
                                     'be either 4, or 11.')
                tln_tables[genome_id] = int(tln_table)

            if genome_file is None or genome_file == '':
                warnings.append(f'Missing genome path on line {line_no + 1}.')
            elif genome_id is None or genome_id == '':
                warnings.append(f'Missing genome ID on line {line_no + 1}.')
            elif genome_id in genomes:
                warnings.append(f'Genome ID {genome_id} appears multiple times.')
            if genome_file in seen_paths:
                logger.warning(f'Genome file appears multiple times: {genome_file}')

            # All good, record the value.
            genomes[genome_id] = genome_file
            seen_paths.add(genome_file)

    # Check if any warnings were raised.
    if len(warnings) > 0:
        warning_str = '\n'.join(warnings)
        raise GTDBTkExit(f'Please check the format of your batchfile, '
                         f'the following errors were found: {warning_str}')

    return genomes, tln_tables

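# A minimal usage sketch (hypothetical paths and IDs), assuming read() above
# is in scope. The batchfile is tab-separated: genome path, genome ID, and an
# optional translation table (must be 4 or 11).
with open('batchfile.tsv', 'w') as fh:
    fh.write('/data/genome_a.fna\tgenome_a\n')
    fh.write('/data/genome_b.fna\tgenome_b\t11\n')
genomes, tln_tables = read('batchfile.tsv')
assert genomes == {'genome_a': '/data/genome_a.fna',
                   'genome_b': '/data/genome_b.fna'}
assert tln_tables == {'genome_b': 11}
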
def _workerThread(self, queueIn, queueOut, n_skipped):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        genome_id = filename.replace(self.protein_file_suffix, '')
        genome_dir = os.path.join(self.output_dir, genome_id)
        output_hit_file = os.path.join(genome_dir, filename.replace(
            self.protein_file_suffix, self.tigrfam_suffix))
        hmmsearch_out = os.path.join(genome_dir, filename.replace(
            self.protein_file_suffix, '_tigrfam.out'))

        # Check if this has already been processed.
        out_files = (output_hit_file, hmmsearch_out,
                     TopHitTigrFile.get_path(self.output_dir, genome_id))
        if all([file_has_checksum(x) for x in out_files]):
            self.warnings.info(f'Skipped TIGRFAM processing for: {genome_id}')
            with n_skipped.get_lock():
                n_skipped.value += 1
        else:
            args = ['hmmsearch', '-o', hmmsearch_out, '--tblout', output_hit_file,
                    '--noali', '--notextw', '--cut_nc',
                    '--cpu', str(self.cpus_per_genome),
                    self.tigrfam_hmms, gene_file]
            p = subprocess.Popen(args, stdout=subprocess.PIPE, encoding='utf-8')
            stdout, stderr = p.communicate()
            if p.returncode != 0:
                raise GTDBTkExit(f'Non-zero exit code returned when '
                                 f'running hmmsearch: {stdout}')

            # calculate checksum
            for out_file in [output_hit_file, hmmsearch_out]:
                checksum = sha256(out_file)
                with open(out_file + self.checksum_suffix, 'w') as fh:
                    fh.write(checksum)

            # identify top hit for each gene
            self._topHit(output_hit_file)

        # allow results to be processed or written to file
        queueOut.put(gene_file)

def _find_ingroup_taxon(self, ingroup_taxon, tree):
    """Find node of ingroup taxon in tree."""
    ingroup_node = None
    for node in tree.postorder_node_iter():
        support, taxon, auxiliary_info = parse_label(node.label)
        if taxon:
            taxa = [t.strip() for t in taxon.split(';')]
            if ingroup_taxon in taxa:
                if ingroup_node is not None:
                    raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} '
                                     f'identified multiple times.')
                ingroup_node = node

    if ingroup_node is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} not found in tree.')

    return ingroup_node

def read(self):
    """Read the translation table summary file from disk."""
    if len(self.genomes) > 0:
        raise GTDBTkExit(f'Warning! Attempting to override in-memory values '
                         f'for translation table summary file: {self.path}')
    with open(self.path, 'r') as fh:
        for line in fh.readlines():
            gid, tbl = line.strip().split('\t')
            self.genomes[gid] = int(tbl)

def check_install(self):
    """Check that all third-party software is on the system path and that
    all reference files exist.

    Raises
    ------
    GTDBTkExit
        If the reference package is incomplete or corrupt.
    """
    # Check that all programs are on the system path.
    self.logger.info('Checking that all third-party software is on the system path:')
    names = {'prodigal', 'hmmsearch', 'fastANI', 'mash', 'pplacer',
             'guppy', 'FastTree', 'FastTreeMP', 'hmmalign'}
    for name in sorted(names):
        on_path = False
        try:
            on_path = on_path or check_dependencies([name], exit_on_fail=False)
        except Exception:
            pass
        if on_path:
            self.logger.info(" |-- {:16} {}".format(
                name, colour('OK', ['bright'], fg='green')))
        else:
            self.logger.info(" |-- {:16} {}".format(
                name, colour('NOT FOUND', ['bright'], fg='yellow')))

    # Assume this was successful unless otherwise observed.
    ok = True

    # Compute the hash for each directory.
    self.logger.info(f'Checking integrity of reference package: {Config.GENERIC_PATH}')
    for obj_path, expected_hash in Config.REF_HASHES.items():
        base_name = obj_path[:-1] if obj_path.endswith('/') else obj_path
        base_name = base_name.split('/')[-1]
        user_hash = sha1_dir(obj_path, progress=True)
        if user_hash != expected_hash:
            self.logger.info(" |-- {:16} {}".format(
                base_name, colour(f'HASH MISMATCH {user_hash}', ['bright'], fg='yellow')))
            ok = False
        else:
            self.logger.info(" |-- {:16} {}".format(
                base_name, colour('OK', ['bright'], fg='green')))

    if not ok:
        raise GTDBTkExit('Unexpected files were seen, or the reference package is corrupt.')

def _calculate(self):
    self.logger.info('Calculating Mash distances.')
    args = ['mash', 'dist', '-p', self.cpus, '-d', self.max_d,
            '-v', self.mash_v, self.ref_sketch.path, self.qry_sketch.path]
    args = list(map(str, args))
    with open(self.path, 'w') as f_out:
        proc = subprocess.Popen(args, stdout=f_out,
                                stderr=subprocess.PIPE, encoding='utf-8')
        _, stderr = proc.communicate()
    if proc.returncode != 0:
        raise GTDBTkExit(f'Error running Mash dist: {stderr}')

def _load_metadata(self):
    """Loads the metadata from an existing Mash sketch file."""
    args = ['mash', 'info', '-t', self.path]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()

    if proc.returncode != 0:
        raise GTDBTkExit(f'Error reading Mash sketch file {self.path}:\n{stderr}')

    for hashes, length, path in re.findall(r'(\d+)\t(\d+)\t(.+)\t.+\n', stdout):
        self.data[path] = (int(hashes), int(length))

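# A minimal sketch (hypothetical values) of one row of `mash info -t` output
# and the regex used above: hashes, sequence length, path, then a trailing
# comment column that is discarded.
import re
stdout = '1000\t4215606\t/data/genome_a.fna\t[354 seqs] ...\n'
for hashes, length, path in re.findall(r'(\d+)\t(\d+)\t(.+)\t.+\n', stdout):
    assert (path, int(hashes), int(length)) == ('/data/genome_a.fna', 1000, 4215606)
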
def read(self):
    """Read the summary file from disk."""
    if not os.path.isfile(self.path):
        raise GTDBTkExit(f'Error, classify summary file not found: {self.path}')
    with open(self.path) as fh:

        # Load and verify the columns match the expected order.
        cols_exp, _ = self.get_col_order()
        cols_cur = fh.readline().strip().split('\t')
        if cols_exp != cols_cur:
            raise GTDBTkExit(f'The classify summary file columns are '
                             f'inconsistent: {cols_cur}')

        # Process the data.
        for line in fh.readlines():
            data = line.strip().split('\t')
            row = ClassifySummaryFileRow()
            row.gid = data[0]
            row.classification = data[1]
            row.fastani_ref = data[2]
            row.fastani_ref_radius = data[3]
            row.fastani_tax = data[4]
            row.fastani_ani = data[5]
            row.fastani_af = data[6]
            row.closest_placement_ref = data[7]
            row.closest_placement_radius = data[8]
            row.closest_placement_tax = data[9]
            row.closest_placement_ani = data[10]
            row.closest_placement_af = data[11]
            row.pplacer_tax = data[12]
            row.classification_method = data[13]
            row.note = data[14]
            row.other_related_refs = data[15]
            row.msa_percent = data[16]
            row.tln_table = data[17]
            row.red_value = data[18]
            row.warnings = data[19]
            self.add_row(row)

def run(self, genomic_files):
    """Run Prodigal across a set of genomes.

    Parameters
    ----------
    genomic_files : dict
        Dictionary indicating the genomic and gene file for each genome.
    """
    # populate worker queue with data to process
    worker_queue = mp.Queue()
    writer_queue = mp.Queue()

    for genome_id, file_path in genomic_files.items():
        worker_queue.put([genome_id, file_path])

    for _ in range(self.threads):
        worker_queue.put(None)

    try:
        manager = mp.Manager()
        out_dict = manager.dict()

        worker_proc = [mp.Process(target=self._worker,
                                  args=(out_dict, worker_queue, writer_queue))
                       for _ in range(self.threads)]
        writer_proc = mp.Process(target=self._writer,
                                 args=(len(genomic_files), writer_queue))

        writer_proc.start()
        for p in worker_proc:
            p.start()

        for p in worker_proc:
            p.join()

            # Gracefully terminate the program.
            if p.exitcode != 0:
                raise GTDBTkExit('Prodigal returned a non-zero exit code.')

        writer_queue.put(None)
        writer_proc.join()
    except Exception:
        for p in worker_proc:
            p.terminate()
        writer_proc.terminate()
        raise

    result_dict = {k: v for k, v in out_dict.items()}
    return result_dict

def read(self):
    """Read the translation table data from disk."""
    with open(self.path, 'r') as fh:
        for line in fh.readlines():
            idx, val = line.strip().split('\t')
            if idx == 'best_translation_table':
                try:
                    self.best_tln_table = int(val)
                except ValueError:
                    raise GTDBTkExit(f'Invalid translation table: {val} for {self.path}')
            elif idx == 'coding_density_4':
                try:
                    self.coding_density_4 = float(val)
                except ValueError:
                    raise GTDBTkExit(f'Invalid coding density: {val} for {self.path}')
            elif idx == 'coding_density_11':
                try:
                    self.coding_density_11 = float(val)
                except ValueError:
                    raise GTDBTkExit(f'Invalid coding density: {val} for {self.path}')

def read(self):
    """Read the summary file from disk."""
    if not os.path.isfile(self.path):
        raise GTDBTkExit(f'Error, classify tree mappings file not found: {self.path}')
    with open(self.path) as fh:

        # Load and verify the columns match the expected order.
        cols_exp, _ = self.get_col_order()
        cols_cur = fh.readline().strip().split('\t')
        if cols_exp != cols_cur:
            raise GTDBTkExit(f'The classify tree mappings columns are '
                             f'inconsistent: {cols_cur}')

        # Process the data.
        for line in fh.readlines():
            data = line.strip().split('\t')
            row = GenomeMappingFileRow()
            row.gid = data[0]
            row.ani_classification = data[1]
            row.mapped_tree = data[2]
            self.add_row(row)

def add_genome(self, genome_id: str, path_faa: str,
               pfam_th: TopHitPfamFile, tigr_th: TopHitTigrFile):
    """Process the top hit files for a genome and store the copy info."""
    if genome_id in self.genomes:
        self.logger.warning(f'Genome already exists in copy number file: {genome_id}')
    self.genomes[genome_id] = {'unq': dict(), 'mul': dict(),
                               'muq': dict(), 'mis': dict()}

    # Pointers to unique, multiple hit, multiple-unique, missing markers.
    cur_unq = self.genomes[genome_id]['unq']
    cur_mul = self.genomes[genome_id]['mul']
    cur_muq = self.genomes[genome_id]['muq']
    cur_mis = self.genomes[genome_id]['mis']

    # Load genes from the prodigal faa file.
    d_genes = read_fasta(path_faa, False)
    for seq_id, seq in d_genes.items():
        if seq.endswith('*'):
            d_genes[seq_id] = seq[:-1]

    # Create a dictionary of marker names -> Hits
    d_hmm_hits = self._merge_hit_files(pfam_th, tigr_th)

    # For each expected marker, determine which category it falls into.
    for marker_id in self.marker_names:

        # Marker is missing.
        if marker_id not in d_hmm_hits:
            cur_mis[marker_id] = None

        # Multiple hits to the same marker.
        elif len(d_hmm_hits[marker_id]) > 1:

            # If sequences are the same, take the most significant hit
            unq_seqs = {d_genes[x.gene_id] for x in d_hmm_hits[marker_id]}
            if len(unq_seqs) == 1:
                cur_top_hit = sorted(d_hmm_hits[marker_id], reverse=True)[0]
                cur_muq[marker_id] = {'hit': cur_top_hit,
                                      'seq': d_genes[cur_top_hit.gene_id]}

            # Marker maps to multiple genes.
            else:
                cur_mul[marker_id] = None

        # This was a unique hit.
        else:
            cur_hit = d_hmm_hits[marker_id][0]
            cur_unq[marker_id] = {'hit': cur_hit,
                                  'seq': d_genes[cur_hit.gene_id]}

    # Sanity check - confirm that the total number of markers matches.
    if len(self.marker_names) != len(cur_unq) + len(cur_mul) + len(cur_muq) + len(cur_mis):
        raise GTDBTkExit('The marker set is inconsistent, please report this issue.')

def _parse_result_queue(self, q_results, path_to_gid):
    """Creates the output dictionary given the results from FastANI.

    Parameters
    ----------
    q_results : mp.Queue
        A multiprocessing queue containing raw results.
    path_to_gid : dict[str, str]
        A dictionary containing the file path to genome id.

    Returns
    -------
    dict[str, dict[str, dict[str, float]]]
        The ANI/AF of the query genome to all reference genomes.
    """
    out = dict()
    while True:
        q_item = q_results.get(block=True)
        if q_item is None:
            break

        job, result = q_item
        qry_gid = job['qry']

        for path_a, dict_b in result.items():
            for path_b, (ani, af) in dict_b.items():
                gid_a, gid_b = path_to_gid[path_a], path_to_gid[path_b]

                # This was done in the forward direction.
                if gid_a == qry_gid:
                    ref_gid = gid_b
                # This was done in the reverse direction.
                elif gid_b == qry_gid:
                    ref_gid = gid_a
                else:
                    raise GTDBTkExit('FastANI results are malformed.')

                # Take the largest ANI / AF from either pass.
                if qry_gid not in out:
                    out[qry_gid] = {ref_gid: {'ani': ani, 'af': af}}
                elif ref_gid not in out[qry_gid]:
                    out[qry_gid][ref_gid] = {'ani': ani, 'af': af}
                else:
                    out[qry_gid][ref_gid]['ani'] = max(out[qry_gid][ref_gid]['ani'], ani)
                    out[qry_gid][ref_gid]['af'] = max(out[qry_gid][ref_gid]['af'], af)

    return out

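# A minimal sketch (hypothetical values) of the symmetric merge above: when a
# query/reference pair was compared in both directions, the larger ANI and AF
# of the two passes is retained.
out = {'qry': {'ref': {'ani': 96.5, 'af': 0.80}}}
ani, af = 97.1, 0.78  # the reverse-direction result
out['qry']['ref']['ani'] = max(out['qry']['ref']['ani'], ani)
out['qry']['ref']['af'] = max(out['qry']['ref']['af'], af)
assert out['qry']['ref'] == {'ani': 97.1, 'af': 0.80}
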
def export_msa(domain: Domain, output_file: str):
    """Exports the GTDB MSA to the specified path.

    :param domain: The domain used to determine the marker set.
    :param output_file: The path to write the MSA.
    """
    if domain is Domain.ARCHAEA:
        file_to_export = CONCAT_AR53
    elif domain is Domain.BACTERIA:
        file_to_export = CONCAT_BAC120
    else:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')

    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(file_to_export, output_file)

def _get_ingroup_domain(self, ingroup_taxon) -> str:
    """Get domain of ingroup taxon."""
    # read GTDB taxonomy in order to establish domain of ingroup taxon
    gtdb_taxonomy = Taxonomy().read(TAXONOMY_FILE)
    ingroup_domain = None
    for taxa in gtdb_taxonomy.values():
        if ingroup_taxon in taxa:
            ingroup_domain = taxa[Taxonomy.DOMAIN_IDX]

    if ingroup_domain is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')

    return ingroup_domain

def run_proc(self, q, r, ql, rl, output):
    """Runs the FastANI process.

    Parameters
    ----------
    q : str
        The path to the query genome.
    r : str
        The path to the reference genome.
    ql : str
        The path to the query list file.
    rl : str
        The path to the reference list file.
    output : str
        The path to the output file.

    Returns
    -------
    dict[str, dict[str, float]]
        The ANI/AF of the query genomes to the reference genomes.
    """
    args = ['fastANI']
    if self.minFrac:
        args.extend(['--minFraction', '0'])
    if q is not None:
        args.extend(['-q', q])
    if r is not None:
        args.extend(['-r', r])
    if ql is not None:
        args.extend(['--ql', ql])
    if rl is not None:
        args.extend(['--rl', rl])
    args.extend(['-o', output])
    self.logger.debug(' '.join(args))

    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()

    if proc.returncode != 0:
        self.logger.error('STDOUT:\n' + stdout)
        self.logger.error('STDERR:\n' + stderr)
        raise GTDBTkExit('FastANI returned a non-zero exit code.')

    # Parse the output
    return self.parse_output_file(output)

def read(self, taxonomy_file: str, canonical_ids: bool = False) -> Dict[str, List[str]]:
    """Read Greengenes-style taxonomy file.

    Expected format is:
        <id>\t<taxonomy string>

    where the taxonomy string has the format:
        d__; p__; c__; o__; f__; g__; s__

    Parameters
    ----------
    taxonomy_file : str
        Path to a Greengenes-style taxonomy file.
    canonical_ids : bool
        True if to use the canonical ID format, False otherwise.
    """
    try:
        d = {}
        row = 0
        with open(taxonomy_file, 'r') as f:
            for row, line in enumerate(f.readlines()):
                line_split = line.split('\t')
                if len(line_split) != 2:
                    raise GTDBTkExit(f'Not a tab-separated line: {line}')

                unique_id = line_split[0]
                if canonical_ids:
                    unique_id = canonical_gid(unique_id)

                tax_str = line_split[1].rstrip()
                if tax_str[-1] == ';':
                    # remove trailing semicolons which sometimes
                    # appear in Greengenes-style taxonomy files
                    tax_str = tax_str[0:-1]

                d[unique_id] = [x.strip() for x in tax_str.split(';')]
    except Exception:
        self.logger.error('Failed to parse taxonomy file on line %d' % (row + 1))
        raise

    return d

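# A minimal sketch (hypothetical genome ID) of one taxonomy line and how it
# splits into the seven canonical ranks.
line = ('GCF_000005845.2\td__Bacteria; p__Proteobacteria; '
        'c__Gammaproteobacteria; o__Enterobacterales; '
        'f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli;')
unique_id, tax_str = line.split('\t')
ranks = [x.strip() for x in tax_str.rstrip(';').split(';')]
assert len(ranks) == 7 and ranks[0] == 'd__Bacteria'
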
def parse_output_file(self, path_out):
    """Parses the resulting output file from FastANI.

    Parameters
    ----------
    path_out : str
        The path where the output file resides.

    Returns
    -------
    dict[str, dict[str, tuple[float, float]]]
        The ANI/AF of the query genomes to the reference genomes.
    """
    out = dict()
    if os.path.isfile(path_out):
        with open(path_out, 'r') as fh:
            for line in fh.readlines():
                # FastANI version >= 1.1 uses tabs instead of spaces to
                # separate columns. Preferentially try split with tabs first
                # instead of split() in case of spaces in the file path.
                try:
                    try:
                        path_qry, path_ref, ani, frac1, frac2 = line.strip().split('\t')
                    except ValueError:
                        path_qry, path_ref, ani, frac1, frac2 = line.strip().split(' ')
                        if not self._suppress_v1_warning:
                            self.logger.warning('You are using FastANI v1.0, it is '
                                                'recommended that you update to a '
                                                'more recent version.')
                            self._suppress_v1_warning = True
                    af = round(float(frac1) / float(frac2), 2)
                    if path_qry not in out:
                        out[path_qry] = {path_ref: (float(ani), af)}
                    elif path_ref not in out[path_qry]:
                        out[path_qry][path_ref] = (float(ani), af)
                except Exception as e:
                    self.logger.error(f'Exception reading FastANI output: {repr(e)}')
                    raise GTDBTkExit(f'Unable to read line "{line}"')
    return out

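# A minimal sketch (hypothetical values) of one FastANI (>= v1.1) output line:
# query path, reference path, ANI, and the two fragment counts from which
# AF = mapped fragments / total query fragments.
line = 'qry.fna\tref.fna\t97.55\t950\t1000\n'
path_qry, path_ref, ani, frac1, frac2 = line.strip().split('\t')
af = round(float(frac1) / float(frac2), 2)
assert (float(ani), af) == (97.55, 0.95)
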
def _run(self, ref_msh, qry_msh, max_d):
    args = ['mash', 'dist', '-p', self.cpus, '-d', max_d, ref_msh, qry_msh]
    args = list(map(str, args))
    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise GTDBTkExit(f'Error running Mash dist: {stderr}')

    out = defaultdict(dict)
    for ref_id, qry_id, dist, p_val, shared_n, shared_d in re.findall(
            r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout):
        dist, p_val = float(dist), float(p_val)
        shared_num, shared_den = int(shared_n), int(shared_d)
        out[qry_id][ref_id] = (dist, p_val, shared_num, shared_den)
    return out

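# A minimal sketch (hypothetical values) of one `mash dist` output line and
# the regex used above: reference, query, distance, p-value, and the shared
# hash fraction.
import re
stdout = 'ref.fna\tqry.fna\t0.0222766\t0.0\t456/1000\n'
match = re.findall(r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout)
ref_id, qry_id, dist, p_val, shared_n, shared_d = match[0]
assert (float(dist), int(shared_n), int(shared_d)) == (0.0222766, 456, 1000)
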
def _get_median_reds(self, ingroup_domain: str):
    """Get median RED values for domain of ingroup taxon."""
    # get median RED values for domain
    if ingroup_domain == 'd__Bacteria':
        median_reds = RED_DIST_BAC_DICT
    elif ingroup_domain == 'd__Archaea':
        median_reds = RED_DIST_ARC_DICT
    else:
        raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')

    # report median values
    domain = ingroup_domain.replace('d__', '')
    self.logger.info('Median RED values for {}:'.format(domain))
    for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
        if idx != Taxonomy.DOMAIN_IDX and idx != Taxonomy.SPECIES_IDX:
            self.logger.info(' {}\t{:.3f}'.format(
                Taxonomy.rank_labels[idx].capitalize(),
                median_reds[rank_prefix]))

    return median_reds