Ejemplo n.º 1
0
    def run(self, gene_files):
        """Run the TIGRFAM HMM search over a collection of gene files.

        The gene files are distributed over ``self.threads`` worker
        processes; one additional writer process consumes their results.

        Parameters
        ----------
        gene_files : iterable
            Gene files in FASTA format to process.

        Raises
        ------
        GTDBTkExit
            If no gene files were given, or if any worker process finished
            with a non-zero exit code.
        """
        n_files = len(gene_files)
        if n_files == 0:
            raise GTDBTkExit('There are no genomes to process.')
        self.cpus_per_genome = max(1, int(self.threads / n_files))

        # Job queue for the workers, result queue for the writer, and a
        # counter shared between processes for genomes that were skipped.
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()
        n_skipped = mp.Value('i', 0)

        for gene_file in gene_files:
            workerQueue.put(gene_file)

        # One sentinel per worker so that every worker leaves its loop.
        for _ in range(self.threads):
            workerQueue.put(None)

        try:
            worker_args = (workerQueue, writerQueue, n_skipped)
            workerProc = [
                mp.Process(target=self._workerThread, args=worker_args)
                for _ in range(self.threads)
            ]
            writeProc = mp.Process(target=self._writerThread,
                                   args=(n_files, writerQueue))

            writeProc.start()
            for worker in workerProc:
                worker.start()
            for worker in workerProc:
                worker.join()

            # All workers are done; tell the writer to stop as well.
            writerQueue.put(None)
            writeProc.join()

            if any(worker.exitcode != 0 for worker in workerProc):
                raise GTDBTkExit(
                    'An error was encountered while running hmmsearch.')

        except Exception:
            # Stop every child process before propagating the error.
            for worker in workerProc:
                worker.terminate()

            writeProc.terminate()
            raise

        skipped = n_skipped.value
        if skipped > 0:
            genome_s = 'genome' if skipped == 1 else 'genomes'
            self.logger.warning(
                f'TIGRFAM skipped {skipped:,} {genome_s} '
                f'due to pre-existing data, see warnings.log')
Ejemplo n.º 2
0
    def run(self, gene_files):
        """Annotate genes with TIGRFAM HMMs.

        The gene files are distributed over ``self.threads`` worker
        processes; one additional writer process consumes their results.

        Parameters
        ----------
        gene_files : iterable
            Gene files in FASTA format to process.

        Raises
        ------
        GTDBTkExit
            If no gene files were given, or if any worker process finished
            with a non-zero exit code.
        """
        if len(gene_files) == 0:
            raise GTDBTkExit('There are no genomes to process.')

        # The CPU count must be a whole number: true division would yield
        # a float (e.g. 1.5) whenever threads is not a multiple of the
        # number of gene files.
        self.cpus_per_genome = max(1, int(self.threads / len(gene_files)))

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in gene_files:
            workerQueue.put(f)

        # One sentinel per worker so that every worker leaves its loop.
        for _ in range(self.threads):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self._workerThread,
                           args=(workerQueue, writerQueue))
                for _ in range(self.threads)
            ]
            writeProc = mp.Process(target=self._writerThread,
                                   args=(len(gene_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            # All workers are done; tell the writer to stop as well.
            writerQueue.put(None)
            writeProc.join()

            for proc in workerProc:
                if proc.exitcode != 0:
                    raise GTDBTkExit(
                        'An error was encountered while running hmmsearch.')

        except Exception:
            # Stop every child process before propagating the error.
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
            raise
Ejemplo n.º 3
0
Archivo: misc.py Proyecto: 31380/GTDBTk
    def check_install(self):
        """Verify the hash of every reference path in ``Config.REF_HASHES``.

        The hash of each path is computed with ``sha1_dir`` and compared with
        the expected value; one status line is logged per path.

        Raises
        ------
        GTDBTkExit
            If at least one computed hash differs from the expected hash.
        """
        self.logger.info('Checking {}'.format(Config.GENERIC_PATH))

        row_fmt = "         |-- {:16} {}"
        mismatch_seen = False
        for obj_path, expected_hash in Config.REF_HASHES.items():
            # Display name: last path component, ignoring one trailing slash.
            trimmed = obj_path[:-1] if obj_path.endswith('/') else obj_path
            base_name = trimmed.rsplit('/', 1)[-1]

            if sha1_dir(obj_path, progress=True) == expected_hash:
                status = colour('OK', ['bright'], fg='green')
            else:
                status = colour('HASH MISMATCH', ['bright'], fg='yellow')
                mismatch_seen = True
            self.logger.info(row_fmt.format(base_name, status))

        if mismatch_seen:
            raise GTDBTkExit(
                'Unexpected files were seen, or the reference package is corrupt.'
            )
Ejemplo n.º 4
0
    def _generate(self):
        """Generate a new sketch file by running ``mash sketch``.

        The genome paths are written to a temporary list file that is passed
        to Mash. A progress line is rewritten on stdout each time Mash
        reports a genome on stderr.

        Raises
        ------
        GTDBTkExit
            If Mash exits with a non-zero code or the sketch file was not
            created. The message carries the non-progress stderr output.
        """
        n_genomes = len(self.genomes)
        with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
            path_genomes = os.path.join(dir_tmp, 'genomes.txt')
            with open(path_genomes, 'w') as fh:
                for path in self.genomes.values():
                    fh.write(f'{path}\n')

            args = [
                'mash', 'sketch', '-l', '-p', self.cpus, path_genomes, '-o',
                self.path, '-k', self.k, '-s', self.s
            ]
            args = list(map(str, args))

            n_processed = 0
            # stderr is consumed line by line for progress reporting, so
            # anything that is not a progress line is kept here; reading the
            # pipe again after EOF would always give an empty string.
            stderr_lines = []

            # stdout is never read, so discard it instead of piping it:
            # an undrained pipe could block the child once it fills up.
            with subprocess.Popen(args,
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.PIPE,
                                  encoding='utf-8') as proc:
                for line in iter(proc.stderr.readline, ''):
                    if line.startswith('Sketching'):
                        n_processed += 1
                        pct = round(100 * (n_processed / n_genomes), 2)
                        sys.stdout.write(f'\r==> Sketching {n_processed} of '
                                         f'{n_genomes} ({pct}%) genomes')
                        sys.stdout.flush()
                    else:
                        stderr_lines.append(line)
                sys.stdout.write('\n')
                sys.stdout.flush()

            # Leaving the Popen context waits for the process to finish.
            if proc.returncode != 0 or not os.path.isfile(self.path):
                stderr_text = ''.join(stderr_lines)
                raise GTDBTkExit(
                    f'Error generating Mash sketch: {stderr_text}')
Ejemplo n.º 5
0
    def _find_ingroup_red(self, ingroup_node, ingroup_domain, tree):
        """Find RED of the ingroup taxon.

        Each line of the RED file holds ``label_ids<TAB>red``, where
        ``label_ids`` is a ``|``-separated list of leaf labels. For lines
        naming exactly two leaves, the MRCA of those leaves in ``tree`` is
        compared with ``ingroup_node``.

        Parameters
        ----------
        ingroup_node
            The tree node whose RED value is wanted.
        ingroup_domain : str
            ``'d__Archaea'`` selects the AR53 RED file; any other value
            selects the BAC120 RED file.
        tree
            Tree providing ``leaf_node_iter()`` and ``mrca(taxa=...)``.

        Returns
        -------
        float
            The RED value of the first line whose MRCA is ``ingroup_node``.

        Raises
        ------
        GTDBTkExit
            If no line of the RED file resolves to ``ingroup_node``.
        """
        if ingroup_domain == 'd__Archaea':
            red_file = MRCA_RED_AR53
        else:
            red_file = MRCA_RED_BAC120

        # create map from leaf labels to tree nodes
        leaf_node_map = {
            leaf.taxon.label: leaf
            for leaf in tree.leaf_node_iter()
        }

        # find RED value of ingroup node
        with open(red_file) as rf:
            for line in rf:
                label_ids, red = line.strip().split('\t')
                labels = label_ids.split('|')
                if len(labels) != 2:
                    continue

                taxa = [leaf_node_map[label].taxon for label in labels]
                if tree.mrca(taxa=taxa) == ingroup_node:
                    return float(red)

        raise GTDBTkExit(f'Could not determine RED of ingroup taxon {ingroup_node}.')
Ejemplo n.º 6
0
 def read(self):
     """Load the marker names of every genome from ``self.path``.

     The first line of the file is skipped. Every following line must hold
     nine tab-separated fields: the genome id, four marker counts and four
     comma-separated marker name lists (unq, mul, muq, mis). Only the names
     are stored (mapped to ``None``); no sequence information is read.

     Raises
     ------
     GTDBTkExit
         If a count does not match the number of names in its list.
     """
     categories = ('unq', 'mul', 'muq', 'mis')
     with open(self.path) as fh:
         fh.readline()
         for line in fh.readlines():
             (genome_id, cnt_unq, cnt_mul, cnt_muq, cnt_mis,
              raw_unq, raw_mul, raw_muq, raw_mis) = line.split('\t')
             expected = (int(cnt_unq), int(cnt_mul), int(cnt_muq),
                         int(cnt_mis))
             raw_lists = (raw_unq, raw_mul, raw_muq, raw_mis)

             # Empty tokens (from an empty list) are not marker names.
             cur_dict = {
                 key: {
                     name: None
                     for name in raw.strip().split(',') if len(name) > 0
                 }
                 for key, raw in zip(categories, raw_lists)
             }
             self.genomes[genome_id] = cur_dict

             counts_match = all(
                 len(cur_dict[key]) == n
                 for key, n in zip(categories, expected))
             if not counts_match:
                 raise GTDBTkExit(
                     f'The marker file is inconsistent: {self.path}')
Ejemplo n.º 7
0
def run_hmm_align_worker(job):
    """Align the called genes of one marker with hmmalign.

    Parameters
    ----------
    job : Tuple[str, str, str, frozenset]
        The marker id, hmm marker path, called genes path, expected gids.

    Returns
    -------
    List[Tuple[str, str, str]]
        A list containing the (genome id, marker id, sequence).

    Raises
    ------
    GTDBTkExit
        If hmmalign exits with a non-zero code.
    """
    marker_id, marker_path, marker_fa, expected_gids = job

    # Run hmmalign to completion, capturing both output streams as text.
    cmd = ["hmmalign", "--outformat", "Pfam", marker_path, marker_fa]
    completed = subprocess.run(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               encoding='utf-8')

    if completed.returncode != 0:
        cmd_str = ' '.join(cmd)
        raise GTDBTkExit(f'hmmalign returned a non-zero exit code: {cmd_str}')

    # Parse the alignment and tag every sequence with the marker id.
    aligned = read_hmmalign_output(completed.stdout, expected_gids)
    return [(gid, marker_id, seq) for gid, seq in aligned.items()]
Ejemplo n.º 8
0
    def __init__(self, genomes, root, prefix, cpus, k, s, mash_db=None):
        """Create a query file for a given set of genomes.

        When ``mash_db`` is given it determines the sketch path (with a
        ``.msh`` extension appended if missing); otherwise the path is
        ``<root>/<prefix>.<self.name>``.

        Parameters
        ----------
        genomes : dict[str, str]
            The genomes to create a sketch file from (genome_id, fasta_path).
        root : str
            The directory where the sketch file will be saved.
        prefix : str
            The prefix to use for this file.
        cpus : int
            The maximum number of CPUs available for Mash.
        k : int
            The k-mer size.
        s : int
            Maximum number of non-redundant hashes.
        mash_db : Optional[str]
            The path to read/write the pre-computed Mash reference sketch database.

        Raises
        ------
        GTDBTkExit
            If the resolved ``mash_db`` path is an existing directory.
        """
        if mash_db is None:
            path = os.path.join(root, f'{prefix}.{self.name}')
        else:
            # Drop trailing backslashes, then enforce the .msh extension.
            path = mash_db.rstrip('\\')
            if not path.endswith(".msh"):
                path += ".msh"
            if os.path.isdir(path):
                raise GTDBTkExit(f"{path} is a directory")
            make_sure_path_exists(os.path.dirname(path))

        super().__init__(genomes, path, cpus, k, s)
Ejemplo n.º 9
0
    def _generate(self):
        """Generate a new sketch file by running ``mash sketch``.

        The genome paths are written to a temporary list file that is passed
        to Mash. The progress bar advances each time Mash reports a genome
        on stderr.

        Raises
        ------
        GTDBTkExit
            If Mash exits with a non-zero code or the sketch file was not
            created. The message carries the non-progress stderr output.
        """
        with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
            path_genomes = os.path.join(dir_tmp, 'genomes.txt')
            with open(path_genomes, 'w') as fh:
                for path in self.genomes.values():
                    fh.write(f'{path}\n')

            args = [
                'mash', 'sketch', '-l', '-p', self.cpus, path_genomes, '-o',
                self.path, '-k', self.k, '-s', self.s
            ]
            args = list(map(str, args))

            # stderr is consumed line by line for progress reporting, so
            # anything that is not a progress line is kept here; reading the
            # pipe again after EOF would always give an empty string.
            stderr_lines = []

            # stdout is never read, so discard it instead of piping it:
            # an undrained pipe could block the child once it fills up.
            with subprocess.Popen(args,
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.PIPE,
                                  encoding='utf-8') as proc:
                with tqdm_log(total=len(self.genomes), unit='genome') as p_bar:
                    for line in iter(proc.stderr.readline, ''):
                        if line.startswith('Sketching'):
                            p_bar.update()
                        else:
                            stderr_lines.append(line)

            # Leaving the Popen context waits for the process to finish.
            if proc.returncode != 0 or not os.path.isfile(self.path):
                stderr_text = ''.join(stderr_lines)
                raise GTDBTkExit(
                    f'Error generating Mash sketch: {stderr_text}')
Ejemplo n.º 10
0
def read_hmmalign_output(output, expected_gids):
    """Extract the masked alignment from hmmalign output text.

    A line is a sequence line when its first space-delimited token matches
    the first token of an expected gid; its sequence is the text after the
    last space. The ``#=GC RF`` line supplies the mask: only columns marked
    ``x`` are kept.

    Parameters
    ----------
    output : str
        The string output from hmmalign.
    expected_gids : frozenset
        A set containing all of the gids expected in the output.

    Returns
    -------
    Dict[str, str]
        dict[genome id] = sequence

    Raises
    ------
    GTDBTkExit
        If no mask line is present, or if the number of sequences found
        differs from the number of expected gids.
    """
    # Gids may contain spaces; only their first token leads a sequence line.
    gid_by_token = {gid.split(' ', 1)[0]: gid for gid in expected_gids}

    raw_seqs = {}
    mask = None
    for line in output.splitlines():
        first_token = line.split(' ', 1)[0]
        if first_token in gid_by_token:
            raw_seqs[gid_by_token[first_token]] = line.rsplit(" ", 1)[-1]
        elif line.startswith("#=GC RF"):
            mask = [ch == 'x' for ch in line.rsplit(' ', 1)[-1]]

    # Sanity checks.
    if mask is None:
        raise GTDBTkExit(
            f'Unable to get mask from hmmalign result file: {output}')
    if len(raw_seqs) != len(expected_gids):
        raise GTDBTkExit(f'Not all genomes could be aligned: {output}')

    # Keep only the columns selected by the mask.
    return {
        gid: ''.join(res for res, keep in zip(seq, mask) if keep)
        for gid, seq in raw_seqs.items()
    }
Ejemplo n.º 11
0
    def read(path):
        """Parse a batch file listing the genomes to process.

        Every non-blank line holds two or three tab-separated columns: the
        genome file, the genome id and, optionally, the translation table
        (``4`` or ``11``).

        Parameters
        ----------
        path : str
            The path to the batch file.

        Returns
        -------
        Tuple[dict, dict]
            ``genomes`` (genome id -> genome file) and ``tln_tables``
            (genome id -> translation table, only for lines that set one).

        Raises
        ------
        GTDBTkExit
            If a line has the wrong number of columns, an unsupported
            translation table, or if any format problem was collected.
        """
        logger = logging.getLogger('timestamp')
        genomes = {}
        tln_tables = {}
        seen_paths = set()
        problems = []

        with open(path) as fh:
            for line_no, line in enumerate(fh, start=1):
                cols = line.strip().split("\t")
                if cols[0] == '':
                    continue  # blank line

                n_cols = len(cols)
                if n_cols not in {2, 3}:
                    raise GTDBTkExit('Batch file must contain either 2 '
                                     'columns (detect translation table), '
                                     'or 3 (specify translation table).')

                if n_cols == 3:
                    genome_file, genome_id, tln_table = cols
                    if tln_table not in {'4', '11'}:
                        raise GTDBTkExit('Specified translation table must '
                                         'be either 4, or 11.')
                    tln_tables[genome_id] = int(tln_table)
                else:
                    genome_file, genome_id = cols

                # At most one format problem is collected per line.
                if genome_file is None or genome_file == '':
                    problems.append(f'Missing genome path on line {line_no}.')
                elif genome_id is None or genome_id == '':
                    problems.append(f'Missing genome ID on line {line_no}.')
                elif genome_id in genomes:
                    problems.append(f'Genome ID {genome_id} appears multiple times.')

                # A repeated file is only logged, it is not an error.
                if genome_file in seen_paths:
                    logger.warning(f'Genome file appears multiple times: {genome_file}')

                genomes[genome_id] = genome_file
                seen_paths.add(genome_file)

        # Report every collected problem at once.
        if len(problems) > 0:
            warning_str = '\n'.join(problems)
            raise GTDBTkExit(f'Please check the format of your batchfile, '
                             f'the following errors were found: {warning_str}')

        return genomes, tln_tables
Ejemplo n.º 12
0
    def _workerThread(self, queueIn, queueOut, n_skipped):
        """Consume gene files from a queue and run hmmsearch on each.

        Parameters
        ----------
        queueIn
            Queue of gene file paths; a ``None`` item ends the loop.
        queueOut
            Queue that receives every gene file once it has been handled.
        n_skipped
            Shared counter (with ``get_lock()`` and ``value``) incremented
            for each genome skipped because its outputs already exist.

        Raises
        ------
        GTDBTkExit
            If hmmsearch exits with a non-zero code.
        """
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            filename = os.path.basename(gene_file)
            genome_id = filename.replace(self.protein_file_suffix, '')
            genome_dir = os.path.join(self.output_dir, genome_id)

            hit_name = filename.replace(self.protein_file_suffix,
                                        self.tigrfam_suffix)
            out_name = filename.replace(self.protein_file_suffix,
                                        '_tigrfam.out')
            output_hit_file = os.path.join(genome_dir, hit_name)
            hmmsearch_out = os.path.join(genome_dir, out_name)

            # The genome is complete when all three outputs have a checksum.
            out_files = (output_hit_file, hmmsearch_out,
                         TopHitTigrFile.get_path(self.output_dir, genome_id))
            already_done = all([file_has_checksum(x) for x in out_files])

            if not already_done:
                cmd = [
                    'hmmsearch', '-o', hmmsearch_out, '--tblout',
                    output_hit_file, '--noali', '--notextw', '--cut_nc',
                    '--cpu',
                    str(self.cpus_per_genome), self.tigrfam_hmms, gene_file
                ]
                p = subprocess.Popen(cmd,
                                     stdout=subprocess.PIPE,
                                     encoding='utf-8')
                stdout, _ = p.communicate()

                if p.returncode != 0:
                    raise GTDBTkExit(
                        f'Non-zero exit code returned when running hmsearch: {stdout}'
                    )

                # Store a checksum file next to each hmmsearch output.
                for out_file in (output_hit_file, hmmsearch_out):
                    checksum = sha256(out_file)
                    with open(out_file + self.checksum_suffix, 'w') as fh:
                        fh.write(checksum)

                # identify top hit for each gene
                self._topHit(output_hit_file)
            else:
                self.warnings.info(
                    f'Skipped TIGRFAM processing for: {genome_id}')
                with n_skipped.get_lock():
                    n_skipped.value += 1

            # Signal that this gene file has been handled.
            queueOut.put(gene_file)
Ejemplo n.º 13
0
    def _find_ingroup_taxon(self, ingroup_taxon, tree):
        """Return the single tree node whose label names the ingroup taxon.

        Each node label is parsed with ``parse_label``; its taxon part is a
        ``;``-separated list of taxa that is searched for ``ingroup_taxon``.

        Raises
        ------
        GTDBTkExit
            If the taxon labels more than one node, or no node at all.
        """
        found = None
        for node in tree.postorder_node_iter():
            _, taxon, _ = parse_label(node.label)
            if not taxon:
                continue

            if ingroup_taxon not in [t.strip() for t in taxon.split(';')]:
                continue

            if found is not None:
                raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} '
                                 f'identified multiple times.')
            found = node

        if found is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} not found in tree.')

        return found
Ejemplo n.º 14
0
 def read(self):
     """Load the translation table of every genome from ``self.path``.

     Each line holds ``genome id<TAB>translation table``; the table is
     stored as an int in ``self.genomes``.

     Raises
     ------
     GTDBTkExit
         If ``self.genomes`` already holds values.
     """
     if len(self.genomes) > 0:
         raise GTDBTkExit(
             f'Warning! Attempting to override in-memory values '
             f'for translation table summary file: {self.path}')
     with open(self.path, 'r') as fh:
         for record in fh:
             gid, tbl = record.strip().split('\t')
             self.genomes[gid] = int(tbl)
Ejemplo n.º 15
0
    def check_install(self):
        """Check third-party programs and the reference package integrity.

        Each expected program is probed with ``check_dependencies`` and its
        status is logged; a missing program is only reported. The hash of
        every path in ``Config.REF_HASHES`` is then computed with
        ``sha1_dir`` and compared with its expected value.

        Raises
        ------
        GTDBTkExit
            If at least one computed hash differs from the expected hash.
        """
        row_fmt = "         |-- {:16} {}"

        # Check that all programs are on the system path.
        self.logger.info(
            'Checking that all third-party software are on the system path:')
        names = {
            'prodigal', 'hmmsearch', 'fastANI', 'mash', 'pplacer', 'guppy',
            'FastTree', 'FastTreeMP', 'hmmalign'
        }
        for name in sorted(names):
            try:
                on_path = check_dependencies([name], exit_on_fail=False)
            except Exception:
                # Best-effort probe: any failure counts as "not found".
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                on_path = False

            if on_path:
                status = colour('OK', ['bright'], fg='green')
            else:
                status = colour('NOT FOUND', ['bright'], fg='yellow')
            self.logger.info(row_fmt.format(name, status))

        # Assume this was successful unless otherwise observed.
        ok = True

        # Compute the hash for each directory
        self.logger.info(
            f'Checking integrity of reference package: {Config.GENERIC_PATH}')
        for obj_path, expected_hash in Config.REF_HASHES.items():
            # Display name: last path component, ignoring one trailing slash.
            base_name = obj_path[:-1] if obj_path.endswith('/') else obj_path
            base_name = base_name.split('/')[-1]
            user_hash = sha1_dir(obj_path, progress=True)

            if user_hash != expected_hash:
                status = colour(f'HASH MISMATCH {user_hash}', ['bright'],
                                fg='yellow')
                ok = False
            else:
                status = colour('OK', ['bright'], fg='green')
            self.logger.info(row_fmt.format(base_name, status))

        if not ok:
            raise GTDBTkExit(
                'Unexpected files were seen, or the reference package is corrupt.'
            )
Ejemplo n.º 16
0
 def _calculate(self):
     """Run ``mash dist`` and write its standard output to ``self.path``.

     The reference sketch is compared with the query sketch using the
     configured CPU count, maximum distance and maximum p-value.

     Raises
     ------
     GTDBTkExit
         If Mash exits with a non-zero code; the message carries the
         captured stderr output.
     """
     self.logger.info('Calculating Mash distances.')
     args = ['mash', 'dist', '-p', self.cpus, '-d', self.max_d, '-v',
             self.mash_v, self.ref_sketch.path, self.qry_sketch.path]
     args = list(map(str, args))
     with open(self.path, 'w') as f_out:
         proc = subprocess.Popen(args, stdout=f_out,
                                 stderr=subprocess.PIPE, encoding='utf-8')
         _, stderr = proc.communicate()
     if proc.returncode != 0:
         # communicate() has already drained and closed the stderr pipe,
         # so the captured text must be used; reading the pipe again fails.
         raise GTDBTkExit(f'Error running Mash dist: {stderr}')
Ejemplo n.º 17
0
    def _load_metadata(self):
        """Load the per-genome metadata of an existing Mash sketch file.

        Runs ``mash info -t`` on ``self.path`` and stores, for every output
        row made of two integer columns, a path column and a final column,
        ``self.data[path] = (hashes, length)``.

        Raises
        ------
        GTDBTkExit
            If Mash exits with a non-zero code.
        """
        proc = subprocess.Popen(['mash', 'info', '-t', self.path],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()

        if proc.returncode != 0:
            raise GTDBTkExit(f'Error reading Mash sketch file {self.path}:\n{stderr}')

        # Rows that do not start with two integer columns do not match.
        row_pattern = re.compile(r'(\d+)\t(\d+)\t(.+)\t.+\n')
        for match in row_pattern.finditer(stdout):
            hashes, length, path = match.groups()
            self.data[path] = (int(hashes), int(length))
Ejemplo n.º 18
0
    def read(self):
        """Load the classify summary file at ``self.path``.

        The header must equal the column order given by
        ``self.get_col_order()``. Each following line becomes one
        ``ClassifySummaryFileRow`` whose attributes are filled, as strings,
        from the first twenty columns, and is passed to ``self.add_row``.

        Raises
        ------
        GTDBTkExit
            If the file does not exist or its header is not the expected one.
        """
        if not os.path.isfile(self.path):
            raise GTDBTkExit(
                f'Error, classify summary file not found: {self.path}')

        # Row attributes, listed in the order of the file columns.
        row_attrs = (
            'gid', 'classification', 'fastani_ref', 'fastani_ref_radius',
            'fastani_tax', 'fastani_ani', 'fastani_af',
            'closest_placement_ref', 'closest_placement_radius',
            'closest_placement_tax', 'closest_placement_ani',
            'closest_placement_af', 'pplacer_tax', 'classification_method',
            'note', 'other_related_refs', 'msa_percent', 'tln_table',
            'red_value', 'warnings',
        )

        with open(self.path) as fh:

            # The header must match the expected column order.
            cols_exp, _ = self.get_col_order()
            cols_cur = fh.readline().strip().split('\t')
            if cols_exp != cols_cur:
                raise GTDBTkExit(
                    f'The classify summary file columns are inconsistent: {cols_cur}'
                )

            # One row object per data line.
            for line in fh:
                data = line.strip().split('\t')
                row = ClassifySummaryFileRow()
                for col_idx, attr in enumerate(row_attrs):
                    setattr(row, attr, data[col_idx])
                self.add_row(row)
Ejemplo n.º 19
0
    def run(self, genomic_files):
        """Run Prodigal across a set of genomes.

        The genomes are distributed over ``self.threads`` worker processes;
        one additional writer process consumes their results. The workers
        store their output in a dictionary shared through a manager.

        Parameters
        ----------
        genomic_files : dict
            Dictionary indicating the genomic and gene file for each genome.

        Returns
        -------
        dict
            A plain-dict copy of the dictionary filled by the workers.

        Raises
        ------
        GTDBTkExit
            If any worker process finished with a non-zero exit code.
        """

        # populate worker queue with data to process
        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        # dict.iteritems() only exists in Python 2; items() is used instead.
        for genome_id, file_path in genomic_files.items():
            worker_queue.put([genome_id, file_path])

        # One sentinel per worker so that every worker leaves its loop.
        for _ in range(self.threads):
            worker_queue.put(None)

        try:
            manager = mp.Manager()
            out_dict = manager.dict()

            worker_proc = [
                mp.Process(target=self._worker,
                           args=(out_dict, worker_queue, writer_queue))
                for _ in range(self.threads)
            ]
            writer_proc = mp.Process(target=self._writer,
                                     args=(len(genomic_files), writer_queue))

            writer_proc.start()
            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

                # Gracefully terminate the program.
                if p.exitcode != 0:
                    raise GTDBTkExit('Prodigal returned a non-zero exit code.')

            # All workers are done; tell the writer to stop as well.
            writer_queue.put(None)
            writer_proc.join()
        except Exception:
            # Stop every child process before propagating the error.
            for p in worker_proc:
                p.terminate()
            writer_proc.terminate()
            raise

        # Copy the managed dictionary into a regular one.
        return dict(out_dict.items())
Ejemplo n.º 20
0
 def read(self):
     """Load the translation table information stored at ``self.path``.

     Each line holds ``key<TAB>value``. The keys
     ``best_translation_table``, ``coding_density_4`` and
     ``coding_density_11`` set ``self.best_tln_table`` (int),
     ``self.coding_density_4`` (float) and ``self.coding_density_11``
     (float) respectively; any other key is ignored.

     Raises
     ------
     GTDBTkExit
         If a recognised value cannot be converted to its numeric type.
     """
     # key -> (attribute to set, converter, start of the error message)
     handlers = {
         'best_translation_table':
         ('best_tln_table', int, 'Invalid translation table'),
         'coding_density_4':
         ('coding_density_4', float, 'Invalid coding density'),
         'coding_density_11':
         ('coding_density_11', float, 'Invalid coding density'),
     }
     with open(self.path, 'r') as fh:
         for record in fh:
             idx, val = record.strip().split('\t')
             if idx not in handlers:
                 continue

             attr, convert, label = handlers[idx]
             try:
                 setattr(self, attr, convert(val))
             except ValueError:
                 raise GTDBTkExit(f'{label}: {val} for {self.path}')
Ejemplo n.º 21
0
    def read(self):
        """Load the classify tree mappings file at ``self.path``.

        The header must equal the column order given by
        ``self.get_col_order()``. Each following line becomes one
        ``GenomeMappingFileRow`` filled from its first three columns and is
        passed to ``self.add_row``.

        Raises
        ------
        GTDBTkExit
            If the file does not exist or its header is not the expected one.
        """
        if not os.path.isfile(self.path):
            raise GTDBTkExit(
                f'Error, classify tree mappings file not found: {self.path}')

        # Row attributes, listed in the order of the file columns.
        row_attrs = ('gid', 'ani_classification', 'mapped_tree')

        with open(self.path) as fh:

            # The header must match the expected column order.
            cols_exp, _ = self.get_col_order()
            cols_cur = fh.readline().strip().split('\t')
            if cols_exp != cols_cur:
                raise GTDBTkExit(
                    f'The classify tree mappings columns are inconsistent: {cols_cur}'
                )

            # One row object per data line.
            for line in fh:
                data = line.strip().split('\t')
                row = GenomeMappingFileRow()
                for col_idx, attr in enumerate(row_attrs):
                    setattr(row, attr, data[col_idx])
                self.add_row(row)
Ejemplo n.º 22
0
    def add_genome(self, genome_id: str, path_faa: str, pfam_th: TopHitPfamFile, tigr_th: TopHitTigrFile):
        """Classify every expected marker of a genome by its hit count.

        Each marker in ``self.marker_names`` is stored under exactly one key
        of ``self.genomes[genome_id]``:

        - ``mis``: the marker has no hit.
        - ``unq``: the marker has a single hit.
        - ``muq``: several hits whose gene sequences are all identical; the
          first hit of the reverse-sorted hits is kept.
        - ``mul``: several hits with differing gene sequences.

        A trailing ``*`` is removed from gene sequences before comparison.
        An existing entry for ``genome_id`` is replaced after a warning.

        Raises
        ------
        GTDBTkExit
            If the four categories do not add up to the number of markers.
        """
        if genome_id in self.genomes:
            self.logger.warning(f'Genome already exists in copy number file: {genome_id}')

        record = {'unq': dict(), 'mul': dict(), 'muq': dict(), 'mis': dict()}
        self.genomes[genome_id] = record
        cur_unq, cur_mul = record['unq'], record['mul']
        cur_muq, cur_mis = record['muq'], record['mis']

        # Load the genes from the faa file and strip a trailing '*'.
        d_genes = read_fasta(path_faa, False)
        starred = [seq_id for seq_id, seq in d_genes.items() if seq.endswith('*')]
        for seq_id in starred:
            d_genes[seq_id] = d_genes[seq_id][:-1]

        # Marker name -> hits, merged over both top hit files.
        d_hmm_hits = self._merge_hit_files(pfam_th, tigr_th)

        for marker_id in self.marker_names:
            if marker_id not in d_hmm_hits:
                cur_mis[marker_id] = None
                continue

            hits = d_hmm_hits[marker_id]
            if len(hits) <= 1:
                only_hit = hits[0]
                cur_unq[marker_id] = {'hit': only_hit, 'seq': d_genes[only_hit.gene_id]}
                continue

            # Several hits: they still count as one if the sequences agree.
            distinct_seqs = {d_genes[hit.gene_id] for hit in hits}
            if len(distinct_seqs) != 1:
                cur_mul[marker_id] = None
            else:
                best_hit = sorted(hits, reverse=True)[0]
                cur_muq[marker_id] = {'hit': best_hit, 'seq': d_genes[best_hit.gene_id]}

        # Sanity check - every marker must be in exactly one category.
        n_assigned = len(cur_unq) + len(cur_mul) + len(cur_muq) + len(cur_mis)
        if len(self.marker_names) != n_assigned:
            raise GTDBTkExit('The marker set is inconsistent, please report this issue.')
Ejemplo n.º 23
0
    def _parse_result_queue(self, q_results, path_to_gid):
        """Creates the output dictionary given the results from FastANI

        Parameters
        ----------
        q_results : mp.Queue
            A multiprocessing queue containing raw results.
        path_to_gid : dict[str ,str]
            A dictionary containing the file path to genome id.

        Returns
        -------
        dict[str, dict[str, dict[str, float]]]
            The ANI/AF of the query genome to all reference genomes.
        """
        out = dict()
        while True:
            q_item = q_results.get(block=True)
            if q_item is None:
                break

            job, result = q_item
            qry_gid = job['qry']

            for path_a, dict_b in result.items():
                for path_b, (ani, af) in dict_b.items():
                    gid_a, gid_b = path_to_gid[path_a], path_to_gid[path_b]

                    # This was done in the forward direction.
                    if gid_a == qry_gid:
                        ref_gid = gid_b
                    # This was done in the reverse direction.
                    elif gid_b == qry_gid:
                        ref_gid = gid_a
                    else:
                        raise GTDBTkExit('FastANI results are malformed.')

                    # Take the largest ANI / AF from either pass.
                    if qry_gid not in out:
                        out[qry_gid] = {ref_gid: {'ani': ani, 'af': af}}
                    elif ref_gid not in out[qry_gid]:
                        out[qry_gid][ref_gid] = {'ani': ani, 'af': af}
                    else:
                        out[qry_gid][ref_gid]['ani'] = max(
                            out[qry_gid][ref_gid]['ani'], ani)
                        out[qry_gid][ref_gid]['af'] = max(
                            out[qry_gid][ref_gid]['af'], af)

        return out
Ejemplo n.º 24
0
def export_msa(domain: Domain, output_file: str):
    """Copy the GTDB MSA for the given domain to ``output_file``.

    :param domain: Selects which concatenated MSA is exported; must be
        ``Domain.ARCHAEA`` or ``Domain.BACTERIA``.
    :param output_file: Destination path. Its parent directory is handed to
        ``make_sure_path_exists`` before the copy takes place.
    :raises GTDBTkExit: If ``domain`` is neither of the supported domains.
    """
    if domain is Domain.BACTERIA:
        msa_path = CONCAT_BAC120
    elif domain is Domain.ARCHAEA:
        msa_path = CONCAT_AR53
    else:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')

    target_dir = os.path.dirname(output_file)
    make_sure_path_exists(target_dir)
    copyfile(msa_path, output_file)
Ejemplo n.º 25
0
    def _get_ingroup_domain(self, ingroup_taxon) -> str:
        """Return the domain of the lineage(s) containing the ingroup taxon.

        The GTDB taxonomy is read from ``TAXONOMY_FILE`` and every lineage is
        inspected; if several lineages contain the taxon, the domain of the
        last one visited is returned.

        Raises
        ------
        GTDBTkExit
            If no lineage in the taxonomy contains ``ingroup_taxon``.
        """

        taxonomy = Taxonomy().read(TAXONOMY_FILE)

        domain = None
        for lineage in taxonomy.values():
            if ingroup_taxon not in lineage:
                continue
            # Deliberately no early exit: the last matching lineage wins.
            domain = lineage[Taxonomy.DOMAIN_IDX]

        if domain is not None:
            return domain

        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')
Ejemplo n.º 26
0
    def run_proc(self, q, r, ql, rl, output):
        """Invoke FastANI and parse the file it writes.

        Any of ``q``, ``r``, ``ql`` and ``rl`` may be ``None``, in which case
        the corresponding command-line option is omitted.

        Parameters
        ----------
        q : str
            The path to the query genome (``-q``).
        r : str
            The path to the reference genome (``-r``).
        ql : str
            The path to the query list file (``--ql``).
        rl : str
            The path to the reference list file (``--rl``).
        output : str
            The path FastANI writes its results to (``-o``).

        Returns
        -------
        dict
            Whatever ``self.parse_output_file(output)`` returns for the
            generated output file.

        Raises
        ------
        GTDBTkExit
            If FastANI exits with a non-zero return code; its stdout and
            stderr are logged first.
        """
        cmd = ['fastANI']
        if self.minFrac:
            cmd += ['--minFraction', '0']

        # Optional inputs, in the order they appear on the command line.
        optional = (('-q', q), ('-r', r), ('--ql', ql), ('--rl', rl))
        for flag, value in optional:
            if value is not None:
                cmd += [flag, value]

        cmd += ['-o', output]
        self.logger.debug(' '.join(cmd))

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()

        if proc.returncode == 0:
            return self.parse_output_file(output)

        self.logger.error('STDOUT:\n' + stdout)
        self.logger.error('STDERR:\n' + stderr)
        raise GTDBTkExit('FastANI returned a non-zero exit code.')
Ejemplo n.º 27
0
    def read(self,
             taxonomy_file: str,
             canonical_ids: bool = False) -> Dict[str, List[str]]:
        """Read Greengenes-style taxonomy file.

        Expected format is:
            <id>\t<taxonomy string>

        where the taxonomy string has the formats:
            d__; p__; c__; o__; f__; g__; s__

        Parameters
        ----------
        taxonomy_file : str
            Path to a Greengenes-style taxonomy file.
        canonical_ids : bool
            True if to use the canonical ID format, False otherwise.

        Returns
        -------
        Dict[str, List[str]]
            Maps each identifier to its list of taxa, split on ';' with
            surrounding whitespace removed from every taxon.

        Raises
        ------
        GTDBTkExit
            If a line does not consist of exactly two tab-separated fields.
        OSError
            If the file cannot be opened; this propagates unchanged.
        """

        d = {}

        # The file is opened outside the ``try`` so that a missing or
        # unreadable file is reported as the I/O error it is, rather than
        # being masked by a failure inside the error handler.
        with open(taxonomy_file, 'r') as f:
            # Number of the last line handed to the parser (1-based).
            line_no = 0
            try:
                for line_no, line in enumerate(f, start=1):
                    line_split = line.split('\t')

                    if len(line_split) != 2:
                        raise GTDBTkExit(f'Not a tab-separated line: {line}')

                    unique_id = line_split[0]
                    if canonical_ids:
                        unique_id = canonical_gid(unique_id)

                    tax_str = line_split[1].rstrip()
                    if tax_str[-1] == ';':
                        # remove trailing semicolons which sometimes
                        # appear in Greengenes-style taxonomy files
                        tax_str = tax_str[0:-1]

                    d[unique_id] = [x.strip() for x in tax_str.split(';')]
            except Exception:
                # ``line_no`` is still 0 if the very first read failed;
                # report that as line 1.
                self.logger.error('Failed to parse taxonomy file on line %d' %
                                  max(line_no, 1))
                raise

        return d
Ejemplo n.º 28
0
    def parse_output_file(self, path_out):
        """Read a FastANI output file into a nested dictionary.

        Each line is expected to hold five columns: query path, reference
        path, ANI and two counts whose ratio (rounded to two decimals) is
        stored as the alignment fraction. Only the first record seen for a
        query/reference pair is kept.

        Parameters
        ----------
        path_out : str
            The path where the output file resides.

        Returns
        -------
        dict[str, dict[str, tuple[float, float]]]
            ``{query_path: {reference_path: (ani, af)}}``; empty when
            ``path_out`` is not an existing file.

        Raises
        ------
        GTDBTkExit
            If a line cannot be parsed.
        """
        hits = {}
        if not os.path.isfile(path_out):
            return hits

        with open(path_out, 'r') as fh:
            for line in fh:
                # FastANI version >=1.1 uses tabs instead of spaces to
                # separate columns. Tabs are tried first so that file paths
                # containing spaces are not broken apart.
                try:
                    record = line.strip()
                    try:
                        path_qry, path_ref, ani, frac1, frac2 = record.split(
                            '\t')
                    except ValueError:
                        path_qry, path_ref, ani, frac1, frac2 = record.split(
                            ' ')
                        # Space-separated output: warn once per instance.
                        if not self._suppress_v1_warning:
                            self.logger.warning(
                                'You are using FastANI v1.0, it is recommended '
                                'that you update to a more recent version.'
                            )
                            self._suppress_v1_warning = True

                    af = round(float(frac1) / float(frac2), 2)
                    per_query = hits.setdefault(path_qry, {})
                    if path_ref not in per_query:
                        per_query[path_ref] = (float(ani), af)
                except Exception as e:
                    self.logger.error(
                        f'Exception reading FastANI output: {repr(e)}')
                    raise GTDBTkExit(f'Unable to read line "{line}"')

        return hits
Ejemplo n.º 29
0
    def _run(self, ref_msh, qry_msh, max_d):
        """Run ``mash dist`` and parse its tab-separated output.

        Parameters
        ----------
        ref_msh
            First positional argument given to ``mash dist`` (presumably the
            reference sketch path; confirm against the caller).
        qry_msh
            Second positional argument given to ``mash dist`` (presumably the
            query sketch path; confirm against the caller).
        max_d
            Value passed to the ``-d`` option of ``mash dist``.

        Returns
        -------
        defaultdict
            ``out[qry_id][ref_id] = (dist, p_val, shared_num, shared_den)``
            where the ids are the second and first output columns, ``dist``
            and ``p_val`` are floats and the last two values are the
            integers either side of the ``/`` in the final column.

        Raises
        ------
        GTDBTkExit
            If Mash exits with a non-zero return code.
        """
        args = ['mash', 'dist', '-p', self.cpus, '-d', max_d, ref_msh, qry_msh]
        args = list(map(str, args))
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            # communicate() has already read and closed the pipes, so the
            # captured text must be used; reading proc.stderr here would
            # raise ValueError instead of reporting the Mash error.
            raise GTDBTkExit(f'Error running Mash dist: {stderr}')

        out = defaultdict(dict)
        # Only newline-terminated rows of the expected shape are matched.
        for ref_id, qry_id, dist, p_val, shared_n, shared_d in re.findall(
                r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout):
            dist, p_val = float(dist), float(p_val)
            shared_num, shared_den = int(shared_n), int(shared_d)
            out[qry_id][ref_id] = (dist, p_val, shared_num, shared_den)

        return out
Ejemplo n.º 30
0
    def _get_median_reds(self, ingroup_domain: str):
        """Select and log the median RED table for the ingroup's domain.

        Parameters
        ----------
        ingroup_domain : str
            Either ``'d__Bacteria'`` or ``'d__Archaea'``.

        Returns
        -------
        The RED dictionary for the domain (``RED_DIST_BAC_DICT`` or
        ``RED_DIST_ARC_DICT``), looked up by rank prefix when logging.

        Raises
        ------
        GTDBTkExit
            If ``ingroup_domain`` is not one of the two recognised domains.
        """

        if ingroup_domain == 'd__Archaea':
            median_reds = RED_DIST_ARC_DICT
        elif ingroup_domain == 'd__Bacteria':
            median_reds = RED_DIST_BAC_DICT
        else:
            raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')

        # Log one line per rank, leaving out the domain and species ranks.
        domain = ingroup_domain.replace('d__', '')
        self.logger.info('Median RED values for {}:'.format(domain))

        unreported = (Taxonomy.DOMAIN_IDX, Taxonomy.SPECIES_IDX)
        for rank_idx, prefix in enumerate(Taxonomy.rank_prefixes):
            if rank_idx in unreported:
                continue
            label = Taxonomy.rank_labels[rank_idx].capitalize()
            self.logger.info('  {}\t{:.3f}'.format(label, median_reds[prefix]))

        return median_reds