Example #1
    def run(self, gene_files):
        """Annotate genes with Pfam HMMs.

        Parameters
        ----------
        gene_files : iterable
            Gene files in FASTA format to process.
        """

        self.cpus_per_genome = max(1, int(self.threads / len(gene_files)))

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in gene_files:
            workerQueue.put(f)

        for _ in range(self.threads):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self._workerThread,
                           args=(workerQueue, writerQueue))
                for _ in range(self.threads)
            ]
            writeProc = mp.Process(target=self._writerThread,
                                   args=(len(gene_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()
                if p.exitcode != 0:
                    raise GTDBTkExit(
                        'An error was encountered while running hmmsearch.')

            writerQueue.put(None)
            writeProc.join()
        except Exception:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
            raise
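The worker/writer pattern above relies on a `_workerThread` that consumes file paths until it reaches the None sentinel. A minimal sketch of such a worker, assuming a hypothetical `_annotate_genes` helper standing in for the actual hmmsearch invocation:

    def _workerThread(self, worker_queue, writer_queue):
        """Consume gene files until the None sentinel is reached."""
        while True:
            gene_file = worker_queue.get(block=True)
            if gene_file is None:
                break
            # _annotate_genes is a hypothetical stand-in for the actual
            # hmmsearch invocation.
            writer_queue.put(self._annotate_genes(gene_file))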
Example #2
    def _parse_result_queue(self, q_results, path_to_gid):
        """Creates the output dictionary given the results from FastANI

        Parameters
        ----------
        q_results : Queue
            A multiprocessing queue containing raw results.
        path_to_gid : Dict[str, str]
            A dictionary mapping each genome's file path to its genome ID.

        Returns
        -------
        Dict[str, Dict[str, Dict[str, float]]]
            The ANI/AF of the query genome to all reference genomes.
        """
        out = dict()
        while True:
            q_item = q_results.get(block=True)
            if q_item is None:
                break

            job, result = q_item
            qry_gid = job['qry']

            for path_a, dict_b in result.items():
                for path_b, (ani, af) in dict_b.items():
                    gid_a, gid_b = path_to_gid[path_a], path_to_gid[path_b]

                    # This was done in the forward direction.
                    if gid_a == qry_gid:
                        ref_gid = gid_b
                    # This was done in the reverse direction.
                    elif gid_b == qry_gid:
                        ref_gid = gid_a
                    else:
                        raise GTDBTkExit('FastANI results are malformed.')

                    # Take the largest ANI / AF from either pass.
                    if qry_gid not in out:
                        out[qry_gid] = {ref_gid: {'ani': ani, 'af': af}}
                    elif ref_gid not in out[qry_gid]:
                        out[qry_gid][ref_gid] = {'ani': ani, 'af': af}
                    else:
                        out[qry_gid][ref_gid]['ani'] = max(
                            out[qry_gid][ref_gid]['ani'], ani)
                        out[qry_gid][ref_gid]['af'] = max(
                            out[qry_gid][ref_gid]['af'], af)

        return out
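For illustration, given a forward and a reverse FastANI pass over the same pair of genomes, the merge above keeps the larger ANI and AF from either direction (values below are made up):

    # forward pass: genome_a vs genome_b -> ANI 97.1, AF 0.80
    # reverse pass: genome_b vs genome_a -> ANI 97.3, AF 0.78
    # merged output:
    #   {'genome_a': {'genome_b': {'ani': 97.3, 'af': 0.80}}}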
Example #3
    def add_genome(self, genome_id: str, path_faa: str, pfam_th: TopHitPfamFile, tigr_th: TopHitTigrFile):
        """Process the top hit files for a genome and store the copy info."""
        if genome_id in self.genomes:
            self.logger.warning(f'Genome already exists in copy number file: {genome_id}')
        self.genomes[genome_id] = {'unq': dict(), 'mul': dict(), 'muq': dict(), 'mis': dict()}

        # Pointers to unique, multiple hit, multiple-unique, missing markers.
        cur_unq = self.genomes[genome_id]['unq']
        cur_mul = self.genomes[genome_id]['mul']
        cur_muq = self.genomes[genome_id]['muq']
        cur_mis = self.genomes[genome_id]['mis']

        # Load genes from the prodigal faa file.
        d_genes = read_fasta(path_faa, False)
        for seq_id, seq in d_genes.items():
            if seq.endswith('*'):
                d_genes[seq_id] = seq[:-1]

        # Create a dictionary of marker names -> Hits
        d_hmm_hits = self._merge_hit_files(pfam_th, tigr_th)

        # For each expected marker, determine which category it falls into.
        for marker_id in self.marker_names:

            # Marker is missing.
            if marker_id not in d_hmm_hits:
                cur_mis[marker_id] = None

            # Multiple hits to the same marker.
            elif len(d_hmm_hits[marker_id]) > 1:

                # If sequences are the same, take the most significant hit
                unq_seqs = {d_genes[x.gene_id] for x in d_hmm_hits[marker_id]}
                if len(unq_seqs) == 1:
                    cur_top_hit = sorted(d_hmm_hits[marker_id], reverse=True)[0]
                    cur_muq[marker_id] = {'hit': cur_top_hit, 'seq': d_genes[cur_top_hit.gene_id]}

                # Marker maps to multiple genes.
                else:
                    cur_mul[marker_id] = None

            # This was a unique hit.
            else:
                cur_hit = d_hmm_hits[marker_id][0]
                cur_unq[marker_id] = {'hit': cur_hit, 'seq': d_genes[cur_hit.gene_id]}

        # Sanity check - confirm that the total number of markers matches.
        if len(self.marker_names) != len(cur_unq) + len(cur_mul) + len(cur_muq) + len(cur_mis):
            raise GTDBTkExit('The marker set is inconsistent, please report this issue.')
Example #4
 def _calculate(self):
     self.logger.info('Calculating Mash distances.')
     args = [
         'mash', 'dist', '-p', self.cpus, '-d', self.max_d, '-v',
         self.mash_v, self.ref_sketch.path, self.qry_sketch.path
     ]
     args = list(map(str, args))
     with open(self.path, 'w') as f_out:
         proc = subprocess.Popen(args,
                                 stdout=f_out,
                                 stderr=subprocess.PIPE,
                                 encoding='utf-8')
         _, stderr = proc.communicate()
     if proc.returncode != 0:
         raise GTDBTkExit(f'Error running Mash dist: {stderr}')
Example #5
def export_msa(domain: Domain, output_file: str):
    """Exports the GTDB MSA to the specified path.

    :param domain: The domain used to determine the marker set.
    :param output_file: The path to write the MSA.
    """
    if domain is Domain.ARCHAEA:
        file_to_export = CONCAT_AR53
    elif domain is Domain.BACTERIA:
        file_to_export = CONCAT_BAC120
    else:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')

    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(file_to_export, output_file)
Example #6
    def _get_ingroup_domain(self, ingroup_taxon) -> str:
        """Get domain on ingroup taxon."""

        # read GTDB taxonomy in order to establish domain on ingroup taxon
        gtdb_taxonomy = Taxonomy().read(TAXONOMY_FILE)
        ingroup_domain = None
        for taxa in gtdb_taxonomy.values():
            if ingroup_taxon in taxa:
                ingroup_domain = taxa[Taxonomy.DOMAIN_IDX]

        if ingroup_domain is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                             f'the GTDB taxonomy.')

        return ingroup_domain
Example #7
 def read(self):
     """Reads the marker names from disk. No sequence information!"""
     with open(self.path) as fh:
         fh.readline()
         for line in fh.readlines():
             genome_id, n_unq, n_mul, n_muq, n_mis, unq, mul, muq, mis = line.split('\t')
             n_unq, n_mul, n_muq, n_mis = int(n_unq), int(n_mul), int(n_muq), int(n_mis)
             self.genomes[genome_id] = {'unq': {x: None for x in unq.strip().split(',') if len(x) > 0},
                                        'mul': {x: None for x in mul.strip().split(',') if len(x) > 0},
                                        'muq': {x: None for x in muq.strip().split(',') if len(x) > 0},
                                        'mis': {x: None for x in mis.strip().split(',') if len(x) > 0}}
             cur_dict = self.genomes[genome_id]
             if len(cur_dict['unq']) != n_unq or len(cur_dict['mul']) != n_mul or \
                     len(cur_dict['muq']) != n_muq or len(cur_dict['mis']) != n_mis:
                 raise GTDBTkExit(f'The marker file is inconsistent: {self.path}')
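Each data line therefore carries nine tab-separated fields: the genome ID, the four counts, and four comma-separated marker lists. A hypothetical line and its parsed form:

    # genome_1<TAB>2<TAB>1<TAB>0<TAB>1<TAB>PF00001.1,PF00002.2<TAB>TIGR00001<TAB><TAB>TIGR00002
    # parses to:
    #   {'unq': {'PF00001.1': None, 'PF00002.2': None},
    #    'mul': {'TIGR00001': None}, 'muq': {}, 'mis': {'TIGR00002': None}}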
Example #8
    def _load_metadata(self):
        """Loads the metadata from an existing Mash sketch file."""
        args = ['mash', 'info', '-t', self.path]
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()

        if proc.returncode != 0:
            raise GTDBTkExit(
                f'Error reading Mash sketch file {self.path}:\n{stderr}')

        for hashes, length, path in re.findall(r'(\d+)\t(\d+)\t(.+)\t.+\n',
                                               stdout):
            self.data[path] = (int(hashes), int(length))
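The regular expression captures the first three columns of the tab-separated table written by `mash info -t` (hashes, length, path). An illustrative row and the entry it produces:

    # 1000<TAB>4215606<TAB>/data/genome_1.fna<TAB>[comment]
    # yields: self.data['/data/genome_1.fna'] = (1000, 4215606)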
Example #9
    def run_proc(self, q, r, ql, rl, output):
        """Runs the FastANI process.

        Parameters
        ----------
        q : str
            The path to the query genome.
        r : str
            The path to the reference genome.
        ql : str
            The path to the query list file.
        rl : str
            The path to the reference list file.
        output : str
            The path to the output file.

        Returns
        -------
        dict[str, dict[str, float]]
            The ANI/AF of the query genomes to the reference genomes.
        """
        args = ['fastANI']
        if self.minFrac:
            args.extend(['--minFraction', '0'])
        if q is not None:
            args.extend(['-q', q])
        if r is not None:
            args.extend(['-r', r])
        if ql is not None:
            args.extend(['--ql', ql])
        if rl is not None:
            args.extend(['--rl', rl])
        args.extend(['-o', output])
        self.logger.debug(' '.join(args))
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()

        if proc.returncode != 0:
            self.logger.error('STDOUT:\n' + stdout)
            self.logger.error('STDERR:\n' + stderr)
            raise GTDBTkExit('FastANI returned a non-zero exit code.')

        # Parse the output
        return self.parse_output_file(output)
Example #10
    def read(self,
             taxonomy_file: str,
             canonical_ids: bool = False) -> Dict[str, List[str]]:
        """Read Greengenes-style taxonomy file.

        Expected format is:
            <id>\t<taxonomy string>

        where the taxonomy string has the format:
            d__; p__; c__; o__; f__; g__; s__

        Parameters
        ----------
        taxonomy_file : str
            Path to a Greengenes-style taxonomy file.
        canonical_ids : bool
            True to use the canonical ID format, False otherwise.
        """

        try:
            d = {}
            row = -1  # ensure row is defined if the file cannot be opened
            with open(taxonomy_file, 'r') as f:
                for row, line in enumerate(f.readlines()):
                    line_split = line.split('\t')

                    if len(line_split) != 2:
                        raise GTDBTkExit(f'Not a tab-separated line: {line}')

                    unique_id = line_split[0]
                    if canonical_ids:
                        unique_id = canonical_gid(unique_id)

                    tax_str = line_split[1].rstrip()
                    if tax_str[-1] == ';':
                        # remove trailing semicolons which sometimes
                        # appear in Greengenes-style taxonomy files
                        tax_str = tax_str[0:-1]

                    d[unique_id] = [x.strip() for x in tax_str.split(';')]
        except Exception:
            self.logger.error('Failed to parse taxonomy file on line %d' %
                              (row + 1))
            raise

        return d
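An illustrative Greengenes-style row and the entry it produces (trailing semicolons are tolerated):

    # G000005825<TAB>d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Bacillaceae; g__Bacillus; s__
    # parses to:
    #   {'G000005825': ['d__Bacteria', 'p__Firmicutes', 'c__Bacilli',
    #                   'o__Bacillales', 'f__Bacillaceae', 'g__Bacillus', 's__']}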
Example #11
    def parse_output_file(self, path_out):
        """Parses the resulting output file from FastANI.

        Parameters
        ----------
        path_out : str
            The path where the output file resides.

        Returns
        -------
        dict[str, dict[str, tuple[float, float]]]
            The ANI/AF of the query genomes to the reference genomes.
        """
        out = dict()
        if os.path.isfile(path_out):
            with open(path_out, 'r') as fh:
                for line in fh.readlines():
                    """FastANI version >=1.1 uses tabs instead of spaces to separate columns.
                    Preferentially try split with tabs first instead of split() in-case of 
                    spaces in the file path."""
                    try:
                        try:
                            path_qry, path_ref, ani, frac1, frac2 = line.strip(
                            ).split('\t')
                        except ValueError:
                            path_qry, path_ref, ani, frac1, frac2 = line.strip(
                            ).split(' ')
                            if not self._suppress_v1_warning:
                                self.logger.warning(
                                    'You are using FastANI v1.0, it is recommended '
                                    'that you update to a more recent version.'
                                )
                                self._suppress_v1_warning = True
                        af = round(float(frac1) / float(frac2), 2)
                        if path_qry not in out:
                            out[path_qry] = {path_ref: (float(ani), af)}
                        elif path_ref not in out[path_qry]:
                            out[path_qry][path_ref] = (float(ani), af)
                    except Exception as e:
                        self.logger.error(
                            f'Exception reading FastANI output: {repr(e)}')
                        raise GTDBTkExit(f'Unable to read line "{line}"')
        return out
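An illustrative FastANI (>=1.1) output line and how it is stored; the alignment fraction is the ratio of matched to total fragments, rounded to two decimals:

    # /q/genome_1.fna<TAB>/r/genome_2.fna<TAB>97.6<TAB>920<TAB>1160
    # AF = round(920 / 1160, 2) = 0.79, stored as:
    #   {'/q/genome_1.fna': {'/r/genome_2.fna': (97.6, 0.79)}}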
Example #12
    def _run(self, ref_msh, qry_msh, max_d):
        args = ['mash', 'dist', '-p', self.cpus, '-d', max_d, ref_msh, qry_msh]
        args = list(map(str, args))
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            raise GTDBTkExit(f'Error running Mash dist: {stderr}')

        out = defaultdict(dict)
        for ref_id, qry_id, dist, p_val, shared_n, shared_d in re.findall(
                r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout):
            dist, p_val = float(dist), float(p_val)
            shared_num, shared_den = int(shared_n), int(shared_d)
            out[qry_id][ref_id] = (dist, p_val, shared_num, shared_den)

        return out
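An illustrative `mash dist` row (tab-separated: reference, query, distance, p-value, shared hashes) and the tuple it produces:

    # /r/genome_2.fna<TAB>/q/genome_1.fna<TAB>0.0222<TAB>0.0<TAB>657/1000
    # yields: out['/q/genome_1.fna']['/r/genome_2.fna'] = (0.0222, 0.0, 657, 1000)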
Example #13
    def _get_median_reds(self, ingroup_domain: str):
        """Get median RED values for domain of ingroup taxon."""

        # get median RED values for domain
        if ingroup_domain == 'd__Bacteria':
            median_reds = RED_DIST_BAC_DICT
        elif ingroup_domain == 'd__Archaea':
            median_reds = RED_DIST_ARC_DICT
        else:
            raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')

        # report median values
        domain = ingroup_domain.replace('d__', '')
        self.logger.info('Median RED values for {}:'.format(domain))
        for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
            if idx != Taxonomy.DOMAIN_IDX and idx != Taxonomy.SPECIES_IDX:
                self.logger.info('  {}\t{:.3f}'.format(
                    Taxonomy.rank_labels[idx].capitalize(),
                    median_reds[rank_prefix]))

        return median_reds
Example #14
    def __init__(self, genomes, path, cpus, k, s):
        """Create a sketch file for a given set of genomes.

        Parameters
        ----------
        genomes : dict[str, str]
            The genomes to create a sketch file from (genome_id, fasta_path).
        path : str
            The path to write the sketch file to.
        cpus : int
            The maximum number of CPUs available for Mash.
        k : int
            The k-mer size.
        s : int
            Maximum number of non-redundant hashes.
        """
        self.logger = logging.getLogger('timestamp')
        self.genomes = genomes
        self.path = path
        self.data = dict()
        self.args = dict()
        self.cpus = cpus
        self.k = k
        self.s = s

        make_sure_path_exists(os.path.dirname(self.path))

        # Use the pre-existing sketch file, otherwise generate it.
        if os.path.isfile(self.path):
            self.logger.info(
                f'Loading data from existing Mash sketch file: {self.path}')
            self._load_metadata()
            if not self._is_consistent():
                raise GTDBTkExit(f'The sketch file is not consistent with the '
                                 f'input genomes. Remove the existing sketch '
                                 f'file or specify a new output directory.')
        else:
            self.logger.info(f'Creating Mash sketch file: {self.path}')
            self._generate()
Example #15
    def _generate(self):
        """Generate a new sketch file."""
        with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
            path_genomes = os.path.join(dir_tmp, 'genomes.txt')
            with open(path_genomes, 'w') as fh:
                for path in self.genomes.values():
                    fh.write(f'{path}\n')

            args = ['mash', 'sketch', '-l', '-p', self.cpus, path_genomes, '-o',
                    self.path, '-k', self.k, '-s', self.s]
            args = list(map(str, args))
            proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE, encoding='utf-8')
            bar_fmt = '==> Sketched {n_fmt}/{total_fmt} ({percentage:.0f}%) ' \
                      'genomes [{rate_fmt}, ETA {remaining}]'
            # Collect any non-progress stderr output; the stream is consumed
            # here, so it cannot be re-read for the error message below.
            stderr_lines = []
            with tqdm(bar_format=bar_fmt, total=len(self.genomes), mininterval=1, smoothing=0.1) as p_bar:
                for line in iter(proc.stderr.readline, ''):
                    if line.startswith('Sketching'):
                        p_bar.update()
                    else:
                        stderr_lines.append(line)
            proc.wait()

            if proc.returncode != 0 or not os.path.isfile(self.path):
                raise GTDBTkExit(f'Error generating Mash sketch: {"".join(stderr_lines)}')
Example #17
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes."""

        # read genomes that failed identify steps to skip them
        failed_genomes_file = os.path.join(
            identify_dir, PATH_FAILS.format(prefix=prefix))
        if os.path.isfile(failed_genomes_file):
            with open(failed_genomes_file) as fgf:
                failed_genomes = [row.split()[0] for row in fgf]
        else:
            failed_genomes = list()

        # If the user is re-running this step, check if the identify step is consistent.
        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            # Note: list.sort() sorts in place and returns None, so sorted()
            # must be used when comparing the two lists.
            extra_genomes = sorted(set(genomic_files.keys()) -
                                   set(genomes_to_process.keys()))
            if extra_genomes != sorted(failed_genomes):
                self.logger.error(
                    '{} are not present in the input list of genomes to process.'
                    .format(extra_genomes))
                raise InconsistentGenomeBatch(
                    'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                    'genomes not present in your initial identify directory. Remove them, or run '
                    'GTDB-Tk on a new directory.')

        # If this is being run as a part of classify_wf, copy the required files.
        if identify_dir != out_dir:
            identify_path = os.path.join(out_dir, DIR_IDENTIFY)
            make_sure_path_exists(identify_path)
            copy(
                CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
            copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
            copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

        # Create the align intermediate directory.
        make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # Write out files with marker information
        ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
        ar53_marker_info_file.write()
        bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
        bac120_marker_info_file.write()

        # Determine what domain each genome belongs to.
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)
        if len(bac_gids) + len(ar_gids) == 0:
            raise GTDBTkExit(f'Unable to assign a domain to any genomes, '
                             f'please check the identify marker summary file, '
                             f'and verify genome quality.')

        self.logger.info(
            f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
        )
        dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                     "bac120", 'bacterial', CopyNumberFileBAC120),
                    (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53, "ar53",
                     'archaeal', CopyNumberFileAR53))
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

            # No genomes identified as this domain.
            if len(gids) == 0:
                continue

            self.logger.info(
                f'Processing {len(gids):,} genomes identified as {domain_str}.'
            )
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar53_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR53_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            # Generate the user MSA.
            user_msa = align.align_marker_set(cur_genome_files,
                                              marker_info_file, copy_number_f,
                                              self.cpus)
            if len(user_msa) == 0:
                self.logger.warning(
                    f'Identified {len(user_msa):,} single copy {domain_str} hits.'
                )
                continue

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file.path, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, f'filter_{marker_set_id}'))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file.path)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    self.logger.info(
                        f'Filtered genomes include {len(filtered_user_genomes)} user submitted genomes.'
                    )
            else:
                self.logger.log(
                    Config.LOG_TASK,
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                self.logger.info(
                    'Masked {} alignment from {:,} to {:,} AAs.'.format(
                        domain_str, len(list(user_msa.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        valid_bases = sum(
                            [1 for c in pruned_seq if c.isalpha()])
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                    )

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                    f'{domain_str} GTDB and user genomes.')
                self._write_msa(trimmed_seqs,
                                marker_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                    f'{domain_str} user genomes.')
                self._write_msa(trimmed_user_msa,
                                marker_user_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')
Example #18
    def run(self, gtdbtk_output_dir, ar122_metadata_file, bac120_metadata_file,
            output_file, gtdbtk_prefix):
        """Translate GTDB to NCBI classification via majority vote."""

        # Set the output directories
        if not (ar122_metadata_file or bac120_metadata_file):
            raise GTDBTkExit(
                'You must specify at least one of --ar122_metadata_file or --bac120_metadata_file'
            )
        ar_summary = os.path.join(gtdbtk_output_dir,
                                  PATH_AR122_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \
            if ar122_metadata_file else None
        ar_tree = os.path.join(gtdbtk_output_dir,
                               PATH_AR122_TREE_FILE.format(prefix=gtdbtk_prefix)) \
            if ar122_metadata_file else None
        bac_summary = os.path.join(gtdbtk_output_dir,
                                   PATH_BAC120_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \
            if bac120_metadata_file else None
        bac_tree = os.path.join(gtdbtk_output_dir,
                                PATH_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix)) \
            if bac120_metadata_file else None

        # Create the output file directory.
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # get NCBI taxonomy string for GTDB genomes and GTDB species clusters
        ncbi_taxa = {}
        ncbi_lineages = {}
        gtdb_sp_clusters = defaultdict(set)
        for domain, metadata_file in [('archaeal', ar122_metadata_file),
                                      ('bacterial', bac120_metadata_file)]:
            # Only process those domains which have been provided as an input.
            if metadata_file is None:
                continue

            self._logger.info(f'Processing {domain} metadata file.')
            if not os.path.exists(metadata_file):
                raise GTDBTkExit(f'File does not exist {metadata_file}')

            with open(metadata_file, 'r', encoding='utf-8') as f:
                header = f.readline().strip().split('\t')

                ncbi_taxonomy_index = header.index('ncbi_taxonomy')
                gtdb_genome_rep_index = header.index(
                    'gtdb_genome_representative')

                for line in f.readlines():
                    line_split = line.strip().split('\t')

                    gid = line_split[0]
                    ncbi_taxonomy = line_split[ncbi_taxonomy_index]

                    if ncbi_taxonomy and ncbi_taxonomy != 'none':
                        ncbi_taxa[gid] = [
                            t.strip() for t in ncbi_taxonomy.split(';')
                        ]

                        for idx, taxon in enumerate(ncbi_taxa[gid]):
                            ncbi_lineages[taxon] = ncbi_taxa[gid][0:idx + 1]
                            if idx < 6:
                                ncbi_lineages[taxon] += self.rank_prefix[idx +
                                                                         1:]

                    rep_id = line_split[gtdb_genome_rep_index]
                    gtdb_sp_clusters[rep_id].add(gid)

        self._logger.info(
            f'Read NCBI taxonomy for {len(ncbi_taxa):,} genomes.')
        self._logger.info(
            f'Identified {len(gtdb_sp_clusters):,} GTDB species clusters.')

        # get majority vote NCBI classification for each GTDB species cluster
        ncbi_sp_classification = defaultdict(list)
        for rep_id, cluster_ids in gtdb_sp_clusters.items():
            for rank in range(6, -1, -1):
                ncbi_taxon_list = []
                for cid in cluster_ids:
                    if cid in ncbi_taxa:
                        ncbi_taxon_list.append(ncbi_taxa[cid][rank])

                if len(ncbi_taxon_list) > 0:
                    counter = Counter(ncbi_taxon_list)
                    mc_taxon, mc_count = counter.most_common(1)[0]

                    if mc_count >= 0.5 * len(ncbi_taxon_list) and len(
                            mc_taxon) > 3:
                        ncbi_sp_classification[rep_id] = ncbi_lineages[
                            mc_taxon]
                        break

            if rep_id in ncbi_sp_classification and ncbi_sp_classification[
                    rep_id][0] == 'd__':
                raise GTDBTkExit(
                    f'Majority vote domain is undefined for {rep_id}')

        self._logger.info(f'Identified {len(ncbi_sp_classification):,} GTDB '
                          f'species clusters with an NCBI classification.')

        # convert GTDB classifications to NCBI classification
        with open(output_file, 'w') as fout:
            fout.write(
                'user_genome\tGTDB classification\tNCBI classification\n')
            for domain, summary_file, tree_file in [
                ('Archaea', ar_summary, ar_tree),
                ('Bacteria', bac_summary, bac_tree)
            ]:
                if summary_file is None or tree_file is None:
                    self._logger.warning(
                        f'{domain} has been skipped as no metadata file was provided.'
                    )
                    continue
                if not os.path.exists(summary_file):
                    self._logger.warning(
                        f'{domain} has been skipped as the summary file does not exist: {summary_file}'
                    )
                    continue
                if not os.path.exists(tree_file):
                    self._logger.warning(
                        f'{domain} has been skipped as the tree file does not exist: {tree_file}'
                    )
                    continue

                self._logger.info(f'Parsing {tree_file}')
                tree = dendropy.Tree.get_from_path(tree_file,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)

                # map genomes IDs to leaf nodes
                leaf_node_map = {}
                for leaf in tree.leaf_node_iter():
                    leaf_node_map[leaf.taxon.label] = leaf

                # get majority vote NCBI classification for each user genome
                self._logger.info(f'Reclassifying genomes in {summary_file}')
                with open(summary_file) as f:
                    header = f.readline().strip().split('\t')

                    gtdb_classification_index = header.index('classification')

                    for line in f:
                        line_split = line.strip().split('\t')

                        user_gid = line_split[0]
                        gtdb_taxonomy = line_split[gtdb_classification_index]
                        gtdb_taxa = [
                            t.strip() for t in gtdb_taxonomy.split(';')
                        ]
                        gtdb_species = gtdb_taxa[6]

                        ncbi_rep_ids = self.get_ncbi_descendants(
                            user_gid, tree, leaf_node_map,
                            ncbi_sp_classification)

                        # take a majority vote over species with an NCBI classification, and
                        # limit taxonomic resolution to most-specific rank reported by GTDB-Tk
                        ncbi_classification = []
                        for rank in range(6, -1, -1):
                            if len(gtdb_taxa[rank]) == 3:
                                continue

                            ncbi_taxon_list = []
                            for rep_id in ncbi_rep_ids:
                                ncbi_taxon_list.append(
                                    ncbi_sp_classification[rep_id][rank])

                            counter = Counter(ncbi_taxon_list)
                            mc_taxon, mc_count = counter.most_common(1)[0]

                            if mc_count >= 0.5 * len(ncbi_taxon_list) and len(
                                    mc_taxon) > 3:
                                ncbi_classification = ncbi_lineages[mc_taxon]
                                break

                        # write out results
                        fout.write('%s\t%s\t%s\n' %
                                   (user_gid, gtdb_taxonomy,
                                    ';'.join(ncbi_classification)))

        self._logger.info(f'Results have been written to: {output_file}')
Example #19
    def root_with_outgroup(self, input_tree: str, output_tree: str,
                           outgroup: Set[str]):
        """Reroot the tree using the given outgroup.

        Parameters
        ----------
        input_tree
          File containing the Newick tree to be rerooted.
        output_tree
          Name of file for rerooted tree.
        outgroup
          Labels of taxa in outgroup.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        outgroup = set(outgroup)
        outgroup_in_tree = set()
        ingroup_leaves = set()
        for n in tree.leaf_node_iter():
            if n.taxon.label in outgroup:
                outgroup_in_tree.add(n.taxon)
            else:
                ingroup_leaves.add(n)

        self.logger.info(
            f'Identified {len(outgroup_in_tree):,} outgroup taxa in the tree.')
        self.logger.info(
            f'Identified {len(ingroup_leaves):,} ingroup taxa in the tree.')

        if len(outgroup_in_tree) == 0:
            self.logger.error('No outgroup taxa identified in the tree.')
            raise GTDBTkExit('Tree was not rerooted.')

        # Since finding the MRCA is a rooted tree operation,
        # the tree is first rerooted on an ingroup taxa. This
        # ensures the MRCA of the outgroup can be identified
        # so long as the outgroup is monophyletic. If the
        # outgroup is polyphyletic trying to root on it
        # is ill defined. To try and pick a "good" root for
        # polyphyletic outgroups, random ingroup taxa are
        # selected until two of them give the same size
        # lineage. This will, likely, be the smallest
        # bipartition possible for the given outgroup though
        # this is not guaranteed.
        mrca = tree.mrca(taxa=outgroup_in_tree)
        mrca_leaves = len(mrca.leaf_nodes())
        while True:
            # random.sample() no longer accepts a set (Python >=3.11).
            rnd_ingroup = random.choice(list(ingroup_leaves))
            tree.reroot_at_edge(rnd_ingroup.edge,
                                length1=0.5 * rnd_ingroup.edge_length,
                                length2=0.5 * rnd_ingroup.edge_length)

            mrca = tree.mrca(taxa=outgroup_in_tree)
            if len(mrca.leaf_nodes()) == mrca_leaves:
                break

            mrca_leaves = len(mrca.leaf_nodes())

        if len(mrca.leaf_nodes()) != len(outgroup_in_tree):
            self.logger.info('Outgroup is not monophyletic. Tree will be '
                             'rerooted at the MRCA of the outgroup.')
            self.logger.info(f'The outgroup consisted of '
                             f'{len(outgroup_in_tree):,} taxa, while the MRCA '
                             f'has {len(mrca.leaf_nodes()):,} leaf nodes.')
            if len(mrca.leaf_nodes()) == len(tree.leaf_nodes()):
                self.logger.warning('The MRCA spans all taxa in the tree.')
                self.logger.warning('This indicates the selected outgroup is '
                                    'likely polyphyletic in the current tree.')
                self.logger.warning('Polyphyletic outgroups are not suitable '
                                    'for rooting. Try another outgroup.')
        else:
            self.logger.info('Outgroup is monophyletic.')

        if mrca.edge_length is None:
            self.logger.info(
                'Tree appears to already be rooted on this outgroup.')
        else:
            self.logger.info('Rerooting tree.')
            tree.reroot_at_edge(mrca.edge,
                                length1=0.5 * mrca.edge_length,
                                length2=0.5 * mrca.edge_length)
            tree.write_to_path(output_tree,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)
            self.logger.info(f'Rerooted tree written to: {output_tree}')
Example #20
 def coding_density_11(self, v):
     try:
         self._coding_density_11 = float(v)
     except ValueError:
         raise GTDBTkExit(f'Invalid coding density: {v} for {self.path}')
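This setter presumably pairs with a matching property getter; a minimal sketch of the assumed getter:

 @property
 def coding_density_11(self):
     """Coding density under translation table 11 (assumed getter)."""
     return self._coding_density_11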
Example #21
 def add_genome(self, gid: str, tree_index: str):
     """PlAdds the pplacer classification of a given genome."""
     if gid in self.data:
         raise GTDBTkExit(
             f'Warning! Attempting to add duplicate genome: {gid}')
     self.data[gid] = tree_index
Example #22
 def best_tln_table(self, v):
     try:
         self._best_tln_table = int(v)
     except ValueError:
         raise GTDBTkExit(f'Invalid translation table: {v} for {self.path}')
Example #23
 def add_row(self, row: PplacerHighClassifyRow):
     if row.gid in self.rows:
         raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
     self.rows[row.gid] = row
Example #24
    def _run_prodigal(self, genome_id, fasta_path):
        """Run Prodigal.

        Parameters
        ----------
        fasta_path : str
            Path to FASTA file to process.
        :return
            False if an error occurred.
        """

        # Setup output files
        output_dir = os.path.join(self.marker_gene_dir, genome_id)
        aa_gene_file = os.path.join(output_dir,
                                    genome_id + self.protein_file_suffix)
        nt_gene_file = None
        gff_file = None
        translation_table_file = None

        if not self.proteins:
            nt_gene_file = os.path.join(output_dir,
                                        genome_id + self.nt_gene_file_suffix)
            gff_file = os.path.join(output_dir,
                                    genome_id + self.gff_file_suffix)
            translation_table_file = os.path.join(
                output_dir, 'prodigal' + TRANSLATION_TABLE_SUFFIX)

        # Return early if files are already done
        if not self.proteins and file_has_checksum(aa_gene_file) and file_has_checksum(nt_gene_file) \
                and file_has_checksum(gff_file) and file_has_checksum(translation_table_file):
            best_tln_table = -1
            with open(translation_table_file, 'r') as tln_f:
                for line in tln_f.readlines():
                    cols = line.strip().split('\t')
                    if cols[0] == 'best_translation_table':
                        best_tln_table = int(cols[1])
                        break
            if best_tln_table > 0:
                self.logger.info(
                    'Skipping result from a previous run: {}'.format(
                        genome_id))
                return aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_tln_table

        # Did not meet the conditions to skip processing this genome, call genes.
        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path],
                                     output_dir,
                                     called_genes=self.proteins)

        # An error occurred in BioLib Prodigal.
        if not summary_stats:
            if self.force:
                return None
            else:
                raise GTDBTkExit(
                    "Prodigal failed to call genes for: {} "
                    "(to skip these genomes, re-run with --force)".format(
                        genome_id))

        summary_stats = list(summary_stats.values())[0]

        # Rename output files to adhere to GTDB conventions and the desired
        # genome ID.

        shutil.move(summary_stats.aa_gene_file, aa_gene_file)
        with open(aa_gene_file + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(aa_gene_file))

        if not self.proteins:
            shutil.move(summary_stats.nt_gene_file, nt_gene_file)
            with open(nt_gene_file + CHECKSUM_SUFFIX, 'w') as f:
                f.write(sha256(nt_gene_file))

            shutil.move(summary_stats.gff_file, gff_file)
            with open(gff_file + CHECKSUM_SUFFIX, 'w') as f:
                f.write(sha256(gff_file))

            # save translation table information
            translation_table_file = os.path.join(
                output_dir, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table',
                                         summary_stats.best_translation_table))
                fout.write(
                    '%s\t%.2f\n' %
                    ('coding_density_4', summary_stats.coding_density_4 * 100))
                fout.write('%s\t%.2f\n' %
                           ('coding_density_11',
                            summary_stats.coding_density_11 * 100))
                fout.write(
                    '%s\t%.2f\n' %
                    ('probability_4', summary_stats.probability_4 * 100))
                fout.write(
                    '%s\t%.2f\n' %
                    ('probability_11', summary_stats.probability_11 * 100))

            with open(translation_table_file + CHECKSUM_SUFFIX, 'w') as f:
                f.write(sha256(translation_table_file))

        return aa_gene_file, nt_gene_file, gff_file, translation_table_file, summary_stats.best_translation_table
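Based on the writes above, the resulting prodigal_translation_table.tsv is a two-column TSV of the form (values illustrative):

    # best_translation_table<TAB>11
    # coding_density_4<TAB>84.71
    # coding_density_11<TAB>90.62
    # probability_4<TAB>2.50
    # probability_11<TAB>97.50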
Example #25
def calculate_patristic_distance(qry_node, ref_nodes, tt=None):
    """Computes the patristic distance from the query node to all reference
    nodes. Note that all reference nodes must be leaf nodes under max_node.

    Parameters
    ----------
    qry_node : dendropy.Node
        The query taxon node that the distance to all ref nodes will be found.
    ref_nodes : List[dendropy.Node]
        A list of reference nodes that the qry_node will be calculated to.
    tt : Optional[TreeTraversal]
        A TreeTraversal index, if absent a new one will be created.

    Returns
    -------
    Dict[dendropy.Node, float]
        A dictionary keyed by each reference taxon, valued by patristic dist.
    """
    tt = tt or TreeTraversal()

    # Iterate over each of the ref_nodes to find the MRCA to qry_node.
    d_ref_to_mrca = dict()
    for ref_node in ref_nodes:
        cur_dist_to_mrca = ref_node.edge_length

        # Go up the tree until the descendants include qry_node.
        parent_node = ref_node.parent_node
        while parent_node is not None:
            leaf_nodes = tt.get_leaf_nodes(parent_node)

            # Found the MRCA node.
            if qry_node in leaf_nodes:
                d_ref_to_mrca[ref_node] = (parent_node, cur_dist_to_mrca)
                break

            # Keep going up.
            cur_dist_to_mrca += parent_node.edge_length
            parent_node = parent_node.parent_node

        # If the loop did not break, raise an exception.
        else:
            raise GTDBTkExit(f'Unable to find MRCA: {qry_node.taxon.label} / '
                             f'{ref_node.taxon.label}')

    # Compute the distance from the qry_node to each of the MRCAs.
    out = dict()
    for ref_node, (mrca_node, ref_mrca_dist) in d_ref_to_mrca.items():

        # Go up the tree until the MRCA is found again.
        cur_dist_to_mrca = qry_node.edge_length
        cur_node = qry_node.parent_node
        while cur_node is not None:

            # Found the MRCA node.
            if cur_node == mrca_node:
                out[ref_node] = cur_dist_to_mrca + ref_mrca_dist
                break

            # Keep going up.
            cur_dist_to_mrca += cur_node.edge_length
            cur_node = cur_node.parent_node

        # Impossible case, but throw an exception anyway.
        else:
            raise GTDBTkExit(f'Tree is inconsistent: {qry_node.taxon.label} / '
                             f'{ref_node.taxon.label}')

    return out
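A worked illustration of the two traversals (toy tree, branch lengths made up): each distance is the sum of the query-to-MRCA and reference-to-MRCA path lengths.

    # Toy tree: ((A:1,B:2):3,C:4);
    #   MRCA(A, B) is their parent: dist(A, B) = 1 + 2 = 3
    #   MRCA(A, C) is the root:     dist(A, C) = (1 + 3) + 4 = 8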
Example #26
 def add_row(self, row: ClassifySummaryFileRow):
     if row.gid in self.rows:
         raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
     self.rows[row.gid] = row
Example #27
 def add_genome(self, genome_id: str, tln_table: int):
     """Record a translation table for a genome."""
     if genome_id in self.genomes:
         raise GTDBTkExit(
             f'Genome already exists in summary file: {genome_id}')
     self.genomes[genome_id] = tln_table
Example #28
    def _classify_on_internal_branch(self, leaf, child_taxons,
                                     current_rel_list, child_rel_dist,
                                     node_in_ref_tree, parent_rank, child_rk,
                                     taxa_str, taxa_str_terminal,
                                     is_on_terminal_branch, red_bac_dict):
        """
         Classification on an internal node is very similar to the 'normal' classification
         """

        # Persist descendant information for efficient traversal.
        tt = TreeTraversal()

        closest_rank = None

        if len(child_taxons) == 0:
            list_leaves = [
                childnd.taxon.label.replace("'", '')
                for childnd in tt.get_leaf_nodes(node_in_ref_tree)
                if childnd.taxon.label in self.reference_ids
            ]
            if len(list_leaves) != 1:
                list_subrank = []
                for leaf_subrank in list_leaves:
                    list_subrank.append(
                        self.gtdb_taxonomy.get(leaf_subrank)[
                            self.order_rank.index(parent_rank) + 1])
                if len(set(list_subrank)) == 1:
                    print(leaf.taxon.label)
                    print(list_leaves)
                    print(list_subrank)
                    raise GTDBTkExit('There should be only one leaf.')
                else:
                    closest_rank = parent_rank
                    detection = "taxonomic classification fully defined by topology"
            list_leaf_ranks = self.gtdb_taxonomy.get(
                list_leaves[0])[self.order_rank.index(child_rk):
                                -1]  # We remove the species name

            for leaf_taxon in reversed(list_leaf_ranks):
                leaf_taxon_rank = leaf_taxon[:3]
                if leaf_taxon == list_leaf_ranks[0]:
                    if abs(current_rel_list - red_bac_dict.get(leaf_taxon_rank)
                           ) < abs(current_rel_list -
                                   red_bac_dict.get(parent_rank[:3])):
                        closest_rank = leaf_taxon
                        break
                else:
                    pchildrank = list_leaf_ranks[
                        list_leaf_ranks.index(leaf_taxon) - 1]
                    if abs(current_rel_list - red_bac_dict.get(leaf_taxon_rank)
                           ) < abs(current_rel_list -
                                   red_bac_dict.get(pchildrank[:3])):
                        closest_rank = leaf_taxon
                        break
            if closest_rank is None:
                closest_rank = parent_rank
        # If there are multiple ranks on the child node (e.g. a genome between
        # p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitrospiraceae),
        # loop through the list of ranks from the f__ to the c__ rank.
        for child_taxon in reversed(child_taxons):
            child_taxon_rank = child_taxon[:3]
            if child_taxon == child_taxons[0]:
                if (abs(current_rel_list - red_bac_dict.get(child_taxon_rank))
                        < abs(child_rel_dist -
                              red_bac_dict.get(child_taxon_rank))
                        and abs(current_rel_list -
                                red_bac_dict.get(child_taxon_rank)) <
                        abs(current_rel_list -
                            red_bac_dict.get(parent_rank[:3]))):
                    closest_rank = child_taxon
                elif closest_rank is None:
                    closest_rank = parent_rank
            else:
                pchildrank = child_taxons[child_taxons.index(child_taxon) - 1]
                # Compare against the previous child rank (pchildrank); the
                # original self-comparison could never evaluate to True.
                if (abs(current_rel_list - red_bac_dict.get(child_taxon_rank))
                        < abs(current_rel_list -
                              red_bac_dict.get(pchildrank[:3]))
                        and abs(current_rel_list -
                                red_bac_dict.get(child_taxon_rank)) <
                        abs(child_rel_dist -
                            red_bac_dict.get(child_taxon_rank))):
                    closest_rank = child_taxon
                    break
        if closest_rank is not None:
            # when we have the closest rank found, we can find it in
            # gtdb_Taxonomy and get the higher level from it.
            for k, v in self.gtdb_taxonomy.items():
                if closest_rank in v:
                    taxa_str = ';'.join(v[1:v.index(closest_rank) + 1])
                    # All classification should be at least to the order level if a genome
                    # is placed on a internal branch with only one order under
                    if any(x.startswith('o__') for x in child_taxons) \
                            and self.order_rank.index(closest_rank[0:3]) < self.order_rank.index('o__') \
                            and ('o__' in taxa_str_terminal.split(';') or not is_on_terminal_branch):
                        taxa_str_terminal = ';'.join(
                            v[1:self.order_rank.index('o__') + 1])
                    break

        return taxa_str, taxa_str_terminal
Exemple #29
0
    def run(self, dict_compare, dict_paths):
        """Runs FastANI in batch mode.

        Parameters
        ----------
        dict_compare : dict[str, set[str]]
            All query to reference comparisons to be made.
        dict_paths : dict[str, str]
            The path for each genome id being compared.

        Returns
        -------
        dict[str, dict[str, dict[str, float]]]
            A dictionary containing the ANI and AF for each comparison."""

        # Create the multiprocessing items.
        manager = mp.Manager()
        q_worker = manager.Queue()
        q_writer = manager.Queue()
        q_results = manager.Queue()

        # Populate the queue of comparisons in forwards and reverse direction.
        n_total = 0
        if self.force_single:
            for qry_gid, ref_set in dict_compare.items():
                qry_path = dict_paths[qry_gid]

                for ref_gid in ref_set:
                    ref_path = dict_paths[ref_gid]

                    fwd_dict = {'q': dict(), 'r': dict(), 'qry': qry_gid}
                    rev_dict = {'q': dict(), 'r': dict(), 'qry': qry_gid}

                    fwd_dict['q'][qry_gid] = qry_path
                    fwd_dict['r'][ref_gid] = ref_path

                    rev_dict['q'][ref_gid] = ref_path
                    rev_dict['r'][qry_gid] = qry_path

                    q_worker.put(fwd_dict)
                    q_worker.put(rev_dict)
                    n_total += 2
        else:
            for qry_gid, ref_set in dict_compare.items():
                fwd_dict = {'ql': dict(), 'rl': dict(), 'qry': qry_gid}
                rev_dict = {'ql': dict(), 'rl': dict(), 'qry': qry_gid}

                qry_path = dict_paths[qry_gid]
                fwd_dict['ql'][qry_gid] = qry_path
                rev_dict['rl'][qry_gid] = qry_path

                for ref_gid in ref_set:
                    ref_path = dict_paths[ref_gid]
                    fwd_dict['rl'][ref_gid] = ref_path
                    rev_dict['ql'][ref_gid] = ref_path

                q_worker.put(fwd_dict)
                q_worker.put(rev_dict)
                n_total += 2

        # Set the terminate condition for each worker thread.
        for _ in range(self.cpus):
            q_worker.put(None)

        # Create each of the processes
        p_workers = [
            mp.Process(target=self._worker,
                       args=(q_worker, q_writer, q_results))
            for _ in range(self.cpus)
        ]

        p_writer = mp.Process(target=self._writer, args=(q_writer, n_total))

        # Start each of the threads.
        try:
            # Start the writer and each processing thread.
            p_writer.start()
            for p_worker in p_workers:
                p_worker.start()

            # Wait until each worker has finished.
            for p_worker in p_workers:
                p_worker.join()

                # Gracefully terminate the program.
                if p_worker.exitcode != 0:
                    raise GTDBTkExit('FastANI returned a non-zero exit code.')

            # Stop the writer thread.
            q_writer.put(None)
            p_writer.join()

        except Exception:
            for p in p_workers:
                p.terminate()
            p_writer.terminate()
            raise

        # Process and return each of the results obtained
        path_to_gid = {v: k for k, v in dict_paths.items()}
        q_results.put(None)
        return self._parse_result_queue(q_results, path_to_gid)
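For one query/reference pair in force_single mode, the forward and reverse queue items would look like this (paths illustrative):

    # fwd: {'q': {'g1': '/path/g1.fna'}, 'r': {'g2': '/path/g2.fna'}, 'qry': 'g1'}
    # rev: {'q': {'g2': '/path/g2.fna'}, 'r': {'g1': '/path/g1.fna'}, 'qry': 'g1'}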
Example #30
 def add_row(self, row: GenomeMappingFileRow):
     if row.gid in self.rows:
         raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
     self.rows[row.gid] = row