Example 1
    def unanimous(self, options):
        """Unanimous command"""

        check_dir_exists(options.profile_dir)
        make_sure_path_exists(options.output_dir)

        bin_dirs = self._bin_dirs(options)

        e = Ensemble(options.bin_prefix)
        e.run(options.profile_dir,
              bin_dirs,
              options.marker_dir,
              options.weight,
              options.sel_min_quality,
              options.sel_min_comp,
              options.sel_max_cont,
              None,   # remove_perc
              None,   # add_perc
              None,   # add_matches
              False,  # greedy bin selection
              True,   # unanimous bin selection
              options.report_min_quality,
              options.simple_headers,
              options.output_dir)

        self.logger.info(
            f"UniteM 'unanimous' results written to: {options.output_dir}")
Example 2
    def write(self):
        """Writes the file to disk."""

        make_sure_path_exists(os.path.dirname(self.path))
        header = ['Gene Id', 'Top hits (Family id,e-value,bitscore)']
        with open(self.path, 'w') as fh:
            fh.write('\t'.join(header) + '\n')
            for gene_id, hits in sorted(self.hits.items()):
                out_hits = list()
                for cur_hit in sorted(hits.values(), reverse=True):
                    out_hits.append(cur_hit.hmm_str())
                concat_hits = ';'.join(out_hits)
                fh.write(f'{gene_id}\t{concat_hits}\n')
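
The write() method above serializes each gene's top hits as one tab-separated row, with individual hits joined by ';'. As a companion, here is a minimal sketch (not part of the source) of how such a file could be read back into a dictionary; read_top_hits and the example path are hypothetical:

import csv

def read_top_hits(path):
    """Parse a top-hits table written by write(): one gene per row,
    with the hits concatenated by ';' in the second column."""
    top_hits = {}
    with open(path) as fh:
        reader = csv.reader(fh, delimiter='\t')
        next(reader)  # skip the two-column header
        for gene_id, concat_hits in reader:
            top_hits[gene_id] = concat_hits.split(';')
    return top_hits

# Usage (illustrative path):
# hits = read_top_hits('top_hits.tsv')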
Example 3
    def profile(self, options):
        """Profile command"""

        make_sure_path_exists(options.output_dir)

        bin_dirs = self._bin_dirs(options)

        profile = Profile(options.cpus)
        profile.run(bin_dirs,
                    options.marker_dir,
                    options.keep_intermediate,
                    options.output_dir)

        self.logger.info(
            f"UniteM 'profile' results written to: {options.output_dir}")
Example 4
    def bin(self, options):
        """Bin command"""

        check_file_exists(options.assembly_file)
        make_sure_path_exists(options.output_dir)

        bin = Bin(options.assembly_file,
                  options.output_dir,
                  options.min_contig_len,
                  options.cpus)
        bin.check_on_path(options)
        bin.coverage(options.bam_files, options.cov_file)
        bin.run(options)

        self.logger.info(
            f"UniteM 'bin' results written to: {options.output_dir}")
Example 5
    def maxbin(self, bin_file_out, num_markers):
        """Run MaxBin."""

        bin_dir = os.path.join(self.output_dir, f'maxbin_ms{num_markers}')
        make_sure_path_exists(bin_dir)
        cov_file_dir = os.path.join(bin_dir, 'coverage_files')
        make_sure_path_exists(cov_file_dir)
        abund_list_file = self._create_maxbin_coverage_files(
            self.cov_file, cov_file_dir)

        self.logger.info(f"Running MaxBin v2 with {num_markers} markers.")
        bin_prefix = os.path.join(bin_dir, f'max{num_markers}')
        cmd = f'run_MaxBin.pl -min_contig_length {self.min_contig_len}'
        cmd += (f' -thread {self.cpus} -markerset {num_markers}'
                f' -contig {self.assembly_file} -out {bin_prefix}'
                f' -abund_list {abund_list_file}')

        self._run_method(cmd, bin_dir, bin_file_out, f'maxbin_ms{num_markers}')
Example 6
    def groopm2(self, bin_file_out):
        """Run GroopM v2."""

        self.logger.info("Running GroopM v2 parse.")
        bin_dir = os.path.join(self.output_dir, 'groopm2')
        make_sure_path_exists(bin_dir)
        output_db = os.path.join(bin_dir, 'groopm.db')
        cmd = 'groopm2 parse -f -t {} -c {} --cov_file {} {} {}'.format(
            self.cpus, self.min_contig_len, self.cov_file, output_db,
            self.assembly_file)
        run_cmd(cmd, program='groopm2')

        self.logger.info("Running GroopM v2 core.")
        cmd = 'groopm2 core -f {} -c {} --save_dists'.format(
            output_db, self.min_contig_len)
        run_cmd(cmd, program='groopm2')

        self.logger.info("Running GroopM v2 extract.")
        bin_prefix = os.path.join(bin_dir, 'gm2')
        cmd = 'groopm2 extract -p {} {} {}'.format(bin_prefix, output_db,
                                                   self.assembly_file)

        self._run_method(cmd, bin_dir, bin_file_out, 'groopm2')
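
The GroopM steps above delegate command execution to run_cmd, whose implementation is not shown in these examples. A minimal sketch of what such a wrapper might look like, assuming it simply shells out and raises on a non-zero exit status (the helper below is an assumption, not the project's actual code):

import logging
import subprocess

def run_cmd(cmd, program=None):
    """Run a shell command and fail loudly if it exits with a non-zero status."""
    logging.info('Executing: %s', cmd)
    proc = subprocess.run(cmd, shell=True)
    if proc.returncode != 0:
        raise RuntimeError(
            f'{program or cmd} exited with code {proc.returncode}')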
Example 7
    def run(self, profile_dir, bin_dirs, marker_dir, quality_weight,
            sel_min_quality, sel_min_comp, sel_max_cont, remove_perc, add_perc,
            add_matches, greedy, unanimous, report_min_quality, simple_headers,
            output_dir):
        """Perform ensemble binning of genomes across multiple binning methods.

        Parameters
        ----------
        profile_dir : str
          Directory with bin profiles (output of 'profile' command).
        bin_dirs : list of str
          Directories containing bins from different binning methods.
        marker_dir : str
          Directory containing marker gene information used to assess bin quality.
        quality_weight : float
          Weight given to contamination when assessing quality of bins.
        sel_min_quality : float
          Minimum quality of bins to select.
        sel_min_comp : float
          Minimum completeness of bins to select.
        sel_max_cont : float
          Maximum contamination of bins to select.
        remove_perc : float
          Minimum percentage of bins from other binning methods required to remove a contig from the highest-quality bin.
        add_perc : float
          Minimum percentage of matched bins required to add a contig to the highest-quality bin.
        add_matches : float
          Minimum number of matched bins required to add contigs.
        greedy : boolean
          Perform greedy bin selection.
        unanimous : boolean
          Perform unanimous bin selection.
        report_min_quality : float
          Minimum quality of bins to report.
        simple_headers : boolean
          Flag indicating that information should not be appended to the headers of bin FASTA files.
        output_dir : str
          Output directory.
        """

        markers = Markers(marker_dir)

        if greedy:
            self.logger.info(
                "Greedy selection with quality_weight = {:.1f}, sel_min_quality = {:.1f}, sel_min_comp = {:.1f}, and sel_max_cont = {:.1f}."
                .format(quality_weight, sel_min_quality, sel_min_comp,
                        sel_max_cont))
            remove_perc = 101
            add_perc = 101
        elif unanimous:
            self.logger.info(
                "Unanimous selection with quality_weight = {:.1f}, sel_min_quality = {:.1f}, sel_min_comp = {:.1f}, and sel_max_cont = {:.1f}."
                .format(quality_weight, sel_min_quality, sel_min_comp,
                        sel_max_cont))
        else:
            self.logger.info(
                "Consensus selection with quality_weight ={:.1f}, sel_min_quality = {:.1f}, sel_min_comp = {:.1f}, and sel_max_cont = {:.1f}."
                .format(quality_weight, sel_min_quality, sel_min_comp,
                        sel_max_cont))
            self.logger.info(
                'Removing and adding contigs by consensus with remove_perc = {:.1f}, add_perc = {:.1f}, add_matches = {:,}.'
                .format(remove_perc, add_perc, add_matches))

        self.logger.info('Reporting bins with a quality >= {:.1f}.'.format(
            report_min_quality))

        # get scaffold IDs in bins across all binning methods
        self.logger.info('Reading all bins.')
        bins, contigs, contigs_in_bins = read_bins(bin_dirs)
        methods_sorted = sorted(bins.keys())
        contig_lens = {cid: len(contigs[cid]) for cid in contigs}
        orig_bins = copy.deepcopy(bins)

        # get marker genes for bins across all binning methods
        self.logger.info('Identifying marker genes across all bins.')
        gene_tables = markers.marker_gene_tables(profile_dir,
                                                 binning_methods=bins.keys())

        # create output directories
        bin_dir = os.path.join(output_dir, 'bins')
        make_sure_path_exists(bin_dir)

        # get initial quality of bins
        self.logger.info('Reading initial quality of bins.')
        init_bin_quality = self._read_init_bin_quality(profile_dir)

        # write out initial state of each contig
        self.logger.info('Recording initial state of contigs.')
        self._write_initial_contig_state(methods_sorted, contigs_in_bins,
                                         init_bin_quality, output_dir)

        # perform consensus selection of bins
        header = 'UniteM Bin ID\tBinning Method\tBin ID'
        header += '\tMarker Domain\tCompleteness (%)\tContamination (%)\tQuality (%)'
        header += '\tGenome Size\tN50\tL50\tNo. Contigs'
        if not greedy:
            header += '\tNo. Matched Bins\tNo. Removed Contigs'
            if not unanimous:
                header += '\tNo. Added Contigs'
        header += '\n'

        fout = open(os.path.join(output_dir, 'bin_info.tsv'), 'w')
        fout.write(header)

        fout_matched = open(os.path.join(output_dir, 'matched_set_info.tsv'),
                            'w')
        if greedy:
            fout_matched.write(
                'No. Matched Bins\tBin ID\tCompleteness (%)\tContamination (%)\tQuality (%)\n'
            )
        else:
            fout_matched.write(
                'No. Matched Bins\tUniteM Bin ID\tCompleteness (%)\tContamination (%)\tQuality (%)'
            )
            fout_matched.write(
                '\tMatched Bin ID\tCompleteness (%)\tContamination (%)\tQuality (%)\n'
            )

        fout_bin_info = open(os.path.join(output_dir, 'matched_bin_info.tsv'),
                             'w')
        fout_bin_info.write(
            'UniteM Bin ID\tCompleteness (%)\tContamination (%)\tQuality (%)')
        fout_bin_info.write(
            '\tNo. Matched Bins\tMatched Bin ID\tPercent Common Bases\tCompleteness (%)\tContamination (%)\tQuality (%)\n'
        )

        fout_contigs = open(os.path.join(output_dir, 'contig_info.tsv'), 'w')
        fout_contigs.write(
            'UniteM Bin ID\tContig ID\tNo. Matched Bins\tNo. Unmatched Bins\tNo. Degenerate Bins\tNo. Unbinned'
        )
        for method in methods_sorted:
            fout_contigs.write(f'\t{method}')
        fout_contigs.write('\n')

        plot_dir = os.path.join(output_dir, 'common_bases')
        make_sure_path_exists(plot_dir)

        # perform ensemble binning
        self.logger.info('Performing ensemble binning.')
        bin_num = 0
        total_comp = 0
        total_cont = 0
        total_quality = 0
        sel_bins = {}
        selected_rows = {}
        unitem_common_bases = defaultdict(lambda: defaultdict(int))
        unitem_bin_quality = {}
        sanity_check_contigs = set()
        tree_common_bases = TreeCommonBases()
        while True:
            # determine highest quality match bin set
            bin_quality = self._bin_quality(bins, contigs, gene_tables,
                                            quality_weight, markers)

            matched_sets = self._matched_bin_sets(bins, contig_lens,
                                                  bin_quality, sel_min_quality,
                                                  sel_min_comp, sel_max_cont,
                                                  greedy)

            if len(matched_sets) == 0:
                break  # no bins meeting selection criteria

            new_bin, removed_contigs, added_contigs = self._resolve_matched_set(
                matched_sets[0], bins, contigs, bin_quality, remove_perc,
                add_perc, add_matches, sel_min_quality, sel_min_comp,
                sel_max_cont, greedy, unanimous)

            _domain, comp, cont = markers.bin_quality(new_bin)

            quality = comp - quality_weight * cont
            if quality < report_min_quality:
                break

            total_comp += comp
            total_cont += cont
            total_quality += quality

            # report selection
            bin_num += 1
            unitem_bin_id = f'{self.bin_prefix}{bin_num}'
            unitem_bin_quality[unitem_bin_id] = (comp, cont)
            primary_bm, primary_bid, _q, _n50, _gs = matched_sets[0][0]
            self.logger.info(
                "Selected {} from {} with quality = {:.1f} (comp. = {:.1f}%, cont. = {:.1f}%)."
                .format(primary_bid, primary_bm, quality, comp, cont))

            if not greedy and not unanimous:
                # performing consensus binning
                self.logger.info(
                    "-> Identified {} matched bins, removed {} contigs, added {} contigs."
                    .format(len(matched_sets[0]), len(removed_contigs),
                            len(added_contigs)))
            elif not greedy:
                # performing unanimous binning
                self.logger.info(
                    "-> Identified {} matched bins and removed {} contigs.".
                    format(len(matched_sets[0]), len(removed_contigs)))

            # write out matched set
            fout_matched.write(f'{len(matched_sets[0])}')
            if not greedy:
                fout_matched.write('\t{}\t{:.1f}\t{:.1f}\t{:.1f}'.format(
                    unitem_bin_id, comp, cont, quality))

            for bm, bid, q, _n50, _gs in matched_sets[0]:
                # use local names so the selected bin's comp/cont/quality
                # (written to fout_bin_info below) are not clobbered
                _domain, m_comp, m_cont = markers.bin_quality(bins[bm][bid])
                m_quality = m_comp - quality_weight * m_cont
                fout_matched.write('\t{}\t{:.1f}\t{:.1f}\t{:.1f}'.format(
                    bm + '~' + bid, m_comp, m_cont, m_quality))
            fout_matched.write('\n')

            # write out common base pairs
            matches = self._perc_bases_in_common(new_bin, orig_bins,
                                                 contig_lens)
            fout_bin_info.write('{}\t{:.1f}\t{:.1f}\t{:.1f}\t{}'.format(
                unitem_bin_id, comp, cont, quality, len(matches)))

            for bm, bid, perc_common in matches:
                unitem_common_bases[unitem_bin_id][bm] = perc_common
                _domain, comp, cont = markers.bin_quality(orig_bins[bm][bid])
                quality = comp - quality_weight * cont
                fout_bin_info.write(
                    '\t{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}'.format(
                        bm + '~' + bid, perc_common, comp, cont, quality))

            fout_bin_info.write('\n')

            output_file = os.path.join(plot_dir, unitem_bin_id + '.svg')
            tree_common_bases.plot(unitem_bin_id, matches, output_file)

            # write out summary information about selected bins
            row = self._report_selection(unitem_bin_id, new_bin, markers,
                                         quality_weight,
                                         primary_bm, primary_bid,
                                         str(len(matched_sets[0])),
                                         str(len(removed_contigs)),
                                         str(len(added_contigs)), greedy,
                                         unanimous)
            fout.write(row)
            selected_rows[unitem_bin_id] = row

            # write out contig info
            matched_bins = {}
            for m in matched_sets[0]:
                matched_bins[m[0]] = m[1]

            new_bin_file = os.path.join(bin_dir, unitem_bin_id + '.fna.gz')
            fout_bin = gzip.open(new_bin_file, 'wt')
            for cid, seq in new_bin.items():
                fout_contigs.write(f'{unitem_bin_id}\t{cid}')

                if cid in sanity_check_contigs:
                    self.logger.error(f'Contig selected twice: {cid}')
                    sys.exit(1)
                sanity_check_contigs.add(cid)

                matched = 0
                unmatched = 0
                unbinned = 0
                degenerate = 0
                row = ''
                for m in methods_sorted:
                    if m in contigs_in_bins[cid]:
                        bid = contigs_in_bins[cid][m]
                        domain, comp, cont = markers.bin_quality(bins[m][bid])
                        if m in matched_bins and bid == matched_bins[m]:
                            row += '\t{},{},{:.1f},{:.1f}'.format(
                                'matched', bid, comp, cont)
                            matched += 1
                        else:
                            q = comp - quality_weight * cont
                            if q < sel_min_quality or comp < sel_min_comp or cont > sel_max_cont:
                                row += '\t{},{},{:.1f},{:.1f}'.format(
                                    'degenerate', bid, comp, cont)
                                degenerate += 1
                            else:
                                row += '\t{},{},{:.1f},{:.1f}'.format(
                                    'unmatched', bid, comp, cont)
                                unmatched += 1
                    else:
                        row += '\tunbinned'
                        unbinned += 1

                fout_contigs.write('\t{}\t{}\t{}\t{}'.format(
                    matched, unmatched, degenerate, unbinned))
                fout_contigs.write(row + '\n')

                if simple_headers:
                    fout_bin.write(f'>{cid}\n')
                else:
                    cid_info = '[matched={}] [unmatched={}] [degenerate={}] [unbinned={}]'.format(
                        matched, unmatched, degenerate, unbinned)

                    if cid in added_contigs:
                        cid_info += ' [added by consensus]'

                    fout_bin.write(f'>{cid} {cid_info}\n')
                fout_bin.write(seq + '\n')
            fout_bin.close()

            # remove contigs in highest quality bin from marker gene tables and all other bins
            self._update_gene_tables(gene_tables, new_bin.keys())
            self._update_bins(bins, new_bin.keys())

            sel_bins[unitem_bin_id] = new_bin

        self.logger.info(f'Selected {bin_num} bins.')
        self.logger.info(
            '-> total comp. = {:.1f}, total cont. = {:.1f}, total quality = {:.1f}'
            .format(total_comp, total_cont, total_quality))

        fout.close()
        fout_matched.close()
        fout_contigs.close()
        fout_bin_info.close()

        plot = PlotCommonBases()
        output_plot = os.path.join(output_dir, 'percent_common_bases.svg')
        plot.plot(unitem_common_bases, unitem_bin_quality, output_plot)

        # summarize results of reported bins
        summary_file = os.path.join(output_dir, 'bin_quality_summary.tsv')
        self._bin_summary(sel_bins, {}, quality_weight, markers, summary_file)
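
Throughout run(), bin quality is scored as completeness minus quality_weight times contamination, and a bin is treated as degenerate when it misses any of the sel_min_quality, sel_min_comp, or sel_max_cont thresholds. A small standalone sketch of that scoring rule, with illustrative helper names that are not part of the source:

def bin_quality_score(completeness, contamination, quality_weight):
    """Quality score used in run(): completeness penalized by contamination
    scaled by quality_weight (e.g. a weight of 2 doubles the penalty)."""
    return completeness - quality_weight * contamination

def passes_selection(completeness, contamination, quality_weight,
                     sel_min_quality, sel_min_comp, sel_max_cont):
    """Mirror of the per-bin filter used when classifying contigs as
    'degenerate' versus 'unmatched' (illustrative helper)."""
    q = bin_quality_score(completeness, contamination, quality_weight)
    return (q >= sel_min_quality
            and completeness >= sel_min_comp
            and contamination <= sel_max_cont)

# Usage (illustrative values): a 92% complete, 4% contaminated bin with
# quality_weight = 2 scores 92 - 2*4 = 84.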
Example 8
    def run(self, genomic_files, output_dir):
        """Run Prodigal across a set of genomes."""

        self.output_dir = output_dir
        make_sure_path_exists(output_dir)

        # populate worker queue with data to process
        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        for gid, genome_file in genomic_files.items():
            worker_queue.put((gid, genome_file))

        for _ in range(self.cpus):
            worker_queue.put(None)

        worker_proc = []
        writer_proc = None
        try:
            manager = mp.Manager()
            out_dict = manager.dict()

            worker_proc = [
                mp.Process(target=self._worker,
                           args=(out_dict, worker_queue, writer_queue))
                for _ in range(self.cpus)
            ]
            writer_proc = mp.Process(target=self._writer,
                                     args=(len(genomic_files), writer_queue))

            writer_proc.start()
            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

                # A non-zero exit code means the worker failed; raising here
                # triggers the cleanup in the except block below.
                if p.exitcode != 0:
                    raise ProdigalException(
                        'Prodigal returned non-zero exit code.')

            writer_queue.put(None)
            writer_proc.join()
        except Exception as e:
            for p in worker_proc:
                p.terminate()

            if writer_proc:
                writer_proc.terminate()

            raise ProdigalException(
                f'Exception caught while running Prodigal: {e}')

        # report genomes which failed to have any genes called
        result_dict = dict()
        failed_gids = list()
        for gid, gid_dict in out_dict.items():
            if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
                failed_gids.append(gid)
            else:
                result_dict[gid] = gid_dict

        if len(failed_gids) > 0:
            self.logger.warning(
                f'Skipping {len(failed_gids)} of {len(genomic_files)} '
                'genomes as no genes were called by Prodigal.')

        return result_dict
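
Example 8 is a standard worker/writer multiprocessing pattern: the work queue holds one item per genome plus one None sentinel per CPU, each worker stores results in a manager dict and reports progress to a writer queue, and a single writer process consumes progress messages until it receives its own sentinel. The following standalone sketch reproduces the pattern with placeholder work; every name in it is illustrative rather than taken from the source:

import multiprocessing as mp

def _worker(out_dict, worker_queue, writer_queue):
    """Consume work items until a None sentinel is received."""
    while True:
        item = worker_queue.get()
        if item is None:
            break
        key, value = item
        out_dict[key] = value * value  # placeholder for real per-item work
        writer_queue.put(key)          # report progress to the writer

def _writer(num_items, writer_queue):
    """Consume progress updates until a None sentinel arrives."""
    processed = 0
    while True:
        msg = writer_queue.get()
        if msg is None:
            break
        processed += 1
        print(f'Processed {processed} of {num_items} items')

if __name__ == '__main__':
    cpus = 2
    items = {'a': 1, 'b': 2, 'c': 3}

    worker_queue = mp.Queue()
    writer_queue = mp.Queue()
    for pair in items.items():
        worker_queue.put(pair)
    for _ in range(cpus):
        worker_queue.put(None)  # one sentinel per worker

    manager = mp.Manager()
    out_dict = manager.dict()

    workers = [mp.Process(target=_worker,
                          args=(out_dict, worker_queue, writer_queue))
               for _ in range(cpus)]
    writer = mp.Process(target=_writer, args=(len(items), writer_queue))

    writer.start()
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    writer_queue.put(None)  # tell the writer to stop
    writer.join()

    print(dict(out_dict))  # {'a': 1, 'b': 4, 'c': 9}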