コード例 #1
0
ファイル: Cluster.py プロジェクト: WenchaoLin/cDNA_primer
class Cluster(IceFiles):

    """
    An object of `Cluster` calls the ICE algorithm to
    generate consensus isoforms.
    """

    def __init__(
        self,
        root_dir,
        flnc_fa,
        nfl_fa,
        bas_fofn,
        ccs_fofn,
        out_fa,
        sge_opts,
        ice_opts,
        ipq_opts,
        report_fn=None,
        summary_fn=None,
        fasta_fofn=None,
        nfl_reads_per_split=30000,
    ):
        super(Cluster, self).__init__(
            prog_name="Cluster", root_dir=root_dir, bas_fofn=bas_fofn, ccs_fofn=ccs_fofn, fasta_fofn=fasta_fofn
        )

        self.sge_opts = sge_opts  # SGE, CPU arguments and etc
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments
        self.nfl_reads_per_split = int(nfl_reads_per_split)  # ToDo: setsanity check here?

        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = self._validate_inputs(
            flnc_fa, nfl_fa, ccs_fofn, fasta_fofn=fasta_fofn, quiver=self.ice_opts.quiver
        )

        self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.add_log(
            "Setting ece_penalty: {0} ece_min_len: {1}".format(ice_opts.ece_penalty, ice_opts.ece_min_len),
            level=logging.INFO,
        )

        self.report_fn = realpath(report_fn) if report_fn is not None else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = (
            realpath(summary_fn) if summary_fn is not None else op.join(self.root_dir, "cluster_summary.txt")
        )

        self.add_log("A Cluster Object created.", level=logging.INFO)

    def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn, fasta_fofn=None, quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterException(
                "Input full-length non-chimeric reads " + "files (i.e., flnc_fa) needs to be specified."
            )
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " + "non-chimeric reads: {fn}".format(fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterException(
                    "Input non-full-length reads file " + "(i.e., nfl_fa) needs to be specified for isoform polish."
                )
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " + "non-chimeric reads: {fn}".format(fn=nfl_fa))

        if ccs_fofn is not None and not op.exists(ccs_fofn):
            raise ClusterException("Unable to find CCS FOFN file: " + "{fn}".format(fn=ccs_fofn))

        if fasta_fofn is not None and quiver:
            if not os.path.exists(fasta_fofn):
                raise ClusterException("Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
            for line in open(fasta_fofn):
                if not os.path.exists(line.strip()):
                    raise ClusterException("Unable to locate {0} in {1}".format(line.strip(), fasta_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)

    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.", level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.", level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log("Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException("Consensus FASTA file {f} already exists.".format(f=out_fa))
        return root_dir, out_fa

    def sanity_check(self):
        """Do sanity check before stat to run."""
        errMsg = ""
        if self.ice_opts.quiver is True:
            if self.bas_fofn is None:
                errMsg = (
                    "A fofn of bas/bax.h5 files, e.g., input.fofn, "
                    + "is required in order to polish consensus "
                    + "isoforms using quiver."
                )
            if self.nfl_fa is None:
                errMsg = "Non-full-length reads are required for polishing " + "consensus isoforms using quiver."
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    @property
    def configFN(self):
        """Return configuration file of the current run."""
        return op.join(self.root_dir, "run_ice_config.txt")

    def _logConfigs(self):
        """Log configuration."""
        with open(self.configFN, "w", 0) as f:
            f.write("pbtranscript " + get_version() + "\n")
            f.write(str(self.ice_opts) + "\n")
            f.write(str(self.sge_opts) + "\n")

    @property
    def initPickleFN(self):
        """Return path to pickle file with initial clusters."""
        return op.join(self.root_dir, "init.uc.pickle")

    def _setProbQV_ccs(self, ccs_fofn, firstSplitFa):
        """Set ProbQV from .ccs.h5"""
        start_t = time.time()
        self._probqv = ProbFromQV(ccs_fofn, firstSplitFa)
        self.add_log(
            "Loading QVs from {f} + {c} took {t} sec.".format(f=firstSplitFa, c=ccs_fofn, t=time.time() - start_t),
            level=logging.INFO,
        )

    def _setProbQV_fq(self, firstSplitFq=None):
        """Set probability and quality values from FASTQ"""
        start_t = time.time()
        self._probqv = ProbFromFastq(firstSplitFq)
        self.add_log(
            "Loading QVs from {f} took {t} sec.".format(f=firstSplitFq, t=time.time() - start_t), level=logging.INFO
        )

    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log(
                "targeted_isoseq: further splitting JUST first split to 1000. Changing flnc_reads_per_split=10000."
            )
        else:
            first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log(
            "Splitting {flnc} into ".format(flnc=self.flnc_fa)
            + "smaller files each containing {n} reads.".format(n=self.ice_opts.flnc_reads_per_split),
            level=logging.INFO,
        )

        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            first_split=first_split,
        )

        self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        firstSplit_fq = firstSplit[: firstSplit.rfind(".")] + ".fastq"
        self.add_log(
            "Converting first split file {0} + {1} into fastq\n".format(firstSplit, self.ccs_fofn), level=logging.INFO
        )
        # Convert this into FASTQ
        ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

        # Set up probabbility and quality value model
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, firstSplit)
        else:
            self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        if os.path.exists(self.initPickleFN):
            self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN) as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques: initializing IceInit.", level=logging.INFO)
            self.iceinit = IceInit(
                readsFa=firstSplit,
                qver_get_func=self._probqv.get_smoothed,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                qvmean_get_func=self._probqv.get_mean,
            )
            uc = self.iceinit.uc

            # Dump uc to a file
            self.add_log("Dumping initial clusters to {f}".format(f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, "w") as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.", level=logging.INFO)
        # self.add_log("In Cluster. DEBUG: Calling Cluster with {0} {1} {2} ".format(self.bas_fofn, self.ccs_fofn, self.fasta_fofn))

        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=firstSplit_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv,
        )
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unplished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # self.add_log("In Cluster. DEBUG: End Cluster with {0} {1} {2} ".format(self.bas_fofn, self.ccs_fofn, self.fasta_fofn))

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.", level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.", level=logging.INFO)
            # self.add_log("In Cluster. DEBUG: Calling Polish with {0} {1} {2} ".format(self.bas_fofn, self.ccs_fofn, self.fasta_fofn))
            self.pol = Polish(
                root_dir=self.root_dir,
                nfl_fa=self.nfl_fa,
                bas_fofn=self.bas_fofn,
                ccs_fofn=self.ccs_fofn,
                fasta_fofn=self.fasta_fofn,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                ipq_opts=self.ipq_opts,
                nfl_reads_per_split=self.nfl_reads_per_split,
            )
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.", level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(
                summary_fn=self.summary_fn,
                isoforms_fa=self.out_fa,
                hq_fa=self.pol.icepq.quivered_good_fa,
                lq_fa=self.pol.icepq.quivered_bad_fa,
            )

        # Create log file.
        self.close_log()
        return 0
コード例 #2
0
ファイル: Cluster.py プロジェクト: avrajit/cDNA_primer
class Cluster(IceFiles):
    """
    An object of `Cluster` calls the ICE algorithm to
    generate consensus isoforms.
    """
    def __init__(self, root_dir, flnc_fa, nfl_fa,
                 bas_fofn, ccs_fofn, out_fa,
                 sge_opts, ice_opts,
                 hq_isoforms_fa=None, hq_isoforms_fq=None,
                 lq_isoforms_fa=None, lq_isoforms_fq=None,
                 report_fn=None, summary_fn=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                root_dir=root_dir, bas_fofn=bas_fofn, ccs_fofn=ccs_fofn)

        self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
            flnc_fa, nfl_fa, ccs_fofn)

        self.root_dir, self.out_fa = self._validateOutputs(
            root_dir, out_fa)

        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq

        self.sge_opts = sge_opts  # SGE, CPU options and etc
        self.ice_opts = ice_opts  # The ICE clutering algorithm options

        self.sanity_check()

        self._probqv = None     # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()      # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
                else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
                else op.join(self.root_dir, "cluster_summary.txt")

        self.summary = ClusterSummary()

        self.add_log("Finishing create Cluster Object.", level=logging.INFO)

    def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None or nfl_fa is None:
            raise ClusterException(
                "Input non-chimeric reads files needs to be specified.")
        else:
            flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " +
                    "non-chimeric reads: {fn}".format(fn=flnc_fa))
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " +
                    "non-chimeric reads: {fn}".format(fn=nfl_fa))
            if ccs_fofn is not None and not op.exists(ccs_fofn):
                raise ClusterException("Unable to find FOFN file: " +
                    "{fn}".format(fn=ccs_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn)

    def _validateOutputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log("Output directory {d} already exists.".
                format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException("Consensus FASTA file {f} already exists.".
                format(f=out_fa))
        return root_dir, out_fa

    def sanity_check(self):
        """Do sanity check before stat to run."""
        errMsg = ""
        if self.ice_opts.quiver is True:
            if self.bas_fofn is None:
                errMsg = "A fofn of bas/bax.h5 files, e.g., input.fofn, " + \
                         "is required in order to polish consensus " + \
                         "isoforms using quiver."
            if self.nfl_fa is None:
                errMsg = "Non-full-length reads are required for polishing " + \
                         "consensus isoforms using quiver."
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    @property
    def configFN(self):
        """Return configuration file of the current run."""
        return op.join(self.root_dir, "run_ice_config.txt")

    def _logConfigs(self):
        """Log configuration."""
        with open (self.configFN, 'w', 0) as f:
            f.write('pbtranscript ' + get_version() + "\n")
            f.write(str(self.ice_opts) + "\n")
            f.write(str(self.sge_opts) + "\n")

    @property
    def initPickleFN(self):
        """Return path to pickle file with initial clusters."""
        return op.join(self.root_dir, "init.uc.pickle")

    def _setProbQV(self, ccs_fofn=None, firstSplitFa=None):
        """Set probability and quality values.
        If a fofn file is specified, load QV from it. Otherwise, use
        a pre-defined probability model."""
        if ccs_fofn is None:
            self.add_log("Loading predefined probabilities model.")
            self._probqv = ProbFromModel(0.01, 0.07, 0.06)
        else:
            self.add_log("Loading probabilities and QV from {f}.".
                format(f=firstSplitFa))
            self._probqv = ProbFromQV(ccs_fofn, firstSplitFa)

    def writeSummary(self, fa, summary_fn, hq_fa=None, lq_fa=None):
        """Extract number of consensus isoforms predicted, and total
        number of bases in all consensuus isoforms from fa and write
        the two attributes to summary_fn.

        if hq_fa (polished high-quality isoforms) is not None, report
            the number of polished hq clusters
        if lq_fa (polished high-quality isoforms) is not None, report
            the number of polished hq clusters
        """
        try:
            with FastaReader(fa) as reader:
                for r in reader:
                    self.summary.numConsensusIsoforms += 1
                    self.summary.numTotalBases += len(r.sequence)
            if hq_fa is not None:
                self.summary.num_polished_hq_isoforms = 0
                with FastaReader(hq_fa) as reader:
                    for r in reader:
                        self.summary.num_polished_hq_isoforms += 1
            if lq_fa is not None:
                self.summary.num_polished_lq_isoforms = 0
                with FastaReader(lq_fa) as reader:
                    for r in reader:
                        self.summary.num_polished_lq_isoforms += 1
            self.summary.write(summary_fn)
        except ZeroDivisionError:
            errMsg = "No consensus isoforms predicted."
            self.add_log(errMsg, level=logging.ERROR)
            raise ClusterException(errMsg)

    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # Set up probabbility and quality value model
        self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=firstSplit)

        # Initialize cluster by clique
        # check if init.pickle already exists, if so, no need to run IceInit
        if os.path.exists(self.initPickleFN):
            self.add_log("{0} already exists. Reading to get uc.".format(self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN) as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques.", level=logging.INFO)
            self.iceinit = IceInit(readsFa=firstSplit,
                      qver_get_func=self._probqv.get_smoothed,
                      ice_opts=self.ice_opts,
                      sge_opts=self.sge_opts)
            uc = self.iceinit.uc
            # Dump uc to a file
            self.add_log("Dumping initial clusters to {f}".format(
                         f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, 'w') as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iteratively clustering.", level=logging.INFO)
        self.icec = IceIterative(
                fasta_filename=firstSplit,
                fasta_filenames_to_add=self._flnc_splitted_fas[1:],
                all_fasta_filename=self.flnc_fa,
                ccs_fofn=self.ccs_fofn,
                root_dir=self.root_dir,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                uc=uc,
                probQV=self._probqv)
        self.icec.run()
        clean_up_after_ICE(self.root_dir)

        # IceIterative done, write predicted (unplished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.")
            ln(src=self.icec.report_fn, dst=self.report_fn)

            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)
        else:  # self.ice_opts.quiver is True
            #TODO review code
            self.pol = Polish(root_dir=self.root_dir,
                         nfl_fa=self.nfl_fa,
                         bas_fofn=self.bas_fofn,
                         ccs_fofn=self.ccs_fofn,
                         hq_isoforms_fa=self.hq_isoforms_fa,
                         hq_isoforms_fq=self.hq_isoforms_fq,
                         lq_isoforms_fa=self.lq_isoforms_fa,
                         lq_isoforms_fq=self.lq_isoforms_fq,
                         ice_opts=self.ice_opts,
                         sge_opts=self.sge_opts)
            self.pol.run()

            # cluster report
            self.add_log("Creating a link to cluster report.")
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Write a summary.
            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn,
                              hq_fa=self.pol.icepq.quivered_good_fa,
                              lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0
コード例 #3
0
ファイル: Cluster.py プロジェクト: yimsea/cDNA_primer
class Cluster(IceFiles):
    """
    An object of `Cluster` calls the ICE algorithm to
    generate consensus isoforms.
    """
    def __init__(self,
                 root_dir,
                 flnc_fa,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 out_fa,
                 sge_opts,
                 ice_opts,
                 ipq_opts,
                 report_fn=None,
                 summary_fn=None,
                 fasta_fofn=None,
                 nfl_reads_per_split=30000):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn,
                                      fasta_fofn=fasta_fofn)

        self.sge_opts = sge_opts  # SGE, CPU arguments and etc
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments
        self.nfl_reads_per_split = int(
            nfl_reads_per_split)  # ToDo: setsanity check here?

        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = self._validate_inputs(
            flnc_fa,
            nfl_fa,
            ccs_fofn,
            fasta_fofn=fasta_fofn,
            quiver=self.ice_opts.quiver)

        self.add_log("DEBUG: self.fasta_fofn is {0}".format(self.fasta_fofn))

        self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
            else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
            else op.join(self.root_dir, "cluster_summary.txt")

        self.add_log("A Cluster Object created.", level=logging.INFO)

    def _validate_inputs(self,
                         _flnc_fa,
                         _nfl_fa,
                         _ccs_fofn,
                         fasta_fofn=None,
                         quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterException(
                "Input full-length non-chimeric reads " +
                "files (i.e., flnc_fa) needs to be specified.")
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterException(
                    "Input non-full-length reads file " +
                    "(i.e., nfl_fa) needs to be specified for isoform polish.")
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=nfl_fa))

        if ccs_fofn is not None and not op.exists(ccs_fofn):
            raise ClusterException("Unable to find CCS FOFN file: " +
                                   "{fn}".format(fn=ccs_fofn))

        if fasta_fofn is not None and quiver:
            if not os.path.exists(fasta_fofn):
                raise ClusterException(
                    "Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
            for line in open(fasta_fofn):
                if not os.path.exists(line.strip()):
                    raise ClusterException("Unable to locate {0} in {1}".format(\
                            line.strip(), fasta_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)

    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log(
                "Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException(
                "Consensus FASTA file {f} already exists.".format(f=out_fa))
        return root_dir, out_fa

    def sanity_check(self):
        """Do sanity check before stat to run."""
        errMsg = ""
        if self.ice_opts.quiver is True:
            if self.bas_fofn is None:
                errMsg = "A fofn of bas/bax.h5 files, e.g., input.fofn, " + \
                         "is required in order to polish consensus " + \
                         "isoforms using quiver."
            if self.nfl_fa is None:
                errMsg = "Non-full-length reads are required for polishing " + \
                         "consensus isoforms using quiver."
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    @property
    def configFN(self):
        """Return configuration file of the current run."""
        return op.join(self.root_dir, "run_ice_config.txt")

    def _logConfigs(self):
        """Log configuration."""
        with open(self.configFN, 'w', 0) as f:
            f.write('pbtranscript ' + get_version() + "\n")
            f.write(str(self.ice_opts) + "\n")
            f.write(str(self.sge_opts) + "\n")

    @property
    def initPickleFN(self):
        """Return path to pickle file with initial clusters."""
        return op.join(self.root_dir, "init.uc.pickle")

    def _setProbQV_ccs(self, ccs_fofn, firstSplitFa):
        """Set ProbQV from .ccs.h5"""
        start_t = time.time()
        self.add_log("Loading QVs from {f} + {c} took {t} sec.".format(
            f=firstSplitFa, c=ccs_fofn, t=time.time() - start_t),
                     level=logging.INFO)
        self._probqv = ProbFromQV(ccs_fofn, firstSplitFa)

    def _setProbQV_fq(self, firstSplitFq=None):
        """Set probability and quality values from FASTQ"""
        start_t = time.time()
        self.add_log("Loading QVs from {f} took {t} sec.".format(
            f=firstSplitFq, t=time.time() - start_t),
                     level=logging.INFO)
        self._probqv = ProbFromFastq(firstSplitFq)

    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        #self.ice_opts.flnc_reads_per_split=1000 #FOR DEBUGGING, REMOVE LATER
        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        firstSplit_fq = firstSplit[:firstSplit.rfind('.')] + '.fastq'
        self.add_log("Converting first split file {0} + {1} into fastq\n".format(\
                firstSplit, self.ccs_fofn), level=logging.INFO)
        # Convert this into FASTQ
        ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

        # Set up probabbility and quality value model
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, firstSplit)
        else:
            self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=firstSplit,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=firstSplit_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unplished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              nfl_reads_per_split=self.nfl_reads_per_split)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0
コード例 #4
0
class Cluster(IceFiles):
    """
    An object of `Cluster` calls the ICE algorithm to
    generate consensus isoforms.
    """
    def __init__(self,
                 root_dir,
                 flnc_fa,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 out_fa,
                 sge_opts,
                 ice_opts,
                 hq_isoforms_fa=None,
                 hq_isoforms_fq=None,
                 lq_isoforms_fa=None,
                 lq_isoforms_fq=None,
                 report_fn=None,
                 summary_fn=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn)

        self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
            flnc_fa, nfl_fa, ccs_fofn)

        self.root_dir, self.out_fa = self._validateOutputs(root_dir, out_fa)

        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq

        self.sge_opts = sge_opts  # SGE, CPU options and etc
        self.ice_opts = ice_opts  # The ICE clutering algorithm options

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
                else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
                else op.join(self.root_dir, "cluster_summary.txt")

        self.summary = ClusterSummary()

        self.add_log("Finishing create Cluster Object.", level=logging.INFO)

    def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None or nfl_fa is None:
            raise ClusterException(
                "Input non-chimeric reads files needs to be specified.")
        else:
            flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=flnc_fa))
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=nfl_fa))
            if ccs_fofn is not None and not op.exists(ccs_fofn):
                raise ClusterException("Unable to find FOFN file: " +
                                       "{fn}".format(fn=ccs_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn)

    def _validateOutputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log(
                "Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException(
                "Consensus FASTA file {f} already exists.".format(f=out_fa))
        return root_dir, out_fa

    def sanity_check(self):
        """Do sanity check before stat to run."""
        errMsg = ""
        if self.ice_opts.quiver is True:
            if self.bas_fofn is None:
                errMsg = "A fofn of bas/bax.h5 files, e.g., input.fofn, " + \
                         "is required in order to polish consensus " + \
                         "isoforms using quiver."
            if self.nfl_fa is None:
                errMsg = "Non-full-length reads are required for polishing " + \
                         "consensus isoforms using quiver."
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    @property
    def configFN(self):
        """Return configuration file of the current run."""
        return op.join(self.root_dir, "run_ice_config.txt")

    def _logConfigs(self):
        """Log configuration."""
        with open(self.configFN, 'w', 0) as f:
            f.write('pbtranscript ' + get_version() + "\n")
            f.write(str(self.ice_opts) + "\n")
            f.write(str(self.sge_opts) + "\n")

    @property
    def initPickleFN(self):
        """Return path to pickle file with initial clusters."""
        return op.join(self.root_dir, "init.uc.pickle")

    def _setProbQV(self, ccs_fofn=None, firstSplitFa=None):
        """Set probability and quality values.
        If a fofn file is specified, load QV from it. Otherwise, use
        a pre-defined probability model."""
        if ccs_fofn is None:
            self.add_log("Loading predefined probabilities model.")
            self._probqv = ProbFromModel(0.01, 0.07, 0.06)
        else:
            self.add_log("Loading probabilities and QV from {f}.".format(
                f=firstSplitFa))
            self._probqv = ProbFromQV(ccs_fofn, firstSplitFa)

    def writeSummary(self, fa, summary_fn, hq_fa=None, lq_fa=None):
        """Extract number of consensus isoforms predicted, and total
        number of bases in all consensuus isoforms from fa and write
        the two attributes to summary_fn.

        if hq_fa (polished high-quality isoforms) is not None, report
            the number of polished hq clusters
        if lq_fa (polished high-quality isoforms) is not None, report
            the number of polished hq clusters
        """
        try:
            with FastaReader(fa) as reader:
                for r in reader:
                    self.summary.numConsensusIsoforms += 1
                    self.summary.numTotalBases += len(r.sequence)
            if hq_fa is not None:
                self.summary.num_polished_hq_isoforms = 0
                with FastaReader(hq_fa) as reader:
                    for r in reader:
                        self.summary.num_polished_hq_isoforms += 1
            if lq_fa is not None:
                self.summary.num_polished_lq_isoforms = 0
                with FastaReader(lq_fa) as reader:
                    for r in reader:
                        self.summary.num_polished_lq_isoforms += 1
            self.summary.write(summary_fn)
        except ZeroDivisionError:
            errMsg = "No consensus isoforms predicted."
            self.add_log(errMsg, level=logging.ERROR)
            raise ClusterException(errMsg)

    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # Set up probabbility and quality value model
        self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=firstSplit)

        # Initialize cluster by clique
        # check if init.pickle already exists, if so, no need to run IceInit
        if os.path.exists(self.initPickleFN):
            self.add_log("{0} already exists. Reading to get uc.".format(
                self.initPickleFN),
                         level=logging.INFO)
            with open(self.initPickleFN) as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques.", level=logging.INFO)
            self.iceinit = IceInit(readsFa=firstSplit,
                                   qver_get_func=self._probqv.get_smoothed,
                                   ice_opts=self.ice_opts,
                                   sge_opts=self.sge_opts)
            uc = self.iceinit.uc
            # Dump uc to a file
            self.add_log(
                "Dumping initial clusters to {f}".format(f=self.initPickleFN),
                level=logging.INFO)
            with open(self.initPickleFN, 'w') as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iteratively clustering.", level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv)
        self.icec.run()
        clean_up_after_ICE(self.root_dir)

        # IceIterative done, write predicted (unplished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.")
            ln(src=self.icec.report_fn, dst=self.report_fn)

            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)
        else:  # self.ice_opts.quiver is True
            #TODO review code
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              hq_isoforms_fa=self.hq_isoforms_fa,
                              hq_isoforms_fq=self.hq_isoforms_fq,
                              lq_isoforms_fa=self.lq_isoforms_fa,
                              lq_isoforms_fq=self.lq_isoforms_fq,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts)
            self.pol.run()

            # cluster report
            self.add_log("Creating a link to cluster report.")
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Write a summary.
            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa,
                              summary_fn=self.summary_fn,
                              hq_fa=self.pol.icepq.quivered_good_fa,
                              lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0