Example #1
0
    def run(self):
        """Split self.nfl_fa into chunks and touch expected chunks not produced.

        The split layout comes from self.validate_inputs(); the fasta is then
        split with splitFasta(). Every entry of the "todo" list returned by
        validate_inputs() that splitFasta() did not report is passed to
        touch().
        """
        logging.debug("root_dir: {d}.".format(d=self.root_dir))
        logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
        logging.debug("Total number of chunks: N={N}.".format(N=self.N))

        # Validate input files and unpack the split layout.
        validated = self.validate_inputs()
        num_reads, reads_per_split, nfl_dir, expected_chunks = validated

        logging.info("Total number of reads is {n}.".format(n=num_reads))
        split_msg = "Splitting nfl_fa into chunks each "
        split_msg += "containing {n} reads.".format(n=reads_per_split)
        logging.info(split_msg)

        produced_chunks = splitFasta(
            input_fasta=real_ppath(self.nfl_fa),
            reads_per_split=reads_per_split,
            out_dir=nfl_dir,
            out_prefix="input.split")

        logging.info("Splitted files are: " + "\n".join(produced_chunks))

        # Expected chunks that the split did not report, in their original order.
        missing_chunks = [chunk for chunk in expected_chunks
                          if chunk not in produced_chunks]
        for chunk in missing_chunks:
            logging.info("touching {f}".format(f=chunk))
            touch(chunk)
    def run(self):
        """Split the nfl fasta and touch every planned chunk the split did not emit."""
        logging.debug("root_dir: {d}.".format(d=self.root_dir))
        logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
        logging.debug("Total number of chunks: N={N}.".format(N=self.N))

        # Validate input files; the result is a 4-tuple describing the split.
        num_reads, per_chunk, chunk_dir, planned = self.validate_inputs()

        logging.info("Total number of reads is {n}.".format(n=num_reads))
        logging.info(
            "Splitting nfl_fa into chunks each "
            + "containing {n} reads.".format(n=per_chunk)
        )

        source_fasta = real_ppath(self.nfl_fa)
        written = splitFasta(
            input_fasta=source_fasta,
            reads_per_split=per_chunk,
            out_dir=chunk_dir,
            out_prefix="input.split",
        )

        logging.info("Splitted files are: " + "\n".join(written))

        for planned_fa in planned:
            # Chunks reported by splitFasta() are left untouched.
            if planned_fa in written:
                continue
            logging.info("touching {f}".format(f=planned_fa))
            touch(planned_fa)
Example #3
0
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None

        The stages run strictly in this order: get_sa_file(),
        convert_fofn_to_fasta(), splitFasta(), IceAllPartials.run(),
        IceQuiver.run(), IcePostQuiver.run(). The three stage objects are
        kept on self as icep, iceq and icepq.
        """
        # Create final.consensus.fa.sa
        self.add_log("Generating suffix array for {f}".format(
            f=self.final_consensus_sa), level=logging.INFO)
        # Called for its effect only: the value it returns used to be stored
        # in a local that was overwritten before being read (see the sa_file
        # assignment further down), so it is no longer bound to a name.
        self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            # Default location when the caller supplied no fasta fofn.
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files
        # written under self.nfl_dir with the prefix "input.split".
        reads_per_split = self.ice_opts.nfl_reads_per_split
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        # NOTE(review): the existence test is on the consensus *fasta*, not on
        # the suffix array file itself -- confirm this is intended.
        sa_file = self.final_consensus_sa \
            if op.exists(self.final_consensus_fa) else None
        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   sa_file=sa_file,
                                   ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        # Polish the clusters.
        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        # Sort polished isoforms into the HQ / LQ output files.
        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)
Example #4
0
    def run(self):
        """
        Assign non-full-length (nfl) reads to clusters, polish the clusters
        and post-process the polished isoforms.

        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        # No suffix array is generated in this version; IceAllPartials is
        # given sa_file=None below.

        # Create input.fasta.fofn from bas_fofn, unless it is already there.
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        if not op.exists(self.fasta_fofn):
            convert_fofn_to_fasta(
                fofn_filename=self.bas_fofn,
                out_filename=self.fasta_fofn,
                fasta_out_dir=self.nfl_dir,
                cpus=self.sge_opts.blasr_nproc)
        else:
            self.add_log("No need to run convert_fofn_to_fasta.")

        # Split non-full-length reads into smaller fasta files written
        # under self.nfl_dir with the prefix "input.split".
        split_msg = ("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.nfl_reads_per_split))
        self.add_log(split_msg, level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        self.add_log(
            "Splitted files are: " + "\n".join(self._nfl_splitted_fas),
            level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        dazz_ref = DazzIDHandler(self.final_consensus_fa, False)
        DalignerRunner.make_db(dazz_ref.dazz_filename)
        self.add_log("Dazz DB made for: " + dazz_ref.dazz_filename,
                     level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)
        # since we are switching to daligner, sa_file is just given as None
        # for now; remove sa_file completely later when daligner is mature
        # (ToDo)
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=None,
            ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        # Polish the clusters.
        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(
            root_dir=self.root_dir,
            bas_fofn=self.bas_fofn,
            fasta_fofn=self.fasta_fofn,
            sge_opts=self.sge_opts)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        # Post-process the polished isoforms.
        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(
            root_dir=self.root_dir,
            use_sge=self.sge_opts.use_sge,
            quit_if_not_done=False,
            ipq_opts=self.ipq_opts)
        self.add_log(
            "IceQuiverPostprocess log: {f}.".format(f=self.icepq.log_fn),
            level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #5
0
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None

        Side effects on self: fasta_fofn is filled in when it was None, and
        _nfl_splitted_fas, icep, iceq and icepq are (re)assigned.
        """
        # Create final.consensus.fa.sa
        self.add_log("Generating suffix array for {f}".format(
                     f=self.final_consensus_sa), level=logging.INFO)
        # Only the call matters here. Its result was previously bound to
        # sa_file and then overwritten below without ever being read, so the
        # dead assignment has been dropped.
        self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            # Fall back to a fofn inside the nfl working directory.
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files
        # written under self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        # NOTE(review): existence is checked on final_consensus_fa while the
        # value passed on is final_consensus_sa -- confirm this is intended.
        if op.exists(self.final_consensus_fa):
            sa_file = self.final_consensus_sa
        else:
            sa_file = None
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=sa_file,
            ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        # Polish the clusters.
        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        # Sort polished isoforms into the HQ / LQ output files.
        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)
Example #6
0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Steps, in order:
          1. Split self.flnc_fa into chunks with splitFasta(). When
             self.ice_opts.targeted_isoseq is set, flnc_reads_per_split is
             overwritten with 10000 and first_split=1000 is passed along.
          2. Convert the first chunk to fastq and set up the probability /
             QV model from either the ccs fofn or that fastq.
          3. Load the initial clusters from self.initPickleFN if it exists;
             otherwise build them with IceInit and pickle them there.
          4. Run IceIterative and link its final consensus to self.out_fa.
          5. Unless self.ice_opts.quiver is True, link the cluster report
             and write the summary; otherwise run Polish first and include
             its good/bad quivered fasta files in the summary.

        Returns:
            0, after the log has been closed.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            first_split = 1000
            # Deliberately mutates the shared options object.
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log(
                "targeted_isoseq: further splitting JUST first split to 1000. Changing flnc_reads_per_split=10000."
            )
        else:
            first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log(
            "Splitting {flnc} into ".format(flnc=self.flnc_fa)
            + "smaller files each containing {n} reads.".format(n=self.ice_opts.flnc_reads_per_split),
            level=logging.INFO,
        )

        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            first_split=first_split,
        )

        self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # splitext() leaves a name without an extension intact; slicing at
        # rfind(".") would chop its last character (rfind returns -1) and
        # would also cut at a dot that belongs to a directory name.
        firstSplit_fq = os.path.splitext(firstSplit)[0] + ".fastq"
        self.add_log(
            "Converting first split file {0} + {1} into fastq\n".format(firstSplit, self.ccs_fofn), level=logging.INFO
        )
        # Convert this into FASTQ
        ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, firstSplit)
        else:
            self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        if os.path.exists(self.initPickleFN):
            self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=logging.INFO)
            # Pickle streams are read and written in binary mode so the bytes
            # are never newline-translated.
            # NOTE: unpickling trusts the file content; initPickleFN is
            # assumed to be a file written by a previous run of this method.
            with open(self.initPickleFN, "rb") as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques: initializing IceInit.", level=logging.INFO)
            self.iceinit = IceInit(
                readsFa=firstSplit,
                qver_get_func=self._probqv.get_smoothed,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                qvmean_get_func=self._probqv.get_mean,
            )
            uc = self.iceinit.uc

            # Dump uc to a file so a later run can skip IceInit.
            self.add_log("Dumping initial clusters to {f}".format(f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, "wb") as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.", level=logging.INFO)

        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=firstSplit_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv,
        )
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        # The identity test is kept on purpose: only the literal True
        # enables polishing.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.", level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.", level=logging.INFO)
            self.pol = Polish(
                root_dir=self.root_dir,
                nfl_fa=self.nfl_fa,
                bas_fofn=self.bas_fofn,
                ccs_fofn=self.ccs_fofn,
                fasta_fofn=self.fasta_fofn,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                ipq_opts=self.ipq_opts,
                nfl_reads_per_split=self.nfl_reads_per_split,
            )
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.", level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(
                summary_fn=self.summary_fn,
                isoforms_fa=self.out_fa,
                hq_fa=self.pol.icepq.quivered_good_fa,
                lq_fa=self.pol.icepq.quivered_bad_fa,
            )

        # Create log file.
        self.close_log()
        return 0
Example #7
0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Splits self.flnc_fa into chunks, converts the first chunk to fastq,
        sets up the probability / QV model, builds the initial clusters with
        IceInit (pickled to self.initPickleFN), runs IceIterative and links
        its final consensus to self.out_fa. Unless self.ice_opts.quiver is
        True the cluster report is linked and the summary written directly;
        otherwise Polish is run first and its good/bad quivered fasta files
        are included in the summary.

        Returns:
            0, after the log has been closed.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # rfind() returns -1 when the name contains no '.', and slicing with
        # -1 would silently drop the last character; only strip a suffix
        # that is actually there.
        dot_pos = firstSplit.rfind('.')
        fq_stem = firstSplit[:dot_pos] if dot_pos != -1 else firstSplit
        firstSplit_fq = fq_stem + '.fastq'
        self.add_log("Converting first split file {0} + {1} into fastq\n".format(
            firstSplit, self.ccs_fofn), level=logging.INFO)
        # Convert this into FASTQ
        ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, firstSplit)
        else:
            self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=firstSplit,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file. Binary mode keeps the pickle stream free of any
        # newline translation.
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=logging.INFO)
        with open(self.initPickleFN, 'wb') as f:
            cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=firstSplit_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        # The identity test is kept on purpose: only the literal True
        # enables polishing.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              nfl_reads_per_split=self.nfl_reads_per_split)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0
Example #8
0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Splits self.flnc_fa into chunks, sets up the probability / QV model
        from the first chunk, obtains the initial clusters (from
        self.initPickleFN when that file exists, otherwise from IceInit,
        whose result is then pickled there), runs IceIterative, calls
        clean_up_after_ICE() and links the final consensus to self.out_fa.
        Unless self.ice_opts.quiver is True the cluster report is linked and
        the summary written directly; otherwise Polish is run first and its
        good/bad quivered fasta files are included in the summary.

        Returns:
            0, after the log has been closed.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # Set up probability and quality value model
        self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=firstSplit)

        # Initialize cluster by clique
        # check if init.pickle already exists, if so, no need to run IceInit
        if os.path.exists(self.initPickleFN):
            self.add_log("{0} already exists. Reading to get uc.".format(self.initPickleFN), level=logging.INFO)
            # Pickle streams are read and written in binary mode so the bytes
            # are never newline-translated.
            # NOTE: unpickling trusts the file content; initPickleFN is
            # assumed to be a file written by a previous run of this method.
            with open(self.initPickleFN, 'rb') as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques.", level=logging.INFO)
            self.iceinit = IceInit(readsFa=firstSplit,
                                   qver_get_func=self._probqv.get_smoothed,
                                   ice_opts=self.ice_opts,
                                   sge_opts=self.sge_opts)
            uc = self.iceinit.uc
            # Dump uc to a file so a later run can skip IceInit.
            self.add_log("Dumping initial clusters to {f}".format(
                         f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, 'wb') as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iteratively clustering.", level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv)
        self.icec.run()
        clean_up_after_ICE(self.root_dir)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        # The identity test is kept on purpose: only the literal True
        # enables polishing.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.")
            ln(src=self.icec.report_fn, dst=self.report_fn)

            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)
        else:  # self.ice_opts.quiver is True
            #TODO review code
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              hq_isoforms_fa=self.hq_isoforms_fa,
                              hq_isoforms_fq=self.hq_isoforms_fq,
                              lq_isoforms_fa=self.lq_isoforms_fa,
                              lq_isoforms_fq=self.lq_isoforms_fq,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts)
            self.pol.run()

            # cluster report
            self.add_log("Creating a link to cluster report.")
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Write a summary.
            self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                         level=logging.INFO)
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn,
                              hq_fa=self.pol.icepq.quivered_good_fa,
                              lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0
Example #9
0
    def run(self):
        """Prepare ICE clustering and return the configured IceIterative.

        Splits self.flnc_fa into chunks, converts the first chunk to fastq,
        sets up the probability / QV model, obtains the initial clusters
        (from self.initPickleFN when that file exists, otherwise from
        IceInit, whose result is then pickled there) and constructs
        IceIterative. IceIterative.run() is NOT called here.

        Returns:
            The IceIterative instance, also stored as self.icec.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        firstSplit = self._flnc_splitted_fas[0]
        # splitext() leaves a name without an extension intact; slicing at
        # rfind('.') would chop its last character (rfind returns -1) and
        # would also cut at a dot that belongs to a directory name.
        firstSplit_fq = os.path.splitext(firstSplit)[0] + '.fastq'
        self.add_log("Converting first split file {0} + {1} into fastq\n".format(
            firstSplit, self.ccs_fofn), level=logging.INFO)
        # Convert this into FASTQ
        ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, firstSplit)
        else:
            self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        if os.path.exists(self.initPickleFN):
            self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=logging.INFO)
            # Pickle streams are read and written in binary mode so the bytes
            # are never newline-translated.
            # NOTE: unpickling trusts the file content; initPickleFN is
            # assumed to be a file written by a previous run of this method.
            with open(self.initPickleFN, 'rb') as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques: initializing IceInit.",
                         level=logging.INFO)
            self.iceinit = IceInit(readsFa=firstSplit,
                                   qver_get_func=self._probqv.get_smoothed,
                                   ice_opts=self.ice_opts,
                                   sge_opts=self.sge_opts)
            uc = self.iceinit.uc

            # Dump uc to a file so a later run can skip IceInit.
            self.add_log("Dumping initial clusters to {f}".format(
                         f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, 'wb') as f:
                cPickle.dump(uc, f)

        # Set up IceIterative; running it is left to the caller.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=firstSplit,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=firstSplit_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))

        return self.icec
Example #10
0
    def run(self):
        """Cluster consensus isoforms with ICE and optionally polish them.

        Workflow:
          * split self.flnc_fa into chunk files under self.root_dir;
          * set up the probability / QV model from the first chunk;
          * obtain the initial clusters, either from the cached pickle
            self.initPickleFN or by running IceInit (and caching the result);
          * run IceIterative and clean up after it;
          * link the unpolished consensus fasta to self.out_fa;
          * when ice_opts.quiver is True, run Polish; then link the cluster
            report and write a summary (with HQ/LQ fasta only if polished).

        Returns:
            0, after the log has been closed.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        # Chunk flnc_fa; the chunk paths are remembered on the instance.
        split_msg = ("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split))
        self.add_log(split_msg, level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
        )
        self.add_log(
            "Splitted files are: " + "\n".join(self._flnc_splitted_fas),
            level=logging.INFO)

        # The first chunk seeds both the QV model and the initial clusters.
        seed_fa = self._flnc_splitted_fas[0]
        self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=seed_fa)

        # Initial clusters: compute with IceInit unless a cached pickle exists.
        if not os.path.exists(self.initPickleFN):
            self.add_log("Finding maximal cliques.", level=logging.INFO)
            self.iceinit = IceInit(
                readsFa=seed_fa,
                qver_get_func=self._probqv.get_smoothed,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
            )
            uc = self.iceinit.uc
            # Cache the clusters so a later run can skip IceInit.
            self.add_log(
                "Dumping initial clusters to {f}".format(f=self.initPickleFN),
                level=logging.INFO)
            with open(self.initPickleFN, 'w') as pickle_out:
                cPickle.dump(uc, pickle_out)
        else:
            self.add_log(
                "{0} already exists. Reading to get uc.".format(
                    self.initPickleFN),
                level=logging.INFO)
            with open(self.initPickleFN) as pickle_in:
                uc = cPickle.load(pickle_in)

        # Iterative clustering over the seed chunk plus all remaining chunks.
        self.add_log("Iteratively clustering.", level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=seed_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
        )
        self.icec.run()
        clean_up_after_ICE(self.root_dir)

        # Expose the predicted (unpolished) consensus isoforms as out_fa.
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Polishing happens only when quiver is exactly True.
        polishing = self.ice_opts.quiver is True
        if polishing:
            # TODO review code
            self.pol = Polish(
                root_dir=self.root_dir,
                nfl_fa=self.nfl_fa,
                bas_fofn=self.bas_fofn,
                ccs_fofn=self.ccs_fofn,
                hq_isoforms_fa=self.hq_isoforms_fa,
                hq_isoforms_fq=self.hq_isoforms_fq,
                lq_isoforms_fa=self.lq_isoforms_fa,
                lq_isoforms_fq=self.lq_isoforms_fq,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
            )
            self.pol.run()

        # Cluster report: taken from the polishing step when it ran,
        # otherwise from IceIterative.
        self.add_log("Creating a link to cluster report.")
        if polishing:
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)
        else:
            ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summary: HQ/LQ fasta files are passed only after polishing.
        self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                     level=logging.INFO)
        if polishing:
            self.writeSummary(fa=self.out_fa,
                              summary_fn=self.summary_fn,
                              hq_fa=self.pol.icepq.quivered_good_fa,
                              lq_fa=self.pol.icepq.quivered_bad_fa)
        else:
            self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)

        # Finish logging and report success.
        self.close_log()
        return 0