# Example #1
# 0
    def __init__(self,
                 input_fasta,
                 input_fastq,
                 ref_fasta,
                 out_pickle,
                 done_filename,
                 ice_opts,
                 cpus=4,
                 tmp_dir=None):
        """Store run parameters and initialize the probQV model.

        input_fastq may be None, in which case a predefined QV model is
        used instead of per-read QVs parsed from the FASTQ file.
        """
        self.input_fasta = input_fasta
        self.input_fastq = input_fastq  # could be None
        self.ref_fasta = ref_fasta
        self.out_pickle = out_pickle
        self.done_filename = done_filename
        self.ice_opts = ice_opts
        self.cpus = cpus
        self.tmp_dir = tmp_dir

        # Load QVs from the FASTQ when given, otherwise fall back to the
        # predefined model; time the load for the log.
        t_begin = time.time()
        if self.input_fastq is None:
            self.probqv, msg = set_probqv_from_model()
        else:
            self.probqv, msg = set_probqv_from_fq(self.input_fastq)
        logging.info("Reading probQV from {0} took {1:.1f} sec.".format(
            self.input_fastq,
            time.time() - t_begin))
# Example #2
# 0
def run_IceInit2(readsFa, out_pickle, ice_opts, sge_opts, readsFq=None):
    """Run IceInit2 clique-based initial clustering and pickle the result.

    Parameters:
      readsFa -- input FASTA of reads to cluster
      out_pickle -- path where the initial cluster dict (uc) is dumped
      ice_opts, sge_opts -- ICE / SGE option objects passed through to IceInit2
      readsFq -- optional FASTQ; when None, a predefined QV model is used
                 instead of per-read QVs

    Returns the initial cluster dict (uc).
    """
    # QV source: real QVs from the FASTQ when available, else the model.
    if readsFq is None:
        _probqv, msg = set_probqv_from_model()
    else:
        _probqv, msg = set_probqv_from_fq(readsFq)

    qver_get_func = _probqv.get_smoothed
    qvmean_get_func = _probqv.get_mean

    i = IceInit2(readsFa, qver_get_func, qvmean_get_func, ice_opts, sge_opts)

    # Pickle streams must be written in binary mode: text mode ('w') corrupts
    # the output on Windows and fails outright under Python 3.
    with open(out_pickle, 'wb') as f:
        dump(i.uc, f)

    return i.uc
def pickup_icec_job(pickle_filename, root_dir, flnc_filename, fasta_files_to_add, fastq_files_to_add):
    """Resume an interrupted IceIterative2 (ICE clustering) job from a pickle.

    Validates/repairs the pickle, reloads the clustering state, re-runs gcon
    to regenerate all cluster refs, sanity-checks the state, then resumes the
    iterative clustering run.  (Python 2 only: uses ``print >>`` syntax.)
    """
    # Validate the pickle first; may return a different (fixed-up) filename.
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(pickle_filename, root_dir, fasta_files_to_add, fastq_files_to_add)
    # Original per-read QVs are not available at pickup; use the model.
    probqv, msg = set_probqv_from_model()

    icec = ice.IceIterative2.from_pickle(icec_pickle_filename, probqv)

    # first must RE-RUN gcon to get all the proper refs
    icec.changes = set()
    icec.refs = {}
    icec.all_fasta_filename = flnc_filename
    todo = icec.uc.keys()  # regenerate a ref for every cluster id
    print >> sys.stderr, "Re-run gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
# Example #4
# 0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Pipeline:
          1. Split flnc_fa into smaller FASTA files.
          2. Build a probQV model (from ccs.h5 multi-QVs, a converted FASTQ,
             or a predefined model).
          3. IceInit: initial clusters via maximal cliques on the first split.
          4. IceIterative: iterative clustering over all splits.
          5. Without quiver: link the cluster report and summarize; with
             quiver: polish via IcePolish first, then report and summarize.

        Returns 0 on completion.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            # Targeted runs: keep the first split tiny so IceInit stays fast,
            # and enlarge the remaining splits.
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv: # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else: # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn,
                    fq=first_split_fq), level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first splitted FASTQ file.
                self._probqv, msg = set_probqv_from_fq(fastq_filename=first_split_fq)
            else: # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file: JSON when the target name says so, else pickle.
        self.add_log("Dumping initial clusters to {f}"
                     .format(f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(
                fasta_file=self.icec.final_consensus_fa,
                xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report (from the IceQuiver stage inside Polish)
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Close the log file.
        self.close_log()
        return 0
# Example #5
# 0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Pipeline:
          1. Split flnc_fa into smaller FASTA files.
          2. Build a probQV model (from ccs.h5 multi-QVs, a converted FASTQ,
             or a predefined model).
          3. IceInit: initial clusters via maximal cliques on the first split.
          4. IceIterative: iterative clustering over all splits.
          5. Without quiver: link the cluster report and summarize; with
             quiver: polish via IcePolish first, then report and summarize.

        Returns 0 on completion.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            # Targeted runs: keep the first split tiny so IceInit stays fast,
            # and enlarge the remaining splits.
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:  # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else:  # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq),
                             level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa,
                          ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first splitted FASTQ file.
                self._probqv, msg = set_probqv_from_fq(
                    fastq_filename=first_split_fq)
            else:  # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file: JSON when the target name says so, else pickle.
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(fasta_file=self.icec.final_consensus_fa,
                                    xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report (from the IceQuiver stage inside Polish)
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Close the log file.
        self.close_log()
        return 0