Code Example #1
def run_main(chunk_json, contigset_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Be lenient: accept chunk keys given without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warning("Prepended '$chunk.' to chunk key; using '%s'", chunk_key)

    fasta_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked consensus isoforms files are %s.", (', '.join(fasta_files)))

    out_fa = CombinedFiles(combined_dir=op.dirname(contigset_output)).all_consensus_isoforms_fa
    combine_consensus_isoforms(split_indices=range(0, len(fasta_files)),
                               split_files=fasta_files,
                               combined_consensus_isoforms_fa=out_fa)
    log.info("Combining files to %s.", out_fa)

    log.info("Writing contigset %s", contigset_output)
    assert contigset_output.endswith('xml')
    as_contigset(out_fa, contigset_output)

    #cs = ContigSet(*fasta_files)
    #cs.newUuid()
    #cs.write(contigset_output)
    return 0
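
The detail most callers trip over in run_main above is the '$chunk.' prefix normalization. A minimal standalone sketch of just that behaviour (the helper name and key values below are illustrative, not part of the project):

def normalize_chunk_key(chunk_key):
    """Accept chunk keys given with or without the '$chunk.' prefix."""
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
    return chunk_key

assert normalize_chunk_key('fasta_id') == '$chunk.fasta_id'
assert normalize_chunk_key('$chunk.fasta_id') == '$chunk.fasta_id'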
Code Example #2
    def make_flnc(in_flnc, root_dir):
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)

        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")

        print "new_flnc = %s" % new_flnc
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)
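
A hedged usage sketch for make_flnc above: it expects the input contigset's sibling FASTA to exist and the destination bin directory to be present under root_dir. All paths below are hypothetical.

import os
import os.path as op

root_dir = "/scratch/flnc_bins"                           # hypothetical destination root
in_flnc = "/data/0to1kb_part0/isoseq_flnc.contigset.xml"  # hypothetical input contigset

# make_flnc writes into root_dir/<bin name>/, so create that directory first.
bin_dir = op.join(root_dir, op.basename(op.dirname(in_flnc)))
if not op.isdir(bin_dir):
    os.makedirs(bin_dir)

make_flnc(in_flnc, root_dir)  # copies the .fasta and writes the new contigset.xml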
Code Example #3
    def make_flnc(in_flnc, root_dir):
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)

        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")

        print "new_flnc = %s" % new_flnc
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)
Code Example #4
    def __exit__(self, exc_type, exc_value, traceback):
        """
        Close all fasta file handles.
        If create_contigset is True, convert out_fasta_files to out_contigset_files.
        """
        # close fasta file handlers
        for f in self.handles.itervalues():
            f.close()

        if self.create_contigset is True:
            for fasta_fn, xml_fn in zip(self.out_fasta_files, self.out_contigset_files):
                as_contigset(fasta_fn, xml_fn)

        # write out_pickle
        self.write_pickle()
Code Example #5
File: separate_flnc.py Project: lpp1985/lpp_Script
    def __exit__(self, exc_type, exc_value, traceback):
        """
        Close all fasta file handles.
        If create_contigset is True, convert out_fasta_files to out_contigset_files.
        """
        # close fasta file handlers
        for f in self.handles.itervalues():
            f.close()

        if self.create_contigset is True:
            for fasta_fn, xml_fn in zip(self.out_fasta_files, self.out_contigset_files):
                as_contigset(fasta_fn, xml_fn)

        # write out_pickle
        self.write_pickle()
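
Examples #4 and #5 show only the __exit__ half of a splitter class from separate_flnc.py. A self-contained sketch of the same pattern (a context manager that closes its FASTA handles on exit and then converts each FASTA into a contigset) could look like the following; the class and attribute names are illustrative, not the project's:

class FastaBinWriter(object):
    """Illustrative only: write FASTA bins, convert them to contigsets on exit."""

    def __init__(self, out_fasta_files, create_contigset=True):
        self.out_fasta_files = list(out_fasta_files)
        self.out_contigset_files = [fn.replace(".fasta", ".contigset.xml")
                                    for fn in self.out_fasta_files]
        self.create_contigset = create_contigset
        self.handles = dict((fn, open(fn, "w")) for fn in self.out_fasta_files)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        for f in self.handles.values():  # .values() works on both Python 2 and 3
            f.close()
        if self.create_contigset:
            for fasta_fn, xml_fn in zip(self.out_fasta_files, self.out_contigset_files):
                as_contigset(fasta_fn, xml_fn)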
Code Example #6
File: Classifier.py Project: lpp1985/lpp_Script
    def run(self):
        """Classify/annotate reads according to 5' primer seen,
        3' primer seen, polyA seen, chimera (concatenation of two
        or more transcripts with primers seen in the middle of
        a read)
        (1) Create and validate input/output
        (2) Check phmmer is runnable
        (3) Find primers using phmmer and trim away primers and polyAs
        (4) Detect chimeras from trimmed reads
        """
        # Validate input files and required data files.
        self._validate_inputs(self.reads_fn, self.primer_fn, self.pbmatrix_fn)

        # Validate and create output dir.
        self._validate_outputs(self.out_dir, self.out_all_reads_fn_fasta)

        # Sanity check phmmer can be called successfully.
        self._checkPhmmer()

        # Find and trim primers and polyAs.
        self.runPrimerTrimmer()

        # Check whether any full-length (FL) reads were detected.
        no_flnc_errMsg = "No full-length non-chimeric reads detected."
        if self.summary.num_fl == 0:
            logging.error(no_flnc_errMsg)
            if not self.ignore_empty_output:
                raise ClassifierException(no_flnc_errMsg)
        else:
            # Detect chimeras and generate primer reports.
            self.runChimeraDetector()

        dataset_uuids = []
        for file_attr in ["out_nfl_fn", "out_nflnc_fn", "out_nflc_fn",
                          "out_flnc_fn", "out_flc_fn", "out_all_reads_fn"]:
            file_name = getattr(self, file_attr)
            fasta_file_name = getattr(self, "%s_fasta" % file_attr)
            ds = as_contigset(
                fasta_file=fasta_file_name,
                xml_file=file_name)
            if file_attr in ["out_all_reads_fn", "out_nfl_fn", "out_flnc_fn"]:
                dataset_uuids.append(ds.uuid)

        try:
            # Write summary.
            logging.info("Writing report to {f}".format(f=self.summary_fn))
            self.summary.write(self.summary_fn, dataset_uuids=dataset_uuids)
        except ZeroDivisionError:
            logging.error(no_flnc_errMsg)
            raise ClassifierException(no_flnc_errMsg)

        return 0
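
The loop over file_attr above relies on a naming convention: for every "out_*_fn" dataset attribute there is a matching "out_*_fn_fasta" attribute holding the FASTA path. A tiny illustration of that getattr pairing (class and file names are hypothetical):

class _Outputs(object):
    out_flnc_fn = "isoseq_flnc.contigset.xml"   # hypothetical paths
    out_flnc_fn_fasta = "isoseq_flnc.fasta"

obj = _Outputs()
for file_attr in ["out_flnc_fn"]:
    xml_file = getattr(obj, file_attr)
    fasta_file = getattr(obj, "%s_fasta" % file_attr)
    print("%s -> %s" % (fasta_file, xml_file))  # as_contigset(fasta_file, xml_file) would go here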
Code Example #7
File: Classifier.py Project: lpp1985/lpp_Script
    def run(self):
        """Classify/annotate reads according to 5' primer seen,
        3' primer seen, polyA seen, chimera (concatenation of two
        or more transcripts with primers seen in the middle of
        a read)
        (1) Create and validate input/output
        (2) Check phmmer is runnable
        (3) Find primers using phmmer and trim away primers and polyAs
        (4) Detect chimeras from trimmed reads
        """
        # Validate input files and required data files.
        self._validate_inputs(self.reads_fn, self.primer_fn, self.pbmatrix_fn)

        # Validate and create output dir.
        self._validate_outputs(self.out_dir, self.out_all_reads_fn_fasta)

        # Sanity check phmmer can be called successfully.
        self._checkPhmmer()

        # Find and trim primers and polyAs.
        self.runPrimerTrimmer()

        # Check whether any full-length (FL) reads were detected.
        no_flnc_errMsg = "No full-length non-chimeric reads detected."
        if self.summary.num_fl == 0:
            logging.error(no_flnc_errMsg)
            if not self.ignore_empty_output:
                raise ClassifierException(no_flnc_errMsg)
        else:
            # Detect chimeras and generate primer reports.
            self.runChimeraDetector()

        dataset_uuids = []
        for file_attr in [
                "out_nfl_fn", "out_nflnc_fn", "out_nflc_fn", "out_flnc_fn",
                "out_flc_fn", "out_all_reads_fn"
        ]:
            file_name = getattr(self, file_attr)
            fasta_file_name = getattr(self, "%s_fasta" % file_attr)
            ds = as_contigset(fasta_file=fasta_file_name, xml_file=file_name)
            if file_attr in ["out_all_reads_fn", "out_nfl_fn", "out_flnc_fn"]:
                dataset_uuids.append(ds.uuid)

        try:
            # Write summary.
            logging.info("Writing report to {f}".format(f=self.summary_fn))
            self.summary.write(self.summary_fn, dataset_uuids=dataset_uuids)
        except ZeroDivisionError:
            logging.error(no_flnc_errMsg)
            raise ClassifierException(no_flnc_errMsg)

        return 0
Code Example #8
def args_runner(args):
    """Run given input args"""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == ".contigset.xml": # make contigset from fasta
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" % args.collapsed_isoforms)
    return 0
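
The suffix check above assumes parse_ds_filename splits a dataset filename into a prefix and its suffix. A hedged sketch of one plausible behaviour (the real helper may differ, for instance in whether the suffix keeps its leading dot; note this example compares against ".contigset.xml" while examples #13 and #15 compare against "contigset.xml"):

def parse_ds_filename_sketch(filename):
    """Plausible behaviour only: 'out.contigset.xml' -> ('out', 'contigset.xml')."""
    known_suffixes = ("contigset.xml", "fasta", "fastq")
    for suffix in known_suffixes:
        if filename.endswith("." + suffix):
            return filename[:-(len(suffix) + 1)], suffix
    raise ValueError("Unrecognized dataset filename: %s" % filename)

assert parse_ds_filename_sketch("out.contigset.xml") == ("out", "contigset.xml")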
Code Example #9
def args_runner(args):
    """Run given input args"""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == ".contigset.xml":  # make contigset from fasta
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" %
                              args.collapsed_isoforms)
    return 0
Code Example #10
    def test_as_contigset(self):
        """Test as_contigset"""
        out_dir = op.join(OUT_DIR, 'test_Utils')
        mknewdir(out_dir)
        fa = op.join(out_dir, "empty.fasta")
        xml = op.join(out_dir, "empty.contigset.xml")
        fai = fa + ".fai"

        execute("touch %s" % fa)
        as_contigset(fa, xml)
        self.assertTrue(op.exists(xml))
        self.assertTrue(op.exists(fai))

        fn = 'reads_of_insert.fasta'
        shutil.copy(src=op.join(DATA_DIR, fn), dst=op.join(out_dir, fn))
        fa = op.join(out_dir, fn)
        as_contigset(fa, fa)

        fai = fa + ".fai"
        xml = op.join(out_dir, 'reads_of_insert.contigset.xml')
        as_contigset(fa, xml)
        self.assertTrue(op.exists(xml))
        self.assertTrue(op.exists(fai))
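
The test above pins down the observable behaviour of as_contigset: it leaves a .fai index next to the FASTA, writes a ContigSet XML when given a distinct .xml target, and tolerates being handed the FASTA path as both arguments. A rough sketch consistent with that behaviour, built on pbcore's ContigSet (this is an assumption about the implementation, not the project's actual code, and it ignores the empty-FASTA edge case the test also exercises):

from pbcore.io import ContigSet

def as_contigset_sketch(fasta_file, xml_file):
    """Sketch only: wrap a FASTA in a ContigSet, index it, optionally write XML."""
    ds = ContigSet(fasta_file)
    ds.induceIndices()  # ensures fasta_file + ".fai" exists
    if xml_file != fasta_file and xml_file.endswith(".xml"):
        ds.newUuid()
        ds.write(xml_file)
    return ds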
Code Example #11
    def run(self):
        """
        Check all arrow jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq
        """
        self.validate_inputs()

        job_stats = self.check_arrow_jobs_completion()
        self.add_log("Arrow job status: {s}".format(s=job_stats))

        if job_stats == 'DONE':
            pass  # continue on below to process data
        elif job_stats == 'FAILED':
            self.add_log("Has incomplete jobs. Please re-run them.",
                         level=logging.ERROR)
            return -1
        elif job_stats == 'RUNNING':
            if self.quit_if_not_done:
                self.add_log(
                    "Jobs are still running. Please wait before running this script."
                )
                return 1
            else:
                while job_stats != "DONE":
                    self.add_log(
                        "Jobs are still running. Wait. Sleeping for 180 seconds."
                    )
                    sleep(180)
                    job_stats = self.check_arrow_jobs_completion()
                    if job_stats == "DONE":
                        break
                    elif job_stats == "FAILED":
                        self.add_log(
                            "There are some failed jobs. Please check.",
                            level=logging.ERROR)
                        return 1
                    elif job_stats == "RUNNING":
                        self.add_log(
                            "Jobs are still running. Wait. Sleeping for 180 seconds.",
                            level=logging.INFO)
        else:
            msg = "Unable to recognize job_stats {s}".format(s=job_stats)
            self.add_log(msg, logging.ERROR)
            raise ValueError(msg)

        # at this point, all jobs must be done and all fastq files present.
        self.pickup_best_clusters()

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.arrowed_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.arrowed_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.arrowed_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.arrowed_bad_fq, self.lq_isoforms_fq)

        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
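
The RUNNING branch above is a poll-and-sleep loop; condensed into a standalone helper it looks roughly like this (a sketch assuming check_status returns 'DONE', 'FAILED' or 'RUNNING'; names are illustrative):

import time

def wait_until_done(check_status, poll_seconds=180):
    """Poll a job-status callable until it reports DONE; raise on FAILED."""
    status = check_status()
    while status == "RUNNING":
        time.sleep(poll_seconds)
        status = check_status()
    if status == "FAILED":
        raise RuntimeError("There are some failed jobs. Please check.")
    if status != "DONE":
        raise ValueError("Unable to recognize job status %s" % status)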
Code Example #12
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"
Code Example #13
File: CollapsingUtils.py Project: lpp1985/lpp_Script
def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If the input is a FASTA file, always pick the longest sequence.
    If the input is a FASTQ file:
          If pick_least_err_instead is True, pick the one with the fewest expected base errors;
          Else, pick the longest one.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                              "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml": # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors: each Phred QV q implies error prob 10**(-q/10)
                err = sum(10**-(i/10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
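
For reference, the expected-error score used by the pick_least_err_instead branch above is the standard Phred conversion, error probability 10**(-q/10), summed over all bases of the read. A tiny worked example:

def expected_errors(quality_values):
    """Expected number of sequencing errors given per-base Phred QVs."""
    return sum(10 ** -(q / 10.0) for q in quality_values)

# A 100-base read with QV 20 (1% error probability) at every position
# is expected to contain about one erroneous base.
print(expected_errors([20] * 100))  # ~1.0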
Code Example #14
File: Cluster.py Project: lpp1985/lpp_Script
    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv: # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else: # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn,
                    fq=first_split_fq), level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first splitted FASTQ file.
                self._probqv, msg = set_probqv_from_fq(fastq_filename=first_split_fq)
            else: # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log("Dumping initial clusters to {f}"
                     .format(f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(
                fasta_file=self.icec.final_consensus_fa,
                xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0
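
The "Dump uc to a file" step above picks its serialization format from the file extension. The same pattern isolated (a sketch written, like the surrounding code, for Python 2; on Python 3 cPickle becomes pickle):

import json
try:
    import cPickle as pickle  # Python 2, as in the example above
except ImportError:
    import pickle             # Python 3

def dump_clusters(uc, out_fn):
    """Write the cluster dict `uc` as JSON or as a pickle, depending on the extension."""
    if out_fn.endswith(".json"):
        with open(out_fn, "w") as f:
            f.write(json.dumps(uc))
    else:
        with open(out_fn, "wb") as f:  # binary mode keeps pickle portable
            pickle.dump(uc, f)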
Code Example #15
def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If the input is a FASTA file, always pick the longest sequence.
    If the input is a FASTQ file:
          If pick_least_err_instead is True, pick the one with the fewest expected base errors;
          Else, pick the longest one.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq")
                               or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError(
                    "%s must contain either indexed FASTA files or "
                    "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors: each Phred QV q implies error prob 10**(-q/10)
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
Code Example #16
    def run(self):
        """
        Check all arrow jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq
        """
        self.validate_inputs()

        job_stats = self.check_arrow_jobs_completion()
        self.add_log("Arrow job status: {s}".format(s=job_stats))

        if job_stats == 'DONE':
            pass # continue on below to process data
        elif job_stats == 'FAILED':
            self.add_log("Has incomplete jobs. Please re-run them.",
                         level=logging.ERROR)
            return -1
        elif job_stats == 'RUNNING':
            if self.quit_if_not_done:
                self.add_log("Jobs are still running. Please wait before running this script.")
                return 1
            else:
                while job_stats != "DONE":
                    self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.")
                    sleep(180)
                    job_stats = self.check_arrow_jobs_completion()
                    if job_stats == "DONE":
                        break
                    elif job_stats == "FAILED":
                        self.add_log("There are some failed jobs. Please check.",
                                     level=logging.ERROR)
                        return 1
                    elif job_stats == "RUNNING":
                        self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.",
                                     level=logging.INFO)
        else:
            msg = "Unable to recognize job_stats {s}".format(s=job_stats)
            self.add_log(msg, logging.ERROR)
            raise ValueError(msg)

        # at this point, all jobs must be done and all fastq files present.
        self.pickup_best_clusters()

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.arrowed_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.arrowed_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.arrowed_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.arrowed_bad_fq, self.lq_isoforms_fq)

        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
Code Example #17
    def run(self):
        """Check all quiver jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_quivered.good|bad.fasta|fastq.
        """
        self.validate_inputs()

        job_stats = self.check_quiver_jobs_completion()
        self.add_log("quiver job status: {s}".format(s=job_stats))

        if self.use_sge is not True and job_stats != "DONE":
            self.add_log("quiver jobs were not submitted via sge, " +
                         "however are still incomplete. Please check.",
                         level=logging.ERROR)
            return -1
        elif self.use_sge is True:
            while job_stats != "DONE":
                self.add_log("Sleeping for 180 seconds.")
                sleep(180)
                job_stats = self.check_quiver_jobs_completion()
                if job_stats == "DONE":
                    break
                elif job_stats == "FAILED":
                    self.add_log("There are some failed jobs. Please check.",
                                 level=logging.ERROR)
                    return 1
                elif job_stats == "RUNNING":
                    self.add_log("There are jobs still running, waiting...",
                                 level=logging.INFO)
                    if self.quit_if_not_done is True:
                        return 0
                else:
                    msg = "Unable to recognize job_stats {s}".format(job_stats)
                    self.add_log(msg, logging.ERROR)
                    raise ValueError(msg)

        self.pickup_best_clusters(self.fq_filenames)

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.quivered_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.quivered_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.quivered_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.quivered_bad_fq, self.lq_isoforms_fq)

        hq_fa = self.hq_isoforms_fa
        lq_fa = self.lq_isoforms_fa
        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
Code Example #18
    def run(self):
        """Check all quiver jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_quivered.good|bad.fasta|fastq.
        """
        self.validate_inputs()

        job_stats = self.check_quiver_jobs_completion()
        self.add_log("quiver job status: {s}".format(s=job_stats))

        if self.use_sge is not True and job_stats != "DONE":
            self.add_log("quiver jobs were not submitted via sge, " +
                         "however are still incomplete. Please check.",
                         level=logging.ERROR)
            return -1
        elif self.use_sge is True:
            while job_stats != "DONE":
                self.add_log("Sleeping for 180 seconds.")
                sleep(180)
                job_stats = self.check_quiver_jobs_completion()
                if job_stats == "DONE":
                    break
                elif job_stats == "FAILED":
                    self.add_log("There are some failed jobs. Please check.",
                                 level=logging.ERROR)
                    return 1
                elif job_stats == "RUNNING":
                    self.add_log("There are jobs still running, waiting...",
                                 level=logging.INFO)
                    if self.quit_if_not_done is True:
                        return 0
                else:
                    msg = "Unable to recognize job_stats {s}".format(job_stats)
                    self.add_log(msg, logging.ERROR)
                    raise ValueError(msg)

        self.pickup_best_clusters(self.fq_filenames)

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.quivered_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.quivered_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.quivered_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.quivered_bad_fq, self.lq_isoforms_fq)

        hq_fa = self.hq_isoforms_fa
        lq_fa = self.lq_isoforms_fa
        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
Code Example #19
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
Code Example #20
File: Cluster.py Project: lpp1985/lpp_Script
    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:  # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else:  # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq),
                             level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa,
                          ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first splitted FASTQ file.
                self._probqv, msg = set_probqv_from_fq(
                    fastq_filename=first_split_fq)
            else:  # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(fasta_file=self.icec.final_consensus_fa,
                                    xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Create log file.
        self.close_log()
        return 0