Example #1
def run_main(separate_flnc_pickle_file, nfl_contigset, cluster_chunk_pickle,
             partial_chunk_pickle, polish_chunk_pickle, max_nchunks):
    """
    Create chunk tasks for ICE, ice_partial and ice_polish, write each
    set of chunk tasks to output pickles.
    """
    log.info("Getting all binned flnc files from %s",
             separate_flnc_pickle_file)
    flnc_fns = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        separate_flnc_pickle_file)
    log.debug("Binned flnc files are: %s", ", ".join(flnc_fns))

    # Number of ICE chunk tasks equals the number of bins.
    n_bins = len(flnc_fns)
    assert n_bins > 0

    log.info("max_nchunks: %s", max_nchunks)
    n_nfl_chunks = max(1, int(max_nchunks))

    out_dir = op.dirname(cluster_chunk_pickle)
    nfl_chunk_json = op.join(out_dir, 'nfl_chunk.json')
    chunked_nfl_files = chunk_contigset(in_file=nfl_contigset,
                                        n_chunks=n_nfl_chunks,
                                        out_dir=out_dir,
                                        out_chunk_json=nfl_chunk_json)

    create_cluster_pickle(flnc_files=flnc_fns, out_pickle=cluster_chunk_pickle)
    create_partial_pickle(flnc_files=flnc_fns,
                          chunked_nfl_files=chunked_nfl_files,
                          out_pickle=partial_chunk_pickle)

    # Total number of flnc reads in all bins
    n_reads_in_bins = n_reads_in_contigsets(flnc_fns)
    sum_n_flnc_reads = sum(n_reads_in_bins)
    n_polish_chunks_in_bins = [
        max(1, int(n * max_nchunks / (1.0 * sum_n_flnc_reads)))
        for n in n_reads_in_bins
    ]
    create_polish_pickle(n_polish_chunks_in_bins=n_polish_chunks_in_bins,
                         flnc_files=flnc_fns,
                         out_pickle=polish_chunk_pickle)

    # Make a soft link of nfl_contigset in the same directory as separate_flnc.pickle
    # for users' convenience
    dst_nfl_contigset = op.join(op.dirname(separate_flnc_pickle_file),
                                "isoseq_nfl.contigset.xml")
    log.info("Making a soft link of %s to %s.", nfl_contigset,
             dst_nfl_contigset)
    ln(nfl_contigset, dst_nfl_contigset)
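
Note: the polish-chunk allocation above distributes max_nchunks across bins in proportion to each bin's flnc read count, with a floor of one chunk per bin. A minimal standalone sketch of that arithmetic, using made-up read counts (not real pbtranscript data):

# Standalone sketch of the proportional allocation used by run_main above,
# with hypothetical read counts per bin.
n_reads_in_bins = [12000, 3000, 500]   # hypothetical flnc reads per size bin
max_nchunks = 24
sum_n_flnc_reads = sum(n_reads_in_bins)
n_polish_chunks_in_bins = [
    max(1, int(n * max_nchunks / (1.0 * sum_n_flnc_reads)))
    for n in n_reads_in_bins
]
print(n_polish_chunks_in_bins)  # -> [18, 4, 1]: every bin gets >= 1 chunk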
Example #2
    def run(self):
        """Assigning nfl reads to consensus isoforms and merge."""
        # Call $ICE_PARTIAL_PY to create a pickle for each splitted nfl fasta
        self.createPickles()
        # Wait for pickles to be created, if SGE is used.
        self.waitForPickles(pickle_filenames=self.pickle_filenames,
                            done_filenames=self.done_filenames)
        # Combine all pickles to a big pickle file: nfl_all_pickle_fn.
        self.combinePickles(pickle_filenames=self.pickle_filenames,
                            out_pickle=self.nfl_all_pickle_fn)
        # Create symbolic link if necessary
        ln(self.nfl_all_pickle_fn, self.out_pickle)

        # Close log
        self.close_log()
def link_files(smrtlink_job_dir, out_dir, more_files):
    """
    Make soft links of some smrtlink isoseq job output files and more_files in {out_dir}.
    """
    log.info("Making soft link of files")
    hq_fq = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir, basename="hq_isoforms.fastq")
    cluster_report = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir, basename="cluster_report.csv")
    hq_lq_prefix_pickle = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                                        basename="hq_lq_prefix_dict.pickle")

    assert isinstance(more_files, list)
    fs = more_files + [hq_fq, cluster_report, hq_lq_prefix_pickle]
    for f in fs:
        dst = op.join(out_dir, op.basename(f))
        log.debug("%s --> %s", f, dst)
        ln(f, dst)

    return hq_fq, hq_lq_prefix_pickle
def link_files(smrtlink_job_dir, out_dir, more_files):
    """
    Make soft links of some smrtlink isoseq job output files and more_files in {out_dir}.
    """
    log.info("Making soft link of files")
    hq_fq = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                          basename="hq_isoforms.fastq")
    cluster_report = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                                   basename="cluster_report.csv")
    hq_lq_prefix_pickle = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                                        basename="hq_lq_prefix_dict.pickle")

    assert isinstance(more_files, list)
    fs = more_files + [hq_fq, cluster_report, hq_lq_prefix_pickle]
    for f in fs:
        dst = op.join(out_dir, op.basename(f))
        log.debug("%s --> %s", f, dst)
        ln(f, dst)

    return hq_fq, hq_lq_prefix_pickle
Example #6
def args_runner(args):
    """Run given input args"""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == ".contigset.xml": # make contigset from fasta
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" % args.collapsed_isoforms)
    return 0
Example #7
def args_runner(args):
    """Run given input args"""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == ".contigset.xml":  # make contigset from fasta
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" %
                              args.collapsed_isoforms)
    return 0
Example #8
    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv: # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else: # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn,
                    fq=first_split_fq), level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first split FASTQ file.
                self._probqv, msg = set_probqv_from_fq(fastq_filename=first_split_fq)
            else: # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log("Dumping initial clusters to {f}"
                     .format(f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(
                fasta_file=self.icec.final_consensus_fa,
                xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms, if requested.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Close log file.
        self.close_log()
        return 0
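
The initial clusters uc are dumped either as JSON or as a cPickle, depending on the file extension. A minimal sketch of the matching read-back logic, assuming uc is a plain dict as written above (Python 2 style, to match the cPickle usage in the example):

import json
import cPickle

def load_uc(init_pickle_fn):
    """Sketch only: read back the initial-cluster dump written above,
    choosing JSON or cPickle from the file extension."""
    with open(init_pickle_fn) as f:
        if init_pickle_fn.endswith(".json"):
            return json.loads(f.read())
        return cPickle.load(f)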
Example #9
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    # consensus isoforms
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
def post_mapping_to_genome_runner(in_isoforms, in_sam, in_pickle,
                                  out_isoforms, out_gff, out_abundance, out_group, out_read_stat,
                                  min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
                                  min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
                                  min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
                                  max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
                                  allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
                                  skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
                                  min_count=fci.Constants.MIN_COUNT_DEFAULT,
                                  to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    """
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError("Format of input and output isoforms %s, %s must be the same." %
                         (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input and output isoforms %s, %s must be FASTA or FASTQ." %
                         (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn, pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn, in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn, in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) ln output files
    ln_pairs = [(fff.filtered_rep_fn(out_suffix), out_isoforms), # rep isoforms
                (fff.filtered_gff_fn, out_gff), # gff annotation
                (fff.filtered_abundance_fn, out_abundance), # abundance info
                (fff.group_fn, out_group), # groups
                (fff.read_stat_fn, out_read_stat)] # read stat info
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, filter_out_subsets)
    logging.info("Collapsed and filtered isoform sequences written to %s",
                 realpath(out_isoforms) if out_isoforms is not None else
                 realpath(fff.filtered_rep_fn(out_suffix)))
    logging.info("Collapsed and filtered isoform annotations written to %s",
                 realpath(out_gff) if out_gff is not None else realpath(fff.filtered_gff_fn))
    logging.info("Collapsed and filtered isoform abundance info written to %s",
                 realpath(out_abundance) if out_abundance is not None else
                 realpath(fff.filtered_abundance_fn))
    logging.info("Collapsed isoform groups written to %s",
                 realpath(out_group) if out_group is not None else realpath(fff.group_fn))
    logging.info("Read status of FL and nFL reads written to %s",
                 realpath(out_read_stat) if out_read_stat is not None else
                 realpath(fff.read_stat_fn))
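
A hypothetical invocation of post_mapping_to_genome_runner with placeholder file names (none of these paths come from a real job); all optional thresholds keep their defaults:

# Hypothetical call with placeholder paths; input and output isoforms must
# share the same format (here FASTQ), as enforced at the top of the runner.
post_mapping_to_genome_runner(
    in_isoforms="all_hq_isoforms.fastq",        # placeholder HQ isoforms
    in_sam="sorted_gmap.sam",                   # placeholder sorted alignments
    in_pickle="hq_lq_prefix_dict.pickle",       # placeholder prefix-dict pickle
    out_isoforms="collapsed_filtered.fastq",
    out_gff="collapsed_filtered.gff",
    out_abundance="collapsed_filtered.abundance.txt",
    out_group="collapsed_filtered.group.txt",
    out_read_stat="collapsed_filtered.read_stat.txt")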
Example #11
def post_mapping_to_genome_runner(
        in_isoforms,
        in_sam,
        in_pickle,
        out_isoforms,
        out_gff,
        out_abundance,
        out_group,
        out_read_stat,
        min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
        min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
        min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
        max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
        allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
        skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
        min_count=fci.Constants.MIN_COUNT_DEFAULT,
        to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    """
    log.info('args: {!r}'.format(locals()))
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError(
            "Format of input and output isoforms %s, %s must be the same." %
            (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input and output isoforms %s, %s must be FASTA or FASTQ."
            % (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn,
                     pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn,
                    in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn,
                    in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) ln output files
    ln_pairs = [
        (fff.filtered_rep_fn(out_suffix), out_isoforms),  # rep isoforms
        (fff.filtered_gff_fn, out_gff),  # gff annotation
        (fff.filtered_abundance_fn, out_abundance),  # abundance info
        (fff.group_fn, out_group),  # groups
        (fff.read_stat_fn, out_read_stat),  # read stat info
    ]
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, filter_out_subsets)
    logging.info(
        "Collapsed and filtered isoform sequences written to %s",
        realpath(out_isoforms) if out_isoforms is not None else realpath(
            fff.filtered_rep_fn(out_suffix)))
    logging.info(
        "Collapsed and filtered isoform annotations written to %s",
        realpath(out_gff)
        if out_gff is not None else realpath(fff.filtered_gff_fn))
    logging.info(
        "Collapsed and filtered isoform abundance info written to %s",
        realpath(out_abundance)
        if out_abundance is not None else realpath(fff.filtered_abundance_fn))
    logging.info(
        "Collapsed isoform groups written to %s",
        realpath(out_group)
        if out_group is not None else realpath(fff.group_fn))
    logging.info(
        "Read status of FL and nFL reads written to %s",
        realpath(out_read_stat)
        if out_read_stat is not None else realpath(fff.read_stat_fn))
Example #12
    def run(self):
        """
        Check all arrow jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq
        """
        self.validate_inputs()

        job_stats = self.check_arrow_jobs_completion()
        self.add_log("Arrow job status: {s}".format(s=job_stats))

        if job_stats == 'DONE':
            pass  # continue on below to process data
        elif job_stats == 'FAILED':
            self.add_log("Has incomplete jobs. Please re-run them.",
                         level=logging.ERROR)
            return -1
        elif job_stats == 'RUNNING':
            if self.quit_if_not_done:
                self.add_log(
                    "Jobs are still running. Please wait before running this script."
                )
                return 1
            else:
                while job_stats != "DONE":
                    self.add_log(
                        "Jobs are still running. Wait. Sleeping for 180 seconds."
                    )
                    sleep(180)
                    job_stats = self.check_arrow_jobs_completion()
                    if job_stats == "DONE":
                        break
                    elif job_stats == "FAILED":
                        self.add_log(
                            "There are some failed jobs. Please check.",
                            level=logging.ERROR)
                        return 1
                    elif job_stats == "RUNNING":
                        self.add_log(
                            "Jobs are still running. Wait. Sleeping for 180 seconds.",
                            level=logging.INFO)
        else:
            msg = "Unable to recognize job_stats {s}".format(s=job_stats)
            self.add_log(msg, logging.ERROR)
            raise ValueError(msg)

        # at this point, all jobs must be done and all fastq files present.
        self.pickup_best_clusters()

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.arrowed_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.arrowed_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.arrowed_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.arrowed_bad_fq, self.lq_isoforms_fq)

        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
Example #13
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!", ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons, sge_opts=sge_opts,
                      ice_opts=ice_opts, ipq_opts=ipq_opts)

        if args.mem_debug: # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(split_dir,
                                                                            end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq, sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle, out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage, max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon, min_count=args.min_count)

    return 0
Example #14
    def run(self):
        """
        First, collapse input isoforms by calling Branch.run().
        Then collapse fuzzy junctions by calling collapse_fuzzy_junctions.
        Finally, pick up a representative gff record for each group of collapsed isoforms.
        """
        self.validate_inputs()

        logging.info("Collapsing isoforms into transcripts.")
        b = Branch(isoform_filename=self.isoform_filename,
                   sam_filename=self.sam_filename,
                   cov_threshold=self.min_flnc_coverage,
                   min_aln_coverage=self.min_aln_coverage,
                   min_aln_identity=self.min_aln_identity)

        b.run(allow_extra_5exon=self.allow_extra_5exon,
              skip_5_exon_alt=self.skip_5_exon_alt,
              ignored_ids_fn=self.ignored_ids_txt_fn,
              good_gff_fn=self.good_unfuzzy_gff_fn,
              bad_gff_fn=self.bad_unfuzzy_gff_fn,
              group_fn=self.unfuzzy_group_fn)

        logging.info("Good unfuzzy isoforms written to: %s",
                     realpath(self.good_unfuzzy_gff_fn))
        logging.info("Bad unfuzzy isoforms written to: %s",
                     realpath(self.bad_unfuzzy_gff_fn))
        logging.info("Unfuzzy isoform groups written to: %s",
                     realpath(self.unfuzzy_group_fn))

        if self.shall_collapse_fuzzy_junctions:
            logging.info("Further collapsing fuzzy junctions.")
            # need to further collapse those that have fuzzy junctions!
            collapse_fuzzy_junctions(
                gff_filename=self.good_unfuzzy_gff_fn,
                group_filename=self.unfuzzy_group_fn,
                fuzzy_gff_filename=self.good_fuzzy_gff_fn,
                fuzzy_group_filename=self.fuzzy_group_fn,
                allow_extra_5exon=self.allow_extra_5exon,
                max_fuzzy_junction=self.max_fuzzy_junction)

            logging.info("Good fuzzy isoforms written to: %s",
                         realpath(self.good_fuzzy_gff_fn))
            logging.info("Bad fuzzy isoforms written to: %s",
                         realpath(self.bad_fuzzy_gff_fn))
            logging.info("Fuzzy isoform groups written to: %s",
                         realpath(self.fuzzy_group_fn))
            ln(self.good_fuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_fuzzy_gff_fn, self.gff_fn)
            ln(self.fuzzy_group_fn, self.group_fn)
        else:
            logging.info("No need to further collapse fuzzy junctions.")
            ln(self.good_unfuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_unfuzzy_gff_fn, self.gff_fn)
            ln(self.unfuzzy_group_fn, self.group_fn)

        # Pick up representative
        logging.info("Picking up representative record.")
        pick_least_err_instead = not self.allow_extra_5exon  # 5merge, pick longest

        pick_rep(isoform_filename=self.isoform_filename,
                 gff_filename=self.good_gff_fn,
                 group_filename=self.group_fn,
                 output_filename=self.rep_fn(self.suffix),
                 pick_least_err_instead=pick_least_err_instead,
                 bad_gff_filename=self.bad_gff_fn)

        logging.info("Ignored IDs written to: %s",
                     realpath(self.ignored_ids_txt_fn))
        logging.info("Output GFF written to: %s", realpath(self.gff_fn))
        logging.info("Output Group TXT written to: %s",
                     realpath(self.group_fn))
        logging.info("Output collapsed isoforms written to: %s",
                     realpath(self.rep_fn(self.suffix)))
        logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
Example #15
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa,
                           root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir,
                      flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs,
                      ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(in_isoforms=in_isoforms,
                                  in_sam=tofu_f.sorted_gmap_sam,
                                  in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
                                  out_isoforms=args.collapsed_filtered_fn,
                                  out_gff=args.gff_fn,
                                  out_abundance=args.abundance_fn,
                                  out_group=args.group_fn,
                                  out_read_stat=args.read_stat_fn,
                                  min_aln_coverage=args.min_aln_coverage,
                                  min_aln_identity=args.min_aln_identity,
                                  min_flnc_coverage=args.min_flnc_coverage,
                                  max_fuzzy_junction=args.max_fuzzy_junction,
                                  allow_extra_5exon=args.allow_extra_5exon,
                                  min_count=args.min_count)

    return 0
Example #16
    def run(self):
        """Call ICE to cluster consensus isoforms."""
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            reads_in_first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first " +
                         "split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            reads_in_first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            reads_in_first_split=reads_in_first_split)
        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fas),
                     level=logging.INFO)

        # This is the first piece of reads to work on
        first_split_fa = self._flnc_splitted_fas[0]
        first_split_fq = fafn2fqfn(first_split_fa)

        # Set up probability and quality value model
        if self.ice_opts.use_finer_qv:  # default off
            # Use multi-Qvs from ccs.h5, no need to write FASTQ
            self._probqv, msg = set_probqv_from_ccs(
                ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
        else:  # use a single Qv from FASTQ
            if self.ccs_fofn is not None:
                self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                    fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq),
                             level=logging.INFO)
                ice_fa2fq(in_fa=first_split_fa,
                          ccs_fofn=self.ccs_fofn,
                          out_fq=first_split_fq)
                # Set probqv from the first split FASTQ file.
                self._probqv, msg = set_probqv_from_fq(
                    fastq_filename=first_split_fq)
            else:  # use predefined model
                self._probqv, msg = set_probqv_from_model()
            self.add_log(msg, level=logging.INFO)

        # Initialize cluster by clique
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=first_split_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               qvmean_get_func=self._probqv.get_mean,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            if self.initPickleFN.endswith(".json"):
                f.write(json.dumps(uc))
            else:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)
        self.icec = IceIterative(
            fasta_filename=first_split_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=first_split_fq,
            output_pickle_file=self.output_pickle_file,
            tmp_dir=self.tmp_dir)

        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)
        if self.out_fa_dataset is not None:
            dummy_ds = as_contigset(fasta_file=self.icec.final_consensus_fa,
                                    xml_file=self.out_fa_dataset)

        # Call quiver to polish predicted consensus isoforms, if requested.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              tmp_dir=self.tmp_dir)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Close log file.
        self.close_log()
        return 0
Example #17
    def runChimeraDetector(self):
        """Call chimera detection on full-length reads, and non-full-length
        reads if required."""
        # Create forward/reverse primers for chimera detection.
        self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Detect chimeras among full-length reads, separate flnc reads and
        # flc reads.
        logging.info("Detect chimeric reads from trimmed full-length reads.")
        (self.summary.num_flnc, self.summary.num_flc,
         self.summary.num_flnc_bases, _x) = \
            self._detect_chimera(in_fasta=self._trimmed_fl_reads_fn,
                                 out_nc_fasta=self.out_flnc_fn_fasta,
                                 out_c_fasta=self.out_flc_fn,
                                 primer_report_fn=self._primer_report_fl_fn,
                                 out_dom=self.out_trimmed_fl_dom_fn,
                                 num_reads=self.summary.num_fl,
                                 job_name="fl")
        assert(self.summary.num_fl == self.summary.num_flnc +
               self.summary.num_flc)
        logging.info("Done with chimera detection on trimmed full-length " +
                     "reads.")

        # Detect chimeras among non-full-length reads if required, separate
        # nflnc reads and nflc reads, rewrite self.primer_report_nfl_fn.
        if self.chimera_detection_opts.detect_chimera_nfl is True:
            logging.info("Detect chimeric reads from trimmed non-full-length " +
                         "reads.")
            (self.summary.num_nflnc, self.summary.num_nflc, _x, _y) = \
                self._detect_chimera(in_fasta=self._trimmed_nfl_reads_fn,
                                     out_nc_fasta=self.out_nflnc_fn,
                                     out_c_fasta=self.out_nflc_fn,
                                     primer_report_fn=self._primer_report_nfl_fn,
                                     out_dom=self.out_trimmed_nfl_dom_fn,
                                     num_reads=self.summary.num_nfl,
                                     job_name="nfl")
            assert(self.summary.num_nfl == self.summary.num_nflnc +
                   self.summary.num_nflc)
            logging.info("Done with chimera detection on trimmed " +
                         "non-full-length reads.")

            # Concatenate out_nflnc_fn and out_nflc_fn as out_nfl_fn
            cat_files(src=[self.out_nflnc_fn_fasta, self.out_nflc_fn_fasta],
                      dst=self.out_nfl_fn_fasta)
            # Concatenate out_flnc and out_nflnc to make out_all_reads_fn
            cat_files(src=[self.out_flnc_fn_fasta, self.out_nflnc_fn_fasta],
                      dst=self.out_all_reads_fn_fasta)

        else:
            # Soft link _trimmed_nfl_reads_fn as out_nfl_fn
            ln(self._trimmed_nfl_reads_fn, self.out_nfl_fn_fasta)
            # Concatenate out_flnc and out_nfl to make out_all_reads_fn
            cat_files(src=[self.out_flnc_fn_fasta, self.out_nfl_fn_fasta],
                      dst=self.out_all_reads_fn_fasta)

        # primer info of fl/nfl reads reported to _primer_report_fl_fn
        # and _primer_report_nfl_fn, concatenate them in order to make
        # a full report: primer_report_fn.
        cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
                  dst=self.primer_report_fn)

        # Delete intermediate files.
        self._cleanup([self._primer_report_nfl_fn,
                       self._primer_report_fl_fn])
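cat_files(src=[...], dst=...) is used above to stitch the flnc/nflnc/nflc FASTA files and primer reports into combined outputs. A minimal sketch of such a concatenation helper, assuming it simply appends the source files in order, is:

    import shutil

    def cat_files(src, dst):
        """Concatenate the files listed in src, in order, into a single file dst."""
        with open(dst, 'wb') as writer:
            for fn in src:
                with open(fn, 'rb') as reader:
                    shutil.copyfileobj(reader, writer)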
Ejemplo n.º 18
0
    def runChimeraDetector(self):
        """Call chimera detection on full-length reads, and non-full-length
        reads if required."""
        # Create forward/reverse primers for chimera detection.
        self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Detect chimeras among full-length reads, separate flnc reads and
        # flc reads.
        logging.info("Detect chimeric reads from trimmed full-length reads.")
        (self.summary.num_flnc, self.summary.num_flc,
         self.summary.num_flnc_bases, _x) = \
            self._detect_chimera(in_fasta=self._trimmed_fl_reads_fn,
                                 out_nc_fasta=self.out_flnc_fn_fasta,
                                 out_c_fasta=self.out_flc_fn,
                                 primer_report_fn=self._primer_report_fl_fn,
                                 out_dom=self.out_trimmed_fl_dom_fn,
                                 num_reads=self.summary.num_fl,
                                 job_name="fl")
        assert (self.summary.num_fl == self.summary.num_flnc +
                self.summary.num_flc)
        logging.info("Done with chimera detection on trimmed full-length " +
                     "reads.")

        # Detect chimeras among non-full-length reads if required, separate
        # nflnc reads and nflc reads, rewrite self.primer_report_nfl_fn.
        if self.chimera_detection_opts.detect_chimera_nfl is True:
            logging.info(
                "Detect chimeric reads from trimmed non-full-length " +
                "reads.")
            (self.summary.num_nflnc, self.summary.num_nflc, _x, _y) = \
                self._detect_chimera(in_fasta=self._trimmed_nfl_reads_fn,
                                     out_nc_fasta=self.out_nflnc_fn,
                                     out_c_fasta=self.out_nflc_fn,
                                     primer_report_fn=self._primer_report_nfl_fn,
                                     out_dom=self.out_trimmed_nfl_dom_fn,
                                     num_reads=self.summary.num_nfl,
                                     job_name="nfl")
            assert (self.summary.num_nfl == self.summary.num_nflnc +
                    self.summary.num_nflc)
            logging.info("Done with chimera detection on trimmed " +
                         "non-full-length reads.")

            # Concatenate out_nflnc_fn and out_nflc_fn as out_nfl_fn
            cat_files(src=[self.out_nflnc_fn_fasta, self.out_nflc_fn_fasta],
                      dst=self.out_nfl_fn_fasta)
            # Concatenate out_flnc and out_nflnc to make out_all_reads_fn
            cat_files(src=[self.out_flnc_fn_fasta, self.out_nflnc_fn_fasta],
                      dst=self.out_all_reads_fn_fasta)

        else:
            # Soft link _trimmed_nfl_reads_fn as out_nfl_fn
            ln(self._trimmed_nfl_reads_fn, self.out_nfl_fn_fasta)
            # Concatenate out_flnc and out_nfl to make out_all_reads_fn
            cat_files(src=[self.out_flnc_fn_fasta, self.out_nfl_fn_fasta],
                      dst=self.out_all_reads_fn_fasta)

        # primer info of fl/nfl reads reported to _primer_report_fl_fn
        # and _primer_report_nfl_fn, concatenate them in order to make
        # a full report: primer_report_fn.
        cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
                  dst=self.primer_report_fn)

        # Delete intermediate files.
        self._cleanup([self._primer_report_nfl_fn, self._primer_report_fl_fn])
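The asserts in both copies of runChimeraDetector encode one bookkeeping invariant: every trimmed full-length (or non-full-length) read is classified as either non-chimeric or chimeric, never both and never neither. A toy illustration of that invariant with a hypothetical counter object:

    class ChimeraSummary(object):
        """Hypothetical stand-in for the summary object's chimera counters."""

        def __init__(self, num_fl, num_flnc, num_flc):
            self.num_fl = num_fl       # trimmed full-length reads
            self.num_flnc = num_flnc   # full-length, non-chimeric
            self.num_flc = num_flc     # full-length, chimeric

        def check(self):
            # Every full-length read must be classified exactly once.
            assert self.num_fl == self.num_flnc + self.num_flc, \
                "chimera counts do not add up to the number of full-length reads"

    ChimeraSummary(num_fl=1000, num_flnc=980, num_flc=20).check()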
Ejemplo n.º 19
0
    def run(self):
        """Check all quiver jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_quivered.good|bad.fasta|fastq.
        """
        self.validate_inputs()

        job_stats = self.check_quiver_jobs_completion()
        self.add_log("quiver job status: {s}".format(s=job_stats))

        if self.use_sge is not True and job_stats != "DONE":
            self.add_log("quiver jobs were not submitted via sge, " +
                         "however are still incomplete. Please check.",
                         level=logging.ERROR)
            return -1
        elif self.use_sge is True:
            while job_stats != "DONE":
                self.add_log("Sleeping for 180 seconds.")
                sleep(180)
                job_stats = self.check_quiver_jobs_completion()
                if job_stats == "DONE":
                    break
                elif job_stats == "FAILED":
                    self.add_log("There are some failed jobs. Please check.",
                                 level=logging.ERROR)
                    return 1
                elif job_stats == "RUNNING":
                    self.add_log("There are jobs still running, waiting...",
                                 level=logging.INFO)
                    if self.quit_if_not_done is True:
                        return 0
                else:
                    msg = "Unable to recognize job_stats {s}".format(job_stats)
                    self.add_log(msg, logging.ERROR)
                    raise ValueError(msg)

        self.pickup_best_clusters(self.fq_filenames)

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.quivered_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.quivered_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.quivered_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.quivered_bad_fq, self.lq_isoforms_fq)

        hq_fa = self.hq_isoforms_fa
        lq_fa = self.lq_isoforms_fa
        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
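The SGE branch above is a poll-until-done loop over check_quiver_jobs_completion(). A standalone sketch of that pattern, with the status callable injected so it can be reused or tested (the helper name is hypothetical; the DONE/FAILED/RUNNING strings follow the example):

    import time
    import logging

    def wait_for_jobs(poll_status, interval_sec=180, log=logging.getLogger(__name__)):
        """Poll a job-status callable until it reports DONE; raise on FAILED or unknown status."""
        while True:
            status = poll_status()
            if status == "DONE":
                return
            elif status == "FAILED":
                raise RuntimeError("Some quiver jobs failed; please check.")
            elif status == "RUNNING":
                log.info("Jobs still running; sleeping for %d seconds.", interval_sec)
                time.sleep(interval_sec)
            else:
                raise ValueError("Unable to recognize job status {s}".format(s=status))

    # e.g. wait_for_jobs(poll_status=lambda: "DONE") returns immediately.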
Ejemplo n.º 20
0
    def run(self):
        """Check all quiver jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_quivered.good|bad.fasta|fastq.
        """
        self.validate_inputs()

        job_stats = self.check_quiver_jobs_completion()
        self.add_log("quiver job status: {s}".format(s=job_stats))

        if self.use_sge is not True and job_stats != "DONE":
            self.add_log("quiver jobs were not submitted via sge, " +
                         "however are still incomplete. Please check.",
                         level=logging.ERROR)
            return -1
        elif self.use_sge is True:
            while job_stats != "DONE":
                self.add_log("Sleeping for 180 seconds.")
                sleep(180)
                job_stats = self.check_quiver_jobs_completion()
                if job_stats == "DONE":
                    break
                elif job_stats == "FAILED":
                    self.add_log("There are some failed jobs. Please check.",
                                 level=logging.ERROR)
                    return 1
                elif job_stats == "RUNNING":
                    self.add_log("There are jobs still running, waiting...",
                                 level=logging.INFO)
                    if self.quit_if_not_done is True:
                        return 0
                else:
                    msg = "Unable to recognize job_stats {s}".format(job_stats)
                    self.add_log(msg, logging.ERROR)
                    raise ValueError(msg)

        self.pickup_best_clusters(self.fq_filenames)

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.quivered_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.quivered_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.quivered_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.quivered_bad_fq, self.lq_isoforms_fq)

        hq_fa = self.hq_isoforms_fa
        lq_fa = self.lq_isoforms_fa
        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
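Several of these examples call as_contigset(fasta, xml) to publish a FASTA file as a PacBio ContigSet dataset. A hedged sketch of what such a wrapper could do with pbcore is shown below; the real helper may additionally update counts or dataset metadata.

    from pbcore.io import ContigSet

    def as_contigset(fasta_file, xml_file):
        """Wrap a FASTA file in a ContigSet dataset and write the dataset XML."""
        ds = ContigSet(fasta_file)
        ds.write(xml_file)
        return ds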
Ejemplo n.º 21
0
    def run(self):
        """
        For each cluster bin, create summary.json, cluster_report.csv,
        hq_isoforms.fa|fq, and lq_isoforms.fa|fq.
        Finally, merge all cluster bins and save all outputs to 'combined'.
        """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=self.getVersion()))
        args = self.args

        # Get cluster bins directories as input
        cluster_bin_dirs = self.get_cluster_bin_dirs(separate_flnc_pickle=args.separate_flnc_pickle,
                                                     cluster_bin_dirs=args.cluster_bin_dirs)
        cluster_bin_indices = range(0, len(cluster_bin_dirs))

        # Create output dir
        combined_dir = args.combined_dir
        mkdir(combined_dir)

        # Get combined output filenames
        def f(input_fn, default_fn):
            """Return input_fn if provided, otherwise a default path under combined_dir."""
            if input_fn is None:
                return op.join(combined_dir, default_fn)
            return input_fn

        out_consensus_isoforms_fa = f(args.consensus_isoforms_fa, "all.consensus_isoforms.fasta")
        out_summary = f(args.summary_fn, "all.cluster_summary.json")
        out_report = f(args.report_fn, "all.cluster_report.csv")
        out_hq_fa = f(args.hq_isoforms_fa, "all.polished_hq.fasta")
        out_lq_fa = f(args.lq_isoforms_fa, "all.polished_lq.fasta")
        out_hq_fq = f(args.hq_isoforms_fq, "all.polished_hq.fastq")
        out_lq_fq = f(args.lq_isoforms_fq, "all.polished_lq.fastq")

        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                        qv_trim_3=args.qv_trim_3,
                                        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
        sample_name = get_sample_name(input_sample_name=args.sample_name)


        hq_fq_fns, lq_fq_fns = [], []
        split_uc_pickles, split_partial_uc_pickles = [], []
        split_consensus_isoforms = []

        for cluster_bin_dir in cluster_bin_dirs:
            ice_pq = IceQuiverPostprocess(root_dir=cluster_bin_dir, ipq_opts=ipq_opts)
            hq_fq_fns.append(ice_pq.quivered_good_fq)
            lq_fq_fns.append(ice_pq.quivered_bad_fq)
            split_uc_pickles.append(ice_pq.final_pickle_fn)
            split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
            split_consensus_isoforms.append(ice_pq.final_consensus_fa)

        combined_files = CombinedFiles(combined_dir)
        log.info("Combining results of all cluster bins to %s.", combined_dir)
        log.info("Merging HQ|LQ isoforms from all cluster bins.")
        log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
        log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
        combine_polished_isoforms(split_indices=cluster_bin_indices,
                                  split_hq_fns=hq_fq_fns,
                                  split_lq_fns=lq_fq_fns,
                                  combined_hq_fa=combined_files.all_hq_fa,
                                  combined_hq_fq=combined_files.all_hq_fq,
                                  combined_lq_fa=combined_files.all_lq_fa,
                                  combined_lq_fq=combined_files.all_lq_fq,
                                  hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                                  sample_name=sample_name)

        ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
        ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
        ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
        ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'

        log.info("Merging consensus isoforms from all cluster bins.")
        combine_consensus_isoforms(split_indices=cluster_bin_indices,
                                   split_files=split_consensus_isoforms,
                                   combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                                   sample_name=sample_name)
        ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)

        log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
        write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                              isoforms_fa=out_consensus_isoforms_fa,
                              hq_fa=out_hq_fa, lq_fa=out_lq_fa)
        ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

        log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
        write_combined_cluster_report(split_indices=cluster_bin_indices,
                                      split_uc_pickles=split_uc_pickles,
                                      split_partial_uc_pickles=split_partial_uc_pickles,
                                      report_fn=combined_files.all_cluster_report_fn,
                                      sample_name=sample_name)
        ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"
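CombinedFiles(combined_dir) is used here only as a container of canonical output paths under the combined directory. A hypothetical stand-in with the attributes this example relies on (the exact file names are assumptions, loosely modelled on the defaults above) could look like:

    import os.path as op

    class CombinedFilesSketch(object):
        """Hypothetical path container mirroring the attributes used in the example."""

        def __init__(self, combined_dir):
            self.combined_dir = combined_dir
            self.all_hq_fa = self._p("all.polished_hq.fasta")
            self.all_hq_fq = self._p("all.polished_hq.fastq")
            self.all_lq_fa = self._p("all.polished_lq.fasta")
            self.all_lq_fq = self._p("all.polished_lq.fastq")
            self.all_consensus_isoforms_fa = self._p("all.consensus_isoforms.fasta")
            self.all_cluster_summary_fn = self._p("all.cluster_summary.json")
            self.all_cluster_report_fn = self._p("all.cluster_report.csv")
            self.hq_lq_prefix_dict_pickle = self._p("all.hq_lq_prefix_dict.pickle")

        def _p(self, name):
            # Every output lives directly under the combined directory.
            return op.join(self.combined_dir, name)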
Ejemplo n.º 22
0
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, and lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"
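The runner derives FASTA paths from the *.contigset.xml outputs by plain string replacement. A slightly safer version of that derivation, written as a hypothetical helper, is:

    def contigset_to_fasta_path(xml_path, ext=".contigset.xml"):
        """Map foo.contigset.xml -> foo.fasta, refusing unexpected suffixes."""
        if not xml_path.endswith(ext):
            raise ValueError("Expected a file ending in %s, got %r" % (ext, xml_path))
        return xml_path[:-len(ext)] + ".fasta"

    assert contigset_to_fasta_path("hq_isoforms.contigset.xml") == "hq_isoforms.fasta"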
Ejemplo n.º 23
0
    def run(self):
        """
        First, collapse input isoforms by calling Branch.run().
        Then collapse fuzzy junctions by calling collapse_fuzzy_junctions.
        Finally, pick a representative gff record for each group of collapsed isoforms.
        """
        self.validate_inputs()

        logging.info("Collapsing isoforms into transcripts.")
        b = Branch(isoform_filename=self.isoform_filename,
                   sam_filename=self.sam_filename,
                   cov_threshold=self.min_flnc_coverage,
                   min_aln_coverage=self.min_aln_coverage,
                   min_aln_identity=self.min_aln_identity)

        b.run(allow_extra_5exon=self.allow_extra_5exon,
              skip_5_exon_alt=self.skip_5_exon_alt,
              ignored_ids_fn=self.ignored_ids_txt_fn,
              good_gff_fn=self.good_unfuzzy_gff_fn,
              bad_gff_fn=self.bad_unfuzzy_gff_fn,
              group_fn=self.unfuzzy_group_fn)

        logging.info("Good unfuzzy isoforms written to: %s", realpath(self.good_unfuzzy_gff_fn))
        logging.info("Bad unfuzzy isoforms written to: %s", realpath(self.bad_unfuzzy_gff_fn))
        logging.info("Unfuzzy isoform groups written to: %s", realpath(self.unfuzzy_group_fn))

        if self.shall_collapse_fuzzy_junctions:
            logging.info("Further collapsing fuzzy junctions.")
            # need to further collapse those that have fuzzy junctions!
            collapse_fuzzy_junctions(gff_filename=self.good_unfuzzy_gff_fn,
                                     group_filename=self.unfuzzy_group_fn,
                                     fuzzy_gff_filename=self.good_fuzzy_gff_fn,
                                     fuzzy_group_filename=self.fuzzy_group_fn,
                                     allow_extra_5exon=self.allow_extra_5exon,
                                     max_fuzzy_junction=self.max_fuzzy_junction)

            logging.info("Good fuzzy isoforms written to: %s", realpath(self.good_fuzzy_gff_fn))
            logging.info("Bad fuzzy isoforms written to: %s", realpath(self.bad_fuzzy_gff_fn))
            logging.info("Fuzzy isoform groups written to: %s", realpath(self.fuzzy_group_fn))
            ln(self.good_fuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_fuzzy_gff_fn, self.gff_fn)
            ln(self.fuzzy_group_fn, self.group_fn)
        else:
            logging.info("No need to further collapse fuzzy junctions.")
            ln(self.good_unfuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_unfuzzy_gff_fn, self.gff_fn)
            ln(self.unfuzzy_group_fn, self.group_fn)

        # Pick up representative
        logging.info("Picking up representative record.")
        # If extra 5' exons may be merged, pick the longest record; otherwise pick
        # the least-error representative.
        pick_least_err_instead = not self.allow_extra_5exon

        pick_rep(isoform_filename=self.isoform_filename,
                 gff_filename=self.good_gff_fn,
                 group_filename=self.group_fn,
                 output_filename=self.rep_fn(self.suffix),
                 pick_least_err_instead=pick_least_err_instead,
                 bad_gff_filename=self.bad_gff_fn)

        logging.info("Ignored IDs written to: %s", realpath(self.ignored_ids_txt_fn))
        logging.info("Output GFF written to: %s", realpath(self.gff_fn))
        logging.info("Output Group TXT written to: %s", realpath(self.group_fn))
        logging.info("Output collapsed isoforms written to: %s", realpath(self.rep_fn(self.suffix)))
        logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
Ejemplo n.º 24
0
    def run(self):
        """
        Check all arrow jobs are running, failed or done. Write high-quality
        consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq
        """
        self.validate_inputs()

        job_stats = self.check_arrow_jobs_completion()
        self.add_log("Arrow job status: {s}".format(s=job_stats))

        if job_stats == 'DONE':
            pass # continue on below to process data
        elif job_stats == 'FAILED':
            self.add_log("Has incomplete jobs. Please re-run them.",
                         level=logging.ERROR)
            return -1
        elif job_stats == 'RUNNING':
            if self.quit_if_not_done:
                self.add_log("Jobs are still running. Please wait before running this script.")
                return 1
            else:
                while job_stats != "DONE":
                    self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.")
                    sleep(180)
                    job_stats = self.check_arrow_jobs_completion()
                    if job_stats == "DONE":
                        break
                    elif job_stats == "FAILED":
                        self.add_log("There are some failed jobs. Please check.",
                                     level=logging.ERROR)
                        return 1
                    elif job_stats == "RUNNING":
                        self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.",
                                     level=logging.INFO)
        else:
            msg = "Unable to recognize job_stats {s}".format(s=job_stats)
            self.add_log(msg, logging.ERROR)
            raise ValueError(msg)

        # at this point, all jobs must be done and all fastq files present.
        self.pickup_best_clusters()

        self.add_log("Creating polished high quality consensus isoforms.")
        if self.hq_isoforms_fa is not None:
            ln(self.arrowed_good_fa, self.hq_isoforms_fa)
        if self.hq_isoforms_fq is not None:
            ln(self.arrowed_good_fq, self.hq_isoforms_fq)

        self.add_log("Creating polished low quality consensus isoforms.")
        if self.lq_isoforms_fa is not None:
            ln(self.arrowed_bad_fa, self.lq_isoforms_fa)
        if self.lq_isoforms_fq is not None:
            ln(self.arrowed_bad_fq, self.lq_isoforms_fq)

        if self.hq_isoforms_dataset is not None:
            ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
        if self.lq_isoforms_dataset is not None:
            ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)
        if self.summary_fn is not None:
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.final_consensus_fa,
                               hq_fa=self.hq_isoforms_fa,
                               lq_fa=self.lq_isoforms_fa)

        self.close_log()
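write_summary() in these postprocess examples records, among other things, how many consensus, HQ, and LQ isoforms ended up in each FASTA file. A hedged sketch of that counting step using pbcore's FastaReader (the JSON keys are assumptions, not the real report schema):

    import json
    from pbcore.io import FastaReader

    def count_fasta(fn):
        """Return the number of sequences in a FASTA file (0 if fn is None)."""
        if fn is None:
            return 0
        return sum(1 for _ in FastaReader(fn))

    def write_summary_sketch(summary_fn, isoforms_fa, hq_fa=None, lq_fa=None):
        """Write isoform counts to a small JSON summary (hypothetical key names)."""
        summary = {"num_consensus_isoforms": count_fasta(isoforms_fa),
                   "num_polished_hq_isoforms": count_fasta(hq_fa),
                   "num_polished_lq_isoforms": count_fasta(lq_fa)}
        with open(summary_fn, "w") as writer:
            json.dump(summary, writer, indent=2)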