Ejemplo n.º 1
0
    def _validate_inputs(self, root_dir, N):
        """
        Check that all N per-chunk partial pickles and their DONE files
        exist before merging.

        root_dir --- ICE root output directory
        N --- number of chunks the nfl reads were split into

        Returns (splitted_pickles, out_pickle), where splitted_pickles
        is the list of per-chunk pickle files and out_pickle is the
        merged output pickle file name.

        Raises ValueError listing *every* missing DONE/pickle file.
        (The previous version overwrote errMsg in each loop iteration,
        so only the last missing file was ever reported.)
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Check if inputs exist; accumulate all errors instead of
        # keeping only the last one.
        errors = []
        for done in dones:
            if not nfs_exists(done):
                errors.append("DONE file {f} does not exist.".format(f=done))
        for pickle in splitted_pickles:
            if not nfs_exists(pickle):
                errors.append("Pickle file {f} does not exist.".format(f=pickle))

        if errors:
            raise ValueError(" ".join(errors))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
Ejemplo n.º 2
0
    def __init__(self,
                 root_dir,
                 fasta_filenames,
                 fastq_filenames,
                 ref_fasta,
                 out_pickle,
                 ice_opts,
                 sge_opts,
                 cpus,
                 tmp_dir=None):
        """
        fasta_filenames --- a list of splitted nfl fasta files.

        fastq_filenames --- a list of splitted nfl fastq files, or None.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!

        Raises IOError if any given fastq file does not exist.
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self,
                          prog_name=self.prog_name,
                          root_dir=root_dir,
                          tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta)

        if fastq_filenames is not None:
            # Validate explicitly rather than with `assert`, which is
            # silently stripped when Python runs with -O and would skip
            # this input check entirely.
            missing = [fq for fq in fastq_filenames if not op.exists(fq)]
            if missing:
                raise IOError("Input fastq files do not exist: " +
                              ", ".join(missing))

        self.fastq_filenames = fastq_filenames  # note: could be None

        self.out_pickle = out_pickle

        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.cpus = cpus  # this is the number of CPUs to use per SGE job or per local job

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Ejemplo n.º 3
0
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 ipq_opts,
                 fasta_fofn=None,
                 tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.
        ipq_opts --- IceQuiverHQLQOptions bundle: QV trimming at the
                     5'/3' ends (qv_trim_5 / qv_trim_3), the minimum
                     quiver accuracy for calling an isoform high
                     quality (hq_quiver_min_accuracy), and the output
                     hq|lq isoform fasta/fastq file names.
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn,
                          tmp_dir=tmp_dir)

        # Normalize the nfl fasta path; keep the option bundles as given.
        self.nfl_fa = realpath(nfl_fa)
        self.ipq_opts = ipq_opts
        self.sge_opts = sge_opts
        self.ice_opts = ice_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        # Placeholders for sub-stage workers; assigned outside __init__.
        self.icep = None   # IceAllPartials
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Ejemplo n.º 4
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        Raises ValueError if N is out of range or nfl_fa is missing.
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        errMsg = ""
        if N <= 0 or N > 100:
            # Guard added: N <= 0 would previously fall through to the
            # ceil(num_reads / N) below and crash with ZeroDivisionError
            # (or produce a nonsensical chunk size). Mirrors the sibling
            # split validator's bounds check.
            errMsg = "Input file can not be splitted into %d chunks!" % N
        elif not nfs_exists(nfl_fa):
            errMsg = ("The input non-full-length reads fasta file " +
                      "{f} does not exists. ".format(f=nfl_fa))
        if len(errMsg) != 0:
            raise ValueError(errMsg)

        num_reads = num_reads_in_fasta(nfl_fa)
        reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 5
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        Raises ValueError if N is out of range or nfl_fa is missing;
        both problems are reported together when both apply. (The
        previous version reused a single errMsg variable, so a missing
        nfl_fa silently overwrote the bad-N diagnostic.)
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir, no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check inputs; collect every error instead of overwriting.
        errMsgs = []
        if N <= 0 or N > 100:
            errMsgs.append("Input file can not be splitted into %d chunks!" % N)

        if not nfs_exists(nfl_fa):
            errMsgs.append("The input non-full-length reads fasta file " +
                           "{f} does not exists. ".format(f=nfl_fa))
        if errMsgs:
            raise ValueError(" ".join(errMsgs))

        num_reads = num_reads_in_fasta(nfl_fa)
        reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 6
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
        """
        Check inputs, write $ICE_PARTIAL_PY i command to script_file
        and return (input_fasta, ref_fasta, out_pickle, done_file)
        for the i-th chunk of nfl reads.
        """
        ice_files = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                             root_dir=root_dir,
                             no_log_f=False)

        # root_dir/output/final.consensus.fasta and its dazz db.
        ref_fasta = ice_files.final_consensus_fa
        ref_dazz = ice_files.final_dazz_db
        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = ice_files.nfl_fa_i(i)
        # $input_fasta.partial_uc.pickle and its .DONE sentinel.
        out_pickle = ice_files.nfl_pickle_i(i)
        done_file = ice_files.nfl_done_i(i)
        # $input_fasta.partial_uc.sh
        script_file = ice_files.nfl_script_i(i)

        # Verify prerequisites; raise on the first one that is missing.
        required = [
            (input_fasta,
             "The {i}-th splitted non-full-length reads ".format(i=i) +
             "fasta file {f} does not exist. ".format(f=input_fasta) +
             "Please run $ICE_PARTIAL_PY split first."),
            (ref_fasta,
             "The unpolished consensus isoforms fasta file " +
             "{f} does not exist. ".format(f=ref_fasta) +
             "Please make sure ICE is successfully done."),
            (ref_dazz,
             "The dazz db " +
             "{f} does not exist. ".format(f=ref_dazz) +
             "Please make sure it is already built."),
        ]
        for path, message in required:
            if not nfs_exists(path):
                raise ValueError(message)

        # Persist the command that will process this chunk.
        cmd = self._cmd_str(root_dir=root_dir,
                            i=[i],
                            ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc,
                            tmp_dir=tmp_dir)
        with open(script_file, 'w') as script_writer:
            script_writer.write(cmd + "\n")

        ice_files.add_log(
            "Writing CMD to: {script_file}".format(script_file=script_file))
        ice_files.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
Ejemplo n.º 7
0
    def __init__(self, root_dir, fasta_filenames, ref_fasta, out_pickle,
                 sge_opts, ccs_fofn=None, tmp_dir=None):
        """
        fasta_filenames --- a list of splitted nfl fasta files.
        ref_fasta --- (unpolished) consensus isoforms
        out_pickle --- a pickle file with all nfl fasta reads
        ccs_fofn --- should be reads_of_insert.fofn or None
        root_dir --- ICE root output directory
        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory
        sge_opts --- params for SGE environment, including
                     use_sge (use SGE or not), max_sge_jobs (maximum
                     number of sub-jobs submitted), unique_id (unique
                     qsub job id -- must not conflict!) and blasr_nproc
                     (blasr -nproc param, threads per cpu).
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name,
                          root_dir=root_dir, tmp_dir=tmp_dir)

        validated = self._validate_inputs(fasta_filenames=fasta_filenames,
                                          ref_fasta=ref_fasta,
                                          ccs_fofn=ccs_fofn)
        self.fasta_filenames, self.ref_fasta, self.ccs_fofn = validated

        self.out_pickle = out_pickle
        self.sge_opts = sge_opts

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        # Record all input/output locations in the log.
        for message in ("input fasta files are: " +
                        ", ".join(self.fasta_filenames),
                        "temp pickle files are: " +
                        ", ".join(self.pickle_filenames),
                        "out pickle file is: " + self.out_pickle,
                        "temp directory is: " + str(self.tmp_dir)):
            self.add_log(message)
Ejemplo n.º 8
0
    def __init__(self, root_dir, fasta_filenames, fastq_filenames, ref_fasta,
                 out_pickle, ice_opts, sge_opts, cpus, tmp_dir=None):
        """
        fasta_filenames --- a list of splitted nfl fasta files.

        fastq_filenames --- a list of splitted nfl fastq files, or None.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!

        Raises IOError if any given fastq file does not exist.
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir, tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta)

        if fastq_filenames is not None:
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would silently skip validation.
            missing = [fq for fq in fastq_filenames if not op.exists(fq)]
            if missing:
                raise IOError("Input fastq files do not exist: " +
                              ", ".join(missing))

        self.fastq_filenames = fastq_filenames  # note: could be None

        self.out_pickle = out_pickle

        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        # Number of CPUs to use per SGE job or per local job.
        self.cpus = cpus

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Ejemplo n.º 9
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
        """
        Check inputs, write $ICE_PARTIAL_PY i command to script_file
        and return (input_fasta, ref_fasta, out_pickle, done_file)
        for the i-th chunk of nfl reads.
        """
        helper = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                          root_dir=root_dir, no_log_f=False)

        # Paths used by this chunk.
        ref_fasta = helper.final_consensus_fa    # output/final.consensus.fasta
        ref_dazz = helper.final_dazz_db          # dazz db of the above
        input_fasta = helper.nfl_fa_i(i)         # input.split_{i:03d}.fasta
        out_pickle = helper.nfl_pickle_i(i)      # ...partial_uc.pickle
        done_file = helper.nfl_done_i(i)         # ...partial_uc.pickle.DONE
        script_file = helper.nfl_script_i(i)     # ...partial_uc.sh

        # Fail fast on the first missing prerequisite.
        if not nfs_exists(input_fasta):
            raise ValueError(
                "The {i}-th splitted non-full-length reads ".format(i=i) +
                "fasta file {f} does not exist. ".format(f=input_fasta) +
                "Please run $ICE_PARTIAL_PY split first.")
        if not nfs_exists(ref_fasta):
            raise ValueError(
                "The unpolished consensus isoforms fasta file " +
                "{f} does not exist. ".format(f=ref_fasta) +
                "Please make sure ICE is successfully done.")
        if not nfs_exists(ref_dazz):
            raise ValueError(
                "The dazz db " +
                "{f} does not exist. ".format(f=ref_dazz) +
                "Please make sure it is already built.")

        # Persist the command that will run this chunk.
        cmd = self._cmd_str(root_dir=root_dir, i=[i],
                            ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc,
                            tmp_dir=tmp_dir)
        with open(script_file, 'w') as out_f:
            out_f.write(cmd + "\n")

        helper.add_log("Writing CMD to: {script_file}".
                       format(script_file=script_file))
        helper.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
Ejemplo n.º 10
0
    def run_after(self, rtc, output_dir):
        """Post-run checks for the gather-polished-isoforms task.

        Verifies that the task's first output exists, that each cluster
        bin contains its HQ and LQ isoform files, that the first LQ
        fasta is non-empty, and that each bin's submitted-quiver-jobs
        log was written.
        """
        self.assertTrue(op.exists(rtc.task.output_files[0]))

        out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
        # One cluster_out directory per bin name.
        cluster_out_dirs = [
            op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES
        ]
        # Expected high-quality isoform files, one set per bin.
        out_hq_fns = [
            op.join(d, fn) for d in cluster_out_dirs for fn in HQ_ISOFORMS_FNS
        ]
        print "out_hq_fns %s" % out_hq_fns
        self.assertTrue(all([op.exists(f) for f in out_hq_fns]))

        # Expected low-quality isoform files, one set per bin.
        out_lq_fns = [
            op.join(d, fn) for d in cluster_out_dirs for fn in LQ_ISOFORMS_FNS
        ]
        print "out_lq_fns %s" % out_lq_fns
        self.assertTrue(all([op.exists(f) for f in out_lq_fns]))

        # The first LQ fasta must contain at least one record.
        print "out_lq_fa %s is not empty" % out_lq_fns[0]
        n = len([r for r in FastaReader(out_lq_fns[0])])
        self.assertTrue(n > 0)

        # Each bin must have logged its submitted quiver jobs.
        out_logs = [
            IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log
            for d in cluster_out_dirs
        ]
        print "out_logs %s" % out_logs
        self.assertTrue(all([op.exists(f) for f in out_logs]))
Ejemplo n.º 11
0
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    tasks = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in tasks])

    bin_indices = [task.cluster_bin_index for task in tasks]
    # sanity check that Cluster indices are unique!
    assert len(set(bin_indices)) == len(bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in tasks:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            # Remove the tmp directory first, then the quivered one,
            # recording each removal in the sentinel file.
            for doomed in (icef.tmp_dir, icef.quivered_dir):
                log.info("Cleaning up, removing %s", doomed)
                writer.write("removing %s\n" % doomed)
                execute("rm -rf %s" % real_upath(doomed))
Ejemplo n.º 12
0
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn, ice_opts,
                 sge_opts, ipq_opts, fasta_fofn=None, tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.
        ipq_opts --- IceQuiverHQLQOptions:
                     qv_trim_5 / qv_trim_3: ignore QV of n bases at the
                         5' / 3' ends
                     hq_quiver_min_accuracy: minimum allowed quiver
                         accuracy to mark an isoform as high quality
                     hq|lq_isoforms_fa|fq: polished high / low quality
                         consensus isoforms in fasta|q
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)

        self.ipq_opts = ipq_opts
        self.sge_opts = sge_opts
        self.ice_opts = ice_opts
        self.nfl_fa = realpath(nfl_fa)

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        # Placeholders for sub-stage workers; assigned outside __init__.
        self.icep = None   # IceAllPartials
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Ejemplo n.º 13
0
    def __init__(self,
                 root_dir,
                 fasta_filenames,
                 ref_fasta,
                 out_pickle,
                 sge_opts,
                 ccs_fofn=None,
                 tmp_dir=None):
        """
        fasta_filenames --- a list of splitted nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        ccs_fofn --- should be reads_of_insert.fofn or None

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including use_sge,
            max_sge_jobs, unique_id (unique qsub job id -- must not
            conflict!) and blasr_nproc (blasr -nproc, threads per cpu).
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self,
                          prog_name=self.prog_name,
                          root_dir=root_dir,
                          tmp_dir=tmp_dir)

        (self.fasta_filenames,
         self.ref_fasta,
         self.ccs_fofn) = self._validate_inputs(
             fasta_filenames=fasta_filenames,
             ref_fasta=ref_fasta,
             ccs_fofn=ccs_fofn)

        self.out_pickle = out_pickle
        self.sge_opts = sge_opts

        # Create root_dir/output/map_noFL before logging chunk info.
        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: "
                     + ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: "
                     + ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Ejemplo n.º 14
0
    def run_after(self, rtc, output_dir):
        """Post-run checks for the scattered ice_partial task.

        Verifies that the task's first output exists and that every
        per-chunk nfl pickle was produced in each cluster bin.
        """
        self.assertTrue(op.exists(rtc.task.output_files[0]))

        out_dir = op.join(OUT_DIR, "test_ice_partial_cluster_bins")
        # One cluster_out directory per bin name.
        cluster_out_dirs = [
            op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES
        ]
        # One expected pickle per (bin, chunk index) pair.
        out_pickles = [
            IceFiles(prog_name="", root_dir=d).nfl_pickle_i(i=i)
            for d in cluster_out_dirs for i in range(N_NFL_CHUNKS)
        ]
        print "output scattered nfl pickles are %s" % out_pickles
        self.assertTrue(all([op.exists(f) for f in out_pickles]))
Ejemplo n.º 15
0
    def run_after(self, rtc, output_dir):
        """Post-run checks for the gather-ice-partial-pickles task.

        Verifies that the task's first output exists and that each
        cluster bin has its merged nfl.all.partial_uc.pickle.
        """
        self.assertTrue(op.exists(rtc.task.output_files[0]))

        out_dir = op.join(OUT_DIR,
                          "test_gather_ice_partial_cluster_bins_pickle")
        # One cluster_out directory per bin name.
        cluster_out_dirs = [
            op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES
        ]
        # Merged nfl pickle expected in each bin's cluster_out.
        out_pickles = [
            IceFiles(prog_name="", root_dir=d).nfl_all_pickle_fn
            for d in cluster_out_dirs
        ]
        print "output nfl pickles are %s" % out_pickles
        self.assertTrue(all([op.exists(f) for f in out_pickles]))
Ejemplo n.º 16
0
 def prefix_nfl_pickle_tuples(self):
     """Returns a list of (sample_prefix, nfl_uc_pickle) tuples.

     Raises IOError if any sample's merged nfl pickle is missing.
     """
     ret = []
     # NOTE: iteritems() is Python-2-only.
     for sample_prefix, cluster_out_d in self.prefix_dict.iteritems():
         # Strip a single trailing '|' from the sample prefix, if any.
         sample_prefix = sample_prefix if not sample_prefix.endswith(
             '|') else sample_prefix[0:-1]
         # Merged nfl pickle for this sample's cluster output dir.
         nfl_fn = IceFiles(prog_name="Count",
                           root_dir=cluster_out_d,
                           no_log_f=True).nfl_all_pickle_fn
         if not op.exists(nfl_fn):
             raise IOError(
                 "NFL pickle %s of sample prefix %s does not exist." %
                 (nfl_fn, sample_prefix))
         ret.append((sample_prefix, nfl_fn))
     return ret
Ejemplo n.º 17
0
    def run_after(self, rtc, output_dir):
        self.assertTrue(op.exists(rtc.task.output_files[i]) for i in range(7))

        out_dir = op.join(OUT_DIR, "test_combine_cluster_bins")
        cluster_out_dirs = [
            op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES
        ]

        combined_lq_cs = rtc.task.output_files[5]
        print "combined_lq_fa %s must not be empty" % combined_lq_cs
        n = len([r for r in ContigSet(combined_lq_cs)])
        self.assertTrue(n > 0)

        out_logs = [
            IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log
            for d in cluster_out_dirs
        ]
        print "out_logs %s" % out_logs
        self.assertTrue(all([op.exists(f) for f in out_logs]))
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run.

    Merges the per-chunk nfl pickles of each cluster bin into that
    bin's merged nfl pickle, recording each completed merge in the
    task's first output file.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    # groupby() below only groups *consecutive* items with equal keys,
    # so tasks must be ordered by cluster_bin_index first.
    # NOTE(review): assumes sorted_by_attr() sorts p in place -- confirm.
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            # Tasks of one bin presumably share a cluster_out_dir; the
            # first is used to locate the merged output pickle.
            out_pickle = IceFiles(prog_name="",
                                  root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.",
                      (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i,
                                out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
Ejemplo n.º 19
0
 def nfl_pickle(self):
     """Return output nfl pickle of the i-th chunk."""
     ice_files = IceFiles(prog_name="",
                          root_dir=self.cluster_out_dir,
                          no_log_f=True)
     return ice_files.nfl_pickle_i(self.nfl_index)
Ejemplo n.º 20
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir,
                         ref_fasta=None):
        """
        Check inputs, write $ICE_PARTIAL_PY i command to script_file
        and return (input_fasta, ref_fasta, out_pickle, done_file)
        for the i-th chunk of nfl reads.

        NOTE(review): the ref_fasta parameter is accepted but
        unconditionally overwritten below by icef.final_consensus_fa,
        so callers cannot actually override it -- the comment further
        down suggests this is deliberate (race-condition avoidance),
        but confirm whether the parameter should be removed.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/final.consensus.fasta
        # (clobbers the ref_fasta argument; see docstring note)
        ref_fasta = icef.final_consensus_fa
        ref_dazz = icef.final_dazz_db

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = icef.nfl_fa_i(i)

        # $input_fasta.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)

        # $input_fasta.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)

        # $input_fasta.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # Check if inputs exist. Unlike the sibling validators, a
        # failure here raises IOError (not ValueError).
        errMsg = ""
        if not nfs_exists(input_fasta):
            errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                      "fasta file {f} does not exist. ".format(f=input_fasta) +
                      "Please run $ICE_PARTIAL_PY split first.")
        elif not nfs_exists(ref_fasta):
            # ref_fasta --- root_dir/output/final.consensus.fasta
            # ref_dazz --- root_dir/output/final.consensus.dazz.fasta.db
            # ref_fasta and ref_dazz must exist if ICE has run successfully in
            # root_dir. If either one does not exist, it means ICE has not
            # successfully run in root_dir. Then we have to throw an error message
            # requring users to copy the root_dir/output directory manually,
            # rather than providing an option to overwrite ref_fasta and build
            # ref_dazz, because a race condition can happen when multiple
            # IcePartialI tasks start to run at the same time, which can corrupt
            # fasta and dazz db files and lead to unexpected runtime errors.
            errMsg = ("The unpolished consensus isoforms fasta file " +
                      "{f} does not exist. ".format(f=ref_fasta) +
                      "Please make sure ICE is successfully done in root_dir, " +
                      "or copy ICE output directory (e.g., cluster_out/output) " +
                      "to {dst}".format(dst=op.dirname(ref_fasta)))
        elif not nfs_exists(ref_dazz):
            errMsg = ("The dazz db " +
                      "{f} does not exist. ".format(f=ref_dazz) +
                      "Please make sure it is already built.")
        if len(errMsg) != 0:
            raise IOError(errMsg)

        # Save cmd to script_file.
        cmd = self._cmd_str(root_dir=root_dir, i=[i],
                            ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc,
                            tmp_dir=tmp_dir)
        with open(script_file, 'w') as writer:
            writer.write(cmd + "\n")

        icef.add_log("Writing CMD to: {script_file}".
                     format(script_file=script_file))
        icef.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
Ejemplo n.º 21
0
 def consensus_isoforms_file(self):
     """Return output consensus isoform file, cluster_out/output/final.consensus.fasta"""
     ice_files = IceFiles(root_dir=self.cluster_out_dir,
                          prog_name="",
                          no_log_f=True)
     return ice_files.final_consensus_fa
Ejemplo n.º 22
0
 def nfl_pickle(self):
     """Return output nfl pickle file, cluster_out/output/nfl.all.partial_uc.pickle
     """
     ice_files = IceFiles(prog_name="",
                          root_dir=self.cluster_out_dir,
                          no_log_f=True)
     return ice_files.nfl_all_pickle_fn
Ejemplo n.º 23
0
 def flnc_pickle(self):
     """Return output flnc pickle file, cluster_out/output/final.pickle"""
     ice_files = IceFiles(root_dir=self.cluster_out_dir,
                          prog_name="",
                          no_log_f=True)
     return ice_files.final_pickle_fn