Code example #1
File: IceAllPartials.py Project: avrajit/cDNA_primer
    def combinePickles(self, pickle_filenames, out_pickle):
        """Combine all *.pickle files to one and dump to self.out_pickle."""
        self.add_log("Combining pickles: {ps} to a big pickle {p}".format(
                     ps=", ".join(pickle_filenames), p=out_pickle),
                     level=logging.INFO)
        if len(pickle_filenames) == 1:
            src = pickle_filenames[0]
            dst = out_pickle

            if (realpath(src) != realpath(dst)):
                self.add_log("Copying {src} to {dst}.".format(src=src, dst=dst))
                shutil.copyfile(src, dst)
            else:
                self.add_log("{dst} has been created, no need to merge.".
                    format(dst=out_pickle))
        else:
            # Combine all partial outputs
            self.add_log("Merging all pickles.")
            partial_uc = defaultdict(lambda: [])
            nohit = set()
            for pf in pickle_filenames:
                self.add_log("Merging {pf}.".format(pf=pf))
                a = load(open(pf))
                nohit.update(a['nohit'])
                for k, v in a['partial_uc'].iteritems():
                    partial_uc[k] += v

            self.add_log("Dumping all to {f}".format(f=out_pickle))
            # Dump to one file
            partial_uc = dict(partial_uc)
            with open(out_pickle, 'w') as f:
                dump({'nohit':nohit, 'partial_uc':partial_uc}, f)
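The example above is Python 2: dict.iteritems() and text-mode pickle files were dropped in Python 3. A minimal self-contained sketch of the same merge logic for Python 3, assuming (as above) that each input pickle holds a dict with a 'nohit' set and a 'partial_uc' dict of lists:

import pickle
from collections import defaultdict

def merge_partial_pickles(pickle_filenames, out_pickle):
    """Merge {'nohit': set, 'partial_uc': dict of lists} pickles (sketch)."""
    partial_uc = defaultdict(list)
    nohit = set()
    for pf in pickle_filenames:
        with open(pf, 'rb') as f:      # pickle requires binary mode in Python 3
            a = pickle.load(f)
        nohit.update(a['nohit'])
        for k, v in a['partial_uc'].items():  # items() replaces iteritems()
            partial_uc[k] += v
    with open(out_pickle, 'wb') as f:
        pickle.dump({'nohit': nohit, 'partial_uc': dict(partial_uc)}, f)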
Code example #2
File: IceUtils.py Project: ksahlin/cDNA_primer
def combine_nfl_pickles(splitted_pickles, out_pickle):
    """Combine splitted nfl pickles to a big pickle."""
    logging.debug("Cominbing {N} nfl pickles: {ps} ".
                  format(N=len(splitted_pickles),
                         ps=",".join(splitted_pickles)) +
                  " into a big pickle {p}.".format(p=out_pickle))

    if len(splitted_pickles) == 1:
        logging.debug("Copying the only given pickle to out_pickle.")
        if realpath(splitted_pickles[0]) != realpath(out_pickle):
            shutil.copyfile(splitted_pickles[0], out_pickle)
    else:
        # Combine all partial outputs
        logging.debug("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in splitted_pickles:
            logging.debug("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        logging.debug("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
        logging.debug("{f} created.".format(f=out_pickle))
Code example #3
File: Cluster.py Project: WenchaoLin/cDNA_primer
    def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn, fasta_fofn=None, quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterException(
                "Input full-length non-chimeric reads " + "files (i.e., flnc_fa) needs to be specified."
            )
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " + "non-chimeric reads: {fn}".format(fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterException(
                    "Input non-full-length reads file " + "(i.e., nfl_fa) needs to be specified for isoform polish."
                )
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " + "non-chimeric reads: {fn}".format(fn=nfl_fa))

        if ccs_fofn is not None and not op.exists(ccs_fofn):
            raise ClusterException("Unable to find CCS FOFN file: " + "{fn}".format(fn=ccs_fofn))

        if fasta_fofn is not None and quiver:
            if not os.path.exists(fasta_fofn):
                raise ClusterException("Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
            for line in open(fasta_fofn):
                if not os.path.exists(line.strip()):
                    raise ClusterException("Unable to locate {0} in {1}".format(line.strip(), fasta_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
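Note that the FOFN loop above raises on blank lines, because line.strip() is then an empty string and os.path.exists('') is False. A sketch of the same loop with a guard (the tofu_wrap example later in this listing uses the same len(...) > 0 idea):

        for line in open(fasta_fofn):
            fn = line.strip()
            if fn and not os.path.exists(fn):  # skip blank lines before checking
                raise ClusterException("Unable to locate {0} in {1}".format(fn, fasta_fofn))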
Code example #4
File: ClusterTest.py Project: wangyuu/cDNA_primer
    def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn, fasta_fofn=None, quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterTestException("Input full-length non-chimeric reads " +
                    "files (i.e., flnc_fa) needs to be specified.")
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterTestException("Unable to find full-length " +
                    "non-chimeric reads: {fn}".format(fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterTestException("Input non-full-length reads file " +
                    "(i.e., nfl_fa) needs to be specified for isoform polish.")
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterTestException("Unable to find non-full-length " +
                    "non-chimeric reads: {fn}".format(fn=nfl_fa))

        if ccs_fofn is not None and not op.exists(ccs_fofn):
            raise ClusterTestException("Unable to find CCS FOFN file: " +
                                   "{fn}".format(fn=ccs_fofn))

        if fasta_fofn is not None and quiver:
            if not os.path.exists(fasta_fofn):
                raise ClusterTestException("Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
            for line in open(fasta_fofn):
                if not os.path.exists(line.strip()):
                    raise ClusterTestException("Unable to locate {0} in {1}".format(\
                            line.strip(), fasta_fofn))
        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
Code example #5
    def combinePickles(self, pickle_filenames, out_pickle):
        """Combine all *.pickle files to one and dump to self.out_pickle."""
        self.add_log("Combining pickles: {ps} to a big pickle {p}".format(
            ps=", ".join(pickle_filenames), p=out_pickle),
                     level=logging.INFO)
        if len(pickle_filenames) == 1:
            src = pickle_filenames[0]
            dst = out_pickle

            if (realpath(src) != realpath(dst)):
                self.add_log("Copying {src} to {dst}.".format(src=src,
                                                              dst=dst))
                shutil.copyfile(src, dst)
            else:
                self.add_log(
                    "{dst} has been created, no need to merge.".format(
                        dst=out_pickle))
        else:
            # Combine all partial outputs
            self.add_log("Merging all pickles.")
            partial_uc = defaultdict(lambda: [])
            nohit = set()
            for pf in pickle_filenames:
                self.add_log("Merging {pf}.".format(pf=pf))
                a = load(open(pf))
                nohit.update(a['nohit'])
                for k, v in a['partial_uc'].iteritems():
                    partial_uc[k] += v

            self.add_log("Dumping all to {f}".format(f=out_pickle))
            # Dump to one file
            partial_uc = dict(partial_uc)
            with open(out_pickle, 'w') as f:
                dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
Code example #6
File: Cluster.py Project: yimsea/cDNA_primer
    def __init__(self,
                 root_dir,
                 flnc_fa,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 out_fa,
                 sge_opts,
                 ice_opts,
                 ipq_opts,
                 report_fn=None,
                 summary_fn=None,
                 fasta_fofn=None,
                 nfl_reads_per_split=30000):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn,
                                      fasta_fofn=fasta_fofn)

        self.sge_opts = sge_opts  # SGE, CPU arguments, etc.
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments
        self.nfl_reads_per_split = int(
            nfl_reads_per_split)  # ToDo: set sanity check here?

        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = self._validate_inputs(
            flnc_fa,
            nfl_fa,
            ccs_fofn,
            fasta_fofn=fasta_fofn,
            quiver=self.ice_opts.quiver)

        self.add_log("DEBUG: self.fasta_fofn is {0}".format(self.fasta_fofn))

        self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
            else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
            else op.join(self.root_dir, "cluster_summary.txt")

        self.add_log("A Cluster Object created.", level=logging.INFO)
Code example #7
File: Cluster.py Project: WenchaoLin/cDNA_primer
    def __init__(
        self,
        root_dir,
        flnc_fa,
        nfl_fa,
        bas_fofn,
        ccs_fofn,
        out_fa,
        sge_opts,
        ice_opts,
        ipq_opts,
        report_fn=None,
        summary_fn=None,
        fasta_fofn=None,
        nfl_reads_per_split=30000,
    ):
        super(Cluster, self).__init__(
            prog_name="Cluster", root_dir=root_dir, bas_fofn=bas_fofn, ccs_fofn=ccs_fofn, fasta_fofn=fasta_fofn
        )

        self.sge_opts = sge_opts  # SGE, CPU arguments, etc.
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments
        self.nfl_reads_per_split = int(nfl_reads_per_split)  # ToDo: set sanity check here?

        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = self._validate_inputs(
            flnc_fa, nfl_fa, ccs_fofn, fasta_fofn=fasta_fofn, quiver=self.ice_opts.quiver
        )

        self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.add_log(
            "Setting ece_penalty: {0} ece_min_len: {1}".format(ice_opts.ece_penalty, ice_opts.ece_min_len),
            level=logging.INFO,
        )

        self.report_fn = realpath(report_fn) if report_fn is not None else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = (
            realpath(summary_fn) if summary_fn is not None else op.join(self.root_dir, "cluster_summary.txt")
        )

        self.add_log("A Cluster Object created.", level=logging.INFO)
Code example #8
    def __init__(self,
                 root_dir,
                 flnc_fa,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 out_fa,
                 sge_opts,
                 ice_opts,
                 hq_isoforms_fa=None,
                 hq_isoforms_fq=None,
                 lq_isoforms_fa=None,
                 lq_isoforms_fq=None,
                 report_fn=None,
                 summary_fn=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn)

        self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
            flnc_fa, nfl_fa, ccs_fofn)

        self.root_dir, self.out_fa = self._validateOutputs(root_dir, out_fa)

        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq

        self.sge_opts = sge_opts  # SGE, CPU options, etc.
        self.ice_opts = ice_opts  # The ICE clustering algorithm options

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
                else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
                else op.join(self.root_dir, "cluster_summary.txt")

        self.summary = ClusterSummary()

        self.add_log("Finishing create Cluster Object.", level=logging.INFO)
Code example #9
 def _validateInputs(self, fasta_filenames, ref_fasta, ccs_fofn, sa_file):
     """Validate input files."""
     for f in fasta_filenames:
         if not op.exists(f):
             raise IOError("Input fasta {f} does not exist.".format(f=f))
     if ref_fasta is None or not op.exists(ref_fasta):
         raise IOError("Reference {r} does not exist.".format(r=ref_fasta))
     if ccs_fofn is not None and not op.exists(ccs_fofn):
         raise IOError("ccs_fofn file {ccs_fofn} does not exist.".format(
             ccs_fofn=ccs_fofn))
     if sa_file is not None and not op.exists(sa_file):
         raise IOError("suffix array {s} does not exist.".format(s=sa_file))
     return ([realpath(f) for f in fasta_filenames], realpath(ref_fasta),
             realpath(ccs_fofn),
             realpath(sa_file) if sa_file is not None else None)
Code example #10
File: Polish.py Project: avrajit/cDNA_primer
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, hq_isoforms_fa=None, hq_isoforms_fq=None,
                 lq_isoforms_fa=None, lq_isoforms_fq=None, fasta_fofn=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        hq_isoforms_fa|fq  --- polished, high quality consensus isoforms in fasta|q
        lq_isoforms_fa|fq  --- polished, low quality consensus isoforms in fasta|q
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)
        self.nfl_fa = realpath(nfl_fa)
        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IcePostQuiver
        self._nfl_splitted_fas = None

        self._validate_inputs()
Code example #11
File: IceAllPartials.py Project: ksahlin/cDNA_primer
 def _validate_inputs(self, fastq_filenames, ref_fasta, ccs_fofn, sa_file):
     """Validate input files."""
     for f in fastq_filenames:
         if not op.exists(f):
             raise IOError("Input fastq {f} does not exist.".format(f=f))
     if ref_fasta is None or not op.exists(ref_fasta):
         raise IOError("Reference {r} does not exist.".format(r=ref_fasta))
     if ccs_fofn is not None and not op.exists(ccs_fofn):
         raise IOError("ccs_fofn file {ccs_fofn} does not exist.".format(
             ccs_fofn=ccs_fofn))
     if sa_file is not None and not op.exists(sa_file):
         raise IOError("suffix array {s} does not exist.".format(s=sa_file))
     return ([realpath(f) for f in fastq_filenames],
             realpath(ref_fasta),
             realpath(ccs_fofn),
             realpath(sa_file) if sa_file is not None else None)
Code example #12
File: IceUtils.py Project: ksahlin/cDNA_primer
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testInFa = op.join(testDir, "gcon_in.fa")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert(op.exists(testInFa))

    obj = DazzIDHandler(testInFa)
    DalignerRunner.make_db(obj.dazz_filename)
    runner = DalignerRunner(testInFa, testInFa, is_FL=True, same_strand_only=True, \
                            query_converted=True, db_converted=True, query_made=True, \
                            db_made=True, use_sge=False, cpus=4, sge_opts=None)
    runner.runHPC(min_match_len=300, output_dir=testDir, sensitive_mode=False)

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
Code example #13
File: Cluster.py Project: avrajit/cDNA_primer
    def __init__(self, root_dir, flnc_fa, nfl_fa,
                 bas_fofn, ccs_fofn, out_fa,
                 sge_opts, ice_opts,
                 hq_isoforms_fa=None, hq_isoforms_fq=None,
                 lq_isoforms_fa=None, lq_isoforms_fq=None,
                 report_fn=None, summary_fn=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                root_dir=root_dir, bas_fofn=bas_fofn, ccs_fofn=ccs_fofn)

        self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
            flnc_fa, nfl_fa, ccs_fofn)

        self.root_dir, self.out_fa = self._validateOutputs(
            root_dir, out_fa)

        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq

        self.sge_opts = sge_opts  # SGE, CPU options, etc.
        self.ice_opts = ice_opts  # The ICE clustering algorithm options

        self.sanity_check()

        self._probqv = None     # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()      # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.report_fn = realpath(report_fn) if report_fn is not None \
                else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
                else op.join(self.root_dir, "cluster_summary.txt")

        self.summary = ClusterSummary()

        self.add_log("Finishing create Cluster Object.", level=logging.INFO)
Code example #14
File: Cluster.py Project: avrajit/cDNA_primer
 def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
     """Validate input files and return absolute expaneded paths."""
     flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
     self.add_log("Checking input files.", level=logging.INFO)
     if flnc_fa is None or nfl_fa is None:
         raise ClusterException(
             "Input non-chimeric reads files needs to be specified.")
     else:
         flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
         if not op.exists(flnc_fa):
             raise ClusterException("Unable to find full-length " +
                 "non-chimeric reads: {fn}".format(fn=flnc_fa))
         if not op.exists(nfl_fa):
             raise ClusterException("Unable to find non-full-length " +
                 "non-chimeric reads: {fn}".format(fn=nfl_fa))
         if ccs_fofn is not None and not op.exists(ccs_fofn):
             raise ClusterException("Unable to find FOFN file: " +
                 "{fn}".format(fn=ccs_fofn))
     return (flnc_fa, nfl_fa, ccs_fofn)
Code example #15
File: IceAllPartials.py Project: avrajit/cDNA_primer
    def run(self):
        """Assigning nfl reads to consensus isoforms and merge."""
        # Call ice_partial.py to create a pickle for each splitted nfl fasta
        self.createPickles()
        # Wait for pickles to be created, if SGE is used.
        self.waitForPickles(pickle_filenames=self.pickle_filenames,
                            done_filenames=self.done_filenames)
        # Combine all pickles to a big pickle file: nfl_all_pickle_fn.
        self.combinePickles(pickle_filenames=self.pickle_filenames,
                            out_pickle=self.nfl_all_pickle_fn)
        # Create symbolic link if necessary
        if realpath(self.nfl_all_pickle_fn) != realpath(self.out_pickle):
            self.add_log("Creating a symbolic link for {f}".format(
                f=self.out_pickle), level=logging.INFO)
            if op.exists(self.out_pickle):
                os.remove(self.out_pickle)
            os.symlink(self.nfl_all_pickle_fn, self.out_pickle)

        # Close log
        self.close_log()
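The link-refresh idiom at the end (compare realpath()s, remove any stale file, then symlink) is worth isolating; a stdlib-only sketch with hypothetical names:

import os
from os.path import realpath

def refresh_symlink(target, link_name):
    """Point link_name at target, replacing any stale file or link (sketch)."""
    if realpath(target) == realpath(link_name):
        return  # both resolve to the same file; nothing to do
    if os.path.lexists(link_name):  # lexists() also catches broken links
        os.remove(link_name)
    os.symlink(target, link_name)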
Code example #16
File: Cluster.py Project: WenchaoLin/cDNA_primer
    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.", level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.", level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log("Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException("Consensus FASTA file {f} already exists.".format(f=out_fa))
        return root_dir, out_fa
Code example #17
    def run(self):
        """Assigning nfl reads to consensus isoforms and merge."""
        # Call ice_partial.py to create a pickle for each splitted nfl fasta
        self.createPickles()
        # Wait for pickles to be created, if SGE is used.
        self.waitForPickles(pickle_filenames=self.pickle_filenames,
                            done_filenames=self.done_filenames)
        # Combine all pickles to a big pickle file: nfl_all_pickle_fn.
        self.combinePickles(pickle_filenames=self.pickle_filenames,
                            out_pickle=self.nfl_all_pickle_fn)
        # Create symbolic link if necessary
        if realpath(self.nfl_all_pickle_fn) != realpath(self.out_pickle):
            self.add_log(
                "Creating a symbolic link for {f}".format(f=self.out_pickle),
                level=logging.INFO)
            if op.exists(self.out_pickle):
                os.remove(self.out_pickle)
            os.symlink(self.nfl_all_pickle_fn, self.out_pickle)

        # Close log
        self.close_log()
Code example #18
 def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
     """Validate input files and return absolute expaneded paths."""
     flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
     self.add_log("Checking input files.", level=logging.INFO)
     if flnc_fa is None or nfl_fa is None:
         raise ClusterException(
             "Input non-chimeric reads files needs to be specified.")
     else:
         flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
         if not op.exists(flnc_fa):
             raise ClusterException("Unable to find full-length " +
                                    "non-chimeric reads: {fn}".format(
                                        fn=flnc_fa))
         if not op.exists(nfl_fa):
             raise ClusterException("Unable to find non-full-length " +
                                    "non-chimeric reads: {fn}".format(
                                        fn=nfl_fa))
         if ccs_fofn is not None and not op.exists(ccs_fofn):
             raise ClusterException("Unable to find FOFN file: " +
                                    "{fn}".format(fn=ccs_fofn))
     return (flnc_fa, nfl_fa, ccs_fofn)
Code example #19
File: IceUtils.py Project: ksahlin/cDNA_primer
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fa")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert(op.exists(testInFa))

    with open(testSh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("{gcon}".format(gcon=gcon_py) +
                " {inFa} ".format(inFa=real_upath(testInFa)) +
                " {testDir}/g_consensus".format(testDir=real_upath(testDir)) +
                " c1\n")

    assert(op.exists(testSh))
    cmd = "qsub"
    if sge_opts.sge_queue is not None:
        cmd += " -q " + sge_opts.sge_queue
    cmd += " -sync y -pe {env} 1 -cwd -S /bin/bash -V -e /dev/null -o /dev/null {t}".\
          format(t=real_upath(testSh), env=sge_opts.sge_env_name)
    logging.info("Submitting cmd: " + cmd)
    _out, _code, _msg = backticks(cmd)

#    answer = FastaReader(GCON_OUT_FA).__iter__().next()
#    tester = FastaReader(consensusFa).__iter__().next()
#
#    if answer.name != tester.name or \
#       answer.sequence != tester.sequence:
    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True
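backticks() here appears to be a small helper that runs a shell command and returns its output, exit code, and error message; a stdlib-only equivalent (a sketch, not the project's helper) could look like:

import subprocess

def run_cmd(cmd):
    """Run a shell command; return (stdout lines, exit code, stderr) -- sketch."""
    p = subprocess.Popen(cmd, shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    return out.splitlines(), p.returncode, err

Note that the example never inspects _code before calling filecmp.cmp, so a failed qsub surfaces only when the consensus file turns out missing or different.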
Code example #20
File: Cluster.py Project: yimsea/cDNA_primer
    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log(
                "Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException(
                "Consensus FASTA file {f} already exists.".format(f=out_fa))
        return root_dir, out_fa
Code example #21
File: Polish.py Project: yimsea/cDNA_primer
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 ipq_opts,
                 fasta_fofn=None,
                 nfl_reads_per_split=30000):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)
        self.nfl_fa = realpath(nfl_fa)
        self.nfl_reads_per_split = nfl_reads_per_split
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.icep = None  # IceAllPartials.
        self.iceq = None  # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Code example #22
File: IceFiles.py Project: gonzalezibeas/cDNA_primer
    def __init__(self, prog_name, root_dir,
                 bas_fofn=None, ccs_fofn=None, fasta_fofn=None):
        """
        prog_name --- name of a sub-class
        root_dir --- root directory of the whole project. There will be
                     sub-directories under it, including:
                     tmp/ --- 0/  c0, c1, ..., c9999
                          --- 1/  c10000, c10001, ..., c19999
                          ...
                          each c? folder contains data for a cluster id=c?
                     script/
                          --- 0/  gcon_job_?.sh, gcon jobs in the first iteration
                          --- 1/  gcon_job_?.sh, gcon jobs in the second iteration
                          ...
                     log/
                          --- ICE.log   Log of the ICE algorithm
                          --- 0/  log for jobs in the first iteration
                          ...
                     output/   output files go here.
        bas_fofn --- input.fofn which contains movie.bas|bax.h5 files.
        ccs_fofn --- a fofn which contains movie.ccs.h5 files.
        fasta_fofn --- a fofn which contains movie.bax.h5.fasta files.
        """
        self.prog_name = str(prog_name)
        self.root_dir = realpath(root_dir)

        self.bas_fofn = bas_fofn
        self.ccs_fofn = ccs_fofn
        self.fasta_fofn = fasta_fofn

        mkdir(self.root_dir)
        mkdir(self.tmp_dir)
        mkdir(self.log_dir)
        mkdir(self.script_dir)
        mkdir(self.out_dir)

        self.log_f = open(self.log_fn, 'w', 0)
        self.add_log(msg= "{p} initiated".format(p=self.prog_name))
Code example #23
File: Polish.py Project: gonzalezibeas/cDNA_primer
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 hq_isoforms_fa=None,
                 hq_isoforms_fq=None,
                 lq_isoforms_fa=None,
                 lq_isoforms_fq=None,
                 fasta_fofn=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        hq_isoforms_fa|fq  --- polished, high quality consensus isoforms in fasta|q
        lq_isoforms_fa|fq  --- polished, low quality consensus isoforms in fasta|q
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)
        self.nfl_fa = realpath(nfl_fa)
        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts

        self.icep = None  # IceAllPartials.
        self.iceq = None  # IceQuiver
        self.icepq = None  # IcePostQuiver
        self._nfl_splitted_fas = None

        self._validate_inputs()
Code example #24
File: Polish.py Project: 52teth/cDNA_primer
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
            ice_opts, sge_opts, ipq_opts, fasta_fofn=None, nfl_reads_per_split=30000):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        #self.add_log("DEBUG: in Polish ccs_fofn is {0}".format(self.ccs_fofn))
        #self.add_log("DEBUG: in Polish fasta_fofn is {0}".format(self.fasta_fofn))
        #self.add_log("DEBUG: in Polish bas_fofn is {0}".format(self.bas_fofn))
        self.nfl_fa = realpath(nfl_fa)
        self.nfl_reads_per_split = nfl_reads_per_split
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Code example #25
    def __init__(self, reads_fn="test.fa", out_dir="output/",
                 out_reads_fn="testout.fa", primer_fn=None,
                 primer_report_fn=None, summary_fn=None,
                 cpus=1, change_read_id=True,
                 opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False),
                 out_nfl_fn=None, out_flnc_fn=None,
                 ignore_polyA=False, reuse_dom=False):
        self.reads_fn = realpath(reads_fn)
        self.out_dir = realpath(out_dir)
        self.cpus = cpus
        self.change_read_id = change_read_id
        self.chimera_detection_opts = opts
        self.ignore_polyA = ignore_polyA
        self.reuse_dom = reuse_dom

        # The input primer file: primers.fa
        self.primer_fn = primer_fn if primer_fn is not None else \
            op.join(self.data_dir, PRIMERFN)
        # The output fasta file.
        self.out_all_reads_fn = realpath(out_reads_fn)

        # Intermediate output fasta file before chimera detection.
        #     trimmed full-length reads: fl.trimmed.fasta
        # and
        #     trimmed non-full-length reads: nfl.trimmed.fasta
        self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")
        self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta")

        self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
        self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

        # The output primer file: primer_info.csv
        self.primer_report_fn = primer_report_fn \
            if primer_report_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
        # primer reports for nfl reads before chimera detection. Note that
        # chimera detection is not necessary for nfl reads.
        self._primer_report_nfl_fn = op.join(self.out_dir,
                                             "primer_report.nfl.csv")
        # primer reports for fl reads after chimera detection. Note that
        # chimera detection is required for fl reads.
        self._primer_report_fl_fn = op.join(self.out_dir,
                                            "primer_report.fl.csv")

        # The matrix file: PBMATRIX.txt
        self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

        # The output phmmer Dom file for trimming primers: hmmer.front_end.dom
        self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
        # The output phmmer Dom file for chimera detection:
        #     hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom
        self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN)
        self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN)

        self.chunked_front_back_reads_fns = None
        self.chunked_front_back_dom_fns = None

        #self.chunked_trimmed_reads_fns = None
        #self.chunked_trimmed_reads_dom_fns = None

        # The summary file: *.classify_summary.txt
        self.summary = ClassifySummary()
        self.summary_fn = summary_fn if summary_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + \
            "." + CLASSIFYSUMMARY

        self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
            else op.join(self.out_dir, "nfl.fasta")
        self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta")
        self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta")

        self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
            else op.join(self.out_dir, "flnc.fasta")
        self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
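The repeated ".".join(out_reads_fn.split('.')[:-1]) idiom strips the file extension by hand (and yields an empty string when the name has no dot); a sketch of the usual stdlib spelling:

        base, _ext = op.splitext(out_reads_fn)  # "testout.fa" -> ("testout", ".fa")
        primer_report_fn = base + "." + PRIMERREPORTFN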
Code example #26
File: tofu_wrap.py Project: ksahlin/cDNA_primer
def tofu_wrap_main():
    parser = argparse.ArgumentParser(prog='tofu_wrap')
    add_cluster_arguments(parser, show_sge_env_name=True, show_sge_queue=True)

    parser.add_argument("--bin_size_kb", default=1, type=int, help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None, help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true", help="Instead of binning by size, bin by primer (overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int, help="Maximum number of bases per partitioned bin, in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19", help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db", default="/home/UNIXHOME/etseng/share/gmap_db_new/", help="GMAP DB location (default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None, help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--max_fuzzy_junction", default=5, type=int, help="Max fuzzy junction (default: 5 bp)")
    parser.add_argument("--version", action='version', version='%(prog)s ' + str(get_version()))
    args = parser.parse_args()

    # PRINT VERSION AND EXIT
#    if args.version:
#        print >> sys.stderr, get_version()
#        sys.exit(0)
    # DEBUG
    if args.mem_debug:
        from memory_profiler import memory_usage
    
    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    tofu_prefix = binascii.b2a_hex(os.urandom(3)) if args.output_seqid_prefix is None else args.output_seqid_prefix

    ice_opts = IceOptions(quiver=args.quiver,
            use_finer_qv=args.use_finer_qv,
            targeted_isoseq=args.targeted_isoseq,
            ece_penalty=args.ece_penalty,
            ece_min_len=args.ece_min_len,
    )
    sge_opts = SgeOptions(unique_id=args.unique_id,
            use_sge=args.use_sge,
            max_sge_jobs=args.max_sge_jobs,
            blasr_nproc=args.blasr_nproc,
            quiver_nproc=args.quiver_nproc,
            gcon_nproc=args.gcon_nproc,
            sge_env_name=args.sge_env_name,
            sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
            qv_trim_3=args.qv_trim_3,
            hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
            hq_isoforms_fa=args.hq_isoforms_fa,
            hq_isoforms_fq=args.hq_isoforms_fq,
            lq_isoforms_fa=args.lq_isoforms_fa,
            lq_isoforms_fq=args.lq_isoforms_fq)

    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(\
            args.qv_trim_5,args.qv_trim_3,args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa, os.path.abspath(args.root_dir))
    else:
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir, bin_size_kb=args.bin_size_kb, bin_manual=bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                            out_filename=args.fasta_fofn,
                            fasta_out_dir=nfl_dir,
                            cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception, "fasta_fofn {0} does not exist!".format(args.fasta_fofn)
        for line in open(args.fasta_fofn):
            file = line.strip()
            if len(file) > 0 and not os.path.exists(file):
                raise Exception, "File {0} does not exists in {1}".format(file, args.fasta_fofn)

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.abspath(os.path.dirname(cur_file))
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)
        
        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()

        obj = Cluster(root_dir=cur_dir,
                flnc_fa=cur_file,
                nfl_fa=realpath(args.nfl_fa),
                bas_fofn=realpath(args.bas_fofn),
                ccs_fofn=realpath(args.ccs_fofn),
                fasta_fofn=realpath(args.fasta_fofn),
                out_fa=cur_out_cons,
                sge_opts=sge_opts,
                ice_opts=ice_opts,
                ipq_opts=ipq_opts,
                report_fn=args.report_fn,
                summary_fn=args.summary_fn,
                nfl_reads_per_split=args.nfl_reads_per_split)
        
        # DEBUG
        if args.mem_debug: 
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(cur_dir, end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)
    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
            combine_quiver_results(split_dirs, combined_dir, quiver_hq_filename, quiver_lq_filename,\
            tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined', 'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)
    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db, args.gmap_name, cpus=args.blasr_nproc, max_fuzzy_junction=args.max_fuzzy_junction, dun_merge_5_shorter=True)
    # (6) make abundance 
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)
    # (7) run filtering & removing subsets in no5merge
    if args.targeted_isoseq:
        run_filtering_by_count(collapse_prefix_hq, collapse_prefix_hq+'.min_fl_5', min_count=5)
        run_filtering_away_subsets(collapse_prefix_hq+'.min_fl_5', collapse_prefix_hq+'.min_fl_5.filtered', args.max_fuzzy_junction)
    else:
        run_filtering_by_count(collapse_prefix_hq, collapse_prefix_hq+'.min_fl_2', min_count=2)
        run_filtering_away_subsets(collapse_prefix_hq+'.min_fl_2', collapse_prefix_hq+'.min_fl_2.filtered', args.max_fuzzy_junction)
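One fragile spot above: --bin_manual is parsed with eval(), which will execute arbitrary code from the command line. A safer stdlib sketch:

import ast

# "(1,2,3,5)" -> (1, 2, 3, 5); raises ValueError on anything but a literal
bin_manual = (ast.literal_eval(args.bin_manual)
              if args.bin_manual is not None else None)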
Code example #27
File: IcePartial.py Project: ksahlin/cDNA_primer
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fastq file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fastq = realpath(input_fastq)
    input_fasta = input_fastq[:input_fastq.rfind('.')] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False, \
                            query_converted=True, db_converted=True, query_made=False, \
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(min_match_len=_low, output_dir=output_dir, sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
#        if ccs_fofn is None:
#            logging.info("Loading probability from model (0.01,0.07,0.06)")
#            probqv = ProbFromModel(.01, .07, .06)
#        else:
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
# --------- comment out below since we are just using FASTQ / BAM
#            if use_finer_qv:
#                probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
#                logging.info("Loading QVs from {i} + {f} took {s} secs".format(f=ccs_fofn, i=input_fasta,\
#                    s=time.time()-start_t))
#            else:
#                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
#                logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
#                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
#                probqv = ProbFromFastq(input_fastq)
#                logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
#                print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=_ece_min_len,
                                 same_strand_only=False,
                                 no_qv_or_aln_checking=no_qv_or_aln_checking,
                                 max_missed_start=_ignore5,
                                 max_missed_end=_ignore3)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(las_out_filename, time.time()-start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for fn in las_filenames:
        os.remove(fn)
    for fn in las_out_filenames:
        os.remove(fn)
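
Every pickle produced by the functions in this section follows the same two-key contract: {'partial_uc': {cluster_id: [read_ids]}, 'nohit': set(read_ids)}. The sketch below shows how a downstream consumer might load and inspect one; the path is hypothetical, and the standard pickle module is assumed to be compatible with the load/dump calls used here.

from pickle import load

# Hypothetical output path; any pickle dumped above has this layout.
with open('output/input.split_00.pickle') as f:
    result = load(f)

partial_uc = result['partial_uc']   # {cluster_id: [read_id, ...]}
nohit = result['nohit']             # set of read ids with no hit

print "clusters with partial-read support:", len(partial_uc)
print "reads with no hit:", len(nohit)
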
Code example #31
File: tofu_wrap.py Project: wangyuu/cDNA_primer
def tofu_wrap_main():
    parser = argparse.ArgumentParser(prog='tofu_wrap')
    add_cluster_arguments(parser, show_sge_env_name=True, show_sge_queue=True)

    parser.add_argument("--bin_size_kb", default=1, type=int, help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None, help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true", help="Instead of binning by size, bin by primer (overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int, help="Maximum number of bases per partitioned bin, in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19", help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db", default="/home/UNIXHOME/etseng/share/gmap_db_new/", help="GMAP DB location (default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None, help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--max_fuzzy_junction", default=5, type=int, help="Max fuzzy junction (default: 5 bp)")
    parser.add_argument("--version", action='version', version='%(prog)s ' + str(get_version()))
    args = parser.parse_args()

    # PRINT VERSION AND EXIT
#    if args.version:
#        print >> sys.stderr, get_version()
#        sys.exit(0)
    # DEBUG
    if args.mem_debug:
        from memory_profiler import memory_usage
    
    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    tofu_prefix = binascii.b2a_hex(os.urandom(3)) if args.output_seqid_prefix is None else args.output_seqid_prefix

    ice_opts = IceOptions(quiver=args.quiver,
            use_finer_qv=args.use_finer_qv,
            targeted_isoseq=args.targeted_isoseq,
            ece_penalty=args.ece_penalty,
            ece_min_len=args.ece_min_len,
    )
    sge_opts = SgeOptions(unique_id=args.unique_id,
            use_sge=args.use_sge,
            max_sge_jobs=args.max_sge_jobs,
            blasr_nproc=args.blasr_nproc,
            quiver_nproc=args.quiver_nproc,
            gcon_nproc=args.gcon_nproc,
            sge_env_name=args.sge_env_name,
            sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
            qv_trim_3=args.qv_trim_3,
            hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
            hq_isoforms_fa=args.hq_isoforms_fa,
            hq_isoforms_fq=args.hq_isoforms_fq,
            lq_isoforms_fa=args.lq_isoforms_fa,
            lq_isoforms_fq=args.lq_isoforms_fq)

    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(\
            args.qv_trim_5,args.qv_trim_3,args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa, os.path.abspath(args.root_dir))
    else:
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir, bin_size_kb=args.bin_size_kb, bin_manual=bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                            out_filename=args.fasta_fofn,
                            fasta_out_dir=nfl_dir,
                            cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        for line in open(args.fasta_fofn):
            fn = line.strip()
            if len(fn) > 0 and not os.path.exists(fn):
                raise Exception("File {0} listed in {1} does not exist".format(fn, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.abspath(os.path.dirname(cur_file))
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)
        
        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()

        obj = Cluster(root_dir=cur_dir,
                flnc_fa=cur_file,
                nfl_fa=realpath(args.nfl_fa),
                bas_fofn=realpath(args.bas_fofn),
                ccs_fofn=realpath(args.ccs_fofn),
                fasta_fofn=realpath(args.fasta_fofn),
                out_fa=cur_out_cons,
                sge_opts=sge_opts,
                ice_opts=ice_opts,
                ipq_opts=ipq_opts,
                report_fn=args.report_fn,
                summary_fn=args.summary_fn,
                nfl_reads_per_split=args.nfl_reads_per_split)
        
        # DEBUG
        if args.mem_debug: 
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(cur_dir, end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)
    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
            combine_quiver_results(split_dirs, combined_dir, quiver_hq_filename, quiver_lq_filename,\
            tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined', 'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)
    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db, args.gmap_name, cpus=args.blasr_nproc, max_fuzzy_junction=args.max_fuzzy_junction)
    # (6) make abundance 
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)
    # (7) run filtering
    run_filtering_by_count(collapse_prefix_hq, collapse_prefix_hq+'.min_fl_2', min_count=2)
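
Step (1) above parses --bin_manual with eval(), which will execute arbitrary input. A safer drop-in sketch, using ast.literal_eval and a hypothetical helper name (parse_bin_manual is not part of the project), accepts the same "(1,2,3,5)" tuple literals:

import ast

def parse_bin_manual(s):
    # literal_eval only accepts Python literals (tuples, lists, numbers);
    # "(1,2,3,5)" parses fine, while arbitrary expressions raise ValueError.
    return list(ast.literal_eval(s))

# parse_bin_manual("(1,2,3,5)") -> [1, 2, 3, 5]
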
Code example #32
File: IcePartial.py Project: wangyuu/cDNA_primer
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
    
    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
            probqv = ProbFromFastq(input_fastq)


    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
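
Completion is signalled by touching a .DONE sentinel next to the pickle, so callers that submitted this job through qsub can poll the filesystem instead of the scheduler. touch() is imported from the project's utilities; a minimal stand-in with the same observable effect, offered only as an assumption about its behavior, would be:

import os

def touch(path):
    # Create the file if it does not exist, otherwise update its mtime,
    # mirroring the Unix `touch` command.
    with open(path, 'a'):
        os.utime(path, None)
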
Code example #33
File: IcePartial.py Project: wangyuu/cDNA_primer
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything locally!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False, \
                            query_converted=True, db_converted=True, query_made=False, \
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(min_match_len=300, output_dir=output_dir, sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref; it won't actually be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
                logging.info("Loading QVs from {i} + {f} took {s} secs".format(f=ccs_fofn, i=input_fasta,\
                    s=time.time()-start_t))
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
                print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(las_out_filename, time.time()-start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for fn in las_filenames:
        os.remove(fn)
    for fn in las_out_filenames:
        os.remove(fn)
Code example #34
File: Classifier.py Project: avrajit/cDNA_primer
    def __init__(self, reads_fn="test.fa", out_dir="output/",
                 out_reads_fn="testout.fa", primer_fn=None,
                 primer_report_fn=None, summary_fn=None,
                 cpus=1, change_read_id=True,
                 opts=ChimeraDetectionOptions(50, 10, 100, 50, 100),
                 out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False):
        self.reads_fn = realpath(reads_fn)
        self.out_dir = realpath(out_dir)
        self.cpus = cpus
        self.change_read_id = change_read_id
        self.chimera_detection_opts = opts
        self.ignore_polyA = ignore_polyA

        # The input primer file: primers.fa
        self.primer_fn = primer_fn if primer_fn is not None else \
            op.join(self.data_dir, PRIMERFN)
        # The output fasta file.
        self.out_all_reads_fn = realpath(out_reads_fn)

        # Intermediate output fasta file before chimera detection.
        # trimmed full-length reads before chimera detection
        self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")

        self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
        self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

        # The output primer file: primer_info.csv
        self.primer_report_fn = primer_report_fn \
            if primer_report_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
        # primer reports for nfl reads before chimera detection. Note that
        # chimera detection is not necessary for nfl reads.
        self._primer_report_nfl_fn = op.join(self.out_dir,
                                             "primer_report.nfl.csv")
        # primer reports for fl reads after chimera detection. Note that
        # chimera detection is required for fl reads.
        self._primer_report_fl_fn = op.join(self.out_dir,
                                            "primer_report.fl.csv")

        # The matrix file: PBMATRIX.txt
        self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

        # The output phmmer Dom file: hmmer.front_end.dom and hmmer.chimera.dom
        self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
        self.out_trimmed_reads_dom_fn = op.join(self.out_dir, CHIMERADOMFN)

        self.chunked_front_back_reads_fns = None
        self.chunked_front_back_dom_fns = None

        self.chunked_trimmed_reads_fns = None
        self.chunked_trimmed_reads_dom_fns = None

        # The summary file: *.classify_summary.txt
        self.summary = ClassifySummary()
        self.summary_fn = summary_fn if summary_fn is not None else \
                          ".".join(out_reads_fn.split('.')[:-1]) + \
                          "." + CLASSIFYSUMMARY

        self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
                          else op.join(self.out_dir, "nfl.fasta")
        self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
                           else op.join(self.out_dir, "flnc.fasta")
        self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
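
Assuming the enclosing class is named Classifier, consistent with the file name, a hypothetical construction using the defaults above could look like this; every path is illustrative only:

opts = ChimeraDetectionOptions(50, 10, 100, 50, 100)
classifier = Classifier(reads_fn="ccs.fasta",
                        out_dir="classify_out/",
                        out_reads_fn="isoseq_draft.fasta",
                        primer_fn="primers.fa",
                        cpus=4,
                        opts=opts,
                        ignore_polyA=False)
# Derived defaults: classifier.out_flnc_fn -> <out_dir>/flnc.fasta,
# classifier.out_nfl_fn -> <out_dir>/nfl.fasta
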
Code example #35
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          sa_file=None,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12):
    """Align consensus isoforms in ref_fasta and reads in input_fasta,
    and save mappings between isoforms and reads to out_pickle.
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=input_fasta) + \
          "{r} -bestn 5 ".format(r=ref_fasta) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 -out {o} ".format(o=m5_file)
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=sa_file)

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        logging.info("Loading probability from QV in {f}".format(f=ccs_fofn))
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = []
            partial_uc[h.cID].append(h.qID)
            seen.add(h.qID)

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
            else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
Code example #36
    def __init__(self, reads_fn="ccs.fasta", out_dir="classifyOut/",
                 out_reads_fn="isoseq_draft.fasta", primer_fn_forward=None, primer_fn_reverse=None,
                 primer_report_fn=None, summary_fn=None,
                 cpus=1, change_read_id=True,
                 opts=ChimeraDetectionOptions(50, 10, 100, 50, 150, False),
                 out_nfl_fn=None, out_flnc_fn=None,
                 ignore_polyA=False, keep_primer=False, reuse_dom=False):
        self.reads_fn = realpath(reads_fn)
        self.out_dir = realpath(out_dir)
        self.cpus = cpus
        self.change_read_id = change_read_id
        self.chimera_detection_opts = opts
        self.ignore_polyA = ignore_polyA
        self.keep_primer = keep_primer # if True, primers are not removed (useful for targeted)
        self.reuse_dom = reuse_dom

        # for now, the barcoded primer files must be given!
        assert primer_fn_forward is not None
        assert primer_fn_reverse is not None
        self.primer_fn_forward = primer_fn_forward
        self.primer_fn_reverse = primer_fn_reverse
        # The output fasta file.
        self.out_all_reads_fn = realpath(out_reads_fn)

        # Intermediate output fasta file before chimera detection.
        #     trimmed full-length reads: fl.trimmed.fasta
        # and
        #     trimmed non-full-length reads: nfl.trimmed.fasta
        self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")
        self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta")

        self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
        self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

        # The output primer file: primer_info.csv
        self.primer_report_fn = primer_report_fn \
            if primer_report_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
        # primer reports for nfl reads before chimera detection. Note that
        # chimera detection is not necessary for nfl reads.
        self._primer_report_nfl_fn = op.join(self.out_dir,
                                             "primer_report.nfl.csv")
        # primer reports for fl reads after chimera detection. Note that
        # chimera detection is required for fl reads.
        self._primer_report_fl_fn = op.join(self.out_dir,
                                            "primer_report.fl.csv")

        # The matrix file: PBMATRIX.txt
        self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

        # The output phmmer Dom file for trimming primers: hmmer.front_end.dom
        self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
        # The output phmmer Dom file for chimera detection:
        #     hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom
        self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN)
        self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN)

        self.chunked_front_back_reads_fns = None
        self.chunked_front_back_dom_fns = None

        #self.chunked_trimmed_reads_fns = None
        #self.chunked_trimmed_reads_dom_fns = None

        # The summary file: *.classify_summary.txt
        self.summary = ClassifySummary()
        self.summary_fn = summary_fn if summary_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + \
            "." + CLASSIFYSUMMARY

        self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
            else op.join(self.out_dir, "nfl.fasta")
        self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta")
        self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta")

        self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
            else op.join(self.out_dir, "flnc.fasta")
        self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
Code example #37
def build_uc_from_partial(
    input_fasta, ref_fasta, out_pickle, sa_file=None, ccs_fofn=None, done_filename=None, blasr_nproc=12
):
    """Align consensus isoforms in ref_fasta and reads in input_fasta,
    and save mappings between isoforms and reads to out_pickle.
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = (
        "blasr {i} ".format(i=input_fasta)
        + "{r} -bestn 5 ".format(r=ref_fasta)
        + "-nproc {n} -m 5 ".format(n=blasr_nproc)
        + "-maxScore -1000 -minPctIdentity 85 -out {o} ".format(o=m5_file)
    )
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=sa_file)

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(0.01, 0.07, 0.06)
    else:
        logging.info("Loading probability from QV in {f}".format(f=ccs_fofn))
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(
        output_filename=m5_file,
        is_FL=False,
        sID_starts_with_c=True,
        qver_get_func=probqv.get_smoothed,
        ece_penalty=1,
        ece_min_len=10,
        same_strand_only=False,
    )

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = []
            partial_uc[h.cID].append(h.qID)
            seen.add(h.qID)

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, "w") as f:
        dump({"partial_uc": partial_uc, "nohit": nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None else out_pickle + ".DONE"
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
Code example #38
File: IcePartial.py Project: ksahlin/cDNA_primer
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
    
    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
            probqv = ProbFromFastq(input_fastq)


    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
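
A typical invocation of this variant, with hypothetical file names, would be:

build_uc_from_partial(input_fasta="nfl.split_00.fasta",
                      ref_fasta="ref_consensus.fasta",
                      out_pickle="nfl.split_00.pickle",
                      ccs_fofn="ccs.fofn",
                      blasr_nproc=8)
# On success this writes nfl.split_00.pickle, removes the intermediate
# .blasr (m5) alignment file, and touches nfl.split_00.pickle.DONE.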