Example #1
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
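These examples all lean on a handful of helpers that the page never shows: real_upath, mkdir, execute and backticks. The stand-ins below are only a sketch of what they plausibly do, assuming real_upath returns a resolved path with spaces escaped for shell use and backticks follows the (output, exit_code, error_message) convention seen in the calls below; the upstream implementations may differ.

import os
import os.path as op
import subprocess

def real_upath(path):
    # Assumption: resolved absolute path with spaces escaped for the shell.
    return op.realpath(path).replace(' ', '\\ ')

def mkdir(d):
    # Create directory d if it does not already exist.
    if not op.exists(d):
        os.makedirs(d)

def execute(cmd, errmsg=None, errcls=RuntimeError):
    # Run a shell command; raise errcls with errmsg on a nonzero exit code.
    code = subprocess.call(cmd, shell=True)
    if code != 0:
        raise errcls(errmsg or "Command failed: {0}".format(cmd))

def backticks(cmd):
    # Return (output_lines, exit_code, error_message) for a shell command.
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    return out.decode().splitlines(), p.returncode, err.decode()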
Example #3
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
Example #4
    def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns, out_dom_fn,
                      primer_fn, pbmatrix_fn):
        """Run phmmers on chunked reads files in 'chunked_reads_fns' and
        generate chunked dom files as listed in 'chunked_dom_fns', finally
        concatenate dom files to 'out_dom_fn'."""
        logging.info("Start to launch phmmer on chunked reads.")
        jobs = []
        for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
            p = multiprocessing.Process(target=self._phmmer,
                                        args=(reads_fn, domFN, primer_fn,
                                              pbmatrix_fn))
            jobs.append((p, domFN))
            p.start()

        for p, domFN in jobs:
            p.join()
            cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                          real_upath(out_dom_fn))
            _output, errCode, errMsg = backticks(cmd)
            if errCode != 0:
                raise ClassifierException(
                    "Error concatenating dom files: {e}".format(e=str(errMsg)))

        self._cleanup(chunked_reads_fns)
        self._cleanup(chunked_dom_fns)
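The pattern above (fork one process per chunk, join in submission order, append each chunk's output to one file) generalizes beyond phmmer. A self-contained sketch with a hypothetical worker standing in for self._phmmer:

import multiprocessing

def _worker(in_fn, out_fn):
    # Hypothetical stand-in for the real per-chunk job: copy input to output.
    with open(in_fn) as src, open(out_fn, 'w') as dst:
        dst.write(src.read())

def run_chunks(in_fns, out_fns, final_fn):
    jobs = []
    for in_fn, out_fn in zip(in_fns, out_fns):
        p = multiprocessing.Process(target=_worker, args=(in_fn, out_fn))
        jobs.append((p, out_fn))
        p.start()
    with open(final_fn, 'w') as final:
        for p, out_fn in jobs:
            p.join()  # wait for this chunk, then append its output
            with open(out_fn) as h:
                final.write(h.read())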
Example #5
def blasr_for_quiver(query_fn,
                     ref_fasta,
                     out_fn,
                     bam=False,
                     run_cmd=True,
                     blasr_nproc=12):
    """
    query_fn  --- should be in.raw.fasta|bam
    ref_fasta --- reference fasta (ex: g_consensus.fasta) to align to
    out_fn    --- sam|bam output aligning query_fn to ref_fasta

    blasr query_fn ref_fasta -out out_fn -sam -clipping soft
    blasr query_fn ref_fasta -out out_fn -bam
    """
    cmd = "blasr {i} ".format(i=real_upath(query_fn)) + \
          "{r} ".format(r=real_upath(ref_fasta)) + \
          "--nproc {n} ".format(n=blasr_nproc) + \
          "--bestn 5 --nCandidates 10 " + \
          ("--sam --clipping soft " if not bam else "--bam ") + \
          "--out {o} ".format(o=real_upath(out_fn)) + \
          "1>/dev/null 2>/dev/null"
    if run_cmd:
        execute(cmd)
    else:
        logging.debug("CMD: " + cmd)
    return cmd
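Because the function returns the command string, run_cmd=False can be used to collect commands for a job script instead of running them inline; a hypothetical usage (file names are placeholders):

cmd = blasr_for_quiver("in.raw.fasta", "g_consensus.fasta",
                       out_fn="out.sam", bam=False, run_cmd=False)
# cmd now holds the full blasr invocation and can be written to a script.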
Example #7
    def map_isoforms_to_reference_transcripts(self):
        """Map isoforms to reference transcripts."""
        m5out = self.output_analysis_fn + ".blasr.out.m5"
        cmd = 'blasr %s %s --bestn 1 -m 5 --out %s' % \
              (real_upath(self.isoseq_output_fa),
               real_upath(self.reference_transcripts_fn),
               real_upath(m5out))
        execute(cmd)
        return [r for r in BLASRM5Reader(m5out)]
Example #8
    def _phmmer(self, reads_fn, domFN, primer_fn, pbmatrixFN):
        """Invoke phmmer once."""
        cmd = "phmmer --cpu 1 --domtblout {d} --noali --domE 1 ".\
              format(d=real_upath(domFN)) + \
              "--mxfile {m} ".format(m=real_upath(pbmatrixFN)) + \
              "--popen 0.07 --pextend 0.07 {r} {p} > /dev/null".\
              format(r=real_upath(reads_fn), p=real_upath(primer_fn))
        logging.debug("Calling phmmer: {cmd}".format(cmd=cmd))
        _output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
                "Error calling phmmer: {e}.".format(e=str(errMsg)))
Example #10
    def createPickles(self):
        """For each file in fasta_filenames, call 'ICE_PARTIAL_PY one' to
        build clusters and to save results to a pickle file. When all pickles
        are done, union all pickles.
        """
        self.add_log("Mapping non-full-length reads to consensus isoforms.")
        self.add_log("Creating pickles...", level=logging.INFO)

        for idx, fa in enumerate(self.fasta_filenames):
            # For each split non-full-length reads fasta file, build
            # partial_uc.pickle
            cmd = ICE_PARTIAL_PY + " " + \
                  "one {i} ".format(i=real_upath(fa)) + \
                  "{r} ".format(r=real_upath(self.ref_fasta)) + \
                  "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
                  "--blasr_nproc={n} ".format(n=self.sge_opts.blasr_nproc) + \
                  "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
            if self.ccs_fofn is not None:
                cmd += "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn))
            if self.tmp_dir is not None:
                cmd += "--tmp_dir={t}".format(t=self.tmp_dir)

            self.add_log("Writing command to script {fsh}".
                         format(fsh=self.script_filenames[idx]))
            with open(self.script_filenames[idx], 'w') as fsh:
                fsh.write(cmd + "\n")

            # determine elog & olog
            partial_log_fn = op.join(self.log_dir,
                                     'IcePartial.{idx}'.format(idx=idx))
            elog = partial_log_fn + ".elog"
            olog = partial_log_fn + ".olog"
            jid = "ice_partial_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id,
                name=op.basename(fa))

            qsub_cmd = "qsub " + \
                       "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
                       "-cwd -S /bin/bash -V " + \
                       "-e {elog} ".format(elog=real_upath(elog)) + \
                       "-o {olog} ".format(olog=real_upath(olog)) + \
                       "-N {jid} ".format(jid=jid) + \
                       "{sh}".format(sh=real_upath(self.script_filenames[idx]))

            self.add_log("Creating a pickle for {f}".format(f=fa))

            if self.sge_opts.use_sge is True:
                self.qsub_cmd_and_log(qsub_cmd)
            else:
                cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                                   elog=real_upath(elog))
                self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
Example #12
def build_sa(input_fasta, out_sa):
    """Generate suffix array of input_fasta"""
    if op.exists(input_fasta):
        cmd = "sawriter {o} {i} -blt 8 -welter ".\
            format(o=real_upath(out_sa), i=real_upath(input_fasta))
        dummy_out, code, dummy_msg = backticks(cmd)
        if code == 0:
            return True
        else:
            # Failed to generate the suffix array; log a warning and move on.
            logging.warning("Unable to create suffix array for {f}."
                            .format(f=input_fasta))
            return False
    else:
        raise IOError("Unable to find fasta file {f}.".format(f=input_fasta))
Example #14
    def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
        """Align input reads against themselves using BLASR."""
        if op.exists(outFN):
            logging.info("{0} already exists. No need to run BLASR.".format(outFN))
        else:
            cmd = "blasr {q} ".format(q=real_upath(queryFa)) + \
                  "{t} ".format(t=real_upath(targetFa)) + \
                  "-m 5 --maxLCPLength 15 " + \
                  "--nproc {cpu} ".format(cpu=sge_opts.blasr_nproc) + \
                  "--maxScore {score} ".format(score=ice_opts.maxScore) + \
                  "--bestn {n} --nCandidates {n} ".format(n=ice_opts.bestn) + \
                  "--out {o} ".format(o=real_upath(outFN)) + \
                  "1>/dev/null 2>/dev/null"
            logging.info("Calling {cmd}".format(cmd=cmd))
            execute(cmd)
Example #16
def generate_batch_cmds(csv_filename, dirname, cmd_filename, cpus):
    #, nfl_filename, tucked_filename, subread_xml, cpus):
    cmd_f = open(cmd_filename, 'w')
    for r in DictReader(open(csv_filename), delimiter=','):
        cid = r['cluster']
        d2 = os.path.join(dirname, cid)
        if not os.path.exists(d2):
            print("Directory {0} does not exist! Abort!".format(d2),
                  file=sys.stderr)
            sys.exit(-1)

        cmd_f.write("cd {0}\n".format(real_upath(d2)))

        fa_files, fq_files = preprocess_flnc_split_if_necessary(
            d2, int(r['size']), flnc_split=20000)

        ice2_aligner = 'blasr' if int(r['size']) <= 20000 else 'daligner'

        cmd_f.write(
            "run_IceInit2.py {fa} init.uc.pickle --aligner_choice=blasr --cpus={c}\n"
            .format(c=cpus, fa=fa_files[0]))
        cmd_f.write("run_IceIterative2.py {fas} {fqs} isoseq_flnc.fasta . ".format(fas=",".join(fa_files), fqs=",".join(fq_files)) + \
                    "--init_uc_pickle=init.uc.pickle --aligner_choice={aln} ".format(aln=ice2_aligner) + \
                    "--blasr_nproc {c} --gcon_nproc {c2}\n".format(c=cpus, c2=min(cpus, 4)))


#        cmd_f.write("run_IcePartial2.py all {nfl},{tucked} ".format(nfl=nfl_filename, tucked=tucked_filename) + \
#                    "output/final.consensus.fasta nfl.pickle " + \
#                    "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus))
#        cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \
#                    "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus))

    cmd_f.close()
Example #17
def sort_sam(in_sam, out_sam):
    """
    Sort input sam file and write to output sam file.
    """
    # Copy SAM headers
    copy_sam_header(in_sam=in_sam, out_sam=out_sam)

    # Call sort to sort gmap output sam file
    cmd_args = [
        'sort', '-k 3,3', '-k 4,4n',
        real_upath(in_sam), '| grep -v \'^@\' ', ' >> ',
        real_upath(out_sam)
    ]

    if os.stat(in_sam).st_size == 0:  # overwrite cmds if file is empty
        cmd_args = ['touch', out_sam]

    execute(' '.join(cmd_args))
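The shell pipeline orders body lines by reference name (column 3) and then position (column 4) while dropping '@' header lines. A pure-Python sketch of the same ordering, assuming tab-separated SAM columns:

def sort_sam_body(in_sam, out_sam):
    # Mirror "sort -k 3,3 -k 4,4n in.sam | grep -v '^@' >> out.sam".
    with open(in_sam) as h:
        body = [line for line in h if not line.startswith('@')]
    body.sort(key=lambda line: (line.split('\t')[2],
                                int(line.split('\t')[3])))
    with open(out_sam, 'a') as out:  # append: headers were copied first
        out.writelines(body)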
Example #18
    def arrow_cmds_for_bin(self, cids):
        """
        Return a list of quiver related cmds. Input format must be BAM.
        """
        first, last = cids[0], cids[-1]
        self.add_log("Creating arrow cmds for c{first} to c{last}".format(
            first=first, last=last))

        bin_ref_fa = self.ref_fa_of_arrowed_bin(first, last)
        bin_fq = self.fq_of_arrowed_bin(first, last)

        bin_unsorted_bam_file = self.bam_of_arrowed_bin(first,
                                                        last,
                                                        is_sorted=False)
        bin_bam_file = self.bam_of_arrowed_bin(first, last, is_sorted=True)
        bin_bam_prefix = self._arrowed_bin_prefix(first, last)

        cmds = []
        if not self.use_samtools_v_1_3_1:
            # SA2.*, SA3.0, SA3.1 and SA3.2 use v0.1.19
            cmds.append("samtools sort {f} {d}".format(
                f=real_upath(bin_unsorted_bam_file),
                d=real_upath(bin_bam_prefix)))
        else:
            # SA3.3 and up use v1.3.1
            cmds.append("samtools sort {f} -o {d}.bam".format(
                f=real_upath(bin_unsorted_bam_file),
                d=real_upath(bin_bam_prefix)))

        cmds.append("samtools index {f}".format(f=real_upath(bin_bam_file)))
        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("pbindex {f}".format(f=real_upath(bin_bam_file)))
        #        cmds.append("variantCaller --maskRadius 3 -x 1 --minAccuracy 0 --algorithm=best " +
        #                    "{f} ".format(f=real_upath(bin_bam_file)) +
        #                    "--verbose -j{n} ".format(n=self.sge_opts.arrow_nproc) +
        #                    "--referenceFilename={ref} ".format(ref=real_upath(bin_ref_fa)) +
        #                    "-o {fq}".format(fq=real_upath(bin_fq)))
        cmds.append("variantCaller --algorithm=best " +
                    "{f} ".format(f=real_upath(bin_bam_file)) +
                    "--verbose -j{n} ".format(n=self.sge_opts.arrow_nproc) +
                    "--referenceFilename={ref} ".format(
                        ref=real_upath(bin_ref_fa)) +
                    "-o {fq}".format(fq=real_upath(bin_fq)))
        return cmds
Example #19
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    cmd = " ".join([
        gcon_py,
        real_upath(testInFa),
        "{testDir}/g_consensus".format(testDir=real_upath(testDir)), "c1"
    ])
    write_cmd_to_script(cmd=cmd, script=testSh)

    assert op.exists(testSh)
    cmd = sge_opts.qsub_cmd(script=real_upath(testSh),
                            num_threads=1,
                            wait_before_exit=True)

    logging.debug("Submitting cmd: " + cmd)
    backticks(cmd)

    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True
Example #21
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exists" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = [
        'cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
        'ls *.iit *meta', 'sleep 3',
        'cd %s' % real_upath(cwd)
    ]
    execute(' && '.join(cmd_args))

    cmd_args = [
        'gmap',
        '-D {d}'.format(d=real_upath(gmap_db_dir)),
        '-d {name}'.format(name=gmap_db_name),
        '-t {nproc}'.format(nproc=gmap_nproc),
        '-n 0',
        '-z sense_force',
        '--cross-species',
        '-f samse',
        '--max-intronlength-ends 200000',  # for long genes
        real_upath(gmap_input_filename),
        '>',
        real_upath(unsorted_sam_filename),
        '2>{log}'.format(log=real_upath(log_filename))
    ]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
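The try-once-then-retry-after-a-pause wrapper around gmap generalizes to any occasionally flaky command; a small sketch built on the execute helper used throughout these examples:

import time

def execute_with_retry(cmd, retries=1, delay=3):
    # Re-run a shell command after a short pause, as the gmap call above does.
    for attempt in range(retries + 1):
        try:
            execute(cmd)
            return
        except Exception:
            if attempt == retries:
                raise
            logging.debug("Command failed, retrying in %d seconds.", delay)
            time.sleep(delay)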
Example #22
    def make_db(self):
        """Make dazz database for input file.
        1. fasta2DB
        2. DBsplit
        3. get & store number of blocks
        *.dazz.fasta.db will be created.
        """
        log.debug("Making DAZZ database for %s.", self.dazz_filename)
        if not op.exists(self.dazz_filename):
            raise RuntimeError(
                "%s hasn't been converted to daligner-compatible format." %
                self.input_filename)
        if op.exists(self.db_filename):
            cmd = "DBrm %s" % real_upath(self.dazz_filename)
            execute(cmd=cmd)

        cmd = "fasta2DB %s %s " % (real_upath(
            self.dazz_filename), real_upath(self.dazz_filename))
        execute(cmd=cmd)

        cmd = "DBsplit -s200 %s" % real_upath(self.dazz_filename)
        execute(cmd)
Example #24
def generate_batch_cmds_for_polishing(chunk_prefix, nfl_filename, subread_xml,
                                      cpus, cmd_filename, walltime, queue):

    subread_xml = real_upath(subread_xml)
    nfl_filename = real_upath(nfl_filename)

    fastas = glob.glob(chunk_prefix + '*.consensus.fasta')
    # verify that the matching pickles exist as well
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        print "looking for", pickle
        assert os.path.exists(pickle)
        dirname = fasta[:-len('.consensus.fasta')]
        if os.path.exists(dirname):
            print >> sys.stderr, "Directory {0} already exist! Abort!".format(
                dirname)
            sys.exit(-1)

    cmd_f = open(cmd_filename, 'w')

    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        dirname = fasta[:-len('.consensus.fasta')]
        full_fasta = real_upath(fasta)
        full_pickle = real_upath(pickle)
        os.makedirs(dirname)
        os.chdir(dirname)
        os.symlink(full_fasta, os.path.basename(full_fasta))
        os.makedirs('output')
        os.chdir('output')
        os.symlink(full_pickle, 'final.pickle')
        os.symlink(full_fasta, 'final.consensus.fasta')
        os.chdir('../../')
        f = open(os.path.join(dirname, dirname + '.sh'), 'w')
        f.write("#!/bin/bash\n")
        f.write(
            "source /projects/banchereau-lab/ISO-seq/annotation_processing/pitchfork_ToFU2_dev/setup-env.sh\n"
        )
        f.write("module load gcc/4.9.2\n")
        f.write("module load graphviz\n")
        f.write(
            "PATH=$PATH:/projects/banchereau-lab/ISO-seq/annotation_processing/cDNA_Cupcake/sequence\n"
        )
        f.write("cd $PBS_O_WORKDIR\n")
        f.write("run_IcePartial2.py all {nfl} {p}.consensus.fasta {p}.nfl.pickle "\
                "--root_dir {d} --aligner_choice=daligner --cpus={c}\n".format(\
                p=dirname, nfl=nfl_filename, d=real_upath(dirname), c=cpus))
        f.write("run_IceArrow2.py all {d} --subread_xml {s} --blasr_nproc {c} --arrow_nproc {c} --hq_min_full_length_reads=1\n".format(\
                d=real_upath(dirname), s=subread_xml, c=cpus))
        f.close()
        cmd_f.write(
            "qsub -q {q} -l walltime={w} -l nodes=1:ppn={c} {sh}\n".format(
                sh=real_upath(f.name), c=cpus, w=walltime, q=queue))
Example #25
    def submit_jobs_local_or_remote(self, files_to_run):
        """
        Run jobs either locally or through SGE.
        Return a list of [(sge_job_id, filename)], which
        is also written to log/submitted_arrow_jobs.txt
        """
        flag_run_locally = (self.sge_opts.use_sge is not True) or \
                           (self.sge_opts.max_sge_jobs == 0)
        if flag_run_locally:
            self.add_log("Files to submit locally: {0}\n".format(
                ",".join(files_to_run)))
        else:
            self.add_log("Files to submit through SGE: {0}\n".format(
                ",".join(files_to_run)))

        submit_f = open(self.arrow_submission_run_file, 'w')

        submitted = []
        for file in files_to_run:
            elog = op.join(self.arrowed_log_dir, op.basename(file) + ".elog")
            olog = op.join(self.arrowed_log_dir, op.basename(file) + ".olog")

            if flag_run_locally:
                cmd = "bash {f}".format(f=real_upath(file))
                self.run_cmd_and_log(cmd,
                                     olog=olog,
                                     elog=elog,
                                     description="Failed to run Arrow")
                submitted.append(("local", file))
                submit_f.write("{0}\t{1}\n".format("local", file))
            else:
                jid = "ice_arrow_{unique_id}_{name}".format(
                    unique_id=self.sge_opts.unique_id, name=op.basename(file))
                qsub_cmd = self.sge_opts.qsub_cmd(
                    script=file,
                    num_threads=self.sge_opts.arrow_nproc,
                    wait_before_exit=False,
                    depend_on_jobs=None,
                    elog=elog,
                    olog=olog,
                    is_script=True,
                    jobid=jid)
                job_id = self.qsub_cmd_and_log(qsub_cmd)
                submitted.append((job_id, file))
                submit_f.write("{0}\t{1}\n".format(job_id, file))
        submit_f.close()
        return submitted
Example #26
    def init_cluster_by_clique(self):
        """
        Only called once and in the very beginning, when (probably a subset)
        of sequences are given to generate the initial cluster.

        readsFa --- initial fasta filename, probably called *_split00.fasta
        qver_get_func --- function that returns QVs on reads
        qvmean_get_func --- function that returns the mean QV on reads
        bestn --- parameter in BLASR, higher helps in finding perfect
            cliques but bigger output
        nproc, maxScore --- parameter in BLASR, set maxScore appropriate
            to input transcript length
        ece_penalty, ece_min_len --- parameter in isoform hit calling

        Self-blasr input then iteratively find all mutually exclusive
            cliques (in decreasing size)
        Returns dict of cluster_index --> list of seqids
        which is the 'uc' dict that can be used by IceIterative
        """
        alignGraph = None

        if self.ice_opts.aligner_choice == 'blasr':
            outFN = self.readsFa + '.self.blasr'
            self._align_withBLASR(queryFa=self.readsFa,
                                  targetFa=self.readsFa,
                                  outFN=outFN)
            alignGraph = self._makeGraphFromM5(m5FN=outFN)
        elif self.ice_opts.aligner_choice == 'daligner':
            try:
                runner = self._align_withDALIGNER(
                    queryFa=self.readsFa,
                    output_dir=op.dirname(real_upath(self.readsFa)))
                alignGraph = self._makeGraphFromLA4Ice(runner=runner)
                runner.clean_run()
            except RuntimeError:  # daligner probably crashed, fall back to blasr
                outFN = self.readsFa + '.self.blasr'
                self._align_withBLASR(queryFa=self.readsFa,
                                      targetFa=self.readsFa,
                                      outFN=outFN)
                alignGraph = self._makeGraphFromM5(m5FN=outFN)
        else:
            raise Exception("Unrecognized aligner_choice {0}!".format(
                self.ice_opts.aligner_choice))

        uc = IceInit2._findCliques(alignGraph=alignGraph, readsFa=self.readsFa)
        return uc
Example #27
def generate_batch_cmds(csv_filename, dirname, cmd_filename, cpus):
    #, nfl_filename, tucked_filename, subread_xml, cpus):
    cmd_f = open(cmd_filename, 'w')
    for r in DictReader(open(csv_filename), delimiter=','):
        cid = r['cluster']
        d2 = os.path.join(dirname, cid)
        if not os.path.exists(d2):
            print >> sys.stderr, "Directory {0} does not exist! Abort!".format(
                d2)
            sys.exit(-1)

        cmd_f.write("#!/bin/bash\n")
        cmd_f.write(
            "source /projects/banchereau-lab/ISO-seq/annotation_processing/pitchfork_ToFU2_dev/setup-env.sh\n"
        )
        cmd_f.write("module load gcc/4.9.2\n")
        cmd_f.write(
            "PATH=$PATH:/projects/banchereau-lab/ISO-seq/annotation_processing/cDNA_Cupcake/sequence\n"
        )
        cmd_f.write("cd $PBS_O_WORKDIR\n")

        cmd_f.write("cd {0}\n".format(real_upath(d2)))

        fa_files, fq_files = preprocess_flnc_split_if_necessary(
            d2, int(r['size']), flnc_split=20000)

        ice2_aligner = 'blasr' if int(r['size']) <= 20000 else 'daligner'

        cmd_f.write(
            "run_IceInit2.py {fa} init.uc.pickle --aligner_choice=blasr --cpus={c}\n"
            .format(c=cpus, fa=fa_files[0]))
        cmd_f.write("run_IceIterative2.py {fas} {fqs} isoseq_flnc.fasta . ".format(fas=",".join(fa_files), fqs=",".join(fq_files)) + \
                    "--init_uc_pickle=init.uc.pickle --aligner_choice={aln} ".format(aln=ice2_aligner) + \
                    "--blasr_nproc {c} --gcon_nproc {c2}\n".format(c=cpus, c2=min(cpus, 4)))


#        cmd_f.write("run_IcePartial2.py all {nfl},{tucked} ".format(nfl=nfl_filename, tucked=tucked_filename) + \
#                    "output/final.consensus.fasta nfl.pickle " + \
#                    "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus))
#        cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \
#                    "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus))

    cmd_f.close()
Example #28
    def submit_todo_quiver_jobs(self, todo, submitted, sge_opts):
        """
        todo --- a list of sh scripts to run
        submitted --- a list of sh scripts which have been submitted
        sge_opts --- SGE options, including
                     use_sge, whether or not to use sge
                     max_sge_jobs, maximum number sge jobs to submit
                     quiver_nproc, number of nproc per job
                     unique_id, unique id to name qsub jobs
        """
        self.add_log("Submitting todo quiver jobs.")
        if sge_opts.use_sge is not True or \
           sge_opts.max_sge_jobs == 0:  # don't use SGE
            for job in todo:
                elog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".elog")
                olog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".olog")
                cmd = "bash " + real_upath(job) + " 1>{olog} 2>{elog}".\
                      format(olog=real_upath(olog), elog=real_upath(elog))
                self.run_cmd_and_log(cmd,
                                     olog=olog,
                                     elog=elog,
                                     description="Failed to run Quiver")
                submitted.append(("local", job))
            todo = []
        else:
            while len(todo) > 0:
                n = min(sge_opts.max_sge_jobs, len(todo))
                for job in todo[:n]:
                    # ex: Your job 8613116 ("c20to70.sh") has been submitted
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    jid = "ice_quiver_{unique_id}_{name}".format(
                        unique_id=self.sge_opts.unique_id,
                        name=op.basename(job))
                    qsub_cmd = "qsub " + \
                               "-pe smp {n} ".\
                               format(n=sge_opts.quiver_nproc) + \
                               "-cwd -S /bin/bash -V " + \
                               "-e {elog} ".format(elog=real_upath(elog)) +\
                               "-o {olog} ".format(olog=real_upath(olog)) +\
                               "-N {jid} ".format(jid=jid) + \
                               "{job}".format(job=real_upath(job))
                    job_id = self.qsub_cmd_and_log(qsub_cmd)

                    submitted.append((job_id, job))
                    todo.remove(job)
Example #30
def generate_batch_cmds_for_polishing(chunk_prefix, nfl_filename, subread_xml,
                                      cpus, cmd_filename):

    subread_xml = real_upath(subread_xml)
    nfl_filename = real_upath(nfl_filename)

    fastas = glob.glob(chunk_prefix + '*.consensus.fasta')
    # verify that the matching pickles exist as well
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        print("looking for", pickle)
        assert os.path.exists(pickle)
        dirname = fasta[:-len('.consensus.fasta')]
        if os.path.exists(dirname):
            print("Directory {0} already exist! Abort!".format(dirname),
                  file=sys.stderr)
            sys.exit(-1)

    cmd_f = open(cmd_filename, 'w')

    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        dirname = fasta[:-len('.consensus.fasta')]
        full_fasta = real_upath(fasta)
        full_pickle = real_upath(pickle)
        os.makedirs(dirname)
        os.chdir(dirname)
        os.symlink(full_fasta, os.path.basename(full_fasta))
        os.makedirs('output')
        os.chdir('output')
        os.symlink(full_pickle, 'final.pickle')
        os.symlink(full_fasta, 'final.consensus.fasta')
        os.chdir('../../')
        f = open(os.path.join(dirname, dirname + '.sh'), 'w')
        f.write("run_IcePartial2.py all {nfl} {p}.consensus.fasta {p}.nfl.pickle "\
                "--root_dir {d} --aligner_choice=daligner --cpus={c}\n".format(\
                p=dirname, nfl=nfl_filename, d=real_upath(dirname), c=cpus))
        f.write("run_IceArrow2.py all {d} --subread_xml {s} --blasr_nproc {c} --arrow_nproc {c} --hq_min_full_length_reads=2\n".format(\
                d=real_upath(dirname), s=subread_xml, c=cpus))
        f.close()
        cmd_f.write("qsub -cwd -S /bin/bash -pe smp 12 -V {sh}\n".format(
            sh=real_upath(f.name)))
Example #32
    from argparse import ArgumentParser

    parser = ArgumentParser(
        "Generate batch commands for running IceInit2->IceIterative2 for each preCluster output bin"
    )
    parser.add_argument(
        "precluster_csv",
        help="Cluster CSV file (ex: preCluster.cluster_info.csv)")
    parser.add_argument("precluster_dir",
                        help="preCluster out directory (ex: preCluster_out/)")
    #parser.add_argument("nfl_filename", help="nFL filename (ex: isoseq_nfl.fasta)")
    #parser.add_argument("tucked_filename", help="tucked filename (ex: preCluster_out.tucked.fasta)")
    #parser.add_argument("subread_xml", help="Subread XML")
    parser.add_argument("--cpus",
                        default=20,
                        type=int,
                        help="Number of CPUs (default: 20)")
    parser.add_argument("--cmd_filename",
                        default='cmds',
                        help="Output command filename (default: cmds)")
    args = parser.parse_args()

    generate_batch_cmds(
        args.precluster_csv,
        real_upath(args.precluster_dir),
        args.cmd_filename,
        #real_upath(args.nfl_filename),
        #real_upath(args.tucked_filename),
        #real_upath(args.subread_xml),
        args.cpus)
Example #33
    def quiver_cmds_for_bin(self, cids, quiver_nproc=2, bam=False):
        """
        Return a list of quiver related cmds. Input format can be FASTA or BAM.
        If inputs are in FASTA format, call samtoh5, loadPulses, comph5tools.py,
        samtools, loadChemistry, quiver...
        If inputs are in BAM format, call quiver directly.
        """
        first, last = cids[0], cids[-1]
        self.add_log("Creating quiver cmds for c{first} to c{last}".
                     format(first=first, last=last))

        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        bin_unsorted_bam_file = self.bam_of_quivered_bin(first, last, is_sorted=False)
        bin_bam_file = self.bam_of_quivered_bin(first, last, is_sorted=True)
        bin_bam_prefix = self._quivered_bin_prefix(first, last)

        quiver_input = bin_cmph5 if not bam else bin_bam_file

        cmds = []
        if not bam:
            raise IOError("conversion to cmp.h5 no longer supported")
            cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
                sam=real_upath(bin_sam_file),
                ref=real_upath(bin_ref_fa),
                cmph5=real_upath(bin_cmph5)))
            cmds.append("gzip {sam}".format(sam=real_upath(bin_sam_file)))
            metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
                       "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
            cmds.append("loadPulses {bas_fofn} ".
                        format(bas_fofn=real_upath(self.bas_fofn)) +
                        "{cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                        "-byread -metrics " + ",".join(metrics))
            cmds.append("cmph5tools.py sort {cmph5}".
                        format(cmph5=real_upath(bin_cmph5)))
            cmds.append("loadChemistry.py {bas_fofn} {cmph5}".
                        format(bas_fofn=real_upath(self.bas_fofn),
                               cmph5=real_upath(bin_cmph5)))
        else:
            cmds.append("samtools sort {f} {d}".format(
                f=real_upath(bin_unsorted_bam_file),
                d=real_upath(bin_bam_prefix)))
            cmds.append("samtools index {f}".format(f=real_upath(bin_bam_file)))

        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("pbindex {f}".format(f=real_upath(bin_bam_file)))
        cmds.append("variantCaller --algorithm=best " +
                    "{f} ".format(f=real_upath(quiver_input)) +
                    "--verbose -j{n} ".format(n=quiver_nproc) +
                    "--referenceFilename={ref} ".format(ref=real_upath(bin_ref_fa)) +
                    "-o {fq}".format(fq=real_upath(bin_fq)))
        return cmds
Example #34
def concat_sam(samfiles, outsam_filename):
    """
    Header looks like:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fasta       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    NOTE: check for M5 conflicts; manipulate them if it conflicts
    """
    f_sq = open(outsam_filename + '.sq', 'w')
    f_bd = open(outsam_filename + '.bd', 'w')

    rg_line = None
    pg_line = None

    md5_seen = set()

    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    h = open(samfiles[0])
    line = h.readline()
    assert line.startswith('@HD')
    f_sq.write(line)
    line = h.readline()
    assert line.startswith('@SQ')
    line = h.readline()
    assert line.startswith('@RG')
    rg_line = line  # write at the end
    line = h.readline()
    assert line.startswith('@PG')
    pg_line = line  # write at the end
    h.close()

    for f in samfiles:
        with open(f) as h:
            assert h.readline().startswith('@HD')
            line = h.readline()
            assert line.startswith('@SQ')
            # ------- check for MD5 conflicts ----------- #
            m5 = line.strip().split()[-1]
            assert m5.startswith("M5:")
            if m5 not in md5_seen:
                f_sq.write(line)
                md5_seen.add(m5)
            else:
                s = list(m5[3:])
                while True:
                    # create a random m5 string.
                    random.shuffle(s)
                    s = "".join(s)
                    if s not in md5_seen:
                        break
                line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                logging.debug("MD5 conflict: change to {0}".format(s))
                md5_seen.add(s)
                f_sq.write(line)
            # ----- end MD5 checking and writing --------- #
            assert h.readline().startswith('@RG')
            assert h.readline().startswith('@PG')
            for line in h:
                f_bd.write(line)

    f_bd.close()
    f_sq.write(rg_line)
    f_sq.write(pg_line)
    f_sq.close()

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(f_sq.name)
    os.remove(f_bd.name)
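Shuffling the digest characters until the string is unused works but is nondeterministic (note also that the code stores 'M5:'-prefixed values on first sight and bare shuffled strings afterwards). If reproducible output matters, one deterministic alternative, not what this code does, is to re-hash the conflicting value until it is unique:

import hashlib

def unique_m5(m5_value, seen):
    # Deterministically derive a fresh 32-character hex tag from a
    # conflicting one by repeated MD5 hashing.
    s = m5_value
    while s in seen:
        s = hashlib.md5(s.encode()).hexdigest()
    seen.add(s)
    return s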
Example #35
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename,
                                ice_opts,
                                probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    # partial_uc maps each isoform (cluster) id to the list of reads
    # assigned to that isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    # All query read ids from the input fasta, whether they hit or not.
    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
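The pickle written above is a plain dict, so downstream code can reload it directly. A minimal sketch, assuming an illustrative file name:

from pickle import load

# Reload the mapping written by build_uc_from_partial_blasr.
# ('nfl.partial_uc.pickle' is an illustrative name, not from the source.)
with open('nfl.partial_uc.pickle', 'rb') as f:
    d = load(f)
partial_uc = d['partial_uc']   # {isoform_id: [read_id, ...]}
nohit = d['nohit']             # set of read ids with no isoform hit
for cid, reads in partial_uc.items():
    print('cluster {0}: {1} nFL reads'.format(cid, len(reads)))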
Exemple #36
0
def concat_sam(samfiles, outsam_filename):
    """
    Header looks like:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fasta       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    NOTE: check for M5 conflicts; manipulate them if it conflicts
    """
    f_sq = open(outsam_filename + '.sq', 'w')
    f_bd = open(outsam_filename + '.bd', 'w')

    rg_line = None
    pg_line = None

    md5_seen = set()

    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    h = open(samfiles[0])
    line = h.readline()
    assert line.startswith('@HD')
    f_sq.write(line)
    line = h.readline()
    assert line.startswith('@SQ')
    line = h.readline()
    assert line.startswith('@RG')
    rg_line = line  # write at the end
    line = h.readline()
    assert line.startswith('@PG')
    pg_line = line  # write at the end
    h.close()

    for f in samfiles:
        with open(f) as h:
            assert h.readline().startswith('@HD')
            line = h.readline()
            assert line.startswith('@SQ')
            # ------- check for MD5 conflicts ----------- #
            m5 = line.strip().split()[-1]
            assert m5.startswith("M5:")
            digest = m5[3:]  # track bare digests so lookups stay consistent
            if digest not in md5_seen:
                f_sq.write(line)
                md5_seen.add(digest)
            else:
                chars = list(digest)
                while True:
                    # create a random, unused m5 string; keep `chars` a list
                    # so random.shuffle() works on every iteration (shuffling
                    # a str would raise TypeError on the second pass).
                    random.shuffle(chars)
                    candidate = "".join(chars)
                    if candidate not in md5_seen:
                        break
                line = line[:line.find('M5:')] + 'M5:' + candidate + '\n'
                logging.debug("MD5 conflict: changed to {0}".format(candidate))
                md5_seen.add(candidate)
                f_sq.write(line)
            # ----- end MD5 checking and writing --------- #
            assert h.readline().startswith('@RG')
            assert h.readline().startswith('@PG')
            for line in h:
                f_bd.write(line)

    f_bd.close()
    f_sq.write(rg_line)
    f_sq.write(pg_line)
    f_sq.close()

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(f_sq.name)
    os.remove(f_bd.name)
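The digit-shuffling workaround above is easy to exercise in isolation. A minimal standalone sketch (make_unique_m5 is a hypothetical helper name, not from the source):

import random

def make_unique_m5(digest, seen):
    """Shuffle the characters of a colliding M5 digest until the result is
    not in `seen`, then record and return it."""
    chars = list(digest)
    while True:
        random.shuffle(chars)
        candidate = "".join(chars)
        if candidate not in seen:
            seen.add(candidate)
            return candidate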
Exemple #37
0
    def createPickles(self):
        """For each file in fasta_filenames, call 'ICE_PARTIAL_PY one' to
        build clusters and to save results to a pickle file. When all pickles
        are done, union all pickles.
        """
        self.add_log("Mapping non-full-length reads to consensus isoforms.")
        self.add_log("Creating pickles...", level=logging.INFO)

        for idx, fa in enumerate(self.fasta_filenames):
            # for each split non-full-length reads fasta file, build
            # partial_uc.pickle

            # ex:
            #      python run_IcePartial2.py one isoseq_nfl.fasta isoseq_nfl.fastq \
            #  output/final.consensus.fasta isoseq_nfl.fasta.pickle --aligner_choice=blasr  --cpus=12
            if self.fastq_filenames is not None:
                fq = self.fastq_filenames[idx]
            else:
                fq = None

            cmd = ICE_PARTIAL_PY + " "
            cmd += "one {fa} ".format(fa=real_upath(fa))
            if fq is not None:
                cmd += "--fq {fq} ".format(fq=real_upath(fq))
            cmd += "{r} ".format(r=real_upath(self.ref_fasta)) + \
                  "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
                  "--aligner_choice={c} ".format(c=self.ice_opts.aligner_choice) + \
                  "--cpus={n} ".format(n=self.cpus) + \
                  "--max_missed_start={0} ".format(self.ice_opts.max_missed_start) + \
                  "--max_missed_end={0} ".format(self.ice_opts.max_missed_end) + \
                  "--ece_penalty={0} ".format(self.ice_opts.ece_penalty) + \
                  "--ece_min_len={0} ".format(self.ice_opts.ece_min_len) + \
                  "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
            if self.tmp_dir is not None:
                cmd += "--tmp_dir={t}".format(t=self.tmp_dir)

            self.add_log("Writing command to script {fsh}".format(
                fsh=self.script_filenames[idx]))
            with open(self.script_filenames[idx], 'w') as fsh:
                fsh.write(cmd + "\n")

            # determine elog & olog
            partial_log_fn = op.join(self.log_dir,
                                     'IcePartial.{idx}'.format(idx=idx))
            elog = partial_log_fn + ".elog"
            olog = partial_log_fn + ".olog"
            jid = "ice_partial_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id, name=op.basename(fa))

            self.add_log("Creating a pickle for {f}".format(f=fa))

            if self.sge_opts.use_sge is True:
                qsub_cmd = self.sge_opts.qsub_cmd(
                    script=real_upath(self.script_filenames[idx]),
                    num_threads=self.cpus,
                    wait_before_exit=False,
                    depend_on_jobs=None,
                    elog=real_upath(elog),
                    olog=real_upath(olog),
                    is_script=True,
                    jobid=jid)
                #          qsub_cmd = "qsub " + \
                #                     "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
                #                     "-cwd -S /bin/bash -V " + \
                #                     "-e {elog} ".format(elog=real_upath(elog)) + \
                #                     "-o {olog} ".format(olog=real_upath(olog)) + \
                #                     "-N {jid} ".format(jid=jid) + \
                #                     "{sh}".format(sh=real_upath(self.script_filenames[idx]))
                self.qsub_cmd_and_log(qsub_cmd)
            else:
                cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                                   elog=real_upath(elog))
                self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
Exemple #38
0
    def createPickles(self):
        """For each file in fasta_filenames, call 'ICE_PARTIAL_PY one' to
        build clusters and to save results to a pickle file. When all pickles
        are done, union all pickles.
        """
        self.add_log("Mapping non-full-length reads to consensus isoforms.")
        self.add_log("Creating pickles...", level=logging.INFO)

        for idx, fa in enumerate(self.fasta_filenames):
            # for each split non-full-length reads fasta file, build
            # partial_uc.pickle

            # ex:
            #      python run_IcePartial2.py one isoseq_nfl.fasta isoseq_nfl.fastq \
            #  output/final.consensus.fasta isoseq_nfl.fasta.pickle --aligner_choice=blasr  --cpus=12
            if self.fastq_filenames is not None:
                fq = self.fastq_filenames[idx]
            else:
                fq = None

            cmd = ICE_PARTIAL_PY + " "
            cmd += "one {fa} ".format(fa=real_upath(fa))
            if fq is not None:
                cmd += "--fq {fq} ".format(fq=real_upath(fq))
            cmd += "{r} ".format(r=real_upath(self.ref_fasta)) + \
                  "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
                  "--aligner_choice={c} ".format(c=self.ice_opts.aligner_choice) + \
                  "--cpus={n} ".format(n=self.cpus) + \
                  "--max_missed_start={0} ".format(self.ice_opts.max_missed_start) + \
                  "--max_missed_end={0} ".format(self.ice_opts.max_missed_end) + \
                  "--ece_penalty={0} ".format(self.ice_opts.ece_penalty) + \
                  "--ece_min_len={0} ".format(self.ice_opts.ece_min_len) + \
                  "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
            if self.tmp_dir is not None:
                cmd += "--tmp_dir={t}".format(t=self.tmp_dir)

            self.add_log("Writing command to script {fsh}".
                         format(fsh=self.script_filenames[idx]))
            with open(self.script_filenames[idx], 'w') as fsh:
                fsh.write(cmd + "\n")

            # determine elog & olog
            partial_log_fn = op.join(self.log_dir,
                                     'IcePartial.{idx}'.format(idx=idx))
            elog = partial_log_fn + ".elog"
            olog = partial_log_fn + ".olog"
            jid = "ice_partial_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id,
                name=op.basename(fa))

            self.add_log("Creating a pickle for {f}".format(f=fa))

            if self.sge_opts.use_sge is True:
                qsub_cmd = self.sge_opts.qsub_cmd(script=real_upath(self.script_filenames[idx]),
                                                  num_threads=self.cpus,
                                                  wait_before_exit=False,
                                                  depend_on_jobs=None,
                                                  elog=real_upath(elog),
                                                  olog=real_upath(olog),
                                                  is_script=True,
                                                  jobid=jid)
                #          qsub_cmd = "qsub " + \
                #                     "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
                #                     "-cwd -S /bin/bash -V " + \
                #                     "-e {elog} ".format(elog=real_upath(elog)) + \
                #                     "-o {olog} ".format(olog=real_upath(olog)) + \
                #                     "-N {jid} ".format(jid=jid) + \
                #                     "{sh}".format(sh=real_upath(self.script_filenames[idx]))
                self.qsub_cmd_and_log(qsub_cmd)
            else:
                cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                                   elog=real_upath(elog))
                self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
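Without SGE, the branch above simply appends shell redirections and runs the generated command synchronously. A hedged sketch of that fallback, reusing the sample file names from the docstring (paths and option values are illustrative):

import subprocess

cmd = ("python run_IcePartial2.py one isoseq_nfl.fasta "
       "output/final.consensus.fasta isoseq_nfl.fasta.pickle "
       "--aligner_choice=blasr --cpus=12")
with open("IcePartial.0.olog", "w") as olog, open("IcePartial.0.elog", "w") as elog:
    subprocess.check_call(cmd, shell=True, stdout=olog, stderr=elog)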
Exemple #39
0
#                    "output/final.consensus.fasta nfl.pickle " + \
#                    "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus))
#        cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \
#                    "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus))

    cmd_f.close()


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Generate batch commands for running IceInit2->IceIterative2 for each preCluster output bin")
    parser.add_argument("precluster_csv", help="Cluster CSV file (ex: preCluster.cluster_info.csv)")
    parser.add_argument("precluster_dir", help="preCluster out directory (ex: preCluster_out/)")
    #parser.add_argument("nfl_filename", help="nFL filename (ex: isoseq_nfl.fasta)")
    #parser.add_argument("tucked_filename", help="tucked filename (ex: preCluster_out.tucked.fasta)")
    #parser.add_argument("subread_xml", help="Subread XML")
    parser.add_argument("--cpus", default=20, type=int, help="Number of CPUs (default: 20)")
    parser.add_argument("--cmd_filename", default='cmds', help="Output command filename (default: cmds)")
    args = parser.parse_args()

    generate_batch_cmds(args.precluster_csv, real_upath(args.precluster_dir),
                        args.cmd_filename,
                        #real_upath(args.nfl_filename),
                        #real_upath(args.tucked_filename),
                        #real_upath(args.subread_xml),
                        args.cpus)
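The same entry point can be driven programmatically. A minimal sketch mirroring the __main__ block above, reusing the sample paths from the help strings:

generate_batch_cmds("preCluster.cluster_info.csv",
                    real_upath("preCluster_out/"),
                    "cmds",  # --cmd_filename default
                    20)      # --cpus default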
Exemple #40
0
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # Maps each isoform (cluster) id to the reads assigned to it
    # (collected as sets here, converted to lists below).
    partial_uc = {}
    seen = set()  # read ids assigned to at least one cluster
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    # All query read ids from the input fasta, whether they hit or not.
    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
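createPickles elsewhere in this file notes that the per-chunk pickles are unioned once all are done. A minimal sketch of such a merge (union_partial_uc_pickles is a hypothetical helper, not the source's combine step):

from pickle import load

def union_partial_uc_pickles(pickle_filenames):
    """Merge several per-chunk partial_uc pickles into one mapping."""
    combined, nohit = {}, set()
    for fn in pickle_filenames:
        with open(fn, 'rb') as f:
            d = load(f)
        for cid, read_ids in d['partial_uc'].items():
            combined.setdefault(cid, []).extend(read_ids)
        nohit.update(d['nohit'])
    # defensive: a read assigned to a cluster in any chunk is not a no-hit
    assigned = set(r for reads in combined.values() for r in reads)
    return {'partial_uc': combined, 'nohit': nohit - assigned}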
Exemple #41
0
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12,
                          tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # Maps each isoform (cluster) id to the reads assigned to it
    # (collected as sets here, converted to lists below).
    partial_uc = {}
    seen = set()  # read ids assigned to at least one cluster
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    # All query read ids from the input fasta, whether they hit or not.
    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Exemple #42
0
    def quiver_cmds_for_bin(self, cids, quiver_nproc=2, bam=False):
        """
        Return a list of quiver related cmds. Input format can be FASTA or BAM.
        If inputs are in FASTA format, call samtoh5, loadPulses, comph5tools.py,
        samtools, loadChemistry, quiver...
        If inputs are in BAM format, call quiver directly.
        """
        first, last = cids[0], cids[-1]
        self.add_log("Creating quiver cmds for c{first} to c{last}".format(
            first=first, last=last))

        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        bin_unsorted_bam_file = self.bam_of_quivered_bin(first,
                                                         last,
                                                         is_sorted=False)
        bin_bam_file = self.bam_of_quivered_bin(first, last, is_sorted=True)
        bin_bam_prefix = self._quivered_bin_prefix(first, last)

        quiver_input = bin_cmph5 if not bam else bin_bam_file

        cmds = []
        if not bam:
            raise IOError("conversion to cmp.h5 no longer supported")
            cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
                sam=real_upath(bin_sam_file),
                ref=real_upath(bin_ref_fa),
                cmph5=real_upath(bin_cmph5)))
            cmds.append("gzip {sam}".format(sam=real_upath(bin_sam_file)))
            metrics = [
                "QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
                "DeletionTag", "SubstitutionTag", "SubstitutionQV"
            ]
            cmds.append("loadPulses {bas_fofn} ".format(
                bas_fofn=real_upath(self.bas_fofn)) +
                        "{cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                        "-byread -metrics " + ",".join(metrics))
            cmds.append("cmph5tools.py sort {cmph5}".format(
                cmph5=real_upath(bin_cmph5)))
            cmds.append("loadChemistry.py {bas_fofn} {cmph5}".format(
                bas_fofn=real_upath(self.bas_fofn),
                cmph5=real_upath(bin_cmph5)))
        else:
            cmds.append("samtools sort {f} {d}".format(
                f=real_upath(bin_unsorted_bam_file),
                d=real_upath(bin_bam_prefix)))
            cmds.append(
                "samtools index {f}".format(f=real_upath(bin_bam_file)))

        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("pbindex {f}".format(f=real_upath(bin_bam_file)))
        cmds.append("variantCaller --algorithm=best " +
                    "{f} ".format(f=real_upath(quiver_input)) +
                    "--verbose -j{n} ".format(n=quiver_nproc) +
                    "--referenceFilename={ref} ".format(
                        ref=real_upath(bin_ref_fa)) +
                    "-o {fq}".format(fq=real_upath(bin_fq)))
        return cmds
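The returned commands are plain shell lines. A hedged usage sketch of how a caller in the same class might persist them to a per-bin script before running or submitting (file name and cid range are illustrative):

cmds = self.quiver_cmds_for_bin(cids=range(0, 100), quiver_nproc=8, bam=True)
with open("quivered/c0to99.sh", "w") as f:
    f.write("#!/bin/bash\n")
    f.write("\n".join(cmds) + "\n")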