Esempio n. 1
0
def sanity_check_gcon2():
    """Check that gcon2 can be invoked and return its executable name.

    Runs `<gcon2_py> --help` through execute, passing an error message
    which says that gcon2 is not installed, then returns gcon2_py.
    """
    help_cmd = "{0} --help".format(gcon2_py)
    not_installed_msg = "{0} is not installed.".format(gcon2_py)
    execute(cmd=help_cmd, errmsg=not_installed_msg)
    return gcon2_py
Esempio n. 2
0
def resolved_tool_contract_runner(rtc):
    """
    Remove the intermediate directories of every cluster bin.

    The chunk tasks are read from the first input file of `rtc`. For each
    task, the `tmp_dir` and `quivered_dir` of an IceFiles object rooted at
    the task's cluster output directory are deleted with `rm -rf`. Every
    removal is logged and recorded in the first output file of `rtc`.
    """
    chunk_tasks = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(t, ClusterChunkTask) for t in chunk_tasks])

    # Each cluster bin index must show up exactly once.
    bin_indices = [t.cluster_bin_index for t in chunk_tasks]
    assert len(bin_indices) == len(set(bin_indices))

    sentinel_path = rtc.task.output_files[0]
    with open(sentinel_path, 'w') as sentinel:
        for chunk_task in chunk_tasks:
            ice_files = IceFiles(prog_name="ice_cleanup",
                                 root_dir=chunk_task.cluster_out_dir)
            # Directories are looked up one at a time, in this order.
            for dir_attr in ("tmp_dir", "quivered_dir"):
                doomed_dir = getattr(ice_files, dir_attr)
                log.info("Cleaning up, removing %s", doomed_dir)
                sentinel.write("removing %s\n" % doomed_dir)
                execute("rm -rf %s" % doomed_dir)
Esempio n. 3
0
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    Convert every bax/bas.h5 file listed in fofn_filename to FASTA.

    For each input h5 file, <fasta_out_dir>/<basename>.fasta is produced
    with pls2fasta, unless that file already exists and force_overwrite is
    False. The paths of all FASTA files are finally written to
    out_filename, which should usually be 'input.fasta.fofn'.

    Raises ValueError when an entry does not end with .bax.h5 or .bas.h5.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_files = get_files_from_file_or_fofn(fofn_filename)
    fasta_files = []
    mkdir(fasta_out_dir)
    for h5_file in h5_files:
        logging.debug("converting h5 file: {f}.".format(f=h5_file))
        if not h5_file.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError(
                "fofn file {fofn} should only contain bax/bas.h5 files."
                .format(fofn=fofn_filename))

        # e.g. m111xxxx.1.bax.h5 ==> <fasta_out_dir>/m111xxxx.1.bax.h5.fasta
        fasta_file = op.join(fasta_out_dir, op.basename(h5_file) + '.fasta')
        if not op.exists(fasta_file) or force_overwrite:
            pls2fasta_cmd = (
                "pls2fasta {in_fn}  {out} "
                "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            ).format(in_fn=real_upath(h5_file), out=real_upath(fasta_file))
            execute(cmd=pls2fasta_cmd)
        else:
            logging.debug(
                "File {0} already exists. skipping.".format(fasta_file))
        fasta_files.append(fasta_file)
    write_files_to_fofn(fasta_files, out_filename)
Esempio n. 4
0
def sanity_check_gcon():
    """Check that gcon can be invoked and return its executable name.

    Runs `<gcon_py> --help` through execute, passing an error message
    which says that gcon is not installed, then returns gcon_py.
    """
    help_cmd = "{0} --help".format(gcon_py)
    not_installed_msg = "{0} is not installed.".format(gcon_py)
    execute(cmd=help_cmd, errmsg=not_installed_msg)
    return gcon_py
Esempio n. 5
0
def blasr_for_quiver(query_fn,
                     ref_fasta,
                     out_fn,
                     bam=False,
                     run_cmd=True,
                     blasr_nproc=12):
    """
    Build the blasr command aligning query_fn to ref_fasta, optionally
    run it, and return the command string.

    query_fn    --- should be in.raw.fasta|bam
    ref_fasta   --- reference fasta (ex: g_consensus.fasta) to align to
    out_fn      --- sam|bam output aligning query_fn to ref_fasta
    bam         --- request bam output instead of soft-clipped sam
    run_cmd     --- execute the command if True, otherwise only log it
    blasr_nproc --- value passed to blasr --nproc

    blasr query_fn ref_fasta --out out_fn --sam --clipping soft
    blasr query_fn ref_fasta --out out_fn --bam
    """
    template = ("blasr {query} {ref} --nproc {nproc} "
                "--bestn 5 --nCandidates 10 {fmt} --out {out} "
                "1>/dev/null 2>/dev/null")
    cmd = template.format(
        query=real_upath(query_fn),
        ref=real_upath(ref_fasta),
        nproc=blasr_nproc,
        fmt="--bam" if bam else "--sam --clipping soft",
        out=real_upath(out_fn))

    if not run_cmd:
        logging.debug("CMD: " + cmd)
        return cmd

    execute(cmd)
    return cmd
Esempio n. 6
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    Create a .fasta file for each .bax.h5/.bas.h5 file in fofn_filename.

    Output files are named <fasta_out_dir>/<h5 basename>.fasta and made
    with pls2fasta; an existing output is reused unless force_overwrite
    is True. The list of output paths is saved to out_filename, which
    should usually be 'input.fasta.fofn'.

    Raises ValueError when an entry does not end with .bax.h5 or .bas.h5.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    inputs = get_files_from_file_or_fofn(fofn_filename)
    outputs = []
    mkdir(fasta_out_dir)

    accepted_suffixes = ('.bax.h5', '.bas.h5')
    for h5_path in inputs:
        logging.debug("converting h5 file: {f}.".format(f=h5_path))
        if not h5_path.endswith(accepted_suffixes):
            raise ValueError(
                "fofn file {fofn} should only contain bax/bas.h5 files."
                .format(fofn=fofn_filename))

        # e.g. m111xxxx.1.bax.h5 ==> <fasta_out_dir>/m111xxxx.1.bax.h5.fasta
        fasta_path = op.join(fasta_out_dir, op.basename(h5_path) + '.fasta')
        reuse_existing = op.exists(fasta_path) and not force_overwrite
        if reuse_existing:
            logging.debug("File {0} already exists. skipping.".format(fasta_path))
        else:
            execute(cmd=(
                "pls2fasta {in_fn}  {out} "
                "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            ).format(in_fn=real_upath(h5_path), out=real_upath(fasta_path)))
        outputs.append(fasta_path)

    write_files_to_fofn(outputs, out_filename)
 def map_isoforms_to_reference_transcripts(self):
     """Align isoforms to the reference transcripts with blasr.

     The m5 output goes to <output_analysis_fn>.blasr.out.m5; the records
     read back from that file with BLASRM5Reader are returned as a list.
     """
     m5_path = self.output_analysis_fn + ".blasr.out.m5"
     blasr_cmd = 'blasr {0} {1} --bestn 1 -m 5 --out {2}'.format(
         self.isoseq_output_fa, self.reference_transcripts_fn, m5_path)
     execute(blasr_cmd)
     return list(BLASRM5Reader(m5_path))
Esempio n. 8
0
def resolved_tool_contract_runner(rtc):
    """
    Clean up intermediate directories of each cluster bin.

    Chunk tasks come from the first input file of `rtc`. For every task
    the `tmp_dir` and `quivered_dir` of an IceFiles object rooted at the
    task's cluster output directory are removed with `rm -rf` (paths are
    passed through real_upath). Each removal is logged and also written
    to the first output file of `rtc`.
    """
    chunk_tasks = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(t, ClusterChunkTask) for t in chunk_tasks])

    # A cluster bin index may not be shared by two tasks.
    bin_indices = [t.cluster_bin_index for t in chunk_tasks]
    assert len(bin_indices) == len(set(bin_indices))

    sentinel_path = rtc.task.output_files[0]
    with open(sentinel_path, 'w') as sentinel:
        for chunk_task in chunk_tasks:
            ice_files = IceFiles(prog_name="ice_cleanup",
                                 root_dir=chunk_task.cluster_out_dir)
            # Directories are looked up one at a time, in this order.
            for dir_attr in ("tmp_dir", "quivered_dir"):
                doomed_dir = getattr(ice_files, dir_attr)
                log.info("Cleaning up, removing %s", doomed_dir)
                sentinel.write("removing %s\n" % doomed_dir)
                execute("rm -rf %s" % real_upath(doomed_dir))
Esempio n. 9
0
 def map_isoforms_to_reference_transcripts(self):
     """Run blasr on the isoforms against the reference transcripts.

     Writes m5 alignments to <output_analysis_fn>.blasr.out.m5 and returns
     a list of the records BLASRM5Reader yields for that file.
     """
     alignments_m5 = self.output_analysis_fn + ".blasr.out.m5"
     fields = (self.isoseq_output_fa, self.reference_transcripts_fn,
               alignments_m5)
     execute('blasr %s %s --bestn 1 -m 5 --out %s' % fields)
     return list(BLASRM5Reader(alignments_m5))
Esempio n. 10
0
def copy_in_fasta_to_out(in_dir, out_dir, filename):
    """Copy in_dir/filename (e.g., from data) to out_dir/filename.

    out_dir is prepared with mknewdir before the file is copied with `cp`.
    Returns the path of the copy.
    """
    mknewdir(out_dir)
    source = op.join(in_dir, filename)
    target = op.join(out_dir, filename)
    execute(cmd="cp %s %s" % (source, target))
    return target
Esempio n. 11
0
def copy_in_fasta_to_out(in_dir, out_dir, filename):
    """Place a copy of `filename` found in in_dir (e.g., data) in out_dir.

    mknewdir is called on out_dir first; the copy itself is done by `cp`.
    Returns the path of the copied file, i.e. out_dir/filename.
    """
    mknewdir(out_dir)
    src_path, dst_path = [op.join(folder, filename)
                          for folder in (in_dir, out_dir)]
    copy_cmd = "cp {0} {1}".format(src_path, dst_path)
    execute(cmd=copy_cmd)
    return dst_path
def validate_with_Gencode(sorted_rep_sam, gencode_gtf, match_out):
    """
    Run matchAnnot.py on sorted_rep_sam and redirect its output to match_out.

    Input:
      sorted_rep_sam -- sorted SAM output mapping (collapsed) representative isoforms to reference
      gencode_gtf -- annotation file passed to matchAnnot.py as --gtf
      match_out -- file which receives the standard output of matchAnnot.py
    """
    log.info("Writing matchAnnot output to %s", match_out)
    match_cmd = "matchAnnot.py --gtf={gtf} {sam} > {out}".format(
        gtf=gencode_gtf, sam=sorted_rep_sam, out=match_out)
    execute(match_cmd)
def validate_with_Gencode(sorted_rep_sam, gencode_gtf, match_out):
    """
    Compare sorted_rep_sam with an annotation using matchAnnot.py.

    Input:
      sorted_rep_sam -- sorted SAM output mapping (collapsed) representative isoforms to reference
      gencode_gtf -- gtf file given to matchAnnot.py through --gtf
      match_out -- path the matchAnnot.py standard output is redirected to
    """
    log.info("Writing matchAnnot output to %s", match_out)
    pieces = (gencode_gtf, sorted_rep_sam, match_out)
    execute("matchAnnot.py --gtf=%s %s > %s" % pieces)
Esempio n. 14
0
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Align isoforms to a reference with gmap and write a sorted sam file.

    gmap writes to <sam_filename>.tmp (stderr to <sam_filename>.log); that
    file is sorted into sam_filename by sort_sam and then removed. If the
    first gmap call raises, gmap is run one more time after `sleep 3`.

    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    Raises:
        IOError -- the gmap input file does not exist
    """
    tmp_sam = sam_filename + ".tmp"
    gmap_log = sam_filename + ".log"

    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        reader = ContigSetReaderWrapper(input_filename)
        reads = reader.consolidate(out_prefix=sam_filename + '.input')
    else:
        reads = input_filename
    if not op.exists(reads):
        raise IOError("Gmap input file %s does not exists" % reads)

    # In order to prevent mount issues, cd to the gmap database directory,
    # list its files, and cd back before calling gmap.
    cwd = realpath(os.getcwd())
    warm_up_cmd = "cd {db} && ls *.iit *meta && sleep 3 && cd {back}".format(
        db=real_upath(op.join(gmap_db_dir, gmap_db_name)),
        back=real_upath(cwd))
    execute(warm_up_cmd)

    gmap_cmd = ' '.join([
        'gmap',
        '-D {0}'.format(real_upath(gmap_db_dir)),
        '-d {0}'.format(gmap_db_name),
        '-t {0}'.format(gmap_nproc),
        '-n 0 -z sense_force --cross-species -f samse',
        '--max-intronlength-ends 200000',  # for long genes
        real_upath(reads),
        '>',
        real_upath(tmp_sam),
        '2>{0}'.format(real_upath(gmap_log)),
    ])
    # Map isoforms to reference and output sam; one retry on failure.
    try:
        execute(gmap_cmd)
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(gmap_cmd)

    sort_sam(in_sam=tmp_sam, out_sam=sam_filename)

    # The unsorted sam file is only an intermediate.
    rmpath(tmp_sam)
Esempio n. 15
0
    def setUp(self):
        """Set up test directories and copy the input fasta to outDir."""
        test_name = self.testName
        self.inputDir = op.join(DATA_DIR, test_name)
        self.outDir = op.join(OUT_DIR, test_name)
        self.stdoutDir = op.join(STD_DIR, test_name)
        self.fastaFileName = "test_DazzIDHandler.fasta"

        # Expected outputs: <name without '.fasta'>.dazz.fasta and its pickle.
        fasta_stem = self.fastaFileName[:-6]
        self.stdout_dazz_fasta = op.join(self.stdoutDir,
                                         fasta_stem + ".dazz.fasta")
        self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"

        mknewdir(self.outDir)
        # Copy inputDir/test_DazzIDHandler.fasta to outDir.
        source = op.join(self.inputDir, self.fastaFileName)
        target = op.join(self.outDir, self.fastaFileName)
        execute("cp %s %s" % (source, target))
Esempio n. 16
0
def sort_sam(in_sam, out_sam):
    """
    Write the records of in_sam, sorted, to out_sam.

    The sam header is copied first with copy_sam_header. Non-header lines
    are then sorted on columns 3 and 4 (column 4 numerically) and appended
    to out_sam. When in_sam is empty, out_sam is only touched.
    """
    copy_sam_header(in_sam=in_sam, out_sam=out_sam)

    if os.stat(in_sam).st_size == 0:
        # Nothing to sort in an empty file.
        sort_cmd = ' '.join(['touch', out_sam])
    else:
        sort_cmd = ' '.join(['sort', '-k 3,3', '-k 4,4n', in_sam,
                             "| grep -v '^@' ", ' >> ', out_sam])

    execute(sort_cmd)
Esempio n. 17
0
    def setUp(self):
        """Define input/output paths and copy the test fasta to outDir."""
        self.inputDir, self.outDir, self.stdoutDir = [
            op.join(root, self.testName)
            for root in (DATA_DIR, OUT_DIR, STD_DIR)
        ]
        self.fastaFileName = "test_DazzIDHandler.fasta"

        # Drop the trailing '.fasta' (6 characters) to name expected files.
        dazz_name = self.fastaFileName[:-6] + ".dazz.fasta"
        self.stdout_dazz_fasta = op.join(self.stdoutDir, dazz_name)
        self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"

        mknewdir(self.outDir)
        # Copy inputDir/test_DazzIDHandler.fasta to outDir.
        copy_cmd = "cp {0} {1}".format(
            op.join(self.inputDir, self.fastaFileName),
            op.join(self.outDir, self.fastaFileName))
        execute(copy_cmd)
Esempio n. 18
0
 def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
     """Align queryFa against targetFa using BLASR, writing m5 to outFN.

     Nothing is run when outFN already exists. blasr_nproc is read from
     sge_opts; maxScore and bestn are read from ice_opts.
     """
     if op.exists(outFN):
         logging.info("{0} already exists. No need to run BLASR.".format(outFN))
         return

     template = ("blasr {q} {t} -m 5 --maxLCPLength 15 "
                 "--nproc {cpu} --maxScore {score} "
                 "--bestn {n} --nCandidates {n} --out {o} "
                 "1>/dev/null 2>/dev/null")
     blasr_cmd = template.format(q=real_upath(queryFa),
                                 t=real_upath(targetFa),
                                 cpu=sge_opts.blasr_nproc,
                                 score=ice_opts.maxScore,
                                 n=ice_opts.bestn,
                                 o=real_upath(outFN))
     logging.info("Calling {cmd}".format(cmd=blasr_cmd))
     execute(blasr_cmd)
Esempio n. 19
0
def sort_sam(in_sam, out_sam):
    """
    Sort the alignments of in_sam into out_sam.

    copy_sam_header is called first; the remaining (non '@') lines are
    sorted on columns 3 and 4 (column 4 numerically) and appended to
    out_sam. An empty in_sam results in a plain `touch` of out_sam.
    """
    # Copy SAM headers
    copy_sam_header(in_sam=in_sam, out_sam=out_sam)

    input_is_empty = os.stat(in_sam).st_size == 0
    if input_is_empty:
        pieces = ['touch', out_sam]
    else:
        # Sort the gmap output and drop header lines before appending.
        pieces = ['sort', '-k 3,3', '-k 4,4n', in_sam,
                  "| grep -v '^@' ", ' >> ', out_sam]

    execute(' '.join(pieces))
Esempio n. 20
0
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Align isoforms to a reference with gmap and write a sorted sam file.

    gmap writes to <sam_filename>.tmp (stderr to <sam_filename>.log). The
    header is then copied to sam_filename, the sorted alignments are
    appended to it, and the .tmp file is removed.

    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    Raises:
        IOError -- the gmap input file does not exist
    """
    tmp_sam = sam_filename + ".tmp"
    gmap_log = sam_filename + ".log"

    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        reader = ContigSetReaderWrapper(input_filename)
        reads = reader.consolidate(out_prefix=sam_filename+'.input')
    else:
        reads = input_filename
    if not op.exists(reads):
        raise IOError("Gmap input file %s does not exists" % reads)

    # Map isoforms to reference and output sam.
    gmap_cmd = ' '.join([
        'gmap',
        '-D {0}'.format(gmap_db_dir),
        '-d {0}'.format(gmap_db_name),
        '-t {0}'.format(gmap_nproc),
        '-n 0 -z sense_force --cross-species -f samse',
        reads,
        '>', tmp_sam,
        '2>{0}'.format(gmap_log),
    ])
    execute(gmap_cmd)

    # Header first, sorted alignments (header lines dropped) after it.
    copy_sam_header(in_sam=tmp_sam, out_sam=sam_filename)
    sort_cmd = ' '.join([
        'sort -k 3,3 -k 4,4n', tmp_sam, "| grep -v '^@' >>", sam_filename,
    ])
    execute(sort_cmd)

    # The unsorted sam file is only an intermediate.
    rmpath(tmp_sam)
Esempio n. 21
0
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, then sort the resulting sam.

    The raw gmap output is kept in <sam_filename>.tmp while gmap's stderr
    goes to <sam_filename>.log. sam_filename receives the sam header
    followed by the sorted alignments; the .tmp file is removed at the end.

    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    Raises:
        IOError -- the gmap input file does not exist
    """
    unsorted_sam = sam_filename + ".tmp"
    stderr_log = sam_filename + ".log"

    query = input_filename
    if query.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        query = ContigSetReaderWrapper(input_filename).consolidate(
            out_prefix=sam_filename + '.input')
    if not op.exists(query):
        raise IOError("Gmap input file %s does not exists" % query)

    # Call gmap to map isoforms to reference and output sam.
    gmap_pieces = ['gmap']
    gmap_pieces.append('-D {d}'.format(d=gmap_db_dir))
    gmap_pieces.append('-d {name}'.format(name=gmap_db_name))
    gmap_pieces.append('-t {nproc}'.format(nproc=gmap_nproc))
    gmap_pieces.extend(['-n 0', '-z sense_force', '--cross-species',
                        '-f samse'])
    gmap_pieces.extend([query, '>', unsorted_sam])
    gmap_pieces.append('2>{log}'.format(log=stderr_log))
    execute(' '.join(gmap_pieces))

    # Copy SAM headers
    copy_sam_header(in_sam=unsorted_sam, out_sam=sam_filename)

    # Sort the gmap output, drop header lines, append to the output.
    sort_pieces = ['sort', '-k 3,3', '-k 4,4n', unsorted_sam,
                   "| grep -v '^@'", '>>', sam_filename]
    execute(' '.join(sort_pieces))

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam)
Esempio n. 22
0
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Align isoforms to a reference with gmap and write a sorted sam file.

    gmap writes to <sam_filename>.tmp (stderr to <sam_filename>.log); that
    file is sorted into sam_filename by sort_sam and then removed. If the
    first gmap call raises, gmap is run one more time after `sleep 3`.

    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    Raises:
        IOError -- the gmap input file does not exist
    """
    tmp_sam = sam_filename + ".tmp"
    gmap_log = sam_filename + ".log"

    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        reader = ContigSetReaderWrapper(input_filename)
        reads = reader.consolidate(out_prefix=sam_filename+'.input')
    else:
        reads = input_filename
    if not op.exists(reads):
        raise IOError("Gmap input file %s does not exists" % reads)

    # In order to prevent mount issues, cd to the gmap database directory,
    # list its files, and cd back before calling gmap.
    cwd = realpath(os.getcwd())
    warm_up_cmd = "cd {db} && ls *.iit *meta && sleep 3 && cd {back}".format(
        db=op.join(gmap_db_dir, gmap_db_name), back=cwd)
    execute(warm_up_cmd)

    gmap_cmd = ' '.join([
        'gmap',
        '-D {0}'.format(gmap_db_dir),
        '-d {0}'.format(gmap_db_name),
        '-t {0}'.format(gmap_nproc),
        '-n 0 -z sense_force --cross-species -f samse',
        reads,
        '>', tmp_sam,
        '2>{0}'.format(gmap_log),
    ])
    # Map isoforms to reference and output sam; one retry on failure.
    try:
        execute(gmap_cmd)
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(gmap_cmd)

    sort_sam(in_sam=tmp_sam, out_sam=sam_filename)

    # The unsorted sam file is only an intermediate.
    rmpath(tmp_sam)
Esempio n. 23
0
    def test_concat_bam(self):
        """Test concat_bam on two sets of bam files.

        The second, concatenated bam is converted to sam with samtools
        and compared with the expected sam using self.cmp_sam.
        """
        # Case 1: aligned bam files with only one RG, one SN.
        single_rg_bams = [op.join(self.moreDir, "%d.bam" % idx)
                          for idx in range(1, 5)]
        merged_bam_1 = op.join(self.outDir, "test_concat_bam_1.bam")
        from pbtranscript.ice.IceUtils import concat_bam
        concat_bam(single_rg_bams, merged_bam_1)
        self.assertTrue(op.exists(merged_bam_1))

        # Case 2: aligned bam files concatenated to a big bam.
        aligned_bams = [op.join(self.moreDir, "aligned.%d.bam" % idx)
                        for idx in range(1, 6)]
        merged_bam_2 = op.join(self.outDir, "test_concat_bam_2.bam")
        from pbtranscript.Utils import execute
        concat_bam(aligned_bams, merged_bam_2)
        self.assertTrue(op.exists(merged_bam_2))

        # Convert the big bam to sam and compare with the expected output.
        merged_sam = merged_bam_2 + ".sam"
        expected_sam = op.join(self.sivStdoutDir, "test_concat_bam_2.sam")
        execute(cmd="samtools view -h %s -o %s" % (merged_bam_2, merged_sam))
        self.cmp_sam(merged_sam, expected_sam)
Esempio n. 24
0
def blasr_for_quiver(query_fn, ref_fasta, out_fn, bam=False,
                     run_cmd=True, blasr_nproc=12):
    """
    Compose a blasr command which aligns query_fn to ref_fasta; run it
    when run_cmd is True, otherwise only log it. The command string is
    returned in both cases.

    query_fn    --- should be in.raw.fasta|bam
    ref_fasta   --- reference fasta (ex: g_consensus.fasta) to align to
    out_fn      --- sam|bam output aligning query_fn to ref_fasta
    bam         --- output bam if True, soft-clipped sam otherwise
    blasr_nproc --- value passed to blasr --nproc

    blasr query_fn ref_fasta --out out_fn --sam --clipping soft
    blasr query_fn ref_fasta --out out_fn --bam
    """
    if bam:
        output_flags = "--bam"
    else:
        output_flags = "--sam --clipping soft"

    cmd = (
        "blasr {i} {r} --nproc {n} --bestn 5 --nCandidates 10 "
        "{flags} --out {o} 1>/dev/null 2>/dev/null"
    ).format(i=real_upath(query_fn),
             r=real_upath(ref_fasta),
             n=blasr_nproc,
             flags=output_flags,
             o=real_upath(out_fn))

    if run_cmd:
        execute(cmd)
    else:
        logging.debug("CMD: " + cmd)
    return cmd
Esempio n. 25
0
    def test_as_contigset(self):
        """Test as_contigset on an empty fasta and on a copied data fasta."""
        work_dir = op.join(OUT_DIR, 'test_Utils')
        mknewdir(work_dir)

        # Case 1: an empty fasta file.
        empty_fa = op.join(work_dir, "empty.fasta")
        empty_xml = op.join(work_dir, "empty.contigset.xml")
        empty_fai = empty_fa + ".fai"
        execute("touch %s" % empty_fa)
        as_contigset(empty_fa, empty_xml)
        for expected in (empty_xml, empty_fai):
            self.assertTrue(op.exists(expected))

        # Case 2: a fasta copied from DATA_DIR; output equal to input first.
        fasta_name = 'reads_of_insert.fasta'
        reads_fa = op.join(work_dir, fasta_name)
        shutil.copy(src=op.join(DATA_DIR, fasta_name), dst=reads_fa)
        as_contigset(reads_fa, reads_fa)

        reads_fai = reads_fa + ".fai"
        reads_xml = op.join(work_dir, 'reads_of_insert.contigset.xml')
        as_contigset(reads_fa, reads_xml)
        for expected in (reads_xml, reads_fai):
            self.assertTrue(op.exists(expected))
Esempio n. 26
0
    def test_concat_bam(self):
        """Test concat_bam on two sets of bam files.

        The second, concatenated bam is converted to sam with samtools
        and must be identical to the expected sam according to filecmp.
        """
        # Case 1: aligned bam files with only one RG, one SN.
        single_rg_bams = [op.join(self.moreDir, "%d.bam" % idx)
                          for idx in range(1, 5)]
        merged_bam_1 = op.join(self.outDir, "test_concat_bam_1.bam")
        from pbtranscript.ice.IceUtils import concat_bam
        concat_bam(single_rg_bams, merged_bam_1)
        self.assertTrue(op.exists(merged_bam_1))

        # Case 2: aligned bam files concatenated to a big bam.
        aligned_bams = [op.join(self.moreDir, "aligned.%d.bam" % idx)
                        for idx in range(1, 6)]
        merged_bam_2 = op.join(self.outDir, "test_concat_bam_2.bam")
        from pbtranscript.Utils import execute
        concat_bam(aligned_bams, merged_bam_2)
        self.assertTrue(op.exists(merged_bam_2))

        # Convert the big bam to sam and compare with the expected output.
        merged_sam = merged_bam_2 + ".sam"
        expected_sam = op.join(self.sivStdoutDir, "test_concat_bam_2.sam")
        view_cmd = "samtools view -h {0} -o {1}".format(merged_bam_2,
                                                        merged_sam)
        execute(cmd=view_cmd)
        self.assertTrue(filecmp.cmp(merged_sam, expected_sam))
Esempio n. 27
0
    def make_db(self):
        """Make dazz database for the dazz fasta file.

        1. DBrm, only when self.db_filename already exists
        2. fasta2DB
        3. DBsplit -s200

        Raises RuntimeError when self.dazz_filename does not exist.
        """
        dazz_fasta = self.dazz_filename
        log.debug("Making DAZZ database for %s.", dazz_fasta)
        if not op.exists(dazz_fasta):
            raise RuntimeError("%s hasn't been converted to daligner-compatible format." %
                               self.input_filename)

        # Drop a database left over from a previous run.
        if op.exists(self.db_filename):
            execute(cmd="DBrm %s" % dazz_fasta)

        execute(cmd="fasta2DB {0} {0} ".format(dazz_fasta))
        execute("DBsplit -s200 %s" % dazz_fasta)
Esempio n. 28
0
    def make_db(self):
        """Build the dazz database of self.dazz_filename.

        Commands run, in order:
        1. DBrm (skipped unless self.db_filename exists)
        2. fasta2DB
        3. DBsplit -s200

        Raises RuntimeError when self.dazz_filename does not exist.
        """
        log.debug("Making DAZZ database for %s.", self.dazz_filename)
        if not op.exists(self.dazz_filename):
            not_converted = (
                "%s hasn't been converted to daligner-compatible format." %
                self.input_filename)
            raise RuntimeError(not_converted)

        # An existing database is removed before it is rebuilt.
        if op.exists(self.db_filename):
            remove_cmd = "DBrm %s" % self.dazz_filename
            execute(cmd=remove_cmd)

        build_cmd = "fasta2DB %s %s " % (self.dazz_filename,
                                         self.dazz_filename)
        execute(cmd=build_cmd)

        split_cmd = "DBsplit -s200 %s" % self.dazz_filename
        execute(split_cmd)
Esempio n. 29
0
def concat_sam(samfiles, outsam_filename):
    """
    Concatenate sam files into outsam_filename, merging their headers.

    Every input file is expected to start with exactly these four header
    lines, in this order:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fasta       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    The output consists of the @HD line of the first file, one @SQ line
    per input file, the @RG and @PG lines of the first file, and then the
    remaining lines of all input files in order.

    NOTE: check for M5 conflicts; when an @SQ line carries an M5 which has
    been written already, its digest is replaced by a random permutation
    of its own characters which has not been written yet.

    Parameters:
        samfiles -- list of input sam files, must not be empty
        outsam_filename -- output sam file; <outsam_filename>.sq and
                           <outsam_filename>.bd are used as scratch files
    Raises:
        ValueError -- samfiles is empty
        AssertionError -- an input file lacks the expected header lines
    The final `cat` is run through execute with errcls=IOError.
    """
    # Validate before touching the file system, so that a bad call neither
    # leaks file handles nor leaves empty scratch files behind.
    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    sq_filename = outsam_filename + '.sq'
    bd_filename = outsam_filename + '.bd'

    # @HD, @RG and @PG are taken from the first file only; @RG and @PG
    # are written after all @SQ lines.
    with open(samfiles[0]) as h:
        hd_line = h.readline()
        assert hd_line.startswith('@HD')
        assert h.readline().startswith('@SQ')
        rg_line = h.readline()
        assert rg_line.startswith('@RG')
        pg_line = h.readline()
        assert pg_line.startswith('@PG')

    # M5 tokens written so far, always stored with their 'M5:' prefix so
    # that original and regenerated values are compared alike.
    md5_seen = set()

    with open(sq_filename, 'w') as f_sq, open(bd_filename, 'w') as f_bd:
        f_sq.write(hd_line)
        for f in samfiles:
            with open(f) as h:
                assert h.readline().startswith('@HD')
                line = h.readline()
                assert line.startswith('@SQ')
                # ------- check for MD5 conflicts ----------- #
                m5 = line.strip().split()[-1]
                assert m5.startswith("M5:")
                if m5 in md5_seen:
                    # Keep the character list apart from the joined
                    # candidate: random.shuffle needs a mutable sequence
                    # on every iteration.
                    chars = list(m5[3:])
                    while True:
                        random.shuffle(chars)
                        s = "".join(chars)
                        if 'M5:' + s not in md5_seen:
                            break
                    line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                    logging.debug("MD5 conflict: change to {0}".format(s))
                    m5 = 'M5:' + s
                md5_seen.add(m5)
                f_sq.write(line)
                # ----- end MD5 checking and writing --------- #
                assert h.readline().startswith('@RG')
                assert h.readline().startswith('@PG')
                for line in h:
                    f_bd.write(line)
        f_sq.write(rg_line)
        f_sq.write(pg_line)

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(sq_filename)
    os.remove(bd_filename)
Esempio n. 30
0
def concat_sam(samfiles, outsam_filename):
    """
    Concatenate sam files into outsam_filename, merging their headers.

    Every input file is expected to start with exactly these four header
    lines, in this order:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fasta       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    The output consists of the @HD line of the first file, one @SQ line
    per input file, the @RG and @PG lines of the first file, and then the
    remaining lines of all input files in order.

    NOTE: check for M5 conflicts; when an @SQ line carries an M5 which has
    been written already, its digest is replaced by a random permutation
    of its own characters which has not been written yet.

    Parameters:
        samfiles -- list of input sam files, must not be empty
        outsam_filename -- output sam file; <outsam_filename>.sq and
                           <outsam_filename>.bd are used as scratch files
    Raises:
        ValueError -- samfiles is empty
        AssertionError -- an input file lacks the expected header lines
    The final `cat` is run through execute with errcls=IOError.
    """
    # Validate before touching the file system, so that a bad call neither
    # leaks file handles nor leaves empty scratch files behind.
    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    sq_filename = outsam_filename + '.sq'
    bd_filename = outsam_filename + '.bd'

    # @HD, @RG and @PG are taken from the first file only; @RG and @PG
    # are written after all @SQ lines.
    with open(samfiles[0]) as h:
        hd_line = h.readline()
        assert hd_line.startswith('@HD')
        assert h.readline().startswith('@SQ')
        rg_line = h.readline()
        assert rg_line.startswith('@RG')
        pg_line = h.readline()
        assert pg_line.startswith('@PG')

    # M5 tokens written so far, always stored with their 'M5:' prefix so
    # that original and regenerated values are compared alike.
    md5_seen = set()

    with open(sq_filename, 'w') as f_sq, open(bd_filename, 'w') as f_bd:
        f_sq.write(hd_line)
        for f in samfiles:
            with open(f) as h:
                assert h.readline().startswith('@HD')
                line = h.readline()
                assert line.startswith('@SQ')
                # ------- check for MD5 conflicts ----------- #
                m5 = line.strip().split()[-1]
                assert m5.startswith("M5:")
                if m5 in md5_seen:
                    # Keep the character list apart from the joined
                    # candidate: random.shuffle needs a mutable sequence
                    # on every iteration.
                    chars = list(m5[3:])
                    while True:
                        random.shuffle(chars)
                        s = "".join(chars)
                        if 'M5:' + s not in md5_seen:
                            break
                    line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                    logging.debug("MD5 conflict: change to {0}".format(s))
                    m5 = 'M5:' + s
                md5_seen.add(m5)
                f_sq.write(line)
                # ----- end MD5 checking and writing --------- #
                assert h.readline().startswith('@RG')
                assert h.readline().startswith('@PG')
                for line in h:
                    f_bd.write(line)
        f_sq.write(rg_line)
        f_sq.write(pg_line)

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(sq_filename)
    os.remove(bd_filename)
Esempio n. 31
0
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename,
                                ice_opts,
                                probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': ids of reads without any accepted hit}
    to out_pickle. 'nohit' is a set in a .pickle file and a sorted list in
    a .json file (JSON has no set type).

    input_fasta --- reads to align (query of the BLASR call)
    ref_fasta --- consensus isoforms (target of the BLASR call)
    out_pickle --- output file, must end with '.pickle' or '.json'
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used
    ice_opts --- options object; min_match_len, ece_penalty, ece_min_len,
                 max_missed_start, max_missed_end, full_missed_start and
                 full_missed_end are read from it
    probqv --- object whose get_smoothed and get_mean are forwarded to
               blasr_against_ref2
    qv_prob_threshold --- forwarded to blasr_against_ref2
    cpus --- value passed to blasr --nproc
    no_qv_or_aln_checking --- accepted but not used by this function
    tmp_dir --- directory for the intermediate BLASR output; if None the
                file is named after input_fasta's basename only, i.e. it
                is resolved against the current working directory
    sID_starts_with_c --- forwarded to blasr_against_ref2

    Raises IOError if out_pickle has an unrecognized extension. The check
    is done before BLASR runs, so no intermediate or empty output file is
    left behind in that case.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)
    # Validate the output format up front instead of after the alignment.
    write_json = out_pickle.endswith(".json")
    if not (write_json or out_pickle.endswith(".pickle")):
        raise IOError("Unrecognized extension: %s" % out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    # (they are forwarded anyway, as in the original call).
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    # Maps each isoform (cluster) id to the reads which can map to it.
    # Sets are used while collecting so a read is counted once per isoform.
    reads_by_isoform = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are kept; presumably ece_arr is None
        # for hits rejected inside blasr_against_ref2 -- TODO confirm.
        if h.ece_arr is not None:
            reads_by_isoform.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    partial_uc = dict((cid, list(qids))
                      for cid, qids in reads_by_isoform.items())

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    if write_json:
        # A set is not JSON serializable (json.dumps raises TypeError), so
        # emit the ids as a sorted list; serialize before opening the file
        # so a failure cannot leave a truncated output behind.
        payload = json.dumps({'partial_uc': partial_uc,
                              'nohit': sorted(nohit)})
        with open(out_pickle, 'w') as f:
            f.write(payload)
    else:
        with open(out_pickle, 'w') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Esempio n. 32
0
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12,
                          tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': ids of reads without any accepted hit}
    to out_pickle. 'nohit' is a set in a .pickle file and a sorted list in
    a .json file (JSON has no set type).

    out_pickle --- output file, must end with '.pickle' or '.json'
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    done_filename --- sentinel file touched on success; if None,
    out_pickle + '.DONE' is used
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    tmp_dir --- directory for the intermediate BLASR output; if None the
    file is named after input_fasta's basename only, i.e. it is resolved
    against the current working directory

    Raises IOError if out_pickle has an unrecognized extension. The check
    is done before BLASR runs, so no intermediate or empty output file is
    left behind in that case.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)
    # Validate the output format up front instead of after the alignment.
    write_json = out_pickle.endswith(".json")
    if not (write_json or out_pickle.endswith(".pickle")):
        raise IOError("Unrecognized extension: %s" % out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # Maps each isoform (cluster) id to the reads which can map to it.
    # Sets are used while collecting so a read is counted once per isoform.
    reads_by_isoform = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are kept; presumably ece_arr is None
        # for hits rejected inside blasr_against_ref -- TODO confirm.
        if h.ece_arr is not None:
            reads_by_isoform.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    partial_uc = dict((cid, list(qids))
                      for cid, qids in reads_by_isoform.items())

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    if write_json:
        # A set is not JSON serializable (json.dumps raises TypeError), so
        # emit the ids as a sorted list; serialize before opening the file
        # so a failure cannot leave a truncated output behind.
        payload = json.dumps({'partial_uc': partial_uc,
                              'nohit': sorted(nohit)})
        with open(out_pickle, 'w') as f:
            f.write(payload)
    else:
        with open(out_pickle, 'w') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Esempio n. 33
0
def test_end_to_end():
    """Run separate_flnc.py from the command line once per binning mode and
    input type; every invocation must exit gracefully."""
    by_primer = "separate_flnc.py %s %s %s --bin_by_primer"
    by_size = "separate_flnc.py %s %s %s --bin_size_kb 1"
    by_manual = "separate_flnc.py %s %s %s --bin_manual '[0,3,4,6]'"

    # (command template, input file, output directory name, output pickle name)
    # Listed in the order the commands are to be executed.
    runs = [
        (by_primer, FLNC_FASTA,
         "separate_flnc_by_primer_fasta_input_e2e", "end_to_end1.pickle"),
        (by_primer, FLNC_DATASET,
         "separate_flnc_by_primer_xml_input_e2e", "end_to_end2.pickle"),
        (by_size, FLNC_FASTA,
         "separate_flnc_by_size_fasta_input_e2e", "end_to_end3.pickle"),
        (by_size, FLNC_DATASET,
         "separate_flnc_by_size_xml_input_e2e", "end_to_end4.pickle"),
        (by_manual, FLNC_FASTA,
         "separate_flnc_by_size_fasta_input_manual_e2e", "end_to_end5.pickle"),
        (by_manual, FLNC_DATASET,
         "separate_flnc_by_size_xml_input_manual_e2e", "end_to_end6.pickle"),
    ]

    for template, flnc_input, out_dir_name, pickle_name in runs:
        out_dir = op.join(OUT_DIR, out_dir_name)
        out_pickle = op.join(OUT_DIR, pickle_name)
        execute(template % (flnc_input, out_dir, out_pickle))
Esempio n. 34
0
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': ids of reads without any accepted hit}
    to out_pickle. 'nohit' is a set in a .pickle file and a sorted list in
    a .json file (JSON has no set type).

    out_pickle --- output file, must end with '.pickle' or '.json'
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    done_filename --- sentinel file touched on success; if None,
    out_pickle + '.DONE' is used
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    tmp_dir --- directory for the intermediate BLASR output; if None the
    file is named after input_fasta's basename only, i.e. it is resolved
    against the current working directory

    Raises IOError if out_pickle has an unrecognized extension. The check
    is done before BLASR runs, so no intermediate or empty output file is
    left behind in that case.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)
    # Validate the output format up front instead of after the alignment.
    write_json = out_pickle.endswith(".json")
    if not (write_json or out_pickle.endswith(".pickle")):
        raise IOError("Unrecognized extension: %s" % out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # Maps each isoform (cluster) id to the reads which can map to it.
    # Sets are used while collecting so a read is counted once per isoform.
    reads_by_isoform = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are kept; presumably ece_arr is None
        # for hits rejected inside blasr_against_ref -- TODO confirm.
        if h.ece_arr is not None:
            reads_by_isoform.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    partial_uc = dict((cid, list(qids))
                      for cid, qids in reads_by_isoform.items())

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    if write_json:
        # A set is not JSON serializable (json.dumps raises TypeError), so
        # emit the ids as a sorted list; serialize before opening the file
        # so a failure cannot leave a truncated output behind.
        payload = json.dumps({'partial_uc': partial_uc,
                              'nohit': sorted(nohit)})
        with open(out_pickle, 'w') as f:
            f.write(payload)
    else:
        with open(out_pickle, 'w') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Esempio n. 35
0
def test_end_to_end():
    """Run separate_flnc.py from the command line once per binning mode and
    input type; every invocation must exit gracefully."""
    by_primer = "separate_flnc.py %s %s %s --bin_by_primer"
    by_size = "separate_flnc.py %s %s %s --bin_size_kb 1"
    by_manual = "separate_flnc.py %s %s %s --bin_manual '[0,3,4,6]'"

    # (command template, input file, output directory name, output pickle name)
    # Listed in the order the commands are to be executed.
    runs = [
        (by_primer, FLNC_FASTA,
         "separate_flnc_by_primer_fasta_input_e2e", "end_to_end1.pickle"),
        (by_primer, FLNC_DATASET,
         "separate_flnc_by_primer_xml_input_e2e", "end_to_end2.pickle"),
        (by_size, FLNC_FASTA,
         "separate_flnc_by_size_fasta_input_e2e", "end_to_end3.pickle"),
        (by_size, FLNC_DATASET,
         "separate_flnc_by_size_xml_input_e2e", "end_to_end4.pickle"),
        (by_manual, FLNC_FASTA,
         "separate_flnc_by_size_fasta_input_manual_e2e", "end_to_end5.pickle"),
        (by_manual, FLNC_DATASET,
         "separate_flnc_by_size_xml_input_manual_e2e", "end_to_end6.pickle"),
    ]

    for template, flnc_input, out_dir_name, pickle_name in runs:
        out_dir = op.join(OUT_DIR, out_dir_name)
        out_pickle = op.join(OUT_DIR, pickle_name)
        execute(template % (flnc_input, out_dir, out_pickle))