Example #1
def merge(proc_dir, job1, jobn, n, stride):
    pipeline = Plumber()
    map_jobn = NuMap(worker_num=jobn,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    p1 = Piper(Worker(merge_star, (proc_dir, n)), parallel=map_jobn)
    p2 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2))
    return pipeline
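Each of these factory functions returns an unstarted Plumber. A minimal usage sketch, following the start/run/wait pattern from the later examples (the argument values and input_items below are hypothetical):

# hypothetical arguments and inputs; merge() is the factory defined above
pipeline = merge("/data/proc", job1=4, jobn=8, n=4, stride=1)
pipeline.start([input_items])  # input_items: an iterable of work items
pipeline.run()                 # process the data in the background
pipeline.wait()                # block until all items have been processed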
Example #2
def csort(proc_dir, job1, jobn, memn, n, stride):
    csort_mem = "%sG" % memn
    pipeline = Plumber()
    map_jobn = NuMap(worker_num=jobn,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    p1 = Piper(Worker(sambamba_csort, (proc_dir, n, csort_mem)),
               parallel=map_jobn)
    p2 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2))
    return pipeline
Example #3
from numap import NuMap
from papy.core import Plumber, Piper

def l33t(inbox):
    word = inbox[0]
    return word.replace('e', '3').replace('o', '0')

def l33ter(inbox):
    word = inbox[0]
    return word.replace('l', '1')

# execution engine
numap = NuMap()

# function nodes
l33t_piper = Piper(l33t, parallel=numap)
l33ter_piper = Piper(l33ter, parallel=numap, track=True)

# topology
pipeline = Plumber()
pipeline.add_pipe((l33t_piper, l33ter_piper))
end = pipeline.get_outputs()[0]

# runtime
pipeline.start([['hello', 'world']])
pipeline.run()
pipeline.wait()
print pipeline.stats['pipers_tracked'][end]
assert [{0: 'h3110', 1: 'w0r1d'}] == pipeline.stats['pipers_tracked'][end]
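For comparison, the same two-stage topology can be built without an explicit NuMap. A minimal sketch, assuming that a Piper created without the parallel argument falls back to lazy, in-process evaluation (the serial_* names are hypothetical):

# hypothetical serial variant of the pipeline above; no NuMap attached
serial_l33t = Piper(l33t)
serial_l33ter = Piper(l33ter, track=True)

serial_pipeline = Plumber()
serial_pipeline.add_pipe((serial_l33t, serial_l33ter))
serial_out = serial_pipeline.get_outputs()[0]

serial_pipeline.start([['hello', 'world']])
serial_pipeline.run()
serial_pipeline.wait()
print serial_pipeline.stats['pipers_tracked'][serial_out]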


Example #4
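This snippet starts mid-tutorial: the imports and the cleaner and timestamp functions come from earlier steps that are not shown here. What follows is a minimal sketch of plausible definitions, not the tutorial's actual code; the Seq class and both function bodies are assumptions inferred from the dotted input strings and the meta['timestamp'] lookup at the end.

from time import time
from numap import NuMap
from papy.core import Plumber, Piper, Worker


class Seq(str):
    # hypothetical: a string subclass carrying a metadata dict, so results
    # can hold the time stamp that is printed at the end of the example
    def __new__(cls, sequence):
        obj = str.__new__(cls, sequence)
        obj.meta = {}
        return obj


def cleaner(inbox):
    # hypothetical: strip the '.' gap characters from the raw sequence
    return Seq(inbox[0].replace('.', ''))


def timestamp(inbox):
    # hypothetical: record when the item was processed
    seq = inbox[0]
    seq.meta['timestamp'] = time()
    return seq
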
# wraps timestamp
stamper = Worker(timestamp)
# >>> arr = stamper([arr])

# Step 3 (representing computational resources)
# creates a resource that can utilize all local processors
local_computer = NuMap()

# Step 4 (creating processing nodes)
# this attaches a single computational resource to the two processing nodes;
# the stamper_node will be tracked, i.e. it will store the results of
# computation in memory.
cleaner_node = Piper(cleaner, parallel=local_computer)
stamper_node = Piper(stamper, parallel=local_computer, track=True)

# Step 5 (constructing a workflow graph)
# we construct a workflow graph, add the two processing nodes, and define
# the connection between them.
workflow = Plumber()
workflow.add_pipe((cleaner_node, stamper_node))

# Step 6 (execute the workflow)
# this starts the workflow, processes the data in the "background", and
# waits until all data items have been processed.
workflow.start([['AGA.TA', 'TG..AA']])
workflow.run()
workflow.wait()
results = workflow.stats['pipers_tracked'][stamper_node][0]
for seq in results.values():
    print "Object \"%s\" has time stamp: %s " % (seq, seq.meta['timestamp'])
Example #5
def final(proc_dir, job1, jobn, n, stride, full_model, genome_idx, genome_seq,
          skip_mixcr, skip_cover):

    ## bamstat
    bamstat_cfg = {
        "id": "bamstat",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_bamstat.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("idxstat", "txt"),
            ("flgstat", "txt"),
        ),
        "params": {}
    }

    virus_cfg = {
        "id": "virus",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_virus.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("call", "txt"),
        ),
        "params": {}
    }

    gzip_cfg = {
        "id": "gzip",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_gzip.sh"),
        "in": (
            "sj",
            "junc_se",
            "junc_pe",
        ),
        "out": (("main", "log"), ),
        "params": {}
    }

    cram_cfg = {
        "id": "pack",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_cram.sh"),
        "in": (
            "alig_csort",
            "chim_pe_csort",
            "chim_se_csort",
        ),
        "out": (
            ("main", "log"),
            ("alig_csort", "cram"),
            ("chim_pe_csort", "cram"),
            ("chim_se_csort", "cram"),
        ),
        "params": {
            "genome": genome_seq,
            "qbin": "2,10,20,25,30,35,40,42"
        }
    }

    cover_cfg = {
        "id": "cover",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_coverage.py"),
        "in": (
            "alig_csort",
            "chim_pe_csort",
            "chim_se_csort",
        ),
        "out": (
            ("main", "log"),
            ("alig_csort", "bw"),
            ("chim_pe_csort", "bw"),
            ("chim_se_csort", "bw"),
        ),
        "params": {
            "chrom_length": genome_idx / "chrNameLength.txt"
        }
    }

    mixcr_cfg = {
        "id": "mixcr",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_mixcr.sh"),
        "in": (
            "alig_csort",
            "unmap_1",
            "unmap_2",
        ),
        "out": (
            ("main", "log"),
            ("aln.rep", "txt"),
            ("fix1.rep", "txt"),
            ("fix2.rep", "txt"),
            ("ext.rep", "txt"),
            ("asm.rep", "txt"),
            ("index", "bin"),
            ("alig", "vdjca"),
            ("clone", "clns"),
            ("dir", None),
        ),
        "params": {
            "p": n,
            "TRA": "chr14:21543538-22556037",
            "TRB": "chr7:142290000-142820000",
            "TRG": "chr7:38237000-38382000",
            "IGK": "chr2:88789991-90313268",
            "IGH": "chr14:105580000-106880000",
            "IGL": "chr22:22020000-22927000",
        }
    }

    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    map_jobn = NuMap(worker_num=jobn,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    p1 = Piper(Worker(ipasser), parallel=map_job1)
    p2b = Piper(Worker(script, (cram_cfg, )), parallel=map_job1)
    p2c = Piper(Worker(script, (gzip_cfg, )), parallel=map_job1)
    p2d = Piper(Worker(script, (bamstat_cfg, )), parallel=map_job1)
    p2e = Piper(Worker(script, (virus_cfg, )), parallel=map_job1)
    p3 = Piper(Worker(npasser), parallel=map_job1)
    pipeline.add_pipe((p1, p2b, p3))
    pipeline.add_pipe((p1, p2c, p3))
    pipeline.add_pipe((p1, p2d, p3))
    pipeline.add_pipe((p1, p2e, p3))
    if not skip_mixcr:
        p2f = Piper(Worker(script, (mixcr_cfg, )), parallel=map_jobn)
        pipeline.add_pipe((p1, p2f, p3))
    if not skip_cover:
        p2g = Piper(Worker(script, (cover_cfg, )), parallel=map_job1)
        pipeline.add_pipe((p1, p2g, p3))
    return pipeline
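The topology above fans out from a single upstream Piper (p1) into several branches and re-joins them at p3: each add_pipe call that reuses existing endpoints adds another branch to the same graph, and a Piper with several upstream Pipers receives one inbox entry per branch. A distilled sketch of the pattern (toy, hypothetical worker functions; assumes those inbox semantics):

from numap import NuMap
from papy.core import Plumber, Piper


def split(inbox):
    # upstream fan-out point: pass the item through unchanged
    return inbox[0]


def upper(inbox):
    return inbox[0].upper()


def lower(inbox):
    return inbox[0].lower()


def join(inbox):
    # receives one entry per upstream branch, e.g. ('HELLO', 'hello')
    return tuple(inbox)


numap = NuMap()
p_split = Piper(split, parallel=numap)
p_upper = Piper(upper, parallel=numap)
p_lower = Piper(lower, parallel=numap)
p_join = Piper(join, parallel=numap, track=True)

pipeline = Plumber()
pipeline.add_pipe((p_split, p_upper, p_join))
pipeline.add_pipe((p_split, p_lower, p_join))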
Example #6
def quant(proc_dir, job1, jobn, n, stride, unstranded, prot_model, full_model,
          linc_model):

    ## gene counting
    prot_cfg = {
        "id": "prot",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            "stranded": "0",  # always unstranded
            "duplicates": "",  # count duplicates
            "gtf": prot_model,
            "n": n,
            "xargs": ""
        }
    }

    full_cfg = {
        "id": "full",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            "stranded": "0"
            if unstranded else "1",  # first read is on the transcript strand
            "duplicates": "",  # count duplicates
            "gtf": full_model,
            "n": n,
            "xargs": "",
        }
    }

    linc_cfg = {
        "id": "linc",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            "stranded": "0"
            if unstranded else "1",  # first read is on the transcript strand
            "duplicates": "",  # count duplicates
            "gtf": linc_model,
            "n": n,
            "xargs": ""
        }
    }

    cfgs = [prot_cfg, full_cfg, linc_cfg]
    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    # a quarter of the job1 workers, shared by the three counting branches below
    map_job4 = NuMap(worker_num=job1 // 4,
                     ordered=False,
                     stride=stride,
                     buffer=10000)

    p1 = Piper(Worker(ipasser), parallel=map_job1)
    p2 = Piper(Worker(npasser), parallel=map_job1)
    for cfg in cfgs:
        p = Piper(Worker(script, (cfg, )), parallel=map_job4)
        pipeline.add_pipe((p1, p, p2))
    return pipeline
Example #7
def align(proc_dir, job1, jobn, memn, stride, n, unstranded, genome_idx,
          full_model, genome_seq, rrna_seq, merge_mode, star_mem,
          alig_star_params, chim_star_params, prune, keep_fastq):

    cutmrg_script = "work_bbcutmrg_pe.sh" if merge_mode == "bb" else "work_cutmrg_pe.sh"
    cutmrg_cfg = {
        "id": "cutmrg",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, cutmrg_script),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("cut", "log"),
            ("mrg", "log"),
            ("cutfq1", "fq"),
            ("cutfq2", "fq"),
            ("mrgfq1", "fq"),
            ("mrgfq2", "fq"),
            ("mrgfq3", "fq"),
            ("isize", "txt"),
            ("stats", "txt"),
            ("dir", None),
        ),
        "params": {
            "min_len": 25,
            "cutxargs": "k=31 qskip=3 rieb=t tbo=t tpe=t",
            "mrgxargs": "k=31 prefilter=2 minoverlap=10 extend2=20 iterations=5",
            "seq1": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
            "seq2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
            "rrna": rrna_seq,
            "prune": prune,
            "xmx": memn,
            "p": n,
        }
    }

    star_alig_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        ## spliced alignment
        "outFilterType": "BySJout",
        "outSAMstrandField": "intronMotif" if unstranded else "None",
        "alignSJoverhangMin": 8,
        "alignSJDBoverhangMin": 3,
        "scoreGenomicLengthLog2scale": 0,
        "alignIntronMin": 20,
        "alignIntronMax": 1000000,
        "alignMatesGapMax": 1000000,
    }
    star_alig_params.update(alig_star_params)

    star_chim_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        "outFilterType": "Normal",
        ## chimeric alignment
        "alignIntronMax": 150000,
        "alignMatesGapMax": 150000,
        "chimSegmentMin": 10,
        "chimJunctionOverhangMin": 1,
        "chimScoreSeparation": 0,
        "chimScoreJunctionNonGTAG": 0,
        "chimScoreDropMax": 1000,
        "chimScoreMin": 1,
    }
    star_chim_params.update(chim_star_params)

    star_alig_cfg = {
        "id": "alig",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_alig_pe.py"),
        "in": (
            "cutfq1",
            "cutfq2",
        ),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": star_alig_params
    }

    star_chim_cfg = {
        "id": "chim",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_chim_pe.py"),
        "in": (
            "mrgfq1",
            "mrgfq2",
            "mrgfq3",
        ),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": star_chim_params
    }

    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    map_jobn = NuMap(worker_num=jobn,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    p1 = Piper(Worker(link_fq, (proc_dir, )), parallel=map_job1)
    p2 = Piper(Worker(script, (cutmrg_cfg, )),
               parallel=(map_jobn if merge_mode == "bb" else map_job1))
    p3a = Piper(Worker(script, (star_alig_cfg, )), parallel=map_jobn)
    p3b = Piper(Worker(script, (star_chim_cfg, )), parallel=map_jobn)
    p4a = Piper(Worker(move_alig_star, (proc_dir, )), parallel=map_job1)
    p4b = Piper(Worker(move_chim_star, (proc_dir, )), parallel=map_job1)
    p5 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2, p3a, p4a, p5))
    pipeline.add_pipe((p1, p2, p3b, p4b, p5))
    return pipeline
Example #8
def preqc(proc_dir, job1, jobn, n, stride):

    kmer_cfg = {
        "id": "kmer",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_kmer.sh"),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("kmer1", "txt"),
            ("kmer2", "txt"),
        ),
        "params": {
            "k": 6,
        }
    }

    fastqc_cfg = {
        "id": "fastqc",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_fastqc.sh"),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("report", None),
        ),
        "params": {}
    }

    map_job1 = NuMap(worker_num=job1,
                     ordered=False,
                     stride=stride,
                     buffer=10000)
    p1 = Piper(Worker(sample_fq, (proc_dir, 1000000)), parallel=map_job1)
    p2 = Piper(Worker(script, (kmer_cfg, )), parallel=map_job1)
    p3 = Piper(Worker(script, (fastqc_cfg, )), parallel=map_job1)
    p4 = Piper(Worker(npasser), parallel=map_job1)
    # topology
    pipeline = Plumber()
    pipeline.add_pipe((p1, p2, p4))
    pipeline.add_pipe((p1, p3, p4))
    return pipeline