def merge(proc_dir, job1, jobn, n, stride):
    """Build a two-stage Plumber pipeline: ``merge_star`` then a pass-through.

    ``job1`` is accepted for signature parity with the sibling stage builders
    but is not used in this stage.
    """
    # a single execution engine shared by both stages
    jobs = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    merge_piper = Piper(Worker(merge_star, (proc_dir, n)), parallel=jobs)
    pass_piper = Piper(Worker(npasser), parallel=jobs)
    plumber = Plumber()
    plumber.add_pipe((merge_piper, pass_piper))
    return plumber
def csort(proc_dir, job1, jobn, memn, n, stride):
    """Build a two-stage Plumber pipeline: ``sambamba_csort`` then a pass-through.

    ``job1`` is accepted for signature parity with the sibling stage builders
    but is not used in this stage.
    """
    # memory limit handed to sambamba as a size string, e.g. "4G"
    sort_mem = "%sG" % memn
    jobs = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    sort_piper = Piper(Worker(sambamba_csort, (proc_dir, n, sort_mem)), parallel=jobs)
    pass_piper = Piper(Worker(npasser), parallel=jobs)
    plumber = Plumber()
    plumber.add_pipe((sort_piper, pass_piper))
    return plumber
#!/usr/bin/env python # -*- coding: utf-8 -*- from papy.core import Piper, Dagger def l33t(inbox): word = inbox[0] return word.replace('e', '3').replace('o', '0') def join(inbox): left_word, right_word = inbox return left_word + " " + right_word left_l33t = Piper(l33t, branch=1) right_l33t = Piper(l33t, branch=2) join_l33t = Piper(join) pipeline = Dagger() pipeline.add_pipe((left_l33t, join_l33t)) pipeline.add_pipe((right_l33t, join_l33t)) end = pipeline.get_outputs()[0] pipeline.connect([ ['hello', 'hi'], ['world', 'folks'] ]) pipeline.start() print list(end)
"dir": os.getcwd(), "executable": "python", "script": "python_script.py", "in": ("greeting", ), "out": (("package", "txt"), ), "params": {} } sh_worker = Worker(script, (sh_cfg, )) py_worker = Worker(script, (py_cfg, )) # execution engine numap = NuMap() # function nodes sh_piper = Piper(sh_worker, parallel=numap) py_piper = Piper(py_worker, parallel=numap) # topology pipeline = Dagger() pipeline.add_pipe((sh_piper, py_worker)) end = pipeline.get_outputs()[0] # runtime pipeline.connect([[{ "message": "work_moar.txt" }, { "message": "nevar_give_up.txt" }]]) pipeline.start()
#!/usr/bin/env python # -*- coding: utf-8 -*- from numap import NuMap from papy.core import Dagger, Piper from papy.util.func import ipasser def merge(inbox): word1 = inbox[0] word2 = inbox[1] return word1 + word2 # function nodes merge_p = Piper(merge) inp1_p = Piper(ipasser) inp2_p = Piper(ipasser) # topology pipeline = Dagger() pipeline.add_pipe((inp2_p, merge_p), branch="2") pipeline.add_pipe((inp1_p, merge_p), branch="1") end = pipeline.get_outputs()[0] # # runtime pipeline.connect([['hello ', 'world '], ["world", "hello"]]) pipeline.start() print list(end)
def align(proc_dir, job1, jobn, memn, stride, n, unstranded, genome_idx,
          full_model, genome_seq, rrna_seq, merge_mode, star_mem,
          alig_star_params, chim_star_params, prune, keep_fastq):
    """Build the trim/merge + STAR alignment stage of the pipeline.

    Two branches share the fastq-linking and trim/merge steps: one runs the
    spliced STAR alignment, the other the chimeric STAR alignment; both are
    then moved into place and joined by a final pass-through piper.

    NOTE(review): ``full_model`` and ``genome_seq`` are accepted but unused
    here -- presumably kept for signature parity with the other builders.
    """
    # "bb" merge mode selects the bbcutmrg script variant
    cutmrg_script = ("work_bbcutmrg_pe.sh" if merge_mode == "bb"
                     else "work_cutmrg_pe.sh")
    cutmrg_cfg = {
        "id": "cutmrg",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, cutmrg_script),
        "in": ("fastq1", "fastq2"),
        "out": (
            ("main", "log"),
            ("cut", "log"),
            ("mrg", "log"),
            ("cutfq1", "fq"),
            ("cutfq2", "fq"),
            ("mrgfq1", "fq"),
            ("mrgfq2", "fq"),
            ("mrgfq3", "fq"),
            ("isize", "txt"),
            ("stats", "txt"),
            ("dir", None),
        ),
        "params": {
            "min_len": 25,
            "cutxargs": "k=31 qskip=3 rieb=t tbo=t tpe=t",
            "mrgxargs": "k=31 prefilter=2 minoverlap=10 extend2=20 iterations=5",
            "seq1": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
            "seq2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
            "rrna": rrna_seq,
            "prune": prune,
            "xmx": memn,
            "p": n,
        },
    }
    # STAR parameters for the spliced branch; caller-supplied overrides win
    alig_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        ## spliced alignment
        "outFilterType": "BySJout",
        "outSAMstrandField": "intronMotif" if unstranded else "None",
        "alignSJoverhangMin": 8,
        "alignSJDBoverhangMin": 3,
        "scoreGenomicLengthLog2scale": 0,
        "alignIntronMin": 20,
        "alignIntronMax": 1000000,
        "alignMatesGapMax": 1000000,
    }
    alig_params.update(alig_star_params)
    # STAR parameters for the chimeric branch; caller-supplied overrides win
    chim_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        "outFilterType": "Normal",
        ## chimeric alignment
        "alignIntronMax": 150000,
        "alignMatesGapMax": 150000,
        "chimSegmentMin": 10,
        "chimJunctionOverhangMin": 1,
        "chimScoreSeparation": 0,
        "chimScoreJunctionNonGTAG": 0,
        "chimScoreDropMax": 1000,
        "chimScoreMin": 1,
    }
    chim_params.update(chim_star_params)
    alig_cfg = {
        "id": "alig",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_alig_pe.py"),
        "in": ("cutfq1", "cutfq2"),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": alig_params,
    }
    chim_cfg = {
        "id": "chim",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_chim_pe.py"),
        "in": ("mrgfq1", "mrgfq2", "mrgfq3"),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": chim_params,
    }
    ## topology
    plumber = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    link_p = Piper(Worker(link_fq, (proc_dir, )), parallel=map_job1)
    # the bbcutmrg variant runs on the wide pool, the cutmrg variant on map_job1
    cutmrg_p = Piper(Worker(script, (cutmrg_cfg, )),
                     parallel=(map_jobn if merge_mode == "bb" else map_job1))
    alig_p = Piper(Worker(script, (alig_cfg, )), parallel=map_jobn)
    chim_p = Piper(Worker(script, (chim_cfg, )), parallel=map_jobn)
    mv_alig_p = Piper(Worker(move_alig_star, (proc_dir, )), parallel=map_job1)
    mv_chim_p = Piper(Worker(move_chim_star, (proc_dir, )), parallel=map_job1)
    done_p = Piper(Worker(npasser), parallel=map_jobn)
    plumber.add_pipe((link_p, cutmrg_p, alig_p, mv_alig_p, done_p))
    plumber.add_pipe((link_p, cutmrg_p, chim_p, mv_chim_p, done_p))
    return plumber
from papy.util.func import dump_item, load_item from numap import NuMap, imports from papy.core import Piper, Worker @imports(['os']) def upstream(inbox): up_pid = os.getpid() return str(up_pid) @imports(['os']) def downstream(inbox): up_pid = inbox[0] down_pid = os.getpid() return "%s->%s" % (up_pid, down_pid) host1 = NuMap() host2 = NuMap() up = Worker((upstream, dump_item)) dn = Worker((load_item, downstream)) up_ = Piper(up, parallel=host1) dn_ = Piper(dn, parallel=host2) up_([['hello', 'world', 'hi', 'folks']]) dn_([up_]) up_.start() dn_.start() print list(dn_)
#!/usr/bin/env python # -*- coding: utf-8 -*- from papy.core import Piper def l33t(inbox): word = inbox[0] return word.replace('e', '3').replace('o', '0') l33t_piper = Piper(l33t) l33t_piper([['hello', 'world']]) l33t_piper.start() print list(l33t_piper)
def l33t(inbox): word = inbox[0] leet_yuk = (word.replace('e', '3').replace('o', '0'), 'yuk') print "I'll produce: %s and %s" % leet_yuk return leet_yuk def upper(inbox): word = inbox[0] return word.upper() def noyuk(inbox): print "I got 2 words and 2 yuks: %s" % inbox word_box1, yuk_box1, word_box2, yuk_box2 = inbox return word_box1[0] + ' ' + word_box2[0] l33t_piper = Piper(l33t, produce=2) upper_piper = Piper(upper, spawn=2) noyuk_piper = Piper(noyuk, consume=4) l33t_piper([['hello', 'world', 'hi', 'folks']]) upper_piper([l33t_piper]) noyuk_piper([upper_piper]) l33t_piper.start() upper_piper.start() noyuk_piper.start() print list(noyuk_piper)
#!/usr/bin/env python # -*- coding: utf-8 -*- from papy.core import Piper def l33t(inbox): word = inbox[0] leet_yuk = (word.replace('e', '3').replace('o', '0'), 'yuk') print "I'll produce: %s and %s" % leet_yuk return leet_yuk def noyuk(inbox): print "I got 2 words and 2 yuks: %s" % inbox word_box1, yuk_box1, word_box2, yuk_box2 = inbox return word_box1[0] + ' ' + word_box2[0] l33t_piper = Piper(l33t, produce=2) noyuk_piper = Piper(noyuk, consume=4) l33t_piper([['hello', 'world', 'hi', 'folks']]) noyuk_piper([l33t_piper]) l33t_piper.start() noyuk_piper.start() print list(noyuk_piper)
# wrap clean_seq, fixing the sequence type and the '.'->'-' substitution
cleaner = Worker(clean_seq, kwargs={'type': 'aa', 'fixes': [('.', '-')]})
# >>> arr = cleaner(['AGA.TA'])

# wrap timestamp as-is
stamper = Worker(timestamp)
# >>> arr = stamper([arr])

# Step 3 (representing computational resources)
# one NuMap instance that uses the local processors
local_computer = NuMap()

# Step 4 (creating processing nodes)
# both nodes share the same resource; track=True makes the stamper node keep
# its computed results in memory so they can be read back after the run
cleaner_node = Piper(cleaner, parallel=local_computer)
stamper_node = Piper(stamper, track=True, parallel=local_computer)

# Step 5 (constructing a workflow graph)
# add the two nodes and the edge between them
workflow = Plumber()
workflow.add_pipe((cleaner_node, stamper_node))

# Step 6 (execute the workflow)
# start, process in the background, and block until all items are done
workflow.start([['AGA.TA', 'TG..AA']])
workflow.run()
workflow.wait()
results = workflow.stats['pipers_tracked'][stamper_node][0]
def final(proc_dir, job1, jobn, n, stride, full_model, genome_idx, genome_seq,
          skip_mixcr, skip_cover):
    """Build the finalization stage: BAM stats, virus calls, junction gzip,
    CRAM packing, plus optional coverage tracks and MiXCR steps.

    NOTE(review): ``full_model`` is accepted but unused here -- presumably
    kept for signature parity with the other stage builders.
    """
    ## per-step script configurations
    bamstat_cfg = {
        "id": "bamstat",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_bamstat.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("idxstat", "txt"),
            ("flgstat", "txt"),
        ),
        "params": {},
    }
    virus_cfg = {
        "id": "virus",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_virus.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("call", "txt"),
        ),
        "params": {},
    }
    gzip_cfg = {
        "id": "gzip",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_gzip.sh"),
        "in": ("sj", "junc_se", "junc_pe"),
        "out": (("main", "log"), ),
        "params": {},
    }
    cram_cfg = {
        "id": "pack",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_cram.sh"),
        "in": ("alig_csort", "chim_pe_csort", "chim_se_csort"),
        "out": (
            ("main", "log"),
            ("alig_csort", "cram"),
            ("chim_pe_csort", "cram"),
            ("chim_se_csort", "cram"),
        ),
        "params": {
            "genome": genome_seq,
            "qbin": "2,10,20,25,30,35,40,42",
        },
    }
    cover_cfg = {
        "id": "cover",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_coverage.py"),
        "in": ("alig_csort", "chim_pe_csort", "chim_se_csort"),
        "out": (
            ("main", "log"),
            ("alig_csort", "bw"),
            ("chim_pe_csort", "bw"),
            ("chim_se_csort", "bw"),
        ),
        "params": {
            # assumes genome_idx supports the "/" join (path-like object) -- TODO confirm
            "chrom_length": genome_idx / "chrNameLength.txt",
        },
    }
    mixcr_cfg = {
        "id": "mixcr",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_mixcr.sh"),
        "in": ("alig_csort", "unmap_1", "unmap_2"),
        "out": (
            ("main", "log"),
            ("aln.rep", "txt"),
            ("fix1.rep", "txt"),
            ("fix2.rep", "txt"),
            ("ext.rep", "txt"),
            ("asm.rep", "txt"),
            ("index", "bin"),
            ("alig", "vdjca"),
            ("clone", "clns"),
            ("dir", None),
        ),
        "params": {
            "p": n,
            "TRA": "chr14:21543538-22556037",
            "TRB": "chr7:142290000-142820000",
            "TRG": "chr7:38237000-38382000",
            "IGK": "chr2:88789991-90313268",
            "IGH": "chr14:105580000-106880000",
            "IGL": "chr22:22020000-22927000",
        },
    }
    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    head = Piper(Worker(ipasser), parallel=map_job1)
    tail = Piper(Worker(npasser), parallel=map_job1)
    # always-on steps, fanned out from the shared head to the shared tail
    for cfg in (cram_cfg, gzip_cfg, bamstat_cfg, virus_cfg):
        step = Piper(Worker(script, (cfg, )), parallel=map_job1)
        pipeline.add_pipe((head, step, tail))
    if not skip_mixcr:
        mixcr_p = Piper(Worker(script, (mixcr_cfg, )), parallel=map_jobn)
        pipeline.add_pipe((head, mixcr_p, tail))
    if not skip_cover:
        cover_p = Piper(Worker(script, (cover_cfg, )), parallel=map_job1)
        pipeline.add_pipe((head, cover_p, tail))
    return pipeline
def quant(proc_dir, job1, jobn, n, stride, unstranded, prot_model, full_model,
          linc_model):
    """Build the gene-counting stage: one featc run per annotation model.

    NOTE(review): ``jobn`` is accepted but unused here -- presumably kept for
    signature parity with the other stage builders.
    """
    def featc_cfg(cfg_id, gtf, stranded):
        # one work_featc.py invocation config; the three models only differ
        # in id, annotation file and strandedness
        return {
            "id": cfg_id,
            "evaluator": EVALUATOR,
            "preamble": PREAMBLE,
            "dir": proc_dir,
            "executable": "python2",
            "script": "%s/work_featc.py" % WORK_DIR,
            "in": ("alig_nsort", ),
            "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
            "params": {
                "paired_end": True,
                "stranded": stranded,
                "duplicates": "",  # empty -> count duplicates
                "gtf": gtf,
                "n": n,
                "xargs": "",
            },
        }

    cfgs = [
        featc_cfg("prot", prot_model, "0"),  # always unstranded
        # "1": first read is on the transcript strand
        featc_cfg("full", full_model, "0" if unstranded else "1"),
        featc_cfg("linc", linc_model, "0" if unstranded else "1"),
    ]

    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    # quarter-size pool (Python 2 integer division) for the counting jobs
    map_job4 = NuMap(worker_num=job1 / 4, ordered=False, stride=stride, buffer=10000)
    head = Piper(Worker(ipasser), parallel=map_job1)
    tail = Piper(Worker(npasser), parallel=map_job1)
    for cfg in cfgs:
        counter = Piper(Worker(script, (cfg, )), parallel=map_job4)
        pipeline.add_pipe((head, counter, tail))
    return pipeline
def l33t(inbox): word = inbox[0] return word.replace('e', '3').replace('o', '0') def l33ter(inbox): word = inbox[0] return word.replace('l', '1') # execution endgine numap = NuMap() # function nodes l33t_piper = Piper(l33t, parallel=numap) l33ter_piper = Piper(l33ter, parallel=numap, track=True) # topology pipeline = Plumber() pipeline.add_pipe((l33t_piper, l33ter_piper)) end = pipeline.get_outputs()[0] # runtime pipeline.start([['hello', 'world']]) pipeline.run() pipeline.wait() print pipeline.stats['pipers_tracked'][end] assert [{0: 'h3110', 1: 'w0r1d'}] == pipeline.stats['pipers_tracked'][end]
#!/usr/bin/env python # -*- coding: utf-8 -*- from papy.core import Piper def nimm2(inbox): left_box, right_box = inbox left_word, right_word = left_box[0], right_box[0] return left_word + ' ' + right_word l33t_piper = Piper(nimm2, consume=2) l33t_piper([['hello', 'world', 'hi', 'folks']]) # length of 4 l33t_piper.start() out = list(l33t_piper) assert out == ['hello world', 'hi folks'] # length of 2 print out
#!/usr/bin/env python # -*- coding: utf-8 -*- #python /usr/lib/python2.6/site-packages/rpyc/servers/classic_server.py -m 'forking' from numap import NuMap, imports from papy.core import Piper @imports(['os']) def hello_from(inbox): word = inbox[0] up_pid = os.getpid() return (word, up_pid) somehost = NuMap(worker_num=0, worker_remote=[('localhost', 2)]) remote_piper = Piper(hello_from, parallel=somehost) remote_piper([['hello', 'world', 'hi', 'folks']]) remote_piper.start() print list(remote_piper)
def upper(inbox): word = inbox[0] return word.upper() def E_to_3(inbox): word = inbox[0] return word.replace('E', '3') def O_to_0(inbox): word = inbox[0] return word.replace('O', '0') upper_fork = Piper(upper) E_end = Piper(E_to_3, branch=1) O_end = Piper(O_to_0, branch=2) pipeline = Dagger() pipeline.add_pipe((upper_fork, E_end)) pipeline.add_pipe((upper_fork, O_end)) left_end, right_end = pipeline.get_outputs() pipeline.connect([['hello', 'world']]) pipeline.start() print zip(left_end, right_end)
#!/usr/bin/env python # -*- coding: utf-8 -*- from numap import NuMap from papy.core import Piper def l33t(inbox): word = inbox[0] return word.replace('e', '3').replace('o', '0') numap = NuMap() l33t_piper = Piper(l33t, parallel=numap) l33t_piper([['hello', 'world']]) l33t_piper.start() print list(l33t_piper)
#!/usr/bin/env python # -*- coding: utf-8 -*- from papy.core import Piper def l33t(inbox): word = inbox[0] return (word.replace('e', '3').replace('o', '0'), ) * 2 l33t_piper = Piper(l33t, produce=2) l33t_piper([['hello', 'world']]) l33t_piper.start() print list(l33t_piper)
from numap import NuMap from papy.core import Dagger, Piper def l33t(inbox): word = inbox[0] return word.replace('e', '3').replace('o', '0') def l33ter(inbox): word = inbox[0] return word.replace('l', '1') # execution endgine numap = NuMap() # function nodes l33t_piper = Piper(l33t, parallel=numap) l33ter_piper = Piper(l33ter, parallel=numap) # topology pipeline = Dagger() pipeline.add_pipe((l33t_piper, l33ter_piper)) end = pipeline.get_outputs()[0] # runtime pipeline.connect([['hello', 'world']]) pipeline.start() print list(end)
def preqc(proc_dir, job1, jobn, n, stride):
    """Build the pre-alignment QC stage: fastq sampling feeding a k-mer
    profile and a FastQC report, joined by a pass-through piper.

    NOTE(review): ``n`` is accepted but unused, and ``map_jobn`` is created
    but not attached to any piper -- confirm intent.
    """
    kmer_cfg = {
        "id": "kmer",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_kmer.sh"),
        "in": ("fastq1", "fastq2"),
        "out": (
            ("main", "log"),
            ("kmer1", "txt"),
            ("kmer2", "txt"),
        ),
        "params": {
            "k": 6,
        },
    }
    fastqc_cfg = {
        "id": "fastqc",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_fastqc.sh"),
        "in": ("fastq1", "fastq2"),
        "out": (
            ("main", "log"),
            ("report", None),
        ),
        "params": {},
    }
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    # QC runs on a 1,000,000-read sample of each fastq
    sample_p = Piper(Worker(sample_fq, (proc_dir, 1000000, )), parallel=map_job1)
    kmer_p = Piper(Worker(script, (kmer_cfg, )), parallel=map_job1)
    fastqc_p = Piper(Worker(script, (fastqc_cfg, )), parallel=map_job1)
    done_p = Piper(Worker(npasser), parallel=map_job1)
    # topology
    pipeline = Plumber()
    pipeline.add_pipe((sample_p, kmer_p, done_p))
    pipeline.add_pipe((sample_p, fastqc_p, done_p))
    return pipeline