def merge(proc_dir, job1, jobn, n, stride):
    pipeline = Plumber()
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(merge_star, (proc_dir, n)), parallel=map_jobn)
    p2 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2))
    return pipeline
def csort(proc_dir, job1, jobn, memn, n, stride):
    csort_mem = "%sG" % memn
    pipeline = Plumber()
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(sambamba_csort, (proc_dir, n, csort_mem)), parallel=map_jobn)
    p2 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2))
    return pipeline
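Both constructors only assemble a Plumber graph; nothing executes until the workflow is started. A minimal driving sketch follows, assuming a hypothetical processing directory and sample identifiers (the real inputs expected by merge_star and sambamba_csort are defined by the surrounding pipeline, not here), and using the same start/run/wait protocol shown in the later examples:

# minimal sketch, assuming hypothetical inputs; "/data/proc" and the sample
# names are placeholders, not the pipeline's actual input format
pipeline = csort("/data/proc", job1=1, jobn=4, memn=8, n=4, stride=1)
pipeline.start([["sample_A", "sample_B"]])  # one input iterable, as in Plumber.start
pipeline.run()
pipeline.wait()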
from numap import NuMap
from papy.core import Plumber, Piper

def l33t(inbox):
    word = inbox[0]
    return word.replace('e', '3').replace('o', '0')

def l33ter(inbox):
    word = inbox[0]
    return word.replace('l', '1')

# execution engine
numap = NuMap()

# function nodes
l33t_piper = Piper(l33t, parallel=numap)
l33ter_piper = Piper(l33ter, parallel=numap, track=True)

# topology
pipeline = Plumber()
pipeline.add_pipe((l33t_piper, l33ter_piper))
end = pipeline.get_outputs()[0]

# runtime
pipeline.start([['hello', 'world']])
pipeline.run()
pipeline.wait()
print pipeline.stats['pipers_tracked'][end]
assert [{0: 'h3110', 1: 'w0r1d'}] == pipeline.stats['pipers_tracked'][end]
# wraps timestamp
stamper = Worker(timestamp)
# >>> arr = stamper([arr])

# Step 3 (representing computational resources)
# creates a resource that allows us to utilize all local processors
local_computer = NuMap()

# Step 4 (creating processing nodes)
# this attaches a single computational resource to the two processing nodes;
# the stamper_node will be tracked, i.e. it will store the results of
# computation in memory.
cleaner_node = Piper(cleaner, parallel=local_computer)
stamper_node = Piper(stamper, parallel=local_computer, track=True)

# Step 5 (constructing a workflow graph)
# we construct a workflow graph, add the two processing nodes, and define the
# connection between them.
workflow = Plumber()
workflow.add_pipe((cleaner_node, stamper_node))

# Step 6 (executing the workflow)
# this starts the workflow, processes data in the "background" and waits
# until all data-items have been processed.
workflow.start([['AGA.TA', 'TG..AA']])
workflow.run()
workflow.wait()
results = workflow.stats['pipers_tracked'][stamper_node][0]
for seq in results.values():
    print "Object \"%s\" has time stamp: %s" % (seq, seq.meta['timestamp'])
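The excerpt assumes the cleaner function and the timestamp function (wrapped by the Worker above), which are defined in the earlier steps of the example and not repeated here. A minimal hypothetical sketch is given below, assuming a simple str subclass that carries a meta dict so that seq.meta['timestamp'] resolves; the original example may use different objects:

from time import strftime

class Seq(str):
    # hypothetical container: a string that carries a metadata dict,
    # only so that seq.meta['timestamp'] is available downstream
    def __new__(cls, value, meta=None):
        obj = str.__new__(cls, value)
        obj.meta = meta or {}
        return obj

def cleaner(inbox):
    # removes gap characters from the incoming sequence string
    seq = inbox[0]
    return Seq(seq.replace('.', ''), getattr(seq, 'meta', {}))

def timestamp(inbox):
    # annotates the sequence with the time it was processed
    seq = inbox[0]
    seq.meta['timestamp'] = strftime('%Y-%m-%d %H:%M:%S')
    return seq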
def final(proc_dir, job1, jobn, n, stride, full_model, genome_idx, genome_seq,
          skip_mixcr, skip_cover):

    ## bamstat
    bamstat_cfg = {
        "id": "bamstat",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_bamstat.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("idxstat", "txt"),
            ("flgstat", "txt"),
        ),
        "params": {}
    }

    virus_cfg = {
        "id": "virus",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_virus.sh"),
        "in": ("alig_csort", ),
        "out": (
            ("main", "log"),
            ("call", "txt"),
        ),
        "params": {}
    }

    gzip_cfg = {
        "id": "gzip",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_gzip.sh"),
        "in": (
            "sj",
            "junc_se",
            "junc_pe",
        ),
        "out": (("main", "log"), ),
        "params": {}
    }

    cram_cfg = {
        "id": "pack",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_cram.sh"),
        "in": (
            "alig_csort",
            "chim_pe_csort",
            "chim_se_csort",
        ),
        "out": (
            ("main", "log"),
            ("alig_csort", "cram"),
            ("chim_pe_csort", "cram"),
            ("chim_se_csort", "cram"),
        ),
        "params": {
            "genome": genome_seq,
            "qbin": "2,10,20,25,30,35,40,42"
        }
    }

    cover_cfg = {
        "id": "cover",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_coverage.py"),
        "in": (
            "alig_csort",
            "chim_pe_csort",
            "chim_se_csort",
        ),
        "out": (
            ("main", "log"),
            ("alig_csort", "bw"),
            ("chim_pe_csort", "bw"),
            ("chim_se_csort", "bw"),
        ),
        "params": {
            "chrom_length": genome_idx / "chrNameLength.txt"
        }
    }

    mixcr_cfg = {
        "id": "mixcr",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_mixcr.sh"),
        "in": (
            "alig_csort",
            "unmap_1",
            "unmap_2",
        ),
        "out": (
            ("main", "log"),
            ("aln.rep", "txt"),
            ("fix1.rep", "txt"),
            ("fix2.rep", "txt"),
            ("ext.rep", "txt"),
            ("asm.rep", "txt"),
            ("index", "bin"),
            ("alig", "vdjca"),
            ("clone", "clns"),
            ("dir", None),
        ),
        "params": {
            "p": n,
            "TRA": "chr14:21543538-22556037",
            "TRB": "chr7:142290000-142820000",
            "TRG": "chr7:38237000-38382000",
            "IGK": "chr2:88789991-90313268",
            "IGH": "chr14:105580000-106880000",
            "IGL": "chr22:22020000-22927000",
        }
    }

    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(ipasser), parallel=map_job1)
    p2b = Piper(Worker(script, (cram_cfg, )), parallel=map_job1)
    p2c = Piper(Worker(script, (gzip_cfg, )), parallel=map_job1)
    p2d = Piper(Worker(script, (bamstat_cfg, )), parallel=map_job1)
    p2e = Piper(Worker(script, (virus_cfg, )), parallel=map_job1)
    p3 = Piper(Worker(npasser), parallel=map_job1)
    pipeline.add_pipe((p1, p2b, p3))
    pipeline.add_pipe((p1, p2c, p3))
    pipeline.add_pipe((p1, p2d, p3))
    pipeline.add_pipe((p1, p2e, p3))
    if not skip_mixcr:
        p2f = Piper(Worker(script, (mixcr_cfg, )), parallel=map_jobn)
        pipeline.add_pipe((p1, p2f, p3))
    if not skip_cover:
        p2g = Piper(Worker(script, (cover_cfg, )), parallel=map_job1)
        pipeline.add_pipe((p1, p2g, p3))
    return pipeline
def quant(proc_dir, job1, jobn, n, stride, unstranded, prot_model, full_model,
          linc_model):

    ## gene counting
    prot_cfg = {
        "id": "prot",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            "stranded": "0",  # always unstranded
            "duplicates": "",  # count duplicates
            "gtf": prot_model,
            "n": n,
            "xargs": ""
        }
    }

    full_cfg = {
        "id": "full",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            # first read is on the transcript strand
            "stranded": "0" if unstranded else "1",
            "duplicates": "",  # count duplicates
            "gtf": full_model,
            "n": n,
            "xargs": "",
        }
    }

    linc_cfg = {
        "id": "linc",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/work_featc.py" % WORK_DIR,
        "in": ("alig_nsort", ),
        "out": (("main", "log"), ("cts", "cts"), ("tmp", None)),
        "params": {
            "paired_end": True,
            # first read is on the transcript strand
            "stranded": "0" if unstranded else "1",
            "duplicates": "",  # count duplicates
            "gtf": linc_model,
            "n": n,
            "xargs": ""
        }
    }

    cfgs = [prot_cfg, full_cfg, linc_cfg]

    ## topology
    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_job4 = NuMap(worker_num=job1 / 4, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(ipasser), parallel=map_job1)
    p2 = Piper(Worker(npasser), parallel=map_job1)
    for cfg in cfgs:
        p = Piper(Worker(script, (cfg, )), parallel=map_job4)
        pipeline.add_pipe((p1, p, p2))
    return pipeline
def align(proc_dir, job1, jobn, memn, stride, n, unstranded, genome_idx,
          full_model, genome_seq, rrna_seq, merge_mode, star_mem,
          alig_star_params, chim_star_params, prune, keep_fastq):

    cutmrg_script = "work_bbcutmrg_pe.sh" if merge_mode == "bb" else "work_cutmrg_pe.sh"
    cutmrg_cfg = {
        "id": "cutmrg",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, cutmrg_script),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("cut", "log"),
            ("mrg", "log"),
            ("cutfq1", "fq"),
            ("cutfq2", "fq"),
            ("mrgfq1", "fq"),
            ("mrgfq2", "fq"),
            ("mrgfq3", "fq"),
            ("isize", "txt"),
            ("stats", "txt"),
            ("dir", None),
        ),
        "params": {
            "min_len": 25,
            "cutxargs": "k=31 qskip=3 rieb=t tbo=t tpe=t",
            "mrgxargs": "k=31 prefilter=2 minoverlap=10 extend2=20 iterations=5",
            "seq1": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
            "seq2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
            "rrna": rrna_seq,
            "prune": prune,
            "xmx": memn,
            "p": n,
        }
    }

    star_alig_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        ## spliced alignment
        "outFilterType": "BySJout",
        "outSAMstrandField": "intronMotif" if unstranded else "None",
        "alignSJoverhangMin": 8,
        "alignSJDBoverhangMin": 3,
        "scoreGenomicLengthLog2scale": 0,
        "alignIntronMin": 20,
        "alignIntronMax": 1000000,
        "alignMatesGapMax": 1000000,
    }
    star_alig_params.update(alig_star_params)

    star_chim_params = {
        "keep_fastq": keep_fastq,
        "genomeDir": genome_idx,
        "runThreadN": n,
        "genomeLoad": star_mem,
        "outFilterType": "Normal",
        ## chimeric alignment
        "alignIntronMax": 150000,
        "alignMatesGapMax": 150000,
        "chimSegmentMin": 10,
        "chimJunctionOverhangMin": 1,
        "chimScoreSeparation": 0,
        "chimScoreJunctionNonGTAG": 0,
        "chimScoreDropMax": 1000,
        "chimScoreMin": 1,
    }
    star_chim_params.update(chim_star_params)

    star_alig_cfg = {
        "id": "alig",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_alig_pe.py"),
        "in": (
            "cutfq1",
            "cutfq2",
        ),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": star_alig_params
    }

    star_chim_cfg = {
        "id": "chim",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "python2",
        "script": "%s/%s" % (WORK_DIR, "work_star_chim_pe.py"),
        "in": (
            "mrgfq1",
            "mrgfq2",
            "mrgfq3",
        ),
        "out": (
            ("main", "log"),
            ("dir", None),
        ),
        "params": star_chim_params
    }

    pipeline = Plumber()
    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(link_fq, (proc_dir, )), parallel=map_job1)
    p2 = Piper(Worker(script, (cutmrg_cfg, )),
               parallel=(map_jobn if merge_mode == "bb" else map_job1))
    p3a = Piper(Worker(script, (star_alig_cfg, )), parallel=map_jobn)
    p3b = Piper(Worker(script, (star_chim_cfg, )), parallel=map_jobn)
    p4a = Piper(Worker(move_alig_star, (proc_dir, )), parallel=map_job1)
    p4b = Piper(Worker(move_chim_star, (proc_dir, )), parallel=map_job1)
    p5 = Piper(Worker(npasser), parallel=map_jobn)
    pipeline.add_pipe((p1, p2, p3a, p4a, p5))
    pipeline.add_pipe((p1, p2, p3b, p4b, p5))
    return pipeline
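The two STAR parameter dictionaries are merged with the caller-supplied overrides via dict.update, so a user-provided value replaces the built-in default while untouched defaults remain. A small illustration follows; the override values here are made up for demonstration, not recommendations:

# hypothetical overrides as a caller might pass them to align(); update()
# lets them replace the defaults defined above
alig_star_params = {"alignIntronMax": 500000, "outFilterType": "Normal"}

defaults = {"outFilterType": "BySJout", "alignIntronMax": 1000000, "alignIntronMin": 20}
defaults.update(alig_star_params)
assert defaults["alignIntronMax"] == 500000  # caller override wins
assert defaults["alignIntronMin"] == 20      # untouched default remains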
def preqc(proc_dir, job1, jobn, n, stride):
    kmer_cfg = {
        "id": "kmer",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_kmer.sh"),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("kmer1", "txt"),
            ("kmer2", "txt"),
        ),
        "params": {
            "k": 6,
        }
    }

    fastqc_cfg = {
        "id": "fastqc",
        "evaluator": EVALUATOR,
        "preamble": PREAMBLE,
        "dir": proc_dir,
        "executable": "bash",
        "script": "%s/%s" % (WORK_DIR, "work_fastqc.sh"),
        "in": (
            "fastq1",
            "fastq2",
        ),
        "out": (
            ("main", "log"),
            ("report", None),
        ),
        "params": {}
    }

    map_job1 = NuMap(worker_num=job1, ordered=False, stride=stride, buffer=10000)
    map_jobn = NuMap(worker_num=jobn, ordered=False, stride=stride, buffer=10000)
    p1 = Piper(Worker(sample_fq, (proc_dir, 1000000, )), parallel=map_job1)
    p2 = Piper(Worker(script, (kmer_cfg, )), parallel=map_job1)
    p3 = Piper(Worker(script, (fastqc_cfg, )), parallel=map_job1)
    p4 = Piper(Worker(npasser), parallel=map_job1)

    # topology
    pipeline = Plumber()
    pipeline.add_pipe((p1, p2, p4))
    pipeline.add_pipe((p1, p3, p4))
    return pipeline