def latex_environ(workflow, conf):
    """ write out begin and end document including packages """
    attach_back(workflow, PythonCommand(
        latex_start,
        input={"template": resource_filename("chilin2.modules.summary", "begin.tex")},
        output={"latex": conf.latex_prefix + "_begin.tex"},
        param={"id": conf.id,
               "version": conf.get("basics", "version"),
               "user": conf.get('basics', 'user'),
               "bmcard": resource_filename("chilin2.modules.summary", "bmcart.cls").rstrip('.cls')}))
    attach_back(workflow, PythonCommand(
        latex_end,
        input={"template": resource_filename("chilin2.modules.summary", "end.tex")},
        output={"latex": conf.latex_prefix + "_end.tex"}))
def contamination_check(workflow, conf):
    """ bowtie mapping back to different species """
    if conf.items("contamination"):
        for target in conf.sample_targets:
            for species in dict(conf.items("contamination")):
                index = conf.get("contamination", species)
                if conf.mapper == "bwa":
                    output = target + species + ".sam"
                    if conf.pe:
                        outsai = [target + species + "pair1.sai", target + species + "pair2.sai"]
                        targets = [target + "pair1", target + "pair2"]
                    else:
                        outsai = target + species + ".sai"
                        targets = target
                    bwa(workflow, conf, targets, output, outsai, index)
                elif conf.mapper == "bowtie":
                    output = target + species + ".sam"
                    bowtie(workflow, conf, target, output, index)
                elif conf.mapper == "star":
                    output = target + species + "Aligned.out.sam"
                    star(workflow, conf, target, output, index)
                sam2bam = attach_back(workflow,
                    ## use mapping quality 1 defined by samtools official FAQ
                    ShellCommand(
                        """
                        {tool} view -bS -t {param[genome]} -q {param[mapq]} {input[sam]} > {param[tmp_bam]} &&
                        {tool} sort -m {param[max_mem]} {param[tmp_bam]} {param[output_prefix]}
                        """,
                        tool="samtools",
                        input={"sam": output},
                        output={"bam": target + species + ".bam"},
                        param={"tmp_bam": target + species + ".tmp.bam",
                               "output_prefix": target + species,
                               "mapq": 1,
                               "genome": conf.get(conf.get("basics", "species"), "chrom_len"),
                               "max_mem": 4000000000},
                        name="filtering mapping and convert"))  # Use 5G memory as default
                sam2bam.update(param=conf.items("sam2bam"))
                sam2bam.allow_dangling = True
                sam2bam.allow_fail = True
                rem = attach_back(workflow, ShellCommand(
                    """
                    {tool} view -Sc {input[sam]} > {output[total]}
                    {tool} flagstat {input[bam]} > {output[stat]}
                    """,
                    tool="samtools",
                    input={"bam": target + species + ".bam", "sam": output},
                    output={"stat": target + species + "_mapped." + conf.mapper,
                            "total": target + species + "_total." + conf.mapper},
                    name="contamination calculation"))
                rem.allow_fail = True
                rem.allow_dangling = True
        ## QC part
        stat_contamination(workflow, conf)
        if conf.long:
            tex_contamination(workflow, conf)
def _begin_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_start,
        input={"template": latex_template},
        output={"latex": conf.latex_prefix + "_start.latex"},
        param={"id": conf.id}))
def sampling_bam(workflow, conf):  ## sampling to 4M
    """ sampling bam files through macs2 and bedtools """
    for target in conf.sample_targets:
        ## sampling treat and control simultaneously
        ## sampling bam by macs2 and convert to bam by bedtools
        ## if total mapped reads < 4M, use original bam files link to *4000000.bam
        ## extract mapped reads number from json files
        ## use uniquely mapped reads sampling
        sampling_u = attach_back(workflow,
                                 sampling(target + "_u.sam", target + "_4000000.bam", 4000000, "sam", conf))
        sampling_u.allow_dangling = True
        sampling_u.allow_fail = True
        ## use encode version of 5M non chrM reads to evaluate
        if conf.frip:
            samp = attach_back(workflow,
                               sampling(target + "_nochrM.sam", target + "_5000000_nochrM.bam", 5000000, "sam", conf))
            samp.allow_fail = True
            samp.allow_dangling = True
        else:  ## default
            ## change FRiP computing with merged peaks as reference, no chrM as comparison
            samp = attach_back(workflow,
                               sampling(target + "_nochrM.sam", target + "_4000000_nochrM.bam", 4000000, "sam", conf))
            samp.allow_fail = True
            samp.allow_dangling = True
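# The sampling() helper used above is defined elsewhere in the pipeline (it goes
# through macs2 and bedtools). The sketch below is only a hypothetical,
# pure-Python illustration of the idea it implements -- down-sample a SAM file
# to a fixed number of reads, or keep everything when fewer reads are available
# -- and is not the pipeline's actual implementation.
import random


def downsample_sam(in_sam, out_sam, n=4000000, seed=0):
    """Reservoir-sample at most n alignment records from a SAM file."""
    random.seed(seed)
    header, reservoir, seen = [], [], 0
    with open(in_sam) as handle:
        for line in handle:
            if line.startswith("@"):       # keep the full header
                header.append(line)
                continue
            seen += 1
            if len(reservoir) < n:
                reservoir.append(line)
            else:                          # replace with decreasing probability
                j = random.randrange(seen)
                if j < n:
                    reservoir[j] = line
    ## if total mapped reads < n, this simply writes all of them,
    ## mirroring the "link the original bam" shortcut described above
    with open(out_sam, "w") as out:
        out.writelines(header + reservoir)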
def prepare_clean_up(workflow, conf):
    """ package all the necessary results and delete temporary files """
    p_list = ['*.bam', '*.xls', '*_summits.bed', '*_peaks.bed', '*.bw',
              '*.png', '*.pdf', '*.R', '*.zip', '*cor*', 'json',
              "*summary*", "*seqpos", "*fastqc", '*latex', "*.conf"]
    p_pattern = [os.path.join(conf.target_dir, p) for p in p_list]
    final_dir = conf.target_dir + '/dataset_' + conf.id
    attach_back(workflow,
                ShellCommand("if [ ! -d '{output}' ]; then mkdir -p {output}; fi",
                             output=final_dir))
    for pf in p_pattern:
        if not glob(pf):
            print(pf)
            continue
        move = attach_back(workflow,
                           ShellCommand('mv {param[preserve_files]} {output[dir]} \n# Pattern: {param[p_pattern]}',
                                        output={"dir": final_dir},
                                        param={"preserve_files": " ".join(glob(pf)),
                                               "p_pattern": pf}))
        move.allow_fail = True
def _summary_table_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_summary_table,
        input={"template": latex_template},
        output={"latex": conf.latex_prefix + "_summary_table.latex"},
        param={"conf": conf}))
def read_quality(workflow, conf, tex):
    if conf.pe:
        for raw, target in conf.treatment_pairs_pe:
            attach_back(workflow, ShellCommand(
                "{tool} {input[fastq][0]} {input[fastq][1]} {output[stat][0]} {output[stat][1]}",
                tool="dac_pe_read_quality",
                input={"fastq": raw},
                output={"stat": [i + "_read_quality.qc" for i in target]}))
        # attach_back(workflow, PythonCommand(stat_fastqStat,
        #     input = {"seq": [ [ p + "_100k.seq" for p in target ] for target in conf.treatment_pair_data ]},
        #     output = {"json": conf.json_prefix + "_seq_quality.json"},
        #     param = {"samples": conf.treatment_bases, "seq_type": conf.pe}))
        # attach_back(workflow, PythonCommand(
        #     seq_quality_doc,
        #     input = {"tex": tex, "json": conf.json_prefix + "_seq_quality.json"},
        #     output = {"seq": conf.latex_prefix + "seq_quality.tex", "len": conf.latex_prefix + "len.tex"},
        #     param = {"seq_type": conf.seq_type, "reps": len(conf.treatment_pairs),
        #              "pe_samples": conf.treatment_bases}))
    else:
        for raw, target in conf.treatment_pairs:
            sample_fq = {"stat": target + "_read_quality.qc"}
            attach_back(workflow, ShellCommand(
                "{tool} {input} {output[stat]}",
                tool="dac_se_read_quality",
                input=raw,
                output=sample_fq,
                name="100k read sequence quality and sequence length"))
def _bowtie_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_bowtie,
        input={"json": conf.json_prefix + "_bowtie.json",
               "template": latex_template},
        output={"latex": conf.latex_prefix + "_bowtie.latex"}))
def Phan(workflow, conf):  # NSC, RSC, Qtag
    """
    for calculating NSC, RSC score at 4M level
    http://code.google.com/p/phantompeakqualtools/
    (1) Determine strand cross-correlation peak / predominant fragment length OR print out quality measures
        Rscript run_spp.R -c=<tagAlign/BAMfile> -savp -out=<outFile>
    """
    # peaks calling by SPP needs control, for phantomqc, we do both treat and control independently
    for t in conf.sample_targets:
        if conf.down:  ## default, this option
            ibam = t + "_4000000.bam"
        # elif conf.unsc:  ## --total --unsc
        #     ibam = t + "_rmdup.bam"
        else:  ## --total
            ibam = t + ".bam"
        attach_back(workflow, ShellCommand(
            "{tool} {param[script]} -c={input[chip]} -rf -savp -out={output[spp]} -odir={param[dir]}",
            tool="Rscript",
            input={"chip": ibam},
            output={"spp": t + ".spp",
                    "pdf": t + "_4000000.pdf" if conf.down else t + ".pdf"},
            param={"script": conf.get("tool", "spp"),
                   "dir": os.path.dirname(t + ".spp")},
            name="SPP"))
    stat_phan(workflow, conf)
    if conf.long:
        tex_phan(workflow, conf)
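# Phan() above wires in stat_phan(), which collects NSC/RSC/Qtag from the *.spp
# files written by run_spp.R. As a rough, hypothetical illustration only (not
# the pipeline's json_phan implementation), and assuming the standard
# phantompeakqualtools out-file layout in which the file is a single
# tab-separated line whose last three fields are NSC, RSC and the quality tag:
def parse_spp_out(spp_file):
    """Return (NSC, RSC, Qtag) parsed from a run_spp.R -out file."""
    with open(spp_file) as handle:
        fields = handle.readline().strip().split("\t")
    nsc, rsc, qtag = fields[-3], fields[-2], fields[-1]
    return float(nsc), float(rsc), int(qtag)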
def _seqpos_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_seqpos,
        input={"json": conf.json_prefix + "_seqpos.json",
               "template": latex_template},
        output={"latex": conf.latex_prefix + "_seqpos.latex"}))
def _bowtie(workflow, conf):
    for target in conf.sample_targets:
        bowtie = attach_back(workflow, ShellCommand(
            "{tool} -p {param[threads]} -S -m {param[max_align]} "
            "{param[genome_index]} {input[fastq]} {output[sam]} 2> {output[bowtie_summary]}",
            input={"genome_dir": os.path.dirname(conf.get_path("lib", "genome_index")),
                   "fastq": target + ".fastq"},
            output={"sam": target + ".sam",
                    "bowtie_summary": target + "_bowtie_summary.txt"},
            tool="bowtie",
            param={"threads": 4,
                   "max_align": 1,
                   "genome_index": conf.get_path("lib", "genome_index")}))
        bowtie.update(param=conf.items("bowtie"))
    __sam2bam(workflow, conf)
    ## using bowtie standard error output
    attach_back(workflow, PythonCommand(
        stat_bowtie,
        input={"bowtie_summaries": [t + "_bowtie_summary.txt" for t in conf.sample_targets],
               "db": ChiLinQC_db,
               "template": rlang_template},
        output={"json": conf.json_prefix + "_bowtie.json",
                "R": conf.prefix + "_bowtie.R",
                "pdf": conf.prefix + "_bowtie.pdf"},
        param={"sams": [t + ".sam" for t in conf.sample_targets]}))
def _raw_QC_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_fastqc,
        input={"json": conf.json_prefix + "_fastqc.json",
               "template": latex_template},
        output={"latex": conf.latex_prefix + "_fastqc.latex"}))
def sample_bam_stat(workflow, conf, tex):
    """
    sample non chrm bam to 15M for NSC and PBC
    sample non chrm bam to 5M for spot
    """
    for i, target in enumerate(conf.treatment_targets):
        ## for PE, use name sorted in order to calculate PBC
        input_bam = target + "_name_sorted.bam" if conf.pe else target + "_final_nochrm.bam"
        attach_back(workflow, ShellCommand(
            "{tool} {input[namesorted]} {param[run_spp]} {output[bamstat]} {output[sppstat]} {param[pe]} {output[pbc]}",
            tool="eap_dnase_stats",
            input={"namesorted": input_bam},
            output={"bamstat": target + "_bam_stat.qc",  ## 15M
                    "sppstat": target + "_spp.qc",
                    "pbc": target + "_final_nochrm_15M_pbc.qc"},
            param={"pe": "pe" if conf.pe else "se",
                   "run_spp": conf.get("tool", "spp")}))
        if "macs" not in conf.get("tool", "peak_calling"):
            attach_back(workflow, ShellCommand(
                "{tool} {input[bamwithoutchrm]} {param[genome]} {param[readsize]} {output[spot]} {param[hotspot_dir]} {param[hotspot_output]} {param[hotspot_tmp]} {param[spot_tmp]}",
                tool="dac_spot",  ## 5M
                input={"bamwithoutchrm": target + "_final_nochrm.bam"},
                output={"spot": target + "_spot_nochrm_5M.qc"},
                param={"genome": conf.species,
                       "spot_tmp": conf.hotspot_reps_tmp_prefix[i] + "_final_nochrm.bam.5000000.spot.out",
                       "readsize": conf.readsize,
                       "hotspot_dir": conf.get("tool", "peak_calling"),
                       "hotspot_output": target + "_hotspot",
                       "hotspot_tmp": target + "_hotspot_tmp"}))
def _star_sam2bam(workflow, conf):  # SAM -> BAM
    """
    convert SAM to BAM and use mapping quality as cutoff
    :param workflow: samflow defined class
    :param conf: parsed config file
    :return: void
    """
    import os
    for target in conf.sample_targets:
        sam2bam = attach_back(workflow, ShellCommand(
            """
            ln -s {input[sam]} {output[sam]}
            {tool} view -q 255 -bt {param[genome]} {input[sam]} -o {output[bam]}
            """,
            tool="samtools",
            input={"sam": target + "Aligned.out.sam"},
            output={"bam": target + ".bam",
                    "sam": target + ".sam"},
            param={"genome": conf.get(conf.get("basics", "species"), "chrom_len")},
            name="star sam2bam"))
        sam2bam.update(param=conf.items("sam2bam"))

        # From bwa/dc.py
        sam2bamnochrm = attach_back(workflow,
            ## use mapping quality 1 defined by samtools official FAQ
            ShellCommand(
                """
                awk \'BEGIN{{OFS="\\t"}} {{print $1,0,$2}}\' {param[genome]} > {param[chrom_bed]}
                grep -v chrM {param[chrom_bed]} > {output[nochrmbed]}
                {tool} view -h -b -L {output[nochrmbed]} {input[bam]} > {output[nochrmbam]}
                {tool} view -h {output[nochrmbam]} > {output[nochrmsam]}
                {tool} view -h {input[bam]} > {output[usam]}
                """,
                tool="samtools",
                input={"bam": target + ".bam"},
                output={"nochrmbed": target + ".nochrM",
                        "nochrmbam": target + "_nochrM.bam",
                        "usam": target + "_u.sam",  ## uniquely mapping sam for sampling
                        "nochrmsam": target + "_nochrM.sam"},
                param={"tmp_bam": target + ".tmp.bam",
                       "output_prefix": target,
                       "chrom_bed": os.path.join(conf.target_dir, "chrom.bed"),
                       "mapq": 1,
                       "genome": conf.get(conf.get("basics", "species"), "chrom_len")},
                name="filtering mapping and convert"))  # Use 5G memory as default
        sam2bamnochrm.update(param=conf.items("sam2bam"))
def _macs2_cor_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_cor,
        input={"json": conf.json_prefix + "_cor.json",
               "template": latex_template},
        output={"latex": conf.latex_prefix + "_cor.latex"}))
def _conservation_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_conservation,
        input={"json": conf.json_prefix + "_conserv.json",
               "template": latex_template},
        output={"latex": conf.latex_prefix + "_conserv.latex"}))
def _bwa(workflow, conf):
    """
    incorporate ENCODE ChIP-seq alignment parameters
    """
    for raw, target in conf.treatment_pairs:
        param = {"threads": conf.threads,
                 "index": conf.get(conf.species, "genome_index"),
                 "prefix": target + "_raw_sorted",
                 "qc2": target + "_rawbam_stats.qc"}
        if conf.pe:
            bwa = attach_back(workflow, ShellCommand(
                "{tool} {param[threads]} {param[index]} {input[fastq][0]} {input[fastq][1]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}",
                tool="eap_run_bwa_pe",
                input={"fastq": raw},
                output={"bam": target + "_raw_sorted.bam",
                        "qc": target + "_rawbam.qc"},
                param=param,
                name="pair end mapping"))
        else:
            bwa = attach_back(workflow, ShellCommand(
                "{tool} {param[threads]} {param[index]} {input[fastq]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}",
                tool="eap_run_bwa_se",
                input={"fastq": raw},
                output={"bam": target + "_raw_sorted.bam",
                        "qc": target + "_rawbam.qc"},
                param=param,
                name="single end mapping"))
        bwa.update(param=conf.items("bwa"))
def reg_potential(workflow, conf):
    """ score genes by regulatory potential of the top peaks via RegPotential.py """
    get_top_peaks = attach_back(workflow, ShellCommand(
        "{tool} -n {param[peaks]} {input} | cut -f 1,2,3,4,9 > {output}",
        tool="head",
        input=conf.prefix + "_sort_peaks.narrowPeak" if conf.get("macs2", "type") in ["both", "narrow"] else conf.prefix + "_b_sort_peaks.broadPeak",
        output=conf.prefix + "_peaks_top_reg.bed",
        param={"peaks": 10000},
        name="top summits for regpotential"))
    get_top_peaks.update(param=conf.items("reg_potential"))
    reg = attach_back(workflow, ShellCommand(
        "{tool} -t {input[peaks]} -g {param[geneTable]} -n {param[prefix]} -d {param[dist]}",
        tool="RegPotential.py",
        input={"peaks": conf.prefix + "_peaks_top_reg.bed"},
        output={"potential": conf.prefix + "_gene_score.txt"},
        param={"geneTable": conf.get_path(conf.get("basics", "species"), "geneTable"),
               "tool": resource_filename("chilin2.modules", "regulatory/RegPotential.py"),
               "prefix": conf.prefix,
               "dist": 100000},
        name="Regulatory Potential"))
    reg.update(param=conf.items("reg_potential"))
def _macs2_cor(workflow, conf):
    cor_on_bw = attach_back(workflow, ShellCommand(
        template="""{tool} \
-s {param[wig_correlation_step]} \
--min-score {param[wig_correlation_min]} --max-score {param[wig_correlation_max]} \
-r {output[R]} {param[bw]} {param[rep]} && \
mv {output[R]}.pdf {output[pdf]}""",
        tool="bigwig_correlation.py",
        input=[target + "_treat.bw" for target in conf.treatment_targets],
        output={"R": conf.prefix + "_cor.R",
                "pdf": conf.prefix + "_cor.pdf"},
        param={"wig_correlation_method": "mean",
               "wig_correlation_min": 2,
               "wig_correlation_max": 50,
               "wig_correlation_step": 10},
        name="cor_on_bw"))
    cor_on_bw.param["bw"] = " ".join(cor_on_bw.input)
    cor_on_bw.param["rep"] = " ".join([" -l replicate_%s" % (x + 1) for x in range(len(conf.treatment_pairs))])
    cor_on_bw.update(param=conf.items("correlation"))
    cor_on_bw.allow_fail = True
    attach_back(workflow, PythonCommand(
        stat_cor,
        input={"correlation_R": conf.prefix + "_cor.R",
               "cor_pdf": conf.prefix + "_cor.pdf"},
        output={"json": conf.json_prefix + "_cor.json"}))
def summary_table_latex(workflow, conf):
    attach_back(workflow, PythonCommand(
        latex_summary_table,
        input={"template": resource_filename("chilin2.modules.summary", "summary_table.tex")},
        output={"latex": conf.latex_prefix + "_summary_table.tex"},
        param={"conf": conf,
               "layout": "l" + "c" * (1 + len(conf.sample_bases))}))
def test_workflow_attach_later_invoke_success(self):
    tree = self.create_tree()
    attach_front(tree, ShellCommand("touch {output}", output="outer_f1"))
    attach_back(tree, ShellCommand("rm {input}", input="outer_f1"))
    attach_front(tree, ShellCommand('echo "{0} decorator started {0}"'.format("=" * 10)))
    attach_back(tree, JinShCommand('echo "{0} decorator ended {0}"'.format("=" * 10)))
    self.assertTrue(tree.invoke())
def tex_contamination(workflow, conf):
    all_species = [i for i, _ in conf.items("contamination")]
    attach_back(workflow, PythonCommand(
        latex_contamination,
        input={"template": resource_filename("chilin2.modules", "contamination/contamination.tex"),
               "json": conf.json_prefix + "_contam.json"},
        output={"latex": conf.latex_prefix + "_contam.tex"},
        param={'id': conf.id,
               'layout': 'c' * (len(all_species) + 1)}))
def create_tree(self):
    main_workflow = Workflow("main")
    sub_workflow = Workflow("sub")
    attach_back(sub_workflow, ShellCommand('echo "subtree started"'))
    attach_back(sub_workflow, JinShCommand('touch {{ output|join(" ") }}', output=["f1", "f2"]))
    attach_back(sub_workflow, JinShCommand('rm {{ input|join(" ") }}', input=["f1", "f2"]))
    attach_back(sub_workflow, ShellCommand('echo "subtree ended"'))
    attach_back(main_workflow, sub_workflow)
    return main_workflow
def stat_phan(workflow, conf):
    """ collect NSC/RSC/Qtag and cross correlation figure """
    attach_back(workflow, PythonCommand(
        json_phan,
        input={"spp": [t + ".spp" for t in conf.sample_targets]},
        output={"json": conf.json_prefix + "_phan.json"},
        param={"sample": conf.sample_bases}))
def stat_pbc(workflow, conf):  # collect pbc value
    """ statistics collected from *.pbc """
    attach_back(workflow, PythonCommand(
        json_pbc,
        input={"pbc": [t + ".pbc" for t in conf.sample_targets]},
        output={"json": conf.json_prefix + "_pbc.json"},
        param={"samples": conf.sample_bases}))
def filter_bam(workflow, conf, tex):
    """ filter bam file by samtools and sample by ucsc app """
    for target in conf.treatment_targets:
        input = {"raw": target + "_raw_sorted.bam"}
        if conf.pe:
            name = "pair"
            tool = "dac_bam_pe_post_filter"
            param = {"mapq": 3,
                     "namesortedbamprefix": target + "_name_sorted",
                     "finalprefix": target + "_final",
                     "qc2": target + "_filter_bam_stats.qc"}
            output = {"finalbam": target + "_final.bam",
                      "namesortedbam": target + "_name_sorted.bam",
                      "bamwithoutchrm": target + "_final_nochrm.bam",
                      "qc": target + "_filter_bam.qc"}
            attach_back(workflow, ShellCommand(
                "{tool} {input[raw]} {param[namesortedbamprefix]} {output[namesortedbam]} {param[finalprefix]} {output[finalbam]} {param[mapq]} {output[bamwithoutchrm]} {output[qc]} {param[qc2]}",
                tool=tool,
                input=input,
                output=output,
                param=param,
                name="%s end filtering" % name))
        else:
            name = "single"
            tool = "dac_bam_se_post_filter"
            param = {"mapq": 3,
                     "finalprefix": target + "_final",
                     "qc2": target + "_filter_bam_stats.qc"}
            output = {"finalbam": target + "_final.bam",
                      "bamwithoutchrm": target + "_final_nochrm.bam",
                      "qc": target + "_filter_bam.qc"}
            attach_back(workflow, ShellCommand(
                "{tool} {input[raw]} {output[finalbam]} {param[mapq]} {output[qc]} {output[bamwithoutchrm]} {param[finalprefix]} {param[qc2]}",
                tool=tool,
                input=input,
                output=output,
                param=param,
                name="%s end filtering" % name))
def star(workflow, conf):  # Mapping
    """
    Use star to map reads to genome, call _star_sam2bam to convert sam to bam
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        star = attach_back(workflow, ShellCommand(
            "{tool} --genomeDir {param[index]} --runThreadN {param[NUM_THREADS]} --readFilesIn {input[fastq]} --outFileNamePrefix {param[prefix]}",
            tool="STAR",
            input={"fastq": target + ".fastq"},
            output={"sam": target + "Aligned.out.sam"},
            param={"NUM_THREADS": conf.threads,
                   "prefix": target,
                   ## judge chosen species from basics section
                   "index": conf.get_path(conf.get("basics", "species"), "genome_index")},
            name="star aln"))
        star.update(param=conf.items("bowtie"))  # parameter overrides are read from the [bowtie] config section
    _star_sam2bam(workflow, conf)
    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
def PBC(workflow, conf):  # PBC1
    """
    Introduce ENCODE II library complexity assessment methods
    PBC1 = N1 / Nd, where N1 is the number of locations with exactly one read and Nd is the number of distinct locations
    :param workflow: samflow class
    :param conf: parsed config
    :return: void
    """
    for t in conf.sample_targets:
        pbc1 = attach_back(workflow, ShellCommand(
            """
            bamToBed -i {input[bam]} | {tool} \'{{l[$1"\\t"$2"\\t"$3"\\t"$6]+=1}} END {{for(i in l) print l[i]}}\' \\
            | awk \'{{n[$1]+=1}} END {{for (i in n) print i"\\t"n[i]}}\' \\
            | sort -k1n - > {output[hist]}
            awk '{{ if (NR==1) {{N1=$2}} Nd+=$2 }} END {{print N1,Nd,N1/Nd}}' {output[hist]} > {output[pbc]}
            """,
            tool="awk",
            input={"bam": t + "_4000000.bam" if conf.down else t + ".bam"},
            output={"pbc": t + ".pbc",
                    "hist": t + ".hist"},
            name="PBC"))
        pbc1.allow_fail = True
        pbc1.allow_dangling = True
    ## QC part
    stat_pbc(workflow, conf)
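# A minimal, pure-Python sketch of the PBC1 statistic computed by the awk
# pipeline above: reads are grouped by (chrom, start, end, strand), N1 counts
# locations seen exactly once, Nd counts distinct locations, and PBC1 = N1/Nd.
# This is an illustration only, not part of the pipeline.
from collections import Counter


def pbc1(read_locations):
    """read_locations: iterable of (chrom, start, end, strand) tuples."""
    counts = Counter(read_locations)                       # reads per distinct location
    n_distinct = len(counts)                               # Nd
    n_single = sum(1 for c in counts.values() if c == 1)   # N1
    return float(n_single) / n_distinct if n_distinct else 0.0

# e.g. pbc1([("chr1", 100, 136, "+"), ("chr1", 100, 136, "+"), ("chr2", 5, 41, "-")])
# -> 0.5 (one of the two distinct locations is seen exactly once)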
def replicates_peaks_overlap(workflow, conf):  # peaks bed from each replicate
    """
    :param workflow: class from samflow
    :param conf: external parsed config file
    :return: workflow through attach_back
    """
    for i in range(len(conf.treatment_targets)):
        for j in range(i + 1, len(conf.treatment_targets)):
            replicates_overlap = attach_back(workflow, ShellCommand(
                "{tool} -f {param[p]} -a {input[0]} -b {input[1]} | wc -l > {output}",
                tool="intersectBed",
                input=[conf.treatment_targets[i] + "_sort_peaks.narrowPeak"
                       if conf.get("macs2", "type").lower() in ["both", "narrow"]
                       else conf.treatment_targets[i] + "_b_sort_peaks.broadPeak",
                       conf.treatment_targets[j] + "_sort_peaks.narrowPeak"
                       if conf.get("macs2", "type").lower() in ["both", "narrow"]
                       else conf.treatment_targets[j] + "_b_sort_peaks.broadPeak"],
                output=conf.prefix + "_%s_%s.overlap" % (i, j),
                param={"p": 0.3},
                name="Replicates peaks overlap QC"))
            replicates_overlap.allow_fail = True  # in case 0 peak in macs2
            replicates_overlap.allow_dangling = True
            ## generate a barplot for meta distribution
            replicates_overlap.update(param=conf.items("replicates"))
    return workflow
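# For intuition, a hypothetical pure-Python counterpart of the
# `intersectBed -f 0.3 -a A -b B | wc -l` count written above: it counts the
# A/B peak pairs whose overlap covers at least 30% of the A peak. A naive
# O(n*m) illustration only, not a replacement for bedtools.
def count_overlapping_pairs(peaks_a, peaks_b, min_fraction=0.3):
    """peaks_*: lists of (chrom, start, end); returns the qualifying pair count."""
    hits = 0
    for chrom_a, start_a, end_a in peaks_a:
        for chrom_b, start_b, end_b in peaks_b:
            if chrom_a != chrom_b:
                continue
            overlap = min(end_a, end_b) - max(start_a, start_b)
            if overlap > 0 and overlap >= min_fraction * (end_a - start_a):
                hits += 1
    return hits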
def merge_latex(workflow, conf):
    ## begin and end of the docs
    latex_order = ["_begin.tex",
                   "_summary_table.tex"]
    if conf.long:
        latex_order += ["_fastqc.tex",
                        "_fastqc_gc.tex",
                        "_map.tex",
                        "_conserv.tex",
                        # "_macs2.latex", "_macs2_on_sample.latex",
                        # "_phan.tex",
                        "_motif.tex",
                        "_contam.tex",
                        "_frip.tex"]
    latex_order.append("_end.tex")
    latex_list = [conf.latex_prefix + i for i in latex_order]
    merge_cmd = attach_back(workflow, ShellCommand(
        "cat {param[tex]} > {output}",
        output=conf.prefix + "_report.tex"))
    merge_cmd.allow_fail = True
    merge_cmd.param = {"tex": " ".join(latex_list)}
def bowtie(workflow, conf):  # Mapping
    """
    Use bowtie to map reads to genome, call _bowtie_sam2bam to convert sam to bam
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        bowtie = attach_back(workflow, ShellCommand(
            "{tool} -p {param[NUM_THREADS]} -S -m 1 {param[index]} {input[fastq]} {output[sam]}",
            tool="bowtie",
            input={"fastq": target + ".fastq"},
            output={"sam": target + ".sam"},
            param={"NUM_THREADS": conf.threads,
                   ## judge chosen species from basics section
                   "index": conf.get_path(conf.get("basics", "species"), "genome_index")},
            name="bowtie aln"))
        bowtie.update(param=conf.items("bowtie"))
        bowtie.allow_dangling = True
        bowtie.allow_fail = True
    _bowtie_sam2bam(workflow, conf)
    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
def DHS(workflow, conf):  # DHS overlap percentage
    """
    get peaks overlapping percentage with union DHS
    :param workflow: uniform pipeline workflow from samflow
    :param conf: parsed config files
    :return: workflow
    """
    peaks = conf.prefix + "_sort_peaks.narrowPeak" if conf.get("macs2", "type") in ["both", "narrow"] else conf.prefix + "_b_sort_peaks.broadPeak"
    DHS = attach_back(workflow, ShellCommand(
        """
        n=$(head -n {param[p]} {input[MACS2_bed]} | wc -l)
        dhs=$(head -n {param[p]} {input[MACS2_bed]} | {tool} -wa -u -a - -b {input[DHS_peaks_bed]} | wc -l)
        ##dhs=$(echo \"scale=5;$dhs/$n\" | bc)
        echo $n,$dhs > {output}
        """,
        tool="intersectBed",
        input={"MACS2_bed": peaks,
               "DHS_peaks_bed": conf.get(conf.get("basics", "species"), "dhs")},
        output=conf.prefix + ".dhs",
        param={"p": 5000},
        name="intersect DHS"))
    DHS.allow_dangling = True
    DHS.allow_fail = True
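# The .dhs file written above contains a single "n,dhs" line: the number of top
# peaks examined and how many of them overlap a union-DHS region. A minimal
# sketch of how a downstream collector (such as json_dhs, whose real
# implementation lives elsewhere in the pipeline) could turn that into a ratio:
def dhs_overlap_ratio(dhs_file):
    with open(dhs_file) as handle:
        n, dhs = [int(x) for x in handle.read().strip().split(",")]
    return float(dhs) / n if n else 0.0  # fraction of top peaks falling in DHS regions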
def fastqc(workflow, conf):
    """
    fastqc to extract gc contents(not yet) and median sequence quality
    :param workflow:
    :param conf:
    :return:
    """
    for raw, target in conf.sample_pairs:
        if conf.pe:
            fastqc_run = attach_back(workflow, ShellCommand(
                "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                ## only check one pair
                input=target[0] + "_100k.fastq",
                output={"target_dir": conf.target_dir,
                        "fastqc_summary": target[0] + "_100k_fastqc/fastqc_data.txt"},
                tool="fastqc",
                param={"threads": conf.threads},
                name="fastqc"))
        else:
            fastqc_run = attach_back(workflow, ShellCommand(
                "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                input=target + "_100k.fastq",
                output={"target_dir": conf.target_dir,
                        "fastqc_summary": target + "_100k_fastqc/fastqc_data.txt"},
                tool="fastqc",
                param={"threads": conf.threads},
                name="fastqc"))
        fastqc_run.update(param=conf.items("fastqc"))
        fastqc_run.allow_fail = True
        fastqc_run.allow_dangling = True
    ## QC part of chilin
    ## use conf property
    conf.long = True
    stat_fastqc(workflow, conf)
    if conf.long:
        tex_fastqc(workflow, conf)
def stat_fastqc(workflow, conf):  # collect raw reads quality and GC contents
    """
    long: generate long pages or not
    """
    sums = []
    for raw, target in conf.sample_pairs:
        if conf.pe:
            sums.append(target[0] + "_100k_fastqc/fastqc_data.txt")
        else:
            sums.append(target + "_100k_fastqc/fastqc_data.txt")
    collect = attach_back(workflow, PythonCommand(
        json_fastqc,
        input={"fastqc_summaries": sums},
        output={"json": conf.json_prefix + "_fastqc.json"},
        param={"ids": conf.sample_bases,
               "id": conf.id},
        name="collect fastqc results"))
    collect.allow_fail = True
    collect.allow_dangling = True
    if conf.long:
        ## prepare long document images and tex
        long_collect = attach_back(workflow, PythonCommand(
            fastqc_detailed_figure,
            name='fastqc',
            input={"dbaccessor": resource_filename("chilin2.modules.dbaccessor", "ChiLinQC.db"),
                   "template": resource_filename("chilin2.modules.summary", "R_culmulative_plot.R"),
                   "json": conf.json_prefix + "_fastqc.json"},
            output={"R": conf.prefix + "_raw_sequence_qc.R",
                    "pdf": conf.prefix + "_raw_sequence_qc.pdf"},
            param={"ids": conf.sample_bases}))
        long_collect.allow_fail = True
        long_collect.allow_dangling = True
def fragment(workflow, conf):  ## this is done after FRiP
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"
    for target in conf.treatment_targets:
        fragment_size = attach_back(workflow, ShellCommand(
            "{tool} predictd -i {input[bam]} --rfile {param[prefix]} -g {param[species]}",
            tool=macs2_bin,
            input={"bam": target + ".bam"},
            output={"R": target + "_model.R"},
            param={"prefix": target + "_model.R",
                   "species": 'hs'}))
        fragment_size.update(param=conf.items("macs2"))
        ## except too few peaks for modeling
        fragment_size.allow_fail = True
        fragment_size.allow_dangling = True
    ## extract standard deviation from MACS2 model.R,
    ## use m, p, and pileup value for standard deviation; mean fragment size is provided (choose the one with highest correlation)
    frag_qc = attach_back(workflow, PythonCommand(
        stat_frag_std,
        input={"r": [target + "_model.R" for target in conf.treatment_targets]},
        output={"json": conf.json_prefix + "_frag.json",
                "r": [target + "_frag_sd.R" for target in conf.treatment_targets]},
        param={"samples": conf.treatment_bases,
               "frag_tool": "BAMSE"},
        name="macs2 model R script parser"))
    frag_qc.allow_fail = True
    frag_qc.allow_dangling = True
def tex_phan(workflow, conf):
    figures = []
    for t in conf.sample_targets:
        if conf.down:
            figures.append(t + "_4000000.pdf")
        else:
            figures.append(t + ".pdf")
    attach_back(workflow, PythonCommand(
        long_tex,
        input={"template": resource_filename("chilin2.modules.phantompeak", "phan.tex"),
               "figure": figures},
        output={"latex": conf.latex_prefix + "_phan.tex"}))
def hotspotv4(workflow, conf, tex):
    for target in conf.treatment_targets:
        hotspot = attach_back(workflow, ShellCommand(
            "{tool} {param[hotspot_dir]} {param[genome]} {input[bam]} {param[readsize]} {output[narrowbb]} {output[broadbb]} {output[bigwig]} {param[tmp]} {output[hotspot_output]} {input[narrowas]} {input[broadas]} {param[chromsize]} {output[narrow]} {output[broad]}",
            tool="eap_run_hotspot",
            input={"bam": target + "_final_nochrm.bam",
                   "narrowas": narrow,
                   "broadas": broad},
            output={"narrowbb": target + ".narrowPeak.bigBed",
                    "broadbb": target + ".broadPeak.bigBed",
                    "narrow": target + ".narrowPeak",
                    # "qc1": target + ".narrowPeak.qc",
                    # "qc2": target + ".broadPeak.qc",
                    "broad": target + ".broadPeak",
                    "bigwig": target + ".bigWig",
                    "hotspot_output": target + "_hotspot"},
            param={"hotspot_dir": conf.get("tool", "peak_calling"),
                   "genome": conf.species,
                   "chromsize": conf.get(conf.species, "chrom_len"),
                   "tmp": target + "_hotspot_peak_call_tmp",
                   "readsize": 36}))
    have_treat_reps = len(conf.treatment_pairs) >= 2  ## replicates
    if have_treat_reps:
        eval_reps(workflow, conf, tex)
        catsam = attach_back(workflow, ShellCommand(
            "{tool} cat {param[bams]} > {output[bam]}",
            tool="samtools",
            input={"bams": [target + "_final.bam" for target in conf.treatment_targets]},
            output={"bam": conf.prefix + "_pool.bam"}))
        catsam.param.update(bams=' '.join(catsam.input["bams"]))
        hotspot_merge = hotspot.clone
        hotspot_merge.param.update(tmp=conf.prefix + "_hotspot_peak_call_tmp")
        hotspot_merge.input.update(bam=conf.prefix + "_pool.bam")
        hotspot_merge.output = {"narrowbb": conf.prefix + ".narrowPeak.bigBed",
                                "broadbb": conf.prefix + ".broadPeak.bigBed",
                                "narrow": conf.prefix + ".narrowPeak",
                                # "qc1": conf.prefix + ".narrowPeak.qc",
                                # "qc2": conf.prefix + ".broadPeak.qc",
                                "broad": conf.prefix + ".broadPeak",
                                "bigwig": conf.prefix + ".bigWig",
                                "hotspot_output": conf.prefix + "_hotspot"}
        attach_back(workflow, hotspot_merge)
def tex_fastqc(workflow, conf):
    quality = attach_back(workflow, PythonCommand(
        load_latex,
        input={"json": conf.json_prefix + "_fastqc.json",
               "template": resource_filename("chilin2.modules.fastqc", "fastqc.tex"),
               "pdf": conf.prefix + "_raw_sequence_qc.pdf"},
        output={"latex": conf.latex_prefix + "_fastqc.tex"}))
    quality.allow_fail = True
    quality.allow_dangling = True
    # these are name, png pairings
    if not conf.pe:
        gccontent_graphs = [(nm.replace("_", " "),
                             os.path.join(conf.target_dir, "%s_100k_fastqc" % nm,
                                          "Images", "per_sequence_gc_content.png"))
                            for nm in conf.sample_bases]
    else:
        gccontent_graphs = [(nm.replace("_", " "),
                             os.path.join(conf.target_dir, "%spair1_100k_fastqc" % nm,
                                          "Images", "per_sequence_gc_content.png"))
                            for nm in conf.sample_bases]
    gc = attach_back(workflow, PythonCommand(
        load_gc_latex,
        input={"template": resource_filename("chilin2.modules.fastqc", "fastqc_gc.tex"),
               "gccontent_graphs": gccontent_graphs},
        output={"latex": conf.latex_prefix + "_fastqc_gc.tex"}))
    gc.allow_fail = True
    gc.allow_dangling = True
def eval_reps(workflow, conf, tex):
    peaks = [target + ".narrowPeak" for target in conf.treatment_targets]
    attach_back(workflow, ShellCommand(
        """
        cat {param[narrowPeaks]} | sort -k1,1 -k2,2n - | bedtools merge -i - > {output[mergedPeak]}
        bedToBigBed {output[mergedPeak]} {param[chromsize]} {output[mergedPeakbb]}
        bigWigCorrelate -restrict={output[mergedPeakbb]} {param[bigwigs]} 1>{output[qc1]}
        {tool} {param[narrowPeaksbb]} {output[qc2]}
        """,
        tool="edwComparePeaks",
        input={"narrowPeaks": peaks,
               "bigwigs": [target + ".bigWig" for target in conf.treatment_targets],
               "narrowPeakbbs": [target + ".narrowPeak.bigBed" for target in conf.treatment_targets]},
        output={"mergedPeak": conf.prefix + "_merge.bed",
                "mergedPeakbb": conf.prefix + "_merged.bigBed",
                "qc1": conf.prefix + "_cor.qc",
                "qc2": conf.prefix + "_overlap.qc"},
        param={"narrowPeaksbb": " ".join([target + ".narrowPeak.bigBed" for target in conf.treatment_targets]),
               "narrowPeaks": " ".join([target + ".narrowPeak" for target in conf.treatment_targets]),
               "bigwigs": " ".join([target + ".bigWig" for target in conf.treatment_targets]),
               "chromsize": conf.get(conf.species, "chrom_len")}))
def tex_bwa(workflow, conf):
    tex = attach_back(workflow, PythonCommand(
        long_tex,
        input={"template": resource_filename("chilin2.modules.bwa", "bwa.tex"),
               "figure": conf.prefix + "_bwa_compare.pdf"},
        output={"latex": conf.latex_prefix + "_map.tex"}))
    tex.allow_fail = True
    tex.allow_dangling = True
def stat_frip(workflow, conf):  # collect frip score
    """ collect FRiP informative tag number and effective peaks number """
    stat = attach_back(workflow, PythonCommand(
        json_frip,
        input={"frip": [t + ".frip" for t in conf.sample_targets]},
        output={"json": conf.json_prefix + "_frip.json"},
        param={"samples": conf.sample_bases}))
    stat.allow_fail = True
    stat.allow_dangling = True
def tex_conserv(workflow, conf):
    tex = attach_back(workflow, PythonCommand(
        latex_conservation,
        input={"template": resource_filename("chilin2.modules.conservation", "conservation.tex")},
        output={"latex": conf.latex_prefix + "_conserv.tex"},
        param={"prefix": conf.prefix}))
    tex.allow_dangling = True
    tex.allow_fail = True
def stat_bedAnnotate(workflow, conf, has_dhs, has_velcro):
    """ Describe peaks' distribution
    # collect meta gene distribution info
    """
    collect_meta2 = attach_back(workflow, PythonCommand(
        json_meta2,
        input={"meta": conf.prefix + ".meta"},
        output={"json": conf.json_prefix + "_meta.json"},
        param={"id": conf.id},
        name="bedAnnotate summary"))
    collect_meta2.allow_fail = True
    collect_meta2.allow_dangling = True
    if has_dhs:
        collect_dhs = attach_back(workflow, PythonCommand(
            json_dhs,
            input={"dhs": conf.prefix + ".dhs",
                   "top_peaks": 5000},
            output={"json": conf.json_prefix + "_dhs.json"},
            name="DHS summary"))
        collect_dhs.allow_dangling = True
        collect_dhs.allow_fail = True
    if has_velcro:
        collect_velcro = attach_back(workflow, PythonCommand(
            json_velcro,
            input={"velcro": conf.prefix + ".velcro",
                   "top_peaks": 5000},
            output={"json": conf.json_prefix + "_velcro.json"},
            name="Velcro summary"))
        collect_velcro.allow_fail = True
        collect_velcro.allow_dangling = True
def stat_conservation(workflow, conf):
    collect = attach_back(workflow, PythonCommand(
        json_conservation,
        input={"score": conf.prefix + "_conserv.txt"},
        output={"json": conf.json_prefix + "_conserv.json"},
        param={"atype": conf.get("basics", "factor", "TF"),
               "id": conf.id},
        name="conservation score"))
    collect.allow_dangling = True
    collect.allow_fail = True
    if conf.long:
        ## cluster figures, obsolete, kept for compatibility
        fig = attach_back(workflow, PythonCommand(
            conservation_figures,
            input={"conservationR": conf.prefix + "_conserv.R",
                   "historical_conservation_cluster_text": resource_filename("chilin2.modules.dbaccessor", "Histone_centers.txt")},
            output={"R": conf.prefix + "_conserv_cluster.R",
                    "compare_pdf": conf.prefix + "_conserv_compare.pdf"},
            param={"id": conf.id}))
        fig.allow_fail = True
        fig.allow_dangling = True
def tex_frip(workflow, conf):
    tex = attach_back(workflow, PythonCommand(
        load_latex,
        input={"json": conf.json_prefix + "_frip.json",
               "template": resource_filename("chilin2.modules.frip", "frip.tex")},
        output={"latex": conf.latex_prefix + "_frip.tex"}))
    tex.allow_dangling = True
    tex.allow_fail = True
def stat_motif(workflow, conf):
    collect = attach_back(workflow, PythonCommand(
        stat_seqpos,
        input={"seqpos": conf.prefix + "_seqpos/" + "motif_list.json"},
        output={"json": conf.json_prefix + "_seqpos.json"},
        param={"prefix": conf.prefix + "_seqpos/seqLogo/",
               "z_score_cutoff": -1},
        name="collect motif info"))
    collect.allow_fail = True
    collect.allow_dangling = True