def json_dhs(input={"top_peaks": "", "dhs_peaks": ""}, output={"json": ""}, param={}): result_dict = {"stat": {}, "input": input, "output": output, "param": param} # result_dict["stat"] = float(open(input["dhs"]).read().strip()) content = open(input["dhs"]).read().strip().split(",") result_dict["stat"]["overlap"] = int(content[1]) result_dict["stat"]["number"] = int(content[0]) json_dump(result_dict)
def json_contamination(input={"summaries": [[]]}, output={"json": ""}, param={ "samples": "", "species": "", "id": "" }): library_contamination = {} library_contamination["meta"] = { "sample": param["id"], "species": param["species"] } library_contamination["value"] = {} for a_summary, s in zip(input["summaries"], map(underline_to_space, param["samples"])): ## each bowtie_summary has several species information library_contamination["value"][s] = {} for i, j in zip(a_summary, param["species"]): ## species 1, species2, species3 mapped = int(open(i[0]).readlines()[2].strip().split()[0]) total = int(open(i[1]).read().strip()) library_contamination["value"][s][j] = float(mapped) / total json_dict = {"stat": {}, "input": input, "output": output, "param": param} json_dict["stat"] = library_contamination json_dump(json_dict)
def stat_frag_std(input={ "r": "", "insert": "" }, output={ "json": "", "r": "" }, param={ "samples": "", "frag_tool": "" }): """ parse macs2 predictd r file into json file """ json_dict = {"input": input, "output": output, "param": param, "stat": {}} for rin, rout, s in zip(input["r"], output["r"], param["samples"]): values = get_size(rin) with open(rout, 'w') as f: f.write(values['positive']) f.write(values['minus']) f.write(values['xcorr']) f.write(values['ycorr']) f.write("xcorr.max = xcorr[which(ycorr==max(ycorr))]\n") f.write(values['x']) f.write("p.expect = sum(x * p/100) \n") f.write("m.expect = sum(x * m/100) \n") f.write("p.sd = sqrt(sum(((x-p.expect)^2)*p/100)) \n") f.write("m.sd = sqrt(sum(((x-m.expect)^2)*m/100)) \n") f.write("cat(paste((p.sd + m.sd)/2, '\t', xcorr.max)) \n") f.close() std_frag = os.popen("Rscript %s" % rout).read().strip().split() json_dict["stat"][s] = "%s" % (int(float(std_frag[1]))) json_dump(json_dict)
def enrich_in_meta(input={'meta': '', 'mapped': ''}, output={"json": ""},
                   param={'dhs': '', 'down': '', 'has_dhs': '', 'id': "", 'samples': ""}):
    """ enrichment in meta regions """
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    for n, s in enumerate(param['samples']):
        ## total mapped reads from the flagstat-style summary
        mapped = float(open(input["mapped"][n]).readlines()[2].split()[0])
        json_dict['stat'][s] = {}
        meta = open(input['meta'][n]).read().strip().split(",")
        meta = list(map(float, meta))  # list() so it stays indexable on Python 3
        if not param["down"]:
            json_dict['stat'][s]['exon'] = meta[0] / mapped
            json_dict['stat'][s]['promoter'] = meta[1] / mapped  ## use all mapped reads
        else:
            json_dict['stat'][s]['exon'] = meta[0] / meta[2]
            json_dict['stat'][s]['promoter'] = meta[1] / meta[2]  ## use 4M downsampled reads
        if param['has_dhs']:
            dhs = open(param["dhs"][n]).read().strip().split(",")
            dhs = list(map(float, dhs))
            if not param["down"]:
                json_dict['stat'][s]['dhs'] = dhs[0] / mapped
            else:
                json_dict['stat'][s]['dhs'] = dhs[0] / dhs[1]
    json_dump(json_dict)
def json_conservation(input={"score": ""}, output={"json": ""}, param={}): """ collect conservation_plot output Phastcon score """ json_dict = {"stat": [], "input": input, "output": output, "param": ""} rd = lambda x: str(round(float(x), 3)) json_dict['stat'] = map(rd, open(input['score']).read().strip().split()) json_dump(json_dict)
def json_reps(input, output, param):
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    ## one correlation value per replicate pair, third column of input['cor']
    json_dict['stat']['cor'] = [float(i.strip().split()[2])
                                for i in open(input['cor']).readlines()]
    json_dict["stat"]['overlap'] = [float(open(i).read().strip())
                                    for i in input['overlap']]
    json_dump(json_dict)
def json_macs2(input={"macs2_peaks_xls": ""}, output={"json": ""}, param={"id": ""}): """ input macs2 _peaks.xls output conf.json_prefix + "_macs2.json" """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} if os.path.exists(input['macs2_peaks_xls']): ## in case only broad peaks would break down sometimes, narrowPeak very seldom no peaks json_dict["stat"] = _peaks_parse(input["macs2_peaks_xls"]) json_dump(json_dict)
def json_velcro(input={}, output={}, param={}):
    result_dict = {"stat": {}, "input": input, "output": output, "param": param}
    ## input["velcro"] holds the fraction of peaks falling in velcro
    ## (blacklist) regions; report the complementary, non-velcro fraction
    result_dict["stat"] = 1 - float(open(input["velcro"]).read().strip())
    json_dump(result_dict)
def stat_seqpos(input={ "template": "", "seqpos": "" }, output={"latex_section": ""}, param={ "prefix": "", "z_score_cutoff": -15 }): """parse mdsepose html file""" z_score_cutoff = param["z_score_cutoff"] seqpos_html_content = open(input['seqpos']).readlines() mdseqpos_result = [] ## parse motif list json file for m in seqpos_html_content: mdseqpos_result.append(json.loads(m.strip())) satisfied_motif_list = [] for a_motif in mdseqpos_result: if a_motif['seqpos_results']['zscore'] == 'None': a_motif['seqpos_results']['zscore'] = 65535 if a_motif['factors'] == None: a_motif['factors'] = ['denovo'] satisfied_motif_list.append(a_motif) satisfied_motif_list.sort(key=lambda x: x['seqpos_results']['zscore']) satisfied_count = 0 top_motifs = [] for a_motif in satisfied_motif_list: if a_motif['id'].find('observed') > 0: continue if satisfied_count == 10: break # z_score is a negative score, the smaller, the better if a_motif['seqpos_results']['zscore'] < z_score_cutoff: satisfied_count += 1 top_motifs.append(a_motif) ## choose first 5 motifs to fit into latex document for n, _ in enumerate(top_motifs): top_motifs[n][ "logoImg"] = param["prefix"] + top_motifs[n]['id'] + ".png" result_dict = { "stat": {}, "input": input, "output": output, "param": param } result_dict["stat"]["satisfied_motifs"] = top_motifs json_dump(result_dict)
def json_fastqc(input={"fastqc_summaries": []}, output={"R": "", "json": "", "pdf": ""}, param={"ids": [], "id": ""}): json_dict = {"stat": {}, "input": input, "output": output, "param": param} stat = json_dict["stat"] for a_summary, a_id in zip(input["fastqc_summaries"], param["ids"]): parsed = _fastqc_parse(input=a_summary) stat[a_id] = {} stat[a_id]["median"] = parsed["median"] stat[a_id]["sequence_length"] = parsed["sequence_length"] json_dump(json_dict)
def json_macs2(input={"macs2_peaks_xls": ""}, output={"json": ""}, param={"id": ""}): """ input macs2 _peaks.xls output conf.json_prefix + "_macs2.json" """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} if os.path.exists( input['macs2_peaks_xls'] ): ## in case only broad peaks would break down sometimes, narrowPeak very seldom no peaks json_dict["stat"] = _peaks_parse(input["macs2_peaks_xls"]) json_dump(json_dict)
def json_frip(input={}, output={}, param={}):
    """
    input is *.frip, converted to json
    output is conf.json_prefix + "_frip.json"
    param for matching samples
    """
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    for i, s in zip(input["frip"], param["samples"]):
        ## each .frip file is one line: "<reads in peaks>,<total reads>"
        inf = open(i).read().strip().split(",")
        json_dict["stat"][s] = {}
        json_dict["stat"][s]["info_tag"] = int(inf[0])
        json_dict["stat"][s]["total_tag"] = int(inf[1])
        json_dict["stat"][s]["frip"] = float(inf[0]) / int(inf[1])
    json_dump(json_dict)
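# Example invocation, built from the keys the function reads (paths and
# sample names are illustrative):
# json_frip(input={"frip": ["rep1.frip", "rep2.frip"]},
#           output={"json": "run_frip.json"},
#           param={"samples": ["rep1", "rep2"]})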
def json_macs2_on_reps(input={"all_peak_xls": []}, output={"json": ""}, param={"samples": []}): """ collect replicates macs2 info to json files compared to merged one, collect redundant ratio with --keep-dup 1 option """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} parsed = [] for i in input["all_peak_xls"]: if os.path.exists(i): ## in case only broad peaks would break down sometimes, narrowPeak very seldom no peaks parsed.append(_peaks_parse(i)) if all(map(os.path.exists, input['all_peak_xls'])): for sample, stat in zip(param["samples"], parsed): json_dict["stat"][sample] = stat json_dump(json_dict)
def json_pbc(input={}, output={}, param={}):
    """
    input is the target + ".pbc", converted to json format
    output is the json file conf.json_prefix + "_pbc.json"
    param for matching samples order
    """
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    for i, s in zip(input["pbc"], param["samples"]):
        ## one whitespace-separated line per file: "<N1> <Nd> <PBC>"
        inl = open(i).readlines()[0].strip().split()
        json_dict["stat"][s] = {}
        json_dict["stat"][s]["N1"] = int(inl[0])
        json_dict["stat"][s]["Nd"] = int(inl[1])
        json_dict["stat"][s]["PBC"] = round(float(inl[2]), 3)
    json_dump(json_dict)
def json_contamination(input = {"summaries": [[]]}, output = {"json": ""}, param = {"samples": "", "species": "", "id": ""}): library_contamination = {} library_contamination["meta"] = {"sample": param["id"], "species": param["species"]} library_contamination["value"] = {} for a_summary, s in zip(input["summaries"], map(underline_to_space, param["samples"])): ## each bowtie_summary has several species information library_contamination["value"][s] = {} for i, j in zip(a_summary, param["species"]): ## species 1, species2, species3 mapped = int(open(i[0]).readlines()[2].strip().split()[0]) total = int(open(i[1]).read().strip()) library_contamination["value"][s][j] = float(mapped)/total json_dict = {"stat": {}, "input": input, "output": output, "param": param} json_dict["stat"] = library_contamination json_dump(json_dict)
## earlier enrich_in_meta variant without downsampling support
def enrich_in_meta(input={'exon': '', 'dhs': '', 'promoter': '', "mapped": ""},
                   output={"json": ""},
                   param={'id': "", 'samples': "", 'has_dhs': '', 'dhs': ''}):
    """ enrichment in meta regions """
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    for n, s in enumerate(param['samples']):
        ## total mapped reads
        mapped = float(open(input["mapped"][n]).readlines()[2].split()[0])
        json_dict['stat'][s] = {}
        json_dict['stat'][s]['exon'] = float(open(input['exon'][n]).read().strip()) / mapped
        json_dict['stat'][s]['promoter'] = float(open(input['promoter'][n]).read().strip()) / mapped
        if param['has_dhs']:
            json_dict['stat'][s]['dhs'] = float(open(param['dhs'][n]).read().strip()) / mapped
        else:
            json_dict['stat'][s]['dhs'] = 0
    json_dump(json_dict)
def json_fastqc(input={"fastqc_summaries": []}, output={ "R": "", "json": "", "pdf": "" }, param={ "ids": [], "id": "" }): json_dict = {"stat": {}, "input": input, "output": output, "param": param} stat = json_dict["stat"] for a_summary, a_id in zip(input["fastqc_summaries"], param["ids"]): parsed = _fastqc_parse(input=a_summary) stat[a_id] = {} stat[a_id]["median"] = parsed["median"] stat[a_id]["sequence_length"] = parsed["sequence_length"] json_dump(json_dict)
def json_meta2(input={}, output={}, param={}):
    """
    generate json of genomic distribution (given by the bedAnnotate output)
    ***THE key difference between json_meta and this fn is that bedAnnotate
    conveniently outputs the distribution as a dictionary of peak counts
    """
    f = open(input["meta"])
    # the file holds a python dict literal, e.g.:
    # {'Intron': 68017, 'Exon': 7659, 'Intergenic': 73090, 'Promoter': 11229}
    # ast.literal_eval parses it without eval's arbitrary-code risk
    content = ast.literal_eval(f.read())
    f.close()
    total = sum(content.values())
    json_dict = {"input": input, "stat": {}, "output": output, "param": param}
    json_dict["stat"]["exon"] = content['Exon'] / float(total)
    json_dict["stat"]["intron"] = content['Intron'] / float(total)
    json_dict["stat"]["promoter"] = content['Promoter'] / float(total)
    json_dict["stat"]["inter"] = content['Intergenic'] / float(total)
    json_dump(json_dict)
def json_phan(input = {"spp": ""}, output = {"json": ""}, param = {"sample": ""}): """ fragment size keep the maximus positive one """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} frag = 0 for i, s in zip(input["spp"], param["sample"]): json_dict["stat"][s] = {} f = open(i) content = f.read().strip().split() f.close() json_dict["stat"][s]["NSC"] = content[8] json_dict["stat"][s]["RSC"] = content[9] for i in content[2].split(","): if i >= 0: frag = i break json_dict["stat"][s]["frag"] = frag ## pick the most correlated ones json_dict["stat"][s]["Qtag"] = content[10] json_dump(json_dict)
def json_phan(input={"spp": ""}, output={"json": ""}, param={"sample": ""}): """ fragment size keep the maximus positive one """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} frag = 0 for i, s in zip(input["spp"], param["sample"]): json_dict["stat"][s] = {} f = open(i) content = f.read().strip().split() f.close() json_dict["stat"][s]["NSC"] = content[8] json_dict["stat"][s]["RSC"] = content[9] for i in content[2].split(","): if i >= 0: frag = i break json_dict["stat"][s]["frag"] = frag ## pick the most correlated ones json_dict["stat"][s]["Qtag"] = content[10] json_dump(json_dict)
def stat_seqpos(input = {"template": "", "seqpos": ""}, output={"latex_section": ""}, param = {"prefix": "", "z_score_cutoff":-15}): """parse mdsepose html file""" z_score_cutoff = param["z_score_cutoff"] seqpos_html_content = open(input['seqpos']).readlines() mdseqpos_result = [] ## parse motif list json file for m in seqpos_html_content: mdseqpos_result.append(json.loads(m.strip())) satisfied_motif_list = [] for a_motif in mdseqpos_result: if a_motif['seqpos_results']['zscore'] == 'None': a_motif['seqpos_results']['zscore'] = 65535 if a_motif['factors'] == None: a_motif['factors'] = ['denovo'] satisfied_motif_list.append(a_motif) satisfied_motif_list.sort(key=lambda x:x['seqpos_results']['zscore']) satisfied_count = 0 top_motifs = [] for a_motif in satisfied_motif_list: if a_motif['id'].find('observed')>0: continue if satisfied_count == 10: break # z_score is a negative score, the smaller, the better if a_motif['seqpos_results']['zscore'] < z_score_cutoff : satisfied_count += 1 top_motifs.append(a_motif) ## choose first 5 motifs to fit into latex document for n, _ in enumerate(top_motifs): top_motifs[n]["logoImg"] = param["prefix"] + top_motifs[n]['id'] + ".png" result_dict = {"stat": {}, "input": input, "output": output, "param": param} result_dict["stat"]["satisfied_motifs"] = top_motifs json_dump(result_dict)
def enrich_in_meta(input={ 'meta': '', 'mapped': '' }, output={"json": ""}, param={ 'dhs': '', 'down': '', 'has_dhs': '', 'id': "", 'samples': "" }): """ enrichment in meta regions """ json_dict = {"stat": {}, "input": input, "output": output, "param": param} for n, s in enumerate(param['samples']): ## total mapped reads mapped = float(open(input["mapped"][n]).readlines()[2].split()[0]) json_dict['stat'][s] = {} meta = open(input['meta'][n]).read().strip().split(",") meta = map(float, meta) if not param["down"]: json_dict['stat'][s]['exon'] = meta[0] / mapped json_dict['stat'][s][ 'promoter'] = meta[1] / mapped ## use all mapped reads else: json_dict['stat'][s]['exon'] = meta[0] / meta[2] json_dict['stat'][s]['promoter'] = meta[1] / meta[ 2] ## use 4M reads if param['has_dhs']: dhs = open(param["dhs"][n]).read().strip().split(",") dhs = map(float, dhs) if not param["down"]: json_dict['stat'][s]['dhs'] = dhs[0] / mapped else: json_dict['stat'][s]['dhs'] = dhs[0] / dhs[1] json_dump(json_dict)
def json_meta(input={}, output={}, param={}):
    """
    ###########################################################################
    DEPRECATED!!! see json_meta2
    ###########################################################################
    generate json of promoter, intergenic, intron and exon overlap percentages;
    only one output, either from the merged run or the best replicate
    """
    f = open(input["meta"])
    ## one line: "<exon>,<intron>,<intergenic>,<promoter>"
    content = f.read().strip().split(",")
    f.close()
    exon, intron, inter, promoter = content[0], content[1], content[2], content[3]
    json_dict = {"input": input, "stat": {}, "output": output, "param": param}
    json_dict["stat"]["exon"] = float(exon)
    json_dict["stat"]["intron"] = float(intron)
    json_dict["stat"]["promoter"] = float(promoter)
    json_dict["stat"]["inter"] = float(inter)
    json_dump(json_dict)
def json_bwa(input={}, output={}, param={}):
    """
    convert mapping stats to json
    input: samtools flagstat standard output plus a total-read count file
    output: json file
    param for matching replicates order;
    keep one value per json for easier loading into the html/pdf template
    flagstat example:
        3815725 + 0 in total (QC-passed reads + QC-failed reads)
        0 + 0 duplicates
        3815723 + 0 mapped (100.00%:-nan%)
    """
    json_dict = {"stat": {}, "input": input, "output": output, "param": param}
    for mapped, total, sam in zip(input["bwa_mapped"], input["bwa_total"],
                                  param["sample"]):
        inft = open(total)
        infm = open(mapped)
        json_dict["stat"][sam] = {}
        ## the third flagstat line carries the mapped-read count
        json_dict["stat"][sam]["mapped"] = int(infm.readlines()[2].split()[0])
        json_dict["stat"][sam]["total"] = int(inft.readlines()[0].strip())
        inft.close()
        infm.close()
    json_dump(json_dict)
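# Example invocation, built from the keys the function reads (paths and
# sample names are illustrative):
# json_bwa(input={"bwa_mapped": ["rep1.flagstat", "rep2.flagstat"],
#                 "bwa_total": ["rep1.total", "rep2.total"]},
#          output={"json": "run_bwa.json"},
#          param={"sample": ["rep1", "rep2"]})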
def stat_frag_std(input = {"r": "", "insert": ""}, output = {"json": "", "r": ""}, param = {"samples": "", "frag_tool": ""}): """ parse macs2 predictd r file into json file """ json_dict = {"input": input, "output": output, "param": param, "stat": {}} for rin, rout, s in zip(input["r"], output["r"], param["samples"]): values = get_size(rin) with open(rout, 'w') as f: f.write(values['positive']) f.write(values['minus']) f.write(values['xcorr']) f.write(values['ycorr']) f.write("xcorr.max = xcorr[which(ycorr==max(ycorr))]\n") f.write(values['x']) f.write("p.expect = sum(x * p/100) \n") f.write("m.expect = sum(x * m/100) \n") f.write("p.sd = sqrt(sum(((x-p.expect)^2)*p/100)) \n") f.write("m.sd = sqrt(sum(((x-m.expect)^2)*m/100)) \n") f.write("cat(paste((p.sd + m.sd)/2, '\t', xcorr.max)) \n") f.close() std_frag = os.popen("Rscript %s" % rout).read().strip().split() json_dict["stat"][s] = "%s" % (int(float(std_frag[1]))) json_dump(json_dict)
def json_velcro(input={}, output={}, param={}): result_dict = {"stat": {}, "input": input, "output": output, "param": param} result_dict["stat"] = 1-float(open(input["velcro"]).read().strip()) json_dump(result_dict)