def gerprunner():
    """Score single-interval windows with GERP and evaluate the scores.

    The interval file is taken from ``sys.argv[1]`` and read with
    ``JimFile``; ``windower(..., chunker(1))`` groups it into chunks.  Each
    chunk is scored with ``pyBigWig``'s ``stats`` over the span from the
    first interval's start to the last interval's end, and the
    ``(chunk, score)`` stream is handed to ``eval2`` together with the
    ClinVar and ESP VCF paths.  The value returned by ``metrics`` is printed.

    Side effects: progress lines on stdout, a summary on stderr, and
    whatever ``metrics`` does with the ``"gerp.auc.png"`` name it is given
    (presumably writes the plot -- confirm against ``metrics``).
    """
    import pyBigWig

    bigwig = pyBigWig.open("/scratch/ucgd/lustre/u1021864/serial/hg19.gerp.bw")
    # x = list(range(1,23)); x.append("X"), x.append("Y")
    path = sys.argv[1]
    iterable = windower(JimFile(path), chunker(1))

    def genchunks():
        """Yield ``(chunk, score)`` pairs, logging progress every 100000 chunks."""
        # The length filter below is disabled, so this counter stays at 0;
        # it is still reported to keep the stderr summary format unchanged.
        nsmall = 0
        # Tracked separately from the loop index so that the summary is
        # correct (and does not raise NameError) when there are no chunks.
        total = 0
        for i, chunk in enumerate(iterable):
            total = i + 1
            # if len(chunk) < 5:
            #     continue
            score = bigwig.stats("chr" + chunk[0].chrom,
                                 chunk[0].start, chunk[-1].end)
            yield chunk, score[0]
            if i % 100000 == 0:
                print("%s %s %s %s" % (i, chunk[0].chrom, chunk[0].start, score))
        sys.stderr.write("%s removed for being too short\n" % (nsmall,))
        sys.stderr.write("%s total chunks\n" % (total,))

    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    try:
        res = eval2(genchunks(), vcf_path,
                    "/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz")
        print(metrics(res[True], res[False], "gerp.auc.png"))
    finally:
        # Release the bigWig handle even if evaluation fails part-way.
        bigwig.close()
def uptonrunner():
    """Score 20-interval windows by their rare-allele fraction and evaluate.

    Intervals are read with ``JimFile`` from a fixed BED path and grouped by
    ``windower(..., chunker(20))``.  For every chunk with at least five
    intervals the score is the sum of ``1 - maf`` over the intervals whose
    ``mafs`` value (converted with ``float``) is below ``cutoff``, divided by
    the chunk length.  The ``(chunk, score)`` stream goes to ``eval2`` and the
    value returned by ``metrics`` is printed.

    Side effects: progress lines on stdout, a summary on stderr, and
    whatever ``metrics`` does with the ``"upton-esp.auc.png"`` name.
    """
    bed_path = "/scratch/ucgd/lustre/u1021864/serial/y.sort.bed.gz"
    iterable = windower(JimFile(bed_path), chunker(20))
    cutoff = 1e-3

    def genchunks():
        """Yield ``(chunk, score)`` for chunks that pass both filters."""
        nsmall = 0
        # Tracked separately from the loop index so that the summary is
        # correct (and does not raise NameError) when there are no chunks.
        total = 0
        for i, chunk in enumerate(iterable):
            total = i + 1
            if i % 100000 == 0:
                print("%s %s %s" % (i, chunk[0].chrom, chunk[0].start))
            if len(chunk) < 5:
                continue
            mafs = (float(x.mafs) for x in chunk)
            score = sum(1.0 - m for m in mafs if m < cutoff) / float(len(chunk))
            # NOTE(review): this counter tracks chunks whose score is exactly
            # 1, yet the summary below calls them "too short"; chunks dropped
            # by the length check above are not counted.  Confirm which of
            # the two was intended before relying on the number.
            if score == 1:
                nsmall += 1
                continue
            yield chunk, score
        sys.stderr.write("%s removed for being too short\n" % (nsmall,))
        sys.stderr.write("%s total chunks\n" % (total,))

    # NOTE: these are for humvar only. not needed for clinvar.
    def is_pathogenic(d):
        return d['class'] == "deleterious"

    def not_pathogenic(d):
        return d['class'] == "neutral"

    eval_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    #res = evaldoms(genchunks(), eval_path, is_pathogenic=is_pathogenic, not_pathogenic=not_pathogenic)
    # Alternative benign set: "/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz"
    res = eval2(genchunks(), eval_path, "esp-vcommon.vcf.gz")
    print(metrics(res[True], res[False], "upton-esp.auc.png"))
def rvistest():
    """Evaluate the scores stored in ``rvis.bed`` against the ClinVar VCF.

    Every row read by ``ts.reader`` becomes a one-element chunk built with
    ``interval`` from its ``chrom``/``start``/``end`` fields, paired with the
    negated ``pct`` value.  The pairs are passed to ``evaldoms`` and the
    value returned by ``metrics`` is printed.
    """
    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    bed = "rvis.bed"

    def genregions():
        """Yield ``([interval], -pct)`` for each row of the BED file."""
        for row in ts.reader(bed):
            pct = float(row['pct'])
            region = interval(row['chrom'], int(row['start']), int(row['end']))
            yield [region], -pct

    res = evaldoms(genregions(), vcf_path)
    print(metrics(res[True], res[False], "x.auc.png"))
def example3():
    """Compute per-region metrics, write them to BED files and plot them.

    Configuration comes from two names defined outside this function:
    ``args`` (attributes ``input``, ``regions``, ``regionsize``, ``maf``,
    ``comparison``, ``exclude``, ``conservation``, ``truetype`` and
    ``pathogenic`` are read here) and ``patt``.

    Steps, in order:

    1. Window the ``JimFile`` intervals with the grouper selected by
       ``args.regions``.
    2. Call ``baseline`` on every region and derive a pooled rate as
       ``sum(b[3]) / sum(b[4])``.
    3. Call ``upton``, ``constraint`` and ``RVIS`` on every region and
       collect the results per metric.
    4. Write ``constraint<suffix>.bed`` and ``baseline<suffix>.bed``.
    5. For every metric and CpG bin, call ``evaldoms``, save a two-panel
       histogram to ``<name>.dist.png`` and print the values returned by
       ``metrics`` and by the Mann-Whitney U test.

    Raises:
        ValueError: if ``args.regions`` is not one of the supported modes,
            or if there are results to evaluate and ``args.truetype`` is
            not recognised.
    """
    import toolshed as ts
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot as plt
    # Not referenced below; kept because the original imported it
    # (presumably for its import-time plot styling -- TODO confirm).
    import seaborn as sns
    from scipy.stats import mannwhitneyu as mw
    import numpy as np

    iterator = JimFile(args.input, args.regions)
    #it = ts.reader(args.input) #'/scratch/ucgd/serial/quinlan_lab/data/u1021864/regionsmafsdnds.bed.gz'
    #iterable = (Interval(**iv) for iv in it)
    results = defaultdict(lambda: defaultdict(list))
    ms = defaultdict(list)
    maf_cutoff = float(args.maf) if args.maf else 1e-05

    # Only a single all-inclusive CpG bin is active.  The finer binning
    # (steps of .025 from 0 to .2, plus a '0.2-1' bin) is disabled.
    cpg_cutoff = {'0-1': (0, 1)}

    base = []
    cons = []
    genes = None
    #genes = Fasta(args.genome)

    # Choose the windowing function; fail clearly on an unknown mode instead
    # of hitting an unbound-name error at the windower() call below.
    if args.regions == "chunks":
        regioner = smallchunk
        chunksize = args.regionsize
    elif args.regions in ("domains", "nodoms", "all"):
        regioner = byregiondist
        chunksize = ""
    elif args.regions == "genes":
        regioner = bytranscriptdist
        chunksize = ""
    else:
        raise ValueError("unsupported regions value: %r" % (args.regions,))

    y = list(windower(iterator, regioner, chunksize))
    comparison = args.comparison
    if args.exclude:
        exclude = args.exclude
        ex = "ex" + args.exclude + "."
    else:
        exclude = None
        ex = ""

    # Collected but not used further down; kept so the conservation file is
    # still read and passed through get_conservation as before.
    cv = []
    if args.conservation:
        for r in ts.reader(args.conservation):
            cv.append(get_conservation(r))

    cpg = 1  # placeholder CpG fraction; the per-region CpG() call is disabled
    for iv in y:  # iterable, size_grouper(1)
        #cpg = CpG(iv, genes = genes)
        b = baseline(iv, maf_cutoff=maf_cutoff, exclude=exclude,
                     comparison=comparison, patt=patt)
        # NOTE(review): under Python 2 this is integer division if b[3] and
        # b[4] are both ints -- confirm what baseline() returns.
        ms['baseline'].append((iv, b[3] / b[4], cpg))
        base.append(b)

    # Pooled rate across all regions; only needed when there is a region.
    count = 0.0
    totlen = 0.0
    baserate = None
    if base:
        for b in base:
            count += b[3]
            totlen += b[4]
        baserate = count / totlen

    for iv, b in zip(y, base):
        u = upton(b, baserate)
        c = constraint(iv, maf_cutoff=maf_cutoff, genes=genes, upton=u)
        r = RVIS(iv, maf_cutoff=1e-3, patt=patt)
        ct = (iv, c, cpg)
        if c != 0:
            ms['nzconstraint'].append(ct)
        ms['constraint'].append(ct)
        ms['upton'].append((iv, u[3], cpg))
        ms['rvis'].append((iv, r, cpg))
        cons.append((u[0], u[1], u[2], c))
        # results['iafi'].append((iv, IAFI_inline(iv, n_samples=61000)))
        # results['frv'].append((iv, FRV_inline(iv, maf_cutoff=maf_cutoff)))
        # results['count_nons'].append((iv, count_nons(iv)))
        # TODO: jim add a lot more metrics here... e.g.:

    bedname = ("." + rtz(maf_cutoff) + "." + comparison + "." + args.regions
               + str(chunksize) + "." + ex)
    # Context managers close both files even if a write fails.
    with open("constraint" + bedname + ".bed", "w") as f1:
        with open("baseline" + bedname + ".bed", "w") as f2:
            for b, c in zip(base, cons):
                f1.write("\t".join(map(str, c)) + "\n")
                f2.write("\t".join(map(str, b)) + "\n")

    # Bin every metric's entries by the CpG value stored in slot 2.
    cutoffs = set()
    for cutoff in cpg_cutoff:
        low, high = cpg_cutoff[cutoff][0], cpg_cutoff[cutoff][1]
        co = str(low) + '-' + str(high)
        cutoffs.add(co)
        for metric in ms:
            for ct in ms[metric]:
                if ct[2] >= low and ct[2] <= high:
                    results[metric][co].append(ct)

    option = args.truetype
    trusrc = ""
    func = None
    if option == "clinvar" or option == "c":
        func = clinvar
        trusrc = "clinvar"
    elif option == "pli" or option == "p":
        func = pli
        trusrc = "pli"
    if func is None and results:
        # Without this the evaluation loop would fail on an unbound name.
        raise ValueError("unsupported truetype value: %r" % (option,))

    for metric in results:
        for cutoff in cutoffs:
            imgname = (metric + "." + trusrc + "." + comparison + "."
                       + args.regions + str(chunksize) + "." + ex + cutoff
                       + "." + rtz(maf_cutoff))
            print("%s %s" % (metric, cutoff))
            fig, axes = plt.subplots(2)
            fig.tight_layout()
            # args.pathogenic: forweb_cleaned_exac_r03_march16_z_data_pLI.txt
            # from ExAC ftp or clinvar_20150305.tidy.vcf.gz from clinvar src
            counts = evaldoms(results[metric][cutoff], args.pathogenic, func)
            imin, imax = np.percentile(counts[True] + counts[False],
                                       [0.01, 99.99])
            axes[0].hist(counts[True], bins=80)  #,label = cutoff)
            axes[0].set_xlabel("pathogenic")
            axes[0].set_xlim(imin, imax)
            props = dict(boxstyle='round', facecolor='whitesmoke', alpha=0.5)
            axes[0].text(.875, .8, "CpG frac:\n" + cutoff.replace("-", " - "),
                         transform=axes[0].transAxes, bbox=props)
            #axes[0].legend(loc = 1, frameon = True)
            axes[1].hist(counts[False], bins=80)
            axes[1].set_xlabel("not-pathogenic")
            axes[1].set_xlim(imin, imax)
            # Save before show(): on an interactive backend show() can leave
            # an empty canvas behind, which would then be written to disk.
            fig.savefig(imgname + ".dist.png", bbox_inches='tight')
            plt.show()
            print(metrics(counts[True], counts[False], imgname + ".auc.png",
                          cutoff=cutoff))
            print(mw(counts[True], counts[False]))
            del fig
            plt.close()