Example #1
0
def gerprunner():

    import pyBigWig

    b = pyBigWig.open("/scratch/ucgd/lustre/u1021864/serial/hg19.gerp.bw")
   # x = list(range(1,23)); x.append("X"), x.append("Y")

    input = sys.argv[1]
    iterator = JimFile(input)
    iterable = windower(iterator, chunker(1))
    cutoff = 1e-3

    def genchunks():
        nsmall = 0
        for i, chunk in enumerate(iterable):
            #if len(chunk) < 5:
            #    continue
            score = b.stats("chr"+chunk[0].chrom, chunk[0].start, chunk[-1].end)
            yield chunk, score[0]
            if i % 100000 == 0:
                print i, chunk[0].chrom, chunk[0].start, score
        print >>sys.stderr, nsmall, "removed for being too short"
        print >>sys.stderr, i, "total chunks"

    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    res = eval2(genchunks(), vcf_path,
        "/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz")
    print metrics(res[True], res[False], "gerp.auc.png")
Example #2
0
def uptonrunner():

    input = "/scratch/ucgd/lustre/u1021864/serial/y.sort.bed.gz"
    iterator = JimFile(input)
    iterable = windower(iterator, chunker(20))
    cutoff = 1e-3

    def genchunks():
        nsmall = 0
        for i, chunk in enumerate(iterable):
            if i % 100000 == 0:
                print i, chunk[0].chrom, chunk[0].start
            if len(chunk) < 5:
                continue
            mafs = (float(x.mafs) for x in chunk)
            score = sum(1.0 - m for m in mafs if m < cutoff) / float(len(chunk))
            if score == 1:
                nsmall += 1
                continue
            yield chunk, score
        print >>sys.stderr, nsmall, "removed for being too short"
        print >>sys.stderr, i, "total chunks"

    # NOTE: these are for humvar only. not neede for clinvar.
    def is_pathogenic(d):
        return d['class'] == "deleterious"
    def not_pathogenic(d):
        return d['class'] == "neutral"

    eval_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    #res = evaldoms(genchunks(), eval_path, is_pathogenic=is_pathogenic, not_pathogenic=not_pathogenic)
    res = eval2(genchunks(), eval_path,
        "esp-vcommon.vcf.gz")
        #"/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz")
    print metrics(res[True], res[False], "upton-esp.auc.png")
Example #3
0
def rvistest():
    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    bed = "rvis.bed"

    def genregions():
        for d in ts.reader("rvis.bed"):
            score = float(d['pct'])
            chunk = [interval(d['chrom'], int(d['start']), int(d['end']))]
            yield chunk, -score

    res = evaldoms(genregions(), vcf_path)
    print metrics(res[True], res[False], "x.auc.png")
Example #4
0
def example3():
    import toolshed as ts
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns
    from scipy.stats import mannwhitneyu as mw
    import numpy as np

    iterator = JimFile(args.input, args.regions)
    #it = ts.reader(args.input) #'/scratch/ucgd/serial/quinlan_lab/data/u1021864/regionsmafsdnds.bed.gz'
    #iterable = (Interval(**iv) for iv in it)

    results = defaultdict(lambda : defaultdict(list))
    ms = defaultdict(list)
    ff = args.genome
    cpg_cutoff = {}
    maf_cutoff = float(args.maf) if args.maf else 1e-05
    start = 0
    end = .2
    step = .025
    j = start
    #for i in frange(start, end, step):
    #    cpg_cutoff[str(j)+"-"+str(i)] = (j, i)
    #    j = i
    #cpg_cutoff['0.2-1'] = (.2, 1)
    cpg_cutoff['0-1'] = (0, 1)

    base = []
    cons = []
    genes = None
    #genes = Fasta(ff)
    if args.regions == "chunks":
        regioner = smallchunk
        chunksize = args.regionsize
    if args.regions in ["domains", "nodoms", "all"]:
        regioner = byregiondist
        chunksize = ""
    if args.regions == "genes":
        regioner = bytranscriptdist
        chunksize = ""
    y = list(windower(iterator, regioner, chunksize))
    comparison = args.comparison
    if args.exclude:
        exclude = args.exclude
        ex = "ex" + args.exclude + "."
    else:
        exclude = None
        ex = ""
    cv = []
    if args.conservation:
        for r in ts.reader(args.conservation):
            v = get_conservation(r)
            cv.append(v)
    cpg=1
    if y:
        for iv in y: # iterable, size_grouper(1)
            #cpg = CpG(iv, genes = genes)
            b = baseline(iv, maf_cutoff = maf_cutoff, exclude = exclude, comparison = comparison, patt = patt)
            ms['baseline'].append((iv,b[3]/b[4],cpg))
            base.append(b)
    count = 0.0
    totlen = 0.0
    if base:
        for b in base:
            count += b[3]
            totlen += b[4]
        baserate = count/totlen
    for iv, b in zip(y, base):
        u = upton(b, baserate)
        c = constraint(iv, maf_cutoff = maf_cutoff, genes = genes, upton = u)
        r = RVIS(iv, maf_cutoff = 1e-3, patt = patt)
        ct = (iv,
               c,
               cpg)
        if c != 0:
            ms['nzconstraint'].append(ct)
        ms['constraint'].append(ct)
        ct = (iv,
                u,
                cpg)
        ms['upton'].append((ct[0],ct[1][3],ct[2]))
        ct = (iv,
                r,
                cpg)
        ms['rvis'].append((ct[0],ct[1],ct[2]))
        cons.append((u[0],u[1],u[2],c))
       # results['iafi'].append((iv, IAFI_inline(iv, n_samples=61000)))
       # results['frv'].append((iv, FRV_inline(iv, maf_cutoff=maf_cutoff)))
       # results['count_nons'].append((iv, count_nons(iv)))
        # TODO: jim add a lot more metrics here... e.g.:
    bedname = "."+ rtz(maf_cutoff) + "." + comparison + "." + args.regions + str(chunksize) + "." + ex
    f1 = open("constraint" + bedname + ".bed","w")
    f2 = open("baseline" + bedname + ".bed","w")
    for b,c in zip(base,cons):
        f1.write("\t".join(map(str,c))+"\n")
        f2.write("\t".join(map(str,b))+"\n")
    f1.close()
    f2.close()

    cutoffs = set()
    for cutoff in cpg_cutoff:
        co = str(cpg_cutoff[cutoff][0])+'-'+str(cpg_cutoff[cutoff][1])
        cutoffs.add(co)
        for metric in ms:
            for ct in ms[metric]:
                if ct[2] >= cpg_cutoff[cutoff][0] and ct[2] <= cpg_cutoff[cutoff][1]:
                    results[metric][co].append(ct)

    option = args.truetype
    trusrc = ""
    if option == "clinvar" or option == "c":
        func = clinvar
        trusrc = "clinvar"
    if option == "pli" or option == "p":
        func = pli
        trusrc = "pli"
    for metric in results:
        for cutoff in cutoffs:
            imgname = metric + "." + trusrc + "." + comparison + "." + args.regions + str(chunksize) + "." + ex + cutoff + "." + rtz(maf_cutoff)
            print metric, cutoff
            fig, axes = plt.subplots(2)
            fig.tight_layout()
            counts = evaldoms(results[metric][cutoff],
                    args.pathogenic, # forweb_cleaned_exac_r03_march16_z_data_pLI.txt from ExAC ftp or clinvar_20150305.tidy.vcf.gz from clinvar src
                    func)
            imin, imax = np.percentile(counts[True] + counts[False], [0.01, 99.99])
            axes[0].hist(counts[True], bins=80) #,label = cutoff)
            axes[0].set_xlabel("pathogenic")
            axes[0].set_xlim(imin, imax)
            props = dict(boxstyle = 'round', facecolor = 'whitesmoke', alpha = 0.5)
            axes[0].text(.875, .8, "CpG frac:\n" + cutoff.replace("-"," - "), transform = axes[0].transAxes, bbox = props)
            #axes[0].legend(loc = 1, frameon = True)
            axes[1].hist(counts[False], bins=80)
            axes[1].set_xlabel("not-pathogenic")
            axes[1].set_xlim(imin, imax)
            plt.show()
            plt.savefig(imgname + ".dist.png", bbox_inches = 'tight')
            print metrics(counts[True], counts[False], imgname + ".auc.png", cutoff = cutoff)
            print mw(counts[True], counts[False])
            del fig
            plt.close()