def Main():
    """Print ``args.num`` random regions as tab-separated BED6 lines.

    Chromosome names and sizes come from the ``chromInfo`` table of the
    genome named by ``args.genome``; chromosomes whose name contains
    "random" are ignored.  For every region a chromosome is picked with
    ``random_chr``, a length is drawn from ``gauss(args.mean, args.sd)``
    and a start position and strand are chosen at random.  Regions go to
    stdout, progress messages to stderr.

    Raises:
        ValueError: if no valid region length could be drawn after many
            consecutive attempts (e.g. the mean is far larger than any
            chromosome).
    """
    args = ArgParse()
    # initiate genome object from UCSC genome browser
    chromInfo = Genome(db=args.genome).chromInfo
    chroms = {}
    sum_size = 0
    sum_size_list = []
    # store chromInfo
    for i in range(chromInfo.count()):
        try:
            record = chromInfo[i]  # fetch the row once instead of per field
            if "random" in record.chrom:
                continue
            chroms[record.chrom] = record.size
            sum_size += record.size
            sum_size_list.append((record.chrom, sum_size))
        except Exception:
            # Best effort, as before: stop reading on the first failing row,
            # but no longer swallow KeyboardInterrupt/SystemExit.
            break

    if not sum_size_list:
        sys.exit("No usable chromosomes found for genome %r" % (args.genome,))

    sys.stderr.write("%s\n" % (sum_size_list[-1][1],))
    sys.stderr.write("%s\n" % (chroms,))
    sys.stderr.write("Chromosome information readed, %d chromosomes\n"
                     % (len(chroms)))

    max_failed_draws = 1000
    failed_draws = 0
    i = 0
    while i < args.num:
        # randomly select one chromosome and a region in this chromosome
        chrom = random_chr(sum_size_list)
        size = chroms[chrom]
        length = int(gauss(args.mean, args.sd))
        # A gaussian draw can be <= 0 or longer than the chromosome; both
        # would give an invalid interval or make randrange() raise, so redraw.
        if length < 1 or size - length - 1 <= 1:
            failed_draws += 1
            if failed_draws >= max_failed_draws:
                raise ValueError(
                    "could not draw a region length that fits a chromosome "
                    "(mean=%s, sd=%s)" % (args.mean, args.sd))
            continue
        failed_draws = 0
        start = randrange(1, size - length - 1)
        end = start + length - 1
        strand = choice(['+', '-'])
        print("\t".join(str(f) for f in [chrom, start, end, i + 1, 0, strand]))
        i = i + 1
def Main():
    """Print ``args.num`` random regions as tab-separated BED6 lines.

    Chromosome names and sizes come from the ``chromInfo`` table of the
    genome named by ``args.genome``; chromosomes whose name contains
    "random" are ignored.  For every region a chromosome is picked with
    ``random_chr``, a length is drawn from ``gauss(args.mean, args.sd)``
    and a start position and strand are chosen at random.  Regions go to
    stdout, progress messages to stderr.

    Raises:
        ValueError: if no valid region length could be drawn after many
            consecutive attempts (e.g. the mean is far larger than any
            chromosome).
    """
    args = ArgParse()
    # initiate genome object from UCSC genome browser
    chromInfo = Genome(db=args.genome).chromInfo
    chroms = {}
    sum_size = 0
    sum_size_list = []
    # store chromInfo
    for i in range(chromInfo.count()):
        try:
            record = chromInfo[i]  # fetch the row once instead of per field
            if "random" in record.chrom:
                continue
            chroms[record.chrom] = record.size
            sum_size += record.size
            sum_size_list.append((record.chrom, sum_size))
        except Exception:
            # Best effort, as before: stop reading on the first failing row,
            # but no longer swallow KeyboardInterrupt/SystemExit.
            break

    if not sum_size_list:
        sys.exit("No usable chromosomes found for genome %r" % (args.genome,))

    sys.stderr.write("%s\n" % (sum_size_list[-1][1],))
    sys.stderr.write("%s\n" % (chroms,))
    sys.stderr.write("Chromosome information readed, %d chromosomes\n"
                     % (len(chroms)))

    max_failed_draws = 1000
    failed_draws = 0
    i = 0
    while i < args.num:
        # randomly select one chromosome and a region in this chromosome
        chrom = random_chr(sum_size_list)
        size = chroms[chrom]
        length = int(gauss(args.mean, args.sd))
        # A gaussian draw can be <= 0 or longer than the chromosome; both
        # would give an invalid interval or make randrange() raise, so redraw.
        if length < 1 or size - length - 1 <= 1:
            failed_draws += 1
            if failed_draws >= max_failed_draws:
                raise ValueError(
                    "could not draw a region length that fits a chromosome "
                    "(mean=%s, sd=%s)" % (args.mean, args.sd))
            continue
        failed_draws = 0
        start = randrange(1, size - length - 1)
        end = start + length - 1
        strand = choice(['+', '-'])
        print("\t".join(str(f) for f in [chrom, start, end, i + 1, 0, strand]))
        i = i + 1
def test_dataframe(self):
    """``Genome.dataframe`` accepts both a table name and a query.

    The frame built from the table name must have as many rows as the
    table's ``count()``; the frame built from a query limited to 10 rows
    must have exactly 10 rows.
    """
    g = Genome('hg18')
    kg = g.dataframe('cpgIslandExt')
    # assertEqual replaces the deprecated assert_ alias and reports both
    # values when the check fails.
    self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

    q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)
    df = g.dataframe(q)
    self.assertEqual(df.shape[0], 10)
def test_bed_gene_pred(self):
    """``Genome.save_bed`` writes 12-column BED rows for a knownGene query.

    The query selects genes with ``txStart > 10000`` and ``txEnd < 20000``;
    every written row must have 12 tab-separated fields and respect those
    bounds in its start/end columns.
    """
    g = Genome('hg19')
    from sqlalchemy import and_
    try:
        from cStringIO import StringIO
    except ImportError:  # cStringIO does not exist on Python 3
        from io import StringIO

    query = g.knownGene.filter(and_(g.knownGene.txStart > 10000,
                                    g.knownGene.txEnd < 20000))
    c = StringIO()
    Genome.save_bed(query, c)
    c.seek(0)
    rows = c.readlines()
    for row in rows:
        toks = row.split("\t")
        # specific assertions replace the deprecated assert_ alias
        self.assertEqual(len(toks), 12)
        self.assertGreater(int(toks[1]), 10000)
        self.assertLess(int(toks[2]), 20000)
def test_bed_gene_pred(self):
    """``Genome.save_bed`` writes 12-column BED rows for a knownGene query.

    Uses the local hg19 database.  The query selects genes with
    ``txStart > 10000`` and ``txEnd < 20000``; every written row must have
    12 tab-separated fields and respect those bounds in its start/end
    columns.
    """
    g = Genome('hg19', host="localhost", user="******")
    from sqlalchemy import and_
    try:
        from cStringIO import StringIO
    except ImportError:  # cStringIO does not exist on Python 3
        from io import StringIO

    known_gene = g.table('knownGene')
    query = g.knownGene.filter(and_(known_gene.c.txStart > 10000,
                                    known_gene.c.txEnd < 20000))
    c = StringIO()
    Genome.save_bed(query, c)
    c.seek(0)
    rows = c.readlines()
    for row in rows:
        toks = row.split("\t")
        # specific assertions replace the deprecated assert_ alias
        self.assertEqual(len(toks), 12)
        self.assertGreater(int(toks[1]), 10000)
        self.assertLess(int(toks[2]), 20000)
def test_mirror(self):
    """Mirroring ``chromInfo`` to sqlite yields the same first record.

    The sqlite file is removed before the test (a stale copy may exist)
    and always removed afterwards, even when an assertion fails.
    """
    db_path = '/tmp/__u.db'
    db_url = 'sqlite:///' + db_path
    try:
        os.unlink(db_path)
    except OSError:
        pass  # nothing left over from a previous run

    try:
        g = Genome('hg18')
        g.mirror(['chromInfo'], db_url)
        a = str(g.chromInfo.filter().first())

        gs = Genome(db_url)
        b = str(gs.chromInfo.filter().first())
        self.assertEqual(a, b)
    finally:
        # Clean up on failure too, so a broken run cannot leave a file
        # behind for the next one.
        if os.path.exists(db_path):
            os.unlink(db_path)
def test_blat(self):
    """BLAT results for a small chr6 interval mention its start and end.

    Skipped (rather than silently passed) when ``requests`` is not
    installed.
    """
    try:
        import requests  # noqa: F401 -- only checking availability
    except ImportError:
        self.skipTest("requests is not installed")

    g = Genome('hg18')
    f = g.refGene[19]
    # Re-point the fetched feature at a fixed 30 bp interval on chr6.
    f.chrom = "chr6"
    f.txStart = 135646802
    f.txEnd = 135646832
    r = list(f.blat())
    self.assertIn(str(f.txStart), repr(r))
    self.assertIn(str(f.txEnd), repr(r))
def mirror(genome, tables, connection_string):
    """Copy `tables` from `genome` into the database at `connection_string`.

    Each table is created at the destination (if creation raises
    ``OperationalError`` the error is ignored), its rows are copied in
    batches, and finally the row counts of source and mirror are compared;
    mismatches are reported on stderr.

    Args:
        genome: source ``Genome`` whose attributes expose the tables.
        tables: iterable of table names to copy.
        connection_string: database URL of the destination.

    Returns:
        A new ``Genome`` connected to `connection_string`.
    """
    destination, dengine = make_session(connection_string)
    dmeta = MetaData(bind=dengine)

    orig_counts = []
    for table_name in tables:
        # cause it to be mapped
        table = getattr(genome, table_name)._table
        print(('Mirroring', table_name), file=sys.stderr)

        table = set_table(genome, table, table_name, connection_string, dmeta)
        try:
            table.create(dengine)
        except sqlalchemy.exc.OperationalError:
            # presumably the table already exists at the destination
            pass
        destination.commit()

        ins = table.insert()
        columns = list(table.columns.keys())
        records = []
        table_obj = getattr(genome, table_name)._table
        t = getattr(genome, table_name)
        for ii, record in enumerate(page_query(table_obj.select(), t.session)):
            data = dict((str(column), getattr(record, column))
                        for column in columns)
            records.append(data)
            # flush a batch every 20000 records to bound memory use
            if ii % 20000 == 0 and ii > 0:
                destination.execute(ins, records)
                print(("processing record %i" % ii), file=sys.stderr)
                destination.commit()
                records = []

        # Only flush when something is pending: executing the insert with an
        # empty parameter list is not a no-op.
        if records:
            destination.execute(ins, records)
        destination.commit()
        orig_counts.append(getattr(genome, table_name).count())

    # NOTE(review): the result of this call is not used below; kept as in
    # the original -- confirm before removing.
    destination, dengine = make_session(connection_string)

    from . import Genome
    newg = Genome(connection_string)
    new_counts = [getattr(newg, table_name).count() for table_name in tables]
    for tbl, oc, nc in zip(tables, orig_counts, new_counts):
        if oc != nc:
            print(("ERROR: mirrored table '%s' has %i rows while the "
                   "original had %i" % (tbl, nc, oc)), file=sys.stderr)
    return newg
def test_bins(self):
    """``Genome.bins(12345, 56779)`` returns bins 1, 9, 73 and 585."""
    self.assertEqual(Genome.bins(12345, 56779), {1, 9, 73, 585})
def setUp(self):
    """Create ``Genome`` handles for hg18 (``dba``) and hg19 (``dbb``)."""
    self.dba, self.dbb = Genome('hg18'), Genome('hg19')
def setUp(self):
    """Open hg18 and fetch the first refGene record named MUC5B."""
    hg18 = Genome('hg18')
    self.db = hg18
    muc5b_records = hg18.refGene.filter_by(name2="MUC5B")
    self.gene = muc5b_records.first()
regions[item.chrom][0] = {} regions[item.chrom][0][item.chromEnd] = "+" #regions[item.chrom]["end"]=item.chromEnd #regions[item.chrom]["chrom"]=item.chrom #regions[item.chrom]["start"]=0 regions["chrM"] = {} regions["chrM"][0] = {} regions["chrM"][0][16569] = "+" return (regions) ###MAIN PROGRAM### file = '/Users/mok6/Desktop/Doug_NEW/dataNewII/SP04HU_PyPu' print "start" #get start and end of chromosomes, this is from dbcruz packages, local copy regions = get_regions( Genome( "sqlite:////Users/mok6/Dropbox/LiClipse/MelArray/hg19_c.db").cytoband) print "end load chr boundaries" #load dimer data, second parameter is 0=filtered, 1=not filtered DamagePos = loadDamagePosII(file, "1") print "dimer data loaded" #perform sliding window approach slidingWindowII(regions, DamagePos, file) print "sliding window done"
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    """Run the whole analysis and write every intermediate file.

    Files written, all named from `prefix`: ``.args.txt``, ``.acf.txt``,
    ``.slk.bed.gz`` (plus ``.slk.gc.bed.gz`` when `genome_control`),
    ``.fdr.bed.gz``, ``.regions.bed.gz``, ``.regions-p.bed.gz``,
    ``.regions-t.bed``, ``.manhattan.png`` (when ``cpv.manhattan`` imports)
    and ``.anno.<db>.bed`` (when `db` is given).  Progress is reported on
    stderr.  The process exits when no regions are found.

    Args:
        col_num: p-value column, forwarded to the cpv steps.
        step: lag step; computed from the data (capped at `acf_dist`)
            when None.
        dist: distance argument forwarded to ``peaks.peaks``.
        acf_dist: upper bound of the autocorrelation lags.
        prefix: output prefix; trailing dots are stripped.
        threshold, seed: forwarded to ``peaks.peaks``.
        bed_files: input BED files.
        mlog: forwarded to ``acf.acf``.
        region_filter_p: rows whose column 7 exceeds this are dropped from
            ``.regions-t.bed``.
        region_filter_n: rows whose column 5 is below this are dropped
            (None means 0).
        genome_control: also write genomic-control adjusted p-values and
            use them for the FDR step.
        db: genome name/URL for cruzdb annotation, or None to skip it.
        use_fdr: use the last column (q-values) of the FDR file for peak
            finding, otherwise the second to last.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        import datetime
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        # Progress message belongs on stderr; it used to be written into
        # the ACF file itself.
        print("wrote: %s" % fh.name, file=sys.stderr)
    with open(prefix + ".acf.txt") as acf_fh:
        print("ACF:\n", acf_fh.read(), file=sys.stderr)

    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals),
          file=sys.stderr)
    del opvals

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
          file=sys.stderr)

    if genome_control:
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        try:
            adj = genome_control_adjust(
                [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
            # NOTE(review): this enumerates every line of the file, header
            # included, while `adj` comes from bediter -- confirm the two
            # stay aligned.
            for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
                print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]), file=fhslk)
        finally:
            fhslk.close()
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
    print("wrote: %s" % fh.name, file=sys.stderr)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2,
                step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)"
              % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                               p_col_name=col_num)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue
                N += 1
            # Rows go into the regions-t file; they used to be sent to
            # stderr, which left the file empty for the steps below.
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
               "< %.3f and n-probes >= %i: %i)")
              % (fh.name, region_filter_p, region_filter_n, N),
              file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db), file=sys.stderr)
# Directory layout of the project and of this test run.
DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab"
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5"
DIR_DATA = DIR_CURR + "/TestData"
DIR_RESULTS = DIR_CURR + "/TestResults"
# Alternative result directories used by earlier runs:
# DIR_RESULTS = DIR_CURR + "/Results/160729_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160731_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160909_Analyze_KF"
DIR_FUSION = DIR_PROJ + "/160510_GeneFusions"

print("------------ TDD: 161002_IsoformFusion_1.py ------------")

# Register the kinase gene annotation file.
kinase_annotations = DIR_FUSION + "/Data/160910_KinaseAnnots_hg38_Final.txt"
KinaseFusion.set_kinasefile(kinase_annotations)

# Share one cruzdb Genome database instance with the Isoform class.
obj_cruzdb = Genome('sqlite:////tmp/hg38_v2.db')
Isoform.set_cruzdb(obj_cruzdb)

# CASE: This returns "None" for the kinase domain for the kinase gene
# (TLK2 - NM_001284363)
# Fusion - ASIC2:TLK2
hash_multi_isoform = dict(
    orientation='fr',
    chrom_start='chr17',
    chrom_end='chr17',
    pos_start=34038904,
    pos_end=62565136,
    read_span=5,
    read_matepair=5,
    read_matepair_break=5,
)
# MIF = MultiIsoform Fusion instance
obj_mif = MultiIsoformFusion(hash_multi_isoform)
import os
import os.path as op
from toolshed import reader
from cruzdb import Genome


def lamina():
    """Return the path of 'lamina.bed', downloading it on first use.

    The file is built from columns 0, 3, 4 and 5 of the remote table.  It
    is written under a temporary name and renamed when complete, so an
    interrupted download cannot leave a partial 'lamina.bed' that later
    runs would accept as finished.
    """
    if not op.exists('lamina.bed'):
        tmp_name = 'lamina.bed.tmp'
        with open(tmp_name, 'w') as fh:
            fh.write("#chrom\tstart\tend\tvalue\n")
            for gff in reader('http://www.nature.com/nature/journal/v453/n7197/extref/nature06947-s2.txt', header=False):
                fh.write("\t".join([gff[0], gff[3], gff[4], gff[5]]) + "\n")
        os.rename(tmp_name, 'lamina.bed')
    return 'lamina.bed'


fname = 'supplement/Additional-File-11_lamina.anno.bed'
hg18 = Genome('sqlite:///hg18.db')
if not op.exists(fname):
    # Same write-then-rename scheme: the annotated file is only visible
    # under its final name once annotation has finished.
    tmp_fname = fname + '.tmp'
    with open(tmp_fname, 'w') as fhout:
        hg18.annotate(lamina(), ('refGene', ), feature_strand=True,
                      in_memory=True, parallel=True, out=fhout)
    os.rename(tmp_fname, fname)

# For each cutoff, list the genes overlapping (distance 0) a region whose
# value reaches the cutoff, one gene name per line.
for cutoff in (0.90, 0.95):
    with open('/tmp/genes-%.2f.txt' % cutoff, 'w') as fh:
        for d in reader(fname):
            if float(d['value']) < cutoff:
                continue
            if d['refGene_distance'] == '0' or \
                    d['refGene_distance'].startswith("0;"):
                fh.write("\n".join(d['refGene_name'].split(";")) + "\n")

cutoff = 0.90
submitters='1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL,BUSHMAN,COMPLETE_GENOMICS,DDI,ENSEMBL,EVA-GONL,EVA_DECODE,EVA_GENOME_DK,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,GMI,HAMMER_LAB,HGSV,HUMANGENOME_JCVI,ILLUMINA-UK,JMKIDD_LAB,PJP,SSAHASNP,SSMP,TISHKOFF,WEILL_CORNELL_DGM,', alleleFreqCount=2, alleles='C,T,', alleleNs='2634.000000,2374.000000,', alleleFreqs='0.525958,0.474042,', bitfields=set(['maf-5-all-pops', 'maf-5-some-pop']) ) ''' # snps reader = TsvReader(snpfile, cnames=False) snplist = list(set(r[snpcol] for r in reader)) reader.close() from cruzdb import Genome g = Genome(genome) outfiletmp = outfile + '.tmp' writer = TsvWriter(outfiletmp) for i in range(0, len(snplist), 1000): chunk = snplist[i:i + 1000] sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format( dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk)) result = g.sql(sql) for r in result: allfreqs = dict(zip(r.alleles.split(','), r.alleleFreqs.split(','))) reffreq = allfreqs.get(r.refUCSC, '0') if r.refUCSC in allfreqs: del allfreqs[r.refUCSC] if '' in allfreqs: del allfreqs['']
for ii, record in enumerate(page_query(table_obj.select(), t.session)): data = dict( (str(column), getattr(record, column)) for column in columns) records.append(data) if ii % 20000 == 0 and ii > 0: destination.execute(ins, records) print >> sys.stderr, "processing record %i" % ii destination.commit() records = [] destination.execute(ins, records) destination.commit() orig_counts.append(getattr(genome, table_name).count()) destination, dengine = make_session(connection_string) from . import Genome newg = Genome(connection_string) new_counts = [getattr(newg, table_name).count() for table_name in tables] for tbl, oc, nc in zip(tables, orig_counts, new_counts): if oc != nc: print >> sys.stderr, "ERROR: mirrored table '%s' has %i \ rows while the original had %i" % (tbl, nc, oc) return newg if __name__ == "__main__": if True: from cruzdb import Genome g = Genome('hg18') mirror(g, ['chromInfo'], 'sqlite:////tmp/u.db')
{% if args.header %} skip = {{args.skip}} + 1 {% else %} skip = {{args.skip}} {% endif %} # get the snps delimit = {{args.delimit | quote}} comment = {{args.comment | quote}} col = {{args.col}} with open({{i.snpfile | quote}}) as f: snps = sorted(set([line.split(delimit)[col] for i, line in enumerate(f.read().splitlines()) if i >= skip and line.strip() and not line.startswith(comment)])) genome = {{args.genome | quote}} g = Genome (db=genome) dbsnp = g.snp{{args.dbsnpver}} fout = open ("{{o.outfile}}", "w") for snp in snps: s = dbsnp.filter_by(name=snp).first() if not s: sys.stderr.write('pyppl.log.warning: Cannot find coordinates for SNP: %s\n' % snp) else: # chr start end name score strand otherinfo chrom = s.chrom start = s.chromStart end = s.chromEnd name = snp strand = s.strand ref = s.refUCSC
print vcf_ex.head() # Rename columns to fix the syntax in chromosome number names = vcf_ex.columns.values new_names = ['CHROM'] new_names.extend(names[1:]) print "\n", new_names vcf_ex.columns = new_names # If QUAL > 0.5, sample passes vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy() print vcf_ex_sub.head() # Get the Genome object from cruzdb # connects to MySQL genome browser at UCSC g = Genome('hg38') # Convert table 'refGene' to pandas dataframe # columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s) print "Extracting reference genome table (HG38) from UCSC Genome Browser" df = g.dataframe('refGene') df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int) genes = pd.Series(np.zeros(vcf_ex_sub.shape[0])) #gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1]) #print vcf_ex_sub.POS.iloc[0] for i in range(0, vcf_ex_sub.shape[0]): #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool() #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(), chrom = vcf_ex_sub['CHROM'].iloc[i] location = vcf_ex_sub['POS'].iloc[i]
import csv from cruzdb import Genome # DEPENDENCIES: # python2 -m pip install --upgrade pip # python2 -m pip install setuptools # first 2 may be already done # python2 -m pip install cruzdb # python2 -m pip install sqlalchemy # MySQLdb stuff: # sudo apt-get install mysql-server # sudo apt-get install libmysqlclient-dev # gives us mysql_config # FINALLY download MySQLdb source, do process described in INSTALL file hg19 = Genome('hg19') INPUTFILE = "suggestive.pheno_simple.covar_none.test_wald.csv" filereader = csv.reader(open(INPUTFILE)) chrom_i = None pos_i = None for i, line in enumerate(filereader): if i == 0: # CHROM POS REF ALT N_INFORMATIVE Test Beta SE Pvalue PVALUE chrom_i = line.index('CHROM') pos_i = line.index('POS') continue chrom = 'chr' + str(line[chrom_i]) pos = int(line[pos_i]) start = pos - 50 # kind of arbitrary search 50 back 50 forward. end = pos + 50
#!/usr/bin/env python
"""Look up dbSNP records for a list of rs ids.

Usage: script BUILD RSID_FILE OUTPUT

BUILD must contain "hg19" (table snp138) or "hg18" (table snp130).
RSID_FILE holds one id per line.  Matching records are written to OUTPUT,
one per line, below a tab-separated header.
"""
from cruzdb import Genome
import sys
import re
import tqdm
import pandas as pd

limit = 1000  # number of ids sent per query
build = sys.argv[1]
fname = sys.argv[2]

# Pick the SNP table before connecting, and fail clearly on an unsupported
# build instead of hitting a NameError on `snpdb` later.
if re.search("hg19", build):
    snp_table = "snp138"
elif re.search("hg18", build):
    snp_table = "snp130"
else:
    sys.exit("unsupported build %r: expected a name containing "
             "'hg19' or 'hg18'" % (build,))

mybuild = Genome(build)
snpdb = getattr(mybuild, snp_table)

with open(sys.argv[3], "w") as f:
    f.write("chrom\tstart\tend\tname\tscore\tstrand\n")
    rs_ids = pd.read_csv(fname, names=["id"])
    for index in tqdm.tqdm(range(0, rs_ids.shape[0], limit)):
        chunk = list(rs_ids["id"][index:index + limit])
        result = snpdb.filter(snpdb.name.in_(chunk)).limit(limit).all()
        if not result:
            # nothing found for this chunk: do not emit a blank line
            continue
        f.write("\n".join(map(str, result)))
        f.write("\n")
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files,
             mlog=False, region_filter_p=1, region_filter_n=1,
             genome_control=False, db=None):
    """Run the whole analysis and write every intermediate file.

    Files written, all named from `prefix`: ``.args.txt``, ``.acf.txt``,
    ``.slk.bed`` (plus ``.slk.gc.bed`` when `genome_control`),
    ``.fdr.bed``, ``.regions.bed``, ``.regions-p.bed``, ``.regions-t.bed``
    (only when the first input file has 't', 'start' and 'end' columns),
    ``.manhattan.png`` (when ``cpv.manhattan`` imports) and
    ``.anno.<db>.bed`` (when `db` is given).  Progress goes to stderr.

    Args:
        col_num: p-value column, forwarded to the cpv steps.
        step: lag step; computed from the data when None.
        dist: upper bound of the autocorrelation lags.
        prefix: output prefix; trailing dots are stripped.
        threshold, seed: forwarded to ``peaks.peaks``.
        bed_files: input BED files.
        mlog: forwarded to ``acf.acf`` and ``region_p.region_p``.
        region_filter_p: rows whose column 7 exceeds this are dropped from
            ``.regions-t.bed``.
        region_filter_n: rows whose column 5 is below this are dropped.
        genome_control: also write genomic-control adjusted p-values and
            use them for the FDR step.
        db: genome name/URL for cruzdb annotation, or None to skip it.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        sys.stderr.write("calculated stepsize as: %i\n" % step)

    # list() so that append also works where range() is lazy
    lags = list(range(1, dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    #if genome_control:
    #    with open(prefix + ".adj.bed", "w") as fh:
    #        genome_control_adjust_bed(bed_files, col_num, fh)
    #    bed_files = [fh.name]

    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        import datetime
        fh.write("date: %s\n" % datetime.datetime.today())

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        sys.stderr.write("wrote: %s\n" % fh.name)
    with open(prefix + ".acf.txt") as acf_fh:
        sys.stderr.write("ACF:\n%s\n" % acf_fh.read())

    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    sys.stderr.write("# original lambda: %.2f\n" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    sys.stderr.write("wrote: %s with lambda: %.2f\n"
                     % (fhslk.name, gc_lambda))

    if genome_control:
        fhslk = open(prefix + ".slk.gc.bed", "w")
        try:
            adj = genome_control_adjust(
                [d['p'] for d in bediter(prefix + ".slk.bed", -1)])
            with open(prefix + ".slk.bed") as slk_in:
                for i, line in enumerate(slk_in):
                    fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        finally:
            fhslk.close()
        sys.stderr.write("wrote: %s\n" % fhslk.name)

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
    sys.stderr.write("wrote: %s\n" % fh.name)

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
                         step, fh, operator.le))
    with open(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    sys.stderr.write("wrote: %s (%i regions)\n" % (fregions, n_regions))

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed",
                prefix + ".regions.bed", -2,
                0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        sys.stderr.write("wrote: %s, (regions with corrected-p < 0.05: %i)\n"
                         % (fh.name, N))

    regions_bed = fh.name

    # Read the column names of the first input file.  The line ending is
    # stripped first; otherwise the last column name keeps its newline and
    # is never recognised.
    opener = gzip.open if bed_files[0].endswith(".gz") else open
    header_fh = opener(bed_files[0])
    try:
        first_line = next(header_fh)
    finally:
        header_fh.close()
    if not isinstance(first_line, str):
        # gzip handles yield bytes where str is a unicode type
        first_line = first_line.decode()
    header = first_line.rstrip("\r\n").split("\t")

    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0],
                                                   regions_bed,
                                                   p_col_name=col_num)):
                if i == 0:
                    toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p:
                        continue
                    if int(toks[4]) < region_filter_n:
                        continue
                    N += 1
                fh.write("\t".join(toks) + "\n")
            sys.stderr.write(("wrote: %s, (regions with region-p "
                              "< %.3f and n-probes >= %i: %i)\n")
                             % (fh.name, region_filter_p, region_filter_n, N))

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=True)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"),
                       out=fh, feature_strand=True,
                       parallel=len(spvals) > 500)
        sys.stderr.write("wrote: %s annotated with %s\n" % (fh.name, db))
def getSnpInfo(x):
    """Return the first hg19 ``snp151`` record named `x`.

    The result is whatever ``first()`` yields for the filtered query.
    """
    from cruzdb import Genome

    return Genome(db="hg19").snp151.filter_by(name=x).first()
def setUp(self):
    """Attach an hg18 ``Genome`` to the test case as ``self.db``."""
    hg18 = Genome('hg18')
    self.db = hg18
def snpinfo(infile, outfile=None, notfound='ignore', genome='hg19',
            dbsnpver="150", inopts=None, outopts=None, snpcol=None,
            cachedir=gettempdir()):
    """Look up dbSNP records for the SNP names listed in `infile`.

    Records are first looked up in a local cache database under
    `cachedir`; names missing there are queried one by one from the
    ``snp<dbsnpver>`` table of `genome` and then added to the cache.

    Args:
        infile: tab-separated file holding the SNP names.
        outfile: optional path; when given, every found record is also
            written there.
        notfound: 'error' raises on a missing SNP, 'skip' drops it
            silently, anything else reports it on stderr and drops it.
        genome: genome database name.
        dbsnpver: dbSNP version, used in the table and cache file names.
        inopts: reader options overriding skip/comment/delimit defaults.
        outopts: writer options overriding the defaults below.
        snpcol: column holding the SNP names; defaults to the first one.
        cachedir: directory of the cache database.

    Returns:
        dict mapping SNP name to its record (cached or freshly queried).

    Raises:
        RecordNotFound: if a SNP is missing and `notfound` is 'error'.
    """
    _inopts = Box(skip=0, comment='#', delimit='\t')
    _inopts.update(inopts or {})
    inopts = _inopts

    _outopts = Box(delimit='\t', headDelimit='\t', headPrefix='',
                   headTransform=None, head=True, ftype='bed',
                   cnames='refUCSC, alleles, alleleFreqs, alleleFreqCount')
    # `outopts` defaults to None; update(None) would raise TypeError.
    _outopts.update(outopts or {})
    outopts = _outopts
    cnames = alwaysList(outopts['cnames'])

    reader = TsvReader(infile, **inopts)
    if not reader.meta:
        reader.autoMeta()
    # list(...) so that indexing also works when keys() is a view
    snpcol = snpcol or list(reader.meta.keys())[0]
    snps = list(set([r[snpcol] for r in reader]))
    reader.rewind()

    dbfile = path.join(cachedir, 'snpinfo_%s_%s.db' % (genome, dbsnpver))
    schema = {
        'chrom': 'text',             # chr8
        'chromStart': 'int',         # 128700232L
        'chromEnd': 'int',           # 128700233L  (was the typo 'end')
        'name': 'text primary key',  # rs7005394
        'score': 'real',             # 0
        'strand': 'text',            # +
        'refNCBI': 'text',           # T
        'refUCSC': 'text',           # T
        'observed': 'text',          # C/T
        'class': 'text',             # single  (was the typo 'single')
        'avHet': 'real',             # 0.49
        'avHetSE': 'real',           # 0.02
        'func': 'text',              # set(['ncRNA'])
        'submitterCount': 'int',     # 20
        'submitters': 'text',        # 1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL ...
        'alleleFreqCount': 'int',    # 2
        'alleles': 'text',           # C,T,
        'alleleNs': 'text',          # 2634.000000,2374.000000,
        'alleleFreqs': 'text',       # 0.525958,0.474042,
    }
    cache = Cache(dbfile, 'snpinfo', schema, 'name')
    # 'func' holds several values per record; they are stored as one
    # ' // '-separated string.
    dummies = {
        'func': dict(
            query=Cache.DUMMY['array']['query'],
            find=Cache.DUMMY['array']['find'],
            insert=lambda col, data: (
                col,
                ' // ' + ' // '.join(Cache._uniqueData(list(data), True))),
            # the original referenced an undefined name `d` here
            update=lambda col, data: (
                col,
                Function.concat(
                    Field(col),
                    value=' // ' + ' // '.join(
                        Cache._uniqueData(list(data), True)))),
            result=lambda data: data if isinstance(data, list) else list(
                filter(None, data.split(' // '))))
    }
    columns = ['chrom', 'name', 'chromStart', 'chromEnd', 'score',
               'strand'] + cnames
    ret, allrest = cache.query(columns, {'name': snps}, dummies)

    dbsnp = Genome(db=genome)
    dbsnp = getattr(dbsnp, "snp%s" % dbsnpver)

    # NOTE(review): neither `reader` nor `writer` is closed in this
    # function; confirm whether TsvReader/TsvWriter need an explicit close.
    writer = None
    if outfile:
        # The head* options drive writeHead() and are removed before the
        # remaining options are handed to the writer.
        head = outopts['head']
        headPrefix = outopts['headPrefix']
        headDelimit = outopts['headDelimit']
        headTransform = outopts['headTransform']
        del outopts['head']
        del outopts['headPrefix']
        del outopts['headDelimit']
        del outopts['headTransform']
        writer = TsvWriter(outfile, **outopts)
        if head:
            writer.writeHead(prefix=headPrefix, delimit=headDelimit,
                             transform=headTransform)

    if writer:
        for r in ret.values():
            r.CHR = r.chrom
            r.START = r.chromStart
            r.END = r.chromEnd
            r.NAME = r.name
            r.SCORE = r.score
            r.STRAND = r.strand
            writer.write(r)

    cached = []
    if allrest:
        for snp in allrest['name']:
            s = dbsnp.filter_by(name=snp).first()
            if not s:
                if notfound == 'error':
                    raise RecordNotFound('Record not found: %s' % snp)
                elif notfound == 'skip':
                    continue
                else:
                    stderr.write('Record not found: %s \n' % snp)
                    continue

            cached.append(s)
            if writer:
                r = TsvRecord()
                r.CHR = s.chrom
                r.START = s.chromStart
                r.END = s.chromEnd
                r.NAME = s.name
                r.SCORE = s.score
                r.STRAND = s.strand
                for cname in cnames:
                    setattr(r, cname, getattr(s, cname))
                writer.write(r)

    # save cached data, one list of values per schema column
    cachedata = {}
    for c in cached:
        for k in schema.keys():
            if k not in cachedata:
                cachedata[k] = []
            cachedata[k].append(getattr(c, k))
    if cachedata:
        cache.save(cachedata, dummies)

    return {r.name: r for r in list(ret.values()) + cached}
from cruzdb import Genome
import time
from toolshed import nopen
import os

# Time Genome.annotate against three back ends, with and without
# parallelism, and print one tab-separated result line per run.
anno_file = "data_c_constant_early.bed"

# sub-sample to get fewer rows.
subsample_cmd = "|awk 'NR == 1 || NR % 4 == 0'" + \
    " %s > %s.some" % (anno_file, anno_file)
list(nopen(subsample_cmd))
anno_file += ".some"

nlines = sum(1 for _ in nopen(anno_file))

# (label, positional arguments for Genome)
backends = (
    ('local\tsqlite', ('sqlite:///hg18.db',)),
    ('remote\tmysql', ('hg18',)),
    ('local\tmysql', ('hg18', 'brentp', 'localhost')),
)

print("loc\tinstance\tparallel\ttime")
for parallel in (True, False):
    for name, args in backends:
        g = Genome(*args)
        out = "%s-%s.anno.txt" % (name.replace("\t", "-"), parallel)

        t0 = time.time()
        g.annotate(anno_file, ('refGene',), out=out, feature_strand=True,
                   parallel=parallel)
        t1 = time.time()

        elapsed = "%.1f" % (t1 - t0)
        print("\t".join([str(name), str(parallel), str(elapsed)]))
        # the annotated file must have one line per input line
        assert nlines == sum(1 for _ in nopen(out))
        os.unlink(out)
import sys
from cruzdb import Genome

# Print every gene feature of every refGene record of the database named
# on the command line, one tab-separated feature per line.
db = sys.argv[1]
#db = Genome('sqlite:////usr/local/src/cruzdb/%s.db' % db)
db = Genome(db)

refGene = db.refGene
for g in refGene.all():
    for feat in g.gene_features:
        print("\t".join(str(field) for field in feat))
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    """Run the whole analysis and write every intermediate file.

    Files written, all named from `prefix`: ``.args.txt``, ``.acf.txt``,
    ``.slk.bed.gz`` (plus ``.slk.gc.bed.gz`` when `genome_control`),
    ``.fdr.bed.gz``, ``.regions.bed.gz``, ``.regions-p.bed.gz``,
    ``.regions-t.bed``, ``.manhattan.png`` (when ``cpv.manhattan`` imports)
    and ``.anno.<db>.bed`` (when `db` is given).  Progress is reported on
    stderr.  The process exits when no regions are found.

    Args:
        col_num: p-value column, forwarded to the cpv steps.
        step: lag step; computed from the data (capped at `acf_dist`)
            when None.
        dist: distance argument forwarded to ``peaks.peaks``.
        acf_dist: upper bound of the autocorrelation lags.
        prefix: output prefix; trailing dots are stripped.
        threshold, seed: forwarded to ``peaks.peaks``.
        bed_files: input BED files.
        mlog: forwarded to ``acf.acf``.
        region_filter_p: rows whose column 7 exceeds this are dropped from
            ``.regions-t.bed``.
        region_filter_n: rows whose column 5 is below this are dropped
            (None means 0).
        genome_control: also write genomic-control adjusted p-values and
            use them for the FDR step.
        db: genome name/URL for cruzdb annotation, or None to skip it.
        use_fdr: use the last column (q-values) of the FDR file for peak
            finding, otherwise the second to last.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        sys.stderr.write("calculated stepsize as: %i\n" % step)

    # list() so that append also works where range() is lazy
    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        import datetime
        fh.write("date: %s\n" % datetime.datetime.today())
        from .__init__ import __version__
        fh.write("version: %s\n" % (__version__,))

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        sys.stderr.write("wrote: %s\n" % fh.name)
    with open(prefix + ".acf.txt") as acf_fh:
        sys.stderr.write("ACF:\n%s\n" % acf_fh.read())

    spvals, opvals = [], []
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    sys.stderr.write("# original lambda: %.2f\n" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    sys.stderr.write("wrote: %s with lambda: %.2f\n"
                     % (fhslk.name, gc_lambda))

    if genome_control:
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        try:
            adj = genome_control_adjust(
                [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
            # NOTE(review): this enumerates every line of the file, header
            # included, while `adj` comes from bediter -- confirm the two
            # stay aligned.
            for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
                fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        finally:
            fhslk.close()
        sys.stderr.write("wrote: %s\n" % fhslk.name)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
    sys.stderr.write("wrote: %s\n" % fh.name)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    sys.stderr.write("wrote: %s (%i regions)\n" % (fregions, n_regions))
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2,
                step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        sys.stderr.write("wrote: %s, (regions with corrected-p < 0.05: %i)\n"
                         % (fh.name, N))

    regions_bed = fh.name
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                               p_col_name=col_num)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue
                N += 1
            fh.write("\t".join(toks) + "\n")
        sys.stderr.write(("wrote: %s, (regions with region-p "
                          "< %.3f and n-probes >= %i: %i)\n")
                         % (fh.name, region_filter_p, region_filter_n, N))

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        sys.stderr.write("wrote: %s annotated with %s\n" % (fh.name, db))
"""
Manual check of bin queries and nearest queries.

These take a long time to run, so they are not part of the standard
test suite.
"""
import time

from cruzdb import Genome
from random import randrange, seed

#g = Genome('hg18', host='localhost', user='******')
#g.mirror(['refGene'], "sqlite:////tmp/u.db")
g = Genome('sqlite:////tmp/u.db')

# if we choose a huge distance all should have a distance of 0
#assert all(k.dist == 0 for k in g.knearest("refGene", "chr1", 1234, 9915555, k=3))

# upstream query from explicit coordinates
print(g.upstream("refGene", "chr1", 9444, 9555, k=6))

# the refGene record with the largest txStart
by_start_desc = g.refGene.order_by(-g.refGene.table().c.txStart)
last = by_start_desc[0]
print(last)

# move the record past its own end, flip it to the minus strand and run
# the upstream query from the feature itself
last.txStart = 1000 + last.txEnd
last.txEnd = last.txStart + 100
last.strand = "-"
print(last)

print(g.upstream("refGene", last, k=6))
"""
Time Genome.annotate() against several database back-ends.

For every combination of back-end and `parallel` flag the script annotates a
sub-sampled BED file, prints one tab-separated timing row and checks that the
annotated output has as many lines as the input.
"""
import os
import time

# Genome is used below; import it here so the script is self-contained.
from cruzdb import Genome
from toolshed import nopen

anno_file = "data_c_constant_early.bed"

# sub-sample to get fewer rows: keep the header line plus every 4th line.
# nopen() runs the shell pipeline lazily, so list() is what forces awk to run
# and write "<anno_file>.some".
list(
    nopen("|awk 'NR == 1 || NR % 4 == 0'" +
          (" %s > %s.some" % (anno_file, anno_file))))
anno_file += ".some"

# number of lines the annotated output is expected to have.
nlines = sum(1 for _ in nopen(anno_file))

# (label, Genome positional args); the label holds a tab so that it fills the
# "loc" and "instance" columns of the report.
BACKENDS = (
    ('local\tsqlite', ('sqlite:///hg18.db', )),
    ('remote\tmysql', ('hg18', )),
    ('local\tmysql', ('hg18', 'brentp', 'localhost')),
)

print("loc\tinstance\tparallel\ttime")
for parallel in (True, False):
    for name, args in BACKENDS:
        g = Genome(*args)
        out = "%s-%s.anno.txt" % (name.replace("\t", "-"), parallel)

        t0 = time.time()
        g.annotate(anno_file, ('refGene', ),
                   out=out,
                   feature_strand=True,
                   parallel=parallel)
        t1 = time.time()

        print("\t".join(map(str, (name, parallel, ("%.1f" % (t1 - t0))))))

        # every input line must yield exactly one output line.
        assert nlines == sum(1 for _ in nopen(out))
        os.unlink(out)
## Purpose: Extract chrom/pos from rs #
## dbsnp144, build 38
## =================================================
## ************* NOTE ******************************
## This script should be run in virtualenv with these installed:
# pip install six
# pip install cruzdb
# pip install sqlalchemy
# pip install mysql-python
## *************************************************
from cruzdb import Genome
import sys

fname = "variants_rs.csv"

# Both files are managed by `with` so they are closed even if a query fails.
with open('chrom_pos.table', 'w') as outfile, open(fname, "r") as infile:
    # One database connection is reused for every lookup instead of building
    # a new Genome object per variant.
    genome = Genome('hg38')

    for line in infile:
        rs = line.rstrip('\r\n')
        # Only lines that look like rs identifiers are looked up; anything
        # else (headers, blank lines) is ignored.
        if not rs.startswith("rs"):
            continue

        var_info = genome.snp144.filter_by(name=rs).first()
        if var_info is not None:
            # Strip the "chr" prefix and report chromStart + 1.
            outputstring = str(var_info.chrom.split("chr")[1]) + "\t" + str(
                var_info.chromStart + 1) + "\n"
            outfile.write(outputstring)
        else:
            # Identifier not found in snp144: leave a marker line in the
            # output so the gap is visible.
            outfile.write(str("Caution:" + rs + "\n"))
#!/usr/bin/env python2.7
"""Print "chrom:start-end" transcript intervals for a fixed list of genes."""
from cruzdb import Genome

#this takes a list of genes on chromosome 20 and gets their transcript coords
g = Genome(db="hg19")
genes = [
    'AHCY', 'ARFGEF2', 'BMP2', 'DNAJC5', 'EDN3', 'GSS', 'GNAS1', 'JAG1',
    'PANK2', 'PRNP', 'tTG', 'SALL4', 'VAPB'
]

for gene in genes:
    gene_obj = g.refGene.filter_by(name2=gene).first()
    # Symbols with no refGene entry are skipped without output.
    if not gene_obj:
        continue

    #one based intervals
    #http://gatkforums.broadinstitute.org/discussion/1204/what-input-files-does-the-gatk-accept-require
    # UCSC tables store txStart 0-based and txEnd as an exclusive end, so
    # the one-based inclusive interval is txStart + 1 .. txEnd.
    print("{0}:{1}-{2}".format(gene_obj.chrom.replace('chr', ''),
                               gene_obj.txStart + 1, gene_obj.txEnd))
"""
Manual smoke test for bin queries and nearest-feature queries.

These queries take a long time to run, so this script is not part of the
standard test suite and is meant to be run by hand.
"""
import time
from cruzdb import Genome
from random import randrange, seed

# The script reads from a local sqlite mirror. The disabled lines below show
# how that mirror was produced:
#   Genome('hg18', host='localhost', user='******')
#   .mirror(['refGene'], "sqlite:////tmp/u.db")
genome = Genome('sqlite:////tmp/u.db')

# Disabled check: if we choose a huge distance, all hits should have dist 0.
#assert all(k.dist == 0 for k in genome.knearest("refGene", "chr1", 1234, 9915555, k=3))

# Upstream query addressed by explicit chromosome / start / end.
print(genome.upstream("refGene", "chr1", 9444, 9555, k=6))

# Fetch the refGene row that sorts first when ordering by descending txStart.
far_gene = genome.refGene.order_by(-genome.refGene.table().c.txStart)[0]
print(far_gene)

# Move that feature 1000 bases past its own end, make it 100 bases long and
# put it on the minus strand, then show the edited feature.
shifted_start = 1000 + far_gene.txEnd
far_gene.txStart = shifted_start
far_gene.txEnd = shifted_start + 100
far_gene.strand = "-"
print(far_gene)

# Upstream query addressed by a feature object instead of coordinates.
print(genome.upstream("refGene", far_gene, k=6))

# Deliberate stop: this raises ZeroDivisionError, so nothing below this line
# runs until the statement is removed.
1 / 0

seed(1)
#/usr/bin/python import sys from cruzdb import Genome sys.path.insert(0, "/home/mokha/Documents/Krauthammer_Lab/PythonClasses") from SVSv5 import Exon, Isoform, MultiIsoform, SpliceJunction, IsoformSJ print "------------ Algorithm: 160919_Isoform_1.py ------------" """ Reconstruct transcripts based on Splice Junctions """ #assign all splice junctions to specific gene: go through cruzdb & find end points for each gene --> assign # g = Genome( 'sqlite:////tmp/hg19.db' ) g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) #retrieve gene & print information on it based on gene = g.refGene.filter_by(name2='BRAF').all() # all_genes = g.refGene.filter_by( name2 = 'TTN' ).first() # all_genes = g.refGene.filter_by( name2 = 'AGRN' ).all() # all_genes = g.refGene.filter_by( name2 = 'AGRN' ).first() # all_genes = g.refGene.filter_by( name2 = 'DIXDC1' ).all() for each_isoform in gene: obj_iso = Isoform(each_isoform.name) #print name print obj_iso.isoform_id, ":", obj_iso.gene_sym print "obj_iso = ", obj_iso
from cruzdb import Genome from cruzdb.sequence import sequence # mirror the neede tables from UCSC to a local sqlite db local = Genome('hg19').mirror(('refGene', 'targetScanS'), 'sqlite:///hg19.mirna.db') # connect to the newly created local sqlite database instance. refseq_ids = [] # iterate over the coding in refGene for gene in (rgene for rgene in local.refGene if rgene.is_coding): if None in gene.utr3: continue # skip genes with no UTR utr_start, utr_end = gene.utr3 # query the targetScan miRNA table with efficient bin query sites = local.bin_query('targetScanS', gene.chrom, utr_start, utr_end) # print BED file of genes whose 3'UTR contains a miR-96 target site # with a score > 85. if any("miR-96" in s.name and s.score > 85 for s in sites): refseq_ids.append(gene.name) # save the refSeq for later GO analysis # gene is a python object but its string representation is BED format # we also print out the UTR sequence. print gene, sequence('hg19', gene.chrom, utr_start, utr_end) # open a webbrowser to show enrichment of the genes we've selected in DAVID Genome.david_go(refseq_ids)
# NOTE(review): fragment of a longer script. vcf_ex, pd, np and Genome are
# defined or imported above this point, and the loop at the bottom continues
# past the visible end of this chunk.
print vcf_ex.head()

# Rename columns to fix the syntax in chromosome number: the first column is
# renamed to 'CHROM', all other column names are kept.
names = vcf_ex.columns.values
new_names = ['CHROM']
new_names.extend(names[1:])
print "\n", new_names
vcf_ex.columns = new_names

# If QUAL > 0.5, sample passes; keep only CHROM and POS for those rows.
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print vcf_ex_sub.head()

# Get the Genome object from cruzdb
# connects to MySQL genome browser at UCSC
g = Genome('hg38')

# Convert table 'refGene' to pandas dataframe
# columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s)
print "Extracting reference genome table (HG38) from UCSC Genome Browser"
df = g.dataframe('refGene')
# make sure both coordinate columns are integers
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# one zero-initialised slot per retained variant
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))
#gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1])
#print vcf_ex_sub.POS.iloc[0]
for i in range(0, vcf_ex_sub.shape[0]):
    #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool()
    #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(),
    # chromosome of the i-th retained variant
    chrom = vcf_ex_sub['CHROM'].iloc[i]
#!/usr/bin/python
#Script: setup_cruzdb_databases.py
"""Mirror selected UCSC tables for hg19 and hg38 into local sqlite files."""
from cruzdb import Genome

arrTables_hg19 = [
    "refGene", "knownGene", "ensGene", "ccdsKgMap", "knownGeneMrna",
    "kgProtAlias", "knownToEnsembl", "knownToRefSeq",
    "wgEncodeGencodeBasicV19"
]

#NOTE: "ensGene" is not present in hg38, so it is left out of the list below
arrTables_hg38 = [
    "refGene", "knownGene", "ccdsKgMap", "knownGeneMrna", "kgProtAlias",
    "knownToEnsembl", "knownToRefSeq"
]

# db_hg19_path = "sqlite:////tmp/hg19.db"
# db_hg38_path = "sqlite:////tmp/hg38.db"
db_hg19_path = "sqlite:////tmp/hg19_v2.db"
db_hg38_path = "sqlite:////tmp/hg38_v2.db"

# Mirror each build in turn: (UCSC database name, tables, destination URL).
for build, tables, destination in (
        ("hg19", arrTables_hg19, db_hg19_path),
        ("hg38", arrTables_hg38, db_hg38_path),
):
    Genome(db=build).mirror(tables, destination)