# Example: report coding genes whose 3'UTR holds a high-scoring miR-96
# targetScan site, then send the collected gene names to DAVID.
from cruzdb import Genome
from cruzdb.sequence import sequence

# Mirror the needed tables from UCSC to a local sqlite db. The object returned
# by mirror() is what every query below runs against.
local = Genome('hg19').mirror(('refGene', 'targetScanS'),
                              'sqlite:///hg19.mirna.db')

refseq_ids = []  # gene names saved for the later GO analysis

# Iterate over the coding genes in refGene.
for gene in local.refGene:
    if not gene.is_coding:
        continue
    if None in gene.utr3:
        continue  # skip genes with no UTR
    utr_start, utr_end = gene.utr3

    # Query the targetScan miRNA table with the efficient bin query.
    sites = local.bin_query('targetScanS', gene.chrom, utr_start, utr_end)

    # Only report genes whose 3'UTR contains a miR-96 target site with a
    # score > 85.
    if not any("miR-96" in s.name and s.score > 85 for s in sites):
        continue
    refseq_ids.append(gene.name)

    # gene is a python object but its string representation is BED format;
    # the UTR sequence is printed after it on the same line.
    # A single pre-formatted argument keeps the output identical on Python 2
    # and makes the statement valid on Python 3.
    print("%s %s" % (gene, sequence('hg19', gene.chrom, utr_start, utr_end)))

# Open a web browser to show enrichment of the selected genes in DAVID.
Genome.david_go(refseq_ids)
# Print the result of the k=6 upstream query against refGene for `last`
# (`g` and `last` are defined before this fragment).
print(g.upstream("refGene", last, k=6))

# Deliberate ZeroDivisionError: unless something outside this view catches it,
# nothing below this line runs.
1 / 0

seed(1)  # fixed seed, so the random end coordinates repeat between runs
istart = 12345
iend = 386539
qall = list(g.refGene.all())
#while True:
for _ in range(100):
    iend = randrange(istart, 65555555)

    # Timing 1: binned query over chr1:istart-iend.
    t = time.time()
    q = g.bin_query('refGene', 'chr1', istart, iend)
    a = list(q)
    print(len(a))
    print(time.time() - t)
    #"""
    # NOTE(review): the marker above looks like a leftover from toggling this
    # section with a triple-quoted string -- confirm before removing.

    # Timing 2: explicit overlap filter on chrom / txStart / txEnd.
    t = time.time()
    refGene = g.refGene
    rg = refGene.table()
    overlap = (rg.c.chrom == "chr1",
               rg.c.txStart <= iend,
               rg.c.txEnd >= istart)
    # This session-level query is built and then immediately superseded by
    # the refGene.filter() query on the next line.
    q = g.session.query(rg).filter(*overlap)
    q = refGene.filter(*overlap)
    b = list(q)
    print(len(b))
# Annotate the lamina intervals against refGene once; the output is written to
# `fname` and reused whenever that file already exists.
if not op.exists(fname):
    # `with` guarantees the output file is closed even if annotate() raises.
    with open(fname, 'w') as fhout:
        hg18.annotate(lamina(), ('refGene', ), feature_strand=True,
                      in_memory=True, parallel=True, out=fhout)

# For each cutoff, write the gene names of annotated rows whose value is at
# least the cutoff and whose refGene distance is 0.
for cutoff in (0.90, 0.95):
    with open('/tmp/genes-%.2f.txt' % cutoff, 'w') as fh:
        for d in reader(fname):
            if float(d['value']) < cutoff:
                continue
            distance = d['refGene_distance']
            # Either exactly "0" or a ";"-separated list whose first entry
            # is 0.
            if distance == '0' or distance.startswith("0;"):
                # One gene name per line.
                fh.write("\n".join(d['refGene_name'].split(";")) + "\n")

cutoff = 0.90
with open('/tmp/genes-overlap-complete.txt', 'w') as fh:
    # Only rows strictly above the cutoff are considered; the generator does
    # the filtering, so no second value check is needed inside the loop.
    for d in (row for row in reader(lamina()) if float(row['value']) > cutoff):
        start, end = map(int, (d['start'], d['end']))
        res = hg18.bin_query('refGene', d['chrom'], start, end).all()
        if not res:
            continue
        for r in res:
            # genes completely contained within an LAD
            if start <= r.start and end >= r.end:
                fh.write("%s\n" % r.gene_name)
# Deliberate ZeroDivisionError: unless something outside this view catches it,
# nothing below this line runs.
1 / 0

seed(1)  # fixed seed, so the random end coordinates repeat between runs
istart = 12345
iend = 386539
qall = list(g.refGene.all())
#while True:
for _ in range(100):
    iend = randrange(istart, 65555555)

    # Timing 1: binned query over chr1:istart-iend.
    t = time.time()
    q = g.bin_query('refGene', 'chr1', istart, iend)
    a = list(q)
    print(len(a))
    print(time.time() - t)
    #"""
    # NOTE(review): the marker above looks like a leftover from toggling this
    # section with a triple-quoted string -- confirm before removing.

    # Timing 2: explicit overlap filter on chrom / txStart / txEnd.
    t = time.time()
    refGene = g.refGene
    rg = refGene.table()
    overlap = (rg.c.chrom == "chr1",
               rg.c.txStart <= iend,
               rg.c.txEnd >= istart)
    # This session-level query is built and then immediately superseded by
    # the refGene.filter() query on the next line.
    q = g.session.query(rg).filter(*overlap)
    q = refGene.filter(*overlap)
    b = list(q)
# MySQLdb stuff:
#   sudo apt-get install mysql-server
#   sudo apt-get install libmysqlclient-dev  # gives us mysql_config
#   FINALLY download MySQLdb source, do process described in INSTALL file

hg19 = Genome('hg19')

INPUTFILE = "suggestive.pheno_simple.covar_none.test_wald.csv"

# Column indexes of CHROM and POS, filled in from the header row.
chrom_i = None
pos_i = None

# For every data row, print "<chrom> <start> <end>" followed by the names of
# the refGene entries found in a 100 bp window around POS.
# `with` closes the input file once the loop finishes or fails.
with open(INPUTFILE) as infile:
    filereader = csv.reader(infile)
    for i, line in enumerate(filereader):
        if i == 0:
            # CHROM POS REF ALT N_INFORMATIVE Test Beta SE Pvalue PVALUE
            chrom_i = line.index('CHROM')
            pos_i = line.index('POS')
            continue
        if not line:
            continue  # blank row: nothing to look up

        chrom = 'chr' + str(line[chrom_i])
        pos = int(line[pos_i])
        start = pos - 50  # kind of arbitrary search 50 back 50 forward.
        end = pos + 50
        genes = hg19.bin_query('refGene', chrom, start, end)

        ## formatting the output
        basic_str = ' '.join(map(str, [chrom, start, end]))
        # Pad the coordinate column to 30 characters, but always leave at
        # least one space so the gene names never run into the coordinates
        # (a 30-character prefix previously got no separator at all).
        padding = max(1, 30 - len(basic_str))
        # Sorted so the same input always prints the same line; plain set
        # iteration order is arbitrary.
        gene_string = ' '.join(sorted(set(g.name2 for g in genes)))
        print(basic_str + ' ' * padding + gene_string)