def test_dataframe(self):
    """Check the row counts of frames produced by ``Genome.dataframe``.

    Two call forms are exercised: passing a table name, and passing a
    filtered/limited query object built from that table.
    """
    g = Genome('hg18')

    # Table given by name: the frame must have one row per record that
    # count() reports for the same table.
    kg = g.dataframe('cpgIslandExt')
    # assertEqual (rather than the deprecated assert_ alias) reports both
    # values when the check fails.
    self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

    # Query object capped with limit(10): the frame must have 10 rows.
    q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)
    df = g.dataframe(q)
    self.assertEqual(df.shape[0], 10)
new_names.extend(names[1:])
# Single parenthesised expression: prints a blank line followed by the
# column names, with identical output under Python 2 and Python 3.
print("\n" + str(new_names))
vcf_ex.columns = new_names

# If QUAL > 0.5, sample passes
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print(vcf_ex_sub.head())

# Get the Genome object from cruzdb
# connects to MySQL genome browser at UCSC
g = Genome('hg38')

# Convert table 'refGene' to pandas dataframe
# columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s)
print("Extracting reference genome table (HG38) from UCSC Genome Browser")
df = g.dataframe('refGene')
# Cast the transcript bounds to int so they can be compared with POS.
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# One slot per retained variant, initialised to 0.
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))

for i in range(vcf_ex_sub.shape[0]):
    chrom = vcf_ex_sub['CHROM'].iloc[i]
    location = vcf_ex_sub['POS'].iloc[i]
    # Boolean indexing already returns a new frame, so df itself is never
    # modified; copying the whole refGene table on every iteration is
    # unnecessary.
    tmp = df[df.chrom == chrom]
    # NOTE(review): if POS is 1-based (VCF) and txStart is 0-based (UCSC
    # tables), the start bound may need to be strict (<) -- confirm.
    tmp = tmp[tmp['txStart'] <= location]
new_names.extend(names[1:])
# Single parenthesised expression: prints a blank line followed by the
# column names, with identical output under Python 2 and Python 3.
print("\n" + str(new_names))
vcf_ex.columns = new_names

# If QUAL > 0.5, sample passes
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print(vcf_ex_sub.head())

# Get the Genome object from cruzdb
# connects to MySQL genome browser at UCSC
g = Genome('hg38')

# Convert table 'refGene' to pandas dataframe
# columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s)
print("Extracting reference genome table (HG38) from UCSC Genome Browser")
df = g.dataframe('refGene')
# Cast the transcript bounds to int so they can be compared with POS.
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# One slot per retained variant, initialised to 0.
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))

for i in range(vcf_ex_sub.shape[0]):
    chrom = vcf_ex_sub['CHROM'].iloc[i]
    location = vcf_ex_sub['POS'].iloc[i]
    # Boolean indexing already returns a new frame, so df itself is never
    # modified; copying the whole refGene table on every iteration is
    # unnecessary.
    tmp = df[df.chrom == chrom]
    # Keep only transcripts whose [txStart, txEnd] span contains the variant.
    # NOTE(review): if POS is 1-based (VCF) and txStart is 0-based (UCSC
    # tables), the start bound may need to be strict (<) -- confirm.
    tmp = tmp[tmp['txStart'] <= location]
    tmp = tmp[tmp['txEnd'] >= location]