def readFiles(self):
    print('Reading Data ...')
    X = None
    y = None
    Xname = None
    if self.fileType == 'plink':
        from pysnptools.snpreader import Bed
        snpreader = Bed(self.fileName + '.bed')
        snpdata = snpreader.read()
        X = snpdata.val
        Xname = snpdata.sid
        # y could also be read with pysnptools:
        # from pysnptools.snpreader import Pheno
        # phenodata = Pheno(self.fileName + ".fam").read()
        # y = phenodata.val[:, -1]
        y = self.famReader(self.fileName + ".fam")
    if self.fileType == 'csv':
        X = np.loadtxt(self.fileName + '.geno.csv', delimiter=',')
        y = np.loadtxt(self.fileName + '.pheno.csv', delimiter=',')
        try:
            Xname = np.loadtxt(self.fileName + '.marker.csv', delimiter=',')
        except IOError:  # no marker file; fall back to generated names
            Xname = ['geno ' + str(i + 1) for i in range(X.shape[1])]
    if self.imputationFlag:
        X = self.imputation(X)
    else:
        X = self.simpleImputation(X)
    keep = ~np.isnan(y)  # boolean mask; the old 'True - np.isnan(y)' form is rejected by modern numpy
    return X[keep, :], y[keep], Xname
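# A minimal, self-contained sketch of the NaN-masking idiom used above
# (modern numpy rejects the old `True - np.isnan(y)` subtraction, hence `~np.isnan(y)`).
# The toy arrays here are made up for illustration.
import numpy as np

y = np.array([1.2, np.nan, 0.7])              # one phenotype value is missing
X = np.arange(6, dtype=float).reshape(3, 2)   # matching genotype rows

keep = ~np.isnan(y)                           # rows with an observed phenotype
X_kept, y_kept = X[keep, :], y[keep]
assert X_kept.shape == (2, 2) and y_kept.shape == (2,)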
def setUpClass(self):
    currentFolder = os.path.dirname(os.path.realpath(__file__))
    self.snp_fn = currentFolder + "/../../tests/datasets/mouse/alldata"
    self.pheno_fn = currentFolder + "/../../tests/datasets/mouse/pheno_10_causals.txt"
    #self.cov_fn = currentFolder + "/examples/toydata.cov"

    # load data
    snp_reader = Bed(self.snp_fn)
    pheno = pstpheno.loadOnePhen(self.pheno_fn)
    #cov = pstpheno.loadPhen(self.cov_fn)

    # intersect sample ids
    snp_reader, pheno = pysnptools.util.intersect_apply([snp_reader, pheno])

    self.G = snp_reader.read(order='C').val
    self.G = stdizer.Unit().standardize(self.G)
    self.G.flags.writeable = False
    self.y = pheno['vals'][:, 0]
    self.y.flags.writeable = False

    # load pcs
    #self.G_cov = cov['vals']
    self.G_cov = np.ones((len(self.y), 1))
    self.G_cov.flags.writeable = False
def setUpClass(self):
    self.currentFolder = os.path.dirname(os.path.realpath(__file__))
    #TODO: get a data set with NaNs!
    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    self.pheno_fn = self.currentFolder + "/examples/toydata.phe"
    self.snpdata = snpreader.read(order='F', force_python_only=True)
    self.snps = self.snpdata.val
def getData(filename):
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")
    X = sFil.read().standardize().val
    y = yFil.read().val[:, mph]
    return [y, sFil]
def test_roundtrip(self):
    max_weight = 2
    snpreader1 = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=True)
    snpdata1 = snpreader1.read()
    distreader1 = snpreader1.as_dist(max_weight)
    snpreader2 = distreader1.as_snp(max_weight)
    assert snpdata1.allclose(snpreader2.read(), equal_nan=True)
    snpdata1.val[0, 0] = np.nan
    assert snpdata1.allclose(snpdata1.as_dist(max_weight).as_snp(max_weight).read(), equal_nan=True)
def main():
    """ example that compares output to fastlmmc """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom.bed"
    #chrom_count = 5

    # load data
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)
    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])
    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    y.flags.writeable = False  # was a bare attribute access, which silently did nothing

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False

    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)
    simple_manhattan_plot(pv)
def test_hdf5_case3(self):
    snpreader1 = SnpHdf5(self.currentFolder + "/examples/toydata.snpmajor.snp.hdf5")[::2, :]
    snpreader2 = Bed(self.currentFolder + "/examples/toydata", count_A1=False)[::2, :]
    self.assertTrue(np.allclose(snpreader1.read().val, snpreader2.read().val, rtol=1e-05, atol=1e-05))
def read_plink(self, fn_plink=None):
    """ plink reader """
    PL = Bed(fn_plink)
    PLOB = PL.read()
    self.GT = PLOB.val
    self.POS = PLOB.pos[:, [0, 1]]  # chromosome and genetic-distance columns of pysnptools' pos array
    self.SID = PLOB.iid[:, 1]       # individual id within family
    self.isNormalised = False
def gen_and_compare(self, output_file, **kwargs):
    from pysnptools.snpreader import Bed
    gen_snpdata = snp_gen(**kwargs)
    #pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file, isfile=True)
    #Bed.write(gen_snpdata, self.currentFolder + "/tempdir/" + output_file) #comment out
    bed = Bed(self.currentFolder + "/../../tests/datasets/generate/" + output_file, count_A1=False)
    ref_snpdata = bed.read()
    assert gen_snpdata == ref_snpdata, "Failure on " + output_file
    return gen_snpdata
def process_data(input_path, output_path, name):
    snpreader = Bed(os.path.join(input_path, name))
    data = snpreader.read()
    values = data.val
    preproc_vals = pysnp_genpreproc(values)
    assert not np.any(np.isnan(preproc_vals))
    saved = os.path.join(output_path, name + ".h5py")
    path, keys = h5_save(path=saved, data_obj={name: preproc_vals}, dt='f')
    return {'n_subjects': data.iid_count,
            'subject_ids': data.iid,
            'n_snps': data.sid_count,
            'snp_ids': data.sid,
            'data_preprocessed_location': {'path': path, 'key': keys}}
def factory(snpreader, num_snps_in_memory, standardizer, blocksize, count_A1=None):
    if isinstance(snpreader, str):
        snpreader = Bed(snpreader, count_A1=count_A1)

    if num_snps_in_memory >= snpreader.sid_count:
        in_memory = InMemory(snpreader.read(order='C').standardize(standardizer), standardizer, blocksize)
        in_memory._snpreader.val.flags.writeable = False
        in_memory._val = in_memory._snpreader.val
        return in_memory
    else:
        return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)
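# Hedged usage sketch for factory(): 'mydata' is a hypothetical PLINK prefix, and
# stdizer.Unit() is the pysnptools unit standardizer used elsewhere in these snippets.
# With at most 10000 SNPs the data are held in memory (InMemory); otherwise the
# FromDisk path streams blocks from disk. Both classes come from the surrounding
# module and are not shown here.
data = factory("mydata", num_snps_in_memory=10000,
               standardizer=stdizer.Unit(), blocksize=1000)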
def main(args):
    print('reading seed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices, 2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        # find first typed snp after query snp
        snp_bp = data.col_property[i, 2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices) - 1

        n1, n2 = np.where(X[:, i] == 1)[0]  # assumes exactly two heterozygous carriers
        if (X[n1, typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(X[n1, typed_snps_indices],
                                                 X[n2, typed_snps_indices],
                                                 typed_i)
        typed_ir -= 1
        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - data.col_property[il, 1]
        ibd = (np.mean(X[n1, il:ir] == X[n2, il:ir]) > 0.99)
        return cM, int(ibd)

    # zip replaces the Python-2-only it.izip; .loc replaces the removed .ix indexer
    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):  # total=10):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
def cal_kin_val(bed_file, small_val=0.001):
    snp_on_disk = Bed(bed_file, count_A1=False)
    snp_mat = snp_on_disk.read().val
    freq = np.sum(snp_mat, axis=0) / (2 * snp_on_disk.iid_count)
    freq.shape = (1, snp_on_disk.sid_count)
    snp_mat = snp_mat - 2 * freq
    scale = 2 * freq * (1 - freq)
    scale = np.sum(scale)
    kin = np.dot(snp_mat, snp_mat.T) / scale
    kin_diag = np.diag(kin)
    kin_diag = kin_diag + kin_diag * small_val
    np.fill_diagonal(kin, kin_diag)
    return kin
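# Hedged usage sketch for cal_kin_val(): 'mydata' is a hypothetical PLINK prefix.
# The function builds a VanRaden-style genomic relationship matrix
# (centered genotypes times their transpose, scaled by sum(2p(1-p))), so the
# result is a symmetric iid_count x iid_count array with a slightly inflated diagonal.
kin = cal_kin_val('mydata', small_val=0.001)
assert kin.shape[0] == kin.shape[1]   # one row/column per individual
assert np.allclose(kin, kin.T)        # relationship matrices are symmetric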
def test_subset_view(self):
    snpreader2 = Bed(self.currentFolder + "/examples/toydata", count_A1=False)[:, :]
    result = snpreader2.read(view_ok=True)
    self.assertFalse(snpreader2 is result)
    result2 = result[:, :].read()
    self.assertFalse(sp.may_share_memory(result2.val, result.val))
    result3 = result[:, :].read(view_ok=True)
    self.assertTrue(sp.may_share_memory(result3.val, result.val))
    result4 = result3.read()
    self.assertFalse(sp.may_share_memory(result4.val, result3.val))
    result5 = result4.read(view_ok=True)
    self.assertTrue(sp.may_share_memory(result4.val, result5.val))
def test_write_x_x_cpp(self):
    for count_A1 in [False, True]:
        snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=count_A1)
        for order in ['C', 'F']:
            for dtype in [np.float32, np.float64]:
                snpdata = snpreader.read(order=order, dtype=dtype)
                snpdata.val[-1, 0] = float("NAN")
                output = "tempdir/toydata.{0}{1}.cpp".format(order, "32" if dtype == np.float32 else "64")
                create_directory_if_necessary(output)
                Bed.write(output, snpdata, count_A1=count_A1)
                snpdata2 = Bed(output, count_A1=count_A1).read()
                np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def load_plink_bed_bim_fam_dataset(path_dataset, snp_ids=None, subject_ids=None, count_A1=True):
    """ Load a Plink bed/bim/fam dataset as a SnpData instance.
    Optionally, a specific list of snps or subjects can be extracted to avoid
    loading everything in memory.

    Parameters
    ----------
    path_dataset: str
        Path to the Plink bed/bim/fam dataset, with or without .bed extension.
    snp_ids: list/set of str, default None
        Snps that should be extracted if available in the dataset.
        By default None, all snps are loaded.
    subject_ids: list of str, default None
        Subjects that should be extracted if available in the dataset.
        By default None, all subjects are loaded.
    count_A1: bool, default True
        Genotypes are provided as allele counts, A1 if True else A2.

    Returns
    -------
    snp_data: pysnptools object
        PLINK data loaded by the 'pysnptools' library.
    """
    # Load the metadata, without loading the genotypes
    snp_data = Bed(path_dataset, count_A1=count_A1)

    # If requested, filter on snp ids
    if snp_ids is not None:
        snp_ids = set(snp_ids)
        snp_bool_indexes = [(s in snp_ids) for s in snp_data.sid]
        snp_data = snp_data[:, snp_bool_indexes]

    # If requested, filter on subject ids
    if subject_ids is not None:
        subject_ids = set(subject_ids)
        subject_bool_indexes = [(s in subject_ids) for s in snp_data.iid[:, 1]]
        snp_data = snp_data[subject_bool_indexes, :]

    # Load the genotypes from the Plink dataset
    snp_data = snp_data.read()
    return snp_data
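# Hedged usage sketch for load_plink_bed_bim_fam_dataset(); the path and the
# id lists below are hypothetical, for illustration only.
snp_data = load_plink_bed_bim_fam_dataset(
    "/data/study/genotypes",             # .bed/.bim/.fam prefix
    snp_ids=["rs123", "rs456"],          # keep only these markers, if present
    subject_ids=["subj01", "subj02"])    # keep only these subjects, if present
print(snp_data.val.shape)                # (n_kept_subjects, n_kept_snps)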
def genPheno(filename="../thinFam", per=.5, savename="fakePheno.txt", c=2.0, num=5):
    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    print(m)
    print(n)
    I = [rand.randint(0, m - 1) for i in range(0, num)]
    SNP = [[D[j][i] for j in range(0, n)] for i in I]
    #p0=n*per/sum([c**i*len([j for j in SNP if j==float(i)]) for i in range(0,3)])
    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)
    SNP = [[max(i, 0.0) for i in s] for s in SNP]
    for i in range(0, num):
        for j in range(0, n):
            if SNP[i][j] not in [1.0, 0.0, 2.0]:
                SNP[i][j] = 0.0
    print([list(set(s)) for s in SNP])
    lst = [sum([SNP[j][i] for j in range(0, num)]) for i in range(0, n)]
    #print lst
    print(sum([c**(sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)]))
    p0 = n * per / sum([c**(sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)])
    print(p0)
    y = [float(rand.uniform(0, 1) < p0 * c**sum([SNP[j][i] for j in range(0, num)]))
         for i in range(0, n)]
    if len(savename) == 0:
        return y
    fil = open(savename, "w")
    for i in y:
        fil.write(str(i) + "\n")
    fil.close()
def test_SNC(self):
    logging.info("TestSNC")
    test_snps = self.bedbase
    pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar = pstpheno.loadPhen(self.cov_fn)
    bed = Bed(test_snps, count_A1=False)
    snc = bed.read()
    snc.val[:, 2] = 0  # make SNP #2 have constant values (aka a SNC)

    output_file_name = self.file_name("snc")
    frame = single_snp(test_snps=snc[:, :10], pheno=pheno, G0=snc, mixing=0,
                       leave_out_one_chrom=False, covar=covar,
                       output_file_name=output_file_name, count_A1=False)
    self.compare_files(frame, "snc")
def test_SNC(self):
    logging.info("TestSNC")
    from pysnptools.snpreader import Bed
    test_snps = self.bedbase
    pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar = pstpheno.loadPhen(self.cov_fn)
    bed = Bed(test_snps)
    snc = bed.read()
    snc.val[:, 2] = [0] * snc.iid_count  # make SNP #2 have constant values (aka a SNC)

    output_file_name = self.file_name("snc")
    frame = single_snp(test_snps=snc[:, :10], pheno=pheno, G0=snc, mixing=0,
                       covar=covar, output_file_name=output_file_name)
    self.compare_files(frame, "snc")
def cluster_data(snpreader):
    """ compute hierarchical clustering of snp data set in bed_fn """
    if isinstance(snpreader, str):
        snpreader = Bed(snpreader)

    G = snpreader.read().standardize().val

    # Generate distance matrix
    from sklearn.metrics.pairwise import euclidean_distances
    D = euclidean_distances(G, G)

    # Compute and plot first dendrogram.
    fig = pylab.figure(figsize=(8, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = fc.linkage(D, method='average')  # method="centroid" is cubic!
    Z1 = sch.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    #Y = sch.linkage(D, method='single')
    Z2 = sch.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    #idx2 = Z2['leaves']
    D = D[idx1, :]
    D = D[:, idx1]
    axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    pylab.show()
    start = end

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil

    data_file = r'd:\OneDrive\programs\epiCornell\syndata.bed'  # raw string avoids backslash escapes
    if False:
        from pysnptools.snpreader import SnpData
        import numpy as np
        bed1 = Bed("../../tests/datasets/synth/all")
        print(bed1.iid_count, bed1.sid_count, bed1.iid_count * bed1.sid_count)
        # goal: 1500 individuals x 27000 SNPs
        snpdata1 = bed1.read()
        iid = bed1.iid
        sid = ['sid{0}'.format(i) for i in range(27000)]
        val = np.tile(snpdata1.val, (3, 6))[:, :27000].copy()
        #snpdata = Pheno('pysnptools/examples/toydata.phe').read() # Read data from Pheno format
        snpdata2 = SnpData(iid, sid, val)
        print(snpdata2.iid_count, snpdata2.sid_count, snpdata2.iid_count * snpdata2.sid_count)
        Bed.write(snpdata2, data_file, count_A1=False)

    synbed = Bed(data_file)
    print(synbed.iid_count, synbed.sid_count, synbed.iid_count * synbed.sid_count)

    part_count = 1000
    part_list = list(split_on_sids(synbed, part_count))
    pairs00 = _Pairs(part_list[0])
def gma_univariate_eigen_lt_gwas(y, xmat, bed_file, out_file=None, init=None, maxiter=100, cc=1.0e-8):
    from functools import reduce  # Python 3: reduce lives in functools

    # kinship
    print('Build the kinship matrix')
    starttime = datetime.datetime.now()
    num_id = max(y.shape)
    snp_on_disk = Bed(bed_file, count_A1=False)
    snp_mat = snp_on_disk.read().val
    freq = np.sum(snp_mat, axis=0) / (2 * snp_on_disk.iid_count)
    freq.shape = (1, snp_on_disk.sid_count)
    snp_mat = snp_mat - 2 * freq
    scale = 2 * freq * (1 - freq)
    scale = np.sum(scale)
    kin = np.dot(snp_mat, snp_mat.T) / scale
    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    print('Eigen decomposition')
    starttime = datetime.datetime.now()
    kin_eigen_val, kin_eigen_vec = linalg.eigh(kin)
    kin_eigen_val = kin_eigen_val.reshape(len(kin_eigen_val), 1)
    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    y = np.dot(kin_eigen_vec.T, y)
    xmat = np.dot(kin_eigen_vec.T, xmat)
    if init is not None:
        var = np.array(init)
    else:
        var = np.array([1.0, 1.0])
    fd_mat = np.zeros(2)
    ai_mat = np.zeros((2, 2))
    em_mat = np.zeros((2, 2))

    # estimate the variance components of the null model
    print('Estimate variances')
    starttime = datetime.datetime.now()
    for i in range(maxiter):
        print('Start the iteration:', i + 1)
        vmat = 1.0 / (kin_eigen_val * var[0] + var[1])
        vx = np.multiply(vmat, xmat)
        xvx = np.dot(xmat.T, vx)
        xvx = np.linalg.inv(xvx)
        # py
        xvy = np.dot(vx.T, y)
        y_xb = y - np.dot(xmat, np.dot(xvx, xvy))
        py = np.multiply(vmat, y_xb)
        # add_py, p_add_py
        add_py = np.multiply(kin_eigen_val, py)
        xvy = np.dot(vx.T, add_py)
        y_xb = add_py - np.dot(xmat, np.dot(xvx, xvy))
        p_add_py = np.multiply(vmat, y_xb)
        # res_py, p_res_py
        res_py = py.copy()
        xvy = np.dot(vx.T, res_py)
        y_xb = res_py - np.dot(xmat, np.dot(xvx, xvy))
        p_res_py = np.multiply(vmat, y_xb)
        # first derivatives
        tr_vd = np.sum(np.multiply(vmat, kin_eigen_val))
        xvdvx = np.dot(xmat.T, vmat * kin_eigen_val * vx)
        tr_2d = np.sum(np.multiply(xvdvx, xvx))
        ypvpy = np.sum(np.dot(py.T, add_py))
        fd_mat[0] = 0.5 * (-tr_vd + tr_2d + ypvpy)
        tr_vd = np.sum(vmat)
        xvdvx = np.dot(xmat.T, vmat * vx)
        tr_2d = np.sum(np.multiply(xvdvx, xvx))
        ypvpy = np.sum(np.dot(py.T, res_py))
        fd_mat[1] = 0.5 * (-tr_vd + tr_2d + ypvpy)
        # AI matrix
        ai_mat[0, 0] = np.sum(np.dot(add_py.T, p_add_py))
        ai_mat[0, 1] = ai_mat[1, 0] = np.sum(np.dot(add_py.T, p_res_py))
        ai_mat[1, 1] = np.sum(np.dot(res_py.T, p_res_py))
        ai_mat = 0.5 * ai_mat
        # EM matrix
        em_mat[0, 0] = num_id / (var[0] * var[0])
        em_mat[1, 1] = num_id / (var[1] * var[1])
        print("FD:", fd_mat)
        print("AI:", ai_mat)
        print("EM:", em_mat)
        for j in range(0, 51):
            gamma = j * 0.02
            wemai_mat = (1 - gamma) * ai_mat + gamma * em_mat
            delta = np.dot(linalg.inv(wemai_mat), fd_mat)
            var_update = var + delta
            if min(var_update) > 0:
                print('EM weight value:', gamma)
                break
        print('Updated variances:', var_update)
        # Convergence criteria
        cc_val = np.sum(pow(delta, 2)) / np.sum(pow(var_update, 2))
        cc_val = np.sqrt(cc_val)
        var = var_update.copy()
        print("CC: ", cc_val)
        if cc_val < cc:
            break
    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    # GWAS
    print('Start GWAS')
    starttime = datetime.datetime.now()
    vmat = 1.0 / (kin_eigen_val * var[0] + var[1])
    vx = np.multiply(vmat, xmat)
    xvx = np.dot(xmat.T, vx)
    xvx = np.linalg.inv(xvx)
    # py
    xvy = np.dot(vx.T, y)
    y_xb = y - np.dot(xmat, np.dot(xvx, xvy))
    py = np.multiply(vmat, y_xb)
    snp_mat = np.dot(kin_eigen_vec.T, snp_mat)
    # SNP effects
    chi_vec = []
    p_vec = []
    eff_vec = np.dot(snp_mat.T, py) * var[0]
    eff_vec = eff_vec[:, -1]
    for i in range(snp_on_disk.sid_count):
        snpi = snp_mat[:, i:(i + 1)]
        snp_var1 = np.sum(reduce(np.multiply, [snpi, vmat, snpi]))
        snp_var2 = np.dot(snpi.T, vx)
        snp_var2 = reduce(np.dot, [snp_var2, xvx, snp_var2.T])
        snp_var = (snp_var1 + np.sum(snp_var2)) * var[0] * var[0]
        chi_val = eff_vec[i] * eff_vec[i] / snp_var
        p_val = chi2.sf(chi_val, 1)
        chi_vec.append(chi_val)
        p_vec.append(p_val)
    endtime = datetime.datetime.now()
    print("Running time", (endtime - starttime).seconds)
    print('Finish')

    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    res_df = snp_info.iloc[:, [0, 1, 3, 4, 5]]
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'eff_val'] = eff_vec
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec
    if out_file is not None:
        try:
            res_df.to_csv(out_file, sep=' ', index=False)
        except Exception as e:
            print(e)
            print('Failed to output the result!')
            exit()
        V_stds = np.std(V[:, 1:n_V], axis=0)
        V[:, 1:n_V] = zscore(V[:, 1:n_V], axis=0)
    else:
        V = np.ones((int(y.shape[0]), 1))
        n_V = 1
        V_names = np.array(['Intercept'])

    n_pars = n_X + n_V + 1
    print(str(n_pars) + ' parameters in model')

    ### Read genotypes ###
    test_chr = Bed(args.genofile)
    # select subset to test
    if args.whole_chr:
        sid = test_chr.sid
        pos = test_chr.pos
        test_chr = test_chr.read()
    else:
        sid = test_chr.sid[args.start:args.end]
        pos = test_chr.pos[args.start:args.end]
        test_chr = test_chr[:, args.start:args.end].read()

    genotypes = test_chr.val  # Get genotype matrix
    if genotypes.ndim == 1:
        chr_length = 1
        genotypes = genotypes.reshape(genotypes.shape[0], 1)
    else:
        chr_length = genotypes.shape[1]
    print('Number of test loci: ' + str(genotypes.shape[1]))
    print('Genotypes for ' + str(genotypes.shape[0]) + ' individuals read')

    # Get sample ids
    geno_id_dict = id_dict_make(np.array(test_chr.iid))
class _Epistasis(object):  # implements IDistributable

    def __init__(self, test_snps, pheno, G0, G1=None, mixing=0.0, covar=None, sid_list_0=None, sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self._ran_once = False
        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0 = G0
        self.G1_or_none = G1
        self.mixing = mixing
        self.external_log_delta = log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1={5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps, self.pheno, self.covar, self.sid_list_0, self.sid_list_1,
            self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta,
            self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
        self._pair_count = (len(self.just_sid_0) * len(self.intersect)
                            + len(self.just_sid_0) * len(self.just_sid_1)
                            + len(self.intersect) * len(self.just_sid_1)
                            + len(self.intersect) * (len(self.intersect) - 1) // 2)
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply(
            [self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none])  # should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None
        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)
        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)
        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno, vectorize=True, missing='NaN')
        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar, missing='NaN')
        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid
        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only if no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'], np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1]

        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"

        self._ran_once = True

    #start of IDistributable interface--------------------------------------

    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count

    def work_sequence(self):
        self._run_once()
        return self.work_sequence_range(0, self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()
        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])
        for sid0_list, sid1_list in self.pair_block_sequence_range(start, end):
            # the 'lmm=lmm,...' defaults are needed to get around a strangeness in Python closures
            yield lambda lmm=lmm, sid0_list=sid0_list, sid1_list=sid1_list: self.do_work(lmm, sid0_list, sid1_list)

    def reduce(self, result_sequence):
        #doesn't need "run_once()"
        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))
        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)
        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))

    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    #optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):
        #Doesn't need run_once
        return self._str

    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)
        copier.input(self.pheno)
        copier.input(self.covar)
        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)
        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self, copier):
        #Doesn't need run_once
        copier.output(self.output_file_or_none)

    #end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den):  #!!move to utils?
        return -(-num // den)  # the -/- trick does ceiling instead of floor; "//" is integer division, even on floats

    def pair_block_sequence_range(self, block_start, block_end):
        self._run_once()
        assert 0 <= block_start <= block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index + 1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start, end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index + 1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    #If start == end, then returns without yielding anything
    def pair_sequence_range(self, start, end):
        self._run_once()
        assert 0 <= start <= end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"

    def pair_sequence_with_start(self, start):
        self._run_once()

        skip_ref = [start]
        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"

    def combo_distinct(self, distinct__list0, distinct__list1, skip_ref):
        row_count = len(distinct__list0)
        col_count = len(distinct__list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >= 0, "real assert"

        for row_index in range(row_start, row_count):
            sid0 = distinct__list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start, col_count):
                sid1 = distinct__list1[col_index]
                yield sid0, sid1

    def combo_same(self, list, skip_ref):
        count = len(list)
        full_size = count * (count + 1) // 2
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = int((-1 + 2 * count - np.sqrt(1 - 4 * count + 4 * count**2 - 8 * skip_ref[0])) / 2)
        skip_ref[0] = skip_ref[0] - (count * row_start - (row_start * (1 + row_start)) // 2)
        assert skip_ref[0] >= 0, "real assert"

        for row_index in range(row_start, count):
            sid0 = list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start + 1 + row_index, count):
                sid1 = list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1

    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file):
                # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U, lmm.S)  # np.savez instead of pickle because it seems faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            # log delta is used here. Might be better to use findH2, but if so will need to normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count,
                                        min_log_delta=self.min_log_delta,
                                        max_log_delta=self.max_log_delta)  #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))

    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1',
                     'PValue', 'NullLogLike', 'AltLogLike'))
        #!!Is this the only way to set types in a dataframe?
        for col in ('Chr0', 'GenDist0', 'ChrPos0', 'Chr1', 'GenDist1', 'ChrPos1',
                    'PValue', 'NullLogLike', 'AltLogLike'):
            dataframe[col] = dataframe[col].astype(float)  # np.float is deprecated; the builtin float is equivalent

        #This is some of the code for a different way that reads and dot-products 50% more, but does less copying. Seems about the same speed.
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:, sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        # in the products matrix, each column i is the elementwise product of sid i in each list
        products = snps_read.val[:, sid0_index_list] * snps_read.val[:, sid1_index_list]
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if k < N:
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index])  # index to product
            index_list = index_list + len(sid_union_index_list)  # shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index, sid1_index]), index_list))  # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1]  # shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]), index_list))  # indexes of the covar

            index_list_less_product = index_list[:-1]  # index to everything but the product

            # Null -- the two additive SNPs
            lmm.X = X[:, index_list_less_product]
            lmm.UX = UX[:, index_list_less_product]
            if k < N:
                lmm.UUX = UUX[:, index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            # Alt -- now with the product feature
            lmm.X = X[:, index_list]
            lmm.UX = UX[:, index_list]
            if k < N:
                lmm.UUX = UUX[:, index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0, sid1, ll_null, ll_alt, pvalue))

            dataframe.iloc[pair_index] = [
                sid0, snps_read.pos[sid0_index, 0], snps_read.pos[sid0_index, 1], snps_read.pos[sid0_index, 2],
                sid1, snps_read.pos[sid1_index, 0], snps_read.pos[sid1_index, 1], snps_read.pos[sid1_index, 2],
                pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count, self.do_pair_time - start))

        return dataframe
# Bed("all.bed")

# Find out about iids and sids
print(snpreader.iid_count)
print(snpreader.sid_count)
print(snpreader.iid[:3])
print(snpreader.sid[:3])
#500
#5000
#[['cid0P0' 'cid0P0']
# ['cid1P0' 'cid1P0']
# ['cid2P0' 'cid2P0']]
#['snp625_m0_.03m1_.07' 'snp1750_m0_.02m1_.04' 'snp0_m0_.37m1_.24']

# Read all the SNP data in to memory
snpdata = snpreader.read()
# What is snpdata?
# SnpData(Bed("all.bed"))

# What do the iids and sids of snpdata look like?
print(snpdata.iid_count, snpdata.sid_count)
print(snpdata.iid[:3])
print(snpdata.sid[:3])
# The same.

# print the SNP data
print(snpdata.val)
#[[ 2.  2.  1. ...,  2.  1.  2.]
# [ 2.  2.  1. ...,  2.  0.  2.]
# [ 2.  2.  1. ...,  1.  1.  1.]
# ...,
def imputation_test(chromosomes,
                    imputed_prefix='outputs/parent_imputed_chr',
                    expected_prefix="../UKBioRDE_revision/data/tmp/filtered_ukb_chr",
                    start=None,
                    end=None):
    # Data files for chromosome i should be named in this fashion: "prefix{i}"
    chromosomes_expected_genes_o = []
    chromosomes_expected_genes_pm = []
    chromosomes_imputed_genes_o = []
    chromosomes_imputed_genes_pm = []
    for chromosome in chromosomes:
        with h5py.File(imputed_prefix + str(chromosome) + ".hdf5", 'r') as f:
            gts = np.array(f["imputed_par_gts"])
            fids = np.array(f["families"]).astype(str)
            parental_status = np.array(f["parental_status"])
            ped_array = np.array(f["pedigree"]).astype(str)
            ped = pd.DataFrame(ped_array[1:], columns=ped_array[0])
        expected = Bed(expected_prefix + str(chromosome) + ".bed", count_A1=True)
        if start is not None and end is not None:
            expected_gts = expected[:, start:end].read().val
        else:
            expected_gts = expected.read().val
        expected_ids = expected.iid
        iid_to_bed_index = {i: index for index, i in enumerate(expected_ids[:, 1])}
        # FIDs of control families start with an underscore and have the prefix "_*_":
        #   no-parent controls start with "_o_"
        #   father-only controls start with "_p_"
        #   mother-only controls start with "_m_"
        index_of_families_in_imputation = {fid: index for index, fid in enumerate(fids)}
        control_o_families = list({row["FID"][3:] for index, row in ped.iterrows()
                                   if row["FID"].startswith("_o_")})
        # for each family, select the ids of the parents
        parent_ids = ped.groupby("FID").agg({
            'FATHER_ID': lambda x: ([a for a in list(x) if a in ped["IID"].tolist()] + [None])[0],
            'MOTHER_ID': lambda x: ([a for a in list(x) if a in ped["IID"].tolist()] + [None])[0],
        })
        parents_of_control_o_families = parent_ids.loc[control_o_families]
        mother_indexes_control_o = [iid_to_bed_index[parents_of_control_o_families.loc[i, "MOTHER_ID"]]
                                    for i in control_o_families]
        father_indexes_control_o = [iid_to_bed_index[parents_of_control_o_families.loc[i, "FATHER_ID"]]
                                    for i in control_o_families]
        expected_parent_gts_control_o = (expected_gts[mother_indexes_control_o, :]
                                         + expected_gts[father_indexes_control_o, :]) / 2
        expected_genes_o = expected_parent_gts_control_o.reshape((1, -1))
        index_of_control_families_in_imputation_o = [index_of_families_in_imputation["_o_" + i]
                                                     for i in control_o_families]
        imputed_genes_o = gts[index_of_control_families_in_imputation_o, :].reshape((1, -1))
        mask_o = ~(np.isnan(expected_genes_o) | np.isnan(imputed_genes_o))
        expected_genes_o = expected_genes_o[mask_o]
        imputed_genes_o = imputed_genes_o[mask_o]
        control_p = list({row["FID"][3:] for index, row in ped.iterrows()
                          if row["FID"].startswith("_p_")})
        control_m = list({row["FID"][3:] for index, row in ped.iterrows()
                          if row["FID"].startswith("_m_")})
        control_pm_families = control_p + control_m
        parent_of_control_m = parent_ids.loc[control_m]
        parent_of_control_p = parent_ids.loc[control_p]
        father_indexes_control_m = [iid_to_bed_index[parent_of_control_m.loc[i, "FATHER_ID"]]
                                    for i in control_m]
        mother_indexes_control_p = [iid_to_bed_index[parent_of_control_p.loc[i, "MOTHER_ID"]]
                                    for i in control_p]
        expected_parent_gts_control_pm = expected_gts[mother_indexes_control_p + father_indexes_control_m, :]
        expected_genes_pm = expected_parent_gts_control_pm.reshape((1, -1))
        index_of_control_families_in_imputation_pm = (
            [index_of_families_in_imputation["_p_" + i] for i in control_p]
            + [index_of_families_in_imputation["_m_" + i] for i in control_m])
        imputed_genes_pm = gts[index_of_control_families_in_imputation_pm, :].reshape((1, -1))
        mask_pm = ~(np.isnan(expected_genes_pm) | np.isnan(imputed_genes_pm))
        expected_genes_pm = expected_genes_pm[mask_pm]
        imputed_genes_pm = imputed_genes_pm[mask_pm]
        chromosomes_expected_genes_o.append(expected_genes_o)
        chromosomes_expected_genes_pm.append(expected_genes_pm)
        chromosomes_imputed_genes_o.append(imputed_genes_o)
        chromosomes_imputed_genes_pm.append(imputed_genes_pm)
    whole_expected_genes_o = np.concatenate(chromosomes_expected_genes_o)
    whole_imputed_genes_o = np.concatenate(chromosomes_imputed_genes_o)
    whole_expected_genes_pm = np.concatenate(chromosomes_expected_genes_pm)
    whole_imputed_genes_pm = np.concatenate(chromosomes_imputed_genes_pm)
    # regress expected on imputed genotypes; note that the family counts below
    # come from the last chromosome processed, and 22 is a hard-coded autosome count
    covs_o = np.cov(whole_expected_genes_o, whole_imputed_genes_o)
    coef_o = covs_o[0, 1] / covs_o[1, 1]
    residual_var_o = np.var(whole_expected_genes_o - coef_o * whole_imputed_genes_o)
    s2_o = residual_var_o / (len(control_o_families) * 22 * 2 * covs_o[1, 1])
    z_o = (1 - coef_o) / np.sqrt(s2_o)
    q_o = norm.cdf(z_o)
    p_value_o = min(q_o, 1 - q_o)
    covs_pm = np.cov(whole_expected_genes_pm, whole_imputed_genes_pm)
    coef_pm = covs_pm[0, 1] / covs_pm[1, 1]
    residual_var_pm = np.var(whole_expected_genes_pm - coef_pm * whole_imputed_genes_pm)
    s2_pm = residual_var_pm / (len(control_pm_families) * 22 * 2 * covs_pm[1, 1])
    z_pm = (1 - coef_pm) / np.sqrt(s2_pm)
    q_pm = norm.cdf(z_pm)
    p_value_pm = min(q_pm, 1 - q_pm)
    print(covs_pm, coef_pm, z_pm, p_value_pm)
    # TODO: compute z correctly (find the correct sd)
    return (coef_o, coef_pm), (z_o, z_pm), (p_value_o, p_value_pm)
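# Hypothetical invocation of imputation_test (paths mirror the defaults above;
# the chromosome list and slicing window are illustrative only):
coefs, zs, p_values = imputation_test(
    range(1, 23),
    imputed_prefix='outputs/parent_imputed_chr',
    expected_prefix='../UKBioRDE_revision/data/tmp/filtered_ukb_chr',
    start=0,
    end=1000,
)
coef_o, coef_pm = coefs  # slopes of the expected-on-imputed regressions; ~1 if imputation is unbiased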
def test_c_reader_bed_count_A1(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=True)
    snpdata = snpreader.read()
    snpdata.val = 2 - snpdata.val
    self.c_reader(snpdata)
from __future__ import print_function
import numpy as np
from pysnptools.snpreader import Bed

data_dir = '/groups/price/hilary/ibd/data'
bedfile = data_dir + '/1000G.EUR.QC.22'
outfile = bedfile + '.f2snps'

bed = Bed(bedfile)
x = bed.read()
# f2 SNPs: a counted-allele total of 2 or 976 (presumably minor-allele count 2
# under either allele coding) with at least one heterozygous carrier
b = np.array([sum(x.val[:, i]) in [2, 976] and 1 in x.val[:, i]
              for i in range(len(x.sid))])
f2snps = x.sid[b]
with open(outfile, 'w') as out:
    print('\n'.join(f2snps), file=out)
# In[5]:

test_stat = pd.read_csv('Outputs/Fast-Lmm-Cache/Test-Stat-Cache.txt', header=None)
test_stat = test_stat.replace(r'[\[\] ]', '', regex=True)
test_stat = pd.to_numeric(test_stat[0])
results_df['Full ID'] = results_df['Chr'].astype('str') + '_' + results_df['ChrPos'].astype('str')
results_df = pd.concat([results_df[['Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue']], test_stat], axis=1)
results_df.columns = ['Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue', 'F-test statistic']

mybed = Bed(VARIANTS_TO_TEST + '.bed')
mysnpdata = mybed.read()
print('Time (s): ' + str(time.clock() - start))  # time.clock was removed in Python 3.8; time.process_time() is the modern equivalent


# In[6]:

pheno = _pheno_fixup(PHENOTYPE_DATA, count_A1=None).read()
pheno = pheno.val[np.searchsorted(pheno.iid[:, 1], mysnpdata.iid[:, 1])]
snpdata = mysnpdata.val
# pre-allocate per-SNP result lists (range() objects are not assignable in Python 3)
diff = [None] * snpdata.shape[1]
maf = [None] * snpdata.shape[1]
n_alleles = [None] * snpdata.shape[1]
mean_major = [None] * snpdata.shape[1]
for i in range(snpdata.shape[1]):
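# The loop body above is truncated in the source. A sketch of what a per-SNP
# summary pass might look like under 0/1/2 genotype coding; the names reuse
# the pre-allocated lists, but the original computation is unknown:
for i in range(snpdata.shape[1]):
    col = snpdata[:, i]
    col = col[~np.isnan(col)]        # drop missing genotypes
    n_alleles[i] = 2 * len(col)      # two alleles per called genotype
    freq = col.sum() / n_alleles[i]  # frequency of the counted allele
    maf[i] = min(freq, 1.0 - freq)   # fold to the minor allele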
__author__ = 'Haohan Wang'

from pysnptools.snpreader import Bed
import numpy as np

snp_on_disk = Bed('../data/ANDI.bed', count_A1=False)
snps = snp_on_disk.read()
np.save('../result/sampleID', snps.iid)
sid = snps.sid

markers = [line.strip() for line in open('../commonData/markers.txt')]
mdic = {m: 0 for m in markers}

idx = []
for i in range(len(sid)):
    if sid[i] in mdic:
        idx.append(i)
idx = np.array(idx)
data = snps.val[:, idx]
# print data.shape
# print snps.sid
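# A vectorized alternative to the membership loop above (same idx and data,
# using numpy set membership instead of a dict):
idx = np.flatnonzero(np.isin(sid, markers))
data = snps.val[:, idx]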
class LeaveTwoChrOutSimulation:
    def __init__(self, snp_fn, out_prefix):
        self.force_recompute = False
        #self.base_path = base_path
        self.snp_fn = snp_fn
        from pysnptools.snpreader import Bed
        self.snp_reader = Bed(snp_fn)
        self.eigen_fn = self.snp_fn + "_pcs.pickle"
        self.out_prefix = out_prefix

    def precompute_pca(self):
        """compute pcs"""
        logging.info("computing PCA on train set")
        t0 = time.time()
        if not os.path.isfile(self.eigen_fn) or self.force_recompute:
            G = self.snp_reader.read(order='C').standardize().val
            G.flags.writeable = False
            chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(self.snp_reader.pos)
            G_train = G.take(rest_idx, axis=1)
            from sklearn.decomposition import PCA
            pca = PCA()
            pcs = pca.fit_transform(G_train)
            logging.info("saving eigendecomp to file %s" % self.eigen_fn)
            eig_dec = {"pcs": pcs}
            save(self.eigen_fn, eig_dec)
            logging.info("time taken for pc computation: " + str(time.time() - t0))
        else:
            logging.info("pc file already exists: %s" % self.eigen_fn)

    def run(self, methods, num_causal, num_repeats, num_pcs, description, runner,
            seed=None, plot_fn=None):
        self.precompute_pca()
        input_files = [self.snp_fn + ext for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
        input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal, num_pcs, seed, sim_id)
                      for sim_id in range(num_repeats)]
        output_list = distributed_map.d_map(semisynth_simulations.compute_core, input_args,
                                            runner, input_files=input_files)

        ############################################
        results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
        reduced_results_fn = results_fn.replace("runs", "reduced.runs")
        save(results_fn, output_list)

        methods = output_list[0][0].keys()
        arg_list = [(method, results_fn) for method in methods]
        #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
        reduce_runner = Local()
        combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list,
                                               reduce_runner, input_files=[results_fn])
        save(reduced_results_fn, combine_output)

        title = "%i causal, %i repeats" % (num_causal, num_repeats)
        visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)
        return combine_output
def test_generate_and_regress(self):
    # requires plink
    number_of_snps = 1000
    min_f = 0.05
    number_of_families = 100
    filename = "outputs/tmp/generated"
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:
            # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    start = 0
    interval = 0
    # generate the population
    os.system('python example/simulate_pop.py ' + str(number_of_snps) + ' ' + str(min_f) +
              ' ' + str(number_of_families) + ' 0 0 0 "outputs/tmp/generated"')
    # add a header to the pedigree file
    os.system('echo -e "FID IID FATHER_ID MOTHER_ID\n$(cat outputs/tmp/generated_fams.ped)" > outputs/tmp/generated_fams.ped')
    # convert the generated data to a bed file
    os.system('plink/plink --noweb --file outputs/tmp/generated --make-bed --out outputs/tmp/generated')
    columns = (["FID", "IID", "FATHER_ID", "MOTHER_ID", "sex", "phenotype"] +
               ["genotype_" + str(i) for i in range(number_of_snps)])
    ped = pd.read_csv("outputs/tmp/generated.ped", sep=" ", header=None, names=columns)
    ped = ped[["FID", "IID", "FATHER_ID", "MOTHER_ID"]].astype(str)
    only_remove_father_ids = [str(i) + "_P" for i in range(number_of_families // 4)]
    only_remove_mother_ids = [str(i) + "_M"
                              for i in range(number_of_families // 4, number_of_families // 2)]
    remove_both_parents_ids = (
        [str(i) + "_M" for i in range(number_of_families // 2, number_of_families)] +
        [str(i) + "_P" for i in range(number_of_families // 2, number_of_families)])
    parents = ped[ped["IID"].str.endswith("_P") | ped["IID"].str.endswith("_M")]
    sibs = ped[~ped["IID"].isin(only_remove_father_ids + only_remove_mother_ids +
                                remove_both_parents_ids)]
    sibs.to_csv("outputs/tmp/generated_sibs.ped", sep=" ")
    parents.to_csv("outputs/tmp/generated_parents.ped", sep=" ")
    with open("outputs/tmp/generated_sibs.txt", "w") as f:
        for i, j in sibs[["FID", "IID"]].values.tolist():
            f.write(str(i) + " " + str(j) + "\n")
    with open("outputs/tmp/generated_parents.txt", "w") as f:
        for i, j in ped[["FID", "IID"]].values.tolist():
            if j.endswith("_P") or j.endswith("_M"):
                f.write(str(i) + " " + str(j) + "\n")
    # write sibs only
    os.system('plink/plink --noweb --bfile outputs/tmp/generated --keep outputs/tmp/generated_sibs.txt --make-bed --out outputs/tmp/generated_sibs')
    # write parents only
    os.system('plink/plink --noweb --bfile outputs/tmp/generated --keep outputs/tmp/generated_parents.txt --make-bed --out outputs/tmp/generated_parents')
    ibd = pd.read_csv("outputs/tmp/generated.segments.gz", sep="\t")
    sibships, iid_to_bed_index, gts, ibd, pos, chromosomes, hdf5_output_dict = prepare_data(
        sibs, "outputs/tmp/generated_sibs", ibd)
    gts = gts.astype(float)
    pos = pos.astype(int)
    imputed_fids, imputed_par_gts = impute(sibships, iid_to_bed_index, gts, ibd, pos,
                                           hdf5_output_dict, str(chromosomes), threads=2)
    expected_parents = Bed("outputs/tmp/generated_parents.bed", count_A1=True)
    expected_parents_gts = expected_parents.read().val
    expected_parents_ids = expected_parents.iid
    # father and mother rows alternate in the parents' bed file
    father = expected_parents_gts[[bool(i % 2) for i in range(2 * number_of_families)]]
    father = father[[int(t) for t in imputed_fids], :]
    mother = expected_parents_gts[[not bool(i % 2) for i in range(2 * number_of_families)]]
    mother = mother[[int(t) for t in imputed_fids], :]
    expected_parents = np.zeros(imputed_par_gts.shape)
    no_parent = ~sibships["has_father"] & ~sibships["has_mother"]
    only_mother = ~sibships["has_father"] & sibships["has_mother"]
    only_father = ~sibships["has_mother"] & sibships["has_father"]
    expected_parents[no_parent] = (mother[no_parent] + father[no_parent]) / 2
    expected_parents[only_mother] = mother[only_mother]
    expected_parents[only_father] = father[only_father]
    expected_genotypes = expected_parents.reshape((1, -1))
    imputed_genotypes = imputed_par_gts.reshape((1, -1))
    covs = np.cov(expected_genotypes, imputed_genotypes)
    coef = covs[0, 1] / covs[1, 1]
    residual_var = np.var(expected_genotypes - coef * imputed_genotypes)
    s2 = residual_var / (number_of_snps * covs[1, 1])
    # TODO: should this be divided by number_of_snps * covs[1, 1] * number_of_families?
    z = (1 - coef) / np.sqrt(s2)
    q = norm.cdf(z)
    p_val = min(q, 1 - q)
    self.assertGreaterEqual(p_val, 0.01)
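# The assertion above is a calibration check: if imputation is unbiased, the
# regression of expected on imputed parental genotypes has slope ~1, so a
# slope far from 1 (small p-value) fails the test. The same check in isolation,
# as a sketch with assumed inputs (requires numpy as np and scipy.stats.norm,
# as in the test above):
def calibration_pvalue(expected, imputed, denom):
    covs = np.cov(expected, imputed)
    coef = covs[0, 1] / covs[1, 1]                  # regression slope
    resid_var = np.var(expected - coef * imputed)
    se = np.sqrt(resid_var / (denom * covs[1, 1]))  # approximate slope SE
    q = norm.cdf((1 - coef) / se)
    return min(q, 1 - q)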
def test_p_reader_bed_count_A1(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=True)
    snpdata = snpreader.read(force_python_only=True)
    snpdata.val = 2 - snpdata.val
    self.c_reader(snpdata)
def test_hdf5_case3(self):
    snpreader1 = SnpHdf5(self.currentFolder + "/examples/toydata.snpmajor.snp.hdf5")[::2, :]
    snpreader2 = Bed(self.currentFolder + "/examples/toydata", count_A1=False)[::2, :]
    self.assertTrue(np.allclose(snpreader1.read().val, snpreader2.read().val,
                                rtol=1e-05, atol=1e-05))
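# Companion sketch (standard pysnptools API): the .snp.hdf5 file compared above
# can be regenerated from the Bed data with SnpHdf5.write; the relative paths
# here are assumed, not taken from the test:
from pysnptools.snpreader import Bed, SnpHdf5

snpdata = Bed("examples/toydata", count_A1=False).read()
SnpHdf5.write("examples/toydata.snpmajor.snp.hdf5", snpdata)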
if args.indf is not None:
    assert args.e != 0, "Specify number of eigenvectors used to estimate allele frequencies!"

# Parse Beagle file
if args.plink is None:
    print("Parsing Beagle file")
    likeMatrix = pd.read_csv(str(args.beagle), sep="\t", engine="c", header=0,
                             usecols=range(3, 3 + 3 * args.n), dtype=np.float32,
                             compression="gzip")
    likeMatrix = likeMatrix.values.T  # .as_matrix() was removed in newer pandas
else:
    chunk_N = int(np.ceil(float(args.n) / args.threads))
    chunks = [i * chunk_N for i in range(args.threads)]
    print("Parsing PLINK files")
    from pysnptools.snpreader import Bed  # Import Microsoft Genomics PLINK reader
    snpClass = Bed(args.plink, count_A1=True)
    pos = np.copy(snpClass.sid)
    snpFile = snpClass.read(dtype=np.float32)  # Read PLINK files into memory
    f = np.nanmean(snpFile.val, axis=0, dtype=np.float64) / 2
    likeMatrix = np.zeros((3 * args.n, snpFile.val.shape[1]), dtype=np.float32)
    print("Converting PLINK files into genotype likelihood matrix")

    # Multithreading
    threads = [threading.Thread(target=convertPlink,
                                args=(likeMatrix, snpFile.val, chunk, chunk_N, args.epsilon))
               for chunk in chunks]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    del snpClass, snpFile


##### Estimate population allele frequencies #####
if args.plink is None:
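# The convertPlink worker is defined elsewhere in this program. A minimal
# single-threaded sketch of the conversion it appears to perform -- turning
# 0/1/2 hard calls into genotype likelihoods with error rate epsilon, and
# uniform likelihoods for missing calls. This is an assumption for
# illustration, not the original implementation:
def convert_plink_sketch(likeMatrix, geno, start, n_chunk, epsilon):
    # geno: (n, m) hard-call matrix with NaN for missing genotypes;
    # likeMatrix rows 3i, 3i+1, 3i+2 hold individual i's likelihoods
    # for true genotypes 0, 1, 2
    for i in range(start, min(start + n_chunk, geno.shape[0])):
        for g in range(3):
            likeMatrix[3 * i + g] = np.where(
                np.isnan(geno[i]), 1.0 / 3,
                np.where(geno[i] == g, 1.0 - epsilon, epsilon / 2))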