def _read_with_standardizing(self, to_kerneldata, kernel_standardizer=DiagKtoN(), return_trained=False):
    '''
    Reads a SnpKernel with two cases:
        If returning KernelData,
            just calls snpreader._read_kernel, packages the result, and applies kernel_standardizer.
        If returning a simple SnpKernel that needs no more standardization,
            reads the reference and learns both standardizations (but can't this cause multiple reads?)

    Note that snp_standardizer should be None; the SnpKernel should instead carry the placeholder value Standardizer().
    '''
    if to_kerneldata:
        val, snp_trained = self.snpreader._read_kernel(self.standardizer, block_size=self.block_size, return_trained=True)
        kernel = KernelData(iid=self.snpreader.iid, val=val, name=str(self))
        kernel, kernel_trained = kernel.standardize(kernel_standardizer, return_trained=True)
    else:
        snpdata, snp_trained = self.snpreader.read().standardize(self.standardizer, return_trained=True)
        snpdata, kernel_trained = snpdata.standardize(kernel_standardizer, return_trained=True)
        kernel = SnpKernel(snpdata, SS_Identity())

    if return_trained:
        return kernel, snp_trained, kernel_trained
    else:
        return kernel
def _read_with_standardizing(self, to_kerneldata, kernel_standardizer=DiagKtoN(), return_trained=False, num_threads=None):
    '''
    Reads a SnpKernel with two cases:
        If returning KernelData,
            just calls snpreader._read_kernel, packages the result, and applies kernel_standardizer.
        If returning a simple SnpKernel that needs no more standardization,
            reads the reference and learns both standardizations (but can't this cause multiple reads?)

    Note that snp_standardizer should be None; the SnpKernel should instead carry the placeholder value Standardizer().

    Will choose which array module to use based on the ARRAY_MODULE environment variable
    (e.g. 'numpy' (default) or 'cupy').
    '''
    logging.info("Starting '_read_with_standardizing'")
    xp = pstutil.array_module()
    if to_kerneldata:
        val, snp_trained = self.snpreader._read_kernel(self.standardizer, block_size=self.block_size,
                                                       return_trained=True, num_threads=num_threads)
        kernel = KernelData(iid=self.snpreader.iid, val=val, name=str(self), xp=xp)
        kernel, kernel_trained = kernel.standardize(kernel_standardizer, return_trained=True, num_threads=num_threads)
    else:
        snpdata, snp_trained = self.snpreader.read().standardize(self.standardizer, return_trained=True, num_threads=num_threads)
        snpdata, kernel_trained = snpdata.standardize(kernel_standardizer, return_trained=True, num_threads=num_threads)
        kernel = SnpKernel(snpdata, SS_Identity())

    logging.info("Ending '_read_with_standardizing'")
    if return_trained:
        return kernel, snp_trained, kernel_trained
    else:
        return kernel
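# Since the docstring above mentions the ARRAY_MODULE switch, here is a minimal
# usage sketch (an illustration under stated assumptions, not library
# documentation): pysnptools.util.array_module() is the helper the snippet
# above imports as pstutil.
import os
import numpy as np
import pysnptools.util as pstutil

os.environ["ARRAY_MODULE"] = "numpy"   # or "cupy", if CuPy and a GPU are available
xp = pstutil.array_module()            # returns the numpy module by default
val = xp.zeros((4, 4))                 # kernel buffers are then allocated with xp
assert isinstance(val, np.ndarray)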
def reducer_closure(result_result_sequence):
    logging.info("starting ata reducer")
    iid = [[value, value] for value in sid]
    gtg_data = KernelData(iid=iid, val=np.zeros((sid_count, sid_count)))

    for result_result in result_result_sequence:
        for piece_index, gtg_piece in result_result:
            logging.info("combining ata reducer {0}".format(piece_index))
            start = debatch_closure(piece_index)
            stop = debatch_closure(piece_index + 1)
            gtg_data.val[start:, start:stop] = gtg_piece
            gtg_data.val[start:stop, start + gtg_piece.shape[1]:] = gtg_piece[gtg_piece.shape[1]:, :].T

    result = writer(gtg_data)
    return result
def k_mix(self, K0, K1):  #!!!later add special case code for mixing==1 and 0
    K0_b = K0.read().standardize(self.kernel_trained0)
    K1_b = K1.read().standardize(self.kernel_trained1)
    # similar code elsewhere
    K = np.empty(K0_b.val.shape)
    _mix_from_Ks(K, K0_b.val, K1_b.val, self.mixing)
    K = KernelData(val=K, iid0=K0_b.iid0, iid1=K0_b.iid1)
    return K
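# _mix_from_Ks is not shown in these snippets. Judging from how mixed kernels
# are used elsewhere in this section (a convex combination of two standardized
# kernels), a plausible minimal sketch of its behavior is:
import numpy as np

def _mix_from_Ks_sketch(K, K0_val, K1_val, mixing):
    # Assumed behavior: fill K in place with (1 - mixing) * K0 + mixing * K1.
    K[:, :] = K0_val * (1.0 - mixing) + K1_val * mixing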
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None):
    """
    Method for predicting from a fitted :class:`FastLMM` predictor. If the examples in X,
    K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional: If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None. Represents the identity similarity matrix.
    :type K0_whole_test: None

    :param K1_whole_test: Must be None. Represents the identity similarity matrix.
    :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    assert K0_whole_test is None or isinstance(K0_whole_test, KernelIdentity)  # could also accept no snps
    assert K1_whole_test is None or isinstance(K1_whole_test, KernelIdentity)  # could also accept no snps

    X = _pheno_fixup(X, iid_if_none=iid_if_none)
    X = X.read().standardize(self.covar_unit_trained)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    X = SnpData(iid=X.iid,
                sid=FastLMM._new_snp_name(X),
                val=np.c_[X.read().val, np.ones((X.iid_count, 1))])
    assert np.array_equal(X.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
    ret0 = SnpData(iid=X.iid,
                   sid=self.pheno_sid,
                   val=pheno_predicted,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")  #!!!replace 'parent_string' with 'name'

    from pysnptools.kernelreader import KernelData
    ret1 = KernelData(iid=X.iid, val=np.eye(X.iid_count) * self.ssres / self.iid_count)
    return ret0, ret1
def write_kernel(self, path, filetype='hdf5'):
    """Write constructed background kernel :math:`K_0` to file, using either
    pysnptools.kernelreader.KernelHdf5 or pysnptools.kernelreader.KernelNpz.

    :param str path: Path to the output file to be created.
    :param str filetype: Either 'hdf5' or 'npz'
    """
    if self.K0 is None:
        if self.G0 is not None:
            raise ValueError(
                'G0 is initialized: Number of individuals < number of variants. In this case no kernel is constructed.')
        raise ValueError(
            'K0 is not initialized, need to call compute_background_kernel() first')
    elif filetype == 'hdf5':
        KernelHdf5.write(path, KernelData(self.iid_fid.values, val=self.K0))
    elif filetype == 'npz':
        KernelNpz.write(path, KernelData(self.iid_fid.values, val=self.K0))
    else:
        raise ValueError(
            'filetype has to be either "npz" or "hdf5", got {}'.format(filetype))
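# A minimal round-trip sketch of the write/read pattern this method relies on.
# The tiny iids and file name below are made up for illustration.
import numpy as np
from pysnptools.kernelreader import KernelData, KernelNpz

iid = [["fam0", "iid0"], ["fam0", "iid1"]]
kernel = KernelData(iid=iid, val=np.array([[1.0, 0.5], [0.5, 1.0]]))
KernelNpz.write("toy.kernel.npz", kernel)        # KernelHdf5.write follows the same pattern
round_trip = KernelNpz("toy.kernel.npz").read()  # read() returns a KernelData again
assert np.allclose(round_trip.val, kernel.val)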
def generate_and_analyze(seed, N, do_shuffle, just_testing=True, map_function=None, cache_folder=None):
    # Generate SNPs
    snpdata = snp_gen(fst=.1, dfr=0, iid_count=N, sid_count=1000, chr_count=10,
                      label_with_pop=True, seed=seed)
    K_causal = snpdata.read_kernel(Unit()).standardize()

    # Generate geo-spatial locations and K_loc
    distance_between_centers = 2500000
    x0 = distance_between_centers * 0.5
    x1 = distance_between_centers * 1.5
    y0 = distance_between_centers
    y1 = distance_between_centers
    sd = distance_between_centers / 4.

    spatial_iid = snpdata.iid
    center_dict = {"0": (x0, y0), "1": (x1, y1)}
    centers = np.array([center_dict[iid_item[0]] for iid_item in spatial_iid])
    np.random.seed(seed)
    logging.info("Generating positions for seed {0}".format(seed))
    spatial_coor = SnpData(iid=snpdata.iid,
                           sid=["x", "y"],
                           val=centers + np.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], size=len(centers)) * sd,
                           parent_string="'spatial_coor_gen_original'")

    alpha = distance_between_centers
    spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
    K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

    # Generate phenotype
    iid = K_causal.iid
    iid_count = K_causal.iid_count
    np.random.seed(seed)
    pheno_causal = SnpData(iid=iid, sid=["causal"],
                           val=np.random.multivariate_normal(np.zeros(iid_count), K_causal.val).reshape(-1, 1),
                           parent_string="causal")
    np.random.seed(seed ^ 998372)
    pheno_noise = SnpData(iid=iid, sid=["noise"],
                          val=np.random.normal(size=iid_count).reshape(-1, 1),
                          parent_string="noise")
    np.random.seed(seed ^ 12230302)
    pheno_loc_original = SnpData(iid=iid, sid=["loc_original"],
                                 val=np.random.multivariate_normal(np.zeros(iid_count), K_loc.val).reshape(-1, 1),
                                 parent_string="loc_original")

    if do_shuffle:
        idx = np.arange(iid_count)
        np.random.seed(seed)
        np.random.shuffle(idx)
        pheno_loc = pheno_loc_original.read(view_ok=True)  # don't need to copy, because the next line will be fresh memory
        pheno_loc.val = pheno_loc.val[idx, :]
    else:
        pheno_loc = pheno_loc_original

    pheno = SnpData(iid=iid, sid=["pheno_all"],
                    val=pheno_causal.val + pheno_noise.val + pheno_loc.val)

    # Analyze data
    alpha_list = [int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)]
    dataframe = heritability_spatial_correction(
        snpdata,
        spatial_coor.val, spatial_iid,
        alpha_list=[alpha] if just_testing else alpha_list,
        pheno=pheno,
        alpha_power=2,
        jackknife_count=0, permute_plus_count=0, permute_times_count=0,
        just_testing=just_testing,
        map_function=map_function,
        cache_folder=cache_folder)

    logging.info(dataframe)
    return dataframe
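# spatial_similarity is called throughout these snippets but never defined
# here. A plausible minimal sketch, assuming it builds a radial kernel
# exp(-(distance/alpha)**power) from pairwise Euclidean distances (the exact
# form lives in fastlmm's heritability_spatial_correction module):
import numpy as np

def spatial_similarity_sketch(spatial_coor, alpha, power):
    coor = np.asarray(spatial_coor, dtype=float)
    diff = coor[:, None, :] - coor[None, :, :]    # pairwise coordinate differences
    distance = np.sqrt((diff ** 2).sum(axis=-1))  # Euclidean distance matrix
    return np.exp(-(distance / alpha) ** power)   # similarity decays with distance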
def work_item2(pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
               xxx_todo_changeme, xxx_todo_changeme1, xxx_todo_changeme2,
               just_testing, do_uncorr, do_gxe2, a2):

    #########################################
    # Load GPS info from filename if that's the way it is given
    #########################################
    (jackknife_index, jackknife_count, jackknife_seed) = xxx_todo_changeme
    (permute_plus_index, permute_plus_count, permute_plus_seed) = xxx_todo_changeme1
    (permute_times_index, permute_times_count, permute_times_seed) = xxx_todo_changeme2
    if isinstance(spatial_coor, str):
        assert spatial_iid is None, "if spatial_coor is a str, then spatial_iid should be None"
        gps_table = pd.read_csv(spatial_coor, delimiter=" ").dropna()
        spatial_iid = np.array([(v, v) for v in gps_table["id"].values])
        spatial_coor = gps_table[["south_new", "east_new"]].values

    #########################################
    # Remove any missing values from pheno
    #########################################
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.val[:, 0], :]  # excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = model_selection.KFold(n_splits=jackknife_count, shuffle=True,
                                       random_state=jackknife_seed % 4294967295).split(list(range(G_kernel.iid_count)))
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        # We shuffle the val, but not the iid, because shuffling both would cancel out.
        # Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(iid=E_kernel.iid, val=E_kernel_temp.val,
                              name="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()        # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    #########################################
    # Find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################
    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################
    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
            h2corr_raw = h2
        else:
            h2corr, e2, a2, nLLcorr, h2corr_raw = 0, 0, .5, 0, 0
        logging.info("G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3} (h2corr_raw: {4})"
                     .format(h2corr, e2, a2, nLLcorr, h2corr_raw))
    else:
        h2corr, e2, nLLcorr, h2corr_raw = np.nan, np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################
    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        # Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid, val=val,
                                   name="{0} G + {1} E".format(1 - a2, a2))
        # Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create the GxE kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")
        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed((permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)
        GxE_kernel = KernelData(iid=G_kernel.iid, val=val, name="GxE")  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info("G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################
    ret = {"h2uncorr": h2uncorr, "nLLuncorr": nLLuncorr,
           "h2corr": h2corr, "h2corr_raw": h2corr_raw, "e2": e2, "a2": a2, "nLLcorr": nLLcorr,
           "gxe2": gxe2, "a2_gxe2": a2_gxe2, "nLL_gxe2": nLL_gxe2,
           "alpha": alpha, "alpha_power": alpha_power,
           "phen": np.array(pheno.sid, dtype='str')[0],
           "jackknife_index": jackknife_index, "jackknife_count": jackknife_count, "jackknife_seed": jackknife_seed,
           "permute_plus_index": permute_plus_index, "permute_plus_count": permute_plus_count, "permute_plus_seed": permute_plus_seed,
           "permute_times_index": permute_times_index, "permute_times_count": permute_times_count, "permute_times_seed": permute_times_seed}

    logging.info("run_line: {0}".format(ret))
    return ret
color_dict = {"0": "r", "1": "b", "2": "g"} colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]] plt.axis('equal') plt.scatter(spatial_coor_gen_original.val[:, 0], spatial_coor_gen_original.val[:, 1], c=colors) plt.show() from fastlmm.association.heritability_spatial_correction import spatial_similarity from pysnptools.kernelreader import KernelData alpha = distance_between_centers spatial_val = spatial_similarity(spatial_coor_gen_original.val, alpha, power=2) K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize() if do_plot: pylab.suptitle("$K_{loc}$") pylab.imshow(K_loc.val, cmap=pylab.gray(), vmin=0, vmax=1) pylab.show() from pysnptools.snpreader import SnpData iid = K_causal.iid iid_count = K_causal.iid_count np.random.seed(seed) pheno_causal = SnpData(iid=iid, sid=["causal"], val=np.random.multivariate_normal( np.zeros(iid_count),
def combine_the_best_way(K0, K1, covar, y, mixing, h2,
                         force_full_rank=False, force_low_rank=False,
                         snp_standardizer=None, kernel_standardizer=None, block_size=None):
    from pysnptools.kernelstandardizer import Identity as KS_Identity

    assert K0.iid0 is K0.iid1, "Expect K0 to be square"
    assert K1.iid0 is K1.iid1, "Expect K1 to be square"
    assert K0 is not None
    assert K1 is not None
    assert np.array_equal(K0.iid, K1.iid), "Expect K0 and K1 to have matching iids"
    assert kernel_standardizer is not None, "expect values for kernel_standardizer"

    mixer = _Mixer(False, KS_Identity(), KS_Identity(), mixing)

    sid_count_0 = _Mixer.sid_counter(K0, force_full_rank, force_low_rank)
    sid_count_1 = _Mixer.sid_counter(K1, force_full_rank, force_low_rank)

    #################################
    # Both Identity (or not given)
    #################################
    if sid_count_0 + sid_count_1 == 0:
        h2 = h2 or 0
        mixer.mixing = mixer.mixing or 0
        K = K0.read()  # would be nice to use LinearRegression or low-rank with 0 snps
    #################################
    # Low rank
    #################################
    elif sid_count_0 + sid_count_1 < K0.iid_count or force_low_rank:
        mixer.do_g = True
        #!!!there is no need for block_size here because we want G0 in full. But if starting with SNPs and not low-rank then batches are needed and the two standardizers must be remembered for use later
        if sid_count_0 > 0:
            K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(
                to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)
        if sid_count_1 > 0:
            K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(
                to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)
        if sid_count_1 == 0:
            mixer.mixing = mixer.mixing or 0
            K = K0
        elif sid_count_0 == 0:
            mixer.mixing = mixer.mixing or 1
            K = K1
        else:
            if mixer.do_g:
                G = np.empty((K0.iid_count, K0.sid_count + K1.sid_count))
                if mixer.mixing is None:
                    mixer.mixing, h2 = _find_mixing_from_Gs(G, covar, K0.snpreader.val, K1.snpreader.val, h2, y)
                if mixer.mixing == 0:
                    K = K0
                elif mixer.mixing == 1:
                    K = K1
                else:
                    _mix_from_Gs(G, K0.snpreader.val, K1.snpreader.val, mixer.mixing)
                    G = SnpData(iid=K0.iid,
                                sid=["K0_{0}".format(i) for i in xrange(K0.sid_count)] + ["K1_{0}".format(i) for i in xrange(K1.sid_count)],  # rename the sids so that they can't collide
                                val=G,
                                name="{0}&{1}".format(K0.snpreader, K1.snpreader),
                                pos=np.concatenate((K0.pos, K1.pos), axis=0))
                    K = SnpKernel(G, SS_Identity(), block_size=block_size)
    #################################
    # Full rank
    #################################
    else:
        mixer.do_g = False
        if sid_count_0 > 0:  #!!!but what if we have SNP data but still need to remember the standardizer?
            K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(
                to_kerneldata=True, return_trained=True)  #!!!pass in a new argument, the kernel_standardizer(???)
        if sid_count_1 > 0:
            K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(
                to_kerneldata=True, return_trained=True)
        if sid_count_1 == 0:
            mixer.mixing = mixer.mixing or 0
            K = K0
        elif sid_count_0 == 0:
            mixer.mixing = mixer.mixing or 1
            K = K1
        else:
            K = np.empty(K0.val.shape)
            if mixer.mixing is None:
                mixer.mixing, h2 = _find_mixing_from_Ks(K, covar, K0.val, K1.val, h2, y)
            _mix_from_Ks(K, K0.val, K1.val, mixer.mixing)
            assert K.shape[0] == K.shape[1] and abs(np.diag(K).sum() - K.shape[0]) < 1e-7, "Expect mixed K to be standardized"
            K = KernelData(val=K, iid=K0.iid)

    return K, h2, mixer
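# _mix_from_Gs and _find_mixing_from_Gs are not reproduced here. Because the
# mixed kernel must satisfy K = (1 - mixing) * G0 @ G0.T + mixing * G1 @ G1.T,
# the mixed feature matrix presumably scales each SNP block by the square root
# of its weight. A minimal sketch under that assumption:
import numpy as np

def _mix_from_Gs_sketch(G, G0_val, G1_val, mixing):
    # Fill G so that G @ G.T equals the weighted sum of the two kernels.
    G[:, :G0_val.shape[1]] = G0_val * np.sqrt(1.0 - mixing)
    G[:, G0_val.shape[1]:] = G1_val * np.sqrt(mixing)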
def work_item(arg_tuple):
    (pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,  # The main inputs
     (jackknife_index, jackknife_count, jackknife_seed),              # Jackknifing and permutation inputs
     (permute_plus_index, permute_plus_count, permute_plus_seed),
     (permute_times_index, permute_times_count, permute_times_seed),
     just_testing, do_uncorr, do_gxe2, a2) = arg_tuple                # Shortcutting work

    #########################################
    # Remove any missing values from pheno
    #########################################
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.val[:, 0], :]  # excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = cross_validation.KFold(n=G_kernel.iid_count, n_folds=jackknife_count,
                                        shuffle=True, random_state=jackknife_seed % 4294967295)
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        # We shuffle the val, but not the iid, because shuffling both would cancel out.
        # Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(iid=E_kernel.iid, val=E_kernel_temp.val,
                              name="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()        # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    #########################################
    # Find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################
    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################
    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
        else:
            h2corr, e2, a2, nLLcorr = 0, 0, .5, 0
        logging.info("G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3}".format(h2corr, e2, a2, nLLcorr))
    else:
        h2corr, e2, nLLcorr = np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################
    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        # Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid, val=val, name="{0} G + {1} E".format(1 - a2, a2))
        # Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create the GxE kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")
        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed((permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)
        GxE_kernel = KernelData(iid=G_kernel.iid, val=val, name="GxE")  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info("G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################
    ret = {"h2uncorr": h2uncorr, "nLLuncorr": nLLuncorr,
           "h2corr": h2corr, "e2": e2, "a2": a2, "nLLcorr": nLLcorr,
           "gxe2": gxe2, "a2_gxe2": a2_gxe2, "nLL_gxe2": nLL_gxe2,
           "alpha": alpha, "alpha_power": alpha_power,
           "phen": pheno.sid[0],
           "jackknife_index": jackknife_index, "jackknife_count": jackknife_count, "jackknife_seed": jackknife_seed,
           "permute_plus_index": permute_plus_index, "permute_plus_count": permute_plus_count, "permute_plus_seed": permute_plus_seed,
           "permute_times_index": permute_times_index, "permute_times_count": permute_times_count, "permute_times_seed": permute_times_seed}

    logging.info("run_line: {0}".format(ret))
    return ret
def read(self, order='F', dtype=np.float64, force_python_only=False, view_ok=False, num_threads=None):
    """Reads the kernel values and returns a :class:`.KernelData` (with :attr:`KernelData.val` property containing a new ndarray of the kernel values).

    :param order: {'F' (default), 'C', 'A'}, optional -- Specify the order of the ndarray. If order is 'F' (default),
        then the array will be in F-contiguous order (iid0-index varies the fastest). If order is 'C', then the
        returned array will be in C-contiguous order (iid1-index varies the fastest). If order is 'A', then the
        :attr:`KernelData.val` ndarray may be in any order (either C-, Fortran-contiguous, or even discontiguous).
    :type order: string or None

    :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`KernelData.val` ndarray.
    :type dtype: data-type

    :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that
        the read be done without outside library code.
    :type force_python_only: bool

    :param view_ok: optional -- If False (default), allocates new memory for the :attr:`KernelData.val`'s ndarray.
        If True, if practical and reading from a :class:`KernelData`, will return a new :class:`KernelData` with an
        ndarray that shares memory with the original :class:`KernelData`. Typically, you'll also wish to use
        "order='A'" to increase the chance that sharing will be possible. Use these parameters with care because any
        change to either ndarray (for example, via :meth:`.KernelData.standardize`) will affect the other. Also keep
        in mind that :meth:`read` relies on ndarray's mechanisms to decide whether to actually share memory and so it
        may ignore your suggestion and allocate a new ndarray anyway.
    :type view_ok: bool

    :param num_threads: optional -- The number of threads with which to standardize data. Defaults to all available
        processors. Can also be set with these environment variables (listed in priority order):
        'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
    :type num_threads: None or int

    :rtype: :class:`.KernelData`

    Calling the method again causes the kernel values to be re-read and creates a new in-memory :class:`.KernelData` with a new ndarray of kernel values.

    If you request the values for only a subset of the sids or iids, (to the degree practical) only that subset will be read from disk.

    :Example:

    >>> from pysnptools.kernelreader import KernelNpz
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> npz_file = example_file('pysnptools/examples/toydata.kernel.npz')
    >>> kernel_on_disk = KernelNpz(npz_file)
    >>> kerneldata1 = kernel_on_disk.read() # Read all the kernel data returning a KernelData instance
    >>> print(type(kerneldata1.val).__name__) # The KernelData instance contains an ndarray of the data.
    ndarray
    >>> subset_kerneldata = kernel_on_disk[::2].read() # From the disk, read kernel values for every other iid
    >>> print('{0:.6f}'.format(subset_kerneldata.val[0,0])) # Print the first kernel value in the subset
    9923.069928
    >>> subsub_kerneldata = subset_kerneldata[:10].read(order='A',view_ok=True) # Create an in-memory subset of the subset with kernel values for the first ten iids. Share memory if practical.
    >>> import numpy as np
    >>> # print(np.may_share_memory(subset_kerneldata.val, subsub_kerneldata.val)) # Do the two ndarrays share memory? They could. Currently they won't.
    """
    dtype = np.dtype(dtype)
    val = self._read(None, None, order, dtype, force_python_only, view_ok, num_threads)
    from pysnptools.kernelreader import KernelData
    ret = KernelData(iid0=self.iid0, iid1=self.iid1, val=val, name=str(self))
    return ret
def work_item(arg_tuple):
    (pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,  # The main inputs
     (jackknife_index, jackknife_count, jackknife_seed),              # Jackknifing and permutation inputs
     (permute_plus_index, permute_plus_count, permute_plus_seed),
     (permute_times_index, permute_times_count, permute_times_seed),
     just_testing, do_uncorr, do_gxe2, a2) = arg_tuple                # Shortcutting work

    #########################################
    # Remove any missing values from pheno
    #########################################
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.val[:, 0], :]  # excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = cross_validation.KFold(n=G_kernel.iid_count, n_folds=jackknife_count,
                                        shuffle=True, random_state=jackknife_seed % 4294967295)
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        # We shuffle the val, but not the iid, because shuffling both would cancel out.
        # Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(iid=E_kernel.iid, val=E_kernel_temp.val,
                              parent_string="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()        # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    #########################################
    # Find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################
    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################
    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
        else:
            h2corr, e2, a2, nLLcorr = 0, 0, .5, 0
        logging.info("G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3}".format(h2corr, e2, a2, nLLcorr))
    else:
        h2corr, e2, nLLcorr = np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################
    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        # Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid, val=val,
                                   parent_string="{0} G + {1} E".format(1 - a2, a2))
        # Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create the GxE kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")
        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed((permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)
        GxE_kernel = KernelData(iid=G_kernel.iid, val=val, parent_string="GxE")  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info("G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################
    ret = {"h2uncorr": h2uncorr, "nLLuncorr": nLLuncorr,
           "h2corr": h2corr, "e2": e2, "a2": a2, "nLLcorr": nLLcorr,
           "gxe2": gxe2, "a2_gxe2": a2_gxe2, "nLL_gxe2": nLL_gxe2,
           "alpha": alpha, "alpha_power": alpha_power,
           "phen": pheno.sid[0],
           "jackknife_index": jackknife_index, "jackknife_count": jackknife_count, "jackknife_seed": jackknife_seed,
           "permute_plus_index": permute_plus_index, "permute_plus_count": permute_plus_count, "permute_plus_seed": permute_plus_seed,
           "permute_times_index": permute_times_index, "permute_times_count": permute_times_count, "permute_times_seed": permute_times_seed}

    logging.info("run_line: {0}".format(ret))
    return ret
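# The _nth helper that picks out one jackknife fold is not shown in these
# snippets. A minimal sketch, assuming it simply returns the n-th item of an
# iterable (here, the (train_index, test_index) pair of the n-th KFold split):
from itertools import islice

def _nth_sketch(iterable, n):
    return next(islice(iter(iterable), n, None))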
def test_lmm(self):
    do_plot = False
    iid_count = 500
    seed = 0

    import pylab
    logging.info("TestLmmTrain test_lmm")

    iid = [["cid{0}P{1}".format(iid_index, iid_index // 250)] * 2 for iid_index in xrange(iid_count)]
    train_idx = np.r_[10:iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]           # the first 10 iids

    # Every person is 100% related to everyone in one of 5 families
    K0a = KernelData(iid=iid, val=np.empty([iid_count, iid_count]), name="related by distance")
    for iid_index0 in xrange(iid_count):
        for iid_index1 in xrange(iid_count):
            K0a.val[iid_index0, iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
            if iid_index1 < iid_index0:
                assert K0a.val[iid_index0, iid_index1] == K0a.val[iid_index1, iid_index0]

    # Every person lives on a line from 0 to 1.
    # They are related to every other person as a function of distance on the line.
    np.random.seed(seed)
    home = np.random.random([iid_count])
    K0b = KernelData(iid=iid, val=np.empty([iid_count, iid_count]), name="related by distance")
    for iid_index in xrange(iid_count):
        K0b.val[iid_index, :] = 1 - np.abs(home - home[iid_index]) ** .1

    # Make covar just the numbers 0, 1, ...
    covar = SnpData(iid=iid, sid=["x"], val=np.array([[float(num)] for num in xrange(iid_count)]))
    covariate_train = covar[train_idx, :].read()
    covariate_test = covar[test_idx, :].read()

    for name, h2, K0 in [("clones", 1, K0a), ("line_world", .75, K0b)]:
        sigma2x = 100
        varg = sigma2x * h2
        vare = sigma2x * (1 - h2)

        #######################################################################
        # make pheno
        #     pheno = 2*covar + 100 + normal(0,1)*2.5 + normal(0,K)*7.5
        #######################################################################
        # random.multivariate_normal is sensitive to mkl_num_thread, so we control it.
        if 'MKL_NUM_THREADS' in os.environ:
            mkl_num_thread = os.environ['MKL_NUM_THREADS']
        else:
            mkl_num_thread = None
        os.environ['MKL_NUM_THREADS'] = '1'
        np.random.seed(seed)
        p1 = covar.val * 2.0 + 100
        p2 = np.random.normal(size=covar.val.shape) * np.sqrt(vare)
        p3 = (np.random.multivariate_normal(np.zeros(iid_count), K0.val) * np.sqrt(varg)).reshape(-1, 1)
        if mkl_num_thread is not None:
            os.environ['MKL_NUM_THREADS'] = mkl_num_thread
        else:
            del os.environ['MKL_NUM_THREADS']
        pheno = SnpData(iid=iid, sid=["pheno0"], val=p1 + p2 + p3)

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            # Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
            pylab.suptitle(name + ": Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  # weights
        r2 = lsqSol[1]  # sum of squared residuals
        D = lsqSol[2]   # rank of design matrix
        N = pheno_train.iid_count

        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  # REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle(name + ": real linear regression: actual to prediction")
            pylab.show()

        for factor in [1, 100, .02]:
            K0 = K0.read()
            K0.val *= factor

            K0_train = K0[train_idx]
            K0_whole_test = K0[:, test_idx]

            # Learn model, save, load
            fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
            v2 = np.var(p2)
            v3 = np.var(p3)
            logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3 / (v2 + v3), fastlmmx.h2raw))

            filename = self.tempout_dir + "/model_lmm.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(fastlmmx, filename)
            fastlmm = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                # Predict with model (test on train)
                predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train, count_A1=False)  # test on train
                output_file = self.file_name("lmma_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(iid=covar_pheno.row, sid=covar_pheno.col[:, 1], val=covar_pheno.val)  # kludge to write kernel to text format
                output_file = self.file_name("lmma.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar_pheno.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                    pylab.suptitle(name + ": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()
                self.compare_files(predicted_pheno, "lmma_" + name)
                self.compare_files(covar2, "lmma.cov_" + name)

                predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:, 0], X=covariate_train[0, :], count_A1=False)  # test on train #0
                assert np.abs(predicted_pheno0.val[0, 0] - predicted_pheno.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_pheno0.val[0, 0] - covar_pheno.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

            # Predict with model (test on test)
            predicted_phenoB, covar_phenoB = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test, count_A1=False)  # test on test
            output_file = self.file_name("lmmb_" + name)
            Dat.write(output_file, predicted_phenoB)
            covar2 = SnpData(iid=covar_phenoB.row, sid=covar_phenoB.col[:, 1], val=covar_phenoB.val)  # kludge to write kernel to text format
            output_file = self.file_name("lmmb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar_phenoB.val))
            predicted = predicted_phenoB.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
                pylab.suptitle(name + ": test on test: test X to true target (green) and prediction (red)")
                pylab.show()
            self.compare_files(predicted_phenoB, "lmmb_" + name)
            self.compare_files(covar2, "lmmb.cov_" + name)

            predicted_phenoB0, covar_phenoB0 = fastlmm.predict(K0_whole_test=K0_whole_test[:, 0], X=covariate_test[0, :], count_A1=False)  # test on a single test case
            assert np.abs(predicted_phenoB0.val[0, 0] - predicted_phenoB.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
            assert np.abs(covar_phenoB0.val[0, 0] - covar_phenoB.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

            # Predict with model on a mix of train and test cases
            some_idx = range(covar.iid_count)
            some_idx.remove(train_idx[0])
            some_idx.remove(test_idx[0])
            covariate_some = covar[some_idx, :]
            K0_whole_some = K0[:, some_idx]
            predicted_phenoC, covar_phenoC = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some, count_A1=False)
            for idxC, iidC in enumerate(predicted_phenoC.iid):
                meanC = predicted_phenoC.val[idxC]
                varC = covar_phenoC.val[idxC, idxC]
                if iidC in predicted_pheno.iid:
                    predicted_pheno_ref = predicted_pheno
                    covar_pheno_ref = covar_pheno
                else:
                    assert iidC in predicted_phenoB.iid
                    predicted_pheno_ref = predicted_phenoB
                    covar_pheno_ref = covar_phenoB
                idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                mean_ref = predicted_pheno_ref.val[idx_ref]
                var_ref = covar_pheno_ref.val[idx_ref, idx_ref]
                assert np.abs(meanC - mean_ref) < 1e-6
                assert np.abs(varC - var_ref) < 1e-6
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Method for predicting from a fitted :class:`FastLMM` predictor. If the examples in X,
    K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional: If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as
        `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
        `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_whole_test: Must be None. Represents the identity similarity matrix.
    :type K0_whole_test: None

    :param K1_whole_test: Must be None. Represents the identity similarity matrix.
    :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
        alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the
        default will change to True.
    :type count_A1: bool

    :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        assert self.is_fitted, "Can only predict after predictor has been fitted"
        assert K0_whole_test is None or isinstance(K0_whole_test, KernelIdentity)  # could also accept no snps
        assert K1_whole_test is None or isinstance(K1_whole_test, KernelIdentity)  # could also accept no snps

        X = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
        X = X.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.read().val, np.ones((X.iid_count, 1))])
        assert np.array_equal(X.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

        pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
        ret0 = SnpData(iid=X.iid,
                       sid=self.pheno_sid,
                       val=pheno_predicted,
                       pos=np.array([[np.nan, np.nan, np.nan]]),
                       name="linear regression Prediction")  #!!!replace 'parent_string' with 'name'

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=X.iid, val=np.eye(X.iid_count) * self.ssres / self.iid_count)
        return ret0, ret1
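# A usage sketch for this identity-kernel (pure linear regression) variant
# (hypothetical names: `predictor` stands for a fitted instance of the class
# this method belongs to, covar_test for a real SnpData of test covariates).
# K0_whole_test / K1_whole_test stay None here, as the asserts above require.
# predict returns a mean SnpData plus a covariance KernelData; the covariance
# is a scaled identity because this model assumes i.i.d. noise.
mean, covariance = predictor.predict(X=covar_test)
print(mean.val[:, 0])           # fitted mean: X @ beta per test iid
print(np.diag(covariance.val))  # constant noise variance: ssres / iid_count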
iids = []
for i in range(len(tmp)):
    iids.append(re.split(r' +', tmp[i]))

Gsm = np.genfromtxt('Raw_Data/freeze2.common.rel.mat', skip_header=True)
# print Gsm
Gsm = Gsm[:, 2:207]

# Gsm = LMM(K=Gsm)
# Gsm.setSU_fromK()
# Gsm.K
# np.savetxt('../Inputs/NCSU_GSM_U.txt', Gsm.U)
# np.savetxt('../Inputs/NCSU_GSM_S.txt', Gsm.S)

my_kernel = KernelData(iid=iids, val=Gsm.tolist())

# Now, performing GWAS (single_snp) using the official GSM as the kernel:
results_df = single_snp(VARIANTS_TO_TEST, PHENOTYPE_DATA,
                        K0=my_kernel,
                        leave_out_one_chrom=False,
                        save_test_statistic=True,
                        output_file_name='Outputs/' + OUTPUT_NAME + '_Original.txt')

print 'Total Time (s): ' + str(time.clock() - start)

# In[5]:
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Method for predicting from a fitted :class:`FastLMM` predictor. If the examples in X,
    K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional: If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as
        `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
        `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
        the test SNPs needed to construct such a similarity matrix. Can be any
        `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the
        name of a PLINK-formated Bed file. Can be PySnpTools
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it
        can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
    :type K0_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

    :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
        the test SNPs needed to construct such a similarity matrix. Can be any
        `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the
        name of a PLINK-formated Bed file. Can be PySnpTools
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it
        can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
    :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        assert self.is_fitted, "Can only predict after predictor has been fitted"
        # assert K0_whole_test is not None, "K0_whole_test must be given"
        #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
        #!!!later all _kernel_fixup's should use block_size input
        K0_whole_test_b = _kernel_fixup(K0_whole_test,
                                        train_snps=self.G0_train,
                                        iid_if_none=iid_if_none,
                                        standardizer=self.mixer.snp_trained0,
                                        test=K0_whole_test,
                                        test_iid_if_none=None,
                                        block_size=self.block_size,
                                        count_A1=count_A1)
        K1_whole_test = _kernel_fixup(K1_whole_test,
                                      train_snps=self.G1_train,
                                      iid_if_none=K0_whole_test_b.iid0,
                                      standardizer=self.mixer.snp_trained1,
                                      test=K1_whole_test,
                                      test_iid_if_none=K0_whole_test_b.iid1,
                                      block_size=self.block_size,
                                      count_A1=count_A1)
        X = _pheno_fixup(X, iid_if_none=K0_whole_test_b.iid1, count_A1=count_A1)
        K0_whole_test_c, K1_whole_test, X = intersect_apply(
            [K0_whole_test_b, K1_whole_test, X], intersect_before_standardize=True, is_test=True)
        X = X.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=self._new_snp_name(X),
                    val=np.c_[X.read().val, np.ones((X.iid_count, 1))])
        assert np.array_equal(X.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

        train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
        K0_train_test = K0_whole_test_c[train_idx0, :]
        train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
        K1_train_test = K1_whole_test[train_idx1, :]
        test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
        K0_test_test = K0_whole_test_c[test_idx0, :]
        if K0_test_test.iid0 is not K0_test_test.iid1:
            raise Exception("real assert")
        test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
        K1_test_test = K1_whole_test[test_idx1, :]

        if self.mixer.do_g:
            ###################################################
            # low rank from Rasmussen eq 2.9 + noise term added to covar
            ###################################################
            Gstar = self.mixer.g_mix(K0_train_test, K1_train_test)
            varg = self.h2raw * self.sigma2
            vare = (1. - self.h2raw) * self.sigma2
            Ainv = LA.inv((1. / vare) * np.dot(self.G.T, self.G) + (1. / varg) * np.eye(self.G.shape[1]))
            testAinv = np.dot(Gstar.test.val, Ainv)
            pheno_predicted = np.dot(X.val, self.beta) + (1. / vare) * np.dot(np.dot(testAinv, self.G.T), self.y - np.dot(self.X, self.beta))
            pheno_predicted = pheno_predicted.reshape(-1, 1)
            covar = np.dot(testAinv, Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])
        else:
            lmm = LMM()
            lmm.U = self.U
            lmm.S = self.S
            lmm.G = self.G
            lmm.y = self.y
            lmm.Uy = self.Uy
            lmm.X = self.X
            lmm.UX = self.UX

            Kstar = self.mixer.k_mix(K0_train_test, K1_train_test)  #!!!later do we need/want reads here? how about view_ok?
            lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

            Kstar_star = self.mixer.k_mix(K0_test_test, K1_test_test)  #!!!later do we need/want reads here? how about view_ok?
            pheno_predicted, covar = lmm.predict_mean_and_variance(
                beta=self.beta, h2=self.h2raw, sigma2=self.sigma2, Kstar_star=Kstar_star.val)
            # pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2, scale=self.sigma2).reshape(-1, 1)

        ret0 = SnpData(iid=X.iid,
                       sid=self.pheno_sid,
                       val=pheno_predicted,
                       pos=np.array([[np.nan, np.nan, np.nan]]),
                       name="lmm Prediction")

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=K0_test_test.iid, val=covar)
        return ret0, ret1
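# For reference, the "Rasmussen eq 2.9" comment in the low-rank branch refers
# to the weight-space Gaussian-process predictive equations (Rasmussen &
# Williams, "Gaussian Processes for Machine Learning"). The mapping of symbols
# to variables below is our reading of the code, not library documentation,
# with \sigma_g^2 = h2raw * sigma2 (varg), \sigma_e^2 = (1 - h2raw) * sigma2
# (vare), and G_* = Gstar.test.val:
#
#   A        = \sigma_e^{-2} G^\top G + \sigma_g^{-2} I               (code: Ainv = LA.inv(...))
#   \mu_*    = X_* \beta + \sigma_e^{-2} G_* A^{-1} G^\top (y - X\beta)
#   \Sigma_* = G_* A^{-1} G_*^\top + \sigma_e^{2} I                   (the "noise term added to covar")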