def test_linreg(self):
    '''
    Check that single_snp (identity G0, mixing=0) and single_snp_linreg
    both reproduce the "linreg" reference on the first 10 SNPs.
    '''
    logging.info("TestSingleSnp test_linreg")
    bed = Bed(self.bedbase)

    # Arguments common to both calls below.
    shared_args = dict(
        pheno=self.phen_fn,
        covar=self.cov_fn,
        output_file_name=self.file_name("linreg"),
    )

    # Only these columns of the single_snp result are compared.
    kept_columns = ['sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue']

    lmm_frame = single_snp(test_snps=bed[:, :10],
                           mixing=0,
                           leave_out_one_chrom=False,
                           G0=KernelIdentity(iid=bed.iid),
                           **shared_args)
    self.compare_files(lmm_frame[kept_columns], "linreg")

    linreg_frame = single_snp_linreg(test_snps=bed[:, :10], **shared_args)
    self.compare_files(linreg_frame, "linreg")
def test_G0_has_reader(self):
    '''
    Pass the full SNP reader once as G0 (mixing=0) and once as G1
    (identity G0, mixing=1); both results are compared with reference "one".
    '''
    logging.info("TestSingleSnp test_G0_has_reader")
    bed = Bed(self.bedbase)

    # Arguments common to both single_snp calls.
    shared_args = dict(
        pheno=self.phen_fn,
        covar=self.cov_fn,
        output_file_name=self.file_name("G0_has_reader"),
        leave_out_one_chrom=False,
    )

    # Reader supplied as G0, with mixing=0.
    reader_as_g0 = single_snp(test_snps=bed[:, :10],
                              G0=bed,
                              mixing=0,
                              **shared_args)
    self.compare_files(reader_as_g0, "one")

    # Reader supplied as G1, G0 is the identity, with mixing=1.
    reader_as_g1 = single_snp(test_snps=bed[:, :10],
                              G0=KernelIdentity(bed.iid),
                              G1=bed,
                              mixing=1,
                              **shared_args)
    self.compare_files(reader_as_g1, "one")
def _internal_determine_block_size(K0, K1, mixing, force_full_rank, force_low_rank): assert not (force_full_rank and force_low_rank), "real assert" if isinstance(K0, SnpKernel) and K0.snpreader.sid_count == 0: K0 = KernelIdentity(K0.iid) if isinstance(K1, SnpKernel) and K1.snpreader.sid_count == 0: K1 = KernelIdentity(K1.iid) ########################## # A special case: both kernels are the Identity so just return the first one ########################## if isinstance(K0, KernelIdentity) and isinstance(K1, KernelIdentity): return K0.iid_count ########################## # Special cases: mixing says to use just one kernel or the other kernel is just identity, so just return one kernel ########################## if mixing == 0.0 or isinstance(K1, KernelIdentity): if isinstance(K0, SnpKernel) and not force_full_rank and ( force_low_rank or K0.snpreader.sid_count < K0.iid_count): return K0.snpreader.sid_count else: return K0.iid_count if mixing == 1.0 or isinstance(K0, KernelIdentity): if isinstance(K1, SnpKernel) and not force_full_rank and ( force_low_rank or K1.snpreader.sid_count < K1.iid_count): return K1.snpreader.sid_count else: return K1.iid_count ########################## # A special case: Treat the kernels as collections of snps (i.e. low-rank) ########################## if (isinstance(K0, SnpKernel) and isinstance(K1, SnpKernel) and not force_full_rank and (force_low_rank or K0.snpreader.sid_count + K1.snpreader.sid_count < K0.iid_count)): return K0.snpreader.sid_count + K1.snpreader.sid_count ########################## # The most general case, treat the new kernels as kernels (i.e.. full rank) ########################## return K0.iid_count
def _K_per_chrom(K, chrom, iid, count_A1=None):
    '''
    Build the kernel to use for chromosome `chrom`.

    With K of None the result is KernelIdentity(iid). Otherwise K is passed
    through _kernel_fixup (with a Unit standardizer) and, if that yields a
    SnpKernel, a new SnpKernel is returned whose SNPs are those whose
    pos[:, 0] value differs from `chrom`.

    :raises Exception: if the fixed-up kernel is not a SnpKernel
    '''
    if K is None:
        return KernelIdentity(iid)

    fixed_kernel = _kernel_fixup(K,
                                 iid_if_none=iid,
                                 standardizer=Unit(),
                                 count_A1=count_A1)
    if not isinstance(fixed_kernel, SnpKernel):
        raise Exception(
            "Don't know how to make '{0}' work per chrom".format(fixed_kernel))

    # Keep every SNP that is not on the requested chromosome.
    off_chrom = fixed_kernel.pos[:, 0] != chrom
    return SnpKernel(fixed_kernel.snpreader[:, off_chrom],
                     fixed_kernel.standardizer)
def test_none(self):
    '''
    Run single_snp on the first 10 SNPs with K0 set to an identity kernel
    and mixing=0, and compare the result with reference "none".
    '''
    logging.info("TestSingleSnp test_none")
    bed = Bed(self.bedbase)
    pheno_file = self.phen_fn
    covar_file = self.cov_fn
    out_path = self.file_name("none")

    result = single_snp(
        test_snps=bed[:, :10],
        pheno=pheno_file,
        mixing=0,
        leave_out_one_chrom=False,
        K0=KernelIdentity(bed.iid),
        covar=covar_file,
        output_file_name=out_path,
    )
    self.compare_files(result, "none")
def test_mixid(self):
    '''
    Run single_snp with G0 taken from SNPs 10..99, K1 set to an identity
    kernel and mixing=.25, and compare the result with reference "mixid".
    '''
    logging.info("TestSingleSnp test_mixid")
    bed = Bed(self.bedbase)
    pheno_file = self.phen_fn
    covar_file = self.cov_fn
    out_path = self.file_name("mixid")

    result = single_snp(
        test_snps=bed[:, :10],
        pheno=pheno_file,
        G0=bed[:, 10:100],
        leave_out_one_chrom=False,
        covar=covar_file,
        K1=KernelIdentity(bed.iid),
        mixing=.25,
        output_file_name=out_path,
    )
    self.compare_files(result, "mixid")
def k_index_to_nLL_mapper(k):
    '''
    Return the negative log likelihood summed over all folds when the top
    `k` SNPs of each fold are used as the training kernel.

    Relies on names bound in the enclosing scope (test_snps, G, pheno,
    covar, count_A1, n_folds, seed, fold_index_to_top_snps, h2, GB_goal,
    force_full_rank, force_low_rank).
    '''
    _, snps_in, y_in, x_in = _fixup(test_snps, G, pheno, covar,
                                    count_A1=count_A1)

    total_nll = 0
    total_mse = 0
    folds_seen = 0

    folds = _kfold(G.iid_count, n_folds, seed,
                   end_with_all=False,
                   iid_to_index=G.iid_to_index)
    for fold_index, (train_idx, test_idx) in folds:
        assert set(train_idx).isdisjoint(set(test_idx)), "real assert"

        fold_top_snps = fold_index_to_top_snps[fold_index][:k]
        fold_sid_idx = snps_in.sid_to_index(fold_top_snps)

        # With k == 0 there are no SNPs to build a training kernel from.
        if k > 0:
            train_kernel = snps_in[train_idx, fold_sid_idx]
        else:
            train_kernel = None

        lmm = FastLMM(force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      GB_goal=GB_goal)
        lmm.fit(K0_train=train_kernel,
                X=x_in[train_idx, :],
                y=y_in[train_idx, :],
                h2raw=h2)

        if k > 0:
            test_kernel = snps_in[test_idx, fold_sid_idx]
        else:
            #!!! instead of this, which blows up when # of iids is large,
            # should switch to linear regression model when k is 0
            test_kernel = KernelIdentity(snps_in.iid, snps_in.iid[test_idx])

        fold_nll, fold_mse = lmm.score(K0_whole_test=test_kernel,
                                       X=x_in[test_idx, :],
                                       y=y_in[test_idx, :],
                                       return_mse_too=True)
        total_nll += fold_nll
        total_mse += fold_mse
        folds_seen += 1

    logging.info("k={0},nLL={1},average mse={2}".format(
        k, total_nll, total_mse / folds_seen))
    return total_nll
def _kernel_fixup(input, iid_if_none, standardizer, test=None, test_iid_if_none=None, block_size=None, train_snps=None, count_A1=None):
    '''
    Turn the various accepted forms of `input` into a kernel object.

    * a string ending in ".npz" -> KernelNpz(input)
    * any other string          -> opened with Bed, then handled as a SnpReader
    * a SnpReader               -> SnpKernel, or _SnpWholeTest when `test` is given
    * None                      -> KernelIdentity(iid_if_none, test_iid_if_none)
    * anything else             -> returned unchanged

    If `input` is None but `test` is not, `test` takes the place of `input`.
    '''
    # Only test data was supplied: promote it to be the input.
    if input is None and test is not None:
        input, test = test, None

    if isinstance(input, str):
        if input.endswith(".npz"):
            return KernelNpz(input)
        # Note that we don't return here. Processing continues
        input = Bed(input, count_A1=count_A1)
    if isinstance(test, str):
        # Note that we don't return here. Processing continues
        test = Bed(test, count_A1=count_A1)

    if input is None:
        return KernelIdentity(iid=iid_if_none, test=test_iid_if_none)
    if not isinstance(input, SnpReader):
        return input

    if test is None:
        return SnpKernel(input, standardizer=standardizer, block_size=block_size)
    return _SnpWholeTest(train=train_snps,
                         test=test,
                         standardizer=standardizer,
                         block_size=block_size)
def _kernel_fixup(input, iid_if_none, standardizer, test=None, test_iid_if_none=None):
    '''
    Turn the various accepted forms of `input` into a kernel object.

    * a string ending in ".npz" -> KernelNpz(input)
    * any other string          -> opened with Bed, then handled as a SnpReader
    * a SnpReader               -> SnpKernel(input, standardizer, test)
    * None                      -> KernelIdentity(iid_if_none, test_iid_if_none)
    * anything else             -> returned unchanged

    If `input` is None but `test` is not, `test` takes the place of `input`.
    '''
    # Only test data was supplied: promote it to be the input.
    if input is None and test is not None:
        input, test = test, None

    if isinstance(input, str):
        if input.endswith(".npz"):
            return KernelNpz(input)
        input = Bed(input)  # no return here; handled as a SnpReader below
    if isinstance(test, str):
        test = Bed(test)

    if input is None:
        return KernelIdentity(iid=iid_if_none, test=test_iid_if_none)
    if not isinstance(input, SnpReader):
        return input
    return SnpKernel(input, standardizer=standardizer, test=test)
def test_match_cpp(self):
    r'''
    match
        FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

    single_snp is run with the similarity SNPs given once as G0 and once as
    G1 (the other kernel being the identity), and for each of those once
    with h2=.5 and once with log_delta=0. Every resulting p-value must be
    within a relative difference of .035 of the reference file.
    '''
    # The docstring is raw because it contains a Windows path; without the
    # r prefix "\b" and "\f" would become control characters.
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(
        os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath,
                         "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath,
                         "tests/datasets/selecttest/covariate.txt")

    # SNPs used to build the similarity kernel.
    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28",
        "snp63751_m0_.23m1_.23", "snp48753_m0_.4m1_.4",
        "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
        "snp11253_m0_.2m1_.2", "snp86250_m0_.33m1_.33",
        "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19",
        "snp67501_m0_.15m1_.15", "snp63750_m0_.28m1_.28",
        "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37",
        "snp15002_m0_.11m1_.11", "snp3751_m0_.34m1_.34",
        "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11",
        "isnp23145_m0_.2m1_.2", "snp60001_m0_.39m1_.39",
        "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13"
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    # SNPs that are tested.
    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23",
        "snp82500_m0_.28m1_.28", "snp48753_m0_.4m1_.4",
        "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
        "snp86250_m0_.33m1_.33", "snp15002_m0_.11m1_.11",
        "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2",
        "snp67501_m0_.15m1_.15", "snp3753_m0_.23m1_.23",
        "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25"
    ]
    test_idx = snps.sid_to_index(test_sid)

    # The reference does not depend on the loop variables, so read it once.
    # We've manually removed all comments and blank lines from this file.
    referenceOutfile = TestFeatureSelection.reference_file(
        "single_snp/topsnps.single.txt")
    reference = pd.read_table(referenceOutfile, sep="\t")

    kernel_pairs = [
        (snps[:, sim_idx], KernelIdentity(snps.iid)),
        (KernelIdentity(snps.iid), snps[:, sim_idx]),
    ]
    for G0, G1 in kernel_pairs:
        frame_h2 = single_snp(test_snps=snps[:, test_idx],
                              pheno=pheno,
                              G0=G0,
                              G1=G1,
                              covar=covar,
                              h2=.5,
                              leave_out_one_chrom=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx],
                                     pheno=pheno,
                                     G0=G0,
                                     G1=G1,
                                     covar=covar,
                                     log_delta=0,
                                     leave_out_one_chrom=False)

        for frame in [frame_h2, frame_log_delta]:
            assert len(frame) == len(reference)
            for _, row in reference.iterrows():
                sid = row.SNP
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(
                    sid, None, row.Pvalue, pvalue, reldiff)
def test_lr_real(self):
    '''
    Fit LinearRegression on synthetic data (pheno = 2*covar + 100 + noise),
    round-trip the model through joblib, and compare its predictions and
    prediction covariance (on the training iids and on the first 10 iids)
    with the "lr2a*" / "lr2b*" reference files.

    Set do_plot to True to also show diagnostic plots (requires pylab).
    '''
    do_plot = False
    if do_plot:
        # Only needed for the optional plots, so don't require it otherwise.
        import pylab
    logging.info("TestLinRegTrain test_lr_real")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    #make covar just numbers 0,1,...
    covar = self.covariate_whole.read()
    # One float column 0.0, 1.0, ...; np.arange works on Python 2 and 3
    # (the former xrange is Python-2-only).
    covar.val = np.arange(covar.iid_count, dtype=float).reshape(-1, 1)
    covariate_train = covar[train_idx, :].read()
    covariate_test = covar[test_idx, :].read()
    K0_test_test = KernelIdentity(covariate_test.iid)

    #make pheno
    # pheno = 2*covar+100+normal(0,1)*10
    pheno = self.pheno_whole.read()
    np.random.seed(0)  # fixed seed so the noise is reproducible
    pheno.val = covar.val * 2.0 + 100 + np.random.normal(
        size=covar.val.shape) * 10
    pheno_train = pheno[train_idx, :].read()
    pheno_test = pheno[test_idx, :].read()

    if do_plot:
        #Plot training x and y, testing x and y
        pylab.plot(covariate_train.val, pheno_train.val, ".",
                   covariate_test.val, pheno_test.val, ".")
        pylab.suptitle("Plot training x and y, testing x and y")
        pylab.show()

    # Plain least-squares fit with an added column of ones (used for the plots).
    Xtrain = np.c_[covariate_train.val,
                   np.ones((covariate_train.iid_count, 1))]
    Xtest = np.c_[covariate_test.val,
                  np.ones((covariate_test.iid_count, 1))]
    lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
    bs = lsqSol[0]  #weights
    r2 = lsqSol[1]  #squared residuals
    D = lsqSol[2]  #rank of design matrix
    N = pheno_train.iid_count
    REML = False
    if not REML:
        sigma2 = float(r2 / N)
        nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
    else:
        sigma2 = float(r2 / (N - D))
        nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
        nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  #REML term

    predicted = Xtest.dot(bs)
    yerr = [np.sqrt(sigma2)] * len(predicted)
    if do_plot:
        pylab.plot(covariate_test.val, pheno_test.val, "g.",
                   covariate_test.val, predicted, "r.")
        pylab.xlim([-1, 10])
        pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
        pylab.suptitle("real linear regression: actual to prediction")
        pylab.show()

    #These should all give the same result
    first_name = None
    for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:
        # Reference files are named after the first configuration.
        first_name = first_name or name

        #Learn model, save, load
        modelx = LinearRegression().fit(K0_train=K0_train,
                                        X=covariate_train,
                                        y=pheno_train)
        filename = self.tempout_dir + "/model_lr_real.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(modelx, filename)
        model = joblib.load(filename)

        do_test_on_train = True
        if do_test_on_train:
            #Predict with model (test on train)
            # NOTE: from here on `covar` is the covariance returned by
            # predict, no longer the covariate data.
            predicted_pheno, covar = model.predict(
                K0_whole_test=K0_train, X=covariate_train)  #test on train
            output_file = self.file_name("lr_reala_" + name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(
                iid=covar.row, sid=covar.col[:, 1],
                val=covar.val)  #kludge to write kernel to text format
            output_file = self.file_name("lr_reala.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_train.val, pheno_train.val, "g.",
                           covariate_train.val, predicted, "r.")
                pylab.xlim([0, 50])
                pylab.ylim([100, 200])
                pylab.errorbar(covariate_train.val,
                               predicted,
                               yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on train: train X to true target (green) and prediction (red)"
                )
                pylab.show()
            self.compare_files(predicted_pheno, "lr2a_" + first_name)
            self.compare_files(covar2, "lr2a.cov_" + first_name)

        #Predict with model (test on test)
        predicted_pheno, covar = model.predict(
            K0_whole_test=K0_whole_test, X=covariate_test)  #test on test
        output_file = self.file_name("lr_realb_" + name)
        Dat.write(output_file, predicted_pheno)
        covar2 = SnpData(
            iid=covar.row, sid=covar.col[:, 1],
            val=covar.val)  #kludge to write kernel to text format
        output_file = self.file_name("lr_realb.cov_" + name)
        Dat.write(output_file, covar2)

        yerr = np.sqrt(np.diag(covar.val))
        predicted = predicted_pheno.val
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val,
                           predicted,
                           yerr,
                           linestyle='None')
            pylab.suptitle(
                name +
                ": test on test: test X to true target (green) and prediction (red)"
            )
            pylab.show()
        self.compare_files(predicted_pheno, "lr2b_" + first_name)
        self.compare_files(covar2, "lr2b.cov_" + first_name)