def execute_dual_fs(test_snps, pheno, G0, covar):
    """Dual-kernel feature selection followed by a two-kernel association scan.

    Selects a foreground SNP set conditioned on the full background kernel,
    then runs ``single_snp`` with both kernels mixed at the learned weight.

    Parameters
    ----------
    test_snps : SNP reader for the SNPs to be tested.
    pheno : dict-like with a "vals" entry holding the phenotype values.
    G0 : SNP reader used to build the background kernel.
    covar : dict-like with a "vals" entry holding the covariates.

    Returns
    -------
    (result, fs_result) : dict of p-value arrays keyed by strategy name, and
        the raw feature-selection output keyed by selection mode.
    """
    result = {}
    fs_result = {}

    # extract training data; the standardized test matrix was previously read
    # here too but never used, so that wasted full read has been dropped
    G_train_unnorm = G0.read().standardize().val

    # feature selection conditioned on the full (background) kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(
        G_train_unnorm, G_train_unnorm, pheno["vals"], cov=covar["vals"])
    best_k, fs_idx, best_mix, best_delta = fs_result["insample_cond_full"]
    # Python 3 print function (the old print statement is a syntax error on py3)
    print("best_k:", best_k, ", best_mix:", best_mix)

    # set up foreground kernel from the selected SNP subset
    G1 = G0[:, fs_idx]

    # sort_values/.values replace DataFrame.sort/.as_matrix, both removed in
    # modern pandas; behavior (sorted p-value ndarray) is unchanged
    result["full_fs_low"] = single_snp(
        test_snps, pheno, G0=G0, covar=covar, G1=G1, mixing=best_mix
    ).sort_values(["Chr", "ChrPos"])["PValue"].values

    return result, fs_result
def execute_dual_fs(test_snps, pheno, G0, covar):
    """Dual-kernel feature selection followed by a two-kernel association scan.

    Selects a foreground SNP set conditioned on the full background kernel,
    then runs ``single_snp`` with both kernels mixed at the learned weight.

    Parameters
    ----------
    test_snps : SNP reader for the SNPs to be tested.
    pheno : dict-like with a "vals" entry holding the phenotype values.
    G0 : SNP reader used to build the background kernel.
    covar : dict-like with a "vals" entry holding the covariates.

    Returns
    -------
    (result, fs_result) : dict of p-value arrays keyed by strategy name, and
        the raw feature-selection output keyed by selection mode.
    """
    result = {}
    fs_result = {}

    # extract training data; the standardized test matrix was previously read
    # here too but never used, so that wasted full read has been dropped
    G_train_unnorm = G0.read().standardize().val

    # feature selection conditioned on the full (background) kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(
        G_train_unnorm, G_train_unnorm, pheno["vals"], cov=covar["vals"])
    best_k, fs_idx, best_mix, best_delta = fs_result["insample_cond_full"]
    print("best_k:", best_k, ", best_mix:", best_mix)

    # set up foreground kernel from the selected SNP subset
    G1 = G0[:, fs_idx]

    # sort_values/.values replace DataFrame.sort/.as_matrix, both removed in
    # modern pandas; behavior (sorted p-value ndarray) is unchanged
    result["full_fs_low"] = single_snp(
        test_snps, pheno, G0=G0, covar=covar, G1=G1, mixing=best_mix
    ).sort_values(["Chr", "ChrPos"])["PValue"].values

    return result, fs_result
def test_regression_lr(self):
    """Regression test: in-sample feature selection with LR ordering.

    Pins the selected k, mixing weight and delta against known-good values
    for a fixed random seed. Fixes the Python 2 ``print`` statements (a
    syntax error on Python 3) to ``print()`` calls, matching the py3 sibling.
    """
    # invoke feature selection (order_by_lmm=False => linear-regression ordering)
    select = FeatureSelectionInSample(n_folds=2, max_log_k=6,
                                      order_by_lmm=False, measure="mse",
                                      random_state=42)
    best_k, feat_idx, best_mix, best_delta = select.run_select(
        self.G, self.G, self.y, cov=self.G_cov)

    # print results
    print("best_k:", best_k)
    print("best_mix:", best_mix)
    print("best_delta:", best_delta)

    # expected values pinned from a previous known-good run
    self.assertEqual(best_k, 32)
    self.assertAlmostEqual(best_mix, 0.6786566031577429, places=6)
    self.assertAlmostEqual(best_delta, 0.70148446599200931, places=6)
def test_regression_lr(self):
    """Regression test: in-sample feature selection with LR ordering.

    Checks that, for a fixed seed, the selector reproduces the pinned
    k / mixing-weight / delta values.
    """
    # order_by_lmm=False => candidate SNPs ranked by plain linear regression
    selector = FeatureSelectionInSample(n_folds=2, max_log_k=6,
                                        order_by_lmm=False, measure="mse",
                                        random_state=42)
    k, idx, mix, delta = selector.run_select(self.G, self.G, self.y,
                                             cov=self.G_cov)

    # report the learned hyper-parameters
    for label, value in (("best_k:", k),
                         ("best_mix:", mix),
                         ("best_delta:", delta)):
        print(label, value)

    # expected values pinned from a previous known-good run
    self.assertEqual(k, 32)
    self.assertAlmostEqual(mix, 0.6786566031577429, places=6)
    self.assertAlmostEqual(delta, 0.70148446599200931, places=6)
def test_regression_lmm(self):
    """Regression test: in-sample feature selection with LMM ordering.

    Pins the selected k, mixing weight and delta against known-good values
    for a fixed random seed. Fixes the Python 2 ``print`` statements (a
    syntax error on Python 3) to ``print()`` calls, matching the py3 sibling.
    """
    # invoke feature selection (order_by_lmm=True => LMM-based ordering)
    select = FeatureSelectionInSample(n_folds=2, max_log_k=6,
                                      order_by_lmm=True, measure="mse",
                                      random_state=42)
    best_k, feat_idx, best_mix, best_delta = select.run_select(
        self.G, self.G, self.y, cov=self.G_cov)

    # print results
    print("best_k:", best_k)
    print("best_mix:", best_mix)
    print("best_delta:", best_delta)

    # expected values pinned from a previous known-good run
    self.assertEqual(best_k, 64)
    self.assertAlmostEqual(best_mix, 0.8621642030968627, places=6)
    self.assertAlmostEqual(best_delta, 0.7255878551207211, places=6)
def test_regression_lmm(self):
    """Regression test: in-sample feature selection with LMM ordering.

    Checks that, for a fixed seed, the selector reproduces the pinned
    k / mixing-weight / delta values.
    """
    # order_by_lmm=True => candidate SNPs ranked by a linear mixed model
    selector = FeatureSelectionInSample(n_folds=2, max_log_k=6,
                                        order_by_lmm=True, measure="mse",
                                        random_state=42)
    k, idx, mix, delta = selector.run_select(self.G, self.G, self.y,
                                             cov=self.G_cov)

    # report the learned hyper-parameters
    for label, value in (("best_k:", k),
                         ("best_mix:", mix),
                         ("best_delta:", delta)):
        print(label, value)

    # expected values pinned from a previous known-good run
    self.assertEqual(k, 64)
    self.assertAlmostEqual(mix, 0.8621642030968627, places=6)
    self.assertAlmostEqual(delta, 0.7255878551207211, places=6)
def test_old(self):
    """End-to-end check of the two-kernel pipeline on the synthetic data set.

    Builds background/foreground kernels from all-but-chr5 SNPs, learns the
    feature subset and mixing weight, runs ``single_snp`` on chr5, and
    compares the result table against the stored "old" reference file.
    """
    plot_enabled = False
    from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
    from pysnptools.util import intersect_apply

    logging.info("TestSingleSnpAllPlusSelect test_old")

    # paths to the synthetic test fixtures
    bed_path = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_path = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_path = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # load readers and align them on common sample ids
    snps = Bed(bed_path, count_A1=False)
    pheno_reader = Pheno(pheno_path)
    cov_reader = Pheno(cov_path)
    snps, pheno_reader, cov_reader = intersect_apply(
        [snps, pheno_reader, cov_reader])

    # split SNPs: chr5 is held out for testing, everything else trains
    held_out_chr = 5
    G0 = snps[:, snps.pos[:, 0] != held_out_chr].read(order='C').standardize()
    test_snps = snps[:, snps.pos[:, 0] == held_out_chr].read(order='C').standardize()

    # standardize the phenotype to zero mean, unit variance
    y = pheno_reader.read().val[:, 0]
    y -= y.mean()
    y /= y.std()

    # covariates, frozen so downstream code cannot mutate them
    X_cov = cov_reader.read().val
    X_cov.flags.writeable = False

    # learn which SNPs to place in the foreground kernel
    logging.info("running feature selection conditioned on background kernel")
    select = FeatureSelectionInSample(max_log_k=7, n_folds=7,
                                      order_by_lmm=True, measure="ll",
                                      random_state=42)
    best_k, feat_idx, best_mix, best_delta = select.run_select(
        G0.val, G0.val, y, cov=X_cov)

    if plot_enabled:
        select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

    logging.info("best_k:{0}".format(best_k))
    logging.info("best_mix:{0}".format(best_mix))
    logging.info("best_delta:{0}".format(best_delta))

    # foreground kernel built from the selected SNP subset
    logging.info(feat_idx)
    G1 = G0[:, feat_idx]

    output_file_name = self.file_name("old")
    results_df = single_snp(test_snps, pheno_reader, G0=G0, G1=G1,
                            mixing=best_mix, h2=None,
                            leave_out_one_chrom=False,
                            output_file_name=output_file_name,
                            count_A1=False)

    logging.info("results:")
    logging.info("#" * 40)
    logging.info(results_df.head())
    self.compare_files(results_df, "old")
def test_old(self):
    """End-to-end check of the two-kernel pipeline on the synthetic data set.

    Builds background/foreground kernels from all-but-chr5 SNPs, learns the
    feature subset and mixing weight, runs ``single_snp`` on chr5, and
    compares the result table against the stored "old" reference file.
    """
    plot_enabled = False
    from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
    from pysnptools.util import intersect_apply

    logging.info("TestSingleSnpAllPlusSelect test_old")

    # paths to the synthetic test fixtures
    bed_path = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_path = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_path = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # load readers and align them on common sample ids
    snps = Bed(bed_path)
    pheno_reader = Pheno(pheno_path)
    cov_reader = Pheno(cov_path)
    snps, pheno_reader, cov_reader = intersect_apply(
        [snps, pheno_reader, cov_reader])

    # split SNPs: chr5 is held out for testing, everything else trains
    held_out_chr = 5
    G0 = snps[:, snps.pos[:, 0] != held_out_chr].read(order='C').standardize()
    test_snps = snps[:, snps.pos[:, 0] == held_out_chr].read(order='C').standardize()

    # standardize the phenotype to zero mean, unit variance
    y = pheno_reader.read().val[:, 0]
    y -= y.mean()
    y /= y.std()

    # covariates, frozen so downstream code cannot mutate them
    X_cov = cov_reader.read().val
    X_cov.flags.writeable = False

    # learn which SNPs to place in the foreground kernel
    logging.info("running feature selection conditioned on background kernel")
    select = FeatureSelectionInSample(max_log_k=7, n_folds=7,
                                      order_by_lmm=True, measure="ll",
                                      random_state=42)
    best_k, feat_idx, best_mix, best_delta = select.run_select(
        G0.val, G0.val, y, cov=X_cov)

    if plot_enabled:
        select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

    logging.info("best_k:{0}".format(best_k))
    logging.info("best_mix:{0}".format(best_mix))
    logging.info("best_delta:{0}".format(best_delta))

    # foreground kernel built from the selected SNP subset
    logging.info(feat_idx)
    G1 = G0[:, feat_idx]

    output_file_name = self.file_name("old")
    results_df = single_snp(test_snps, pheno_reader, G0=G0, G1=G1,
                            mixing=best_mix, h2=None,
                            leave_out_one_chrom=False,
                            output_file_name=output_file_name)

    logging.info("results:")
    logging.info("#" * 40)
    logging.info(results_df.head())
    self.compare_files(results_df, "old")