def regression(self, snpreader, answers, cov_fn=None, num_pcs=0, strategy="lmm_full_cv", delta=7):
    """Compare feature selection against previously recorded results of this code base.

    answers holds, in order: expected best_k, expected best_delta, the expected
    objective when selecting by log-likelihood, and the expected objective when
    selecting by MSE.  delta is the number of decimal places used when
    comparing best_delta.
    """
    # Grid set-up: five k values in log_2 space plus an "all SNPs" entry,
    # and five delta values in natural-log space.
    grid_steps = 5
    fold_count = 2
    k_grid = np.array(np.logspace(0, 9, base=2, num=grid_steps, endpoint=True), dtype=np.int64).tolist() + [10000]
    delta_grid = np.logspace(-3, 3, endpoint=True, num=grid_steps, base=np.exp(1))
    seed = 42
    prefix = None

    # Pass 1: choose the model by log-likelihood.
    selector = FeatureSelectionStrategy(snpreader, self.pheno_fn, fold_count,
                                        random_state=seed, cov_fn=cov_fn,
                                        num_pcs=num_pcs, interpolate_delta=True)
    best_k, best_delta, best_obj, best_snps = selector.perform_selection(
        k_grid, delta_grid, strategy, output_prefix=prefix, select_by_ll=True)
    self.assertEqual(best_k, answers[0])
    self.assertAlmostEqual(best_delta, answers[1], delta)
    # accept a range of answers for when standardization is done with doubles, floats, etc
    self.assertTrue(abs(best_obj - answers[2]) < .005)

    # Pass 2: choose the model by mean squared error.
    selector = FeatureSelectionStrategy(snpreader, self.pheno_fn, fold_count,
                                        random_state=seed, cov_fn=cov_fn,
                                        num_pcs=num_pcs, interpolate_delta=True)
    best_k, best_delta, best_obj, best_snps = selector.perform_selection(
        k_grid, delta_grid, strategy, output_prefix=prefix, select_by_ll=False)
    self.assertEqual(best_k, answers[0])
    self.assertAlmostEqual(best_delta, answers[1], delta)
    self.assertAlmostEqual(best_obj, answers[3])
def regression(self, snpreader, answers, cov_fn=None, num_pcs=0, strategy="lmm_full_cv", delta=7):
    """Compare feature selection against previously recorded results of this code base.

    answers holds, in order: expected best_k, expected best_delta, the expected
    objective when selecting by log-likelihood, and the expected objective when
    selecting by MSE.  delta is the number of decimal places used when
    comparing best_delta.  Runs with count_A1=False and no PDF output.
    """
    # Grid set-up: five k values in log_2 space plus an "all SNPs" entry,
    # and five delta values in natural-log space.
    grid_steps = 5
    fold_count = 2
    k_grid = np.array(np.logspace(0, 9, base=2, num=grid_steps, endpoint=True), dtype=np.int64).tolist() + [10000]
    delta_grid = np.logspace(-3, 3, endpoint=True, num=grid_steps, base=np.exp(1))
    seed = 42
    prefix = None

    # Pass 1: choose the model by log-likelihood.
    selector = FeatureSelectionStrategy(snpreader, self.pheno_fn, fold_count,
                                        random_state=seed, cov_fn=cov_fn,
                                        num_pcs=num_pcs, interpolate_delta=True,
                                        count_A1=False)
    best_k, best_delta, best_obj, best_snps = selector.perform_selection(
        k_grid, delta_grid, strategy, output_prefix=prefix,
        select_by_ll=True, create_pdf=False)
    self.assertEqual(best_k, answers[0])
    self.assertAlmostEqual(best_delta, answers[1], delta)
    # accept a range of answers for when standardization is done with doubles, floats, etc
    self.assertTrue(abs(best_obj - answers[2]) < .005)

    # Pass 2: choose the model by mean squared error.
    selector = FeatureSelectionStrategy(snpreader, self.pheno_fn, fold_count,
                                        random_state=seed, cov_fn=cov_fn,
                                        num_pcs=num_pcs, interpolate_delta=True,
                                        count_A1=False)
    best_k, best_delta, best_obj, best_snps = selector.perform_selection(
        k_grid, delta_grid, strategy, output_prefix=prefix,
        select_by_ll=False, create_pdf=False)
    self.assertEqual(best_k, answers[0])
    self.assertAlmostEqual(best_delta, answers[1], delta)
    self.assertAlmostEqual(best_obj, answers[3])
def runselect(bed_fn=None, pheno_fn=None, strategy=None, select_by_ll=True, output_prefix=None, num_pcs=0, random_state=3, num_snps_in_memory=1000, cov_fn=None, k_values=None, delta_values=None, num_folds=10, penalty=0.0):
    """Run feature selection over a grid of k and delta values.

    Defaults to the bundled toy data when no bed/pheno files are given, and to
    the 'lmm_full_cv' strategy (selected by log-likelihood) when no strategy
    is given.

    Returns a dict with keys 'best_k', 'best_delta', 'best_obj', 'best_snps'.
    """
    logging.basicConfig(level=logging.INFO)

    # set up data
    ##############################
    if bed_fn is None:
        bed_fn = Bed("examples/toydata")
        pheno_fn = "examples/toydata.phe"

    # set up grid
    ##############################
    num_steps_delta = 10
    # log_2 space and all SNPs
    #k_values = np.logspace(0, 9, base=2, num=num_steps_k, endpoint=True).tolist() + [10000]
    if k_values is None:
        # Last entry is deliberately huge so "all SNPs" is always included.
        k_values = [0, 1, 5, 10, 20, 50, 100, 500, 1000, 2000, 5000, 10000, 456345643256]
    if delta_values is None:
        delta_values = np.logspace(-10, 10, endpoint=True, num=num_steps_delta, base=np.exp(1))
        #delta_values = [np.exp(1), np.exp(2), np.exp(3), np.exp(4), np.exp(5), np.exp(6)]
    if strategy is None:
        strategy = 'lmm_full_cv'
        select_by_ll = True  # full-CV default always selects by log-likelihood

    # where to save output
    ##############################
    if output_prefix is None:
        output_prefix = "example_pc%i" % (num_pcs)

    # go!
    fss = FeatureSelectionStrategy(bed_fn, pheno_fn, num_folds, random_state=random_state,
                                   num_pcs=num_pcs, num_snps_in_memory=num_snps_in_memory,
                                   interpolate_delta=False, cov_fn=cov_fn)
    best_k, best_delta, best_obj, best_snps = fss.perform_selection(
        k_values, delta_values, output_prefix=output_prefix,
        select_by_ll=select_by_ll, strategy=strategy, penalty=penalty)

    res = {
        'best_k': best_k,
        'best_delta': best_delta,
        'best_obj': best_obj,
        'best_snps': best_snps,
    }
    return res
def blocking(self, snpreader, cov_fn=None, num_pcs=0, output_prefix = None, strategy="lmm_full_cv"):
    """Check that blocked computation gives the same answer as unblocked.

    To control memory use, the parameter "num_snps_in_memory" (default 100000)
    may force blocking.  The interesting cases, chosen via num_snps_in_memory:

    1) num_snps_in_memory > total_num_snps
       Same code path as before (except the kernel matrix on all SNPs is
       now cached).

    2) num_snps_in_memory < total_num_snps, num_snps_in_memory > k
       (excluding all_snps)
       Linear regression is blocked, while the cross-validation data is
       cached, saving time for loading and re-indexing.

    3) num_snps_in_memory < total_num_snps, num_snps_in_memory < k
       (excluding all_snps)
       Both linear regression and kernel building are blocked.

    4,5,6) Same as 1,2,3, but with a pheno file that has extra iids and
       shuffled iid order.
    """
    # set up grid
    ##############################
    num_steps_delta = 5
    num_folds = 2
    # log_2 space and all SNPs
    k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
    delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))
    random_state = 42

    # case 1: everything fits in memory (num_snps_in_memory=20000)
    fss_1 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000)
    best_k_1, best_delta_1, best_obj_1, best_snps_1 = fss_1.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    #some misc testing: exercise the distributable wrapper's repr/str/work_count
    import PerformSelectionDistributable as psd
    perform_selection_distributable = psd.PerformSelectionDistributable(fss_1, k_values, delta_values, strategy, output_prefix, select_by_ll=True, penalty=0.0)
    self.assertEqual(perform_selection_distributable.work_count, 3)
    s = perform_selection_distributable.tempdirectory
    s = str(perform_selection_distributable)
    s = "%r" % perform_selection_distributable
    from fastlmm.feature_selection.feature_selection_cv import GClass
    s = "%r" % GClass.factory(snpreader,1000000, Unit(), 50)
    s = s
    #!!making test for each break point.

    # case 2: regression blocked, CV data cached (num_snps_in_memory=5000)
    fss_2 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000)
    best_k_2, best_delta_2, best_obj_2, best_snps_2 = fss_2.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    # case 3: regression and kernel building both blocked (num_snps_in_memory=600)
    fss_3 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600)
    best_k_3, best_delta_3, best_obj_3, best_snps_3 = fss_3.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    # case 4: like case 1, but shuffled pheno with extra iids
    fss_4 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000)
    best_k_4, best_delta_4, best_obj_4, best_snps_4 = fss_4.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    # case 5: like case 2, but shuffled pheno with extra iids
    fss_5 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000)
    best_k_5, best_delta_5, best_obj_5, best_snps_5 = fss_5.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    # case 6: like case 3, but shuffled pheno with extra iids
    fss_6 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600)
    best_k_6, best_delta_6, best_obj_6, best_snps_6 = fss_6.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

    # Blocked and unblocked runs must agree on k and objective value.
    self.assertEqual(int(best_k_1), int(best_k_2))
    self.assertEqual(int(best_k_1), int(best_k_3))
    #self.assertEqual(int(best_k_1), int(best_k_4))
    #self.assertEqual(int(best_k_1), int(best_k_5))
    #self.assertEqual(int(best_k_1), int(best_k_6))
    self.assertAlmostEqual(best_obj_1, best_obj_2)
    self.assertAlmostEqual(best_obj_1, best_obj_3)
    #self.assertAlmostEqual(best_obj_1, best_obj_4)
    self.assertAlmostEqual(best_obj_4, best_obj_5)
    self.assertAlmostEqual(best_obj_4, best_obj_6)
    # insample_cv does not optimize delta the same way, so skip delta checks there.
    if strategy != "insample_cv":
        self.assertAlmostEqual(best_delta_1, best_delta_2)
        self.assertAlmostEqual(best_delta_1, best_delta_3)
        #self.assertAlmostEqual(best_delta_1, best_delta_4)
        self.assertAlmostEqual(best_delta_4, best_delta_5)
        self.assertAlmostEqual(best_delta_4, best_delta_6)
def blocking(self, snpreader, cov_fn=None, num_pcs=0, output_prefix=None, strategy="lmm_full_cv"):
    """Verify blocked and unblocked feature selection agree.

    num_snps_in_memory (default 100000) controls blocking; the cases are:

    1) num_snps_in_memory > total_num_snps — unblocked path (kernel matrix on
       all SNPs cached).
    2) num_snps_in_memory < total_num_snps but > k (excluding all_snps) —
       linear regression blocked, cross-validation data cached.
    3) num_snps_in_memory < total_num_snps and < k (excluding all_snps) —
       both linear regression and kernel building blocked.
    4,5,6) Same as 1,2,3 with a pheno file that has extra iids and a
       shuffled iid order.
    """
    # Grid: fixed k list spanning all blocking break points, and five
    # delta values in natural-log space.
    delta_steps = 5
    fold_count = 2
    k_grid = [0, 1, 5, 10, 100, 500, 700, 10000]
    delta_grid = np.logspace(-3, 3, endpoint=True, num=delta_steps, base=np.exp(1))
    seed = 42

    def run_case(pheno, snps_in_memory):
        # One selection run with the given pheno file and memory budget;
        # returns the strategy object and the 4-tuple from perform_selection.
        fss = FeatureSelectionStrategy(snpreader, pheno, fold_count, cov_fn=cov_fn,
                                       random_state=seed, num_pcs=num_pcs,
                                       interpolate_delta=True,
                                       num_snps_in_memory=snps_in_memory)
        outcome = fss.perform_selection(k_grid, delta_grid,
                                        output_prefix=output_prefix,
                                        select_by_ll=True, strategy=strategy)
        return fss, outcome

    # case 1
    fss_1, (best_k_1, best_delta_1, best_obj_1, best_snps_1) = run_case(self.pheno_fn, 20000)

    #some misc testing
    import PerformSelectionDistributable as psd
    distributable = psd.PerformSelectionDistributable(fss_1, k_grid, delta_grid, strategy,
                                                      output_prefix, select_by_ll=True,
                                                      penalty=0.0)
    self.assertEqual(distributable.work_count, 3)
    s = distributable.tempdirectory
    s = str(distributable)
    s = "%r" % distributable
    from fastlmm.feature_selection.feature_selection_cv import GClass
    s = "%r" % GClass.factory(snpreader, 1000000, Unit(), 50)
    s = s
    #!!making test for each break point.

    # cases 2-3: same pheno, shrinking memory budgets
    _, (best_k_2, best_delta_2, best_obj_2, best_snps_2) = run_case(self.pheno_fn, 5000)
    _, (best_k_3, best_delta_3, best_obj_3, best_snps_3) = run_case(self.pheno_fn, 600)

    # cases 4-6: shuffled pheno with extra iids, same memory budgets
    _, (best_k_4, best_delta_4, best_obj_4, best_snps_4) = run_case(self.pheno_shuffleplus_fn, 20000)
    _, (best_k_5, best_delta_5, best_obj_5, best_snps_5) = run_case(self.pheno_shuffleplus_fn, 5000)
    _, (best_k_6, best_delta_6, best_obj_6, best_snps_6) = run_case(self.pheno_shuffleplus_fn, 600)

    # All blocking modes must agree on the chosen k and objective.
    self.assertEqual(int(best_k_1), int(best_k_2))
    self.assertEqual(int(best_k_1), int(best_k_3))
    #self.assertEqual(int(best_k_1), int(best_k_4))
    #self.assertEqual(int(best_k_1), int(best_k_5))
    #self.assertEqual(int(best_k_1), int(best_k_6))
    self.assertAlmostEqual(best_obj_1, best_obj_2)
    self.assertAlmostEqual(best_obj_1, best_obj_3)
    #self.assertAlmostEqual(best_obj_1, best_obj_4)
    self.assertAlmostEqual(best_obj_4, best_obj_5)
    self.assertAlmostEqual(best_obj_4, best_obj_6)
    if strategy != "insample_cv":
        self.assertAlmostEqual(best_delta_1, best_delta_2)
        self.assertAlmostEqual(best_delta_1, best_delta_3)
        #self.assertAlmostEqual(best_delta_1, best_delta_4)
        self.assertAlmostEqual(best_delta_4, best_delta_5)
        self.assertAlmostEqual(best_delta_4, best_delta_6)
def runselect(bed_fn=None, pheno_fn=None, strategy=None, select_by_ll=True, output_prefix=None, num_pcs=0, random_state=3, num_snps_in_memory=1000, cov_fn=None, k_values=None, delta_values=None, num_folds=10, penalty=0.0):
    """Run feature selection over a grid of k and delta values.

    Defaults to the bundled toy data when no bed/pheno files are given, and to
    the 'lmm_full_cv' strategy (selected by log-likelihood) when no strategy
    is given.

    Returns a dict with keys 'best_k', 'best_delta', 'best_obj', 'best_snps'.
    """
    logging.basicConfig(level=logging.INFO)

    # set up data
    ##############################
    if bed_fn is None:
        bed_fn = Bed("examples/toydata")
        pheno_fn = "examples/toydata.phe"

    # set up grid
    ##############################
    num_steps_delta = 10
    # log_2 space and all SNPs
    #k_values = np.logspace(0, 9, base=2, num=num_steps_k, endpoint=True).tolist() + [10000]
    if k_values is None:
        # Last entry is deliberately huge so "all SNPs" is always included.
        k_values = [0, 1, 5, 10, 20, 50, 100, 500, 1000, 2000, 5000, 10000, 456345643256]
    if delta_values is None:
        delta_values = np.logspace(-10, 10, endpoint=True, num=num_steps_delta, base=np.exp(1))
        #delta_values = [np.exp(1), np.exp(2), np.exp(3), np.exp(4), np.exp(5), np.exp(6)]
    if strategy is None:
        strategy = 'lmm_full_cv'
        select_by_ll = True  # full-CV default always selects by log-likelihood

    # where to save output
    ##############################
    if output_prefix is None:
        output_prefix = "example_pc%i" % (num_pcs)

    # go!
    fss = FeatureSelectionStrategy(bed_fn, pheno_fn, num_folds, random_state=random_state,
                                   num_pcs=num_pcs, num_snps_in_memory=num_snps_in_memory,
                                   interpolate_delta=False, cov_fn=cov_fn)
    best_k, best_delta, best_obj, best_snps = fss.perform_selection(
        k_values, delta_values, output_prefix=output_prefix,
        select_by_ll=select_by_ll, strategy=strategy, penalty=penalty)

    res = {
        'best_k': best_k,
        'best_delta': best_delta,
        'best_obj': best_obj,
        'best_snps': best_snps,
    }
    return res