# Beispiel #1 — example separator from the original scrape ("Beispiel" is German for "example"; score: 0)
    def regression(self, snpreader, answers, cov_fn=None, num_pcs=0, strategy = "lmm_full_cv", delta=7):
        """
        Regression test: run feature selection twice (selecting once by
        log-likelihood and once by MSE) and compare the results against
        previously recorded answers from this code base.
        """

        # grid setup
        ##############################
        num_steps_k = 5
        num_steps_delta = 5
        num_folds = 2
        random_state = 42
        output_prefix = None

        # k grid: log_2-spaced values plus a large sentinel covering "all SNPs"
        k_values = [int(v) for v in np.logspace(0, 9, base=2, num=num_steps_k, endpoint=True)] + [10000]
        # delta grid: natural-log-spaced around 1.0
        delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))

        # run both selection criteria: log-likelihood first, then MSE
        for select_by_ll in (True, False):
            fss = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, random_state=random_state, cov_fn=cov_fn, num_pcs=num_pcs, interpolate_delta=True)
            best_k, best_delta, best_obj, best_snps = fss.perform_selection(k_values, delta_values, strategy, output_prefix=output_prefix, select_by_ll=select_by_ll)

            self.assertEqual(best_k, answers[0])
            self.assertAlmostEqual(best_delta, answers[1], delta)
            if select_by_ll:
                # accept a small range: standardization with doubles vs. floats shifts the objective slightly
                self.assertTrue(abs(best_obj - answers[2]) < .005)
            else:
                self.assertAlmostEqual(best_obj, answers[3])
def runselect(bed_fn=None, pheno_fn=None, strategy=None, select_by_ll=True, output_prefix=None,num_pcs=0, random_state=3, num_snps_in_memory=1000, cov_fn=None, k_values=None, delta_values=None,num_folds=10,penalty=0.0):
    """
    Run SNP feature selection over a grid of k (number of selected SNPs)
    and delta (regularization) values and return the best configuration.

    Parameters
    ----------
    bed_fn : SNP reader or file name, optional
        SNP data. When None, the bundled toy data set is used and
        pheno_fn is forced to the toy phenotype file.
    pheno_fn : str, optional
        Phenotype file name (overwritten when bed_fn is None).
    strategy : str, optional
        Selection strategy; defaults to 'lmm_full_cv', which also forces
        select_by_ll=True.
    select_by_ll : bool
        Select the best model by log-likelihood (True) or by MSE (False).
    output_prefix : str, optional
        Prefix for result files; defaults to "example_pc<num_pcs>".
    num_pcs, random_state, num_snps_in_memory, cov_fn, num_folds, penalty
        Passed through to FeatureSelectionStrategy / perform_selection.
    k_values, delta_values : sequence, optional
        Search grids; sensible defaults are built when omitted.

    Returns
    -------
    dict
        Keys 'best_k', 'best_delta', 'best_obj', 'best_snps'.
    """
    logging.basicConfig(level=logging.INFO)

    # set up data
    ##############################
    # NOTE: this intentionally overwrites pheno_fn too, so the toy SNP data
    # is always paired with the matching toy phenotype file.
    if bed_fn is None:
        bed_fn = Bed("examples/toydata")
        pheno_fn = "examples/toydata.phe"

    # set up grid
    ##############################
    num_steps_delta = 10

    if k_values is None:
        # roughly log-spaced k values plus a huge sentinel meaning "all SNPs"
        k_values = [0, 1, 5, 10, 20, 50, 100, 500, 1000, 2000, 5000, 10000, 456345643256]
    if delta_values is None:
        delta_values = np.logspace(-10, 10, endpoint=True, num=num_steps_delta, base=np.exp(1))

    if strategy is None:
        strategy = 'lmm_full_cv'
        select_by_ll = True

    # where to save output
    ##############################
    if output_prefix is None:
        output_prefix = "example_pc%i" % (num_pcs)

    # go!
    fss = FeatureSelectionStrategy(bed_fn, pheno_fn, num_folds, random_state=random_state, num_pcs=num_pcs, num_snps_in_memory=num_snps_in_memory, interpolate_delta=False, cov_fn=cov_fn)

    best_k, best_delta, best_obj, best_snps = fss.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=select_by_ll, strategy=strategy, penalty=penalty)
    res = {
           'best_k': best_k,
           'best_delta': best_delta,
           'best_obj': best_obj,
           'best_snps': best_snps
           }
    return res
# Beispiel #3 — example separator from the original scrape (score: 0)
def create_feature_selection_distributable(snp_reader,
                                           phen_fn,
                                           pc_fn,
                                           num_pcs_kernel,
                                           output_prefix,
                                           cov_fn=None,
                                           include_all=True):
    """
    Build a distributable feature-selection job over a fixed k/delta grid.

    Parameters
    ----------
    snp_reader : SNP reader
        Source of SNP data; its sid_count is appended to the k grid when
        include_all is True.
    phen_fn : str
        Phenotype file name.
    pc_fn : str or None
        Principal-components file; when None, num_pcs_kernel must be 0.
    num_pcs_kernel : int
        Number of PCs used in the kernel.
    output_prefix : str
        Prefix for result files.
    cov_fn : str, optional
        Covariates file name.
    include_all : bool
        Also evaluate k == total number of SNPs.

    Returns
    -------
    PerformSelectionDistributable
        Job object ready to be run by a distributable runner.
    """
    from fastlmm.feature_selection import FeatureSelectionStrategy
    import fastlmm.feature_selection.PerformSelectionDistributable as psd

    # set up parameters
    num_folds = 10
    random_state = 42
    num_snps_in_memory = 1000000

    ##############################
    num_steps_delta = 7
    num_steps_k = 7

    # k grid: log_2-spaced; optionally also evaluate all SNPs
    k_values = [
        int(k)
        for k in np.logspace(0, 10, base=2, num=num_steps_k, endpoint=True)
    ]
    if include_all:
        k_values.append(snp_reader.sid_count)
    delta_values = np.logspace(-5,
                               10,
                               endpoint=True,
                               num=num_steps_delta,
                               base=np.exp(1))

    if pc_fn is None:
        # without a PC file there is nothing to mix, so a PC kernel is invalid
        assert num_pcs_kernel == 0
        logging.info(
            "feature selection: no PCs specified, disabling loop over mixing parameter"
        )

    strategy = "insample_cv"
    select_by_ll = True

    # go!
    feature_selector = FeatureSelectionStrategy(
        snp_reader,
        phen_fn,
        num_folds,
        random_state=random_state,
        num_snps_in_memory=num_snps_in_memory,
        interpolate_delta=False,
        cov_fn=cov_fn)
    perform_selection_distributable = psd.PerformSelectionDistributable(
        feature_selector, k_values, delta_values, strategy, output_prefix,
        select_by_ll)

    return perform_selection_distributable
# Beispiel #4 — example separator from the original scrape (score: 0)
    def regression(self, snpreader, answers, cov_fn=None, num_pcs=0, strategy = "lmm_full_cv", delta=7):
        """
        Regression test: run feature selection twice (selecting once by
        log-likelihood and once by MSE) and compare the results against
        previously recorded answers from this code base.
        """

        # grid setup
        ##############################
        num_steps_k = 5
        num_steps_delta = 5
        num_folds = 2
        random_state = 42
        output_prefix = None

        # k grid: log_2-spaced values plus a large sentinel covering "all SNPs"
        k_values = [int(v) for v in np.logspace(0, 9, base=2, num=num_steps_k, endpoint=True)] + [10000]
        # delta grid: natural-log-spaced around 1.0
        delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))

        # run both selection criteria: log-likelihood first, then MSE
        for select_by_ll in (True, False):
            fss = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, random_state=random_state, cov_fn=cov_fn, num_pcs=num_pcs, interpolate_delta=True, count_A1=False)
            best_k, best_delta, best_obj, best_snps = fss.perform_selection(k_values, delta_values, strategy, output_prefix=output_prefix, select_by_ll=select_by_ll, create_pdf=False)

            self.assertEqual(best_k, answers[0])
            self.assertAlmostEqual(best_delta, answers[1], delta)
            if select_by_ll:
                # accept a small range: standardization with doubles vs. floats shifts the objective slightly
                self.assertTrue(abs(best_obj - answers[2]) < .005)
            else:
                self.assertAlmostEqual(best_obj, answers[3])
# Beispiel #5 — example separator from the original scrape (score: 0)
    def blocking(self, snpreader, cov_fn=None, num_pcs=0, output_prefix = None, strategy="lmm_full_cv"):
        """
        Check that blocked computation gives the same results as unblocked.

        compare three different cases:

        To control memory use, we've introduced a parameter called "num_snps_in_memory", which defaults to 100000. 
        Here are the interesting cases to consider (and choose num_snps_in_memory accordingly):

        1) num_snps_in_memory > total_num_snps

           In this case, the same code as before should be 
           executed (except the kernel matrix on all SNPs is now cached). 


        2) num_snps_in_memory < total_num_snps
            num_snps_in_memory > k (excluding all_snps)

            Here, the linear regression will be blocked, 
            while the data for cross-validation is cached, 
            saving time for loading and re-indexing.


        3) num_snps_in_memory < total_num_snps
            num_snps_in_memory < k (excluding all_snps)

            Finally, both operations - linear regression 
            and building the kernel will be blocked.

        4,5,6) Same as #1,2,3, but with a phenos that has extra iids and for which the iids are shuffled.


        """

        # set up grid
        ##############################
        num_steps_delta = 5
        num_folds = 2

        # log_2 space and all SNPs
        # 10000 presumably exceeds the SNP count of the test data, i.e. "all SNPs" — TODO confirm
        k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
        delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))

        random_state = 42



        # case 1: num_snps_in_memory=20000 — intended to exceed total_num_snps (no blocking)
        fss_1 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000)
        best_k_1, best_delta_1, best_obj_1, best_snps_1 = fss_1.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        #some misc testing
        # NOTE(review): Python-2-style implicit relative import; under Python 3 this
        # would need "from fastlmm.feature_selection import PerformSelectionDistributable as psd"
        import PerformSelectionDistributable as psd
        perform_selection_distributable = psd.PerformSelectionDistributable(fss_1, k_values, delta_values, strategy, output_prefix, select_by_ll=True, penalty=0.0)
        # presumably num_folds (2) work items plus one aggregation step — TODO confirm
        self.assertEqual(perform_selection_distributable.work_count, 3)
        # smoke-test the distributable's accessors and string representations
        s = perform_selection_distributable.tempdirectory
        s = str(perform_selection_distributable)
        s = "%r" % perform_selection_distributable
        from fastlmm.feature_selection.feature_selection_cv import GClass
        s = "%r" % GClass.factory(snpreader,1000000, Unit(), 50)
        s = s  # no-op; keeps `s` "used" so linters stay quiet
        #!!making  test for each break point.


        # case 2: num_snps_in_memory=5000 — regression blocked, CV data cached
        fss_2 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000)
        best_k_2, best_delta_2, best_obj_2, best_snps_2 = fss_2.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        # case 3: num_snps_in_memory=600 — both regression and kernel building blocked
        fss_3 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600)
        best_k_3, best_delta_3, best_obj_3, best_snps_3 = fss_3.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        # case 4: like case 1, but with the shuffled/extra-iid phenotype file
        fss_4 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000)
        best_k_4, best_delta_4, best_obj_4, best_snps_4 = fss_4.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        # case 5: like case 2, but with the shuffled/extra-iid phenotype file
        fss_5 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000)
        best_k_5, best_delta_5, best_obj_5, best_snps_5 = fss_5.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        # case 6: like case 3, but with the shuffled/extra-iid phenotype file
        fss_6 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600)
        best_k_6, best_delta_6, best_obj_6, best_snps_6 = fss_6.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy)

        # blocking must not change which k is selected (cases 1-3 share a phenotype)
        self.assertEqual(int(best_k_1), int(best_k_2))
        self.assertEqual(int(best_k_1), int(best_k_3))
        #self.assertEqual(int(best_k_1), int(best_k_4))
        #self.assertEqual(int(best_k_1), int(best_k_5))
        #self.assertEqual(int(best_k_1), int(best_k_6))
        # ... nor the objective value, within each phenotype group
        self.assertAlmostEqual(best_obj_1, best_obj_2)
        self.assertAlmostEqual(best_obj_1, best_obj_3)
        #self.assertAlmostEqual(best_obj_1, best_obj_4)
        self.assertAlmostEqual(best_obj_4, best_obj_5)
        self.assertAlmostEqual(best_obj_4, best_obj_6)

        # delta comparison skipped for insample_cv (presumably delta is not
        # meaningfully selected by that strategy — TODO confirm)
        if strategy != "insample_cv":
            self.assertAlmostEqual(best_delta_1, best_delta_2)
            self.assertAlmostEqual(best_delta_1, best_delta_3)
            #self.assertAlmostEqual(best_delta_1, best_delta_4)
            self.assertAlmostEqual(best_delta_4, best_delta_5)
            self.assertAlmostEqual(best_delta_4, best_delta_6)
# Beispiel #6 — example separator from the original scrape (score: 0)
    def blocking(self,
                 snpreader,
                 cov_fn=None,
                 num_pcs=0,
                 output_prefix=None,
                 strategy="lmm_full_cv"):
        """
        Check that blocked computation gives the same results as unblocked.

        compare three different cases:

        To control memory use, we've introduced a parameter called "num_snps_in_memory", which defaults to 100000. 
        Here are the interesting cases to consider (and choose num_snps_in_memory accordingly):

        1) num_snps_in_memory > total_num_snps

           In this case, the same code as before should be 
           executed (except the kernel matrix on all SNPs is now cached). 


        2) num_snps_in_memory < total_num_snps
            num_snps_in_memory > k (excluding all_snps)

            Here, the linear regression will be blocked, 
            while the data for cross-validation is cached, 
            saving time for loading and re-indexing.


        3) num_snps_in_memory < total_num_snps
            num_snps_in_memory < k (excluding all_snps)

            Finally, both operations - linear regression 
            and building the kernel will be blocked.

        4,5,6) Same as #1,2,3, but with a phenos that has extra iids and for which the iids are shuffled.


        """

        # set up grid
        ##############################
        num_steps_delta = 5
        num_folds = 2

        # log_2 space and all SNPs
        # 10000 presumably exceeds the SNP count of the test data, i.e. "all SNPs" — TODO confirm
        k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
        delta_values = np.logspace(-3,
                                   3,
                                   endpoint=True,
                                   num=num_steps_delta,
                                   base=np.exp(1))

        random_state = 42

        # case 1: num_snps_in_memory=20000 — intended to exceed total_num_snps (no blocking)
        fss_1 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=20000)
        best_k_1, best_delta_1, best_obj_1, best_snps_1 = fss_1.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        #some misc testing
        # NOTE(review): Python-2-style implicit relative import; under Python 3 this
        # would need "from fastlmm.feature_selection import PerformSelectionDistributable as psd"
        import PerformSelectionDistributable as psd
        perform_selection_distributable = psd.PerformSelectionDistributable(
            fss_1,
            k_values,
            delta_values,
            strategy,
            output_prefix,
            select_by_ll=True,
            penalty=0.0)
        # presumably num_folds (2) work items plus one aggregation step — TODO confirm
        self.assertEqual(perform_selection_distributable.work_count, 3)
        # smoke-test the distributable's accessors and string representations
        s = perform_selection_distributable.tempdirectory
        s = str(perform_selection_distributable)
        s = "%r" % perform_selection_distributable
        from fastlmm.feature_selection.feature_selection_cv import GClass
        s = "%r" % GClass.factory(snpreader, 1000000, Unit(), 50)
        s = s  # no-op; keeps `s` "used" so linters stay quiet
        #!!making  test for each break point.

        # case 2: num_snps_in_memory=5000 — regression blocked, CV data cached
        fss_2 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=5000)
        best_k_2, best_delta_2, best_obj_2, best_snps_2 = fss_2.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 3: num_snps_in_memory=600 — both regression and kernel building blocked
        fss_3 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=600)
        best_k_3, best_delta_3, best_obj_3, best_snps_3 = fss_3.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 4: like case 1, but with the shuffled/extra-iid phenotype file
        fss_4 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=20000)
        best_k_4, best_delta_4, best_obj_4, best_snps_4 = fss_4.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 5: like case 2, but with the shuffled/extra-iid phenotype file
        fss_5 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=5000)
        best_k_5, best_delta_5, best_obj_5, best_snps_5 = fss_5.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 6: like case 3, but with the shuffled/extra-iid phenotype file
        fss_6 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=600)
        best_k_6, best_delta_6, best_obj_6, best_snps_6 = fss_6.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # blocking must not change which k is selected (cases 1-3 share a phenotype)
        self.assertEqual(int(best_k_1), int(best_k_2))
        self.assertEqual(int(best_k_1), int(best_k_3))
        #self.assertEqual(int(best_k_1), int(best_k_4))
        #self.assertEqual(int(best_k_1), int(best_k_5))
        #self.assertEqual(int(best_k_1), int(best_k_6))
        # ... nor the objective value, within each phenotype group
        self.assertAlmostEqual(best_obj_1, best_obj_2)
        self.assertAlmostEqual(best_obj_1, best_obj_3)
        #self.assertAlmostEqual(best_obj_1, best_obj_4)
        self.assertAlmostEqual(best_obj_4, best_obj_5)
        self.assertAlmostEqual(best_obj_4, best_obj_6)

        # delta comparison skipped for insample_cv (presumably delta is not
        # meaningfully selected by that strategy — TODO confirm)
        if strategy != "insample_cv":
            self.assertAlmostEqual(best_delta_1, best_delta_2)
            self.assertAlmostEqual(best_delta_1, best_delta_3)
            #self.assertAlmostEqual(best_delta_1, best_delta_4)
            self.assertAlmostEqual(best_delta_4, best_delta_5)
            self.assertAlmostEqual(best_delta_4, best_delta_6)
# Beispiel #7 — example separator from the original scrape (score: 0)
def runselect(bed_fn=None,
              pheno_fn=None,
              strategy=None,
              select_by_ll=True,
              output_prefix=None,
              num_pcs=0,
              random_state=3,
              num_snps_in_memory=1000,
              cov_fn=None,
              k_values=None,
              delta_values=None,
              num_folds=10,
              penalty=0.0):
    """
    Run SNP feature selection over a grid of k (number of selected SNPs)
    and delta (regularization) values and return the best configuration.

    Parameters
    ----------
    bed_fn : SNP reader or file name, optional
        SNP data. When None, the bundled toy data set is used and
        pheno_fn is forced to the toy phenotype file.
    pheno_fn : str, optional
        Phenotype file name (overwritten when bed_fn is None).
    strategy : str, optional
        Selection strategy; defaults to 'lmm_full_cv', which also forces
        select_by_ll=True.
    select_by_ll : bool
        Select the best model by log-likelihood (True) or by MSE (False).
    output_prefix : str, optional
        Prefix for result files; defaults to "example_pc<num_pcs>".
    num_pcs, random_state, num_snps_in_memory, cov_fn, num_folds, penalty
        Passed through to FeatureSelectionStrategy / perform_selection.
    k_values, delta_values : sequence, optional
        Search grids; sensible defaults are built when omitted.

    Returns
    -------
    dict
        Keys 'best_k', 'best_delta', 'best_obj', 'best_snps'.
    """
    logging.basicConfig(level=logging.INFO)

    # set up data
    ##############################
    # NOTE: this intentionally overwrites pheno_fn too, so the toy SNP data
    # is always paired with the matching toy phenotype file.
    if bed_fn is None:
        bed_fn = Bed("examples/toydata")
        pheno_fn = "examples/toydata.phe"

    # set up grid
    ##############################
    num_steps_delta = 10

    if k_values is None:
        # roughly log-spaced k values plus a huge sentinel meaning "all SNPs"
        k_values = [
            0, 1, 5, 10, 20, 50, 100, 500, 1000, 2000, 5000, 10000,
            456345643256
        ]
    if delta_values is None:
        delta_values = np.logspace(-10,
                                   10,
                                   endpoint=True,
                                   num=num_steps_delta,
                                   base=np.exp(1))

    if strategy is None:
        strategy = 'lmm_full_cv'
        select_by_ll = True

    # where to save output
    ##############################
    if output_prefix is None:
        output_prefix = "example_pc%i" % (num_pcs)

    # go!
    fss = FeatureSelectionStrategy(bed_fn,
                                   pheno_fn,
                                   num_folds,
                                   random_state=random_state,
                                   num_pcs=num_pcs,
                                   num_snps_in_memory=num_snps_in_memory,
                                   interpolate_delta=False,
                                   cov_fn=cov_fn)

    best_k, best_delta, best_obj, best_snps = fss.perform_selection(
        k_values,
        delta_values,
        output_prefix=output_prefix,
        select_by_ll=select_by_ll,
        strategy=strategy,
        penalty=penalty)
    res = {
        'best_k': best_k,
        'best_delta': best_delta,
        'best_obj': best_obj,
        'best_snps': best_snps
    }
    return res