Example #1
def execute_dual_fs(test_snps, pheno, G0, covar):
    """
    implementation of dual-kernel feature selection
    """
    
    result = {}
    fs_result = {}
    
    
    # extract data
    G_test = test_snps.read().standardize().val
    G_train_unnorm = G0.read().standardize().val
    
    # fs conditioned on full kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(G_train_unnorm, G_train_unnorm, pheno["vals"], cov=covar["vals"])
    best_k, fs_idx, best_mix, best_delta = fs_result["insample_cond_full"]
    print "best_k:", best_k, ", best_mix:", best_mix

    # set up foreground kernel
    G1 = G0[:,fs_idx]
    
    result["full_fs_low"] = single_snp(test_snps, pheno, G0=G0, covar=covar, G1=G1, mixing=best_mix).sort(["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
Example #2
def execute_dual_fs(test_snps, pheno, G0, covar):
    """
    implementation of dual-kernel feature selection
    """

    result = {}
    fs_result = {}

    # extract data
    G_test = test_snps.read().standardize().val
    G_train_unnorm = G0.read().standardize().val

    # fs conditioned on full kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(G_train_unnorm,
                                                        G_train_unnorm,
                                                        pheno["vals"],
                                                        cov=covar["vals"])
    best_k, fs_idx, best_mix, best_delta = fs_result["insample_cond_full"]
    print "best_k:", best_k, ", best_mix:", best_mix

    # set up foreground kernel
    G1 = G0[:, fs_idx]

    result["full_fs_low"] = single_snp(test_snps,
                                       pheno,
                                       G0=G0,
                                       covar=covar,
                                       G1=G1,
                                       mixing=best_mix).sort_values(
                                           ["Chr", "ChrPos"])["PValue"].values

    return result, fs_result
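
For orientation, here is a minimal sketch of how execute_dual_fs might be driven, using the PySnpTools readers (Bed, Pheno, intersect_apply) and the synthetic test files that appear in Examples #7 and #8 below. The dictionary layout for pheno and covar is an assumption inferred from the ["vals"] lookups inside the function, and whether single_snp accepts that form directly depends on the fastlmm version; the function also assumes FeatureSelectionInSample and single_snp are already imported (e.g. from fastlmm.feature_selection.feature_selection_two_kernel and fastlmm.association). Treat this as an illustrative sketch, not part of the original code.

from pysnptools.snpreader import Bed, Pheno
from pysnptools.util import intersect_apply

# hypothetical relative paths, borrowed from the synthetic test data in Example #7
snp_reader = Bed("tests/datasets/synth/all.bed", count_A1=False)
pheno_reader = Pheno("tests/datasets/synth/pheno_10_causals.txt")
covar_reader = Pheno("tests/datasets/synth/cov.txt")

# align sample ids across SNPs, phenotype and covariates
snp_reader, pheno_reader, covar_reader = intersect_apply(
    [snp_reader, pheno_reader, covar_reader])

# test chr5 SNPs against a background kernel built from all other chromosomes
test_snps = snp_reader[:, snp_reader.pos[:, 0] == 5]
G0 = snp_reader[:, snp_reader.pos[:, 0] != 5]

# assumed dict layout: execute_dual_fs only relies on the "vals" entries
pheno = {"iid": pheno_reader.iid, "vals": pheno_reader.read().val[:, 0]}
covar = {"iid": covar_reader.iid, "vals": covar_reader.read().val}

result, fs_result = execute_dual_fs(test_snps, pheno, G0, covar)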
Example #3
    def test_regression_lr(self):

        # invoke fs
        select = FeatureSelectionInSample(n_folds=2, max_log_k=6, order_by_lmm=False, measure="mse", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(self.G, self.G, self.y, cov=self.G_cov)    
    
        # print results
        print "best_k:", best_k
        print "best_mix:", best_mix
        print "best_delta:", best_delta

        self.assertEqual(best_k, 32)
        self.assertAlmostEqual(best_mix, 0.6786566031577429, places=6)
        self.assertAlmostEqual(best_delta, 0.70148446599200931, places=6)
Example #4
    def test_regression_lr(self):

        # invoke fs
        select = FeatureSelectionInSample(n_folds=2, max_log_k=6, order_by_lmm=False, measure="mse", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(self.G, self.G, self.y, cov=self.G_cov)    
    
        # print results
        print("best_k:", best_k)
        print("best_mix:", best_mix)
        print("best_delta:", best_delta)

        self.assertEqual(best_k, 32)
        self.assertAlmostEqual(best_mix, 0.6786566031577429, places=6)
        self.assertAlmostEqual(best_delta, 0.70148446599200931, places=6)
Example #5
    def test_regression_lmm(self):


        # invoke fs
        select = FeatureSelectionInSample(n_folds=2, max_log_k=6, order_by_lmm=True, measure="mse", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(self.G, self.G, self.y, cov=self.G_cov)    
    
        # print results
        print "best_k:", best_k
        print "best_mix:", best_mix
        print "best_delta:", best_delta

        self.assertEqual(best_k, 64)
        self.assertAlmostEqual(best_mix, 0.8621642030968627, places=6)
        self.assertAlmostEqual(best_delta, 0.7255878551207211, places=6)
Example #6
    def test_regression_lmm(self):


        # invoke fs
        select = FeatureSelectionInSample(n_folds=2, max_log_k=6, order_by_lmm=True, measure="mse", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(self.G, self.G, self.y, cov=self.G_cov)    
    
        # print results
        print("best_k:", best_k)
        print("best_mix:", best_mix)
        print("best_delta:", best_delta)

        self.assertEqual(best_k, 64)
        self.assertAlmostEqual(best_mix, 0.8621642030968627, places=6)
        self.assertAlmostEqual(best_delta, 0.7255878551207211, places=6)
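
The fixtures self.G, self.y and self.G_cov used by these regression tests are not shown on this page; the sketch below builds comparable synthetic inputs with NumPy so the call can be run standalone. The shapes, the intercept column and the 10-causal-SNP phenotype are illustrative assumptions, and on random data the selected k, mixing weight and delta will of course differ from the hard-coded values asserted above.

import numpy as np
from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample

rng = np.random.RandomState(42)
N, S = 500, 200                                   # assumed sample/SNP counts

G = rng.randn(N, S)                               # stand-in for self.G
G = (G - G.mean(axis=0)) / G.std(axis=0)          # unit-standardized SNPs

G_cov = np.hstack([np.ones((N, 1)),               # stand-in for self.G_cov:
                   rng.randn(N, 1)])              # intercept + one covariate

y = G[:, :10].dot(rng.randn(10)) + rng.randn(N)   # stand-in for self.y
y = (y - y.mean()) / y.std()

select = FeatureSelectionInSample(n_folds=2, max_log_k=6, order_by_lmm=True,
                                  measure="mse", random_state=42)
best_k, feat_idx, best_mix, best_delta = select.run_select(G, G, y, cov=G_cov)
print("best_k:", best_k, "best_mix:", best_mix, "best_delta:", best_delta)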
Example #7
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn, count_A1=False)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:, snp_reader.pos[:, 0] != test_chr].read(
            order='C').standardize()
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr].read(
            order='C').standardize()

        y = pheno.read().val[:, 0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info(
            "running feature selection conditioned on background kernel")
        # the same SNP matrix G0 serves as both the candidate set and the background kernel here

        select = FeatureSelectionInSample(max_log_k=7,
                                          n_folds=7,
                                          order_by_lmm=True,
                                          measure="ll",
                                          random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val,
                                                                   G0.val,
                                                                   y,
                                                                   cov=X_cov)

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps,
                                pheno,
                                G0=G0,
                                G1=G1,
                                mixing=best_mix,
                                h2=None,
                                leave_out_one_chrom=False,
                                output_file_name=output_file_name,
                                count_A1=False)

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
Example #8
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:,snp_reader.pos[:,0] != test_chr].read(order='C').standardize()
        test_snps = snp_reader[:,snp_reader.pos[:,0] == test_chr].read(order='C').standardize()


        y = pheno.read().val[:,0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info("running feature selection conditioned on background kernel")
        # the same SNP matrix G0 serves as both the candidate set and the background kernel here

        select = FeatureSelectionInSample(max_log_k=7, n_folds=7, order_by_lmm=True, measure="ll", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val, G0.val, y, cov=X_cov)    

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))


        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:,feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps, pheno, G0=G0, G1=G1, mixing=best_mix, h2=None,leave_out_one_chrom=False,output_file_name=output_file_name)

        logging.info("results:")
        logging.info("#"*40)
        logging.info(results_df.head())
        self.compare_files(results_df,"old")