Esempio n. 1
0
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means when can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2raw=h2) #iid intersection means when can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means when can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means when can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2=h2) #iid intersection means when can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means when can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL
 def k_index_to_nLL_mapper(k):
     _, G_in, pheno_in, covar_in = _fixup(test_snps,
                                          G,
                                          pheno,
                                          covar,
                                          count_A1=count_A1)
     nll_sum = 0
     mse_sum = 0
     n_folds_in = 0
     for fold_index, (train_idx,
                      test_idx) in _kfold(G.iid_count,
                                          n_folds,
                                          seed,
                                          end_with_all=False,
                                          iid_to_index=G.iid_to_index):
         assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
         top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
         sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
         G_train = G_in[train_idx, sid_idx_in_fold] if k > 0 else None
         fastlmm = FastLMM(force_full_rank=force_full_rank,
                           force_low_rank=force_low_rank,
                           GB_goal=GB_goal)
         fastlmm.fit(
             K0_train=G_train,
             X=covar_in[train_idx, :],
             y=pheno_in[train_idx, :],
             h2raw=h2
         )  #iid intersection means when can give the whole covariate and pheno
         G_test = G_in[
             test_idx, sid_idx_in_fold] if k > 0 else KernelIdentity(
                 G_in.iid, G_in.iid[test_idx]
             )  #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
         nll, mse = fastlmm.score(
             K0_whole_test=G_test,
             X=covar_in[test_idx, :],
             y=pheno_in[test_idx, :],
             return_mse_too=True
         )  #iid intersection means when can give the whole covariate and pheno
         nll_sum += nll
         mse_sum += mse
         n_folds_in += 1
     logging.info("k={0},nLL={1},average mse={2}".format(
         k, nll_sum, mse_sum / n_folds_in))
     return nll_sum
    def test_api(self):
        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #####################################################
        # Train and standardize cov and then apply to test
        #####################################################

        cov_train, unit_trained = self.covariate_whole[train_idx,:].read().standardize(Unit(),return_trained=True)
        cov_test = self.covariate_whole[test_idx,:].read().standardize(unit_trained)

        #####################################################
        # standardize whole kernel from snps (both ways) and then pull out the 3 parts
        #####################################################
        
        whole_kernel = SnpKernel(self.covariate_whole,Unit()).read().standardize(DiagKtoN())
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        test_kernel = whole_kernel[train_idx,test_idx].read(order='A',view_ok=True)
        test_test_kernel = whole_kernel[test_idx,test_idx].read(order='A',view_ok=True)

        #####################################################
        # create train_train, train_test, and test_test based on just the training snps (both standardizations)
        #####################################################

        K_train = SnpKernel(self.snpreader_whole[train_idx,:],Unit(),block_size=100)
        train_train_kernel, snp_trained, kernel_trained = K_train._read_with_standardizing(to_kerneldata=True, kernel_standardizer=DiagKtoN(), return_trained=True)

        K_whole_test = _SnpWholeTest(train=self.snpreader_whole[train_idx,:],test=self.snpreader_whole[test_idx,:],standardizer=snp_trained,block_size=100)
        train_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[train_idx]) #The new reader may have the iids in a different order than the original reader
        train_test_kernel = K_whole_test[train_idx2,:].read().standardize(kernel_trained)

        test_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[test_idx])
        test_test_kernel = K_whole_test[test_idx2,:].read().standardize(kernel_trained)

        #####################################################
        # How does predict look with whole_test as input?
        #####################################################

        # a. - standardize whole up front
        whole_kernel = SnpKernel(self.snpreader_whole,Unit(),block_size=100).read().standardize()
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        whole_test_kernel = whole_kernel[:,test_idx].read(order='A',view_ok=True)
        fastlmm1 = FastLMM(snp_standardizer=SS_Identity(), kernel_standardizer=KS_Identity())
        fastlmm1.fit(K0_train=train_kernel, X=self.covariate_whole, y=self.pheno_whole) #iid intersection means we won't really be using whole covar or pheno
        predicted_pheno, covar = fastlmm1.predict(K0_whole_test=whole_test_kernel, X=self.covariate_whole,count_A1=False)
        output_file = self.file_name("whole")
        Dat.write(output_file,predicted_pheno)
        self.compare_files(predicted_pheno,"whole")

        # b -- just files
        fastlmm2 = FastLMM()
        fastlmm2.fit(K0_train=self.snpreader_whole[train_idx,:], X=self.covariate_whole, y=self.pheno_whole[train_idx,:]) #iid intersection means we won't really be using whole covar
        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=self.snpreader_whole[test_idx,:], X=self.covariate_whole,count_A1=False)
        self.compare_files(predicted_pheno,"one")
Esempio n. 5
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        assert K0_whole_test is None or isinstance(
            K0_whole_test, KernelIdentity)  # could also accept no snps
        assert K1_whole_test is None or isinstance(
            K1_whole_test, KernelIdentity)  # could also accept no snps

        X = _pheno_fixup(X, iid_if_none=iid_if_none)
        X = X.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.read().val,
                              np.ones((X.iid_count, 1))])
        assert np.array_equal(
            X.sid, self.covar_sid
        ), "Expect covar sids to be the same in train and test."

        pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
        ret0 = SnpData(iid=X.iid,
                       sid=self.pheno_sid,
                       val=pheno_predicted,
                       pos=np.array([[np.nan, np.nan, np.nan]]),
                       name="linear regression Prediction"
                       )  #!!!replace 'parent_string' with 'name'

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=X.iid,
                          val=np.eye(X.iid_count) * self.ssres /
                          self.iid_count)
        return ret0, ret1
 def k_index_to_nLL_mapper(k):
     _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar,count_A1=count_A1)
     nll_sum=0
     mse_sum = 0
     n_folds_in = 0
     for fold_index, (train_idx, test_idx) in _kfold(G.iid_count, n_folds, seed, end_with_all=False,iid_to_index=G.iid_to_index):
         assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
         top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
         sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
         G_train = G_in[train_idx,sid_idx_in_fold] if k > 0 else None
         fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
         fastlmm.fit(K0_train=G_train, X=covar_in[train_idx,:], y=pheno_in[train_idx,:], h2raw=h2) #iid intersection means when can give the whole covariate and pheno
         G_test = G_in[test_idx,sid_idx_in_fold] if k > 0 else KernelIdentity(G_in.iid,G_in.iid[test_idx]) #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
         nll,mse = fastlmm.score(K0_whole_test=G_test,X=covar_in[test_idx,:],y=pheno_in[test_idx,:],return_mse_too=True) #iid intersection means when can give the whole covariate and pheno
         nll_sum += nll
         mse_sum += mse
         n_folds_in += 1
     logging.info("k={0},nLL={1},average mse={2}".format(k,nll_sum,mse_sum / n_folds_in))
     return nll_sum
    def test_notebook1(self):
        do_plot=False

        import matplotlib.pyplot as plt
        from pysnptools.snpreader import Pheno,Bed
        bed = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False)
        cov = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        pheno = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt").read()

        # Now we learn from the first 400 students.
        training = bed[:400,:] #!!!later: the learning code doesn't like it if there are two instances of bed[:400] that are not "is -equal"
        fastlmm2 = FastLMM(GB_goal=2).fit(K0_train=training,
                                            X=cov[:400,:],
                                            y=pheno[:400,:])

        # Predict on training data:
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=training,
                                                            X=cov[:400,:],count_A1=False)

        assert np.array_equal(pheno.iid[:400],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plt.plot(pheno.val[:400,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r")
            plt.errorbar(pheno.val[:400,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.')
            plt.xlabel('score (actual train)')
            plt.ylabel('predicted (test on train with stdev)')
            plt.show()

        # How well does this model predict the (unseen) TEST data?
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=bed[400:500,:],
                                                            X=cov[400:500,:],count_A1=False)

        assert np.array_equal(pheno.iid[400:500],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plt.plot(pheno.val[400:500,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r")
            plt.errorbar(pheno.val[400:500,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.')
            plt.xlabel('score (actual test)')
            plt.ylabel('predicted')
            plt.show()
    def predict(self,X=None,K0_whole_test=None,K1_whole_test=None,iid_if_none=None,count_A1=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        assert K0_whole_test is None or isinstance(K0_whole_test,KernelIdentity) # could also accept no snps
        assert K1_whole_test is None or isinstance(K1_whole_test,KernelIdentity) # could also accept no snps

        X = _pheno_fixup(X,iid_if_none=iid_if_none,count_A1=count_A1)
        X = X.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                              sid=FastLMM._new_snp_name(X),
                              val=np.c_[X.read().val,np.ones((X.iid_count,1))])
        assert np.array_equal(X.sid,self.covar_sid), "Expect covar sids to be the same in train and test."

        pheno_predicted = X.val.dot(self.beta).reshape(-1,1)
        ret0 = SnpData(iid = X.iid, sid=self.pheno_sid,val=pheno_predicted,pos=np.array([[np.nan,np.nan,np.nan]]),name="linear regression Prediction") #!!!replace 'parent_string' with 'name'

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=X.iid,val=np.eye(X.iid_count)* self.ssres / self.iid_count)
        return ret0, ret1
Esempio n. 9
0
    def fit(self,
            X=None,
            y=None,
            K0_train=None,
            K1_train=None,
            h2=None,
            mixing=None,
            count_A1=None):
        """
        Method for training a :class:`FastLMM` predictor. If the examples in X, y, K0_train, K1_train are not the same, they will be reordered and intersected.

        :param X: training covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param y: training phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_train: Must be None. Represents the identity similarity matrix.
        :type K0_train: None

        :param K1_train: Must be None. Represents the identity similarity matrix.
        :type K1_train: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param h2: Ignored. Optional.
        :type h2: number

        :param mixing: Ignored. Optional.
        :type mixing: number

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool


        :rtype: self, the fitted Linear Regression predictor
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            self.is_fitted = True
            assert K0_train is None  # could also accept that ID or no snps
            assert K1_train is None  # could also accept that ID or no snps

            assert y is not None, "y must be given"

            y = _pheno_fixup(y, count_A1=count_A1)
            assert y.sid_count == 1, "Expect y to be just one variable"
            X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

            X, y = intersect_apply([X, y])
            y = y.read()
            X, covar_unit_trained = X.read().standardize(
                self.covariate_standardizer, return_trained=True)

            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=FastLMM._new_snp_name(X),
                        val=np.c_[X.val, np.ones((X.iid_count, 1))])

            lsqSol = np.linalg.lstsq(X.val, y.val[:, 0], rcond=-1)
            bs = lsqSol[0]  #weights
            r2 = lsqSol[1]  #squared residuals
            D = lsqSol[2]  #rank of design matrix
            N = y.iid_count

            self.beta = bs
            self.ssres = float(r2)
            self.sstot = ((y.val - y.val.mean())**2).sum()
            self.covar_unit_trained = covar_unit_trained
            self.iid_count = X.iid_count
            self.covar_sid = X.sid
            self.pheno_sid = y.sid
            return self
Esempio n. 10
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None,
                count_A1=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
            assert self.is_fitted, "Can only predict after predictor has been fitted"
            assert K0_whole_test is None or isinstance(
                K0_whole_test, KernelIdentity)  # could also accept no snps
            assert K1_whole_test is None or isinstance(
                K1_whole_test, KernelIdentity)  # could also accept no snps

            X = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
            X = X.read().standardize(self.covar_unit_trained)

            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=FastLMM._new_snp_name(X),
                        val=np.c_[X.read().val,
                                  np.ones((X.iid_count, 1))])
            assert np.array_equal(
                X.sid, self.covar_sid
            ), "Expect covar sids to be the same in train and test."

            pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
            ret0 = SnpData(iid=X.iid,
                           sid=self.pheno_sid,
                           val=pheno_predicted,
                           pos=np.array([[np.nan, np.nan, np.nan]]),
                           name="linear regression Prediction"
                           )  #!!!replace 'parent_string' with 'name'

            from pysnptools.kernelreader import KernelData
            ret1 = KernelData(iid=X.iid,
                              val=np.eye(X.iid_count) * self.ssres /
                              self.iid_count)
            return ret0, ret1
Esempio n. 11
0
def run_fastlmm(args):
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid,
                            args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' %
                test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file,
                                                                index=False,
                                                                sep='\t',
                                                                header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :],
                  y=pheno,
                  K0_train=K0,
                  penalty=args.penalty,
                  Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values
        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
Esempio n. 12
0
    def fit(self,
            X=None,
            y=None,
            K0_train=None,
            K1_train=None,
            h2=None,
            mixing=None):
        """
        Method for training a :class:`FastLMM` predictor. If the examples in X, y, K0_train, K1_train are not the same, they will be reordered and intersected.

        :param X: training covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: training phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_train: Must be None. Represents the identity similarity matrix.
        :type K0_train: None

        :param K1_train: Must be None. Represents the identity similarity matrix.
        :type K1_train: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param h2: Ignored. Optional.
        :type h2: number

        :param mixing: Ignored. Optional.
        :type mixing: number

        :rtype: self, the fitted Linear Regression predictor
        """
        self.is_fitted = True
        assert K0_train is None  # could also accept that ID or no snps
        assert K1_train is None  # could also accept that ID or no snps

        assert y is not None, "y must be given"

        y = _pheno_fixup(y)
        assert y.sid_count == 1, "Expect y to be just one variable"
        X = _pheno_fixup(X, iid_if_none=y.iid)

        X, y = intersect_apply([X, y])
        y = y.read()
        X, covar_unit_trained = X.read().standardize(
            self.covariate_standardizer, return_trained=True)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.val, np.ones((X.iid_count, 1))])

        lsqSol = np.linalg.lstsq(X.val, y.val[:, 0])
        bs = lsqSol[0]  #weights
        r2 = lsqSol[1]  #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = y.iid_count

        self.beta = bs
        self.ssres = float(r2)
        self.sstot = ((y.val - y.val.mean())**2).sum()
        self.covar_unit_trained = covar_unit_trained
        self.iid_count = X.iid_count
        self.covar_sid = X.sid
        self.pheno_sid = y.sid
        return self
    def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None,count_A1=None):
        """
        Method for training a :class:`FastLMM` predictor. If the examples in X, y, K0_train, K1_train are not the same, they will be reordered and intersected.

        :param X: training covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: training phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_train: Must be None. Represents the identity similarity matrix.
        :type K0_train: None

        :param K1_train: Must be None. Represents the identity similarity matrix.
        :type K1_train: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param h2: Ignored. Optional.
        :type h2: number

        :param mixing: Ignored. Optional.
        :type mixing: number

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool


        :rtype: self, the fitted Linear Regression predictor
        """
        self.is_fitted = True
        assert K0_train is None # could also accept that ID or no snps
        assert K1_train is None # could also accept that ID or no snps

        assert y is not None, "y must be given"

        y = _pheno_fixup(y,count_A1=count_A1)
        assert y.sid_count == 1, "Expect y to be just one variable"
        X = _pheno_fixup(X, iid_if_none=y.iid,count_A1=count_A1)

        X, y  = intersect_apply([X, y])
        y = y.read()
        X, covar_unit_trained = X.read().standardize(self.covariate_standardizer,return_trained=True)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                                sid=FastLMM._new_snp_name(X),
                                val=np.c_[X.val,np.ones((X.iid_count,1))])


        lsqSol = np.linalg.lstsq(X.val, y.val[:,0],rcond=-1)
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=y.iid_count

        self.beta = bs
        self.ssres = float(r2)
        self.sstot = ((y.val-y.val.mean())**2).sum()
        self.covar_unit_trained = covar_unit_trained
        self.iid_count = X.iid_count
        self.covar_sid = X.sid
        self.pheno_sid = y.sid
        return self