Example #1
    def _read_with_standardizing(self,
                                 to_kerneldata,
                                 kernel_standardizer=DiagKtoN(),
                                 return_trained=False):
        '''
        Reads a SnpKernel, handling two cases:
              If returning KernelData,
                 just call snpreader._read_kernel, package the result, and apply the kernel standardizer.
              If returning a simple SnpKernel that needs no more standardization,
                  read the reference and learn both standardizations (but can't this cause multiple reads?)
        Note that snp_standardizer should be None; instead, the SnpKernel should hold the placeholder value Standardizer()

        '''
        if to_kerneldata:
            val, snp_trained = self.snpreader._read_kernel(
                self.standardizer,
                block_size=self.block_size,
                return_trained=True)
            kernel = KernelData(iid=self.snpreader.iid,
                                val=val,
                                name=str(self))
            kernel, kernel_trained = kernel.standardize(kernel_standardizer,
                                                        return_trained=True)
        else:
            snpdata, snp_trained = self.snpreader.read().standardize(
                self.standardizer, return_trained=True)
            snpdata, kernel_trained = snpdata.standardize(kernel_standardizer,
                                                          return_trained=True)
            kernel = SnpKernel(snpdata, SS_Identity())

        if return_trained:
            return kernel, snp_trained, kernel_trained
        else:
            return kernel
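The internal helper above backs pysnptools' public read_kernel/standardize path. A minimal sketch of that public usage, assuming a hypothetical Bed file path (read_kernel, Unit, and DiagKtoN are real pysnptools API):

from pysnptools.snpreader import Bed
from pysnptools.standardizer import Unit
from pysnptools.kernelstandardizer import DiagKtoN

snpreader = Bed("hypothetical.bed", count_A1=False)  # path is illustrative
kernel = snpreader.read_kernel(Unit())     # KernelData built from Unit-standardized SNPs
kernel = kernel.standardize(DiagKtoN())    # rescale so the kernel diagonal sums to iid_count
print(kernel.val.shape)                    # (iid_count, iid_count)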
Example #2
    def _read_with_standardizing(self,
                                 to_kerneldata,
                                 kernel_standardizer=DiagKtoN(),
                                 return_trained=False,
                                 num_threads=None):
        '''
        Reads a SnpKernel with two cases
              If returning KernelData,
                 just calls snpreader._read_kernel, package, kernel_standardize
              If returning simple SnpKernel that needs no more standardization
                  read the reference and learn both standardization (but can't this cause multiple reads?)
        Note that snp_standardizer should be None or the standardizer instead the SnpKernel should have the placeholder value Standardizer()

        Will choose which array module to use based on the ARRAY_MODULE environment variable (e.g. 'numpy' (default) or 'cupy')

        '''
        logging.info("Starting '_read_with_standardizing'")
        xp = pstutil.array_module()

        if to_kerneldata:
            val, snp_trained = self.snpreader._read_kernel(
                self.standardizer,
                block_size=self.block_size,
                return_trained=True,
                num_threads=num_threads)
            kernel = KernelData(iid=self.snpreader.iid,
                                val=val,
                                name=str(self),
                                xp=xp)
            kernel, kernel_trained = kernel.standardize(
                kernel_standardizer,
                return_trained=True,
                num_threads=num_threads)
        else:
            snpdata, snp_trained = self.snpreader.read().standardize(
                self.standardizer,
                return_trained=True,
                num_threads=num_threads)
            snpdata, kernel_trained = snpdata.standardize(
                kernel_standardizer,
                return_trained=True,
                num_threads=num_threads)
            kernel = SnpKernel(snpdata, SS_Identity())

        logging.info("Ending '_read_with_standardizing'")
        if return_trained:
            return kernel, snp_trained, kernel_trained
        else:
            return kernel
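Per the docstring, this variant selects its array backend via pstutil.array_module(). A hedged sketch of that switch, assuming array_module reads the ARRAY_MODULE environment variable as documented:

import os
import pysnptools.util as pstutil

os.environ["ARRAY_MODULE"] = "numpy"  # 'cupy' would select GPU arrays, if installed
xp = pstutil.array_module()           # resolves the environment variable to a module
print(xp.__name__)                    # numpy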
Example #3
    def reducer_closure(result_result_sequence):
        logging.info("starting ata reducer")
        iid = [[value, value] for value in sid]
        gtg_data = KernelData(iid=iid, val=np.zeros((sid_count,sid_count)))
        for result_result in result_result_sequence:
            for piece_index, gtg_piece in result_result:
                logging.info("combining ata reducer {0}".format(piece_index))
                start = debatch_closure(piece_index)
                stop = debatch_closure(piece_index+1)
                gtg_data.val[start:, start:stop] = gtg_piece
                gtg_data.val[start:stop, start + gtg_piece.shape[1]:] = gtg_piece[gtg_piece.shape[1]:, :].T
        result = writer(gtg_data)
        return result
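A hedged, self-contained sketch of the block assembly above: each gtg_piece holds columns start:stop of G'G from the diagonal block down, and the part below the diagonal block is mirrored into the matching row. All names here are illustrative:

import numpy as np

rng = np.random.RandomState(0)
G = rng.randn(5, 4)                    # 5 samples x 4 sids
full = G.T.dot(G)                      # reference result, 4 x 4

gtg = np.zeros((4, 4))
for start, stop in [(0, 2), (2, 4)]:   # two column batches
    piece = G.T[start:, :].dot(G[:, start:stop])  # rows start:, columns start:stop
    width = piece.shape[1]
    gtg[start:, start:stop] = piece                       # column block, diagonal down
    gtg[start:stop, start + width:] = piece[width:, :].T  # mirror the sub-diagonal part
assert np.allclose(gtg, full)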
Example #4
    def k_mix(self, K0, K1):  #!!!later add special case code for mixing==1 and 0
        K0_b = K0.read().standardize(self.kernel_trained0)
        K1_b = K1.read().standardize(self.kernel_trained1)
        # similar code elsewhere
        K = np.empty(K0_b.val.shape)
        _mix_from_Ks(K, K0_b.val, K1_b.val, self.mixing)
        K = KernelData(val=K, iid0=K0_b.iid0, iid1=K0_b.iid1)
        return K
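_mix_from_Ks is an internal helper; assuming it fills K with the convex combination (1 - mixing) * K0 + mixing * K1 (the same formula used to build GplusE_kernel in the work_item examples below), an equivalent plain-numpy sketch is:

import numpy as np

def mix_kernels(K0_val, K1_val, mixing):
    # convex combination of two already-standardized kernel matrices
    return (1.0 - mixing) * K0_val + mixing * K1_val

K = mix_kernels(np.eye(3), np.ones((3, 3)), 0.25)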
Example #5
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        assert K0_whole_test is None or isinstance(
            K0_whole_test, KernelIdentity)  # could also accept no snps
        assert K1_whole_test is None or isinstance(
            K1_whole_test, KernelIdentity)  # could also accept no snps

        X = _pheno_fixup(X, iid_if_none=iid_if_none)
        X = X.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.read().val,
                              np.ones((X.iid_count, 1))])
        assert np.array_equal(
            X.sid, self.covar_sid
        ), "Expect covar sids to be the same in train and test."

        pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
        ret0 = SnpData(iid=X.iid,
                       sid=self.pheno_sid,
                       val=pheno_predicted,
                       pos=np.array([[np.nan, np.nan, np.nan]]),
                       name="linear regression Prediction"
                       )  #!!!replace 'parent_string' with 'name'

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=X.iid,
                          val=np.eye(X.iid_count) * self.ssres /
                          self.iid_count)
        return ret0, ret1
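The np.c_ line above appends a column of ones so that the fitted beta can absorb a constant offset. A tiny stand-alone illustration:

import numpy as np

vals = np.arange(6, dtype=float).reshape(3, 2)        # 3 examples, 2 covariates
with_bias = np.c_[vals, np.ones((vals.shape[0], 1))]  # append the constant column
print(with_bias.shape)                                # (3, 3)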
Example #6
    def _read_with_standardizing(self, to_kerneldata, kernel_standardizer=DiagKtoN(), return_trained=False):
        '''
        Reads a SnpKernel with two cases
              If returning KernelData,
                 just calls snpreader._read_kernel, package, kernel_standardize
              If returning simple SnpKernel that needs no more standardization
                  read the reference and learn both standardization (but can't this cause multiple reads?)
        Note that snp_standardizer should be None or the standardizer instead the SnpKernel should have the placeholder value Standardizer()

        '''
        if to_kerneldata:
            val, snp_trained = self.snpreader._read_kernel(self.standardizer,block_size=self.block_size,return_trained=True)
            kernel = KernelData(iid=self.snpreader.iid, val=val, name=str(self))
            kernel, kernel_trained = kernel.standardize(kernel_standardizer,return_trained=True)
        else:
            snpdata, snp_trained = self.snpreader.read().standardize(self.standardizer, return_trained=True)
            snpdata, kernel_trained = snpdata.standardize(kernel_standardizer, return_trained=True)
            kernel = SnpKernel(snpdata, SS_Identity())

        if return_trained:
            return kernel, snp_trained, kernel_trained
        else:
            return kernel
Example #7
    def write_kernel(self, path, filetype='hdf5'):
        """Write constructed background kernel :math:`K_0` to file, using eihter pysnptools.kernelreader.KernelHdf5 or pysnptools.kernelreader.KernelNpz.

        :param str path: Path to the output file to be created.
        :param str filetype: Either 'hdf5' or 'npz'
        """
        if self.K0 is None:
            if self.G0 is not None:
                raise ValueError(
                    'G0 is initialized: Number of individuals < number of variants. In this case no kernel is constructed.'
                )
            raise ValueError(
                'K0 is not initialized, need to call compute_background_kernel() first'
            )
        elif filetype == 'hdf5':
            KernelHdf5.write(path, KernelData(self.iid_fid.values,
                                              val=self.K0))
        elif filetype == 'npz':
            KernelNpz.write(path, KernelData(self.iid_fid.values, val=self.K0))
        else:
            raise ValueError(
                'filetype has to be either "npz" or "hdf5", got {}'.format(
                    filetype))
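A hedged round-trip sketch for write_kernel's 'npz' branch, using the real KernelData and KernelNpz classes; the path and iids are illustrative:

import numpy as np
from pysnptools.kernelreader import KernelData, KernelNpz

iid = [["fam0", "iid0"], ["fam1", "iid1"]]
kd = KernelData(iid=iid, val=np.eye(2))
KernelNpz.write("hypothetical.kernel.npz", kd)         # write to disk
kd_back = KernelNpz("hypothetical.kernel.npz").read()  # read it back as KernelData
assert np.allclose(kd.val, kd_back.val)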
Example #8
    def generate_and_analyze(seed,
                             N,
                             do_shuffle,
                             just_testing=True,
                             map_function=None,
                             cache_folder=None):

        #Generate SNPs
        snpdata = snp_gen(fst=.1,
                          dfr=0,
                          iid_count=N,
                          sid_count=1000,
                          chr_count=10,
                          label_with_pop=True,
                          seed=seed)
        K_causal = snpdata.read_kernel(Unit()).standardize()

        #Generate geo-spatial locations and K_loc
        distance_between_centers = 2500000
        x0 = distance_between_centers * 0.5
        x1 = distance_between_centers * 1.5
        y0 = distance_between_centers
        y1 = distance_between_centers
        sd = distance_between_centers / 4.

        spatial_iid = snpdata.iid
        center_dict = {"0": (x0, y0), "1": (x1, y1)}
        centers = np.array(
            [center_dict[iid_item[0]] for iid_item in spatial_iid])
        np.random.seed(seed)
        logging.info("Generating positions for seed {0}".format(seed))
        spatial_coor = SnpData(
            iid=snpdata.iid,
            sid=["x", "y"],
            val=centers + np.random.multivariate_normal(
                [0, 0], [[1, 0], [0, 1]], size=len(centers)) * sd,
            parent_string="'spatial_coor_gen_original'")
        alpha = distance_between_centers
        spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
        K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

        #Generate phenotype
        iid = K_causal.iid
        iid_count = K_causal.iid_count
        np.random.seed(seed)
        pheno_causal = SnpData(iid=iid,
                               sid=["causal"],
                               val=np.random.multivariate_normal(
                                   np.zeros(iid_count),
                                   K_causal.val).reshape(-1, 1),
                               parent_string="causal")
        np.random.seed(seed ^ 998372)
        pheno_noise = SnpData(iid=iid,
                              sid=["noise"],
                              val=np.random.normal(size=iid_count).reshape(
                                  -1, 1),
                              parent_string="noise")
        np.random.seed(seed ^ 12230302)
        pheno_loc_original = SnpData(iid=iid,
                                     sid=["loc_original"],
                                     val=np.random.multivariate_normal(
                                         np.zeros(iid_count),
                                         K_loc.val).reshape(-1, 1),
                                     parent_string="loc_original")

        if do_shuffle:
            idx = np.arange(iid_count)
            np.random.seed(seed)
            np.random.shuffle(idx)
            pheno_loc = pheno_loc_original.read(
                view_ok=True
            )  #don't need to copy, because the next line will be fresh memory
            pheno_loc.val = pheno_loc.val[idx, :]
        else:
            pheno_loc = pheno_loc_original

        pheno = SnpData(iid=iid,
                        sid=["pheno_all"],
                        val=pheno_causal.val + pheno_noise.val + pheno_loc.val)

        #Analyze data
        alpha_list = [
            int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)
        ]
        dataframe = heritability_spatial_correction(
            snpdata,
            spatial_coor.val,
            spatial_iid,
            alpha_list=[alpha] if just_testing else alpha_list,
            pheno=pheno,
            alpha_power=2,
            jackknife_count=0,
            permute_plus_count=0,
            permute_times_count=0,
            just_testing=just_testing,
            map_function=map_function,
            cache_folder=cache_folder)

        logging.info(dataframe)
        return dataframe
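spatial_similarity comes from fastlmm and its exact formula is not shown on this page; a plausible self-contained stand-in consistent with its alpha and power parameters (an assumption, not fastlmm's definition) would be:

import numpy as np

def spatial_similarity_sketch(coor, alpha, power=2):
    # assumed form: similarity decays with pairwise Euclidean distance, scaled by alpha
    diff = coor[:, None, :] - coor[None, :, :]
    dist = np.sqrt((diff ** 2).sum(-1))
    return np.exp(-(dist / alpha) ** power)

K = spatial_similarity_sketch(np.random.randn(5, 2), alpha=1.0)  # 5 x 5, ones on the diagonal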
Example #9
def work_item2(pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
               xxx_todo_changeme, xxx_todo_changeme1, xxx_todo_changeme2,
               just_testing, do_uncorr, do_gxe2, a2):

    #########################################
    # Load GPS info from filename if that's the way it is given
    ########################################
    (jackknife_index, jackknife_count, jackknife_seed) = xxx_todo_changeme
    (permute_plus_index, permute_plus_count,
     permute_plus_seed) = xxx_todo_changeme1
    (permute_times_index, permute_times_count,
     permute_times_seed) = xxx_todo_changeme2
    if isinstance(spatial_coor, str):
        assert spatial_iid is None, "if spatial_coor is a str, then spatial_iid should be None"
        gps_table = pd.read_csv(spatial_coor, delimiter=" ").dropna()
        spatial_iid = np.array([(v, v) for v in gps_table["id"].values])
        spatial_coor = gps_table[["south_new", "east_new"]].values

    #########################################
    # Remove any missing values from pheno
    ########################################
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.val[:, 0], :]  # Excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = model_selection.KFold(n_splits=jackknife_count,
                                       shuffle=True,
                                       random_state=jackknife_seed %
                                       4294967295).split(
                                           list(range(G_kernel.iid_count)))
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        #We shuffle the val, but not the iid, because that would cancel out.
    #Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(
            iid=E_kernel.iid,
            val=E_kernel_temp.val,
            name="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()  # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    #########################################
    # find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################

    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(
            h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################

    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
            h2corr_raw = h2
        else:
            h2corr, e2, a2, nLLcorr, h2corr_raw = 0, 0, .5, 0, 0
        logging.info(
            "G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3} (h2corr_raw:{4})"
            .format(h2corr, e2, a2, nLLcorr, h2corr_raw))
    else:
        h2corr, e2, nLLcorr, h2corr_raw = np.nan, np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################

    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        #Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid,
                                   val=val,
                                   name="{0} G + {1} E".format(1 - a2, a2))
        #Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create GxE Kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")

        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            #We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed(
                (permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)

        GxE_kernel = KernelData(
            iid=G_kernel.iid, val=val, name="GxE"
        )  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info(
            "G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".
            format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################

    ret = {
        "h2uncorr": h2uncorr,
        "nLLuncorr": nLLuncorr,
        "h2corr": h2corr,
        "h2corr_raw": h2corr_raw,
        "e2": e2,
        "a2": a2,
        "nLLcorr": nLLcorr,
        "gxe2": gxe2,
        "a2_gxe2": a2_gxe2,
        "nLL_gxe2": nLL_gxe2,
        "alpha": alpha,
        "alpha_power": alpha_power,
        "phen": np.array(pheno.sid, dtype='str')[0],
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed
    }

    logging.info("run_line: {0}".format(ret))
    return ret
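The permutation steps above reorder both axes of a kernel matrix with pstutil.sub_matrix; plain numpy does the same with np.ix_. A quick check that rows and columns move together:

import numpy as np

rng = np.random.RandomState(0)
val = rng.randn(4, 4)
val = val + val.T                    # a symmetric stand-in for a kernel
idx = np.array([2, 0, 3, 1])
shuffled = val[np.ix_(idx, idx)]     # rows and columns permuted together
assert shuffled[0, 2] == val[2, 3]   # entry (idx[0], idx[2])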
Example #10
        color_dict = {"0": "r", "1": "b", "2": "g"}
        colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]]
        plt.axis('equal')
        plt.scatter(spatial_coor_gen_original.val[:, 0],
                    spatial_coor_gen_original.val[:, 1],
                    c=colors)
        plt.show()

    from fastlmm.association.heritability_spatial_correction import spatial_similarity
    from pysnptools.kernelreader import KernelData

    alpha = distance_between_centers
    spatial_val = spatial_similarity(spatial_coor_gen_original.val,
                                     alpha,
                                     power=2)
    K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

    if do_plot:
        pylab.suptitle("$K_{loc}$")
        pylab.imshow(K_loc.val, cmap=pylab.gray(), vmin=0, vmax=1)
        pylab.show()

    from pysnptools.snpreader import SnpData

    iid = K_causal.iid
    iid_count = K_causal.iid_count
    np.random.seed(seed)
    pheno_causal = SnpData(iid=iid,
                           sid=["causal"],
                           val=np.random.multivariate_normal(
                               np.zeros(iid_count),
Example #11
    def combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=False, force_low_rank=False,snp_standardizer=None,kernel_standardizer=None,block_size=None):
        from pysnptools.kernelstandardizer import Identity as KS_Identity

        assert K0 is not None
        assert K1 is not None
        assert K0.iid0 is K0.iid1, "Expect K0 to be square"
        assert K1.iid0 is K1.iid1, "Expect K1 to be square"
        assert np.array_equal(K0.iid, K1.iid), "Expect K0 and K1 to have matching iids"
        assert kernel_standardizer is not None, "expect values for kernel_standardizer"

        mixer = _Mixer(False,KS_Identity(),KS_Identity(),mixing)

        sid_count_0 = _Mixer.sid_counter(K0, force_full_rank, force_low_rank)
        sid_count_1 = _Mixer.sid_counter(K1, force_full_rank, force_low_rank)

        #################################
        # Both Identity (or not given)
        #################################
        if sid_count_0 + sid_count_1 == 0:
            h2 = h2 or 0
            mixer.mixing = mixer.mixing or 0
            K = K0.read() #would be nice to use LinearRegression or low-rank with 0 snps

        #################################
        # Low-rank case: fewer total sids than iids (or force_low_rank)
        #################################
        elif sid_count_0 + sid_count_1 < K0.iid_count or force_low_rank:
            mixer.do_g = True
            #!!!there is no need for block_size here because we want G0 in full. But if starting with SNPs and not low-rank then batches are needed and the two standardizers must be remembered for use later

            if sid_count_0 > 0:
                K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)
            if sid_count_1 > 0:
                K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)

            if sid_count_1 == 0:
                mixer.mixing = mixer.mixing or 0
                K = K0
            elif sid_count_0 == 0:
                mixer.mixing = mixer.mixing or 1
                K = K1
            else:
                if mixer.do_g:
                    G = np.empty((K0.iid_count, K0.sid_count + K1.sid_count))
                    if mixer.mixing is None:
                        mixer.mixing, h2 = _find_mixing_from_Gs(G, covar, K0.snpreader.val, K1.snpreader.val, h2, y)

                    if mixer.mixing == 0:
                        K = K0
                    elif mixer.mixing == 1:
                        K = K1
                    else:
                        _mix_from_Gs(G, K0.snpreader.val, K1.snpreader.val, mixer.mixing)
                        G = SnpData(iid=K0.iid,
                                    sid=["K0_{0}".format(i) for i in range(K0.sid_count)] + ["K1_{0}".format(i) for i in range(K1.sid_count)],  # rename the sids so that they can't collide.
                                    val=G, name="{0}&{1}".format(K0.snpreader, K1.snpreader),
                                    pos=np.concatenate((K0.pos, K1.pos), axis=0))
                        K = SnpKernel(G,SS_Identity(),block_size=block_size)
        else:
            mixer.do_g = False
            if sid_count_0 > 0: #!!!but what if we have SNP data but still need to remember the standardizer?
                K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=True,return_trained=True)#!!!pass in a new argument, the kernel_standardizer(???)

            if sid_count_1 > 0:
                K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=True,return_trained=True)

            if sid_count_1 == 0:
                mixer.mixing = mixer.mixing or 0
                K = K0
            elif sid_count_0 == 0:
                mixer.mixing = mixer.mixing or 1
                K = K1
            else:
                K = np.empty(K0.val.shape)
                if mixer.mixing is None:
                    mixer.mixing, h2 = _find_mixing_from_Ks(K, covar, K0.val, K1.val, h2, y)
                _mix_from_Ks(K, K0.val, K1.val, mixer.mixing)
                assert K.shape[0] == K.shape[1] and abs(np.diag(K).sum() - K.shape[0]) < 1e-7, "Expect mixed K to be standardized"
                K = KernelData(val=K,iid=K0.iid)

        return K, h2, mixer
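Example #12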
def work_item(arg_tuple):               
    (pheno, G_kernel, spatial_coor, spatial_iid, alpha,alpha_power,    # The main inputs
     (jackknife_index, jackknife_count, jackknife_seed),               # Jackknifing and permutations inputs
     (permute_plus_index, permute_plus_count, permute_plus_seed),
     (permute_times_index, permute_times_count, permute_times_seed),
     just_testing, do_uncorr, do_gxe2, a2) = arg_tuple                 # Shortcutting work

    #########################################
    # Remove any missing values from pheno
    #########################################
    pheno = pheno.read()
    pheno = pheno[pheno.val[:,0]==pheno.val[:,0],:] #Excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid,val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno  = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = cross_validation.KFold(n=G_kernel.iid_count, n_folds=jackknife_count, shuffle=True, random_state=jackknife_seed%4294967295)
        iid_index,_ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index,:]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        #We shuffle the val, but not the iid, because that would cancel out.
    #Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index)%4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(iid=E_kernel.iid,val=E_kernel_temp.val,name="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()       # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize() # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize() # defaults to DiagKtoN standardize

    #########################################
    # find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################

    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan,np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count,1])) # just a bias column
        lmmg.sety(pheno.val[:,0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0,0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(h2uncorr,nLLuncorr))
    
    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################

    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count,1])) # just a bias column
        lmm1.sety(pheno.val[:,0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1-a2)
            e2 = h2 * a2
        else:
            h2corr, e2, a2, nLLcorr = 0,0,.5,0
        logging.info("G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3}".format(h2corr,e2,a2,nLLcorr))
    else:
        h2corr, e2, nLLcorr = np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################

    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        #Create the G+E kernel by mixing according to a2
        val=(1-a2)*G_kernel.val + a2*E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid, val=val,name="{0} G + {1} E".format(1-a2,a2))
        #Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create GxE Kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")

        val=G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            #We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed((permute_times_seed + permute_times_index)%4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)

        GxE_kernel = KernelData(iid=G_kernel.iid, val=val,name="GxE") # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count,1])) # just a bias column
        lmm2.sety(pheno.val[:,0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0,.5,0
        logging.info("G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".format(gxe2, a2_gxe2, nLL_gxe2))
        
    #########################################
    # Return results
    #########################################

    ret = {"h2uncorr": h2uncorr, "nLLuncorr": nLLuncorr, "h2corr": h2corr, "e2":e2, "a2": a2, "nLLcorr": nLLcorr,
           "gxe2": gxe2, "a2_gxe2": a2_gxe2, "nLL_gxe2": nLL_gxe2, "alpha": alpha, "alpha_power":alpha_power, "phen": pheno.sid[0],
           "jackknife_index": jackknife_index, "jackknife_count":jackknife_count, "jackknife_seed":jackknife_seed,
           "permute_plus_index": permute_plus_index, "permute_plus_count":permute_plus_count, "permute_plus_seed":permute_plus_seed,
           "permute_times_index": permute_times_index, "permute_times_count":permute_times_count, "permute_times_seed":permute_times_seed
           }
    
    logging.info("run_line: {0}".format(ret))
    return ret
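The pheno filter in this example keeps rows whose value equals itself, which relies on NaN != NaN. A tiny self-contained check of that idiom:

import numpy as np

v = np.array([1.0, np.nan, 3.0])
mask = v == v          # False exactly where v is NaN
print(mask)            # [ True False  True]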
Example #13
    def read(self,
             order='F',
             dtype=np.float64,
             force_python_only=False,
             view_ok=False,
             num_threads=None):
        """Reads the kernel values and returns a :class:`.KernelData` (with :attr:`KernelData.val` property containing a new ndarray of the kernel values).

        :param order: {'F' (default), 'C', 'A'}, optional -- Specify the order of the ndarray. If order is 'F' (default),
            then the array will be in F-contiguous order (iid0-index varies the fastest).
            If order is 'C', then the returned array will be in C-contiguous order (iid1-index varies the fastest).
            If order is 'A', then the :attr:`KernelData.val`
            ndarray may be in any order (either C-, Fortran-contiguous, or even discontiguous).
        :type order: string or None

        :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`KernelData.val` ndarray.
        :type dtype: data-type

        :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that the read
            be done without outside library code.
        :type force_python_only: bool

        :param view_ok: optional -- If False (default), allocates new memory for the :attr:`KernelData.val`'s ndarray. If True,
            if practical and reading from a :class:`KernelData`, will return a new
            :class:`KernelData` whose ndarray shares memory with the original :class:`KernelData`.
            Typically, you'll also wish to use "order='A'" to increase the chance that sharing will be possible.
            Use these parameters with care because any change to either ndarray (for example, via :meth:`.KernelData.standardize`) will affect
            the other. Also keep in mind that :meth:`read` relies on ndarray's mechanisms to decide whether to actually
            share memory and so it may ignore your suggestion and allocate a new ndarray anyway.
        :type view_ok: bool

        :param num_threads: optional -- The number of threads with which to standardize data. Defaults to all available
            processors. Can also be set with these environment variables (listed in priority order):
            'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
        :type num_threads: None or int

        :rtype: :class:`.KernelData`

        Calling the method again causes the kernel values to be re-read and creates a new in-memory :class:`.KernelData` with a new ndarray of kernel values.

        If you request the values for only a subset of the iids, (to the degree practical) only that subset will be read from disk.

        :Example:

        >>> from pysnptools.kernelreader import KernelNpz
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> npz_file = example_file('pysnptools/examples/toydata.kernel.npz')
        >>> kernel_on_disk = KernelNpz(npz_file)
        >>> kerneldata1 = kernel_on_disk.read() # Read all the kernel data returning a KernelData instance
        >>> print(type(kerneldata1.val).__name__) # The KernelData instance contains a ndarray of the data.
        ndarray
        >>> subset_kerneldata = kernel_on_disk[::2].read() # From the disk, read kernel values for every other iid
        >>> print('{0:.6f}'.format(subset_kerneldata.val[0,0])) # Print the first kernel value in the subset
        9923.069928
        >>> subsub_kerneldata = subset_kerneldata[:10].read(order='A',view_ok=True) # Create an in-memory subset of the subset with kernel values for the first ten iids. Share memory if practical.
        >>> import numpy as np
        >>> #print(np.may_share_memory(subset_kerneldata.val, subsub_kerneldata.val)) # Do the two ndarray's share memory? They could. Currently they won't.       
        """
        dtype = np.dtype(dtype)
        val = self._read(None, None, order, dtype, force_python_only, view_ok,
                         num_threads)
        from pysnptools.kernelreader import KernelData
        ret = KernelData(iid0=self.iid0,
                         iid1=self.iid1,
                         val=val,
                         name=str(self))
        return ret
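A hedged sketch of the view_ok behavior documented above: with order='A' and view_ok=True, re-reading an in-memory KernelData may share its ndarray, while a plain read always copies:

import numpy as np
from pysnptools.kernelreader import KernelData

kd = KernelData(iid=[["f0", "i0"], ["f1", "i1"]], val=np.eye(2))
maybe_view = kd.read(order='A', view_ok=True)  # may share memory with kd.val
fresh_copy = kd.read()                         # always allocates new memory
print(np.may_share_memory(kd.val, maybe_view.val))  # often True, but not guaranteed
print(np.may_share_memory(kd.val, fresh_copy.val))  # False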
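Example #15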
def work_item(arg_tuple):
    (
        pheno,
        G_kernel,
        spatial_coor,
        spatial_iid,
        alpha,
        alpha_power,  # The main inputs
        (jackknife_index, jackknife_count,
         jackknife_seed),  # Jackknifing and permutations inputs
        (permute_plus_index, permute_plus_count, permute_plus_seed),
        (permute_times_index, permute_times_count, permute_times_seed),
        just_testing,
        do_uncorr,
        do_gxe2,
        a2) = arg_tuple  # Shortcutting work

    #########################################
    # Remove any missing values from pheno
    #########################################
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.val[:, 0], :]  # Excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info into a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = cross_validation.KFold(n=G_kernel.iid_count,
                                        n_folds=jackknife_count,
                                        shuffle=True,
                                        random_state=jackknife_seed %
                                        4294967295)
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
        #We shuffle the val, but not the iid, because that would cancel out.
    #Integrate the permute_plus_index into the random seed.
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        new_index = np.arange(G_kernel.iid_count)
        np.random.shuffle(new_index)
        E_kernel_temp = E_kernel[new_index].read()
        E_kernel = KernelData(
            iid=E_kernel.iid,
            val=E_kernel_temp.val,
            parent_string="permutation {0}".format(permute_plus_index))

    pheno = pheno.read().standardize()  # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    #########################################
    # find h2uncorr, the best mixing weight of pure random noise and G_kernel
    #########################################

    if not do_uncorr:
        h2uncorr, nLLuncorr = np.nan, np.nan
    else:
        logging.info("Find best h2 for G_kernel")
        lmmg = LMM()
        lmmg.setK(K0=G_kernel.val)
        lmmg.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmmg.sety(pheno.val[:, 0])
        if not just_testing:
            resg = lmmg.findH2()
            h2uncorr, nLLuncorr = resg["h2"], resg["nLL"]
        else:
            h2uncorr, nLLuncorr = 0, 0
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(
            h2uncorr, nLLuncorr))

    #########################################
    # Find a2, the best mixing for G_kernel and E_kernel
    #########################################

    if a2 is None:
        logging.info("Find best mixing for G_kernel and E_kernel")
        lmm1 = LMM()
        lmm1.setK(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        lmm1.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm1.sety(pheno.val[:, 0])
        if not just_testing:
            res1 = lmm1.findA2()
            h2, a2, nLLcorr = res1["h2"], res1["a2"], res1["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
        else:
            h2corr, e2, a2, nLLcorr = 0, 0, .5, 0
        logging.info(
            "G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3}".
            format(h2corr, e2, a2, nLLcorr))
    else:
        h2corr, e2, nLLcorr = np.nan, np.nan, np.nan

    #########################################
    # Find a2_gxe2, the best mixing for G+E_kernel and the GxE kernel
    #########################################

    if not do_gxe2:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan
    else:
        #Create the G+E kernel by mixing according to a2
        val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid,
                                   val=val,
                                   parent_string="{0} G + {1} E".format(
                                       1 - a2, a2))
        #Don't need to standardize GplusE_kernel because it's the weighted combination of standardized kernels

        # Create GxE Kernel and then find the best mixing of it and GplusE
        logging.info("Find best mixing for GxE and GplusE_kernel")

        val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            #We shuffle the val, but not the iid, because doing both would cancel out
            np.random.seed(
                (permute_times_seed + permute_times_index) % 4294967295)
            new_index = np.arange(G_kernel.iid_count)
            np.random.shuffle(new_index)
            val = pstutil.sub_matrix(val, new_index, new_index)

        GxE_kernel = KernelData(
            iid=G_kernel.iid, val=val, parent_string="GxE"
        )  # recall that Python '*' is just element-wise multiplication
        GxE_kernel = GxE_kernel.standardize()

        lmm2 = LMM()
        lmm2.setK(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        lmm2.setX(np.ones([G_kernel.iid_count, 1]))  # just a bias column
        lmm2.sety(pheno.val[:, 0])
        if not just_testing:
            res2 = lmm2.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = res2["h2"], res2["a2"], res2["nLL"]
            gxe2 *= a2_gxe2
        else:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        logging.info(
            "G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".
            format(gxe2, a2_gxe2, nLL_gxe2))

    #########################################
    # Return results
    #########################################

    ret = {
        "h2uncorr": h2uncorr,
        "nLLuncorr": nLLuncorr,
        "h2corr": h2corr,
        "e2": e2,
        "a2": a2,
        "nLLcorr": nLLcorr,
        "gxe2": gxe2,
        "a2_gxe2": a2_gxe2,
        "nLL_gxe2": nLL_gxe2,
        "alpha": alpha,
        "alpha_power": alpha_power,
        "phen": pheno.sid[0],
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed
    }

    logging.info("run_line: {0}".format(ret))
    return ret
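Example #16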
    def test_lmm(self):
        do_plot = False
        iid_count = 500
        seed = 0


        import pylab
        logging.info("TestLmmTrain test_lmm")

        iid = [["cid{0}P{1}".format(iid_index,iid_index//250)]*2 for iid_index in xrange(iid_count)]
        train_idx = np.r_[10:iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids


        #Every person is 100% related to everyone in one of 5 families
        K0a = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index0 in range(iid_count):
            for iid_index1 in range(iid_count):
                K0a.val[iid_index0,iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
                if iid_index1 < iid_index0:
                    assert K0a.val[iid_index0,iid_index1] == K0a.val[iid_index1,iid_index0]

        #every person lives on a line from 0 to 1
        # They are related to every other person as a function of distance on the line
        np.random.seed(seed)
        home = np.random.random([iid_count])
        K0b = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index in range(iid_count):
            K0b.val[iid_index,:] = 1 - np.abs(home-home[iid_index])**.1

        #make covar just numbers 0,1,...
        covar = SnpData(iid=iid,sid=["x"],val=np.array([[float(num)] for num in xrange(iid_count)]))
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()

        for name, h2, K0 in [("clones", 1, K0a),("line_world",.75,K0b)]:

            sigma2x = 100
            varg = sigma2x * h2
            vare = sigma2x * (1-h2)

            #######################################################################
            #make pheno  # pheno = 2*covar+100+normal(0,1)*2.5+normal(0,K)*7.5
            #######################################################################
            #random.multivariate_normal is sensitive to mkl_num_thread, so we control it.
            if 'MKL_NUM_THREADS' in os.environ:
                mkl_num_thread = os.environ['MKL_NUM_THREADS']
            else:
                mkl_num_thread = None
            os.environ['MKL_NUM_THREADS'] = '1'
            np.random.seed(seed)
            p1 = covar.val * 2.0 + 100
            p2 = np.random.normal(size=covar.val.shape)*np.sqrt(vare)
            p3 = (np.random.multivariate_normal(np.zeros(iid_count),K0.val)*np.sqrt(varg)).reshape(-1,1)
            if mkl_num_thread is not None:
                os.environ['MKL_NUM_THREADS'] = mkl_num_thread
            else:
                del os.environ['MKL_NUM_THREADS']
            pheno = SnpData(iid=iid,sid=["pheno0"],val= p1 + p2 + p3)

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle(name + ": Plot training x and y, testing x and y")
                pylab.show()

            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
                nLL -= 0.5*D*np.log(2*np.pi*sigma2)  # REML term

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name + ": real linear regression: actual to prediction")
                pylab.show()

            for factor in [1,100,.02]:
                K0 = K0.read()
                K0.val *= factor

                K0_train = K0[train_idx]
                K0_whole_test = K0[:,test_idx]

                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                v2 = np.var(p2)
                v3 = np.var(p3)
                logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3/(v2+v3), fastlmmx.h2raw))
                
                
                filename = self.tempout_dir + "/model_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                    output_file = self.file_name("lmma_"+name)
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar_pheno.row,sid=covar_pheno.col[:,1],val=covar_pheno.val) #kludge to write kernel to text format
                    output_file = self.file_name("lmma.cov_"+name)
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar_pheno.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lmma_"+name)
                    self.compare_files(covar2,"lmma.cov_"+name)

                    predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:,0], X=covariate_train[0,:],count_A1=False) #test on train #0
                    assert np.abs(predicted_pheno0.val[0,0] - predicted_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                    assert np.abs(covar_pheno0.val[0,0] - covar_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"


                #Predict with model (test on test)
                predicted_phenoB, covar_phenoB  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
                output_file = self.file_name("lmmb_"+name)
                Dat.write(output_file,predicted_phenoB)
                covar2 = SnpData(iid=covar_phenoB.row,sid=covar_phenoB.col[:,1],val=covar_phenoB.val) #kludge to write kernel to text format
                output_file = self.file_name("lmmb.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar_phenoB.val))
                predicted = predicted_phenoB.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_phenoB,"lmmb_"+name)
                self.compare_files(covar2,"lmmb.cov_"+name)

                predicted_phenoB0, covar_phenoB0  = fastlmm.predict(K0_whole_test=K0_whole_test[:,0], X=covariate_test[0,:],count_A1=False) #test on a single test case
                assert np.abs(predicted_phenoB0.val[0,0] - predicted_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_phenoB0.val[0,0] - covar_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

                #Predict with model test on some train and some test
                some_idx = list(range(covar.iid_count))  # list() so that .remove() works under Python 3
                some_idx.remove(train_idx[0])
                some_idx.remove(test_idx[0])
                covariate_some = covar[some_idx,:]
                K0_whole_some = K0[:,some_idx]
                predicted_phenoC, covar_phenoC  = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some,count_A1=False)
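                # check, case by case, that the mixed-set predictions match the earlier
                # test-on-train (predicted_pheno) and test-on-test (predicted_phenoB) results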
                for idxC, iidC in enumerate(predicted_phenoC.iid):
                    meanC = predicted_phenoC.val[idxC]
                    varC = covar_phenoC.val[idxC,idxC]
                    if iidC in predicted_pheno.iid:
                        predicted_pheno_ref = predicted_pheno
                        covar_pheno_ref = covar_pheno
                    else:
                        assert iidC in predicted_phenoB.iid
                        predicted_pheno_ref = predicted_phenoB
                        covar_pheno_ref = covar_phenoB
                    idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                    mean_ref = predicted_pheno_ref.val[idx_ref]
                    var_ref = covar_pheno_ref.val[idx_ref,idx_ref]
                    assert np.abs(meanC - mean_ref) < 1e-6
                    assert np.abs(varC - var_ref) < 1e-6
Example No. 17
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None,
                count_A1=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: None

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
            assert self.is_fitted, "Can only predict after predictor has been fitted"
            assert K0_whole_test is None or isinstance(
                K0_whole_test, KernelIdentity)  # could also accept no snps
            assert K1_whole_test is None or isinstance(
                K1_whole_test, KernelIdentity)  # could also accept no snps

            X = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
            X = X.read().standardize(self.covar_unit_trained)

            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=FastLMM._new_snp_name(X),
                        val=np.c_[X.read().val,
                                  np.ones((X.iid_count, 1))])
            assert np.array_equal(
                X.sid, self.covar_sid
            ), "Expect covar sids to be the same in train and test."

            pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
            ret0 = SnpData(iid=X.iid,
                           sid=self.pheno_sid,
                           val=pheno_predicted,
                           pos=np.array([[np.nan, np.nan, np.nan]]),
                           name="linear regression Prediction"
                           )  #!!!replace 'parent_string' with 'name'

            from pysnptools.kernelreader import KernelData
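            # covariance of the predictions: residual variance (ssres / n) times the identity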
            ret1 = KernelData(iid=X.iid,
                              val=np.eye(X.iid_count) * self.ssres /
                              self.iid_count)
            return ret0, ret1
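
A minimal usage sketch for this identity-kernel predictor (in fastlmm this is the LinearRegression class; the synthetic data below and the exact fit signature are assumptions, not part of the example):

    # a minimal sketch, assuming fastlmm.inference.LinearRegression is the class above
    import numpy as np
    from pysnptools.snpreader import SnpData
    from fastlmm.inference import LinearRegression

    iid = [[str(i), str(i)] for i in range(10)]
    covar = SnpData(iid=iid, sid=["cov1"], val=np.random.randn(10, 1))
    pheno = SnpData(iid=iid, sid=["pheno"], val=np.random.randn(10, 1))
    lr = LinearRegression()
    lr.fit(X=covar, y=pheno)                # this predictor takes no kernels
    mean, covariance = lr.predict(X=covar)  # K0_whole_test/K1_whole_test stay None
    print(mean.val[:3], np.diag(covariance.val)[:3])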
Example No. 18
0
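    # each line of tmp is split into fields; KernelData's iid expects [family_id, individual_id] string pairs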
    iids = []
    for i in range(len(tmp)):
        iids.append(re.split(r' +', tmp[i]))

    Gsm = np.genfromtxt('Raw_Data/freeze2.common.rel.mat', skip_header=True)
    # print(Gsm)

    Gsm = Gsm[:, 2:207]
    # Gsm = LMM(K=Gsm)
    # Gsm.setSU_fromK()
    # Gsm.K

    # np.savetxt('../Inputs/NCSU_GSM_U.txt', Gsm.U)
    # np.savetxt('../Inputs/NCSU_GSM_S.txt', Gsm.S)
    
    my_kernel = KernelData(iid=iids, val=Gsm.tolist())
    
    # Now, performing GSM using the official GSM as kernel:
    results_df = single_snp(VARIANTS_TO_TEST,
                            PHENOTYPE_DATA,
                            K0=my_kernel,
                            leave_out_one_chrom=False,
                            save_test_statistic=True,
                            output_file_name = 'Outputs/' + OUTPUT_NAME + '_Original.txt',
                            )
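
    # A follow-up sketch (an assumption, not in the original): single_snp returns a
    # pandas DataFrame, typically sorted by PValue; peek at the strongest hits.
    print(results_df[['SNP', 'Chr', 'PValue']].head(10))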

print('Total Time (s): ' + str(time.time() - start))  # Python 3 print(); time.time() assumed, since time.clock() was removed in Python 3.8


Example No. 19
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None,
                count_A1=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it can be the name of a PLINK-formatted Bed file.
               Can be a PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string, it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K0_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it can be the name of a PLINK-formatted Bed file.
               Can be a PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string, it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            assert self.is_fitted, "Can only predict after predictor has been fitted"
            #assert K0_whole_test is not None, "K0_whole_test must be given"
            #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
            #!!!later all _kernel_fixup's should use block_size input

            K0_whole_test_b = _kernel_fixup(
                K0_whole_test,
                train_snps=self.G0_train,
                iid_if_none=iid_if_none,
                standardizer=self.mixer.snp_trained0,
                test=K0_whole_test,
                test_iid_if_none=None,
                block_size=self.block_size,
                count_A1=count_A1)
            K1_whole_test = _kernel_fixup(
                K1_whole_test,
                train_snps=self.G1_train,
                iid_if_none=K0_whole_test_b.iid0,
                standardizer=self.mixer.snp_trained1,
                test=K1_whole_test,
                test_iid_if_none=K0_whole_test_b.iid1,
                block_size=self.block_size,
                count_A1=count_A1)
            X = _pheno_fixup(X,
                             iid_if_none=K0_whole_test_b.iid1,
                             count_A1=count_A1)
            K0_whole_test_c, K1_whole_test, X = intersect_apply(
                [K0_whole_test_b, K1_whole_test, X],
                intersect_before_standardize=True,
                is_test=True)
            X = X.read().standardize(self.covar_unit_trained)
            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=self._new_snp_name(X),
                        val=np.c_[X.read().val,
                                  np.ones((X.iid_count, 1))])
            assert np.array_equal(
                X.sid, self.covar_sid
            ), "Expect covar sids to be the same in train and test."

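            # slice the whole-vs-test kernels into the train-vs-test and test-vs-test
            # blocks needed below for the posterior mean and variance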
            train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
            K0_train_test = K0_whole_test_c[train_idx0, :]
            train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
            K1_train_test = K1_whole_test[train_idx1, :]
            test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
            K0_test_test = K0_whole_test_c[test_idx0, :]
            if K0_test_test.iid0 is not K0_test_test.iid1:
                raise Exception("real assert")
            test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
            K1_test_test = K1_whole_test[test_idx1, :]

            if self.mixer.do_g:
                ###################################################
                # low rank from Rasmussen  eq 2.9 + noise term added to covar
                ###################################################
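                # With A = (1/vare) * G'G + (1/varg) * I, the low-rank (Woodbury) form gives
                # posterior mean       X*beta + (1/vare) * Gstar * Ainv * G' * (y - X*beta)
                # posterior covariance Gstar * Ainv * Gstar' + vare * I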
                Gstar = self.mixer.g_mix(K0_train_test, K1_train_test)
                varg = self.h2raw * self.sigma2
                vare = (1. - self.h2raw) * self.sigma2
                Ainv = LA.inv((1. / vare) * np.dot(self.G.T, self.G) +
                              (1. / varg) * np.eye(self.G.shape[1]))
                testAinv = np.dot(Gstar.test.val, Ainv)
                pheno_predicted = np.dot(X.val, self.beta) + (
                    1. / vare) * np.dot(np.dot(testAinv, self.G.T),
                                        self.y - np.dot(self.X, self.beta))
                pheno_predicted = pheno_predicted.reshape(-1, 1)
                covar = np.dot(
                    testAinv,
                    Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])

            else:
                lmm = LMM()
                lmm.U = self.U
                lmm.S = self.S
                lmm.G = self.G
                lmm.y = self.y
                lmm.Uy = self.Uy
                lmm.X = self.X
                lmm.UX = self.UX

                Kstar = self.mixer.k_mix(
                    K0_train_test, K1_train_test
                )  #!!!later do we need/want reads here? how about view_OK?
                lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

                Kstar_star = self.mixer.k_mix(
                    K0_test_test, K1_test_test
                )  #!!!later do we need/want reads here?how about view_OK?
                pheno_predicted, covar = lmm.predict_mean_and_variance(
                    beta=self.beta,
                    h2=self.h2raw,
                    sigma2=self.sigma2,
                    Kstar_star=Kstar_star.val)

            #pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2,scale=self.sigma2).reshape(-1,1)
            ret0 = SnpData(iid=X.iid,
                           sid=self.pheno_sid,
                           val=pheno_predicted,
                           pos=np.array([[np.nan, np.nan, np.nan]]),
                           name="lmm Prediction")

            from pysnptools.kernelreader import KernelData
            ret1 = KernelData(iid=K0_test_test.iid, val=covar)
            return ret0, ret1
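
For orientation, a minimal end-to-end sketch of how this predict is typically reached (the file names and index variables are placeholders, not from the example):

    # a minimal sketch: fit on training cases, then predict on held-out cases
    from fastlmm.inference import FastLMM
    from pysnptools.snpreader import Bed, Pheno

    snps = Bed(bed_file, count_A1=False)  # bed_file, covar_file, pheno_file, train_idx, test_idx are placeholders
    fastlmm = FastLMM()
    fastlmm.fit(K0_train=snps[train_idx, :], X=Pheno(covar_file)[train_idx, :],
                y=Pheno(pheno_file)[train_idx, :])
    mean, covariance = fastlmm.predict(K0_whole_test=snps[test_idx, :],
                                       X=Pheno(covar_file)[test_idx, :])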