Example #1
 def lmm_from_cache_file(self):
     logging.info("Loading precomputation from {0}".format(self.cache_file))
     lmm = LMM()
     with np.load(self.cache_file) as data:
         lmm.U = data['arr_0']
         lmm.S = data['arr_1']
     return lmm
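The loader above expects the eigenvectors U and eigenvalues S under the default np.savez keys 'arr_0' and 'arr_1' (this is how fill_in_cache_file in Examples #3 and #8 writes them). A minimal NumPy-only sketch of that round trip, with random stand-ins for the cached U and S:

import os
import tempfile
import numpy as np

# stand-ins for the eigenvectors and eigenvalues the LMM caches
U = np.linalg.qr(np.random.randn(5, 5))[0]
S = np.sort(np.random.rand(5))[::-1]

cache_file = os.path.join(tempfile.mkdtemp(), "cache_file.npz")
np.savez(cache_file, U, S)           # positional args -> keys 'arr_0', 'arr_1'

with np.load(cache_file) as data:
    U2, S2 = data['arr_0'], data['arr_1']

assert np.allclose(U, U2) and np.allclose(S, S2)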
Example #3
    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().standardize().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory,
                                           "cache_file.npz")
            if os.path.exists(
                    self.cache_file
            ):  # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(
                self.cache_file))
            pstutil.create_directory_if_necessary(self.cache_file)
            np.savez(
                self.cache_file, lmm.U, lmm.S
            )  #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so we will need to normalize G so that its K's diagonal would sum to iid_count

            # As per the paper, we optimized delta with REML=True, but
            # we will later optimize beta and find log likelihood with ML (REML=False)
            result = lmm.find_log_delta(
                REML=True,
                sid_count=self.G0.sid_count,
                min_log_delta=self.min_log_delta,
                max_log_delta=self.max_log_delta
            )  #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(
            self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))
Example #4
    def train_null(self):
        """
        train model under null hypothesis
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.train_snps, self.train_pcs, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")
        if self.delta is None:
            result = self.lmm.findH2(REML=self.REML, minH2=0.00001 )
            self.delta = 1.0/result['h2']-1.0
            
        # UX = lmm_null.U.dot(test_snps)
        self.res_null = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
        self.ll_null = -self.res_null["nLL"]
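The conversion delta = 1/h2 - 1 used above follows from h2 = sg2 / (sg2 + se2) and delta = se2 / sg2. A quick check with made-up variance components:

# made-up genetic (sg2) and residual (se2) variances, for illustration only
sg2, se2 = 0.6, 0.4
h2 = sg2 / (sg2 + se2)        # 0.6
delta = 1.0 / h2 - 1.0        # 0.666..., equal to se2 / sg2
assert abs(delta - se2 / sg2) < 1e-12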
Example #5
    def train_null(self):
        """
        find delta on all snps
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.G0, self.G1, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")

        #result = self.lmm.find_log_delta(self, self.N)
        #self.delta = np.exp(result['log_delta'])

        if self.delta is None:
            result = self.lmm.find_log_delta_chris()
            self.delta = result['delta']
Example #6
 def RealVar(self, y, X):
     lmmg = LMM()
     m = np.shape(X)[1]
     n = len(y)
     lmmg.setG(X / math.sqrt(m))
     lmmg.sety(y)
     lmmg.setX(np.ones([n, 1]))
     try:
         dct = lmmg.findH2()
     except Exception:
         dct = {}
         dct['h2'] = .5
         mn = sum(y) / float(n)
         dct['sigma2'] = sum([(i - mn)**2 for i in y]) / float(n)
     h2 = dct['h2']
     s2 = dct['sigma2']
     sg2 = h2 * s2
     se2 = s2 - sg2
     return [se2, sg2]
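RealVar splits the total variance sigma2 returned by findH2 into a genetic part h2 * sigma2 and a residual part (1 - h2) * sigma2; the fallback path fixes h2 at 0.5 and uses the biased sample variance of y. A NumPy-only sketch with made-up numbers (h2 and sigma2 here are hypothetical stand-ins for findH2's output):

import numpy as np

h2, sigma2 = 0.3, 2.0                 # hypothetical heritability and total variance
sg2 = h2 * sigma2                     # genetic variance, 0.6
se2 = sigma2 - sg2                    # residual variance, 1.4

# fallback path: biased sample variance of y, same as the list comprehension above
y = np.array([1.0, 2.0, 3.0, 4.0])
sigma2_fallback = np.mean((y - y.mean()) ** 2)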
Example #7
    def RealVar(self, y, X):
        lmmg = LMM()
        m = np.shape(X)[1]
        n = len(y)
        lmmg.setG(X / math.sqrt(m))
        lmmg.sety(y)
        lmmg.setX(np.ones([n, 1]))
        try:
            dct = lmmg.findH2()
        except Exception:
            dct = {}
            dct['h2'] = .5
            mn = sum(y) / float(n)
            dct['sigma2'] = sum([(i - mn)**2 for i in y]) / float(n)
        h2 = dct['h2']
        s2 = dct['sigma2']
        sg2 = h2 * s2
        se2 = s2 - sg2
        return [se2, sg2]
Example #8
    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file): # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            #log delta is used here. Might be better to use findH2, but if so we will need to normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta  ) #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))
Example #11
class GwasPrototype(object):
    """
    class to perform genome-wide scan
    """
    

    def __init__(self, train_snps, test_snps, phen, delta=None, cov=None, REML=False, train_pcs=None, mixing=0.0):
        """
        set up GWAS object
        """

        self.REML = REML
        self.train_snps = train_snps
        self.test_snps = test_snps
        self.phen = phen
        if delta is None:
            self.delta=None
        else:
            self.delta = delta * train_snps.shape[1]
        self.n_test = test_snps.shape[1]
        self.n_ind = len(self.phen)

        self.train_pcs = train_pcs
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1] 
       
        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = None
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))

    
    def precompute_UX(self, X): 
        ''' 
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X       : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''

        logging.info("precomputing UX")

        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k<self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)

        logging.info("done.")


    def train_null(self):
        """
        train model under null hypothesis
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.train_snps, self.train_pcs, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")
        if self.delta is None:
            result = self.lmm.findH2(REML=self.REML, minH2=0.00001 )
            self.delta = 1.0/result['h2']-1.0
            
        # UX = lmm_null.U.dot(test_snps)
        self.res_null = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
        self.ll_null = -self.res_null["nLL"]


    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """

        si = idx + self.n_cov

        self.lmm.X = np.hstack((self.X[:,0:self.n_cov], self.X[:,si:si+1]))
        self.lmm.UX = np.hstack((self.UX[:,0:self.n_cov], self.UX[:,si:si+1]))
        if (self.k<self.N):
            self.lmm.UUX = np.hstack((self.UUX[:,0:self.n_cov], self.UUX[:,si:si+1]))
    

    def train_alt(self):
        """
        train alternative model
        """ 
   
        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in range(self.n_test):

            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)

            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.info("processing snp {0}".format(idx))


    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """

        # from C++ (?)
        #real df = rank_beta[ snp ] - ((real)1.0 * rank_beta_0[ snp ]) ;
        #pvals[ snp ] = PvalFromLikelihoodRatioTest( LL[ snp ] - LL_0[ snp ], ((real)0.5 * df) );

        degrees_of_freedom = 1

        assert len(self.res_alt) == self.n_test

        for idx in range(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)

        
        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]
        


    def plot_result(self):
        """
        plot results
        """
        
        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in range(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()
 

    def run_gwas(self):
        """
        invoke all steps in the right order
        """

        self.train_null()
        self.train_alt()
        self.compute_p_values()
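compute_p_values above is a per-SNP likelihood-ratio test with one degree of freedom: twice the log-likelihood difference is compared to a chi-squared distribution. A standalone illustration with made-up log-likelihood values:

import numpy as np
from scipy import stats

ll_null = -1234.5                                  # hypothetical null log-likelihood
ll_alt = np.array([-1234.4, -1230.1, -1225.8])     # hypothetical per-SNP alternatives

lrt = 2.0 * (ll_alt - ll_null)                     # likelihood-ratio statistic
p_values = stats.chi2.sf(lrt, 1)                   # survival function, 1 degree of freedom
sorted_p = p_values[np.argsort(p_values)]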
Example #12
class WindowingGwas(object):
    """
    class to perform genome-wide scan with single-snp windowing
    """
    def __init__(self,
                 G0,
                 phen,
                 delta=None,
                 cov=None,
                 REML=False,
                 G1=None,
                 mixing=0.0):
        """
        set up GWAS object
        """

        self.REML = REML
        self.G0 = G0
        self.test_snps = G0
        self.phen = phen
        if delta is None:
            self.delta = None
        else:
            self.delta = delta * G0.shape[1]
        self.n_test = self.test_snps.shape[1]
        self.n_ind = len(self.phen)

        self.G1 = G1
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1]

        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = np.zeros(self.n_test)
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))
        self.N = self.X.shape[0]

    def precompute_UX(self, X):
        ''' 
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X       : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''

        logging.info("precomputing UX")

        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k < self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)

        logging.info("done.")

    def train_null(self):
        """
        find delta on all snps
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.G0, self.G1, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")

        #result = self.lmm.find_log_delta(self, self.N)
        #self.delta = np.exp(result['log_delta'])

        if self.delta is None:
            result = self.lmm.find_log_delta_chris()
            self.delta = result['delta']

    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """

        si = idx + self.n_cov

        self.lmm.X = np.hstack((self.X[:, 0:self.n_cov], self.X[:, si:si + 1]))
        self.lmm.UX = np.hstack((self.UX[:, 0:self.n_cov], self.UX[:,
                                                                   si:si + 1]))
        if (self.k < self.N):
            self.lmm.UUX = np.hstack(
                (self.UUX[:, 0:self.n_cov], self.UUX[:, si:si + 1]))

    def set_null_UX(self):
        """
        reset UX to covariates only
        """
        self.lmm.X = self.X[:, 0:self.n_cov]
        self.lmm.UX = self.UX[:, 0:self.n_cov]
        if (self.k < self.N):
            self.lmm.UUX = self.UUX[:, 0:self.n_cov]

    def train_windowing(self):
        """
        train null and alternative model
        """

        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in range(self.n_test):

            #TODO: this can be generalized to bigger window
            self.lmm.set_exclude_idx([idx])

            # null model
            self.set_null_UX()
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
            self.ll_null[idx] = -res["nLL"]

            # alternative model
            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)

            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.warning("processing snp {0}".format(idx))

    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """

        degrees_of_freedom = 1

        assert len(self.res_alt) == self.n_test

        for idx in range(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null[idx]
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic,
                                               degrees_of_freedom)

        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]

        return self.p_values

    def plot_result(self):
        """
        plot results
        """

        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in range(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()

    def run_gwas(self):
        """
        invoke all steps in the right order
        """

        self.train_null()
        self.train_windowing()
        return self.compute_p_values()
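precompute_UX above rotates the covariate/SNP matrix into the eigenbasis of the kernel (UX = U.T.dot(X)) and, when the kernel is low rank (k < N), also keeps the component of X that lies outside the span of U (UUX = X - U.dot(UX)). A NumPy-only check of that decomposition, with random stand-ins for U and X:

import numpy as np

rng = np.random.default_rng(0)
N, k, D = 20, 5, 3
U = np.linalg.qr(rng.standard_normal((N, k)))[0]   # orthonormal columns, stand-in for eigenvectors
X = rng.standard_normal((N, D))                    # stand-in for covariates + test snps

UX = U.T.dot(X)            # coordinates of X in the eigenbasis
UUX = X - U.dot(UX)        # component of X orthogonal to the span of U

assert np.allclose(U.T.dot(UUX), 0.0)              # the leftover part is orthogonal to U
assert np.allclose(U.dot(UX) + UUX, X)             # the two pieces reconstruct X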
Example #13
class GwasPrototype(object):
    """
    class to perform genome-wide scan
    """
    

    def __init__(self, train_snps, test_snps, phen, delta=None, cov=None, REML=False, train_pcs=None, mixing=0.0):
        """
        set up GWAS object
        """

        self.REML = REML
        self.train_snps = train_snps
        self.test_snps = test_snps
        self.phen = phen
        if delta is None:
            self.delta=None
        else:
            self.delta = delta * train_snps.shape[1]
        self.n_test = test_snps.shape[1]
        self.n_ind = len(self.phen)

        self.train_pcs = train_pcs
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1] 
       
        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = None
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))

    
    def precompute_UX(self, X): 
        ''' 
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X       : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''

        logging.info("precomputing UX")

        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k<self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)

        logging.info("done.")


    def train_null(self):
        """
        train model under null hypothesis
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.train_snps, self.train_pcs, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")
        if self.delta is None:
            result = self.lmm.findH2(REML=self.REML, minH2=0.00001 )
            self.delta = 1.0/result['h2']-1.0
            
        # UX = lmm_null.U.dot(test_snps)
        self.res_null = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
        self.ll_null = -self.res_null["nLL"]


    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """

        si = idx + self.n_cov

        self.lmm.X = np.hstack((self.X[:,0:self.n_cov], self.X[:,si:si+1]))
        self.lmm.UX = np.hstack((self.UX[:,0:self.n_cov], self.UX[:,si:si+1]))
        if (self.k<self.N):
            self.lmm.UUX = np.hstack((self.UUX[:,0:self.n_cov], self.UUX[:,si:si+1]))
    

    def train_alt(self):
        """
        train alternative model
        """ 
   
        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in range(self.n_test):

            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)

            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.info("processing snp {0}".format(idx))


    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """

        # from C++ (?)
        #real df = rank_beta[ snp ] - ((real)1.0 * rank_beta_0[ snp ]) ;
        #pvals[ snp ] = PvalFromLikelihoodRatioTest( LL[ snp ] - LL_0[ snp ], ((real)0.5 * df) );

        degrees_of_freedom = 1

        assert len(self.res_alt) == self.n_test

        for idx in range(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)

        
        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]
        


    def plot_result(self):
        """
        plot results
        """
        
        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in range(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()
 

    def run_gwas(self):
        """
        invoke all steps in the right order
        """

        self.train_null()
        self.train_alt()
        self.compute_p_values()

    def run_select(self, G0, G_bg, y, cov=None):
        """set up two kernel feature selection
    
        Parameters
        ----------
        G0 : numpy array of shape (num_ind, num_snps)
            Data matrix from which foreground snps will be selected

        G_bg : numpy array of shape (num_ind, num_snps)
            Data matrix containing background snps on which the model will be conditioned

        y : numpy vector of shape (num_ind, )
            Vector of phenotypes

        cov : numpy array of shape (num_ind, num_covariates) or None
            Covariates to be used as fixed effects

        Returns
        -------
        best_k, feat_idx, best_mix, best_delta: tuple(int, np.array(int), float, float)
            best_k is the best number of SNPs selected,
            feat_idx is a np.array of integers denoting the indices of these snps,
            best_mix is the best mixing coefficient between foreground and background kernel,
            best_delta is the best regularization coefficient
        """

        num_ind = len(y)

        if cov is None:
            cov = np.ones((num_ind, 1))
        else:
            logging.info("normalizing covariates")
            cov = cov.copy()
            cov = 1. / np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
        cov.flags.writeable = False

        # normalize to diag(K) = N
        norm_factor = 1. / np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

        # we copy in case G and G_bg are pointing to the same object
        G_bg = norm_factor * G_bg

        K_bg_full = G_bg.dot(G_bg.T)
        K_bg_full.flags.writeable = False

        # some asserts
        np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
        if self.debug:
            norm_factor_check = 1. / np.sqrt(G_bg.shape[1])
            np.testing.assert_array_almost_equal(norm_factor,
                                                 norm_factor_check,
                                                 decimal=1)

        for kfold_idx, (train_idx, test_idx) in enumerate(
                KFold(num_ind,
                      n_folds=self.n_folds,
                      random_state=self.random_state,
                      shuffle=True)):

            t0 = time.time()
            logging.info("running fold: %i" % kfold_idx)

            y_train = y.take(train_idx, axis=0)
            y_test = y.take(test_idx, axis=0)
            G0_train = G0.take(train_idx, axis=0)
            G0_test = G0.take(test_idx, axis=0)

            G_bg_train = G_bg.take(train_idx, axis=0)
            G_bg_test = G_bg.take(test_idx, axis=0)

            cov_train = cov.take(train_idx, axis=0)
            cov_test = cov.take(test_idx, axis=0)

            # write protect data
            y_train.flags.writeable = False
            y_test.flags.writeable = False
            G0_train.flags.writeable = False
            G0_test.flags.writeable = False
            G_bg_train.flags.writeable = False
            G_bg_test.flags.writeable = False
            cov_train.flags.writeable = False
            cov_test.flags.writeable = False

            # precompute background kernel
            K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx,
                                                                axis=1)
            K_bg_train.flags.writeable = False

            if self.measure != "mse":
                K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx,
                                                                  axis=1)
                K_bg_test.flags.writeable = False

            # rank features
            if self.order_by_lmm:
                logging.info("using linear mixed model to rank features")
                t0 = time.time()
                gwas = FastGwas(G_bg_train,
                                G0_train,
                                y_train,
                                delta=None,
                                train_pcs=None,
                                mixing=0.0,
                                cov=cov_train)
                gwas.run_gwas()
                _pval = gwas.p_values
                logging.info("time taken: %s" % (str(time.time() - t0)))
            else:
                logging.info("using linear regression to rank features")
                _F, _pval = lin_reg.f_regression_block(
                    lin_reg.f_regression_cov_alt,
                    G0_train,
                    y_train,
                    blocksize=10000,
                    C=cov_train)

            feat_idx = np.argsort(_pval)

            for k_idx, max_k in enumerate(self.grid_k):

                feat_idx_subset = feat_idx[0:max_k]
                G_fs_train = G0_train.take(feat_idx_subset, axis=1)
                G_fs_test = G0_test.take(feat_idx_subset, axis=1)

                # normalize to sum(diag)=N
                norm_factor = 1. / np.sqrt(
                    (G_fs_train**2).sum() / float(G_fs_train.shape[0]))

                G_fs_train *= norm_factor
                G_fs_test *= norm_factor

                G_fs_train.flags.writeable = False
                G_fs_test.flags.writeable = False

                # asserts
                if self.debug:
                    norm_factor_check = 1.0 / np.sqrt(max_k)
                    np.testing.assert_array_almost_equal(norm_factor,
                                                         norm_factor_check,
                                                         decimal=1)
                    np.testing.assert_almost_equal(
                        sum(np.diag(G_fs_train.dot(G_fs_train.T))),
                        G_fs_train.shape[0])

                logging.info("k: %i" % (max_k))

                # use LMM
                from fastlmm.inference.lmm_cov import LMM as fastLMM

                if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  G=G_bg_train)
                else:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  K=K_bg_train)

                W = G_fs_train.copy()
                UGup, UUGup = lmm.rotate(W)

                i_up = np.zeros((G_fs_train.shape[1]), dtype=bool)
                i_G1 = np.ones((G_fs_train.shape[1]), dtype=bool)
                t0 = time.time()
                res = lmm.findH2_2K(nGridH2=10,
                                    minH2=0.0,
                                    maxH2=0.99999,
                                    i_up=i_up,
                                    i_G1=i_G1,
                                    UW=UGup,
                                    UUW=UUGup)
                logging.info("time taken for k=%i: %s" %
                             (max_k, str(time.time() - t0)))

                # recover a2 from alternate parameterization
                a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
                h2 = res["h2"] + res["h2_1"]
                delta = (1 - h2) / h2
                #res_cov = res

                # do final prediction using lmm.py
                from fastlmm.inference import LMM
                lmm = LMM(forcefullrank=False)
                lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
                lmm.setX(cov_train)
                lmm.sety(y_train)

                # we take an additional step to estimate betas on covariates (not given from new model)
                res = lmm.nLLeval(delta=delta, REML=True)

                # predict on test set
                lmm.setTestData(Xstar=cov_test,
                                G0star=G_bg_test,
                                G1star=G_fs_test)
                out = lmm.predictMean(beta=res["beta"], delta=delta)

                mse = mean_squared_error(y_test, out)
                logging.info("mse: %f" % (mse))

                self.mse[kfold_idx, k_idx] = mse

                self.mixes[kfold_idx, k_idx] = a2
                self.deltas[kfold_idx, k_idx] = delta

                if self.measure != "mse":
                    K_test_test = a2 * G_fs_test.dot(
                        G_fs_test.T) + (1.0 - a2) * K_bg_test
                    ll = lmm.nLLeval_test(y_test,
                                          res["beta"],
                                          sigma2=res["sigma2"],
                                          delta=delta,
                                          Kstar_star=K_test_test,
                                          robust=True)

                    if self.debug:
                        ll2 = lmm.nLLeval_test(y_test,
                                               res["beta"],
                                               sigma2=res["sigma2"],
                                               delta=delta,
                                               Kstar_star=None,
                                               robust=True)
                        np.testing.assert_almost_equal(ll, ll2, decimal=4)

                    logging.info("ll: %f" % (ll))
                    self.ll[kfold_idx, k_idx] = ll

            logging.info("time taken for fold: %s" % str(time.time() - t0))

        best_k, best_mix, best_delta = self.select_best_k()

        logging.info("best_k: %i, best_mix: %f, best_delta: %f" %
                     (best_k, best_mix, best_delta))

        # final scan
        if self.order_by_lmm:
            logging.info("final scan using LMM")
            gwas = FastGwas(G_bg,
                            G0,
                            y,
                            delta=None,
                            train_pcs=None,
                            mixing=0.0,
                            cov=cov)
            gwas.run_gwas()
            _pval = gwas.p_values
            feat_idx = np.argsort(_pval)[0:best_k]
        else:
            logging.info("final scan using LR")
            _F, _pval = lin_reg.f_regression_block(
                lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)

        logging.info("number of snps selected: %i" % (best_k))

        return best_k, feat_idx, best_mix, best_delta
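run_select maps the two-kernel heritabilities returned by findH2_2K back to the mixing weight a2 and the noise-to-signal ratio delta, as shown near the middle of the loop above. The arithmetic with made-up h2 values:

# hypothetical two-kernel heritabilities, for illustration only
res = {"h2": 0.25, "h2_1": 0.15}

a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])   # mixing weight, 0.375
h2 = res["h2"] + res["h2_1"]                        # total heritability, 0.4
delta = (1 - h2) / h2                               # noise-to-signal ratio, 1.5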
Example #15
class WindowingGwas(object):
    """
    class to perform genome-wide scan with single-snp windowing
    """
    

    def __init__(self, G0, phen, delta=None, cov=None, REML=False, G1=None, mixing=0.0):
        """
        set up GWAS object
        """

        self.REML = REML
        self.G0 = G0
        self.test_snps = G0
        self.phen = phen
        if delta is None:
            self.delta=None
        else:
            self.delta = delta * G0.shape[1]
        self.n_test = self.test_snps.shape[1]
        self.n_ind = len(self.phen)

        self.G1 = G1
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1] 
       
        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = np.zeros(self.n_test)
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))
        self.N = self.X.shape[0]
    
    def precompute_UX(self, X): 
        ''' 
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X       : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''

        logging.info("precomputing UX")

        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k<self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)

        logging.info("done.")


    def train_null(self):
        """
        find delta on all snps
        """

        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.G0, self.G1, a2=self.mixing)

        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")

        #result = self.lmm.find_log_delta(self, self.N)
        #self.delta = np.exp(result['log_delta'])

        if self.delta is None:
            result = self.lmm.find_log_delta_chris()
            self.delta = result['delta']



    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """

        si = idx + self.n_cov

        self.lmm.X = np.hstack((self.X[:,0:self.n_cov], self.X[:,si:si+1]))
        self.lmm.UX = np.hstack((self.UX[:,0:self.n_cov], self.UX[:,si:si+1]))
        if (self.k<self.N):
            self.lmm.UUX = np.hstack((self.UUX[:,0:self.n_cov], self.UUX[:,si:si+1]))
    

    def set_null_UX(self):
        """
        reset UX to covariates only
        """
        self.lmm.X = self.X[:,0:self.n_cov]
        self.lmm.UX = self.UX[:,0:self.n_cov]
        if (self.k<self.N):
            self.lmm.UUX = self.UUX[:,0:self.n_cov]
    

    def train_windowing(self):
        """
        train null and alternative model
        """ 
   
        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in range(self.n_test):

            #TODO: this can be generalized to bigger window
            self.lmm.set_exclude_idx([idx])

            # null model
            self.set_null_UX()
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
            self.ll_null[idx] = -res["nLL"]

            # alternative model
            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)

            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.warning("processing snp {0}".format(idx))


    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """

        degrees_of_freedom = 1

        assert len(self.res_alt) == self.n_test

        for idx in range(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null[idx]
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)

        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]
        
        return self.p_values


    def plot_result(self):
        """
        plot results
        """
        
        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in range(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()
 

    def run_gwas(self):
        """
        invoke all steps in the right order
        """

        self.train_null()
        self.train_windowing()
        return self.compute_p_values()

    def run_select(self, G0, G_bg, y, cov=None):
        """set up two kernel feature selection
    
        Parameters
        ----------
        G0 : numpy array of shape (num_ind, num_snps)
            Data matrix from which foreground snps will be selected

        G_bg : numpy array of shape (num_ind, num_snps)
            Data matrix containing background snps on which the model will be conditioned

        y : numpy vector of shape (num_ind, )
            Vector of phenotypes

        cov : numpy array of shape (num_ind, num_covariates) or None
            Covariates to be used as fixed effects

        Returns
        -------
        best_k, feat_idx, best_mix, best_delta: tuple(int, np.array(int), float, float)
            best_k is the best number of SNPs selected,
            feat_idx is a np.array of integers denoting the indices of these snps,
            best_mix is the best mixing coefficient between foreground and background kernel,
            best_delta is the best regularization coefficient
        """

        num_ind = len(y)

        if cov is None:
            cov = np.ones((num_ind,1))
        else:
            logging.info("normalizing covariates")
            cov = cov.copy()
            cov = 1./np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
        cov.flags.writeable = False
        
        # normalize to diag(K) = N
        norm_factor = 1./np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

        # we copy in case G and G_bg are pointing to the same object
        G_bg = norm_factor * G_bg
       
        K_bg_full = G_bg.dot(G_bg.T)
        K_bg_full.flags.writeable = False
        
        # some asserts
        np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
        if self.debug:
            norm_factor_check = 1./np.sqrt(G_bg.shape[1])
            np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
            

        for kfold_idx, (train_idx, test_idx) in enumerate(KFold(num_ind, n_folds=self.n_folds, random_state=self.random_state, shuffle=True)):

            t0 = time.time()
            logging.info("running fold: %i" % kfold_idx)

            y_train = y.take(train_idx, axis=0)
            y_test = y.take(test_idx, axis=0)
            G0_train = G0.take(train_idx, axis=0)
            G0_test = G0.take(test_idx, axis=0)

            G_bg_train = G_bg.take(train_idx, axis=0)
            G_bg_test = G_bg.take(test_idx, axis=0)

            cov_train = cov.take(train_idx, axis=0)
            cov_test = cov.take(test_idx, axis=0)

            # write protect data
            y_train.flags.writeable = False
            y_test.flags.writeable = False
            G0_train.flags.writeable = False
            G0_test.flags.writeable = False
            G_bg_train.flags.writeable = False
            G_bg_test.flags.writeable = False
            cov_train.flags.writeable = False
            cov_test.flags.writeable = False

            # precompute background kernel
            K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx, axis=1) 
            K_bg_train.flags.writeable = False

            if self.measure != "mse":
                K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx, axis=1)
                K_bg_test.flags.writeable = False

            # rank features
            if self.order_by_lmm:
                logging.info("using linear mixed model to rank features")
                t0 = time.time()
                gwas = FastGwas(G_bg_train, G0_train, y_train, delta=None, train_pcs=None, mixing=0.0, cov=cov_train)
                gwas.run_gwas()
                _pval = gwas.p_values
                logging.info("time taken: %s" % (str(time.time()-t0)))
            else:
                logging.info("using linear regression to rank features")
                _F,_pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0_train, y_train, blocksize=10000, C=cov_train)

            feat_idx = np.argsort(_pval)
            
            for k_idx, max_k in enumerate(self.grid_k):

                feat_idx_subset = feat_idx[0:max_k]
                G_fs_train = G0_train.take(feat_idx_subset, axis=1)
                G_fs_test = G0_test.take(feat_idx_subset, axis=1)

                # normalize to sum(diag)=N
                norm_factor = 1./np.sqrt((G_fs_train**2).sum() / float(G_fs_train.shape[0]))

                G_fs_train *= norm_factor
                G_fs_test *= norm_factor
                                
                G_fs_train.flags.writeable = False
                G_fs_test.flags.writeable = False

                # asserts
                if self.debug:
                    norm_factor_check = 1.0 / np.sqrt(max_k)
                    np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
                    np.testing.assert_almost_equal(sum(np.diag(G_fs_train.dot(G_fs_train.T))), G_fs_train.shape[0])

                logging.info("k: %i" % (max_k))

                # use LMM
                from fastlmm.inference.lmm_cov import LMM as fastLMM

                if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                    lmm = fastLMM(X=cov_train, Y=y_train[:,np.newaxis], G=G_bg_train)
                else:
                    lmm = fastLMM(X=cov_train, Y=y_train[:,np.newaxis], K=K_bg_train)

                W = G_fs_train.copy()
                UGup,UUGup = lmm.rotate(W)
                
                i_up = np.zeros((G_fs_train.shape[1]), dtype=bool)
                i_G1 = np.ones((G_fs_train.shape[1]), dtype=bool)
                t0 = time.time()
                res = lmm.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup)
                logging.info("time taken for k=%i: %s" % (max_k, str(time.time()-t0)))
                
                # recover a2 from alternate parameterization
                a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
                h2 = res["h2"] + res["h2_1"]
                delta = (1-h2) / h2
                #res_cov = res


                # do final prediction using lmm.py
                from fastlmm.inference import LMM
                lmm = LMM(forcefullrank=False)
                lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
                lmm.setX(cov_train)
                lmm.sety(y_train)

                # we take an additional step to estimate betas on covariates (not given from new model)
                res = lmm.nLLeval(delta=delta, REML=True)
                
                # predict on test set
                lmm.setTestData(Xstar=cov_test, G0star=G_bg_test, G1star=G_fs_test)
                out = lmm.predictMean(beta=res["beta"], delta=delta)

                mse = mean_squared_error(y_test, out)
                logging.info("mse: %f" % (mse))

                self.mse[kfold_idx, k_idx] = mse

                self.mixes[kfold_idx, k_idx] = a2
                self.deltas[kfold_idx, k_idx] = delta

                if self.measure != "mse":
                    K_test_test = a2 * G_fs_test.dot(G_fs_test.T) + (1.0-a2) * K_bg_test 
                    ll = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test, robust=True)

                    if self.debug:
                        ll2 = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=None, robust=True)
                        np.testing.assert_almost_equal(ll, ll2, decimal=4)

                    logging.info("ll: %f" % (ll))
                    self.ll[kfold_idx, k_idx]  = ll
                    

            logging.info("time taken for fold: %s" % str(time.time()-t0))
        

        best_k, best_mix, best_delta = self.select_best_k()

        logging.info("best_k: %i, best_mix: %f, best_delta: %f" % (best_k, best_mix, best_delta))

        # final scan 
        if self.order_by_lmm:
            logging.info("final scan using LMM")
            gwas = FastGwas(G_bg, G0, y, delta=None, train_pcs=None, mixing=0.0, cov=cov)
            gwas.run_gwas()
            _pval = gwas.p_values
            feat_idx = np.argsort(_pval)[0:best_k]
        else:
            logging.info("final scan using LR")
            _F,_pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)
        
        logging.info("number of snps selected: %i" % (best_k))

        return best_k, feat_idx, best_mix, best_delta
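run_select normalizes each SNP matrix so that the implied kernel K = G.dot(G.T) has trace N, i.e. diag(K) sums to the number of individuals, which is what the asserts above verify. A NumPy check of that property on a random matrix:

import numpy as np

rng = np.random.default_rng(0)
N, M = 50, 200
G = rng.standard_normal((N, M))

norm_factor = 1.0 / np.sqrt((G ** 2).sum() / float(N))
G = norm_factor * G

K = G.dot(G.T)
assert np.isclose(np.trace(K), N)    # trace(K) == sum of diag(K) == N after scaling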