Esempio n. 1
0
def generator(data, n):
    """
    Infinite generator of weighted, standardized mini-batches.

    data -- dataset object exposing X / y / weights paths and active indices
    n    -- mini-batch size

    Yields (batch_ws_annotld, batch_ws_chisq) tuples produced by get_batch.
    Each epoch emits num_active_samples // n full batches plus one final
    wraparound batch that joins the leftover tail with the head of the data.
    """
    active_ind = u.expand_ind(data.active_ind)
    num_active_samples = data.active_len
    annotld = u.read_h5(data.X)
    chisq = np.array(pd.read_csv(data.y, delim_whitespace=True)['CHISQ'])
    all_w = np.array(
        pd.read_csv(data.weights, delim_whitespace=True).iloc[:, -1])
    mean_active_w, std_active_w = u.get_mean_std_w(data)
    # NOTE: standardizes all weights with the active-set mean/std so the
    # original row indices are preserved
    stdized_w = (all_w - mean_active_w) / std_active_w
    num_batches = num_active_samples // n
    while True:
        for b in range(num_batches):
            batch_ind = active_ind[b * n:(b + 1) * n]
            yield get_batch(data, annotld, chisq, all_w, stdized_w, batch_ind)
        # final batch: whatever remains past the last full batch, padded with
        # rows wrapped around from the head so it still has n elements
        tail = active_ind[num_batches * n:]
        head = active_ind[:n - len(active_ind) + num_batches * n]
        batch_ind = tail + head
        batch_ind.sort()
        yield get_batch(data, annotld, chisq, all_w, stdized_w, batch_ind)
Esempio n. 2
0
 def num_features(self):
     """Return the number of feature columns in X, caching the result.

     A 1-D dataset counts as a single feature; otherwise the column
     count is read from the second dimension of the h5 array.
     """
     if not self._num_features:
         dataset = u.read_h5(self.X)
         self._num_features = 1 if dataset.ndim == 1 else dataset.shape[1]
     return self._num_features
Esempio n. 3
0
 def std_X(self):
     """Per-column population standard deviation of the active rows of X.

     Accumulates squared deviations from self.mean_X chunk by chunk
     (one (start, end) pair per entry of self.active_ind) so the full
     matrix is never materialized; caches the result on self._std_X.
     """
     if not self._std_X:
         # BUG FIX: original read `data.X`, but `data` is undefined in this
         # method's scope (NameError); the dataset path lives on self.
         d = u.read_h5(self.X)
         sum_sqdiff = 0
         for start, end in self.active_ind:
             sum_sqdiff += np.sum((d[start:end, :] - self.mean_X) ** 2, axis=0)
         self._std_X = np.sqrt(np.divide(sum_sqdiff, self.active_len))
     return self._std_X
Esempio n. 4
0
 def mean_X(self):
     """Per-column mean of the active rows of X.

     Sums each (start, end) chunk listed in self.active_ind and divides
     by self.active_len; caches the result on self._mean_X.
     """
     if not self._mean_X:
         # BUG FIX: original read `data.X`, but `data` is undefined in this
         # method's scope (NameError); the dataset path lives on self.
         d = u.read_h5(self.X)
         sum_active_rows = 0
         for start, end in self.active_ind:
             sum_active_rows += np.sum(d[start:end, :], axis=0)
         self._mean_X = np.divide(sum_active_rows, self.active_len)
     return self._mean_X
Esempio n. 5
0
 def weighted_meanX(self):
     """Weighted column means of active X: (\\sum w_i x_i) / (\\sum w_i).

     X is processed strip by strip (self.X_strips) to bound memory;
     the concatenated result is stored on self._weighted_meanX and
     returned. Note: recomputed on every call (no cache guard).
     """
     w = u.get_active_weights(self.weights, self.active_ind)
     # hoisted out of the loop: the h5 handle is loop-invariant, so there is
     # no need to re-open the dataset once per strip
     X = u.read_h5(self.X)
     w_mean = []
     for strip in self.X_strips:
         X_strip = u.get_strip_active_X(X, strip, self.active_ind)
         w_mean.append(np.average(X_strip, weights=w, axis=0))
     self._weighted_meanX = np.array(w_mean).flatten()
     return self._weighted_meanX
Esempio n. 6
0
 def X_scale(self):
     """Per-column L2 norm of (active X - weighted_meanX).

     Each strip of columns is centered with the matching slice of
     self.weighted_meanX, its column norms taken, and the pieces are
     concatenated into self._X_scale. Note: recomputed on every call
     (no cache guard).
     """
     # hoisted out of the loop: the h5 handle is loop-invariant, so there is
     # no need to re-open the dataset once per strip
     X = u.read_h5(self.X)
     centered_norm = []
     for strip in self.X_strips:
         X_active_strip = u.get_strip_active_X(X, strip, self.active_ind)
         X_offset_strip = self.weighted_meanX[strip[0]:strip[1]]
         norm_strip = np.linalg.norm(X_active_strip - X_offset_strip,
                                     axis=0, ord=2)
         centered_norm.append(norm_strip)
     self._X_scale = np.array(centered_norm).flatten()
     return self._X_scale
Esempio n. 7
0
 def direct_fit(self, data):
     """Fit weighted ordinary least squares on the active rows of data.

     Skips the standardizing step: the raw active X, chi-square stats,
     and weights are passed straight to sklearn's LinearRegression.
     Stores model.coef_ scaled by 1/data.N as self.coef and the fitted
     intercept as self.intercept; returns (coef, intercept).
     """
     from sklearn.linear_model import LinearRegression
     X = u.read_h5(data.X)
     if X.ndim == 1:
         # sklearn expects a 2-D design matrix
         X = X[:].reshape(-1, 1)
     active_X = u.get_active(X, data.active_ind)
     active_y = u.read_chisq_from_ss(data.y, data.active_ind)
     sample_w = u.get_active_weights(data.weights, data.active_ind)
     model = LinearRegression()
     model.fit(active_X, active_y, sample_weight=sample_w)
     self.coef = model.coef_ / data.N
     self.intercept = model.intercept_
     return self.coef, self.intercept
Esempio n. 8
0
 def direct_fit(self, data):
     """Fit cross-validated Lasso on the active rows of data.

     Fits sklearn's LassoCV directly, without manually weighting or
     scaling the data. Stores model.coef_ scaled by 1/data.N as
     self.coef, the intercept as self.intercept, and the selected
     regularization strength as self.alpha; returns all three.
     """
     from sklearn.linear_model import LassoCV
     X = u.read_h5(data.X)
     if X.ndim == 1:
         # sklearn expects a 2-D design matrix
         X = X[:].reshape(-1, 1)
     active_X = u.get_active(X, data.active_ind)
     active_y = u.read_chisq_from_ss(data.y, data.active_ind)
     sample_w = u.get_active_weights(data.weights, data.active_ind)
     model = LassoCV()
     model.fit(active_X, active_y, sample_weight=sample_w)
     self.coef = model.coef_ / data.N
     self.intercept = model.intercept_
     self.alpha = model.alpha_
     return self.coef, self.intercept, self.alpha
Esempio n. 9
0
 def evaluate(self, data):
     """Compute the weighted sum of squared prediction error on data.

     Uses self.coef and self.intercept to predict chi-square stats via
     pred = N * annot_ld . coef + intercept, then returns
     sum_i w_i * (pred_i - chisq_i)^2 over the active rows, also stored
     as self.cv_loss.
     """
     d = u.read_h5(data.X)
     # validation design matrix, chisq stats, and weights (active rows only)
     val_annotld = u.get_active(d, data.active_ind)
     val_chisq = u.read_chisq_from_ss(data.y, data.active_ind)
     val_weights = u.get_active_weights(data.weights, data.active_ind)
     if val_annotld.ndim == 1:
         # promote a 1-D annotation vector to an (n, 1) column matrix
         val_annotld = val_annotld[np.newaxis].T
     # FIX: the original if/else branches computed the identical expression;
     # only the reshape above actually depends on ndim
     pred_chisq = np.multiply(data.N,
                              val_annotld.dot(self.coef)) + self.intercept
     weighted_sumsqerror = val_weights.dot((pred_chisq - val_chisq) ** 2)
     self.cv_loss = weighted_sumsqerror
     return weighted_sumsqerror