def generator(data, n):
    """
    Generates mini-batches of data.
    n = mini-batch size
    """
    active_ind = u.expand_ind(data.active_ind)
    num_active_samples = data.active_len
    annotld = u.read_h5(data.X)
    chisq = np.array(pd.read_csv(data.y, delim_whitespace=True)['CHISQ'])
    all_w = np.array(
        pd.read_csv(data.weights, delim_whitespace=True).iloc[:, -1])
    mean_active_w, std_active_w = u.get_mean_std_w(data)
    # ideally we should standardize according to active_w without disrupting the indices
    stdized_w = (all_w - mean_active_w) / std_active_w
    num_batches = num_active_samples // n
    while True:
        i = 1
        while i <= num_batches:
            batch_ind = active_ind[n * (i - 1):n * i]
            batch_ws_annotld, batch_ws_chisq = get_batch(
                data, annotld, chisq, all_w, stdized_w, batch_ind)
            i += 1
            yield batch_ws_annotld, batch_ws_chisq
        # the last batch concatenates what remains with the head of the data
        batch_ind = active_ind[n * (i - 1):] + \
            active_ind[:n - len(active_ind) + n * (i - 1)]
        batch_ind.sort()
        batch_ws_annotld, batch_ws_chisq = get_batch(
            data, annotld, chisq, all_w, stdized_w, batch_ind)
        yield batch_ws_annotld, batch_ws_chisq
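
# Illustrative sketch (assumption: _demo_wraparound_batching is not part of the
# original pipeline). It only mirrors the index arithmetic used by generator()
# above: full batches of size n are cut from the expanded active indices, and
# the final batch wraps around to the head of the list so every batch has
# exactly n rows.
def _demo_wraparound_batching(active_ind, n):
    """Yield lists of row indices following generator()'s batching scheme."""
    num_batches = len(active_ind) // n
    i = 1
    while i <= num_batches:
        yield active_ind[n * (i - 1):n * i]
        i += 1
    # last batch: the leftover tail plus enough rows from the head to reach size n
    last = active_ind[n * (i - 1):] + active_ind[:n - len(active_ind) + n * (i - 1)]
    last.sort()
    yield last

# Example: list(_demo_wraparound_batching(list(range(10)), 4))
# -> [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 8, 9]]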
def num_features(self):
    if not self._num_features:
        d = u.read_h5(self.X)
        if d.ndim == 1:
            self._num_features = 1
        else:
            self._num_features = d.shape[1]
    return self._num_features
def std_X(self):
    if not self._std_X:
        d = u.read_h5(self.X)
        sum_sqdiff = 0
        for i in self.active_ind:
            start, end = i
            sum_sqdiff_chunk = np.sum((d[start:end, :] - self.mean_X)**2, axis=0)
            sum_sqdiff += sum_sqdiff_chunk
        self._std_X = np.sqrt(np.divide(sum_sqdiff, self.active_len))
    return self._std_X
def mean_X(self):
    if not self._mean_X:
        d = u.read_h5(self.X)
        sum_active_rows = 0
        for i in self.active_ind:
            start, end = i
            sum_chunk = np.sum(d[start:end, :], axis=0)
            sum_active_rows += sum_chunk
        self._mean_X = np.divide(sum_active_rows, self.active_len)
    return self._mean_X
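
# Illustrative sketch (assumption: a standalone helper, not used by the class
# above). It checks that the chunked accumulation in mean_X()/std_X() matches a
# direct numpy computation over the stacked active rows, where active_ind is a
# list of (start, end) half-open row ranges into the matrix d.
def _demo_chunked_mean_std(d, active_ind):
    import numpy as np
    active_len = sum(end - start for start, end in active_ind)
    # chunked accumulation, as in mean_X()/std_X()
    sum_rows = sum(np.sum(d[start:end, :], axis=0) for start, end in active_ind)
    mean = sum_rows / active_len
    sum_sqdiff = sum(np.sum((d[start:end, :] - mean) ** 2, axis=0)
                     for start, end in active_ind)
    std = np.sqrt(sum_sqdiff / active_len)
    # direct computation on the stacked active rows gives the same result
    stacked = np.vstack([d[start:end, :] for start, end in active_ind])
    assert np.allclose(mean, stacked.mean(axis=0))
    assert np.allclose(std, stacked.std(axis=0))
    return mean, std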
def weighted_meanX(self):
    # weighted average of columns of active X: (\sum w_i x_i) / (\sum w_i)
    w_mean = []
    w = u.get_active_weights(self.weights, self.active_ind)
    for strip in self.X_strips:
        X = u.read_h5(self.X)
        X_strip = u.get_strip_active_X(X, strip, self.active_ind)
        w_mean_strip = np.average(X_strip, weights=w, axis=0)
        w_mean.append(w_mean_strip)
    w_mean = np.array(w_mean)
    self._weighted_meanX = w_mean.flatten()
    return self._weighted_meanX
def X_scale(self):
    # L2 norm of (X - weighted_meanX), computed strip by strip
    centered_norm = []
    for strip in self.X_strips:
        X = u.read_h5(self.X)
        X_active_strip = u.get_strip_active_X(X, strip, self.active_ind)
        X_offset_strip = self.weighted_meanX[strip[0]:strip[1]]
        X_centered_strip = X_active_strip - X_offset_strip
        norm_strip = np.linalg.norm(X_centered_strip, axis=0, ord=2)
        centered_norm.append(norm_strip)
    centered_norm = np.array(centered_norm)
    self._X_scale = centered_norm.flatten()
    return self._X_scale
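
# Illustrative sketch (assumption: a standalone helper for intuition only). For
# a single strip, weighted_meanX() reduces to np.average with the active
# weights, i.e. (sum_i w_i x_i) / (sum_i w_i), and X_scale() is the column-wise
# L2 norm of the matrix centered on that weighted mean.
def _demo_weighted_center_scale(X_active, w):
    import numpy as np
    w_mean = np.average(X_active, weights=w, axis=0)   # (sum w_i x_i) / (sum w_i)
    scale = np.linalg.norm(X_active - w_mean, axis=0, ord=2)
    return w_mean, scale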
def direct_fit(self, data):
    # direct fit skips the standardizing step
    X = u.read_h5(data.X)
    if X.ndim == 1:
        X = X[:].reshape(-1, 1)
    active_X = u.get_active(X, data.active_ind)
    active_y = u.read_chisq_from_ss(data.y, data.active_ind)
    w = u.get_active_weights(data.weights, data.active_ind)
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(active_X, active_y, sample_weight=w)
    self.coef = model.coef_ / data.N
    self.intercept = model.intercept_
    return self.coef, self.intercept
def direct_fit(self, data):
    # direct fit without manually weighting and scaling the data
    X = u.read_h5(data.X)
    if X.ndim == 1:
        X = X[:].reshape(-1, 1)
    active_X = u.get_active(X, data.active_ind)
    active_y = u.read_chisq_from_ss(data.y, data.active_ind)
    w = u.get_active_weights(data.weights, data.active_ind)
    from sklearn.linear_model import LassoCV
    model = LassoCV()
    model.fit(active_X, active_y, sample_weight=w)
    self.coef = model.coef_ / data.N
    self.intercept = model.intercept_
    self.alpha = model.alpha_
    return self.coef, self.intercept, self.alpha
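
# Illustrative sketch (assumption: synthetic data, not the project's real
# inputs). Both direct_fit() variants regress chisq on annot_ld directly; since
# the model is chisq ~ N * annot_ld . tau + intercept, the fitted coef_ equals
# N * tau and is divided by data.N to recover tau, as done above.
def _demo_recover_tau(N=1000, n_samples=500, n_annot=3, seed=0):
    import numpy as np
    from sklearn.linear_model import LinearRegression
    rng = np.random.default_rng(seed)
    annot_ld = rng.random((n_samples, n_annot))
    tau = rng.random(n_annot) / N
    chisq = N * annot_ld.dot(tau) + 1.0 + rng.normal(scale=0.01, size=n_samples)
    model = LinearRegression().fit(annot_ld, chisq)
    tau_hat = model.coef_ / N                      # same rescaling as direct_fit()
    assert np.allclose(tau_hat, tau, atol=1e-4)
    return tau_hat, model.intercept_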
def evaluate(self, data):
    # use self.coef and self.intercept to compute predicted sumstats on data;
    # the formula for chisq is N * annot_ld * self.coef + self.intercept,
    # and the score is the weighted sum of squared loss on data
    d = u.read_h5(data.X)
    val_annotld = u.get_active(
        d, data.active_ind)  # ndarray representing the validation matrix
    val_chisq = u.read_chisq_from_ss(
        data.y, data.active_ind)  # ndarray of validation chisq stats
    val_weights = u.get_active_weights(
        data.weights, data.active_ind)  # ndarray of validation weights
    if val_annotld.ndim == 1:
        val_annotld = val_annotld[np.newaxis].T
    pred_chisq = np.multiply(data.N, val_annotld.dot(self.coef)) + self.intercept
    weighted_sumsqerror = val_weights.dot((pred_chisq - val_chisq)**2)
    self.cv_loss = weighted_sumsqerror
    return weighted_sumsqerror
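
# Illustrative sketch (assumption: toy arrays only, standalone). evaluate()
# scores a fit by the weighted sum of squared errors between predicted and
# observed chisq: loss = sum_i w_i * (N * x_i . coef + intercept - chisq_i)^2.
def _demo_weighted_loss(val_annotld, val_chisq, val_weights, coef, intercept, N):
    import numpy as np
    if val_annotld.ndim == 1:                      # single-annotation case
        val_annotld = val_annotld[:, np.newaxis]
    pred_chisq = N * val_annotld.dot(coef) + intercept
    return val_weights.dot((pred_chisq - val_chisq) ** 2)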