def fast_euclid(X):
    """
    Turn an RBF kernel of the rows of ``X`` back into distance-like values.

    The kernel bandwidth is ``gamma = 1 / X.shape[1]``. With fewer than
    1000 rows the kernel is computed directly via ``rbf_kernel``; otherwise
    it is approximated as the Gram matrix of a ``Nystroem`` feature map.
    The result is ``log(kernel) * (-1 / gamma)``, which, assuming
    ``rbf_kernel`` is ``exp(-gamma * ||x - y||^2)`` as in scikit-learn,
    is the matrix of squared Euclidean distances between rows.

    NOTE(review): on the Nystroem path the approximated kernel is not
    guaranteed to be strictly positive, so the log may yield nan/-inf
    entries -- confirm whether callers tolerate that.
    """
    n_rows = X.shape[0]
    gamma = 1.0 / X.shape[1]

    if n_rows >= 1000:
        # Large input: low-rank feature map, then its Gram matrix.
        embedding = Nystroem(gamma=gamma).fit_transform(X)
        kernel = embedding.dot(embedding.T)
    else:
        kernel = rbf_kernel(X, gamma=gamma)

    # Invert the exponential of the RBF kernel.
    return np.log(kernel) * (-1.0 / (gamma))
def _dpp_sel(self, X_, y=None):
    """
    Select feature columns via DPP sampling plus grouped filtering.

    DPP sampling only relies on X. When columns have already been selected
    the sampling is conditioned on ``self.coef_info['cols']``; otherwise an
    unconditional DPP sample is drawn. What happens next depends on state:

    * ``self.unseen_only`` is truthy: every sampled feature is accepted and
      the regulariser is left to act on them (similar to grafting).
    * otherwise, with a non-empty sample: the sampled features are filtered
      in a single ordered pass with the grouped Wilcoxon test and merged
      with the features picked by the supervised class-separability
      criterion.
    * otherwise, with an empty sample: only the supervised criterion is
      used.

    Parameters
    ----------
    X_ : table exposing ``.columns`` and accepted by ``np.array``
        Presumably a pandas DataFrame (``.columns.difference`` is used)
        -- TODO confirm against callers.
    y : optional
        Targets forwarded unchanged to ``class_separability``.

    Returns
    -------
    None. The method works through side effects: it extends
    ``self.coef_info['cols']``, prunes ``self.coef_info['excluded_cols']``,
    resets ``self.unseen_only`` to ``False`` and passes the still
    unselected columns to ``self.add_column_exclusion``.
    """
    X = np.array(X_)
    print(X.shape)  # debug trace kept so console output is unchanged

    columns = list(X_.columns)
    known_cols = set(self.coef_info['cols'])
    cols_to_index = [
        idx for idx, col in enumerate(columns) if col in known_cols
    ]

    # Features are the items being sampled, hence the kernel over X.T.
    if X.shape[0] < 1000 or X.shape[1] < 100:
        feat_dist = rbf_kernel(X.T)
    else:
        feat_map = Nystroem().fit_transform(X.T)
        feat_dist = feat_map.dot(feat_map.T)

    # Estimating k is currently disabled; None is forwarded to the samplers.
    k = None

    if not known_cols:
        feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    else:
        feat_index = sample_conditional_dpp(feat_dist, cols_to_index, k=k)
    feat_index = [x for x in feat_index if x is not None]

    # Supervised criterion, computed on the full matrix, so its output is
    # taken to be indices into X_.columns. list() keeps the concatenation
    # below well defined for list and array returns alike.
    s_b, s_w = class_separability(X, y)
    col_sel = list(evaluate_feats(s_b, s_w))

    if self.unseen_only:
        # If we are considering unseen only, we simply let the regulariser
        # act on the sample, similar to grafting.
        selected_idx = set(feat_index)
    elif feat_index:
        # The sampled features are treated as already "ordered" (that is
        # how the DPP sampling proceeds), so a single pass is done in the
        # same way as in OGFS. feat_index also contains the previously
        # sampled columns.
        X_sel = X[:, feat_index]
        kept = []
        for pos, feat in enumerate(X_sel.T):
            if not kept:
                kept.append(pos)
                continue
            wilcoxon_pval = wilcoxon_group(X_sel[:, kept], feat)
            if wilcoxon_pval < self.intragroup_alpha:
                kept.append(pos)
        # `kept` holds positions inside X_sel; translate them back to
        # indices of X_.columns before mixing them with col_sel.
        selected_idx = {feat_index[pos] for pos in kept} | set(col_sel)
    else:
        # Nothing was sampled: only use the supervised criterion.
        selected_idx = set(col_sel)

    index_to_col = [
        col for idx, col in enumerate(columns) if idx in selected_idx
    ]

    # Perhaps add more conditions around unseen -- i.e. once the unseen
    # condition kicks in, should it remain active?
    self.unseen_only = False

    # Order-preserving de-duplication keeps the column order reproducible.
    self.coef_info['cols'] = list(
        dict.fromkeys(self.coef_info['cols'] + index_to_col))
    col_rem = X_.columns.difference(self.coef_info['cols'])

    # Update column exclusion: a selected column can no longer be excluded.
    selected_cols = set(self.coef_info['cols'])
    self.coef_info['excluded_cols'] = [
        x for x in self.coef_info['excluded_cols']
        if x not in selected_cols
    ]
    self.add_column_exclusion(col_rem)