def dpp_sampler(X):
    """
    Take in a dataset and return a set of feature indices based on
    DPP sampling only; this will be extended for supervised/unsupervised
    criteria.
    """
    # Build a feature-by-feature similarity kernel (features are the columns of X).
    feat_dist = rbf_kernel(X.T)
    feat_index = sample_dpp(decompose_kernel(feat_dist), k=None)
    return feat_index
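# Usage sketch. `sample_dpp` and `decompose_kernel` are assumed to come from
# the local DPP module; the kernel construction itself is runnable with
# scikit-learn alone:
#
#   import numpy as np
#   from sklearn.metrics.pairwise import rbf_kernel
#
#   X = np.random.randn(50, 10)   # 50 samples, 10 features
#   K = rbf_kernel(X.T)           # 10 x 10 feature-similarity kernel
#   # feat_idx = dpp_sampler(X)   # a diverse subset of feature indices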
def dpp_dropin_uniform(self, dropoutProb):
    if dropoutProb == 0:
        return
    p = (1 - dropoutProb)
    # Diagonal L-ensemble: every unit has the same weight p/(1-p), so each is
    # kept independently with probability p (uniform drop-in).
    L = (p / (1 - p)) * np.eye(self.W.shape[0] - 1)
    D, V = dpp.decompose_kernel(L)
    J = dpp.sample(D, V)
    # Drop-in mask: start from zeros and switch the sampled units on.
    d_idx = np.zeros((self.W.shape[0] - 1, 1))
    d_idx[J.astype(int)] = 1
    self.prevZ[:, 0:-1] = self.prevZ[:, 0:-1] * d_idx.T
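# Minimal sketch (plain NumPy, no DPP module needed) of why the diagonal
# kernel above reproduces standard dropout rates: for L = lam * I, a DPP keeps
# each item independently with probability lam / (1 + lam), and choosing
# lam = p / (1 - p) makes that keep probability exactly p.
#
#   import numpy as np
#
#   p = 0.8
#   lam = p / (1 - p)
#   keep = np.random.rand(100000) < lam / (1 + lam)
#   print(keep.mean())  # ~0.8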
def dpp_dropout(self, dropoutProb):
    if dropoutProb == 0:
        return
    # Kernel over hidden units: the squared Gram matrix of the bias-stripped
    # weight rows, so units with similar weights rarely appear together.
    W_n = self.W[0:-1, :]
    L = (W_n.dot(W_n.T)) ** 2
    D, V = dpp.decompose_kernel(L)
    k = int(np.floor((1 - dropoutProb) * self.W.shape[0]))
    J = dpp.sample_k(k, D, V)
    # Dropout mask: start from ones and zero out the k sampled units.
    d_idx = np.ones((self.W.shape[0] - 1, 1))
    d_idx[J.astype(int)] = 0
    self.prevZ[:, 0:-1] = self.prevZ[:, 0:-1] * d_idx.T
def dpp_dropin_norm(self, dropoutProb):
    if dropoutProb == 0:
        return
    # Row-normalise the weights so the kernel entries are squared cosine
    # similarities between units.
    W_n = self.W[0:-1, :] / (np.atleast_2d(np.linalg.norm(self.W[0:-1, :], axis=1)).T)
    L = (W_n.dot(W_n.T)) ** 2
    D, V = dpp.decompose_kernel(L)
    k = int(np.floor((1 - dropoutProb) * self.W.shape[0]))
    J = dpp.sample_k(k, D, V)
    # Drop-in mask: start from zeros and switch the sampled units on.
    d_idx = np.zeros((self.W.shape[0] - 1, 1))
    d_idx[J.astype(int)] = 1
    self.prevZ[:, 0:-1] = self.prevZ[:, 0:-1] * d_idx.T
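# Quick check (plain NumPy) that row-normalisation turns the squared Gram
# matrix into squared cosine similarities, with ones on the diagonal:
#
#   import numpy as np
#
#   W = np.random.randn(6, 4)
#   W_n = W / np.linalg.norm(W, axis=1, keepdims=True)
#   L = (W_n @ W_n.T) ** 2
#   print(np.allclose(np.diag(L), 1.0))  # True: each unit fully similar to itself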
def dpp_dropin_EY(self, dropoutProb):
    if dropoutProb == 0:
        return
    p = (1 - dropoutProb)
    W_n = self.W[0:-1, :]
    L = (W_n.dot(W_n.T)) ** 2
    D, V = dpp.decompose_kernel(L)
    # Check the largest feasible sample size for the bounded DPP before
    # requesting p * n units; otherwise fall back to random dropout.
    (kmax, d) = dpp.analyze_bDPP(D)
    print(kmax)
    if kmax >= (p * L.shape[0]):
        J = dpp.sample_EY(D, V, p * L.shape[0])
        d_idx = np.zeros((self.W.shape[0] - 1, 1))
        d_idx[J.astype(int)] = 1
        self.prevZ[:, 0:-1] = self.prevZ[:, 0:-1] * d_idx.T
    else:
        self.random_dropout(dropoutProb)
def _dpp_sel(self, X_, y=None):
    """
    DPP relies only on X. Sampling is conditioned on the previously selected
    columns in `self.coef_info['cols']`; after sampling, a grouped Wilcoxon
    selection is performed over the sampled features.
    """
    X = np.array(X_)
    print(X.shape)
    cols_to_index = [
        idx for idx, x in enumerate(X_.columns) if x in self.coef_info['cols']
    ]
    unseen_cols_to_index = [
        idx for idx, x in enumerate(X_.columns) if x not in self.coef_info['cols']
    ]

    # For small data build the exact RBF feature kernel; otherwise use a
    # Nystroem approximation to keep the kernel computation tractable.
    if X.shape[0] < 1000 or X.shape[1] < 100:
        feat_dist = rbf_kernel(X.T)
    else:
        feat_dist = Nystroem().fit_transform(X.T)
        feat_dist = feat_dist.dot(feat_dist.T)
    #self._dpp_estimate_k(feat_dist)
    #k = self.dpp_k['pca']  # - len(self.coef_info['cols'])
    k = None

    feat_index = []
    #while len(feat_index) == 0:
    if len(self.coef_info['cols']) == 0:
        feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    else:
        feat_index = sample_conditional_dpp(feat_dist, cols_to_index, k=k)
    feat_index = [x for x in feat_index if x is not None]

    # Select features using the class-separability measure.
    # How can we order features from most to least relevant first?
    # We could do it using an F-test - or otherwise presume the DPP selects
    # the best one first.
    s_b, s_w = class_separability(X, y)
    col_sel = evaluate_feats(s_b, s_w)
    #sel_cols = list(self.coef_info['cols']) + list(col_sel)

    """
    feat_entropy = []
    excl_entropy = []
    X_sel = X[:, feat_index]

    for idx, feat in enumerate(X_sel.T):
        if len(feat_entropy) == 0:
            feat_entropy.append(idx)
            continue
        if entropy(X_sel[:, feat_entropy]) > entropy(X_sel[:, feat_entropy + [idx]]):
            feat_entropy.append(idx)
        else:
            excl_entropy.append(idx)
    """

    # Iterate over feat_index to gather information from the Wilcoxon test.
    # The feature indices are already "ordered", as that is how the DPP
    # performs the sampling, so we do a single pass in the same way it was
    # approached in OGFS. feat_index also contains all previously sampled
    # columns.
    if not self.unseen_only and len(feat_index) > 0:
        feat_check = []
        excl_check = []
        X_sel = X[:, feat_index]

        for idx, feat in enumerate(X_sel.T):
            if len(feat_check) == 0:
                feat_check.append(idx)
                continue
            wilcoxon_pval = wilcoxon_group(X_sel[:, feat_check], feat)
            #print("\tWilcoxon: {}".format(wilcoxon_pval))
            if wilcoxon_pval < self.intragroup_alpha:
                feat_check.append(idx)
            else:
                excl_check.append(idx)
        feat_check_ = (feat_check + col_sel)
        index_to_col = [
            col for idx, col in enumerate(X_.columns) if idx in feat_check_
        ]
    elif self.unseen_only:
        # If we are considering unseen columns only, simply let the
        # regulariser act on them, similar to grafting.
        index_to_col = [
            col for idx, col in enumerate(X_.columns) if idx in feat_index
        ]
    else:
        # Only use the supervised criterion.
        index_to_col = [
            col for idx, col in enumerate(X_.columns) if idx in feat_index
        ]
    self.unseen_only = False  # perhaps add more conditions around unseen,
    # i.e. once the unseen condition kicks in, it remains active?

    self.coef_info['cols'] = list(set(self.coef_info['cols'] + index_to_col))
    col_rem = X_.columns.difference(self.coef_info['cols'])
    # Update the column exclusions...
    self.coef_info['excluded_cols'] = [
        x for x in self.coef_info['excluded_cols']
        if x not in self.coef_info['cols']
    ]
    self.add_column_exclusion(col_rem)
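# Minimal, self-contained sketch of the kernel branch above: for wide data the
# exact RBF feature kernel is swapped for a Nystroem approximation (both from
# scikit-learn), giving feat_dist ~ F F^T.
#
#   import numpy as np
#   from sklearn.metrics.pairwise import rbf_kernel
#   from sklearn.kernel_approximation import Nystroem
#
#   X = np.random.randn(200, 300)                      # samples x features
#   K_exact = rbf_kernel(X.T)                          # 300 x 300 exact kernel
#   F = Nystroem(n_components=100).fit_transform(X.T)  # 300 x 100 feature map
#   K_approx = F.dot(F.T)                              # low-rank approximation
#   print(np.abs(K_exact - K_approx).max())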
import numpy as np
import matplotlib.pyplot as plt

# `x` and `y` are assumed to be 1-D arrays of point coordinates, and `dpp`
# the local DPP sampling module.

# Number of samples to generate
k = 100

# Randomly sample k points
idx = np.arange(x.size)
np.random.shuffle(idx)
x_uniform = x[idx[:k]]
y_uniform = y[idx[:k]]

# Sample a k-DPP.
# First construct a Gaussian L-ensemble.
sigma = 0.1
L = np.exp(-(np.power(x - x[:, None], 2) +
             np.power(y - y[:, None], 2)) / (sigma ** 2))
(s, logdet) = np.linalg.slogdet(L)
D, V = dpp.decompose_kernel(L)
Y = dpp.sample_k(k, D, V)
print("Done Gaussian!")

# Second kernel: squared inner-product similarity.
L = np.power(np.outer(x, x) + np.outer(y, y), 2)
D, V = dpp.decompose_kernel(L)
Y2 = dpp.sample_k(k, D, V)

# Plot both
plt.figure(1)
plt.subplot(1, 3, 1)
plt.plot(x_uniform, y_uniform, 'ro')
plt.title('Uniform')
plt.subplot(1, 3, 2)
plt.title('DPP (Gaussian Similarity)')
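# The script assumes `x` and `y` already exist; a minimal way to produce them
# (an assumption, not part of the original) is uniform points in the unit
# square:
#
#   import numpy as np
#
#   n = 1000
#   x = np.random.rand(n)
#   y = np.random.rand(n)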