def test_pairwise_kernels(metric):
    # Test the pairwise_kernels helper function.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    function = PAIRWISE_KERNEL_FUNCTIONS[metric]
    # Test with Y=None
    K1 = pairwise_kernels(X, metric=metric)
    K2 = function(X)
    assert_array_almost_equal(K1, K2)
    # Test with Y=Y
    K1 = pairwise_kernels(X, Y=Y, metric=metric)
    K2 = function(X, Y=Y)
    assert_array_almost_equal(K1, K2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric)
    assert_array_almost_equal(K1, K2)
    # Test with sparse X and Y
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    if metric in ["chi2", "additive_chi2"]:
        # these don't support sparse matrices yet
        assert_raises(ValueError, pairwise_kernels,
                      X_sparse, Y=Y_sparse, metric=metric)
        return
    K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric)
    assert_array_almost_equal(K1, K2)
def HilbertSchmidtNormIC(self, X, Y, metric='linear'):
    '''Compute the Hilbert-Schmidt Independence Criterion described in
    "Measuring Statistical Dependence with Hilbert-Schmidt Norms",
    Arthur Gretton et al.

    Parameters
    ----------
    Assuming a joint distribution P(X, Y)
    X : list of X observations
    Y : list of Y observations

    Returns
    -------
    (HSIC, fake p-value scaling HSIC to [0, 1])
    '''
    m = len(X)
    K = pairwise_kernels(X, X, metric=metric)
    L = pairwise_kernels(Y, Y, metric=metric)
    H = np.eye(m) - 1.0 / m  # centering matrix I - (1/m) * ones
    res = (1.0 / (m - 1) ** 2) * np.trace(np.dot(np.dot(np.dot(K, H), L), H))
    # Another way, maybe:
    # CCm = pairwise_kernels(self.X, self.Y)
    # res = sum(np.linalg.eigvals(CCm))

    # Now use a Gamma approximation to get a p-value
    bone = np.ones((m, m))
    Kc = H * K * H
    Lc = H * L * H
    # fit Gamma to testStat * m
    testStat = 1.0 / m * sum(sum(np.dot(np.transpose(Kc), Lc)))  # test statistic: m * HSICb (under H1)
    varHSIC = (1.0 / 6 * np.dot(Kc, Lc)) ** 2
    varHSIC = 1.0 / m / (m - 1) * (sum([sum(varHSIC[:, i]) for i in range(len(varHSIC))]) - np.trace(varHSIC))
    varHSIC = 72.0 * (m - 4) * (m - 5) / m / (m - 1) / (m - 2) / (m - 3) * varHSIC  # variance under H0
    _K = K - np.diag(np.diag(K))
    _L = L - np.diag(np.diag(L))
    muX = 1.0 / m / (m - 1) * np.transpose(bone) * (K * bone)
    muY = 1.0 / m / (m - 1) * np.transpose(bone) * (L * bone)
    mHSIC = 1.0 / m * (1 + muX * muY - muX - muY)  # mean under H0
    al = mHSIC ** 2 / varHSIC
    bet = varHSIC * m / mHSIC
    # threshold for hsicArr * m
    alpha = 0.05
    # This should be done with varHSIC != 0:
    # from scipy.special import gdtria
    # thresh = gdtria(1 - alpha, al, bet)
    # return (res, 1 - (1 / (1 + res)))
    return (res, res)
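# A minimal self-contained sketch of the biased HSIC estimator computed in
# the method above, using explicit matrix products for the centering step
# (the Gamma-approximation part above mixes elementwise '*' with np.dot).
# Function and variable names here are illustrative, not from the original.
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

def hsic_biased(X, Y, metric='linear'):
    m = X.shape[0]
    K = pairwise_kernels(X, X, metric=metric)
    L = pairwise_kernels(Y, Y, metric=metric)
    H = np.eye(m) - np.ones((m, m)) / m  # centering matrix
    return np.trace(H.dot(K).dot(H).dot(L)) / (m - 1) ** 2

rng = np.random.RandomState(0)
A = rng.randn(100, 2)
print(hsic_biased(A, A, metric='rbf'))                  # large: A depends on itself
print(hsic_biased(A, rng.randn(100, 2), metric='rbf'))  # near zero: independent samples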
def gram(mat, args):
    '''Computes the Gram matrix of mat according to the kernel specified in args.'''
    if args.kernel in ['rbf', 'polynomial', 'poly', 'laplacian']:
        kwargs = dict(gamma=10. / mat.shape[1])
        output = pairwise_kernels(mat, metric=args.kernel, n_jobs=-1, **kwargs)
    else:
        # gamma for chi squared should be left at its default
        output = pairwise_kernels(mat, metric=args.kernel, n_jobs=-1)
    return output
def test_kernel_versus_pairwise():
    # Check that GP kernels can also be used as pairwise kernels.
    for kernel in kernels:
        # Test auto-kernel
        if kernel != kernel_white:
            # For WhiteKernel: k(X) != k(X, X). This is assumed by
            # pairwise_kernels.
            K1 = kernel(X)
            K2 = pairwise_kernels(X, metric=kernel)
            assert_array_almost_equal(K1, K2)

        # Test cross-kernel
        K1 = kernel(X, Y)
        K2 = pairwise_kernels(X, Y, metric=kernel)
        assert_array_almost_equal(K1, K2)
def transform(self, X):
    """Apply feature map to X.

    Computes an approximate feature map using the kernel
    between some training points and X.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Data to transform.

    Returns
    -------
    X_transformed : array, shape=(n_samples, n_components)
        Transformed data.
    """
    check_is_fitted(self, 'components_')
    X = check_array(X, accept_sparse='csr')

    kernel_params = self._get_kernel_params()
    embedded = pairwise_kernels(X, self.components_,
                                metric=self.kernel,
                                filter_params=True,
                                **kernel_params)
    return np.dot(embedded, self.normalization_.T)
def links(self, data_matrix):
    data_size = data_matrix.shape[0]
    kernel_matrix = pairwise_kernels(data_matrix, metric=self.metric, **self.kwds)
    # compute instance density as average pairwise similarity
    density = np.sum(kernel_matrix, 0) / data_size
    # compute list of nearest neighbors
    kernel_matrix_sorted = np.argsort(-kernel_matrix)
    # make matrix of densities ordered by nearest neighbor
    density_matrix = density[kernel_matrix_sorted]
    # if a denser neighbor cannot be found then assign the link to the instance itself
    link_ids = list(range(density_matrix.shape[0]))
    # for all instances determine the link
    for i, row in enumerate(density_matrix):
        i_density = row[0]
        # for all neighbors from the closest to the furthest
        for jj, d in enumerate(row):
            # proceed until n_nearest_neighbors have been explored
            if self.n_nearest_neighbors is not None and jj > self.n_nearest_neighbors:
                break
            j = kernel_matrix_sorted[i, jj]
            if jj > 0:
                j_density = d
                # if the density of the neighbor is higher than the density
                # of the instance, assign the link
                if j_density > i_density:
                    link_ids[i] = j
                    break
    return link_ids
def parents(self, data_matrix, target=None):
    """parents."""
    data_size = data_matrix.shape[0]
    kernel_matrix = pairwise_kernels(data_matrix, metric=self.metric, **self.kwds)
    # compute instance density as average pairwise similarity
    density = np.sum(kernel_matrix, 0) / data_size
    # compute list of nearest neighbors
    kernel_matrix_sorted = np.argsort(-kernel_matrix)
    # make matrix of densities ordered by nearest neighbor
    density_matrix = density[kernel_matrix_sorted]
    # if a denser neighbor cannot be found then assign the parent to the
    # instance itself
    parent_ids = list(range(density_matrix.shape[0]))
    # for all instances determine the parent link
    for i, row in enumerate(density_matrix):
        i_density = row[0]
        # for all neighbors from the closest to the furthest
        for jj, d in enumerate(row):
            j = kernel_matrix_sorted[i, jj]
            if jj > 0:
                j_density = d
                # if the density of the neighbor is higher than the
                # density of the instance, assign the parent
                if j_density > i_density:
                    parent_ids[i] = j
                    break
    return parent_ids
def transform(self, data_matrix):
    """Transform features to the instance similarity to a set of instances
    as defined by the selector.

    Parameters
    ----------
    data_matrix : array, shape = (n_samples, n_features)
        Samples.

    Returns
    -------
    data_matrix : array, shape = (n_samples, n_features_new)
        Transformed array.
    """
    if self.selected_instances is None:
        raise Exception('Error: attempt to use transform on a model that has not been fit')
    if self.selected_instances.shape[0] == 0:
        raise Exception('Error: attempt to use transform using 0 selectors')
    # TODO: the first instance is more important than the others in a selector,
    # so it should receive a weight proportional to the rank, e.g. 1/rank^p;
    # the selector should also return rank information for each feature.
    # Note: for the composite selector it is important to distinguish the
    # ranks of multiple selectors.
    data_matrix_out = pairwise_kernels(data_matrix,
                                       Y=self.selected_instances,
                                       metric=self.metric,
                                       **self.kwds)
    if self.scale:
        data_matrix_out = self.scaler.transform(data_matrix_out) * self.scaling_factor
    return data_matrix_out
def diag(self, X):
    """Returns the diagonal of the kernel k(X, X).

    The result of this method is identical to np.diag(self(X)); however,
    it can be evaluated more efficiently since only the diagonal is
    evaluated.

    Parameters
    ----------
    X : array, shape (n_samples_X, n_features)
        Left argument of the returned kernel k(X, Y)

    Returns
    -------
    K_diag : array, shape (n_samples_X,)
        Diagonal of kernel k(X, X)
    """
    prototypes_std = self.prototypes.std(0)
    n_prototypes = self.prototypes.shape[0]

    # kernel regression of noise levels
    K_pairwise = pairwise_kernels(self.prototypes / prototypes_std,
                                  X / prototypes_std,
                                  metric="rbf", gamma=self.gamma)

    return (K_pairwise * self.sigma_2[:, None]).sum(axis=0) \
        / K_pairwise.sum(axis=0)
def KmeansForAgeEst2(db, where, users, n_clusters):
    X = []
    X_users = []
    centers = []
    est = []
    est_v = []
    for at in where:
        _users = [users[i] for i in at]
        X.append(pymongo_utill.toTimeFreq(db, _users))
        X_users.append(_users)
    for c, x in enumerate(X):
        km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        km.fit(x)
        centers.append(km.cluster_centers_)
        max_0 = 0
        max_1 = 0
        est_0 = est_1 = None
        est_0_v = ""
        est_1_v = ""
        for i, u in enumerate(x):
            # pairwise_kernels expects 2-D arrays, so reshape the sample
            sim = pairwise_kernels(km.cluster_centers_,
                                   np.asarray(u).reshape(1, -1),
                                   metric="cosine")
            if max_0 < sim[0, 0]:
                est_0 = X_users[c][i]
                max_0 = sim[0, 0]
                est_0_v = u
            if max_1 < sim[1, 0]:
                est_1 = X_users[c][i]
                max_1 = sim[1, 0]
                est_1_v = u
        est.append((est_0, est_1))
        est_v.append((est_0_v, est_1_v))
    return centers
def decision_function(self, X):
    """Scores related to the ordering of the samples X.

    Note that higher scores correspond to higher rankings. For example,
    for three ordered samples (say ranks 1, 2, 3) we would expect the
    corresponding scores to decrease (say 9.5, 6.2, 3.5).

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors.

    Returns
    -------
    scores : array-like, shape = [n_samples]
        The higher the score, the higher the rank. For example, if x_1's
        rank is 1 and x_2's rank is 2, then x_1's score will be higher
        than x_2's score.
    """
    if self._rank_vectors is None:
        raise Exception('Attempted to predict before fitting model')

    alpha = self._alpha
    gram_matrix = pairwise.pairwise_kernels(self._rank_vectors, X,
                                            metric=self.kernel)
    scores = np.sum(alpha[alpha != 0, np.newaxis] * gram_matrix, 0)
    return scores
def fit(self, X, y, unlabeled_data=None):
    num_data = X.shape[0] + unlabeled_data.shape[0]
    num_labeled = X.shape[0]
    num_unlabeled = unlabeled_data.shape[0]
    labeled = np.zeros((num_data,), dtype=np.float32)
    labeled[0:num_labeled] = 1.0

    if issparse(X):
        self.X_ = vstack((util.cast_to_float32(X),
                          util.cast_to_float32(unlabeled_data)), format='csr')
    else:
        self.X_ = np.concatenate((util.cast_to_float32(X),
                                  util.cast_to_float32(unlabeled_data)))

    self.gamma = (self.gamma if self.gamma is not None else 1.0 / X.shape[1])
    self.kernel_params = {'gamma': self.gamma,
                          'degree': self.degree,
                          'coef0': self.coef0}

    kernel_matrix = pairwise_kernels(self.X_, metric=self.kernel,
                                     filter_params=True, **self.kernel_params)
    A = np.dot(np.diag(labeled), kernel_matrix)
    if self.nu2 != 0:
        if self.kernel == 'rbf':
            laplacian_kernel_matrix = kernel_matrix
        else:
            laplacian_kernel_matrix = rbf_kernel(self.X_, gamma=self.gamma)
        laplacian_x_kernel = np.dot(
            graph_laplacian(laplacian_kernel_matrix,
                            normed=self.normalize_laplacian),
            kernel_matrix)
        A += self.nu2 * laplacian_x_kernel

    y = np.concatenate((y, -np.ones((num_unlabeled,), dtype=np.float32)), axis=0)
    super(LapRLSC, self).fit(A, y, class_for_unlabeled=-1)
def select(self, data_matrix, target=None):
    """select."""
    # compute pairwise similarity matrix
    kernel_matrix = pairwise_kernels(data_matrix, metric=self.metric, **self.kwds)
    # set minimum value
    m = -1
    # set diagonal to 0 to remove self similarity
    np.fill_diagonal(kernel_matrix, 0)
    # iterate size - k times, i.e. until only k instances are left
    for t in range(data_matrix.shape[0] - self.n_instances):
        # find the pair with the largest kernel value
        (max_i, max_j) = np.unravel_index(
            np.argmax(kernel_matrix), kernel_matrix.shape)
        # choose one of the two instances at random
        if random.random() > 0.5:
            idx = max_i
        else:
            idx = max_j
        # remove the chosen instance by setting all its pairwise
        # similarities to the minimum value
        kernel_matrix[idx, :] = m
        kernel_matrix[:, idx] = m
    # extract the surviving elements, i.e. elements that still have 0 on the diagonal
    selected_instances_ids = np.array(
        [i for i, x in enumerate(np.diag(kernel_matrix)) if x == 0])
    return selected_instances_ids
def test_cosine_kernel():
    """Test the cosine kernels."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
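# A standalone check of the identity the test above exercises: on
# L2-normalized rows the linear kernel coincides with the cosine kernel.
# Variable names are illustrative.
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
K_cosine = pairwise_kernels(X, metric="cosine")
K_linear = pairwise_kernels(normalize(X), metric="linear")
print(np.allclose(K_cosine, K_linear))  # True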
def score(self, Xh, yh):
    # not really a score, more a loss
    lambdak = self.alpha0
    K_pred = pairwise_kernels(Xh, self.Xt, gamma=np.exp(lambdak[0]),
                              metric='rbf')
    pred = K_pred.dot(self.dual_coef_)
    v = yh - pred
    return v.dot(v)
def h_sol_approx(x, lambdak, tol):
    # returns an approximate solution of the inner optimization
    K = pairwise_kernels(Xt, gamma=np.exp(lambdak[0]), metric='rbf')
    (out, info) = splinalg.cg(
        K + np.exp(lambdak[1]) * np.eye(x0.size), yt, x0=x)
    if info != 0:
        # cg signals failure through a nonzero info flag
        raise ValueError('conjugate gradient did not converge')
    return out
def test_lower_bound_multi_rbf():
    K = pairwise_kernels(mult_dense, metric="rbf", gamma=0.1)
    Cmin = C_lower_bound(K, mult_target)
    Cmin2 = C_lower_bound(mult_dense, mult_target, kernel="rbf", gamma=0.1)
    Cmin3 = C_lower_bound(mult_dense, mult_target, kernel="rbf", gamma=0.1,
                          search_size=60, random_state=0)
    assert_almost_equal(Cmin, Cmin2, 4)
    assert_almost_equal(Cmin, Cmin3, 4)
def test_fit_reg_squared_loss_nn_l2():
    K = pairwise_kernels(digit.data, metric="poly", degree=4)
    clf = CDRegressor(C=1, random_state=0, penalty="nnl2",
                      loss="squared", max_iter=100)
    clf.fit(K, digit.target)
    y_pred = (clf.predict(K) > 0.5).astype(int)
    acc = np.mean(digit.target == y_pred)
    assert_almost_equal(acc, 0.9444, 3)
def test_pairwise_kernels_callable():
    # Test the pairwise_kernels helper function
    # with a callable function, with given keywords.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))

    metric = callable_rbf_kernel
    kwds = {'gamma': 0.1}
    K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds)
    K2 = rbf_kernel(X, Y=Y, **kwds)
    assert_array_almost_equal(K1, K2)

    # callable function, X=Y
    K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds)
    K2 = rbf_kernel(X, Y=X, **kwds)
    assert_array_almost_equal(K1, K2)
def fit(self, PHI, targets, Parameters):
    if self.kind in ('rbf', 'sigmoid', 'polynomial', 'lin', 'cosine'):
        # save targets, needed for kernel interpolation
        self.targets = targets
        # parameters
        self.Parameters = Parameters
        # kernel matrix; filter_params drops entries the chosen kernel
        # does not accept
        K = pairwise_kernels(PHI, PHI, metric=self.kind,
                             filter_params=True, **self.Parameters)
        # To make a prediction on sample x from the n training samples xn,
        # the kernel weights must sum to one, i.e. sum k(x, xn) = 1.
        Normalization = np.power(np.tile(np.sum(K, 1), (K.shape[1], 1)), -1)
        K = Normalization * K
        # prediction using training data
        y_pred = np.dot(K, targets)
        # prediction variance
        self.bata = np.var(targets - y_pred)
        # prediction variance of each sample
        self.sigma = self.bata + np.diag(K) / self.bata
        self.trainingdata = PHI

    if self.kind == 'basis':
        self.targets = targets
        self.Parameters = Parameters
        dim = PHI.shape[1]
        S0 = np.identity(dim)
        Parameter = np.array(Parameters)
        if Parameter.size == 1:
            # zero mean, broad prior, with one parameter and maximum
            # likelihood estimation of the prior
            Lambda = Parameter[0]
            self.Sn = np.linalg.inv(Lambda * S0 + np.dot(PHI.transpose(), PHI))
            self.Mn = np.dot(self.Sn, np.dot(PHI.transpose(), targets))
            # prediction on training data
            y_pred = np.dot(PHI, self.Mn)
            self.bata = np.var(targets - y_pred)
            self.sigma = self.bata + np.diag(np.dot(PHI, np.dot(self.Sn, PHI.transpose())))
        if Parameter.size == 2:
            # zero mean, broad prior, with two parameters
            alfa = Parameter[0]
            bata = Parameter[1]
            self.Sn = np.linalg.inv(alfa * S0 + bata * np.dot(PHI.transpose(), PHI))
            self.Mn = bata * np.dot(self.Sn, np.dot(PHI.transpose(), targets))
            # prediction on training data
            y_pred = np.dot(PHI, self.Mn)
            # noise variance on training data using the MAP estimate
            self.bata = np.var(targets - y_pred)
            # prediction variance on training data
            self.sigma = self.bata + np.diag(np.dot(PHI, np.dot(self.Sn, PHI.transpose())))
def _most_representative(self, structs):
    # compute kernel matrix with sequence_vectorizer
    data_matrix = self.sequence_vectorizer.transform(structs)
    kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1)
    # compute instance density as average pairwise similarity
    density = np.sum(kernel_matrix, 0) / data_matrix.shape[0]
    # the most representative instance is the densest one
    max_id = np.argsort(-density)[0]
    return max_id
def _get_kernel(self, X, Y=None):
    if callable(self.kernel):
        params = self.kernel_params or {}
    else:
        params = {"gamma": self.gamma,
                  "degree": self.degree,
                  "coef0": self.coef0}
    return pairwise_kernels(X, Y, metric=self.kernel,
                            filter_params=True, **params)
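# A small sketch of why _get_kernel above passes filter_params=True: one
# parameter dict can then be reused across all built-in kernels, because
# pairwise_kernels drops the entries a given kernel does not accept.
# Hypothetical standalone usage, not part of the class above.
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

X = np.random.RandomState(0).random_sample((4, 3))
params = {"gamma": 0.5, "degree": 3, "coef0": 1}
for metric in ("linear", "rbf", "poly", "sigmoid"):
    # without filter_params=True, e.g. metric="rbf" would reject 'degree'
    K = pairwise_kernels(X, metric=metric, filter_params=True, **params)
    print(metric, K.shape)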
def gram(mat, args):
    '''Computes the Gram matrix of mat according to a specified kernel function.'''
    kwargs = {}
    if args.kernel in ['rbf', 'polynomial', 'poly', 'laplacian']:
        kwargs = dict(gamma=10. / mat.shape[1])
    # gamma for chi squared should be left at its default
    output = pairwise_kernels(mat, metric=args.kernel, n_jobs=1, **kwargs)
    return output
def g_cross(x, lambdak):
    K_pred = pairwise_kernels(
        Xh, Xt, gamma=np.exp(lambdak[0]), metric='rbf')
    K_pred_prime = -np.exp(lambdak[0]) * euclidean_distances(
        Xh, Xt, squared=True) * K_pred
    pred = K_pred.dot(x)
    v = yh - pred
    tmp = K_pred_prime.dot(x)
    return np.array((-2 * tmp.dot(v), 0.0))
def test_pairwise_kernels_filter_param():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    K = rbf_kernel(X, Y, gamma=0.1)
    params = {"gamma": 0.1, "blabla": ":)"}
    K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params)
    assert_array_almost_equal(K, K2)

    assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params)
def build_graph(self, data_matrix=None, target=None,
                k=3, k_quick_shift=1, k_outliers=5, knn_horizon=5):
    """Build graph."""
    size = data_matrix.shape[0]
    # make kernel
    kernel_matrix = pairwise_kernels(data_matrix, metric=self.metric, **self.kwds)
    # compute instance density as average pairwise similarity
    density = np.sum(kernel_matrix, 0) / size
    # compute list of nearest neighbors
    distance_matrix = pairwise_distances(data_matrix)
    knn_ids = np.argsort(distance_matrix)
    # make matrix of densities ordered by nearest neighbor
    density_matrix = density[knn_ids]

    # make a graph with instances as nodes
    graph = nx.Graph()
    for v in range(size):
        graph.add_node(v, group=target[v], outlier=False)

    # build knn edges
    if k > 0:
        # find the closest selected instance and instantiate knn edges
        graph = self._add_knn_links(
            graph, target,
            kernel_matrix=kernel_matrix, knn_ids=knn_ids, nneighbors_th=k)
        self._annotate_outliers(
            graph, nneighbors_th=k_outliers,
            kernel_matrix=kernel_matrix, knn_ids=knn_ids)

    # build shift tree
    for th in range(1, k_quick_shift + 1):
        link_ids = self._kernel_shift_links(
            kernel_matrix=kernel_matrix,
            density_matrix=density_matrix,
            knn_ids=knn_ids,
            k_quick_shift=th,
            target=target,
            knn_horizon=knn_horizon)
        for i, link in enumerate(link_ids):
            if i != link:
                graph.add_edge(i, link, edge_type='shift', rank=th)
    graph = self._compute_edge_len(graph, data_matrix, target)
    return graph
def decision_function(self, X):
    if self.mode == 'exact':
        K = pairwise_kernels(
            X, self.X_train_,
            metric=self.kernel,
            filter_params=True,
            gamma=self.gamma
        )
    else:
        K = self.kernel_sampler_.transform(X)
    return super(SparseKernelClassifier, self).decision_function(K)
def _get_kernel(self, X, Y=None):
    params = {"gamma": self.gamma,
              "degree": self.degree,
              "coef0": self.coef0}
    try:
        return pairwise_kernels(X, Y, metric=self.kernel,
                                filter_params=True, **params)
    except AttributeError:
        raise ValueError("%s is not a valid kernel. Valid kernels are: "
                         "rbf, poly, sigmoid, linear and precomputed."
                         % self.kernel)
def fit(self, Xt, yt):
    self.Xt = Xt
    x0 = np.zeros(Xt.shape[0])
    # returns an approximate solution of the inner optimization
    K = pairwise_kernels(Xt, gamma=np.exp(self.alpha0[0]), metric='rbf')
    (out, info) = splinalg.cg(
        K + np.exp(self.alpha0[1]) * np.eye(x0.size), yt, x0=x0)
    if info != 0:
        # cg signals failure through a nonzero info flag
        raise ValueError('conjugate gradient did not converge')
    self.dual_coef_ = out
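# An end-to-end sketch of the pattern used by fit()/score() above: solve the
# kernel ridge system (K + lambda*I) c = y with conjugate gradient, then
# evaluate the hold-out residual. The data and hyperparameter values are
# illustrative assumptions, not the original experiment.
import numpy as np
from scipy.sparse import linalg as splinalg
from sklearn.metrics.pairwise import pairwise_kernels

rng = np.random.RandomState(0)
Xt, yt = rng.randn(40, 3), rng.randn(40)  # train
Xh, yh = rng.randn(10, 3), rng.randn(10)  # hold-out
log_gamma, log_lam = 0.0, -2.0            # log-parameterized, as above

K = pairwise_kernels(Xt, gamma=np.exp(log_gamma), metric='rbf')
coef, info = splinalg.cg(K + np.exp(log_lam) * np.eye(len(yt)), yt)
assert info == 0  # 0 means the conjugate gradient solver converged
K_pred = pairwise_kernels(Xh, Xt, gamma=np.exp(log_gamma), metric='rbf')
v = yh - K_pred.dot(coef)
print(v.dot(v))  # hold-out squared error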
def _get_kernel(self, X, Y=None):
    if callable(self.kernel):
        params = self.kernel_params or {}
    else:
        params = {"gamma": self.gamma,
                  "degree": self.degree,
                  "coef0": self.coef0}
    if self.kernel == "robust_kernel":
        return trimmedrbf_kernel(X, Y, gamma=self.gamma,
                                 robust_gamma=self.robust_gamma)
    return pairwise_kernels(X, Y, metric=self.kernel,
                            filter_params=True, **params)
def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): """Tests the SSAD model on the test data.""" logger = logging.getLogger() #_, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader) test_loader = self.my_dataset.test_loader # Get data from loader idx_label_score = [] X = () idxs = [] labels = [] for data in test_loader: inputs, label_batch, _, idx = data inputs, label_batch, idx = inputs.to(device), label_batch.to( device), idx.to(device) if self.hybrid: inputs = self.ae_net.encoder( inputs ) # in hybrid approach, take code representation of AE as features X_batch = inputs.view( inputs.size(0), -1 ) # X_batch.shape = (batch_size, n_channels * height * width) X += (X_batch.cpu().data.numpy(), ) idxs += idx.cpu().data.numpy().astype(np.int64).tolist() labels += label_batch.cpu().data.numpy().astype(np.int64).tolist() X = np.concatenate(X) # Testing logger.info('Starting testing...') start_time = time.time() # Build kernel kernel = pairwise_kernels(X, self.X_svs, metric=self.kernel, gamma=self.gamma) scores = (-1.0) * self.model.apply(kernel) self.results['test_time'] = time.time() - start_time scores = scores.flatten() self.rho = -self.model.threshold # Save triples of (idx, label, score) in a list idx_label_score += list(zip(idxs, labels, scores.tolist())) self.results['test_scores'] = idx_label_score # Compute AUC _, labels, scores = zip(*idx_label_score) labels = np.array(labels) scores = np.array(scores) self.results['test_auc'] = roc_auc_score(labels, scores) # If hybrid, also test model with linear kernel if self.hybrid: start_time = time.time() linear_kernel = pairwise_kernels(X, self.linear_X_svs, metric='linear') scores_linear = (-1.0) * self.linear_model.apply(linear_kernel) self.results['test_time_linear'] = time.time() - start_time scores_linear = scores_linear.flatten() self.results['test_auc_linear'] = roc_auc_score( labels, scores_linear) logger.info('Test AUC linear model: {:.2f}%'.format( 100. * self.results['test_auc_linear'])) logger.info('Test Time linear model: {:.3f}s'.format( self.results['test_time_linear'])) # Log results logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc'])) logger.info('Test Time: {:.3f}s'.format(self.results['test_time'])) logger.info('Finished testing.')
# normalizing
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))

hasil_tfidf = np.matrix(doc_term_matrix_tfidf_l2)
st.subheader("l2 tfidf normalizer")
frequency_TFIDF = pd.DataFrame(hasil_tfidf, index=id_requirement, columns=kolom_df)
st.write(frequency_TFIDF)

st.subheader("IR using cosine")
X = np.array(hasil_tfidf[0:])
Y = np.array(hasil_tfidf)
# on L2-normalized vectors the linear kernel equals cosine similarity
cosine_similaritas = pairwise_kernels(X, Y, metric='linear')
cosine_df = pd.DataFrame(cosine_similaritas, index=id_requirement, columns=id_requirement)
st.write(cosine_df)

# clustering
klaster_value = st.sidebar.slider("Berapa Cluster?", 0, 5, len(id_requirement))
kmeans = KMeans(n_clusters=klaster_value)  # cluster the requirement vectors
kmeans_df = kmeans.fit(cosine_similaritas)
st.subheader("K-Means Cluster")
correct = 0
def similarity_graph_from_term_document_matrix(sp_mat):
    dx = pairwise_kernels(sp_mat, metric='cosine')
    g = nx.from_numpy_matrix(dx)
    return g
def kernel_mat_pair(f, x, y=None):
    return pairwise_kernels(x, y, metric=f)
# compare the eigenvalues of kernel matrices

# In[108]:
from sklearn.metrics.pairwise import pairwise_kernels
from numpy.linalg import eigvals

# In[109]:
kernel_list = ['linear', 'rbf', 'poly', 'sigmoid']
for kernel in kernel_list:
    k = 10
    kernel_matrix = pairwise_kernels(credit, metric=kernel)
    print("Kernel is " + str(kernel))
    print(eigvals(kernel_matrix))
    print("\n")

# In[156]:
kernel_matrix.shape

# In[157]:
eigvals(kernel_matrix).shape

# In[144]:
def prototype_selection(X, subsample=20, kernel='rbf'):
    if subsample > 1:
        return greedy_select_protos(pairwise_kernels(X, metric=kernel),
                                    np.array(range(X.shape[0])),
                                    subsample)
    return np.array(range(X.shape[0]))
def predict(self, X):
    K = pairwise_kernels(self.X, X, metric=self.kernel, gamma=self.gamma)
    # kernel-weighted average of the training targets
    return (K * self.y[:, None]).sum(axis=0) / K.sum(axis=0)
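# The predictor above is a Nadaraya-Watson style kernel-weighted average of
# training targets. A minimal standalone sketch (array names are
# illustrative; self.X / self.y above play the roles of X_train / y_train):
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

X_train = np.linspace(0, 1, 50)[:, None]
y_train = np.sin(2 * np.pi * X_train).ravel()
X_test = np.array([[0.25], [0.75]])

K = pairwise_kernels(X_train, X_test, metric="rbf", gamma=200.0)
y_pred = (K * y_train[:, None]).sum(axis=0) / K.sum(axis=0)
print(y_pred)  # approximately [1, -1], i.e. sin(2*pi*x) at the test points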
def rbf_kernels(X, n_jobs):
    return pairwise_kernels(X, metric="rbf", n_jobs=n_jobs, gamma=0.1)
def linear_kernel_test(testK, origK, n_jobs):
    return pairwise_kernels(testK, origK, metric="linear", n_jobs=n_jobs)
def estimate_Gap_statistics(self, nrefs):
    masknans = pl.ma.masked_not_equal(self._X[:, 0], 0).mask
    minvals = self._X[masknans, :].min(axis=0)
    maxvals = self._X[masknans, :].max(axis=0)
    meanvals = self._X[masknans, :].mean(axis=0)
    stdvals = self._X[masknans, :].std(axis=0)
    ref_Affinity = []
    Dref = []
    # Compute a random uniform reference distribution of features;
    # precompute distances and affinities.
    for i in range(nrefs):
        random_X = pl.ones_like(self._X)
        # random_X[:, 0] = np.random.uniform(low=minvals[0], high=maxvals[0],
        #                                    size=pl.int_(self._X.shape[0] / 10))
        random_X[:, 1] = np.random.uniform(
            low=pl.quantile(q=0.16, a=self._X[masknans, 1]),
            high=pl.quantile(q=0.16, a=self._X[masknans, 1]),
            size=pl.int_(self._X.shape[0]),
        )
        random_X[:, 0] = np.random.normal(loc=meanvals[0],
                                          scale=stdvals[0],
                                          size=pl.int_(self._X.shape[0]))
        ref_D = self._metric.pairwise(random_X)
        ref_D = pl.ma.fix_invalid(ref_D, fill_value=1.0).data
        Dref.append(ref_D)
        ref_Affinity.append(pairwise_kernels(ref_D, metric="precomputed"))

    self.Gaps = pl.zeros(len(self.Kvals))
    self.sd = self.Gaps * 0.0
    self.W = self.Gaps * 0.0
    # KL index
    p = self._nfeat
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        self.clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        self.clusters.fit_predict(self._Affinity)
        # estimate WCSS for the samples
        W = self.get_WCSS(K, self.clusters.labels_, self._distance_matr)
        self.W[j] = W
        # estimate WCSS for random samples
        ref_W = pl.zeros(nrefs)
        for i in range(nrefs):
            ref_clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            ref_clusters.fit_predict(ref_Affinity[i])
            ref_W[i] = self.get_WCSS(K, ref_clusters.labels_, Dref[i])
        self.sd[j] = np.std(np.log(ref_W)) * np.sqrt(1 + 1.0 / nrefs)
        self.Gaps[j] = np.mean(np.log(ref_W)) - np.log(W)
    # see section 4 of Tibshirani et al.,
    # http://web.stanford.edu/~hastie/Papers/gap.pdf
    gaps_criterion = pl.array(
        [self.Kvals[:-1], self.Gaps[:-1] - self.Gaps[1:] + self.sd[1:]])
    mask = pl.array(gaps_criterion[1, :] >= 0)
    return pl.int_(gaps_criterion[0, mask][0])
def cluster_tightness(data, metric='cosine'):
    centroid = np.mean(data, axis=0).reshape(1, -1)
    return np.mean(pairwise_kernels(data, centroid, metric=metric))
def __init__(
    self,
    features,
    nfeatures,
    nside=16,
    include_haversine=False,
    galactic_mask=None,
    affinity="euclidean",
    scaler=preprocessing.StandardScaler(),
    file_affinity="",
    verbose=False,
    save_affinity=False,
    feature_weights=None,
):
    """
    - features: list of features to cluster
    - nfeatures: number of features
    """
    self._nside = nside
    self._nfeat = nfeatures
    if galactic_mask is None:
        self.galactic_mask = np.bool_(pl.ones_like(features[0]))
    else:
        self.galactic_mask = galactic_mask
        features[0] = features[0][galactic_mask]
        features[1] = features[1][galactic_mask]

    if self._nfeat > 1:
        assert features[0].shape[0] == features[1].shape[0]
        self._npix = features[0].shape[0]  # hp.nside2npix(nside)
    else:
        self._npix = features.shape[0]
    if feature_weights is None:
        feature_weights = pl.ones(self._nfeat)

    self.verbose = verbose
    self._X = pl.zeros((self._npix, self._nfeat))
    if self._nfeat == 1:
        features = [features]
    for i, x in zip(range(self._nfeat), features):
        self._X[:, i] = x
    # Standard rescaling of all the features
    if scaler is not None:
        self._X = scaler.fit_transform(self._X)
    for i in range(self._nfeat):
        self._X[:, i] *= feature_weights[i]

    self.estimate_affinity(affinity, file_affinity)

    self._has_angles = False
    if include_haversine:
        self._has_angles = True
        self.estimate_haversine()
    self._Affinity = pairwise_kernels(self._distance_matr, metric="precomputed")
    if save_affinity:
        pl.save(file_affinity, self._Affinity)
def gaussian(x, **kwargs):
    return pairwise_kernels(x, x, metric="rbf", **kwargs)
#!/usr/bin/env python
import numpy as np
from sklearn.preprocessing import KernelCenterer
from sklearn.metrics.pairwise import pairwise_kernels

X = np.array([[1., -2., 2.],
              [-2., 1., 3.],
              [4., 1., -2.]])
K = pairwise_kernels(X, metric='linear')

# center the kernel matrix with sklearn's KernelCenterer
transformer = KernelCenterer().fit(K)
centered_K = transformer.transform(K)
print(centered_K)

# equivalently, center with the centering matrix H = I - (1/n) * ones
H = np.eye(3) - (1.0 / 3) * np.ones((3, 3))
centered_K = H.dot(K).dot(H)
print(centered_K)
def _calc_kernel(self, X, Y=None):
    return pairwise_kernels(X, Y, metric=self.kernel_type)
def transform(self, X, Y=None):
    n = X.shape[1]
    H = np.eye(n) - ((1 / n) * np.ones((n, n)))
    kernel_X_X = pairwise_kernels(X=X.T, Y=X.T, metric=self.kernel)
    X_transformed = (self.Theta.T).dot(H).dot(kernel_X_X).dot(H)
    return X_transformed
mae_cv = np.zeros((n_folds, 1)) # -------------------------------------------------------------------------- for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)): x_train, x_test = x[train_idx], x[test_idx] y_train, y_test = y[train_idx], y[test_idx] print('CV iteration: %d' % (i_fold + 1)) # -------------------------------------------------------------------------- # Model gpr = GaussianProcessRegressor() X = np.atleast_2d(x) gramm_kernel = pairwise_kernels(X, metric='precomputed',filter_params=False) # -------------------------------------------------------------------------- # Model selection # Search space param_grid = {'kernel': [RBF(), WhiteKernel(), gramm_kernel]} # Gridsearch internal_cv = KFold(n_splits=5) grid_cv = GridSearchCV(estimator=gpr, param_grid=param_grid, cv=internal_cv, scoring='neg_mean_absolute_error', verbose=1, n_jobs=1) # --------------------------------------------------------------------------
def _density_func(self, data_matrix, target=None):
    kernel_matrix = pairwise_kernels(data_matrix, metric=self.metric, **self.kwds)
    # compute instance density as average pairwise similarity
    densities = np.mean(kernel_matrix, 0)
    return densities
def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
    """Trains the SSAD model on the training data."""
    logger = logging.getLogger()

    # do not drop last batch for non-SGD optimization shallow_ssad
    # train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
    #                           num_workers=n_jobs_dataloader, drop_last=False)
    train_loader = self.my_dataset.test_loader

    # Get data from loader
    X = ()
    semi_targets = []
    for data in train_loader:
        inputs, _, semi_targets_batch, _ = data
        inputs, semi_targets_batch = inputs.to(device), semi_targets_batch.to(device)
        if self.hybrid:
            # in hybrid approach, take code representation of AE as features
            inputs = self.ae_net.encoder(inputs)
        # X_batch.shape = (batch_size, n_channels * height * width)
        X_batch = inputs.view(inputs.size(0), -1)
        X += (X_batch.cpu().data.numpy(),)
        semi_targets += semi_targets_batch.cpu().data.numpy().astype(np.int64).tolist()
    X, semi_targets = np.concatenate(X), np.array(semi_targets)

    # Training
    logger.info('Starting training...')

    # Select model via hold-out test set of 1000 samples
    gammas = np.logspace(-7, 2, num=10, base=2)
    best_auc = 0.0

    # Sample hold-out set from test set
    # _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
    test_loader = self.my_dataset.test_loader

    X_test = ()
    labels = []
    for data in test_loader:
        inputs, label_batch, _, _ = data
        inputs, label_batch = inputs.to(device), label_batch.to(device)
        if self.hybrid:
            # in hybrid approach, take code representation of AE as features
            inputs = self.ae_net.encoder(inputs)
        # X_batch.shape = (batch_size, n_channels * height * width)
        X_batch = inputs.view(inputs.size(0), -1)
        X_test += (X_batch.cpu().data.numpy(),)
        labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
    X_test, labels = np.concatenate(X_test), np.array(labels)
    n_test = len(X_test)
    n_normal, n_outlier = np.sum(labels == 0), np.sum(labels == 1)
    n_val = int(0.1 * n_test)
    n_val_normal = int(n_val * (n_normal / n_test))
    n_val_outlier = int(n_val * (n_outlier / n_test))
    perm = np.random.permutation(n_test)
    X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal],
                            X_test[perm][labels[perm] == 1][:n_val_outlier]))
    labels = np.array([0] * n_val_normal + [1] * n_val_outlier)

    i = 1
    for gamma in gammas:
        # Build the training kernel
        kernel = pairwise_kernels(X, X, metric=self.kernel, gamma=gamma)

        # Model candidate
        model = ConvexSSAD(kernel, semi_targets, Cp=self.Cp, Cu=self.Cu, Cn=self.Cn)

        # Train
        start_time = time.time()
        model.fit()
        train_time = time.time() - start_time

        # Test on small hold-out set from test set
        kernel_val = pairwise_kernels(X_val, X[model.svs, :],
                                      metric=self.kernel, gamma=gamma)
        scores = (-1.0) * model.apply(kernel_val)
        scores = scores.flatten()

        # Compute AUC
        auc = roc_auc_score(labels, scores)

        logger.info(f'  | Model {i:02}/{len(gammas):02} | Gamma: {gamma:.8f} '
                    f'| Train Time: {train_time:.3f}s | Val AUC: {100. * auc:.2f} |')

        if auc > best_auc:
            best_auc = auc
            self.model = model
            self.gamma = gamma
            self.results['train_time'] = train_time
        i += 1

    # Get support vectors for testing
    self.X_svs = X[self.model.svs, :]

    # If hybrid, also train a model with linear kernel
    if self.hybrid:
        linear_kernel = pairwise_kernels(X, X, metric='linear')
        self.linear_model = ConvexSSAD(linear_kernel, semi_targets,
                                       Cp=self.Cp, Cu=self.Cu, Cn=self.Cn)
        start_time = time.time()
        self.linear_model.fit()
        train_time = time.time() - start_time
        self.results['train_time_linear'] = train_time
        self.linear_X_svs = X[self.linear_model.svs, :]

    logger.info(f'Best Model: | Gamma: {self.gamma:.8f} | AUC: {100. * best_auc:.2f}')
    logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
    logger.info('Finished training.')
n_jobs=self.n_jobs, degree=self.degree, gamma=self.gamma, coef0=self.coef0)


if __name__ == "__main__":
    X = np.random.normal(size=(1000, 100))
    Y = np.random.normal(size=(1000, 20))
    kcca = KCCA(n_components=10, kernel="rbf", n_jobs=1, epsilon=0.1).fit(X, Y)

    # matching on test data
    alpha = kcca.alpha
    beta = kcca.beta
    X_te = np.random.normal(size=(10, 100))
    Y_te = np.random.normal(size=(10, 20))
    Kx = kcca._pairwise_kernels(X_te, X)
    Ky = kcca._pairwise_kernels(Y_te, Y)
    F = np.dot(Kx, alpha)
    G = np.dot(Ky, beta)
    D = euclidean_distances(F, G)
    idx_pred = np.argmin(D, axis=0)
    print("matching result:", idx_pred, len(alpha), len(beta))

    # similarity between true object and predicted object on test data
    idx_true = range(10)
    C = pairwise_kernels(Y_te[idx_true], Y_te[idx_pred], metric="cosine")
    print("1-best mean similarity:", np.mean(C.diagonal()))
def fit(self, X, y, src_index, tgt_index, tgt_index_labeled=None, **fit_params): """ Fit KMM. Parameters ---------- X : numpy array Input data. y : numpy array Output data. src_index : iterable indexes of source labeled data in X, y. tgt_index : iterable indexes of target unlabeled data in X, y. tgt_index_labeled : iterable, optional (default=None) indexes of target labeled data in X, y. fit_params : key, value arguments Arguments given to the fit method of the estimator (epochs, batch_size...). Returns ------- self : returns an instance of self """ check_indexes(src_index, tgt_index, tgt_index_labeled) if tgt_index_labeled is None: Xs = X[src_index] ys = y[src_index] else: Xs = X[np.concatenate((src_index, tgt_index_labeled))] ys = y[np.concatenate((src_index, tgt_index_labeled))] Xt = X[tgt_index] n_s = len(Xs) n_t = len(Xt) # Get epsilon if self.epsilon is None: self.epsilon = (np.sqrt(n_s) - 1) / np.sqrt(n_s) # Compute Kernel Matrix K = pairwise.pairwise_kernels(Xs, Xs, metric=self.kernel, **self.kernel_params) K = (1 / 2) * (K + K.transpose()) # Compute q kappa = pairwise.pairwise_kernels(Xs, Xt, metric=self.kernel, **self.kernel_params) kappa = (n_s / n_t) * np.dot(kappa, np.ones((n_t, 1))) constraints = LinearConstraint(np.ones((1, n_s)), lb=n_s * (1 - self.epsilon), ub=n_s * (1 + self.epsilon)) def func(x): return (1 / 2) * x.T @ (K @ x) - kappa.T @ x weights = minimize(func, x0=np.ones((n_s, 1)), bounds=[(0, self.B)] * n_s, constraints=constraints)['x'] self.weights_ = np.array(weights).ravel() self.estimator_ = check_estimator(self.get_estimator, **self.kwargs) try: self.estimator_.fit(Xs, ys, sample_weight=self.weights_, **fit_params) except: bootstrap_index = np.random.choice(len(Xs), size=len(Xs), replace=True, p=self.weights_ / self.weights_.sum()) self.estimator_.fit(Xs[bootstrap_index], ys[bootstrap_index], **fit_params) return self
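# A compact standalone sketch of the kernel mean matching step in fit()
# above: minimize (1/2) w^T K w - kappa^T w subject to box and sum
# constraints. The data, the B bound, and the rbf metric are illustrative
# assumptions, not values from the original class.
import numpy as np
from scipy.optimize import minimize, LinearConstraint
from sklearn.metrics.pairwise import pairwise_kernels

rng = np.random.RandomState(0)
Xs = rng.normal(0.0, 1.0, (30, 2))   # source sample
Xt = rng.normal(1.0, 1.0, (30, 2))   # shifted target sample
n_s, n_t = len(Xs), len(Xt)
eps = (np.sqrt(n_s) - 1) / np.sqrt(n_s)
B = 1000.0                           # upper bound on the weights

K = pairwise_kernels(Xs, Xs, metric='rbf')
K = 0.5 * (K + K.T)                  # symmetrize, as in fit() above
kappa = (n_s / n_t) * pairwise_kernels(Xs, Xt, metric='rbf').dot(np.ones(n_t))

cons = LinearConstraint(np.ones((1, n_s)), lb=n_s * (1 - eps), ub=n_s * (1 + eps))
res = minimize(lambda w: 0.5 * w @ (K @ w) - kappa @ w,
               x0=np.ones(n_s), bounds=[(0, B)] * n_s, constraints=cons)
w = res.x
# source points lying closer to the target mode should get larger weights
print(w[Xs[:, 0] > 0.5].mean() > w[Xs[:, 0] <= 0.5].mean())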