def GeneralizedNormalKernel(X, Y=None, gamma=None, beta=1):
    """Compute the generalized normal kernel between X and Y.

    The generalized normal kernel is defined as::

        K(x, y) = exp(-gamma ||x-y||_1^beta)

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float, default None
        If None, defaults to 1.0 / n_features.
    beta : float, default 1
        Exponent applied to the L1 distance; beta=1 gives the Laplacian kernel.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    if beta == 1:
        # skip the (no-op) power for the common beta == 1 case
        K = -gamma * manhattan_distances(X, Y)
    else:
        K = -gamma * manhattan_distances(X, Y) ** beta
    np.exp(K, K)  # exponentiate K in-place
    return K
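# --- Usage sketch (illustrative, not from the original source) ---
# A minimal check that the kernel above matches its documented formula
# K(x, y) = exp(-gamma * ||x - y||_1 ** beta); all data below is made up.
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

rng = np.random.RandomState(0)
X = rng.random_sample((3, 4))
Y = rng.random_sample((2, 4))
gamma, beta = 1.0 / X.shape[1], 1.5

K = np.exp(-gamma * manhattan_distances(X, Y) ** beta)
expected = np.exp(-gamma * np.abs(X[0] - Y[1]).sum() ** beta)
assert np.isclose(K[0, 1], expected)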
def med_distances(X, Y):
    """Componentwise Manhattan distances between X and Y."""
    print('medutilDist: custom distance function')
    D = manhattan_distances(X, Y, sum_over_features=False)
    return D
def mean_absolute_error(x, y): vector = manhattan_distances(x, y) summation = np.sum(vector) mae = summation / y.shape[0] return mae
def MaternKernel(X, Y=None, gamma=None, p=0):
    """Compute the Matern kernel between X and Y.

    For half-integer smoothness nu = p + 1/2 and r = ||x-y||_1, the kernel is::

        p = 0:  K(x, y) = exp(-gamma r)
        p = 1:  K(x, y) = (1 + sqrt(3) gamma r) exp(-sqrt(3) gamma r)
        p = 2:  K(x, y) = (1 + sqrt(5) gamma r + 5/3 (gamma r)^2) exp(-sqrt(5) gamma r)

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float
    p : int, one of {0, 1, 2}

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    assert p == int(p)
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    r = manhattan_distances(X, Y)
    if p == 0:
        K = -gamma * r
        np.exp(K, K)  # exponentiate K in-place
    if p == 1:
        K = -gamma * r * math.sqrt(3)
        np.exp(K, K)  # exponentiate K in-place
        K *= (1 + gamma * r * math.sqrt(3))
    if p == 2:
        K = -gamma * r * math.sqrt(5)
        np.exp(K, K)  # exponentiate K in-place
        K *= (1 + gamma * r * math.sqrt(5) + 5. / 3. * (r * gamma) ** 2)
    return K
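# --- Illustrative check (assumed data, not from the original source) ---
# For half-integer Matern kernels, p = 0 reduces to the exponential (Laplacian)
# kernel and p = 1 is the Matern-3/2 form; both equal 1 at zero distance.
import math
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

rng = np.random.RandomState(1)
X = rng.random_sample((4, 3))
gamma = 1.0 / X.shape[1]
r = manhattan_distances(X, X)

K0 = np.exp(-gamma * r)                                                   # p = 0
K1 = (1 + math.sqrt(3) * gamma * r) * np.exp(-math.sqrt(3) * gamma * r)   # p = 1
assert np.allclose(np.diag(K0), 1.0) and np.allclose(np.diag(K1), 1.0)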
def search_query(images, query, proximity_by, rank_size, coding_kind):
    # print('The search has begun')
    if coding_kind == 1:
        # frequency of each color in the image, grouped into 32 clusters per color, 96 in total
        t = 'rgb_hist'
    elif coding_kind == 2:
        # describe the image based on the colors of the sampled pixels and the colors in the codebook
        t = 'feature_vector_pixels_hard'
    else:
        t = 'feature_vector_hist_hard'
    distances = None
    if proximity_by == 'ed':
        for img in images:
            distances = np.append(distances, euclidean_distances(img[t], query[t]))
    elif proximity_by == 'md':
        for img in images:
            distances = np.append(distances, manhattan_distances(img[t], query[t], sum_over_features=False))
    # drop the leading None placeholder
    distances = np.array(np.delete(distances, 0), dtype=float)
    # print('We have the distances')
    # print(distances)
    proximity_vector = None
    for i in range(rank_size):
        a = np.nanargmin(distances)
        proximity_vector = np.append(proximity_vector, int(a))
        distances[a] = np.nan
    proximity_vector = np.array(np.delete(proximity_vector, 0), dtype=int)
    # print(proximity_vector)
    return [proximity_vector]  # ['obj1__0', 'obj1__10', ...]
def compute_distance(matrix, v, distance_metric): if (distance_metric == 'cosine'): return cosine_similarity(matrix, v.reshape(1, -1)).reshape(-1) elif (distance_metric == 'l2'): return euclidean_distances(matrix, v.reshape(1, -1)).reshape(-1) elif (distance_metric == 'l1'): return manhattan_distances(matrix, v.reshape(1, -1)).reshape(-1) else: raise ValueError( 'Invalid distance metric, must be in [cosine, l1, l2]')
def compute_distance_matrix(model_matrix, pred_matrix, distance_matrix_type): distance_matrix = [] if (distance_matrix_type == 'euclidean'): distance_matrix = euclidean_distances(model_matrix, pred_matrix) elif (distance_matrix_type == 'cosine'): distance_matrix = cosine_distances(model_matrix, pred_matrix) elif (distance_matrix_type == 'manhatten'): distance_matrix = manhattan_distances(model_matrix, pred_matrix) return distance_matrix
def l1_dist(X, Y): ''' Computes l1 metric between X and Y. May need to modify since I swapped the rows and columns. X: A.layers['log1p'] Y: B.layers['log1p'] ''' dist_AA = manhattan_distances(X, X) dist_AB = manhattan_distances(X, Y) # nkc are the kallisto-cellranger distances dist_AB = np.diagonal(dist_AB) # ncc are the kallisto-kallisto distances AA = [] for row in dist_AA: val = np.partition(row, 1)[1] AA.append(val) dist_AA = AA return dist_AA, dist_AB
def ehd_calculations(features, vector, query_vector): dim = calculate_dimensions(vector, 'EHD') data_feature_ehd, query_feature_ehd = fit_pca(dim, vector, query_vector) # manhattan distance between query image and complete dataset man_dis_ehd = pd.DataFrame(manhattan_distances( query_feature_ehd, data_feature_ehd, sum_over_features=True).flatten(), columns=['ehd_dis']) dataset = pd.concat([features, man_dis_ehd], axis=1) # Manhattan Distance sorting # man_dis_result = dataset_ehd.sort_values(by='ehd_dis', ascending=True)[[0, 'ehd_dis']].head(20) return dataset
def scalable_align(cs_lat, cs_lon, swath_lat, swath_lon): """ """ (n, m) = swath_lat.shape swath_points = np.stack((swath_lat.flatten(), swath_lon.flatten())).T track_points = np.stack((cs_lat, cs_lon), axis=1) dist = manhattan_distances(swath_points, track_points) mapping = np.unravel_index(np.argmin(dist, axis=0), (n, m)) return mapping
def __init__(self): ''' Sets the scaling and distance functions that define the Manhattan Similarity Metric. (1 / (.0000000001 + d)) is from https://stats.stackexchange.com/a/158285 2000 is manually chosen, by @Zoran, to help scale. ''' self.set_name("Manhattan") self.set_scaling(lambda dst: np.tanh(400 * (1 / (.0000000001 + dst)))) self.set_dist(lambda wm, uv: manhattan_distances(wm, uv)[0][0])
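# --- Illustrative sketch of the scaling above (example values only) ---
# Distances near zero saturate to ~1, and the similarity decays as the
# Manhattan distance grows.
import numpy as np

def manhattan_similarity_scaling(dst):
    return np.tanh(400 * (1 / (.0000000001 + dst)))

for d in [0.0, 1.0, 400.0, 4000.0]:
    print(d, manhattan_similarity_scaling(d))
# 0.0 -> ~1.0, 1.0 -> ~1.0, 400.0 -> ~0.76 (tanh(1)), 4000.0 -> ~0.10 (tanh(0.1))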
def measure_distance(self, sentences:list): transformed_sentences:list=self.transformer.extract_text_features_simple(sentences) #first= manhattan_distances(transformed_sentences,self.featurized_ex1) #second=manhattan_distances(transformed_sentences,self.featurized_ex2) third=manhattan_distances(transformed_sentences,self.featurized_cl1) return third #, np.var((first,second,third),axis=0)
def _predict_derivatives(self, x, kx): """ Evaluates the derivatives at a set of points. Arguments --------- x : np.ndarray [n_evals, dim] Evaluation point input variable values kx : int The 0-based index of the input variable with respect to which derivatives are desired. Returns ------- y : np.ndarray Derivative values. """ kx += 1 # Initialization n_eval, n_features_x = x.shape x = (x - self.X_mean) / self.X_std # Get pairwise componentwise L1-distances to the input training set dx = manhattan_distances(x, Y=self.X_norma.copy(), sum_over_features= False) d = self._componentwise_distance(dx) # Compute the correlation function r = self.options['corr'](self.optimal_theta, d).reshape(n_eval,self.nt) if self.options['corr'].__name__ != 'squar_exp': raise ValueError( 'The derivative is only available for square exponential kernel') if self.options['poly'].__name__ == 'constant': df = np.array([0]) elif self.options['poly'].__name__ == 'linear': df = np.zeros((self.nx + 1, self.nx)) df[1:,:] = 1 else: raise ValueError( 'The derivative is only available for ordinary kriging or '+ 'universal kriging using a linear trend') # Beta and gamma = R^-1(y-FBeta) beta = self.optimal_par['beta'] gamma = self.optimal_par['gamma'] df_dx = np.dot(df.T, beta) d_dx=x[:,kx-1].reshape((n_eval,1))-self.X_norma[:,kx-1].reshape((1,self.nt)) if self.name != 'Kriging' and 'KPLSK' not in self.name: theta = np.sum(self.optimal_theta * self.coeff_pls**2,axis=1) else: theta = self.optimal_theta y = (df_dx[0]-2*theta[kx-1]*np.dot(d_dx*r,gamma))*self.y_std/self.X_std[kx-1] return y
def compute_distances(X, Y, metric, metric_params): if metric == 'manhattan': distances = manhattan_distances(X, Y) elif metric == 'euclidean': distances = euclidean_distances(X, Y) elif metric == 'cosine': distances = cosine_distances(X, Y) elif metric == 'bm25': distances = bm25_similarity(X, Y, metric_params) return distances
def evaluate(test_emb, test_id, params): unique_ids, unique_counts = np.unique(test_id, return_counts=True) unique_ids = unique_ids[unique_counts >= 2] good_test_indices = np.in1d(test_id, unique_ids) valid_test_embs = test_emb[good_test_indices] valid_test_ids = test_id[good_test_indices] n_correct_at_k = np.zeros(params.max_k) if len(test_emb) < 40000: if params.dist == 'cos': #distances = 1.-np.dot(valid_test_embs, test_emb.T) distances = find_cos_distances(valid_test_embs, test_emb) elif params.dist == 'l2': distances = find_l2_distances(valid_test_embs, test_emb) elif params.dist == 'l1': distances = manhattan_distances(valid_test_embs, test_emb) elif params.dist == 'max_l1' or params.dist == 'max_l2': distances = max_distances(valid_test_embs, test_emb, params.dist) elif params.dist == 'softmax_l1': distances = softmax_distances(valid_test_embs, test_emb, params.softmax_l1_beta) elif params.dist == 'p_norm': distances = p_norm_distances(valid_test_embs, test_emb, params.p_norm_p) elif params.dist == 'max_n': distances = max_n_distances(valid_test_embs, test_emb, params.max_n) for idx, valid_test_id in enumerate(valid_test_ids): k_sorted_indices = np.argsort(distances[idx])[1:] first_correct_position = np.where( test_id[k_sorted_indices] == valid_test_id)[0][0] if first_correct_position < params.max_k: n_correct_at_k[first_correct_position:] += 1 return 100. * n_correct_at_k / len(valid_test_ids) else: #if params.dist == 'cos': # metric='cosine' #else: metric = 'l2' metric = 'l2' nn = NearestNeighbors(n_neighbors=params.max_k + 1, metric=metric, algorithm='kd_tree', n_jobs=-1).fit(test_emb) distances, indices = nn.kneighbors(valid_test_embs) for idx, valid_test_id in enumerate(valid_test_ids): k_sorted_indices = indices[idx] correct_positions = np.where( test_id[k_sorted_indices] == valid_test_id)[0][1:] first_correct_position = params.max_k if len(correct_positions) > 0: first_correct_position = correct_positions[0] - 1 if first_correct_position < params.max_k: n_correct_at_k[first_correct_position:] += 1 return 100. * n_correct_at_k / len(valid_test_ids)
def _multi_gini_seg(data, groups): """ Calculation of Multigroup Gini Segregation index Parameters ---------- data : a pandas DataFrame groups : list of strings. The variables names in data of the groups of interest of the analysis. Returns ------- statistic : float Multigroup Gini Segregation Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Reference: :cite:`reardon2002measures`. """ core_data = data[groups] data = _nan_handle(core_data) df = np.array(core_data) K = df.shape[1] T = df.sum() ti = df.sum(axis=1) pik = df / ti[:, None] Pk = df.sum(axis=0) / df.sum() Is = (Pk * (1 - Pk)).sum() elements_sum = np.empty(K) for k in range(K): aux = np.multiply(np.outer(ti, ti), manhattan_distances(pik[:, k].reshape(-1, 1))).sum() elements_sum[k] = aux multi_Gini_Seg = elements_sum.sum() / (2 * (T**2) * Is) return multi_Gini_Seg, core_data, groups
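# --- Toy worked example (hypothetical counts, not from the original source) ---
# The same multigroup Gini computation on a 3-unit x 2-group table, without the
# DataFrame plumbing, to make the formula above concrete.
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

df = np.array([[80.0, 20.0],
               [50.0, 50.0],
               [10.0, 90.0]])
K = df.shape[1]
T = df.sum()
ti = df.sum(axis=1)
pik = df / ti[:, None]            # within-unit group proportions
Pk = df.sum(axis=0) / T           # overall group proportions
Is = (Pk * (1 - Pk)).sum()        # diversity term sum_k Pk * (1 - Pk)

elements_sum = np.empty(K)
for k in range(K):
    # |p_ik - p_jk| over all unit pairs, weighted by t_i * t_j
    elements_sum[k] = np.multiply(np.outer(ti, ti),
                                  manhattan_distances(pik[:, k].reshape(-1, 1))).sum()

multi_gini_seg = elements_sum.sum() / (2 * (T ** 2) * Is)
print(round(multi_gini_seg, 3))   # between 0 (no segregation) and 1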
def manhattan_dist(a, b):
    ######## MANHATTAN DISTANCE ########
    # INPUT
    #   a: 1-D array
    #   b: 1-D array
    #   a and b have the same length
    # OUTPUT
    #   d: Manhattan distance between a and b
    # sklearn expects 2-D inputs, so wrap the 1-D vectors as single rows
    # and pull the scalar distance out of the 1x1 result
    d = manhattan_distances([a], [b])[0][0]
    return d
def countError(self, data):
    error = 0
    for cluster in range(self.n_clusters):
        clusterData = []
        for i in range(len(data)):
            if (self.labels_[i] == cluster):
                clusterData.append(data[i])
        distanceMatrix = manhattan_distances(
            clusterData, [self.clusters_centers_[cluster]])
        # distanceMatrix[i][0]: column 0 because only one cluster center is
        # passed, so each row holds a single distance
        for i in range(len(distanceMatrix)):
            error += distanceMatrix[i][0]
    return error
def distance(instance1, instance2, dis_type): if dis_type == 1: if instance1 == None or instance2 == None: return float("inf") sumOfSquares = 0 for i in range(1, len(instance1)): sumOfSquares += (instance1[i] - instance2[i])**2 elif dis_type == 2: #print(instance1) #print(instance2) if instance1 == None or instance2 == None: return float("inf") sumOfSquares = 0 #for i in range(1, len(instance1)): instance1 = list(instance1) instance2 = list(instance2) a = instance1.pop(0) a = instance2.pop(0) sumOfSquares += manhattan_distances([instance1], [instance2]) elif dis_type == 3: #print(instance1) #print(instance2) if instance1 == None or instance2 == None: return float("inf") sumOfSquares = 0 #for i in range(1, len(instance1)): instance1 = list(instance1) instance2 = list(instance2) a = instance1.pop(0) a = instance2.pop(0) sumOfSquares += (1 - cosine_similarity([instance1], [instance2])) elif dis_type == 4: #print(instance1) #print(instance2) if instance1 == None or instance2 == None: return float("inf") sumOfSquares = 0 #for i in range(1, len(instance1)): instance1 = list(instance1) instance2 = list(instance2) a = instance1.pop(0) a = instance2.pop(0) sumOfSquares += (1 - jaccard(instance1, instance2)) #print("The centroid is :") #print(instance2) #print("The SSE is :") #sse = 0 #for i in range(1, len(instance1)): # sse += (instance1[i] - instance2[i])**2 #print(sse) return sumOfSquares
def calculate_similarity(beer1, beer2): # find common reviewers beer_1_reviewers = df[df.beer_name==beer1].review_profilename.unique() beer_2_reviewers = df[df.beer_name==beer2].review_profilename.unique() common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers) # get reviews beer_1_reviews = get_beer_reviews(beer1, common_reviewers) beer_2_reviews = get_beer_reviews(beer2, common_reviewers) dists = [] for f in ALL_FEATURES: dists.append(manhattan_distances(beer_1_reviews[f], beer_2_reviews[f])[0][0]) return dists
def investor_helper(betas): # weights for market porfolio mkt = betas.sum(axis=0) / betas.sum() # "AUM" weights to aggregate market portfolio x = betas.sum(axis=1) aum = x / x.sum() nbetas = betas / x[:, None] # distance to AUM weighted market portfolio l2 = cosine_similarity(X=betas, Y=np.expand_dims(mkt, axis=0)).flatten() l1 = 1 - manhattan_distances(X=nbetas, Y=np.expand_dims(mkt, axis=0), sum_over_features=True).flatten() / 2 return (aum, l2, l1)
def uclidean_distance(self, centroids, vector, key, cluster):
    # Despite the (misspelled) name, this assigns the point to the centroid
    # with the smallest Manhattan distance.
    result = {}
    for i in range(len(centroids)):
        try:
            result['c' + str(i)] = manhattan_distances(
                np.array(centroids[i]).reshape(1, -1),
                np.array(vector).reshape(1, -1))
            # result['c'+str(i)] = distance.euclidean(centroids[i], vector)
        except Exception:
            print(centroids[i])
            exit()
    result = sorted(result.items(), key=lambda x: x[1])
    result = dict(result)
    cluster[list(result.keys())[0]].append(key)
def get_actor_actor_similarity_matrix(similarity_measure=3, consider_zero_tag_vectors=True): actor_list, actor_tag_matrix = __get_actor_tag_matrix__(consider_zero_tag_vectors) actor_actor_similarity_matrix = [[[] for _ in range(len(actor_tag_matrix))] for _ in range(len(actor_tag_matrix))] if similarity_measure == 0: # inverse of manhattan distance matrix generation actor_actor_similarity_matrix = \ __get_similarity_from_distance_matrix__(manhattan_distances(actor_tag_matrix)) elif similarity_measure == 1: # cosine similarity matrix generation actor_actor_similarity_matrix = cosine_similarity(actor_tag_matrix) elif similarity_measure == 2: # inverse of cosine distance matrix generation actor_actor_similarity_matrix = \ __get_similarity_from_distance_matrix__(cosine_distances(actor_tag_matrix)) elif similarity_measure == 3: # mahalanobis distance calculation but the distance value is inverted i.e 1/value # once the distance is calculated and put in the similarity metric cov_matrix = np.cov(np.transpose(actor_tag_matrix)) for actor_index in range(len(actor_tag_matrix)): for actor_index_2 in range(len(actor_tag_matrix)): dist = distance.mahalanobis(u=actor_tag_matrix[actor_index], v=actor_tag_matrix[actor_index_2], VI=cov_matrix) if dist == 0: actor_actor_similarity_matrix[actor_index][actor_index_2] = 1 else: actor_actor_similarity_matrix[actor_index][actor_index_2] = 1/(1 + dist) else: # Euclidean distance measure # uses the distance function given above that returns reciprocal values of distance # making it mimic a similarity measure for actor_index in range(len(actor_tag_matrix)): for actor_index_2 in range(len(actor_tag_matrix)): dist = __get_actor_actor_euclidean_similarity__( actor_tag_matrix[actor_index], actor_tag_matrix[actor_index_2]) actor_actor_similarity_matrix[actor_index][actor_index_2] = dist return actor_list, actor_actor_similarity_matrix
def TFIDFManhattanDistance(doc, alldoc):
    cleantext = alldocclean_(alldoc)
    tfidf = TfidfVectorizer()
    tfs = tfidf.fit_transform(cleantext)
    # get_feature_names() was removed in recent scikit-learn releases;
    # get_feature_names_out() is the replacement
    df = pd.DataFrame(tfs.toarray(), columns=tfidf.get_feature_names_out())
    mansim = manhattan_distances(df)
    mansimdf = pd.DataFrame(mansim)
    mansimpair = []
    for index in range(len(alldoc)):
        if (doc != index):
            temp = []
            temp.append(index)
            temp.append(mansimdf.at[index, doc])
            mansimpair.append(temp)
    return mansimpair
def _multi_gini_seg(data, groups): """Calculate Multigroup Gini Segregation index. Parameters ---------- data : a pandas DataFrame dataframe holding group data groups : list of strings. The variables names in data of the groups of interest of the analysis. Returns ------- statistic : float Multigroup Gini Segregation Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Reference: :cite:`reardon2002measures`. """ core_data = data[groups] df = np.array(core_data) K = df.shape[1] T = df.sum() ti = df.sum(axis=1) pik = df / ti[:, None] pik = np.nan_to_num( pik) # Replace NaN from zerodivision when unit has no population Pk = df.sum(axis=0) / df.sum() Is = (Pk * (1 - Pk)).sum() elements_sum = np.empty(K) for k in range(K): aux = np.multiply(np.outer(ti, ti), manhattan_distances(pik[:, k].reshape(-1, 1))).sum() elements_sum[k] = aux multi_Gini_Seg = elements_sum.sum() / (2 * (T**2) * Is) if isinstance(data, GeoDataFrame): core_data = data[[data.geometry.name]].join(core_data) return multi_Gini_Seg, core_data, groups
def write_distances_manhattan(visual_desc,file2): distance_list=[] id_list=[] distance=pd.DataFrame() #print(id_list) for input in visual_desc: for first in file2: dist = manhattan_distances(input[2:].reshape(1,-1),first[2:].reshape(1,-1))[0][0] distance_list.append(dist) id_list.append([input[0],input[1],first[0],first[1]]) #print(id_list) #distance=distance.append(pd.DataFrame({'Distance':[dist]})) distance=distance.append(pd.DataFrame(id_list, columns=['Input Location','Input Id','Second Location','Second Id'])) distance.insert(loc=0,column='Distance',value=distance_list) distance_sorted = distance.sort_values('Distance',ascending=True) return(distance_sorted)
def nn_ind(self, color_hist, num): """ Exact nearest neighbor seach through exhaustive comparison. """ if self.distance_metric == 'manhattan': dists = manhattan_distances(color_hist, self.hists_reduced) elif self.distance_metric == 'euclidean': dists = euclidean_distances(color_hist, self.hists_reduced, squared=True) elif self.distance_metric == 'chi_square': dists = -additive_chi2_kernel(color_hist, self.hists_reduced) dists = dists.flatten() nn_ind = np.argsort(dists).flatten()[:num] nn_dists = dists[nn_ind] return nn_ind, nn_dists
def KDE(x, kn_negs, Z, beta):
    '''
    Estimate the density at x with a kernel density estimator whose kernel is
    exp(-beta * ||x - x_i||_1), i.e. a Laplacian (exponential) kernel on the
    Manhattan distance.
    '''
    num_neg_instances = 0
    p = 0
    x = np.reshape(x, (1, x.shape[0]))
    for instance in kn_negs:
        num_neg_instances += 1
        inst = np.reshape(instance, (1, instance.shape[0]))
        dist = manhattan_distances(x, inst)
        # manhattan_distances returns a 1x1 array; pass the scalar to math.exp
        p = p + math.exp(-1 * beta * dist[0, 0])
    p = p / (Z * num_neg_instances)
    return p
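# --- Vectorized equivalent (a sketch under the same assumptions) ---
# x is a 1-D feature vector, kn_negs a 2-D array of reference instances, and Z
# a normalizing constant; one manhattan_distances call replaces the Python loop.
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

def kde_laplacian_l1(x, kn_negs, Z, beta):
    dists = manhattan_distances(np.reshape(x, (1, -1)), np.asarray(kn_negs))
    return np.exp(-beta * dists).mean() / Z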
def manhattan_from_data(training_data, test_data): """Manhattan distance Parameters ---------- training_data : dict test_data : dict Returns ------- distance : dict """ training_data = pd.DataFrame.from_dict(training_data, orient='index') test_data = pd.DataFrame.from_dict(test_data, orient='index') return manhattan_distances(test_data, training_data)
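# --- Usage sketch for manhattan_from_data (made-up data, illustrative only) ---
# Dictionaries are keyed by sample id, one feature vector per entry; rows of the
# result correspond to test samples and columns to training samples.
training_data = {'u1': [1.0, 2.0, 3.0],
                 'u2': [0.0, 1.0, 1.0]}
test_data = {'t1': [1.0, 1.0, 1.0]}

dist = manhattan_from_data(training_data, test_data)
print(dist)  # distances of t1: |1-1|+|1-2|+|1-3| = 3 to u1, 1 to u2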
def gradient(self, x): """ Calculate the gradient of the posterior mean and variance Note that the nugget effect will not the change the computation below """ check_is_fitted(self, 'X') # Check input shapes x = np.atleast_2d(x) n_eval, _ = x.shape n_samples, n_features = self.X.shape if _ != n_features: raise Exception('x does not have the right size!') if n_eval != 1: raise Exception('x must be a vector!') # trend and its Jacobian f = self.regr(x).T f_dx = self.regr_dx(x) # correlation and its Jacobian d = manhattan_distances(x, Y=self.X, sum_over_features=False) r = self.corr(self.theta_, d).reshape(n_eval, n_samples) r_dx = self.corr_dx(x, X=self.X, r=r) # gradient of the posterior mean y_dx = dot(f_dx, self.beta) + dot(r_dx, self.gamma) # auxiliary variable: rt = C^-1 * r rt = solve_triangular(self.C, r.T, lower=True) rt_dx = solve_triangular(self.C, r_dx.T, lower=True).T # auxiliary variable: u = Ft^T * rt - f u = dot(self.Ft.T, rt) - f u_dx = dot(rt_dx, self.Ft) - f_dx mse_dx = -dot(rt_dx, rt) # for Simple Kriging if self.beta0 is None: # for Universal Kriging Ft2inv = inv(dot(self.Ft.T, self.Ft)) mse_dx += dot(u_dx, Ft2inv).dot(u) mse_dx = 2.0 * self.sigma2 * mse_dx return y_dx, mse_dx
def process_similarity(self, similarity): if similarity == "cosine": x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = cosine_similarity( self._data.sp_i_train_ratings.T)[x, y] elif similarity == "dot": self._similarity_matrix = ( self._data.sp_i_train_ratings.T @ self._data.sp_i_train_ratings).toarray() elif similarity == "euclidean": x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = ( 1 / (1 + euclidean_distances(self._data.sp_i_train_ratings.T)))[x, y] elif similarity == "manhattan": x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = ( 1 / (1 + manhattan_distances(self._data.sp_i_train_ratings.T)))[x, y] elif similarity == "haversine": x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = ( 1 / (1 + haversine_distances(self._data.sp_i_train_ratings.T)))[x, y] elif similarity == "chi2": x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = ( 1 / (1 + chi2_kernel(self._data.sp_i_train_ratings.T)))[x, y] elif similarity in ['cityblock', 'l1', 'l2']: x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = (1 / (1 + pairwise_distances( self._data.sp_i_train_ratings.T, metric=similarity)))[x, y] elif similarity in [ 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule' ]: x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) self._similarity_matrix[x, y] = (1 / (1 + pairwise_distances( self._data.sp_i_train_ratings.T.toarray(), metric=similarity)) )[x, y] else: raise Exception("Not implemented similarity")
def predict(self, df): predictions = np.zeros(len(df)) callersInPipeline = list() for i in range(0, self.num_clusters): weight = len([1 for num in self.clusters if num == i]) distances = manhattan_distances(df.T, self.centers[i]) bestIndex = np.argmin(distances) bestCallerName = df.columns[bestIndex] representativeCaller = df[bestCallerName] callersInPipeline.append(bestCallerName) predictions += weight * representativeCaller predictions /= len(self.clusters) predictions = self.transformToPredictions(predictions) return predictions, callersInPipeline
def get_similarities(dtm, labels, sim_measure): # Try different distance metrics if sim_measure == 'cosine': dist = 1 - cosine_similarity(dtm) elif sim_measure == 'euclidean': dist = euclidean_distances(dtm) elif sim_measure == 'manhattan': dist = manhattan_distances(dtm) dist_df = pd.DataFrame((1 - dist), columns=labels, index=labels) # dist_df = dist_df[sorted(labels)] # dist_df = dist_df.sort_index() dist_df.to_csv('{}/distance-{}-matrix.csv'.format(path, sim_measure)) # Hierarchical Clustering get_hierarchical_clustering(dist, labels, path) return dist_df
def get_course_specific_similarities(course, dtm, labels, sim_measure): dtm = dtm.toarray() index = labels.index(course) # Try different distance metrics if sim_measure == 'cosine': dist = 1 - cosine_similarity(dtm, [dtm[index]]) elif sim_measure == 'euclidean': dist = euclidean_distances(dtm, [dtm[index]]) elif sim_measure == 'manhattan': dist = manhattan_distances(dtm, [dtm[index]]) dist_df = pd.DataFrame((1 - dist), columns=[course], index=labels) # dist_df = dist_df[sorted(labels)] # dist_df = dist_df.sort_index() dist_df.to_csv('{}/distance-{}-matrix-{}.csv'.format( path, sim_measure, course)) return dist_df
def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=1), base_weighter=KernelMeanMatching(), similarity="euclidean"): self._base_estimator = base_estimator self._base_weighter = base_weighter if similarity == "euclidean": self._similarity = lambda x, y: 1 / (1 + pw.euclidean_distances( x, y)) elif similarity == "cosine": self._similarity = pw.cosine_similarity elif similarity == "l1": self._similarity = lambda x, y: 1 / (1 + pw.manhattan_distances( x, y)) else: self._similarity = similarity
def CM(imgsvec1, imgsvec2, imgpairs=True): #calculate the pair wise manhattan distance between two vector matrices allDis = manhattan_distances(imgsvec1, imgsvec2) #average of all the distances between one vector of first matrix to all the vectors of second matrix, for all vectors of first matrix imgDis = numpy.mean(allDis, axis=1) #average of all the distances from previous step to get the final combined distance between all images finalDis = numpy.mean(imgDis, axis=0) #if return most similar image pairs is True, also return the most similar images if imgpairs: return finalDis, sim_images(allDis) return finalDis
def nn(feat, feats, distance='euclidean', K=-1): """ Exact nearest neighbor seach through exhaustive comparison. """ if distance == 'manhattan': dists = metrics.manhattan_distances(feat, feats) elif distance == 'euclidean': dists = metrics.euclidean_distances(feat, feats, squared=True) elif distance == 'chi_square': dists = -metrics.additive_chi2_kernel(feat, feats) dists = dists.flatten() if K > 0: nn_ind = bn.argpartsort(dists, K).flatten()[:K] nn_ind = nn_ind[np.argsort(dists[nn_ind])] else: nn_ind = np.argsort(dists) nn_dist = dists[nn_ind] return nn_ind, nn_dist
def get_nearst_n_hist_index(self, color_hist, num=10):
    """
    Return the indices of the n nearest color histograms and the distances to them.
    :param color_hist: histogram of the color palette
    :param num: number of nearest neighbours to return
    :return:
        nearst_index: the indices of the neighbours
        nearst_dists: the distances to the neighbours
    """
    # ------ Manhattan distance ------
    # ------ histogram similarity evaluation
    dists = manhattan_distances(color_hist, self.img_set.img_set_hist)
    dists = dists.flatten()
    nearst_index = np.argsort(dists).flatten()[:num]
    nearst_dists = dists[nearst_index]
    nearst_file_name = []
    for i in nearst_index:
        nearst_file_name.append(self.img_set.img_load_order[i])
    if self.debug:
        print('pic file names', nearst_file_name)
    return nearst_index, nearst_dists
def _nn(self, image_id, feature, distance='cosine', K=-1): """ Exact nearest neighbor seach through exhaustive comparison. """ # S = self.S[feature] feats = self.features[feature] feat = feats[self.index.index(image_id)] if distance == 'manhattan': dists = metrics.manhattan_distances(feat, feats) elif distance == 'euclidean': dists = metrics.euclidean_distances(feat, feats, squared=True) elif distance == 'chi_square': dists = -metrics.additive_chi2_kernel(feat, feats) elif distance == 'dot': dists = -np.dot(feats, feat) elif distance == 'cosine': feats_norm = self.features_norm[feature] dists = -np.dot(feats, feat) / feats_norm / np.linalg.norm(feat, 2) elif distance == 'projected': feats = self.features_proj[feature] feat = feats[self.index.index(image_id)] dists = sklearn.utils.extmath.row_norms(feats - feat) dists = dists.flatten() if K > 0: nn_ind = np.argsort(dists).flatten()[:K] else: nn_ind = np.argsort(dists) nn_dist = dists[nn_ind] return nn_ind, nn_dist
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
from flask import Flask, render_template, request from flaskext.markdown import Markdown import json import folium import glob # imports for classifier re-example from lyrics_classifier import LyricsClf lclf = LyricsClf('classifier.p') # imports for beer similarity import pandas as pd import numpy as np from sklearn.metrics.pairwise import manhattan_distances beers = pd.read_csv('beer.csv').set_index('Beer') distances = manhattan_distances(beers) # setup application app = Flask(__name__) app.debug = True Markdown(app) def blog_posts(): """this function should search for the blog posts that exist in the folder /posts, and create a list of tuples with the cleaned up title and the filename. ex: ['2015-04-06_my_first_blog_post'] """ files = glob.glob("posts/*.md") return [f.split('/')[1].split('.')[0] for f in files]
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # Test haversine distance # The data should be valid latitude and longitude X = rng.random_sample((5, 2)) X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2 X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi S = pairwise_distances(X, metric="haversine") S2 = haversine_distances(X) assert_array_almost_equal(S, S2) # Test haversine distance, with Y != X Y = rng.random_sample((2, 2)) Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2 Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi S = pairwise_distances(X, Y, metric="haversine") S2 = haversine_distances(X, Y) assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def _e_step(self, X,w): self.labels_ = array([v[1]*v[0] for v in zip(w,manhattan_distances(X,self.cluster_centers_))]).argmin(axis=1)
def _e_step(self): self.labels_ = manhattan_distances(self.vectors, self.cluster_centers_).argmin(axis=1)
def med_distance_test(X, Y):
    print('medutilDist: testing cross distances function (equivalent to manhattan_distances)')
    D = manhattan_distances(X, Y, sum_over_features=False)
    return D
        x, y = dtm[i, :], dtm[j, :]
        dist[i, j] = np.sqrt(np.sum((x - y)**2))
    return dist

dist_eukl = zero_space(dist)
dist_cos = zero_space(dist)
dist_man = zero_space(dist)

dist_eukl = euclidean_distances(dtm)
np.round(dist_eukl, 1)

dist_cos = 1 - cosine_similarity(dtm)
np.round(dist_cos, 2)

dist_man = manhattan_distances(dtm)
np.round(dist_man, 1)

norms = np.sqrt(np.sum(dtm * dtm, axis=1, keepdims=True))
dtm_normed = dtm / norms
similarities = np.dot(dtm_normed, dtm_normed.T)
np.round(similarities, 2)

# Visualizing distances
print(dells_activities_data_frame.shape)

# use dictionary object for mapping the response/target variable
activity_to_binary = {'NO': 0, 'YES': 1, '': 0}
for iname in binary_variable_names:
    dells_activities_data_frame[iname] = \
        dells_activities_data_frame[iname].map(activity_to_binary)
print(dells_activities_data_frame[0:10])  # examine the first 10 rows of data

# convert DataFrame to numpy array representation of activities matrix
# (DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement)
activities_binary_matrix = dells_activities_data_frame.to_numpy().transpose()
print(type(activities_binary_matrix))
print(activities_binary_matrix.shape)

# compute distance matrix
distance_matrix = manhattan_distances(activities_binary_matrix)
print(distance_matrix.shape)

# apply the multidimensional scaling algorithm and plot the map
mds_method = manifold.MDS(n_components=2, random_state=9999,
                          dissimilarity='precomputed')
mds_fit = mds_method.fit(distance_matrix)
mds_coordinates = mds_method.fit_transform(distance_matrix)

activity_names = ['Shopping', 'Antiquing', 'Site Seeing', 'Fine Dining',
                  'Casual Dining', 'Family Style Dining', 'Fast Food Dining',
                  'Museums', 'Indoor Pool', 'Outdoor Pool', 'Hiking',
                  'Gambling', 'Boating/Swimming', 'Fishing', 'Golfing',
                  'Boat Tours', 'Ride the Ducks', 'Amusement Park',
                  'Minigolf', 'Go-carting', 'Waterpark', 'Circus World',
                  'Tommy Bartlett Ski Show',
def _e_step(self, X): self.labels_ = manhattan_distances(X, self.cluster_centers_).argmin(axis=1)
def test_pairwise_distances(): """ Test the pairwise_distance helper function. """ rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # manhattan does not support sparse matrices atm. assert_raises(ValueError, pairwise_distances, csr_matrix(X), metric="manhattan") # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Tests that precomputed metric returns pointer to, and not copy of, X. S = np.dot(X, X.T) S2 = pairwise_distances(S, metric="precomputed") assert_true(S is S2) # Test with sparse X and Y, # currently only supported for euclidean and cosine X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski")
def predict(self, X): return manhattan_distances(X, self.cluster_centers_).argmin(axis=1)
def predict(self, X, eval_MSE=False, batch_size=None): """ This function evaluates the Gaussian Process model at x. Parameters ---------- X : array_like An array with shape (n_eval, n_features) giving the point(s) at which the prediction(s) should be made. eval_MSE : boolean, optional A boolean specifying whether the Mean Squared Error should be evaluated or not. Default assumes evalMSE = False and evaluates only the BLUP (mean prediction). batch_size : integer, optional An integer giving the maximum number of points that can be evaluated simultaneously (depending on the available memory). Default is None so that all given points are evaluated at the same time. Returns ------- y : array_like, shape (n_samples, ) or (n_samples, n_targets) An array with shape (n_eval, ) if the Gaussian Process was trained on an array of shape (n_samples, ) or an array with shape (n_eval, n_targets) if the Gaussian Process was trained on an array of shape (n_samples, n_targets) with the Best Linear Unbiased Prediction at x. MSE : array_like, optional (if eval_MSE == True) An array with shape (n_eval, ) or (n_eval, n_targets) as with y, with the Mean Squared Error at x. """ check_is_fitted(self, "X") # Check input shapes X = check_array(X) n_eval, _ = X.shape n_samples, n_features = self.X.shape n_samples_y, n_targets = self.y.shape # Run input checks self._check_params(n_samples) if X.shape[1] != n_features: raise ValueError(("The number of features in X (X.shape[1] = %d) " "should match the number of features used " "for fit() " "which is %d.") % (X.shape[1], n_features)) if batch_size is None: # No memory management # (evaluates all given points in a single batch run) # Normalize input X = (X - self.X_mean) / self.X_std # Initialize output y = np.zeros(n_eval) if eval_MSE: MSE = np.zeros(n_eval) # Get pairwise componentwise L1-distances to the input training set dx = manhattan_distances(X, Y=self.X, sum_over_features=False) # Get regression function and correlation f = self.regr(X) r = self.corr(self.theta_, dx).reshape(n_eval, n_samples) # Scaled predictor y_ = np.dot(f, self.beta) + np.dot(r, self.gamma) # Predictor y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets) if self.y_ndim_ == 1: y = y.ravel() # Mean Squared Error if eval_MSE: C = self.C if C is None: # Light storage mode (need to recompute C, F, Ft and G) if self.verbose: print("This GaussianProcess used 'light' storage mode " "at instantiation. Need to recompute " "autocorrelation matrix...") reduced_likelihood_function_value, par = \ self.reduced_likelihood_function() self.C = par['C'] self.Ft = par['Ft'] self.G = par['G'] rt = linalg.solve_triangular(self.C, r.T, lower=True) if self.beta0 is None: # Universal Kriging u = linalg.solve_triangular(self.G.T, np.dot(self.Ft.T, rt) - f.T, lower=True) else: # Ordinary Kriging u = np.zeros((n_targets, n_eval)) MSE = np.dot(self.sigma2.reshape(n_targets, 1), (1. - (rt ** 2.).sum(axis=0) + (u ** 2.).sum(axis=0))[np.newaxis, :]) MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets) # Mean Squared Error might be slightly negative depending on # machine precision: force to zero! MSE[MSE < 0.] = 0. 
if self.y_ndim_ == 1: MSE = MSE.ravel() return y, MSE else: return y else: # Memory management if type(batch_size) is not int or batch_size <= 0: raise Exception("batch_size must be a positive integer") if eval_MSE: y, MSE = np.zeros(n_eval), np.zeros(n_eval) for k in range(max(1, n_eval / batch_size)): batch_from = k * batch_size batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) y[batch_from:batch_to], MSE[batch_from:batch_to] = \ self.predict(X[batch_from:batch_to], eval_MSE=eval_MSE, batch_size=None) return y, MSE else: y = np.zeros(n_eval) for k in range(max(1, n_eval / batch_size)): batch_from = k * batch_size batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) y[batch_from:batch_to] = \ self.predict(X[batch_from:batch_to], eval_MSE=eval_MSE, batch_size=None) return y
def predict(self, X, eval_MSE=False, transformY=True, returnRV=False, integratedPrediction= False, eval_confidence_bounds=False,coef_bound=1.96, batch_size=None): """ This function evaluates the Gaussian Process model at x. Parameters ---------- X : array_like An array with shape (n_eval, n_features) giving the point(s) at which the prediction(s) should be made. eval_MSE : boolean, optional A boolean specifying whether the Mean Squared Error should be evaluated or not. Default assumes evalMSE = False and evaluates only the BLUP (mean prediction). transformY : boolean, optional A boolean specifying if the predicted values should correspond to the same space as the data given to the fit method, or to the warped space (in which the GP is fitted). Default is True. Setting to False can be useful to compute the Expected Improvement in an optimization process. returnRV : boolean, optional A boolean specifying if the method should return the predicted random variables at x instead of a float number. Default is False. integratedPrediction : boolean, optional A boolean specifying if the method should return the fully Bayesian prediction, ie compute the expectation given the posterior in the original space. If False, the returned value is the inverse value (by the mapping function) of the GP prediction. This is much more faster as the integratedPrediction needs to numerically compute the integral. Default is False. eval_confidence_bounds : boolean, optional A boolean specifying if the method should return the confidence bounds. Because of the non-linearity of the mapping function, this cannot be computed directly with the MSE, but needs to invert the mapping function. Default is False. If True, coef_bound specifies the boundary to compute. coef_bound : float, optional A float specifying the confidence bounds to compute. Upper and lower confidence bounds are computed as the inverse of m + coef_bound*sigma where m and sigma are the mean and the std of the posterior in the GP space. Default is 1.96 which corresponds to the 95% confidence bounds. batch_size : integer, optional An integer giving the maximum number of points that can be evaluated simultaneously (depending on the available memory). Default is None so that all given points are evaluated at the same time. Returns ------- y : array_like, shape (n_samples,) Prediction at x. MSE : array_like, optional (if eval_MSE == True) Mean Squared Error at x. LCB : array_like, optional (if eval_confidence_bounds == True) Lower confidence bound. UCB : array_like, optional (if eval_confidence_bounds == True) Upper confidence bound. """ # Check input shapes X = sk_utils.array2d(X) n_eval, _ = X.shape n_samples, n_features = self.X.shape n_samples_y, n_targets = self.y.shape if(n_targets > 1): raise ValueError('More than one target in the Y outputs. 
\ Currently only 1D outputs are handled') # Run input checks self._check_params(n_samples) if X.shape[1] != n_features: raise ValueError(("The number of features in X (X.shape[1] = %d) " "should match the number of features used " "for fit() " "which is %d.") % (X.shape[1], n_features)) # Normalize input if self.normalize: X = (X - self.X_mean) / self.X_std # Initialize output y = np.zeros(n_eval) if eval_MSE: MSE = np.zeros(n_eval) # Get pairwise componentwise L1-distances to the input training set dx = manhattan_distances(X, Y=self.X, sum_over_features=False) # Get regression function and correlation f = self.regr(X) r = self.corr(self.theta, dx).reshape(n_eval, n_samples) # Scaled predictor y_ = np.dot(f, self.beta) + np.dot(r, self.gamma) # Predictor y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets) # transform the warped y, modeled as a Gaussian, to the real y size = y.shape[0] warped_y = np.copy(y) if(transformY): if( np.sum([ y[i][0] > 8.2 for i in range(size)]) >0): print('Warning : mapping_inversion failed') real_y = [ self.mapping_inv(X[i],y[i][0]) for i in range(size)] real_y = self.raw_y_std * np.asarray(real_y) +self.raw_y_mean y = real_y.reshape(n_eval, n_targets) if self.y_ndim_ == 1: y = y.ravel() warped_y = warped_y.ravel() # Mean Squared Error if eval_MSE: C = self.C if C is None: # Light storage mode (need to recompute C, F, Ft and G) if self.verbose: print("This GaussianProcess used 'light' storage mode " "at instantiation. Need to recompute " "autocorrelation matrix...") reduced_likelihood_function_value, par = \ self.reduced_likelihood_function() self.C = par['C'] self.Ft = par['Ft'] self.G = par['G'] rt = linalg.solve_triangular(self.C, r.T, lower=True) if self.beta0 is None: # Universal Kriging u = linalg.solve_triangular(self.G.T, np.dot(self.Ft.T, rt) - f.T) else: # Ordinary Kriging u = np.zeros((n_targets, n_eval)) MSE = np.dot(self.sigma2.reshape(n_targets, 1), (1. - (rt ** 2.).sum(axis=0) + (u ** 2.).sum(axis=0))[np.newaxis, :]) MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets) # Mean Squared Error might be slightly negative depending on # machine precision: force to zero! MSE[MSE < 0.] = 0. if self.y_ndim_ == 1: MSE = MSE.ravel() sigma = np.sqrt(MSE) if(returnRV): return [ self.predicted_RV([warped_y[i]],sigma[i],X[i]) for i in range(size)] else: if(eval_confidence_bounds): if not(transformY): print('Warning, transformY set to False but trying to evaluate conf bounds') warped_y_with_boundL = warped_y - coef_bound * sigma warped_y_with_boundU = warped_y + coef_bound * sigma pred_with_boundL = self.raw_y_std * np.asarray( [ self.mapping_inv(X[i],warped_y_with_boundL[i])[0] for i in range(size) ] ) +self.raw_y_mean pred_with_boundU = self.raw_y_std * np.asarray( [ self.mapping_inv(X[i],warped_y_with_boundU[i])[0] for i in range(size)] ) +self.raw_y_mean if(integratedPrediction): lb = self.raw_y_min - 3.*(self.raw_y_max-self.raw_y_min) ub = self.raw_y_max + 3.*(self.raw_y_max-self.raw_y_min) print(lb,ub) integrated_real_y = [ self.integrate_prediction([warped_y[i]],sigma[i],X[i],lb,ub) for i in range(size)] integrated_real_y = np.asarray(integrated_real_y) print('Integrated prediction') return integrated_real_y,MSE,pred_with_boundL,pred_with_boundU else: return y,MSE,pred_with_boundL,pred_with_boundU else: return y, MSE else: return y, MSE else: return y