def __calc_distances__(self, v1s, v2s, is_sparse=True):
    if is_sparse:
        dcosine = np.array([cosine(x.toarray(), y.toarray())
                            for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dcityblock = np.array([cityblock(x.toarray(), y.toarray())
                               for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dcanberra = np.array([canberra(x.toarray(), y.toarray())
                              for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        deuclidean = np.array([euclidean(x.toarray(), y.toarray())
                               for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dminkowski = np.array([minkowski(x.toarray(), y.toarray(), 3)
                               for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dbraycurtis = np.array([braycurtis(x.toarray(), y.toarray())
                                for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dskew_q1 = [skew(x.toarray().ravel()) for x in v1s]
        dskew_q2 = [skew(x.toarray().ravel()) for x in v2s]
        dkur_q1 = [kurtosis(x.toarray().ravel()) for x in v1s]
        dkur_q2 = [kurtosis(x.toarray().ravel()) for x in v2s]
    else:
        dcosine = np.array([cosine(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dcityblock = np.array([cityblock(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dcanberra = np.array([canberra(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        deuclidean = np.array([euclidean(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dminkowski = np.array([minkowski(x, y, 3) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dbraycurtis = np.array([braycurtis(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
        dskew_q1 = [skew(x) for x in v1s]
        dskew_q2 = [skew(x) for x in v2s]
        dkur_q1 = [kurtosis(x) for x in v1s]
        dkur_q2 = [kurtosis(x) for x in v2s]
    dskew_diff = np.abs(np.array(dskew_q1) - np.array(dskew_q2)).reshape((-1, 1))
    dkur_diff = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape((-1, 1))
    return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                      dminkowski, dbraycurtis, dskew_diff, dkur_diff))
def subsample_points_low(pointcloud1, pointcloud2, num_subsampled_points,
                         rotation_ab, translation_ab):
    # (num_points, 3)
    pointcloud1 = pointcloud1.T
    pointcloud2 = pointcloud2.T
    num_points = pointcloud1.shape[0]
    nbrs1 = NearestNeighbors(n_neighbors=num_subsampled_points,
                             algorithm='auto',
                             metric=lambda x, y: minkowski(x, y)).fit(pointcloud1)
    nbrs2 = NearestNeighbors(n_neighbors=num_subsampled_points,
                             algorithm='auto',
                             metric=lambda x, y: minkowski(x, y)).fit(pointcloud2)
    random_idx1 = np.random.choice(num_points)
    random_p1 = pointcloud1[random_idx1, :]
    # index of the point in pointcloud2 farthest from random_p1
    distance = np.sum((pointcloud2 - random_p1) ** 2, axis=-1)
    random_idx2 = np.argmax(distance)
    random_p2 = pointcloud1[random_idx2, :]
    idx1 = nbrs1.kneighbors(random_p1.reshape(1, -1),
                            return_distance=False).reshape((num_subsampled_points,))
    idx2 = nbrs2.kneighbors(random_p2.reshape(1, -1),
                            return_distance=False).reshape((num_subsampled_points,))
    pointcloud1 = pointcloud1[idx1, :]
    pointcloud2 = pointcloud2[idx2, :]
    pointcloud2 = rotation_ab.apply(pointcloud2).T + np.expand_dims(translation_ab, axis=1)
    return pointcloud1.T, pointcloud2
def farthest_subsample_points(pointcloud1, pointcloud2,
                              num_subsampled_points=768):
    pointcloud1 = pointcloud1.T
    pointcloud2 = pointcloud2.T
    num_points = pointcloud1.shape[0]
    nbrs1 = NearestNeighbors(n_neighbors=num_subsampled_points,
                             algorithm='auto',
                             metric=lambda x, y: minkowski(x, y)).fit(pointcloud1)
    random_p1 = np.random.random(size=(1, 3)) + np.array([[500, 500, 500]]) * np.random.choice([1, -1, 1, -1])
    idx1 = nbrs1.kneighbors(random_p1,
                            return_distance=False).reshape((num_subsampled_points,))
    nbrs2 = NearestNeighbors(n_neighbors=num_subsampled_points,
                             algorithm='auto',
                             metric=lambda x, y: minkowski(x, y)).fit(pointcloud2)
    # Reuse the same anchor point for both clouds. Carving out partial clouds
    # this way may not work very well: the two subsets can still overlap a lot.
    random_p2 = random_p1  # np.random.random(size=(1, 3)) + np.array([[500, 500, 500]]) * np.random.choice([1, -1, 2, -2])
    idx2 = nbrs2.kneighbors(random_p2,
                            return_distance=False).reshape((num_subsampled_points,))
    # print('random_p1', random_p1)
    # print('pointcloud1', np.max(pointcloud1))
    # print('pointcloud2', np.max(pointcloud2))
    return pointcloud1[idx1, :].T, pointcloud2[idx2, :].T
def get_nearest_neighbour(x, vectors, p=2):
    """
    Finds the vector in <vectors> with the minimum distance from the vector
    parameter x.

    Args:
        x: 1xn vector.
        vectors: qxn matrix whose rows are nearest-neighbour candidates of x.
        p: order of the Minkowski distance. Default is 2 (Euclidean distance).

    Returns:
        Tuple with the index of the nearest neighbour of x in vectors and the
        distance between the nearest neighbour and x.

    Raises:
        InvalidParameterException: Raised if x and the vectors in <vectors>
            have different dimensions.
    """
    if x.shape[0] != vectors.shape[1]:
        raise exc.InvalidParameterException("[Exception]: vector parameter x must have the same 'n' "
                                            "dimensions as in the qxn <vectors> matrix.")
    minimum_distance = minkowski(x, vectors[0], p)
    minimum_index = 0
    for index, v in enumerate(vectors):
        distance = minkowski(x, v, p)
        if minimum_distance > distance:
            minimum_distance = distance
            minimum_index = index
    # print(minimum_index)
    return (minimum_index, minimum_distance)
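# A minimal usage sketch for get_nearest_neighbour above, assuming numpy is
# imported as np and minkowski comes from scipy.spatial.distance; the query
# and candidate values are invented for illustration.
candidates = np.array([[0.0, 0.0], [1.0, 1.0], [3.0, 4.0]])
query = np.array([0.9, 1.2])
idx, dist = get_nearest_neighbour(query, candidates, p=2)
print(idx, dist)  # expected: index 1, the row closest to the query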
def covariance_and_correlation(arr):
    arr = np.reshape(arr, (-1, 7))
    # arr = [a[1:] for a in arr]
    return_val = []
    for val1 in arr:
        vals = []
        # min_1 = Parallel(n_jobs=-1)(delayed(distance.minkowski)(val1, val2, 1) for val2 in arr)
        # min_2 = Parallel(n_jobs=-1)(delayed(distance.minkowski)(val1, val2, 2) for val2 in arr)
        for val2 in arr:
            # if val1 == val2:
            #     continue
            # to_append = np.ndarray.flatten(np.cov(val1, val2)).tolist()
            # coors = np.correlate(val1, val2).tolist()
            to_append = []
            # Minkowski distances of order 1 and 2
            min_1 = distance.minkowski(val1, val2, 1)
            min_2 = distance.minkowski(val1, val2, 2)
            # cor = correlation(val1, val2)
            # to_append += coors
            min_d = [float(min_1), float(min_2)]
            to_append += min_d
            vals.append(to_append)
        return_val.append(np.mean(vals))
    return np.mean(return_val, axis=0)
def kmeans(k, max_itr, x, init_method, distance_measure, power=3):
    cluster = []
    wcss = []
    centroid = []
    old_centroid = []
    n = x.shape[0]
    index = 0
    # choose the initialization method
    if init_method == 'random':
        random.seed(3)
        for i in range(k):
            centroid.append(x[random.randrange(n)])
        centroid = np.array(centroid)
    elif init_method == 'K++':
        centroid = kpp(k, x)
    else:
        centroid = x[list(range(k)), :]
    print("\nInitialization method :" + init_method)
    # choose the distance measure (order h of the Minkowski norm)
    if distance_measure == 'Manhattan':
        h = 1
    elif distance_measure == 'Euclidean':
        h = 2
    elif distance_measure == 'Minkowski':
        h = power
    print("Distance measure :" + distance_measure)
    for i in range(n):
        cluster.append(-1)
    cluster = np.array(cluster).reshape(n, 1)
    print("K=", k)
    # k-means loop
    for i in range(max_itr):
        old_centroid = np.copy(centroid)
        for j in range(n):
            mini = float('inf')
            for z in range(k):
                tmp = distance.minkowski(x[j], centroid[z], h)
                if tmp < mini:
                    mini = tmp
                    index = z
            cluster[j] = index
        centroid = reinit(k, np.array(cluster), x, centroid)
        # if the centroids do not change, we have converged
        if (old_centroid == centroid).all():
            print("\nConverged at iteration:", i, "\n")
            break
    # calculate WCSS
    total = 0
    for r in range(n):
        total = total + distance.minkowski(x[r], centroid[int(cluster[r])], h)
    wcss.append([total, k])
    return cluster, centroid, wcss
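# A hedged usage sketch for the kmeans routine above, with invented toy data;
# it assumes the kpp() and reinit() helpers referenced by kmeans are defined
# elsewhere in the project.
data = np.random.rand(100, 2)
clusters, centroids, wcss = kmeans(k=3, max_itr=50, x=data,
                                   init_method='K++',
                                   distance_measure='Minkowski', power=3)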
def _process_input(self, input_np_array):
    bmu = self._get_BMU(input_np_array)
    for neu in self.map_neurons.values():
        nhash = str(neu.x_c) + "" + str(neu.y_c)
        # weight adjustment if the neuron is in the neighbourhood of the BMU
        if minkowski(bmu.coords().astype(float), neu.coords().astype(float), 2) < self.nr:
            neu.weight_vs = neu.weight_vs + self.lr * (input_np_array - neu.weight_vs)
            neu.res_err += minkowski(neu.weight_vs, bmu.weight_vs, 2)
            self.map_neurons[nhash] = neu
    # growth
    if bmu.res_err > self.thresh:
        neu = bmu
        down = str(neu.x_c) + str(int(neu.y_c) - 1)
        up = str(neu.x_c) + str(int(neu.y_c) + 1)
        left = str(int(neu.x_c) - 1) + str(neu.y_c)
        right = str(int(neu.x_c) + 1) + str(neu.y_c)
        nei_coords = np.array([down, up, left, right])
        nei_coordi = np.array([[neu.x_c, int(neu.y_c) - 1],
                               [neu.x_c, int(neu.y_c) + 1],
                               [int(neu.x_c) - 1, neu.y_c],
                               [int(neu.x_c) + 1, int(neu.y_c)]])
        p = 0
        for coord in nei_coords:
            n = None
            try:
                n = self.map_neurons[coord]
                n.res_err += self.fd * n.res_err
            except KeyError:
                nwron = neuron(nei_coordi[p][0], nei_coordi[p][1], self.dim)
                new_weight = 0
                # case a) the new node has two consecutive nodes on one of its sides
                # case b) the new node lies between two old nodes
                new_weight_b = self._type_b_weight_init(p, neu)
                new_weight_a = self._type_a_weight_init(p, neu)
                new_weight_c = self._type_c_weight_init(p, neu)
                if new_weight_b.all() == 0:
                    if new_weight_a.all() == 0:
                        if new_weight_c.all() == 0:
                            new_weight = np.ndarray(shape=(self.dim))
                            new_weight.fill(0.5)
                        else:
                            new_weight = new_weight_c
                    else:
                        new_weight = new_weight_a
                else:
                    new_weight = new_weight_b
                nwron.weight_vs = new_weight
                n = nwron
            self.map_neurons[coord] = n
            p += 1
        bmu.res_err = self.thresh / 2
        self.map_neurons[str(bmu.x_c) + "" + str(bmu.y_c)] = bmu
    return bmu.coords()
def run(proc_id, return_dict, counter, dataset, test_index,
        indices_train_examples, algorithm, relevant_only):
    try:
        results = np.zeros(len(indices_train_examples))
        for array_index, example_index in enumerate(indices_train_examples):
            ###
            # Prepare examples
            ###
            if algorithm == 'feature_based':
                # feature-based data is 2d-structured (examples, features)
                test_example = dataset.x_test_TSFresh_features[test_index, :]
                train_example = dataset.x_train_TSFresh_features[example_index, :]
            elif relevant_only:
                test_example = dataset.x_test[test_index]
                test_example, train_example = dataset.reduce_to_relevant(test_example, example_index)
            else:
                test_example = dataset.x_test[test_index]
                train_example = dataset.x_train[example_index]
            ##
            # Execute algorithm
            ##
            if algorithm == 'dtw':
                distance, _ = fastdtw(test_example, train_example, dist=euclidean)
            elif algorithm == 'dtw_weighting_nbr_features':
                distance, _ = fastdtw(test_example, train_example, dist=euclidean)
                distance = distance / test_example.shape[1]
            elif algorithm == 'feature_based':
                if relevant_only:
                    masking = dataset.get_ts_fresh_masking(example_index)
                    weights = masking / (np.sum(masking))
                    distance = minkowski(test_example, train_example, 2, weights)
                    # Adjustment based on feature amount (improved performance)
                    small_num_of_attributes_penalty = (1 / (np.sum(masking)))
                    # if small_num_of_attributes_penalty > 1:
                    #     small_num_of_attributes_penalty = 1
                    distance = distance * small_num_of_attributes_penalty
                else:
                    distance = minkowski(test_example, train_example, 2)
            else:
                raise ValueError('Unknown algorithm:', algorithm)
            results[array_index] = distance
            counter.increment()
        return_dict[proc_id] = results
    except KeyboardInterrupt:
        pass
def similarity_metric(spectrum_A, spectrum_B, metric, power_value, p=3):
    """
    ARGUMENTS:
    spectrum_A, spectrum_B: signals/spectra to be compared
    metric: similarity metric to be used (possible choices: 'cosine_sim',
        'euclidean', 'minkowski', 'cross_correlation' (TODO: last one incomplete))
    power_value: power to which the result of the similarity metric is raised
    RETURNS:
    value of the similarity metric
    """
    spectrum_A = np.array(spectrum_A).flatten()
    spectrum_B = np.array(spectrum_B).flatten()
    if metric == 'cosine_sim':
        return cosine_sim_given_spectra(spectrum_A, spectrum_B, power_value)
    elif metric == 'euclidean':
        return euclidean_distance(spectrum_A, spectrum_B, power_value)
    elif metric == 'minkowski':
        zero_vector = np.zeros(spectrum_A.size)
        norm_A = minkowski(u=spectrum_A, v=zero_vector, p=p, w=None)
        norm_B = minkowski(u=spectrum_B, v=zero_vector, p=p, w=None)
        spectrum_A_normed = spectrum_A / norm_A
        spectrum_B_normed = spectrum_B / norm_B
        minkowski_AB = minkowski(u=spectrum_A_normed, v=spectrum_B_normed, p=p, w=None)
        return 1. - minkowski_AB
    elif metric == 'cross_correlation':
        # Not sure whether this differs from the scalar product, since the
        # default mode of np.correlate is 'valid':
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.correlate.html
        corr_A_B = np.correlate(spectrum_A, spectrum_B)
        corr_A_A = np.correlate(spectrum_A, spectrum_A)
        corr_B_B = np.correlate(spectrum_B, spectrum_B)
        normalized_corr = corr_A_B / (corr_A_A * corr_B_B)
        return np.power(normalized_corr, power_value)
    else:
        raise NotImplementedError("The chosen metric is not implemented.")
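# A minimal usage sketch for similarity_metric; the two toy spectra are
# invented for illustration, and the cosine_sim_given_spectra /
# euclidean_distance helpers are assumed to be defined elsewhere in the module.
spec_a = np.array([0.1, 0.4, 0.3, 0.2])
spec_b = np.array([0.1, 0.3, 0.4, 0.2])
sim = similarity_metric(spec_a, spec_b, metric='minkowski', power_value=1, p=3)
print(sim)  # close to 1.0 for similar, L3-normalized spectra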
def distances(a, b, method='euclidean'):
    if method == 'manhattan':
        return distance.minkowski(a, b, 1)
    elif method == 'euclidean':
        return distance.minkowski(a, b, 2)
    elif method == 'l3':
        return distance.minkowski(a, b, 3)
    elif method == 'bhat':
        return -math.log(sum(np.sqrt(a * b)))
    elif method == 'intersection':
        return len(a) / (sum(np.minimum(a, b)))
    elif method == 'corr':
        return 1.0 - np.correlate(a, b)
    else:
        return 0
def feature3(data):
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)
    q1_clean = np.nan_to_num(question1_vectors)
    q2_clean = np.nan_to_num(question2_vectors)
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1_clean, q2_clean)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1_clean, q2_clean)]
    data['skew_q1vec'] = [skew(x) for x in q1_clean]
    data['skew_q2vec'] = [skew(x) for x in q2_clean]
    data['kur_q1vec'] = [kurtosis(x) for x in q1_clean]
    data['kur_q2vec'] = [kurtosis(x) for x in q2_clean]
    return data
def nearest_centroid_clustering(X_train, X_test, y_train, y_test, parameters,
                                evaluation_metrics):
    # modify parameters before calling the clustering algorithm; this mainly
    # serves to resolve the distance parameter
    modified_parameters = prepare_parameters(parameters)
    if modified_parameters["distance"] == "minkowski" and modified_parameters["minkowski_p"] is not None:
        initial_classifier = NearestCentroid(
            metric=lambda x, y: distance.minkowski(x, y, modified_parameters["minkowski_p"]))
    else:
        if modified_parameters["distance"] == "mahalanobis":
            initial_classifier = NearestCentroid(
                metric="mahalanobis",
                metric_params={"V": np.cov(X_train)})  # TODO: fix
        else:
            initial_classifier = NearestCentroid(metric=modified_parameters["distance"])
    classifier = initial_classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    evaluation_metrics["accuracy"] = classifier.score(X_test, y_test)
    return evaluation_metrics
def calculate_distance(X, Y, metric='euclidean'):
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # wminkowski requires the order p and a weight vector; use p=2 with
        # unit weights as a neutral default
        return distance.wminkowski(X, Y, 2, np.ones(len(X)))
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # mahalanobis requires an inverse covariance matrix; the identity
        # matrix reduces it to the Euclidean distance
        return distance.mahalanobis(X, Y, np.eye(len(X)))
    elif metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))
    elif metric == METRIC_COSINE:
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
    raise ValueError("Unknown metric: %s" % metric)
def find_similar_to_team(team_stats, filtered_list, similarity='pearson'):
    avg_stats = team_stats[0]
    sd_stats = team_stats[1]
    avg_stats['Age'] = 21
    sd_stats = 1 / sd_stats
    sd_stats = sd_stats / sd_stats.sum()
    avg_stats = avg_stats.values.reshape(-1, len(avg_stats))
    fil_list = filtered_list[TEAM_ATTRIBUTES]
    fil_list = fil_list.mul(sd_stats, axis=1)
    avg_series = pd.Series(avg_stats.flatten(), index=fil_list.columns)
    if similarity == 'pearson':
        pearson_sim = fil_list.corrwith(avg_series, axis=1)
        filtered_list['pearson'] = pearson_sim
        return filtered_list.sort_values(['pearson'], ascending=False)
    if similarity == 'cosine':
        cos_sim = cosine_similarity(fil_list, avg_stats)
        filtered_list['cosine'] = pd.Series([x for row in cos_sim for x in row],
                                            index=filtered_list.index)
        return filtered_list.sort_values(['cosine'], ascending=False)
    if similarity == 'minkowski':
        minkowski_sim = []
        for index, row in fil_list.iterrows():
            minkowski_sim.append(minkowski(row.values, avg_stats.flatten(), p=2))
        filtered_list['minkowski'] = pd.Series(minkowski_sim,
                                               index=filtered_list.index)
        return filtered_list.sort_values(['minkowski'], ascending=False)
def get_w2v_simi(query, title):
    q_vec = np.nan_to_num(sent2vec(query))
    t_vec = np.nan_to_num(sent2vec(title))
    w2v_cosine = cosine(q_vec, t_vec)
    w2v_cityblock = cityblock(q_vec, t_vec)
    w2v_jaccard = jaccard(q_vec, t_vec)
    w2v_canberra = canberra(q_vec, t_vec)
    w2v_euclidean = euclidean(q_vec, t_vec)
    w2v_minkowski = minkowski(q_vec, t_vec)
    w2v_braycurtis = braycurtis(q_vec, t_vec)
    w2v_skew_qvec = skew(q_vec)
    w2v_skew_tvec = skew(t_vec)
    w2v_kur_qvec = kurtosis(q_vec)
    w2v_kur_tvec = kurtosis(t_vec)
    outlist = [w2v_cosine, w2v_cityblock, w2v_jaccard, w2v_canberra,
               w2v_euclidean, w2v_minkowski, w2v_braycurtis,
               w2v_skew_qvec, w2v_skew_tvec, w2v_kur_qvec, w2v_kur_tvec]
    outformat = ':'.join(['{}'] * len(outlist))
    return outformat.format(*outlist)
def Dist(array1, array2, dist):
    if dist == 'braycurtis':
        return distance.braycurtis(array1, array2)
    elif dist == 'correlation':
        return distance.correlation(array1, array2)
    elif dist == 'mahalanobis':
        # mahalanobis requires an inverse covariance matrix; the identity
        # matrix makes it equivalent to the Euclidean distance
        return distance.mahalanobis(array1, array2, np.eye(len(array1)))
    elif dist == 'minkowski':
        return distance.minkowski(array1, array2)
    elif dist == 'seuclidean':
        # seuclidean requires a variance vector; unit variances make it
        # equivalent to the Euclidean distance
        return distance.seuclidean(array1, array2, np.ones(len(array1)))
    elif dist == 'sqeuclidean':
        return distance.sqeuclidean(array1, array2)
    elif dist == 'pearsonp':
        r, p = pearsonr(array1, array2)
        return p
    elif dist == 'pearsonr':
        r, p = pearsonr(array1, array2)
        return r
    elif dist == 'spearmanp':
        r, p = spearmanr(array1, array2)
        return p
    elif dist == 'spearmanr':
        r, p = spearmanr(array1, array2)
        return r
def rmsd(cluster_list, vectors, clu_vectors):
    '''
    Not a proper RMSD: it is the mean squared deviation of the CG modes with
    respect to the AT modes, averaged over clusters.
    '''
    N_clu = len(cluster_list)
    RMSD = 0
    N = len(vectors)
    for j in range(N_clu):
        clu_vector = clu_vectors[j]
        clu_true_vectors = vectors[cluster_list[j]]
        # normalize and multiply by 100
        if np.linalg.norm(clu_vector) < 1e-7:
            clu_vector += np.ones(3) * 1e-6
        clu_vector = (clu_vector / np.linalg.norm(clu_vector)) * 100
        D = 0
        for v in clu_true_vectors:
            # normalize vectors to unit norm
            if np.linalg.norm(v) < 1e-7:
                v += np.ones(3) * 1e-6
            v = (v / np.linalg.norm(v)) * 100
            d = minkowski(v, clu_vector, p=2)
            D += d ** 2
        RMSD += float(D)
    return (RMSD / float(N_clu * N)) / 100.0
def extend_with_features(data):
    stop_words = stopwords.words('english')
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)
    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
def compute_neighbors(self, X_test):
    ## Figure out the power p for the Minkowski distance
    if self.metric == 'manhattan':
        self.minkowski_p = 1
    elif self.metric == 'euclidean':
        self.minkowski_p = 2
    elif self.metric == 'minkowski':
        pass
    else:
        self.minkowski_p = 2
        print("WARNING: Unknown distance metric: %s specified! Reverting to euclidean"
              % (self.metric))
    ## Compute the distance of each test point to all train points
    distance_matrix = np.zeros([self.numObs])
    neighbor_id = np.zeros([self.numObs])
    for test_idx in range(self.XTestNumObs):
        for n_idx in range(self.numObs):
            neighbor_id[n_idx] = n_idx
            distance_matrix[n_idx] = minkowski(X_test[test_idx], self.X[n_idx],
                                               self.minkowski_p)
        ## Sort and record distances and neighbors
        sorted_distances = sorted(zip(distance_matrix, neighbor_id, self.y))
        for k_idx in range(self.k):
            (self.XTestNeighborsDist[test_idx, k_idx],
             self.XTestNeighborsId[test_idx, k_idx],
             self.XTestNeighborsY[test_idx, k_idx]) = sorted_distances[k_idx]
def compute_pl_distance(pls, median_pl, p, dest_dir, file_names, patient_category):
    diffs = []
    for pl in range(pls.shape[0]):  # loop over patients
        patient_dist_from_avg = []
        for h_dim in range(pls.shape[1]):  # loop over homology dimensions
            patient_dist_from_avg.append(
                distance.minkowski(pls[pl, h_dim, :].flatten(),
                                   median_pl[h_dim, :].flatten(), p))
        diffs.append(patient_dist_from_avg)
    diffs = np.array(diffs)
    # with open(dest_dir + ".npy", "wb") as f:
    #     np.save(f, diffs)
    diffs = pd.DataFrame(diffs, columns=["H_0", "H_1", "H_2"])
    file_names = np.array(file_names)
    outliers = pd.DataFrame()
    # select the patients who are outliers
    for col in diffs.columns:
        outliers[col] = list(file_names[np.array(diffs.nlargest(140, columns=col).index)])
    outliers.to_csv(dest_dir + f"outliers_{patient_category}.csv", index=False)
    diffs.index = file_names
    diffs.to_csv(dest_dir + f"distance_from_median_pl_{patient_category}.csv",
                 index=True)
def process_batch(self, batch_np_array, k=10):
    start_time = time.time()
    for j in range(k):
        self.map_sizes.append(len(self.map_neurons.keys()))
        for i in range(batch_np_array.shape[0]):
            sys.stdout.write("iteration %d :" % (j + 1))
            sys.stdout.write(" : NR = %d: " % (self.nr))
            sys.stdout.write(" input %d " % (i))
            sys.stdout.write(" map size %d " % (len(self.map_neurons.keys())))
            sys.stdout.write(" time %d \r" % (time.time() - start_time))
            sys.stdout.flush()
            tinp = batch_np_array[i]
            bcoords = self.process_input(tinp)
            bhash = str(bcoords[0]) + "" + str(bcoords[1])
            winner = self.map_neurons[bhash]
            # here's the tricky part
            score = minkowski(winner.weight_vs, tinp, 2)  # /self.dim
            winner.coassoc_vs[i] = score
            winner.binarycoassoc_vs[i] = 1
            self.map_neurons[bhash] = winner
        self.nr = self.nr * (1 - self.lr)
        self.lr = self.lr * self.lr_red_coef * (1 - 3.85 / len(self.map_neurons.values()))
        if self.nr <= 1:
            print self.nr
            return
    return
def findClosest(textVector, genre):
    '''
    Args:
        textVector: vector of data analyzed from a text
        genre: genre of text ("fiction" or "nonfiction")
    Returns:
        a string explaining the three most stylistically similar authors and
        their Minkowski distances from the author's style, ordered from least
        distance to greatest
    '''
    distanceTuples = []
    THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))
    if genre == "fiction":
        standardizedCompSet = readVectors(
            os.path.join(THIS_FOLDER, "data/exemplaryFictionDataSTDD.csv"))
    if genre == "nonfiction":
        standardizedCompSet = readVectors(
            os.path.join(THIS_FOLDER, "data/exemplaryNonfictionDataSTDD.csv"))
    stdTextVector = standardizeVector(textVector, genre)
    # generate the Minkowski distance for each comparison text
    for vector in standardizedCompSet:
        distance = minkowski(stdTextVector[1:], vector[1:], 2)
        distanceTuples.append((round(distance, 2), vector[0]))
    # return the three closest (distance, author) tuples
    distanceTuples.sort()
    similarityReport = "The authors in our database whose styles are most like yours " \
        "are {}, with a difference quotient of {}, followed by {} ({}) and {} ({}).".format(
            distanceTuples[0][1], str(distanceTuples[0][0]),
            distanceTuples[1][1], str(distanceTuples[1][0]),
            distanceTuples[2][1], str(distanceTuples[2][0]))
    return similarityReport
def dataframe_with_bmu_for_each_input_vector_and_distance(self, passed_dataframe):
    """creates a dataframe with the BMU of each indicator and the distance
    between each pair of indicators"""
    temp_dataframe = copy.deepcopy(passed_dataframe)
    print temp_dataframe
    new_ind = pd.DataFrame(columns=['bmu_indicator1', 'indicator_1',
                                    'bmu_indicator2', 'indicator_2',
                                    'distance'])
    print "dataframe size=", temp_dataframe.shape[0]
    for i in range(temp_dataframe.shape[0]):
        print "done for %d indicator" % (i)
        for j in range(temp_dataframe.shape[0]):
            # print "bmu1=", temp_dataframe.ix[i, 0], "bmu2", temp_dataframe.ix[j, 0]
            a = literal_eval(temp_dataframe.ix[i, 'node'])
            b = literal_eval(temp_dataframe.ix[j, 'node'])
            dist = minkowski(np.array(a), np.array(b), 1)
            # print 'dist=', dist
            X = pd.DataFrame(np.array([[a, temp_dataframe.ix[i, 'column'], b,
                                        temp_dataframe.ix[j, 'column'], dist]]),
                             columns=['bmu_indicator1', 'indicator_1',
                                      'bmu_indicator2', 'indicator_2', 'distance'])
            new_ind = new_ind.append(X, ignore_index=True)
    return new_ind
def distance_features(data, genismModel):
    w2v_q1 = np.array([sent2vec(q, genismModel) for q in data.question1])
    w2v_q2 = np.array([sent2vec(q, genismModel) for q in data.question2])
    a = np.zeros(300)
    for i in range(len(w2v_q1)):
        if w2v_q1[i].size == 1:
            w2v_q1[i] = a
    for i in range(len(w2v_q2)):
        if w2v_q2[i].size == 1:
            w2v_q2[i] = a
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
    data['skew_q1vec'] = [skew(x) for x in w2v_q1]
    data['skew_q2vec'] = [skew(x) for x in w2v_q2]
    data['kur_q1vec'] = [kurtosis(x) for x in w2v_q1]
    data['kur_q2vec'] = [kurtosis(x) for x in w2v_q2]
    fs_4 = ['cosine_distance', 'cityblock_distance', 'jaccard_distance',
            'canberra_distance', 'euclidean_distance', 'minkowski_distance',
            'braycurtis_distance', 'skew_q1vec', 'skew_q2vec',
            'kur_q1vec', 'kur_q2vec']
    return data, fs_4
def find_distance_score(curr_img, data_matrix, function_val):
    ##################### Minkowski 3 #######################
    if function_val == "minkowski3":
        dis_score = []
        for img in data_matrix:
            dis_score.append(distance.minkowski(img, curr_img, 3))
        return dis_score
    ################ Cosine #######################
    elif function_val == "cosine":
        sim_score = []
        for img in data_matrix:
            dot = np.dot(curr_img, img)
            norm1 = np.linalg.norm(curr_img)
            norm2 = np.linalg.norm(img)
            sim_score.append(dot / (norm1 * norm2))
        return sim_score
    ############## Euclidean Distance ####################
    elif function_val == "euclidean":
        return np.sqrt(np.sum(np.square(data_matrix - curr_img), axis=1))
    ################# Manhattan Distance ###################
    else:
        return np.sum(np.absolute(data_matrix - curr_img), axis=1)
def sortneighbors(self, x, y, X_train, x_test):
    x = np.array(x).astype(float)
    x_test = np.array(x_test).astype(float)
    dist = np.empty(len(x))
    for i in range(len(x)):
        # st = globals()["distance." + self.metric]
        # dist = st(x_train, x_test)
        if self.metric == 'cosine':
            dist[i] = distance.cosine(x[i], x_test)
        elif self.metric == 'chebyshev':
            dist[i] = distance.chebyshev(x[i], x_test)
        elif self.metric == 'cityblock':
            dist[i] = distance.cityblock(x[i], x_test)
        elif self.metric == 'euclidean':
            dist[i] = distance.euclidean(x[i], x_test)
        elif self.metric == 'minkowski':
            dist[i] = distance.minkowski(x[i], x_test)
        else:
            print('Error!!! Enter a correct distance function and try again\n')
    # indices that sort the distances in ascending order
    dist = np.argsort(dist)
    x_sorted = np.empty(shape=(len(x), len(X_train[1])))
    y_sorted = []
    k = 0
    for i in dist:
        x_sorted[k] = x[i]
        y_sorted.append(y[i])
        k = k + 1
    return x_sorted, y_sorted
def minkowski(x, y, p=3):
    try:
        return distance.minkowski(x, y, p)
    except Exception:
        # e.g. a ValueError for mismatched shapes; fall back to NaN
        return np.NaN
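# A small usage sketch for the NaN-safe minkowski wrapper above, assuming
# numpy is imported as np and scipy.spatial.distance as distance; vectors of
# mismatched length make scipy raise, while the wrapper returns NaN.
print(minkowski(np.array([1.0, 2.0]), np.array([2.0, 4.0])))  # finite distance
print(minkowski(np.array([1.0, 2.0]), np.array([2.0])))       # nan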
def get_labels(self, scores, u, yu):
    labels = self.labels
    labels[u] = yu
    dist = []
    neigh = []
    for ele in self.X:
        # dist.append(euclidean(self.X[u], ele))
        # dist.append(canberra(self.X[u], ele))
        # dist.append(chebyshev(self.X[u], ele))
        dist.append(minkowski(self.X[u], ele))
    temp_score = np.argsort(dist)
    # n = self.X.shape[0]
    # k = int(math.sqrt(n))
    k = self.k
    # ids of unqueried neighbours
    n_id = np.setdiff1d(temp_score[:k], self.queried_ids)
    for i in n_id:
        if (yu == 1 and scores[i] >= 0.5) or (yu == -1 and scores[i] < 0.5):
            neigh.append(i)
    labels[list(neigh)] = yu  # k neighbours
    return labels
def compute_distance(x, y, weights=None, p=3, method="euclidean"):
    '''
    :param x: X vector
    :param y: Y vector
    :param weights: optional weights (currently unused)
    :param p: order of the Minkowski distance
    :param method: method used to find the distance
    :return: the distance value
    '''
    value = 0.00
    if method == "euclidean":
        value = distance.euclidean(x, y)
    elif method == "minkowski":
        value = distance.minkowski(x, y, p)
    elif method == "cosine":
        value = distance.cosine(x, y)
    elif method == "manhattan":
        value = distance.cityblock(x, y)
    elif method == "dice":
        value = distance.dice(x, y)
    elif method == "jaccard":
        value = distance.jaccard(x, y)
    elif method == "hamming":
        value = distance.hamming(x, y)
    elif method == "chebyshev":
        value = distance.chebyshev(x, y)
    else:
        print(method, " not found! Using Euclidean distance!")
        value = distance.euclidean(x, y)
    return value
def process_ppr(self, n, m, k):
    self.output_dir = self.output_dir + "/{}_{}_{}".format(
        k, m, ",".join([str(x) for x in n]))
    Path(self.output_dir).mkdir(parents=True, exist_ok=True)
    sim_matrix = self.get_sim_matrix()
    adj_matrix = self.get_knn_nodes(sim_matrix, k)
    adj_matrix_norm = self.normalize(adj_matrix)
    size = adj_matrix_norm.shape[0]
    u_old = np.zeros(size, dtype=float).reshape((-1, 1))
    v = np.zeros(size, dtype=float).reshape((-1, 1))
    for value in n:
        u_old[value - 1] = 1 / len(n)
        v[value - 1] = 1 / len(n)
    A = adj_matrix_norm
    diff = 1
    c = 0.65  # restart probability
    # power iteration until the L1 change between iterates vanishes
    while diff > 1e-20:
        u_new = ((1 - c) * np.matmul(A, u_old)) + (c * v)
        diff = distance.minkowski(u_new.ravel(), u_old.ravel(), 1)
        u_old = u_new
    res = [self.idx_file_map[x] for x in u_new.ravel().argsort()[::-1][:m]]
    self.plot_dominant_gestures(res, k)
    out = {}
    out['user_files'] = [self.idx_file_map[x] for x in n]
    out['dominant_gestures'] = res
    json.dump(out,
              open(self.output_dir + "/{}_{}_dominant.txt".format(k, m), "w"),
              indent='\t')
def KNN(c, dataSet, k=3, dist_eq="Euclidean", p=3, r_dist=False):
    neighbors = []
    dist = []
    for data in dataSet:
        distance = 0
        if dist_eq == "Euclidean":
            distance = spd.euclidean(c, data)
        elif dist_eq == "Manhattan":
            distance = spd.cityblock(c, data)
        elif dist_eq == "Minkowski":
            distance = spd.minkowski(c, data, p)
        elif dist_eq == "Hamming":
            distance = spd.hamming(c, data)
        else:
            raise ValueError(
                "Invalid setting for dist_eq=Euclidean, Manhattan, Minkowski, Hamming")
        if len(neighbors) < k:
            neighbors.append(data)
            dist.append(distance)
        else:
            max_d = max(dist)
            if distance < max_d:
                indx = dist.index(max_d)
                neighbors[indx] = data
                dist[indx] = distance
    if not r_dist:
        return neighbors
    else:
        return neighbors, dist
def vectors_features(in_data: pd.DataFrame,
                     sent2vec: Callable[[str], np.array]) -> pd.DataFrame:
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns
    vectors1 = np.array([sent2vec(x) for x in in_data['question1']])
    vectors2 = np.array([sent2vec(x) for x in in_data['question2']])
    in_data['cos'] = np.array([cosine(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['jaccard'] = np.array([jaccard(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['euclidean'] = np.array([euclidean(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['minkowski'] = np.array([minkowski(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['cityblock'] = np.array([cityblock(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['canberra'] = np.array([canberra(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['braycurtis'] = np.array([braycurtis(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['skew_q1'] = np.array([skew(x) for x in vectors1])
    in_data['skew_q2'] = np.array([skew(x) for x in vectors2])
    in_data['kur_q1'] = np.array([kurtosis(x) for x in vectors1])
    in_data['kur_q2'] = np.array([kurtosis(x) for x in vectors2])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
def _build_distance_matrix(self, detected_squares, tracked_squares):
    # Calculate distances between tracked squares and new squares.
    # A very high Minkowski order (p=128) approximates the Chebyshev
    # (maximum-coordinate) distance.
    distance_matrix = {}
    for ds in detected_squares:
        distance_matrix[ds] = {}
        for ts in tracked_squares:
            distance_matrix[ds][ts] = distance.minkowski(ds.center, ts.center, 128)
    return distance_matrix
def _grow_map(self, input, k):
    bcoords = self._process_input(input)
    bhash = str(bcoords[0]) + "" + str(bcoords[1])
    winner = self.map_neurons[bhash]
    score = minkowski(winner.weight_vs, input, 2)
    winner.k_coassoc_vs[k] = score
    winner.binarycoassoc_vs[k] = 1
    self.map_neurons[bhash] = winner
    return
def find_bmu(self, x):
    nodes = np.asarray(self.neurons.values())
    mink = np.argmin(np.linalg.norm(x - nodes, axis=1))
    # mink = pairwise_distances_argmin(nodes, np.array([x]))
    try:
        dist = minkowski(self.neurons.values()[mink], x, p=2)
    except ValueError:
        print 'nan'
        dist = np.nan
    return self.neurons.keys()[mink], dist  # dist_sqr[mink]
def _compute_measure(vals_1, vals_2, method='bhat'):
    """--------------------------------------------------------------------
    Computes the distance or dissimilarity between two 1-D lists of values.
    This function is called with pitch-distribution values while generating
    matrices. The function is symmetric; the two input lists are
    interchangeable.
    ----------------------------------------------------------------------
    vals_1, vals_2 : The input value lists.
    method         : The choice of distance method
    ----------------------------------------------------------------------
    manhattan    : Minkowski distance of 1st degree
    euclidean    : Minkowski distance of 2nd degree
    l3           : Minkowski distance of 3rd degree
    bhat         : Bhattacharyya distance
    jeffrey      : Jeffrey's divergence
    js           : Jensen-Shannon distance
    intersection : Intersection
    corr         : Correlation
    --------------------------------------------------------------------"""
    if method in ['manhattan', 'l1']:
        dist = spdistance.minkowski(vals_1, vals_2, 1)
    elif method in ['euclidean', 'l2']:
        dist = spdistance.euclidean(vals_1, vals_2)
    elif method == 'l3':
        dist = spdistance.minkowski(vals_1, vals_2, 3)
    elif method == 'bhat':  # Bhattacharyya distance
        dist = -np.log(np.sum(np.sqrt(vals_1 * vals_2)))
    elif method == 'jeffrey':  # Jeffrey's divergence
        dist = (np.sum(vals_1 * np.log(vals_1 / vals_2)) +
                np.sum(vals_2 * np.log(vals_2 / vals_1)))
    elif method == 'js':  # Jensen-Shannon distance
        dist = np.sqrt(
            np.sum(vals_1 * np.log(2 * vals_1 / (vals_1 + vals_2))) * 0.5 +
            np.sum(vals_2 * np.log(2 * vals_2 / (vals_1 + vals_2))) * 0.5)
    # Since correlation and intersection are actually similarity measures,
    # we convert them to dissimilarities by taking 1 - similarity.
    elif method == 'dis_intersect':
        dist = 1.0 - np.sum(np.minimum(vals_1, vals_2)) / np.size(vals_1)
    elif method == 'dis_corr':
        dist = 1.0 - np.correlate(vals_1, vals_2)
    else:
        raise ValueError("Unknown method")
    return dist
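# A brief usage sketch for _compute_measure, assuming numpy is imported as np
# and scipy.spatial.distance as spdistance; the two toy pitch distributions,
# normalized to sum to one, are invented for illustration.
pd_1 = np.array([0.2, 0.5, 0.3])
pd_2 = np.array([0.3, 0.4, 0.3])
print(_compute_measure(pd_1, pd_2, method='bhat'))  # Bhattacharyya distance
print(_compute_measure(pd_1, pd_2, method='l3'))    # 3rd-degree Minkowski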
def find_bmu(self, x):
    nodes = np.asarray(self.neurons.values())
    deltas = nodes - x
    dist_sqr = np.sum(deltas ** 2, axis=1)
    mink = np.argmin(dist_sqr)
    # mink = pairwise_distances_argmin(nodes, np.array([x]))
    try:
        dist = minkowski(self.neurons.values()[mink], x, p=2)
    except ValueError:
        print 'nan'
        dist = np.nan
    return self.neurons.keys()[mink], dist  # dist_sqr[mink]
def distance(vals_1, vals_2, method='euclidean'):
    """-------------------------------------------------------------------------
    Calculates the distance between two 1-D lists of values. This function is
    called with pitch-distribution values while generating distance matrices.
    The function is symmetric; the two input lists are interchangeable.
    ----------------------------------------------------------------------------
    vals_1, vals_2 : The input value lists.
    method         : The choice of distance method
    ----------------------------------------------------------------------------
    manhattan    : Minkowski distance of 1st degree
    euclidean    : Minkowski distance of 2nd degree
    l3           : Minkowski distance of 3rd degree
    bhat         : Bhattacharyya distance
    intersection : Intersection
    corr         : Correlation
    -------------------------------------------------------------------------"""
    if method == 'euclidean':
        return distance.euclidean(vals_1, vals_2)
    elif method == 'manhattan':
        return distance.minkowski(vals_1, vals_2, 1)
    elif method == 'l3':
        return distance.minkowski(vals_1, vals_2, 3)
    elif method == 'bhat':
        return -math.log(sum(np.sqrt(vals_1 * vals_2)))
    # Since correlation and intersection are actually similarity measures,
    # we take their inverse to be able to use them as distances. In other
    # words, max. similarity would give the min. inverse and we are always
    # looking for minimum distances.
    elif method == 'intersection':
        return len(vals_1) / (sum(np.minimum(vals_1, vals_2)))
    elif method == 'corr':
        return 1.0 - np.correlate(vals_1, vals_2)
    else:
        return 0
def _find_in_map(gmap, ix_rng_s, ix_rng_e, inp_vec):
    keys = gmap.keys()[ix_rng_s:ix_rng_e]
    minDist = 9223372036854775807
    candidate = None
    for neu_key in keys:
        neu = gmap[neu_key]
        cand = minkowski(inp_vec, neu.weight_vs, 2)
        if minDist > cand:
            minDist = cand
            candidate = neu
    return candidate
def _get_BMU(self, input_nparray):
    minDist = 9223372036854775807
    candidate = None
    for neu in self.map_neurons.itervalues():
        if self.boolean:
            cand = jaccard(input_nparray, neu.weight_vs)
        else:
            cand = minkowski(input_nparray, neu.weight_vs, 2)
        if minDist > cand:
            minDist = cand
            candidate = neu
    return candidate
def getBMU(self, input_nparray):
    minDist = 9223372036854775807
    candidate = None
    for neu in self.map_neurons.itervalues():
        # print "input: " + str(input_nparray)
        # print "neuron: " + str(neu.weight_vs)
        if self.boolean:
            cand = jaccard(input_nparray, neu.weight_vs)
        else:
            cand = minkowski(input_nparray, neu.weight_vs, 2)
        if minDist > cand:
            # print "mindist:", minDist
            # print "cand:", cand
            minDist = cand
            candidate = neu
    # print "candidate's coords", candidate.coords()
    return candidate
def score(data, labels=None, centroids=None, norm=2):
    """Given data and labels or centroids, calculate the clustering objective.

    The data, plus either labels or centroids, has to be given; the other may
    be None. If both labels and centroids are None, an error is thrown. If
    neither is None, both are used (although this will typically result in a
    worse score than leaving one of them as None). If one is None and the
    other is not, the missing one is filled in from the other.

    norm is the Minkowski norm used for computing distances.
    """
    assert (labels is not None) or (centroids is not None), \
        "At least one of labels and centroids must be not None"
    if centroids is None:
        centroids = getcentroids(data, labels)
    if labels is None:
        labels = getlabels(data, centroids, norm)
    distances = 0
    labelnames = unique(labels)
    for i in range(len(labelnames)):
        datac = data[labels == labelnames[i]]
        distances = distances + sum(array([minkowski(datac[j, :], centroids[i, :], norm)
                                           for j in range(shape(datac)[0])]))
    return distances / float(len(labels))
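# A hedged usage sketch for score(), with invented toy data and centroids; it
# relies on the getlabels() helper defined later in this section (the
# getcentroids() helper is assumed to be defined elsewhere in the module, and
# array/unique/shape are assumed to be star-imported from numpy as in the
# snippet itself).
data = array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.1, 4.9]])
centroids = array([[0.0, 0.1], [5.0, 5.0]])
print(score(data, centroids=centroids, norm=2))  # mean point-to-centroid distance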
def parallel_search_bmu(self, input_vector):
    mapsize = len(self.map_neurons.keys())
    indices = []
    for i in range(self.n_jobs):
        indices.append([i * mapsize / self.n_jobs,
                        (i + 1) * mapsize / self.n_jobs - 1])
    # res = Parallel(n_jobs=2)(delayed(check_paths)(Path(points), a) for points in b)
    res = Parallel(n_jobs=self.n_jobs)(
        delayed(_find_in_map)(self.map_neurons, ix_range[0], ix_range[1], input_vector)
        for ix_range in indices)
    for r in res:
        if r is None:
            print r
    minDist = 9223372036854775807
    candidate = None
    for neu in res:
        cand = minkowski(input_vector, neu.weight_vs, 2)
        if minDist > cand:
            minDist = cand
            candidate = neu
    return candidate
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                np.nan_to_num(question2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                            np.nan_to_num(question2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                np.nan_to_num(question2_vectors))]
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                   np.nan_to_num(question2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                  np.nan_to_num(question2_vectors))]
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)
data.to_csv('data/quora_features.csv', index=False)
def minkowski_distance(a, b, p):
    print(distance.minkowski(a, b, p))
# In[4]:

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy.spatial import distance

# In[9]:

df = pd.read_table('/Users/Li/Google Drive/UIUC_Study/2015Fall/CS412/HW/HW1/data/vectors.txt')

# In[15]:

x = np.array(df.ix[0, 1:])
y = np.array(df.ix[1, 1:])

# In[19]:

distance.minkowski(x, y, 2)

# In[21]:

distance.minkowski(x, y, 3)
ax1.axes.get_yaxis().set_visible(False)
pl.savefig(pdb1 + 'vs' + pdb2 + 'Isomap.png', dpi=250)
pl.show()
pl.clf()
print TM_align['seqA']
print TM_align['matchs']
print TM_align['alignedList1']
print 'Aligned positions1:\n', ','.join(i[3:] for i in TM_align['alignedList1'])
print TM_align['alignedList2']
print 'Aligned positions2:\n', ','.join(i[3:] for i in TM_align['alignedList2'])
permu1, stdpermu1 = get_permutation(neojamming1, TM_align['alignedList1'])
permu2, stdpermu2 = get_permutation(neojamming2, TM_align['alignedList2'])
pl.title('Conformer ensemble distance: %.2f' % minkowski(stdpermu1, stdpermu2, pnorm))
pl.scatter(stdpermu1, stdpermu2, marker='o', s=55, facecolor='0.6', edgecolor='b')
pl.xlim(0, len(stdpermu1))
pl.ylim(0, len(stdpermu1))
pl.savefig(pdb1 + 'vs' + pdb2 + '.png', bbox_inches='tight', dpi=250)
pl.show()
print 'permu:\n', ','.join([str(i) for i in permu1])
print 'std permu:\n', ','.join([str(i) for i in stdpermu1])
print 'permu:\n', ','.join([str(i) for i in permu2])
print 'std permu:\n', ','.join([str(i) for i in stdpermu2])
print 'Conformation ensembles at distance:\t', minkowski(stdpermu1, stdpermu2, pnorm)
fpermus = open(pdb1 + 'vs' + pdb2 + '.txt', 'w')
fpermus.write('%s\n%s\n' % (pdb1, ','.join(map(str, stdpermu1))))
def wvMkow(a):
    # Minkowski (p=2, i.e. Euclidean) distance for each (u, v) pair in a
    return [distance.minkowski(x[0], x[1], 2) for x in a]
def process_input(self, input_np_array):
    bmu = self.getBMU(input_np_array)
    bmu.hits += 1
    bmu.time = self.count
    for neu in self.map_neurons.values():
        nhash = str(neu.x_c) + "" + str(neu.y_c)
        dist = minkowski(bmu.coords().astype(float), neu.coords().astype(float), 2)
        if dist < self.nr:
            # weight adjustment, scaled by a Gaussian neighbourhood kernel
            # neu.weight_vs = neu.weight_vs + self.lr * (input_np_array - neu.weight_vs)
            neu.weight_vs = neu.weight_vs + self.lr * np.exp(((dist / self.nr) ** 2) / (-2)) \
                * self.adjustment_gaus(input_np_array, neu.weight_vs)
            err = self.gaussian_error(input_np_array, neu.weight_vs)
            neu.res_err += err  # minkowski(neu.weight_vs, bmu.weight_vs, 2)
            self.map_neurons[nhash] = neu
    if bmu.res_err > self.thresh:
        neu = bmu
        down = str(neu.x_c) + str(int(neu.y_c) - 1)
        up = str(neu.x_c) + str(int(neu.y_c) + 1)
        left = str(int(neu.x_c) - 1) + str(neu.y_c)
        right = str(int(neu.x_c) + 1) + str(neu.y_c)
        nei_coords = np.array([down, up, left, right])
        nei_coordi = np.array([[neu.x_c, int(neu.y_c) - 1],
                               [neu.x_c, int(neu.y_c) + 1],
                               [int(neu.x_c) - 1, neu.y_c],
                               [int(neu.x_c) + 1, int(neu.y_c)]])
        p = 0
        for coord in nei_coords:
            n = None
            try:
                n = self.map_neurons[coord]
                n.res_err += self.fd * n.res_err
            except KeyError:
                nwron = neuron(nei_coordi[p][0], nei_coordi[p][1], self.dim)
                nwron.time = self.t_time
                new_weight = 0
                # case a) the new node has two consecutive nodes on one of its sides
                # case b) the new node lies between two old nodes
                new_weight_b = self.type_b_weight_init(p, neu)
                new_weight_a = self.type_a_weight_init(p, neu)
                new_weight_c = self.type_c_weight_init(p, neu)
                if new_weight_b.all() == 0:
                    if new_weight_a.all() == 0:
                        if new_weight_c.all() == 0:
                            new_weight = np.ndarray(shape=(self.dim))
                            new_weight.fill(0.5)
                        else:
                            new_weight = new_weight_c
                    else:
                        new_weight = new_weight_a
                else:
                    new_weight = new_weight_b
                nwron.weight_vs = new_weight
                n = nwron
            self.map_neurons[coord] = n
            p += 1
        bmu.res_err = self.thresh / 2
        self.map_neurons[str(bmu.x_c) + "" + str(bmu.y_c)] = bmu
    return bmu.coords()
def minkowski(pair):
    # Minkowski distances for orders p = 2..5 of a (x, y) vector pair
    x, y = pair
    ret = []
    for p in range(2, 6):
        ret += [distance.minkowski(x, y, p)]
    return ret
hists = []
for img in imgs:
    hist = [0 for i in range(64)]
    for i in range(len(img)):
        for j in range(len(img[0])):
            if random.random() > 0.0625:  # sample 1/16 of the pixels for efficiency
                continue
            b1, b2, b3 = [c / 64 for c in img[i][j]]
            idx = b1 * 16 + b2 * 4 + b3
            hist[idx] += 1
    hists.append(np.array(hist))

selected.append(img_names[0])
for i in range(1, len(hists)):
    hist1 = hists[i - 1]
    hist2 = hists[i]
    diff = minkowski(hist1, hist2, 1)
    print diff
    if diff > 128000.0 / (640 * 480) * (160 * 120):  # threshold
        selected.append(img_names[i])

# In[63]:

if not os.path.exists(args.keyframes_dir):
    os.mkdir(args.keyframes_dir)
for img in selected:
    os.system("cp {1}/{0} {2}/{0}".format(img, iframe_dir, keyframes_dir))

# In[64]:
def getlabels(data, centroids, norm=2):
    """Returns an array of labels: the index of the centroid closest to each
    datapoint."""
    return array([argmin([minkowski(data[i, :], centroids[c, :], norm)
                          for c in range(shape(centroids)[0])])
                  for i in range(shape(data)[0])])
# HERE the structures must contain only atoms of the selected chain
TM_align = rcu.TM_aligned_residues(pdb1, pdb2, offset2, offset1)
neojamming1 = neoj.neoJAMMING(pdb1, chain1)
neojamming2 = neoj.neoJAMMING(pdb2, chain2)
print TM_align['seqA']
print TM_align['matchs']
print TM_align['alignedList1']
print 'Aligned positions1:\n', ','.join(i[3:] for i in TM_align['alignedList1'])
print TM_align['alignedList2']
print 'Aligned positions2:\n', ','.join(i[3:] for i in TM_align['alignedList2'])
permu1, stdpermu1 = get_permutation(neojamming1, TM_align['alignedList1'])
permu2, stdpermu2 = get_permutation(neojamming2, TM_align['alignedList2'])
pl.title('Conformer ensemble distance: %.2f' % minkowski(stdpermu1, stdpermu2, PNORM))
pl.scatter(stdpermu1, stdpermu2, marker='o', s=55, facecolor='0.6', edgecolor='b')
pl.xlim(0, len(stdpermu1))
pl.ylim(0, len(stdpermu1))
pl.savefig(pdb1 + 'vs' + pdb2 + '.png', bbox_inches='tight', dpi=250)
print 'permu:\n', ','.join([str(i) for i in permu1])
print 'std permu:\n', ','.join([str(i) for i in stdpermu1])
print 'permu:\n', ','.join([str(i) for i in permu2])
print 'std permu:\n', ','.join([str(i) for i in stdpermu2])
print 'Conformation ensembles at distance:\t', minkowski(stdpermu1, stdpermu2, PNORM)
print 'Spearman rank-order correlation coefficient and p-value', spearmanr(permu1, permu2)
def centroidscore(datapoint, centroids, pheromone, beta):
    distances = array([minkowski(datapoint, centroids[i, :], beta)
                       for i in range(shape(centroids)[0])])
    return pheromone * (1.0 / distances)
def minkowski_distance(a, b, p):
    return distance.minkowski(a, b, p)
import numpy as np
import pylab as pl
import scipy.spatial.distance as dist


def plotSamples(x, y, z=None):
    stars = np.matrix([[3., -2., 0.], [3., 2., 0.]])
    if z is not None:
        x, y = z * np.matrix([x, y])
        stars = z * stars
    pl.scatter(x, y, s=10)  # plot the Gaussian random points
    pl.scatter(np.array(stars[0]), np.array(stars[1]), s=200, marker='*', color='r')  # plot the three specified points
    pl.axhline(linewidth=2, color='g')  # draw the x axis
    pl.axvline(linewidth=2, color='g')  # draw the y axis
    pl.axis('equal')
    pl.axis([-5, 5, -5, 5])
    pl.show()


# generate Gaussian-distributed random points
mean = [0, 0]            # mean
cov = [[2, 1], [1, 2]]   # covariance
x, y = np.random.multivariate_normal(mean, cov, 1000).T
plotSamples(x, y)

covMat = np.matrix(np.cov(x, y))   # covariance matrix of x and y
Z = np.linalg.cholesky(covMat).I   # whitening (affine) transform
plotSamples(x, y, Z)

# Mahalanobis distances
print '\nMahalanobis distances to the origin:'
print dist.mahalanobis([0, 0], [3, 3], covMat.I), dist.mahalanobis([0, 0], [-2, 2], covMat.I)

# Euclidean distances after the whitening transform
dots = (Z * np.matrix([[3, -2, 0], [3, 2, 0]])).T
print '\nEuclidean distances to the origin after the transform:'
print dist.minkowski([0, 0], np.array(dots[0]), 2), dist.minkowski([0, 0], np.array(dots[1]), 2)
def mixed_dist(x, y):
    # work on copies so the caller's vectors are not modified in place
    x, y = np.array(x, dtype=float), np.array(y, dtype=float)
    if x[-1] != y[-1]:
        # categorical mismatch in the last component: an arbitrary fixed
        # penalty, roughly on the same scale as the other features
        x[-1], y[-1] = 10000, 0
    else:
        x[-1], y[-1] = 0, 0
    return minkowski(x, y, 2)
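# A hedged usage sketch for mixed_dist as a custom metric, e.g. with
# scikit-learn's NearestNeighbors; the toy rows (two numeric features plus a
# trailing categorical code) are invented for illustration.
from sklearn.neighbors import NearestNeighbors
X = np.array([[0.5, 1.0, 1], [0.6, 1.1, 1], [0.5, 1.0, 2]])
nn = NearestNeighbors(n_neighbors=2, metric=mixed_dist).fit(X)
print(nn.kneighbors(np.array([[0.5, 1.0, 1]]), return_distance=True))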