def calculate_distance(X, Y, metric='euclidean', **kwargs):
    """Compute the distance between two vectors with the selected metric.

    ``metric`` is compared against the module-level ``METRIC_*`` constants
    (defined outside this block).  NOTE(review): the default value is the
    literal ``'euclidean'``; it only selects the Euclidean branch if
    ``METRIC_EUCLIDEAN`` equals that string -- confirm against the constants.

    Args:
        X: First vector.
        Y: Second vector.
        metric: One of the ``METRIC_*`` constants.
        **kwargs: Extra arguments forwarded to the metrics that need them:
            ``p`` (and optionally ``w``) for Minkowski, ``p`` and ``w`` for
            weighted Minkowski, ``VI`` (inverse covariance matrix) for
            Mahalanobis.  Ignored by every other metric.

    Returns:
        The value computed by the selected metric, or ``None`` when ``metric``
        matches none of the known constants.  The cosine branch returns the
        cosine *similarity* (dot product over the product of the norms), not
        SciPy's cosine distance.
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y, **kwargs)
    elif metric == METRIC_WMINKOWSKI:
        # wminkowski has no defaults for p and w; without the forwarded
        # keyword arguments this call can only raise TypeError.
        return distance.wminkowski(X, Y, **kwargs)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # mahalanobis requires the inverse covariance matrix VI.
        return distance.mahalanobis(X, Y, **kwargs)
    elif metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))
    elif metric == METRIC_COSINE:
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
    # Unknown metric: keep the historical behaviour of returning None.
    return None
def hdbscan_with_knn(data, clf, thresh=None, mink_p=1.5, mink_kwargs=None):
    """Cluster ``data`` with HDBSCAN and label points with a 1-NN on exemplars.

    Steps:
      1. Fit HDBSCAN using the hyper-parameters read from ``clf``.
      2. Fit a 1-nearest-neighbour classifier on the cluster exemplars.
      3. Give every point whose top soft-membership probability is at least
         ``thresh`` the label predicted by that classifier; every other point
         keeps the outlier label ``-1``.

    Args:
        data: DataFrame of observations; it is copied, not modified.
        clf: Object exposing ``min_cluster_size``, ``min_samples``, ``metric``
            and ``cluster_selection_method``.
        thresh: Minimum top membership probability for a point to be
            labelled.  Any falsy value (``None`` or ``0``) selects the default
            ``1 / max(2, number of exemplar groups)``.
        mink_p: Order ``p`` used when ``clf.metric == 'wminkowski'``.
        mink_kwargs: Optional keyword arguments for ``mink_weights``; ``None``
            means no extra arguments.

    Returns:
        The ``label`` column of the working copy.  If any step fails, a
        ``RuntimeWarning`` is emitted and every observation gets label ``0``.

    TO-DO (carried over from the original author):
      * allow batch prediction
          1. assign points below thresh to outlier class
          2. take top n% of obs by cluster prob and predict label
          3. refit knn on assigned points
          4. repeat steps 2 & 3 for remaining percentage bins
      * allow for custom distance metrics and weight in hdbscan call
    """
    import warnings

    df = data.copy()
    mc = clf.min_cluster_size
    ms = clf.min_samples
    metric = clf.metric
    clf_method = clf.cluster_selection_method
    try:
        # run hdbscan
        if metric == 'wminkowski':
            # mink_kwargs defaults to None; ``**None`` would raise TypeError
            # and silently push every label to 0 through the fallback below.
            mw = mink_weights(df, **(mink_kwargs or {}))
            metric = lambda x, y: wminkowski(x, y, p=mink_p, w=mw)
        clusterer = HDBSCAN(min_cluster_size=mc,
                            min_samples=ms,
                            prediction_data=True,
                            metric=metric,
                            cluster_selection_method=clf_method).fit(df)
        # 1.0 keeps this a true division under Python 2 as well.
        thresh = thresh if thresh else 1.0 / max(2, len(clusterer.exemplars_))
        # get exemplars and labels
        exemplars = np.concatenate(list(clusterer.exemplars_))
        labels = np.concatenate([
            np.full((len(e)), fill_value=i)
            for i, e in enumerate(clusterer.exemplars_)
        ])
        # fit knn on exemplars
        knn = KNeighborsClassifier(n_neighbors=1).fit(exemplars, labels)
        # map top soft cluster probabilities to obs
        probs = np.max(all_points_membership_vectors(clusterer), axis=1)
        df['top_prob'] = pd.Series(probs, index=df.index)
        # assign all points to outlier class (label:-1)
        df['label'] = -1
        # take all points above a prob threshold
        obs = df.top_prob >= thresh
        # predict labels from fitted knn, using only the feature columns
        df.loc[obs, 'label'] = knn.predict(
            df.loc[obs, df.columns.drop(['top_prob', 'label'])])
    except Exception as exc:
        # Best-effort fallback kept from the original, but no longer silent
        # and no longer swallowing KeyboardInterrupt / SystemExit.
        warnings.warn(
            "hdbscan_with_knn failed (%r); assigning label 0 to every "
            "observation" % (exc,), RuntimeWarning)
        df['label'] = 0
    return df.label
def wminkowski(self, x=None, y=None, p=2, w=np.ones(3)):
    """Weighted Minkowski distance between ``x`` and ``y``.

    Example inputs::

        x = [1, 0, 0]
        y = [0, 1, 0]

    Args:
        x: First vector; ``None`` falls back to ``self.x``.
        y: Second vector; ``None`` falls back to ``self.y``.
        p: Order of the norm.
        w: Weight vector; ``None`` falls back to ``self.w``.

    Returns:
        ``(sum(|w_i * (x_i - y_i)| ** p)) ** (1 / p)``.
    """
    # ``value or default`` raises ValueError for multi-element NumPy arrays
    # (ambiguous truth value), so test for None explicitly.
    if x is None:
        x = self.x
    if y is None:
        y = self.y
    if w is None:
        w = self.w

    legacy = getattr(distance, "wminkowski", None)
    if legacy is not None:
        return legacy(x, y, p, w)
    # SciPy >= 1.8 removed wminkowski; minkowski with weights w ** p is the
    # documented equivalent.
    weights = np.atleast_1d(np.asarray(w, dtype=float)) ** p
    return distance.minkowski(x, y, p, w=weights)
def wieghted_euclidean(o1, o2, w=0.1):
    """Weighted Euclidean distance between two objects.

    Args:
        o1: First object (list of numbers).
        o2: Second object (list of numbers).
        w: Weight, either a scalar applied to every component or one weight
            per component.

    Returns:
        float: ``sqrt(sum((w_i * (o1_i - o2_i)) ** 2))``.

    NOTE(review): the previous implementation passed order ``p=1`` to
    ``wminkowski``, which yields a weighted city-block distance even though
    the name and docstring promise a Euclidean one; the order is now 2.
    """
    o1, o2 = np.array(o1), np.array(o2)
    order = 2  # Euclidean == Minkowski of order 2

    legacy = getattr(distance, "wminkowski", None)
    if legacy is not None:
        return legacy(o1, o2, order, w)
    # SciPy >= 1.8 removed wminkowski; minkowski with weights w ** p is the
    # documented equivalent.
    weights = np.atleast_1d(np.asarray(w, dtype=float)) ** order
    return distance.minkowski(o1, o2, order, w=weights)
def compute_distance(X, centroid, type="euclidian", weight=1):
    """Compute the distance between ``X`` and ``centroid``.

    Args:
        X: Observation vector.
        centroid: Centroid vector.
        type: ``"euclidian"`` (sic), ``"manhattan"`` or ``"wminkowski"``.
        weight: Weights for ``"wminkowski"`` only (order 2).  The default
            scalar ``1`` is expanded to a vector of ones of length
            ``len(X)``.

    Returns:
        The requested distance.

    Raises:
        KeyError: If ``type`` is not one of the supported names.
    """
    # Only the requested metric is evaluated; previously all three were
    # computed on every call inside a dict literal.
    if type == "euclidian":
        return sp.euclidean(X, centroid)
    if type == "manhattan":
        return sp.cityblock(X, centroid)
    if type == "wminkowski":
        # ``==`` rather than ``is``: identity of str/int literals is an
        # implementation detail and a SyntaxWarning on Python 3.8+.
        if np.isscalar(weight) and weight == 1:
            weight = np.ones(len(X))
        legacy = getattr(sp, "wminkowski", None)
        if legacy is not None:
            return legacy(X, centroid, 2, weight)
        # SciPy >= 1.8 removed wminkowski; minkowski with weights w ** p is
        # the documented equivalent.
        weights = np.atleast_1d(np.asarray(weight, dtype=float)) ** 2
        return sp.minkowski(X, centroid, 2, w=weights)
    raise KeyError(type)
def calcualateSimilarity(question_ebd, relation_ebd, metric, **metric_kwargs):
    """Return the SciPy distance between two embeddings for a named metric.

    Despite the name, the value returned is a *distance* (smaller means more
    similar), exactly as computed by ``scipy.spatial.distance``.

    Args:
        question_ebd: First embedding vector.
        relation_ebd: Second embedding vector.
        metric: One of ``'braycurtis'``, ``'canberra'``, ``'chebyshev'``,
            ``'cityblock'``, ``'cosine'``, ``'euclidean'``, ``'mahalanobis'``
            or ``'wminkowski'``.
        **metric_kwargs: Extra arguments forwarded to the SciPy function.
            ``'mahalanobis'`` requires ``VI`` and ``'wminkowski'`` requires
            ``p`` and ``w``; without them SciPy raises ``TypeError``, as it
            always did for these two branches before.

    Returns:
        The distance, or ``None`` when ``metric`` is not a supported name.
    """
    supported = (
        'braycurtis',
        'canberra',
        'chebyshev',
        'cityblock',
        'cosine',
        'euclidean',
        'mahalanobis',
        'wminkowski',
    )
    if metric not in supported:
        return None
    # Every supported name is also the name of the SciPy function.
    return getattr(distance, metric)(question_ebd, relation_ebd,
                                     **metric_kwargs)
def wexpL4(u, v, w):
    """Return ``exp`` of the weighted order-4 Minkowski distance of u and v.

    The distance is passed through ``clip(..., a_max=700)`` before being
    exponentiated -- presumably to keep ``np.exp`` from overflowing; confirm
    against the ``clip`` imported by this module.
    """
    weighted_l4 = wminkowski(u, v, 4, w)
    capped = clip(weighted_l4, a_max=700)
    return np.exp(capped)
def wL4(u, v, w):
    """Return the weighted Minkowski distance of order 4 between u and v."""
    order = 4
    return wminkowski(u, v, order, w)
def wL2(u, v, w):
    """Return the weighted Minkowski distance of order 2 between u and v."""
    order = 2
    return wminkowski(u, v, order, w)
def main(argv):
    """Round-trip random YCbCr values through a JPEG and report mismatches.

    Python 2 script body (uses ``print`` statements).  It:

      1. builds the set of candidate values from
         ``YccLevels.get_discrete_values()`` and prints them;
      2. writes an 8x8 ``'YCbCr'`` image made of 2x2 blocks, each filled with
         one randomly chosen candidate, and saves it as ``test.jpg``;
      3. re-opens the file, averages every 2x2 block per channel, converts
         the averages with ``ColorSpace.to_ycc`` and picks the candidate with
         the smallest weighted Minkowski distance (order 2, weights
         ``[5, 1, 1]``);
      4. prints a report for every block whose best match differs from the
         value originally written.

    ``argv`` is never read; ``parser.parse_args()`` is called without
    arguments.
    """
    # visualize_ycc()
    # return
    # Parse arguments.
    parser = argparse.ArgumentParser(
        prog='SixBits',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()
    _ = args
    # Colors in this dictionary are described in YCbCr space.
    # (Not referenced again in this function.)
    color = {'white' : (255, 128, 128),
             'black' : (0, 128, 128),
             'yellow' : (226, 1, 149),
             'blue' : (29, 255, 107),
             'red' : (76, 85, 255),
             'cyan' : (179, 171, 1),
             'green' : (150, 44, 21),
             'magenta' : (105, 212, 235)}
    all_values = []
    # The next two values are not referenced again in this function.
    num_discretizations = 9
    side_discrets = 4
    import YccLevels
    all_values = YccLevels.get_discrete_values()
    # De-duplicate the candidate values.
    all_values = list(set(all_values))
    # to_remove = [
    #     (100, 72, 57),
    #     (132, 54, 215),
    #     (147, 45, 205),
    #     (100, 215, 57),
    #     ( 85, 224, 67),
    #     (132, 197, 216),
    #     (147, 188, 205)
    # ]
    # for _ in to_remove:
    #     all_values.remove(_)
    # all_values.remove((128, 128, 128)) # Remove most ambiguous chunks.
    print 'Values'
    for _val in sorted(all_values):
        print ' ', _val
    num_values = len(all_values)
    import math
    print 'Number of distinct values: %d (%.2f bits).' % \
        (num_values, math.log(num_values, 2))
    from scipy.spatial.distance import pdist, euclidean, wminkowski, cosine
    # Pairwise distances plus a map from condensed index to the (i, j) pair;
    # both are only consumed by the commented-out diagnostic below.
    distances = pdist(all_values)
    count = 0
    idx_val = {}
    for i in range(num_values):
        for j in range(i + 1, num_values):
            idx_val[count] = (i, j)
            count += 1
    # for idx in idx_val:
    #     first, second = idx_val[idx]
    #     if distances[idx] < 28:
    #         print idx, all_values[first], all_values[second], distances[idx]
    possible_values = all_values
    IMAGE_WIDTH = 8
    IMAGE_HEIGHT = 8
    im = Image.new('YCbCr', (IMAGE_WIDTH, IMAGE_HEIGHT))
    pixels = im.load()
    original_vals = []
    # Fill the image in 2x2 blocks, one random candidate value per block.
    for row in range(0, IMAGE_HEIGHT, 2):
        for col in range(0, IMAGE_WIDTH, 2):
            val = tuple(map(int, random.choice(possible_values)))
            original_vals.append(val)
            print 'Written:', val
            pixels[row, col] = val
            pixels[row + 1, col] = val
            pixels[row, col + 1] = val
            pixels[row + 1, col + 1] = val
    # Reversed so that pop() below yields the values in the order written.
    original_vals.reverse()
    QUALITY = 75
    im.save('test.jpg', quality = QUALITY)
    print 'ALL VALUES', all_values
    opened_im = Image.open('test.jpg')
    pixels = opened_im.load()
    for row in range(0, IMAGE_HEIGHT, 2):
        for col in range(0, IMAGE_WIDTH, 2):
            # Average each of the three channels over the 2x2 block.
            vals = {}
            for idx in range(3):
                val = 0
                val += pixels[row, col][idx]
                val += pixels[row + 1, col][idx]
                val += pixels[row, col + 1][idx]
                val += pixels[row + 1, col + 1][idx]
                val /= 4.0
                vals[idx] = val
            # NOTE(review): assumes the re-opened JPEG yields RGB triples --
            # confirm the mode of opened_im.
            red = vals[0]
            green = vals[1]
            blue = vals[2]
            extracted = ColorSpace.to_ycc(red, green, blue)
            print ' Extracted', extracted
            # Nearest candidate under the weighted distance; the first
            # component carries weight 5, the other two weight 1.
            _best_val = 1000
            _best_match = ()
            for vect in all_values:
                vect = map(int, map(round, vect))
                dist = wminkowski(extracted, vect, 2, [5, 1, 1])
                print extracted, vect, dist
                if dist < _best_val:
                    _best_val = dist
                    _best_match = vect
            print ' Best Val', _best_val, _best_match
            _orig = list(original_vals.pop())
            _best_match = map(int, _best_match) # map(int, _min_vect)
            if _orig != _best_match:
                # Report the written value, the chosen candidate and the
                # extracted value, with their weighted distances.
                _mismatch_print = 'Mismatch at (%3d, %3d):\n' % (row, col)
                orig_extracted_dist = wminkowski(_orig, extracted, 2, [5, 1, 1])
                _mismatch_print += ' original = (%3d, %3d, %3d) %6.2f\n' % tuple(_orig +
                    [orig_extracted_dist])
                best_match_dist = wminkowski(_orig, _best_match, 2, [5, 1, 1])
                _mismatch_print += ' closest = (%3d, %3d, %3d) %6.2f\n' % tuple(_best_match + [best_match_dist])
                _mismatch_print += ' extracted = (%3d, %3d, %3d)\n' % tuple(map(int, map(round, extracted)))
                print _mismatch_print
def getClosestTraining(pixel, trainData):
    """Return the index of the row of ``trainData`` closest to ``pixel``.

    Closeness is the weighted Minkowski distance of order 2, using the
    module-level ``dist_weights``.
    """
    def weighted_l2(sample):
        return wminkowski(pixel, sample, 2, dist_weights)

    per_row = np.apply_along_axis(weighted_l2, 1, trainData)
    return np.argmin(per_row)
def exec_similarity(dct, algorithm):
    """Score every answer in ``dct`` against the query with one metric.

    If ``validate_similarity_algorithms(dct, algorithm)`` is truthy an empty
    dict is returned and nothing is computed.  Otherwise, for a supported
    ``algorithm`` name, each dict in ``dct['answers']`` is updated *in place*
    with ``{algorithm: metric(query, answer)}`` where both operands are
    ``ndarray_dict`` conversions of the respective ``'tf_idf'`` entries.

    Returns:
        ``{}`` when validation short-circuits; a list holding the result of
        each ``dict.update`` call (i.e. ``None`` per answer) for a supported
        algorithm; ``None`` for an unsupported one.
    """
    if validate_similarity_algorithms(dct, algorithm):
        return {}

    # 'minkowski' is deliberately absent: its branch was commented out.
    supported = (
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'euclidean', 'mahalanobis', 'seuclidean', 'sqeuclidean',
        'wminkowski', 'dice', 'hamming', 'jaccard', 'kulsinski',
        'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath',
        'yule',
    )
    if algorithm not in supported:
        return None

    # Each supported name is also the module-level name of the metric
    # function; it is resolved per answer, as the original branches did.
    return [
        answer.update({
            algorithm: globals()[algorithm](ndarray_dict(dct['tf_idf']),
                                            ndarray_dict(answer['tf_idf']))
        })
        for answer in dct['answers']
    ]
def getClosestTraining(pixel, trainData):
    """Return the index of the row of ``trainData`` closest to ``pixel``.

    Closeness is the weighted Minkowski distance of order 2, using the
    module-level ``dist_weights``.
    """
    def weighted_l2(sample):
        return wminkowski(pixel, sample, 2, dist_weights)

    per_row = np.apply_along_axis(weighted_l2, 1, trainData)
    return np.argmin(per_row)
def main():
    """Print each ``Distance`` metric next to SciPy's result for comparison.

    Two fixed vectors are fed to every metric of the local ``Distance`` class
    and to the function of the same name in ``scipy.spatial.distance``; the
    two results are printed one after the other, separated by rule lines.
    """
    from scipy.spatial import distance

    a = np.array([1, 2, 43])
    b = np.array([3, 2, 1])
    d = Distance()
    iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])
    rule = '-----------------------------------------------------------------'

    # (printed label, method name, extra args for Distance, extra args for SciPy)
    cases = [
        ('braycurtis', 'braycurtis', (), ()),
        ('canberra', 'canberra', (), ()),
        ('chebyshev', 'chebyshev', (), ()),
        ('cityblock', 'cityblock', (), ()),
        ('correlation', 'correlation', (), ()),
        ('euclidean', 'euclidean', (), ()),
        ('hamming', 'hamming', (), ()),
        ('jaccard', 'jaccard', (), ()),
        ('manhattan', 'cityblock', (), ()),
        ('cosine', 'cosine', (), ()),
        ('dice', 'dice', (), ()),
        ('kulsinski', 'kulsinski', (), ()),
        ('mahalanobis', 'mahalanobis', (iv,), (iv,)),
        ('seuclidean', 'seuclidean',
         (np.array([0.1, 0.1, 0.1]),), ([0.1, 0.1, 0.1],)),
        ('sokalmichener', 'sokalmichener', (), ()),
        ('sokal_sneath', 'sokalsneath', (), ()),
        ('sqeuclidean', 'sqeuclidean', (), ()),
        ('minkowski', 'minkowski', (2,), (2,)),
        ('rogerstanimoto', 'rogerstanimoto', (), ()),
        ('russellrao', 'russellrao', (), ()),
        ('wminkowski', 'wminkowski', (2, np.ones(3)), (2, np.ones(3))),
        ('yule', 'yule', (), ()),
    ]

    for label, method, own_extra, scipy_extra in cases:
        print(rule)
        # Evaluate and print one side at a time so a failure on the SciPy
        # side still leaves the local result on screen.
        print('My {}: {}'.format(label, getattr(d, method)(a, b, *own_extra)))
        print('SciPy {}: {}'.format(
            label, getattr(distance, method)(a, b, *scipy_extra)))
    print(rule)
def _get_weight(self, x, centroid):
    """Return ``exp(-0.5 * scale * d)`` for the distance d of x to centroid.

    ``d`` is the weighted Minkowski distance of order ``self.power`` with
    weights ``self.precisions``.
    """
    damping = -0.5 * self.scale
    spread = distance.wminkowski(x, centroid, self.power, self.precisions)
    return numpy.exp(damping * spread)
def _get_weight(self, x, centroid):
    """Return ``exp(-0.5 * scale * d)`` for the distance d of x to centroid.

    ``d`` is the weighted Minkowski distance of order ``self.power`` with
    weights ``self.precisions``.
    """
    damping = -0.5 * self.scale
    spread = distance.wminkowski(x, centroid, self.power, self.precisions)
    return numpy.exp(damping * spread)