Beispiel #1
0
def calculate_distance(X, Y, metric='euclidean', **kwargs):
    """Compute the distance between two vectors under the selected metric.

    Parameters
    ----------
    X, Y : array-like
        The two vectors to compare.
    metric : object, default 'euclidean'
        Compared with ``==`` against the module-level ``METRIC_*`` constants
        (defined outside this function).
    **kwargs
        Extra keyword arguments forwarded to the SciPy routine for the
        SciPy-backed metrics. This is what makes the Mahalanobis branch
        (needs ``VI``) and the weighted Minkowski branch (needs ``p`` and
        ``w``) callable at all; without them those branches raise TypeError.
        Ignored by the hand-written Manhattan and cosine branches.

    Returns
    -------
    The value produced by the selected routine, or ``None`` when ``metric``
    matches none of the known constants.

    Notes
    -----
    * The cosine branch returns ``dot(X, Y) / (|X| * |Y|)``, i.e. the cosine
      *similarity*, not ``1 - similarity``.
      NOTE(review): confirm callers expect a similarity here.
    * NOTE(review): ``distance.wminkowski`` was removed in SciPy 1.8, so the
      weighted Minkowski branch only works on older SciPy releases.
    """
    # (constant, name of the routine on ``distance``). The attribute is looked
    # up only for the branch that matches, so a routine missing from the
    # installed SciPy cannot break the other metrics.
    scipy_backed = (
        (METRIC_EUCLIDEAN, 'euclidean'),
        (METRIC_JACCARD, 'jaccard'),
        (METRIC_CANBERRA, 'canberra'),
        (METRIC_CHEBYSHEV, 'chebyshev'),
        (METRIC_MINKOWSKI, 'minkowski'),
        (METRIC_WMINKOWSKI, 'wminkowski'),
        (METRIC_BRAYCURTIS, 'braycurtis'),
        (METRIC_HAMMING, 'hamming'),
        (METRIC_MAHALANOBIS, 'mahalanobis'),
    )
    for constant, routine_name in scipy_backed:
        if metric == constant:
            return getattr(distance, routine_name)(X, Y, **kwargs)

    if metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))

    if metric == METRIC_COSINE:
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)

    # Unknown metric: keep the historical "no result" return value.
    return None
def hdbscan_with_knn(data, clf, thresh=None, mink_p=1.5, mink_kwargs=None):
    """Cluster ``data`` with HDBSCAN and label confident points via 1-NN.

    A copy of ``data`` is clustered with HDBSCAN configured from ``clf``.
    A 1-nearest-neighbour classifier is fitted on the cluster exemplars, and
    every observation whose highest soft-membership probability is at least
    ``thresh`` gets the label predicted by that classifier. All other
    observations keep the outlier label ``-1``.

    Parameters
    ----------
    data : DataFrame-like
        Observations to cluster; it is copied, not modified.
    clf : object
        Supplies ``min_cluster_size``, ``min_samples``, ``metric`` and
        ``cluster_selection_method``.
    thresh : float, optional
        Minimum top membership probability for a point to be labelled.
        Any falsy value (``None`` or ``0``) selects the default
        ``1 / max(2, number_of_exemplar_groups)``.
    mink_p : float, default 1.5
        Order ``p`` used when ``clf.metric == 'wminkowski'``.
    mink_kwargs : dict, optional
        Keyword arguments for ``mink_weights``; ``None`` means none.

    Returns
    -------
    The ``label`` column of the working copy. If any step fails, every
    observation is labelled ``0`` instead.
    """
    df = data.copy()
    mc = clf.min_cluster_size
    ms = clf.min_samples
    metric = clf.metric
    clf_method = clf.cluster_selection_method

    try:
        # run hdbscan
        if metric == 'wminkowski':
            # ``mink_kwargs`` defaults to None and ``**None`` raises TypeError,
            # which the handler below used to turn into all-zero labels.
            mw = mink_weights(df, **(mink_kwargs or {}))
            metric = lambda x, y: wminkowski(x, y, p=mink_p, w=mw)

        clusterer = HDBSCAN(min_cluster_size=mc,
                            min_samples=ms,
                            prediction_data=True,
                            metric=metric,
                            cluster_selection_method=clf_method).fit(df)

        thresh = thresh if thresh else 1 / max(2, len(clusterer.exemplars_))

        # get exemplars and labels: each exemplar group gets its own index
        exemplars = np.concatenate([e for e in clusterer.exemplars_])
        labels = np.concatenate([
            np.full((len(e)), fill_value=i)
            for i, e in enumerate(clusterer.exemplars_)
        ])

        # fit knn on exemplars
        knn = KNeighborsClassifier(n_neighbors=1).fit(exemplars, labels)

        # map top soft cluster probabilities to obs
        probs = np.max(all_points_membership_vectors(clusterer), axis=1)
        df['top_prob'] = pd.Series(probs, index=df.index)

        # assign all points to outlier class (label:-1)
        df['label'] = -1

        # take all points above a prob threshold
        obs = df.top_prob >= thresh

        # predict labels from fitted knn, using only the original feature
        # columns (the two helper columns added above are dropped)
        df.loc[obs, 'label'] = knn.predict(
            df.loc[obs, df.columns.drop(['top_prob', 'label'])])
    except Exception:
        # Best-effort fallback kept from the original: any failure puts every
        # observation into a single cluster 0. ``Exception`` (not a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate.
        df['label'] = 0
        return df.label

    # ----------------------- TO-DO -----------------------------
    # allow batch prediction
    # -- 1. assign points below thresh to outlier class
    # -- 2. take top n% of obs by cluster prob and predict label
    # -- 3. refit knn on assigned points
    # -- 4. repeat steps 2 & 3 for remaining percentage bins
    #
    # allow for custom distance metrics and weight in hdbscan call
    return df.label
Beispiel #3
0
    def wminkowski(self, x=None, y=None, p=2, w=np.ones(3)):
        """Weighted Minkowski distance between ``x`` and ``y``.

        Computes ``(sum(|w_i * (x_i - y_i)| ** p)) ** (1 / p)``.

        Any of ``x``, ``y`` or ``w`` passed as ``None`` falls back to the
        instance attribute of the same name (``self.x``, ``self.y``,
        ``self.w``). The fallback tests ``is None`` rather than truthiness,
        because ``array or default`` raises ValueError for NumPy arrays with
        more than one element (including the default ``w``).

        Example inputs::

            x = [1, 0, 0]
            y = [0, 1, 0]

        Parameters
        ----------
        x, y : array-like, optional
            Input vectors.
        p : number, default 2
            Order of the norm.
        w : array-like, optional
            Per-component weights; defaults to three ones.

        Returns
        -------
        The weighted distance as returned by SciPy.
        """
        x = self.x if x is None else x
        y = self.y if y is None else y
        w = self.w if w is None else w
        # ``distance.wminkowski(x, y, p, w)`` was removed in SciPy 1.8; it is
        # equivalent to ``minkowski`` with the weights raised to the power p.
        return distance.minkowski(x, y, p, w=np.asarray(w, dtype=float) ** p)
Beispiel #4
0
def wieghted_euclidean(o1, o2, w=0.1):
    """Weighted order-1 Minkowski distance between two objects.

    Computes ``sum(|w_i * (o1_i - o2_i)|)``.

    NOTE(review): despite the function name, the order passed to SciPy is
    ``p = 1`` (a weighted city-block distance), not ``p = 2``. The order is
    left unchanged so existing results stay the same; confirm which was meant.

    Parameters
    ----------
    o1 : list or array-like
        First object.
    o2 : list or array-like
        Second object.
    w : float or array-like, default 0.1
        Weight applied to every component (scalar) or one weight per
        component.

    Returns
    -------
    float
        The weighted distance between ``o1`` and ``o2``.
    """
    o1, o2 = np.array(o1), np.array(o2)
    # Expand a scalar weight to one weight per component explicitly; newer
    # SciPy releases reject non-1-D weight inputs.
    weights = np.asarray(w, dtype=float) * np.ones(o1.shape)
    # ``distance.wminkowski`` was removed in SciPy 1.8. ``minkowski`` with
    # ``w ** p`` is its equivalent, and for p = 1 that is just ``w``.
    return distance.minkowski(o1, o2, 1, w=weights)
Beispiel #5
0
def compute_distance(X, centroid, type="euclidian", weight=1):
    """Compute the distance between ``X`` and ``centroid``.

    Only the requested distance is evaluated. The previous implementation
    built a dict of all three results on every call, so a failure in the
    weighted Minkowski routine also broke the other two metrics.

    Parameters
    ----------
    X : array-like
        Point to measure from.
    centroid : array-like
        Point to measure to.
    type : str, default "euclidian"
        One of ``"euclidian"``, ``"manhattan"`` or ``"wminkowski"``
        (spelling kept as callers use it).
    weight : scalar or array-like, default 1
        Used only for ``"wminkowski"`` (order 2). A scalar is expanded to one
        identical weight per component, so the default means "all ones".

    Returns
    -------
    The distance computed by the selected SciPy routine.

    Raises
    ------
    KeyError
        If ``type`` is not one of the supported names.
    """
    # ``==`` instead of ``is``: identity comparison against string/int
    # literals is implementation-dependent.
    if type == "euclidian":
        return sp.euclidean(X, centroid)
    if type == "manhattan":
        return sp.cityblock(X, centroid)
    if type == "wminkowski":
        if np.ndim(weight) == 0:
            weight = np.full(len(X), weight, dtype=float)
        # ``sp.wminkowski(u, v, p, w)`` was removed in SciPy 1.8; the
        # equivalent is ``minkowski`` with the weights raised to the power p.
        return sp.minkowski(X, centroid, 2,
                            w=np.asarray(weight, dtype=float) ** 2)
    # Same exception the original dict lookup produced for unknown types.
    raise KeyError(type)
def calcualateSimilarity(question_ebd, relation_ebd, metric, **kwargs):
    """Return the SciPy distance between two embeddings for a named metric.

    Parameters
    ----------
    question_ebd, relation_ebd : array-like
        The two embedding vectors.
    metric : str
        One of ``'braycurtis'``, ``'canberra'``, ``'chebyshev'``,
        ``'cityblock'``, ``'cosine'``, ``'euclidean'``, ``'mahalanobis'`` or
        ``'wminkowski'``.
    **kwargs
        Extra keyword arguments forwarded to the SciPy routine. Required for
        ``'mahalanobis'`` (``VI``) and ``'wminkowski'`` (``p`` and ``w``),
        which raise TypeError when called with just the two vectors.

    Returns
    -------
    The value returned by ``scipy.spatial.distance.<metric>``, or ``None``
    when ``metric`` is not one of the supported names.

    Notes
    -----
    NOTE(review): ``distance.wminkowski`` was removed in SciPy 1.8, so that
    metric only works on older SciPy releases.
    """
    supported = (
        'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'cosine', 'euclidean', 'mahalanobis', 'wminkowski',
    )
    if metric not in supported:
        # Unknown metric: keep the historical "no result" return value.
        return None
    # The metric name matches the SciPy function name, so look it up on demand.
    return getattr(distance, metric)(question_ebd, relation_ebd, **kwargs)
Beispiel #7
0
def wexpL4(u, v, w):
    """Return ``exp`` of the order-4 weighted Minkowski distance of ``u``, ``v``.

    The distance is passed through ``clip(..., a_max=700)`` before it is
    exponentiated.
    NOTE(review): the cap presumably guards ``np.exp`` against overflow, and
    ``clip`` / ``wminkowski`` are names imported elsewhere in the file
    (likely NumPy's and SciPy's) — confirm.
    """
    return np.exp(clip(wminkowski(u, v, 4, w), a_max=700))
Beispiel #8
0
def wL4(u, v, w):
    """Return the order-4 weighted Minkowski distance of ``u`` and ``v``.

    ``w`` is handed to ``wminkowski`` as the weight argument.
    NOTE(review): ``wminkowski`` is imported elsewhere, presumably from
    ``scipy.spatial.distance`` (removed in SciPy 1.8) — confirm.
    """
    order = 4
    return wminkowski(u, v, order, w)
Beispiel #9
0
def wL2(u, v, w):
    """Return the order-2 weighted Minkowski distance of ``u`` and ``v``.

    ``w`` is handed to ``wminkowski`` as the weight argument.
    NOTE(review): ``wminkowski`` is imported elsewhere, presumably from
    ``scipy.spatial.distance`` (removed in SciPy 1.8) — confirm.
    """
    order = 2
    return wminkowski(u, v, order, w)
Beispiel #10
0
def main(argv):
  # Round-trip experiment (Python 2 syntax): write randomly chosen discrete
  # colour values into an 8x8 image as 2x2 pixel blocks, save it as a JPEG,
  # reload it, and check whether each written value can be recovered with a
  # weighted nearest-neighbour match. Results are only printed.
  # `argv` is accepted but never referenced in this function.

  # visualize_ycc()
  # return

  # Parse arguments. No options are declared, so this only provides the
  # default help handling; the parsed result is deliberately discarded.
  parser = argparse.ArgumentParser(
    prog='SixBits', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  args = parser.parse_args()
  _ = args

  # Colors in this dictionary are described in YCbCr space.
  # NOTE: `color` is not referenced anywhere below.
  color = {'white'  : (255, 128, 128),
           'black'  : (0, 128, 128),
           'yellow' : (226, 1, 149),
           'blue'   : (29, 255, 107),
           'red'    : (76, 85, 255),
           'cyan'   : (179, 171, 1),
           'green'  : (150, 44, 21),
           'magenta' : (105, 212, 235)}

  # Placeholder; overwritten by YccLevels.get_discrete_values() below.
  all_values = []

  # NOTE: these two settings are not referenced anywhere below.
  num_discretizations = 9
  side_discrets = 4

  import YccLevels
  all_values = YccLevels.get_discrete_values()
  # Drop duplicate values (ordering is not preserved by set()).
  all_values = list(set(all_values))
  # to_remove = [
  #   (100,  72,  57),
  #   (132,  54, 215),
  #   (147,  45, 205),
  #   (100, 215,  57),
  #   ( 85, 224,  67),
  #   (132, 197, 216),
  #   (147, 188, 205)
  #   ]
  # for _ in to_remove:
  #   all_values.remove(_)

  # all_values.remove((128, 128, 128)) # Remove most ambiguous chunks.

  print 'Values'
  for _val in sorted(all_values):
    print ' ', _val

  num_values = len(all_values)
  import math
  # log2 of the number of distinct values = bits encodable per block.
  print 'Number of distinct values: %d (%.2f bits).' % \
    (num_values, math.log(num_values, 2))
  from scipy.spatial.distance import pdist, euclidean, wminkowski, cosine
  # Pairwise distances between all values. `distances` and `idx_val` are only
  # consumed by the commented-out diagnostic loop further down.
  distances = pdist(all_values)
  count = 0
  # Maps a running pair counter to the (i, j) pair with i < j, enumerated in
  # the same nested order as the loops below.
  idx_val = {}
  for i in range(num_values):
    for j in range(i + 1, num_values):
      idx_val[count] = (i, j)
      count += 1

  # for idx in idx_val:
  #   first, second = idx_val[idx]
  #   if distances[idx] < 28:
  #     print idx, all_values[first], all_values[second], distances[idx]

  possible_values = all_values
  IMAGE_WIDTH = 8
  IMAGE_HEIGHT = 8
  im = Image.new('YCbCr', (IMAGE_WIDTH, IMAGE_HEIGHT))
  pixels = im.load()
  # Values in the order they were written, for comparison after the reload.
  original_vals = []
  # Fill the image in 2x2 blocks, each block holding one randomly chosen value.
  # NOTE(review): the pixel accessor is indexed with `row` first; if it is
  # (x, y)-ordered this is a transposed fill, which is harmless here because
  # the image is square and the read loop below uses the same indexing.
  for row in range(0, IMAGE_HEIGHT, 2):
    for col in range(0, IMAGE_WIDTH, 2):
      val = tuple(map(int, random.choice(possible_values)))
      original_vals.append(val)
      print 'Written:', val
      pixels[row, col] = val
      pixels[row + 1, col] = val
      pixels[row, col + 1] = val
      pixels[row + 1, col + 1] = val
  # Reverse so that pop() in the read loop yields values in write order.
  original_vals.reverse()

  QUALITY = 75
  im.save('test.jpg', quality = QUALITY)

  print 'ALL VALUES', all_values

  # Reload the compressed image and try to recover each block's value.
  opened_im = Image.open('test.jpg')
  pixels = opened_im.load()
  for row in range(0, IMAGE_HEIGHT, 2):
    for col in range(0, IMAGE_WIDTH, 2):
      # Average each of the three channels over the 2x2 block.
      vals = {}
      for idx in range(3):
        val = 0
        val += pixels[row, col][idx]
        val += pixels[row + 1, col][idx]
        val += pixels[row, col + 1][idx]
        val += pixels[row + 1, col + 1][idx]
        val /= 4.0
        vals[idx] = val

      # NOTE(review): the names assume the reopened image exposes RGB
      # channels, which are then converted back with ColorSpace.to_ycc —
      # confirm the mode of `opened_im`.
      red = vals[0]
      green = vals[1]
      blue = vals[2]
      extracted = ColorSpace.to_ycc(red, green, blue)
      print ' Extracted', extracted
      # Initial "best" distance; a candidate must be closer than 1000 to be
      # accepted, otherwise _best_match stays an empty tuple.
      _best_val = 1000
      _best_match = ()

      # Linear scan for the candidate nearest to the extracted value, using a
      # weighted order-2 Minkowski distance in which the first component is
      # weighted 5x relative to the other two.
      for vect in all_values:
        # Python 2: map() returns a list here.
        vect = map(int, map(round, vect))
        dist = wminkowski(extracted, vect, 2, [5, 1, 1])
        print extracted, vect, dist
        if dist < _best_val:
          _best_val = dist
          _best_match = vect
      print ' Best Val', _best_val, _best_match
      # Next value in write order (see the reverse() above).
      _orig = list(original_vals.pop())
      _best_match = map(int, _best_match) # map(int, _min_vect)

      # Report only when the recovered value differs from what was written.
      if _orig != _best_match:
        _mismatch_print = 'Mismatch at (%3d, %3d):\n' % (row, col)
        orig_extracted_dist = wminkowski(_orig, extracted, 2, [5, 1, 1])
        _mismatch_print += '  original  = (%3d, %3d, %3d) %6.2f\n' % tuple(_orig + [orig_extracted_dist])
        best_match_dist = wminkowski(_orig, _best_match, 2, [5, 1, 1])
        _mismatch_print += '  closest   = (%3d, %3d, %3d) %6.2f\n' % tuple(_best_match + [best_match_dist])
        _mismatch_print += '  extracted = (%3d, %3d, %3d)\n' % tuple(map(int, map(round, extracted)))

        print _mismatch_print
Beispiel #11
0
def getClosestTraining(pixel, trainData):
    """Return the index of the row of ``trainData`` closest to ``pixel``.

    Closeness is the order-2 weighted Minkowski distance, weighted by the
    module-level ``dist_weights`` (defined outside this function). The
    distance is evaluated along axis 1 of ``trainData``; the position of the
    smallest one, as reported by ``np.argmin``, is returned.
    """
    def _distance_to_pixel(candidate):
        return wminkowski(pixel, candidate, 2, dist_weights)

    row_distances = np.apply_along_axis(_distance_to_pixel, 1, trainData)
    return np.argmin(row_distances)
Beispiel #12
0
def exec_similarity(dct, algorithm):
    """Score every answer in ``dct`` against the question with ``algorithm``.

    For each entry of ``dct['answers']`` the selected distance function is
    applied to ``ndarray_dict(dct['tf_idf'])`` and
    ``ndarray_dict(answer['tf_idf'])``, and the result is stored in the answer
    dict under the key ``algorithm`` (the answers are updated in place).

    Returns
    -------
    * ``{}`` when ``validate_similarity_algorithms(dct, algorithm)`` is truthy
      (presumably signalling invalid input — verify against that helper);
    * ``None`` when ``algorithm`` is not one of the supported names;
    * otherwise a list with one entry per answer, each ``None`` because it is
      the return value of ``dict.update``.

    Notes
    -----
    NOTE(review): ``mahalanobis``, ``seuclidean`` and ``wminkowski`` are called
    with only the two vectors. If these names are SciPy's functions they need
    further arguments (``VI``, ``V``, ``p``/``w``) and will raise TypeError.
    ``minkowski`` is intentionally not offered (it was commented out).
    """
    if validate_similarity_algorithms(dct, algorithm):
        return {}

    # Each entry is wrapped in a lambda so the distance function's name is
    # resolved only when that algorithm is actually used, exactly as in an
    # if/elif chain.
    metrics = {
        'braycurtis': lambda u, v: braycurtis(u, v),
        'canberra': lambda u, v: canberra(u, v),
        'chebyshev': lambda u, v: chebyshev(u, v),
        'cityblock': lambda u, v: cityblock(u, v),
        'correlation': lambda u, v: correlation(u, v),
        'cosine': lambda u, v: cosine(u, v),
        'euclidean': lambda u, v: euclidean(u, v),
        'mahalanobis': lambda u, v: mahalanobis(u, v),
        'seuclidean': lambda u, v: seuclidean(u, v),
        'sqeuclidean': lambda u, v: sqeuclidean(u, v),
        'wminkowski': lambda u, v: wminkowski(u, v),
        'dice': lambda u, v: dice(u, v),
        'hamming': lambda u, v: hamming(u, v),
        'jaccard': lambda u, v: jaccard(u, v),
        'kulsinski': lambda u, v: kulsinski(u, v),
        'rogerstanimoto': lambda u, v: rogerstanimoto(u, v),
        'russellrao': lambda u, v: russellrao(u, v),
        'sokalmichener': lambda u, v: sokalmichener(u, v),
        'sokalsneath': lambda u, v: sokalsneath(u, v),
        'yule': lambda u, v: yule(u, v),
    }
    metric = metrics.get(algorithm)
    if metric is None:
        return None

    outcomes = []
    for answer in dct['answers']:
        score = metric(ndarray_dict(dct['tf_idf']),
                       ndarray_dict(answer['tf_idf']))
        outcomes.append(answer.update({algorithm: score}))
    return outcomes
def getClosestTraining(pixel, trainData):
    """Return the index of the row of ``trainData`` nearest to ``pixel``.

    Each slice along axis 1 of ``trainData`` is compared with ``pixel`` using
    the order-2 weighted Minkowski distance and the module-level
    ``dist_weights`` (defined outside this function); ``np.argmin`` of those
    distances is returned.
    """
    def _weighted_l2(row):
        return wminkowski(pixel, row, 2, dist_weights)

    dists = np.apply_along_axis(_weighted_l2, 1, trainData)
    nearest = np.argmin(dists)
    return nearest
Beispiel #14
0
def main():
    """Print the project's ``Distance`` results next to SciPy's for two vectors.

    For each metric the value from ``Distance()`` is printed first, then the
    value from ``scipy.spatial.distance``, then a separator line. The
    "manhattan" entry uses ``cityblock`` on both sides and "sokal_sneath"
    uses ``sokalsneath``.
    """
    from scipy.spatial import distance

    a = np.array([1, 2, 43])
    b = np.array([3, 2, 1])
    d = Distance()

    rule = '-----------------------------------------------------------------'
    iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])

    # (printed label, function name, extra args for Distance, extra args for SciPy)
    comparisons = [
        ('braycurtis', 'braycurtis', (), ()),
        ('canberra', 'canberra', (), ()),
        ('chebyshev', 'chebyshev', (), ()),
        ('cityblock', 'cityblock', (), ()),
        ('correlation', 'correlation', (), ()),
        ('euclidean', 'euclidean', (), ()),
        ('hamming', 'hamming', (), ()),
        ('jaccard', 'jaccard', (), ()),
        ('manhattan', 'cityblock', (), ()),
        ('cosine', 'cosine', (), ()),
        ('dice', 'dice', (), ()),
        ('kulsinski', 'kulsinski', (), ()),
        ('mahalanobis', 'mahalanobis', (iv,), (iv,)),
        ('seuclidean', 'seuclidean',
         (np.array([0.1, 0.1, 0.1]),), ([0.1, 0.1, 0.1],)),
        ('sokalmichener', 'sokalmichener', (), ()),
        ('sokal_sneath', 'sokalsneath', (), ()),
        ('sqeuclidean', 'sqeuclidean', (), ()),
        ('minkowski', 'minkowski', (2,), (2,)),
        ('rogerstanimoto', 'rogerstanimoto', (), ()),
        ('russellrao', 'russellrao', (), ()),
        ('wminkowski', 'wminkowski', (2, np.ones(3)), (2, np.ones(3))),
        ('yule', 'yule', (), ()),
    ]

    print(rule)
    for label, func_name, own_extra, scipy_extra in comparisons:
        # Compute and print one side at a time so that a failure on the SciPy
        # side still leaves the "My" line printed.
        own_value = getattr(d, func_name)(a, b, *own_extra)
        print('My       {}: {}'.format(label, own_value))
        scipy_value = getattr(distance, func_name)(a, b, *scipy_extra)
        print('SciPy    {}: {}'.format(label, scipy_value))
        print(rule)
 def _get_weight(self, x, centroid):
     """Return ``exp(-0.5 * self.scale * d)`` for the distance ``d`` to ``centroid``.

     ``d`` is the weighted Minkowski distance between ``x`` and ``centroid``
     of order ``self.power`` with weights ``self.precisions``.
     NOTE(review): ``distance.wminkowski`` was removed in SciPy 1.8 — confirm
     the SciPy version this runs against.
     """
     dist = distance.wminkowski(x, centroid, self.power, self.precisions)
     return numpy.exp(-0.5 * self.scale * dist)
 def _get_weight(self, x, centroid):
     """Weight of ``x`` relative to ``centroid``: ``exp(-0.5 * scale * d)``.

     ``d`` is ``distance.wminkowski(x, centroid, self.power, self.precisions)``
     and ``scale`` is ``self.scale``.
     NOTE(review): ``distance.wminkowski`` was removed in SciPy 1.8 — confirm
     the SciPy version this runs against.
     """
     exponent = -0.5 * self.scale * distance.wminkowski(
         x, centroid, self.power, self.precisions)
     return numpy.exp(exponent)