Example #1
def test_paired_distances():
    """ Test the paired_distances helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in PAIRED_DISTANCES.items():
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check that the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y,
                          metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
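For reference, the core relationship these tests exercise: `paired_distances` computes only the i-th-row-to-i-th-row distances, i.e. the diagonal of the full `pairwise_distances` matrix. A minimal sketch with toy data:

```python
# Minimal sketch of the invariant checked above: paired_distances returns
# the diagonal of the full pairwise_distances matrix. Toy data is assumed.
import numpy as np
from sklearn.metrics.pairwise import paired_distances, pairwise_distances

X = np.array([[0.0, 0.0], [1.0, 1.0]])
Y = np.array([[3.0, 4.0], [1.0, 2.0]])

print(paired_distances(X, Y))             # [5. 1.]
print(np.diag(pairwise_distances(X, Y)))  # [5. 1.]
```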
Example #2
    def movement_df(self, other, metric="euclidean"):
        """
        Creates a dataframe that shows the movement from one embeddingset to another one.

        Arguments:
            other: the other embeddingset to compare against, will only keep the overlap
            metric: metric to use to calculate movement, must be scipy or sklearn compatible

        Usage:

        ```python
        from whatlies.language import SpacyLanguage

        lang = SpacyLanguage("en_core_web_sm")

        names = ['red', 'blue', 'green', 'yellow', 'cat', 'dog', 'mouse', 'rat', 'bike', 'car']
        emb = lang[names]
        emb_ort = lang[names] | lang['cat']
        emb.movement_df(emb_ort)
        ```
        """
        overlap = list(
            set(self.embeddings.keys()).intersection(
                set(other.embeddings.keys())))
        mat1 = np.array([w.vector for w in self[overlap]])
        mat2 = np.array([w.vector for w in other[overlap]])
        return (pd.DataFrame({
            "name": overlap,
            "movement": paired_distances(mat1, mat2, metric)
        }).sort_values(["movement"], ascending=False).reset_index())
Example #3
def compute_distances(vectorizer,
                      data,
                      vectorizer_name="",
                      train=True,
                      col1_name="question1",
                      col2_name="question2"):
    vector1 = vectorizer.transform(data[col1_name].values)
    vector2 = vectorizer.transform(data[col2_name].values)
    print(vectorizer_name, " data prepared")
    for distance_type in DISTANCES:
        metric_name = "distance_{0}_{1}".format(vectorizer_name, distance_type)
        metric_values = np.array([])
        if distance_type in ['cosine', 'l1', 'l2']:
            metric_values = paired_distances(vector1,
                                             vector2,
                                             metric=distance_type)
        else:
            metric = lambda x, y: pairwise_distances(
                x.reshape(1, -1), y.reshape(1, -1), metric=distance_type)
            for el1, el2 in zip(vector1, vector2):
                metric_values = np.append(metric_values,
                                          metric(el1.toarray(), el2.toarray()))
        if distance_type in ['canberra', 'l1']:
            if train:
                print(metric_name)
                metric_values = fit_min_max_scale(metric_values.reshape(
                    -1, 1)).flatten()
            else:
                metric_values = min_max_scale(metric_values.reshape(-1, 1),
                                              metric_name)
        data[metric_name] = metric_values
        print("metric {0}, minimum {1:.4f}, maximum {2:.4f}".format(
            distance_type, np.min(metric_values), np.max(metric_values)))
Example #4
    def transform_utterance(self, utt):
        """
        Computes vector representations, ranges, and cluster assignments for an utterance, using the two `ExpectedContextModelTransformer` instances. Also computes utterance-level orientation and shift. Note that the utterance must contain the input representation as a metadata field, specified by what was passed into the constructor as the `vect_field` argument.
        Will write all of these characterizations (including vectors) to the utterance's metadata.

        :param utt: Utterance
        :return: the utterance, with per-utterance attributes.
        """
        utt = self.ec_models[0].transform_utterance(utt)
        utt = self.ec_models[1].transform_utterance(utt)
        if self.wrapper_output_prefix == '':
            orn_field = 'orn'
            shift_field = 'shift'
        else:
            orn_field = self.wrapper_output_prefix + '_orn'
            shift_field = self.wrapper_output_prefix + '_shift'

        utt.meta[orn_field] = utt.meta[self.output_prefixes[0] + '_range'] \
            - utt.meta[self.output_prefixes[1] + '_range']

        utt.meta[shift_field] = float(
            paired_distances(
                np.array([utt.meta[self.output_prefixes[0] + '_repr']]),
                np.array([utt.meta[self.output_prefixes[1] + '_repr']]))[0])
        return utt
Example #5
    def transform(self, corpus, selector=lambda x: True):
        """
        Computes vector representations, ranges, and cluster assignments for utterances in a corpus, using the two `ExpectedContextModelTransformer` instances. Also computes utterance-level orientation and shift.

        :param corpus: Corpus
        :param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. defaults to all utterances.
        :return: the Corpus, with per-utterance attributes.
        """
        self.ec_models[0].transform(corpus, selector=selector)
        self.ec_models[1].transform(corpus, selector=selector)
        if self.wrapper_output_prefix == '':
            orn_field = 'orn'
            shift_field = 'shift'
        else:
            orn_field = self.wrapper_output_prefix + '_orn'
            shift_field = self.wrapper_output_prefix + '_shift'

        for ut in corpus.iter_utterances(selector=selector):
            ut.meta[orn_field] = ut.meta[
                self.output_prefixes[0] +
                '_range'] - ut.meta[self.output_prefixes[1] + '_range']

        utt_shifts = paired_distances(
            corpus.get_vectors(self.output_prefixes[0] + '_repr'),
            corpus.get_vectors(self.output_prefixes[1] + '_repr'))
        for id, shift in zip(
                corpus.get_vector_matrix(self.output_prefixes[0] +
                                         '_repr').ids, utt_shifts):
            corpus.get_utterance(id).meta[shift_field] = shift
Example #6
def test_encoder(metric='euclidean'):
    root = '../../../data/AIC20_track3/train/S03'
    cams = ['c010', 'c011', 'c012', 'c013', 'c014', 'c015']

    detections = {}
    cap = {}
    for cam in cams:
        frame_detections = defaultdict(list)
        for det in parse_annotations_from_txt(
                os.path.join(root, cam, 'mtsc', 'mtsc_tc_mask_rcnn.txt')):
            if det.height >= 128 and det.width >= 128:
                frame_detections[det.frame].append(det)
        detections[cam] = frame_detections
        cap[cam] = cv2.VideoCapture(os.path.join(root, cam, 'vdo.avi'))

    def random_detection(cam=None, id=None):
        if cam is None:
            cam = np.random.choice(cams)
        if id is None:
            frame = np.random.choice(list(detections[cam].keys()))
            det = np.random.choice(detections[cam][frame])
        else:
            for frame in np.random.permutation(list(detections[cam].keys())):
                found = False
                for det in detections[cam][frame]:
                    if det.id == id:
                        found = True
                        break
                if found:
                    break
            else:
                raise ValueError(f'id {id} not found in cam {cam}')
        cap[cam].set(cv2.CAP_PROP_POS_FRAMES, det.frame)
        ret, img = cap[cam].read()
        img = img[int(det.ytl):int(det.ybr), int(det.xtl):int(det.xbr)]
        return img, (cam, det.id)

    encoder = Encoder(path='../metric_learning/checkpoints/epoch_19__ckpt.pth')
    print(encoder)
    encoder.eval()

    pairs = [(('c010', 15), ('c011', 29)), None]
    for p in pairs:
        if p is not None:
            img1, info1 = random_detection(*p[0])
            img2, info2 = random_detection(*p[1])
        else:
            img1, info1 = random_detection()
            img2, info2 = random_detection()

        embd1 = encoder.get_embedding(img1)
        embd2 = encoder.get_embedding(img2)

        dist = paired_distances([embd1], [embd2], metric=metric).squeeze()
        print(dist)

        cv2.imshow('{}:{}'.format(*info1), img1)
        cv2.imshow('{}:{}'.format(*info2), img2)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
Example #7
def cost_func(x0, *args) -> float:
    """Cost function to optimize weights from which the best trajectory for a joint is calculated.
    
    :param x0: 3 lambda weights for linear combination of marker vectors to retrieve joint location.
    :type x0: numpy.ndarray
    :param args: marker trajectories matrix,
                 marker indices belonging to rigid body 1 & rigid body 2, distance penalty weight factor.
    :type args: tuple
    :return: cost
    :rtype: float
    """
    trajectories = args[0]
    rigid1_indices = args[1]
    rigid2_indices = args[2]
    penalty = float(args[3])
    # First, construct the joint trajectory from rigid body 1 and weights.
    j = joint_from_markers(trajectories[:, rigid1_indices, :], x0)
    all_marker_indices = rigid1_indices + rigid2_indices
    # Todo: Is there a faster way? Distances of all markers to joint in parallel. Or use n_jobs for speedup?
    # Then calculate cost q.
    distances_to_joint = np.array([
        paired_distances(t, j)  # paired_distances has no n_jobs parameter
        for t in np.swapaxes(trajectories[:, all_marker_indices], 0, 1)
    ])
    mean_distances = np.mean(distances_to_joint, axis=1)
    var_distances = np.var(distances_to_joint, axis=1)
    q = (var_distances +
         penalty * mean_distances).sum() / len(all_marker_indices)
    return q
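A hedged sketch of how `cost_func` could be driven by an optimizer. The `trajectories` array, marker index lists, and `penalty` value below are made-up placeholders, and the sketch assumes `cost_func`'s module context (including `joint_from_markers`); only the `args` tuple layout matches the function above.

```python
# Hypothetical driver for cost_func, sketched with scipy.optimize.
# `trajectories`, the marker index lists, and `penalty` are assumed values.
import numpy as np
from scipy.optimize import minimize

trajectories = np.random.rand(100, 6, 3)  # (frames, markers, xyz), assumed shape
rigid1, rigid2 = [0, 1, 2], [3, 4, 5]
penalty = 0.1

res = minimize(cost_func,
               x0=np.array([1 / 3, 1 / 3, 1 / 3]),  # initial lambda weights
               args=(trajectories, rigid1, rigid2, penalty),
               method='Nelder-Mead')
print(res.x)  # optimized weights
```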
Example #8
def find_shot_time(data):
    num = np.unique(data['shot_num'])
    for u in range(0, 1):
        one_shot = data.loc[data['shot_num'] == num[u]]
        orig_shot_time = one_shot.loc[one_shot['shot_ind'] ==
                                      1]['game_clock'].item()
        shooter = one_shot.iloc[0]['shooter']
        time_range = one_shot.loc[
            (one_shot['game_clock'] <= orig_shot_time)
            & (one_shot['game_clock'] >= orig_shot_time - 500)]
        player_ball = time_range.loc[(time_range['player_name'] == 'ball') |
                                     (time_range['player_name'] == shooter)]
        moms = np.unique(player_ball['moment'])
        time_dist = pd.DataFrame(columns=['moment', 'distance'],
                                 index=np.arange(len(moms)))
        for p in range(0, len(moms)):
            sub = player_ball.loc[player_ball['moment'] == moms[p]]
            player_x = sub.loc[sub['player_name'] == shooter, ]['x_loc'].item()
            player_y = sub.loc[sub['player_name'] == shooter, ]['y_loc'].item()
            ball_x = sub.loc[sub['player_name'] == 'ball', ]['x_loc'].item()
            ball_y = sub.loc[sub['player_name'] == 'ball', ]['y_loc'].item()
            # paired_distances expects 2D arrays: one row per sample
            dist = paired_distances([[player_x, player_y]],
                                    [[ball_x, ball_y]])[0]
            time_dist.loc[p, 'moment'] = moms[p]
            time_dist.loc[p, 'distance'] = dist
        return time_dist
Example #9
def fitness_manhattan_similarity_avg(catalog_matrix, exposure_data,
                                     original_data):
    new_matrix = np.dot(np.array(catalog_matrix), np.array(exposure_data))
    similarity = paired_distances(new_matrix.reshape(-1, 1),
                                  np.array(original_data).reshape(-1, 1),
                                  metric='manhattan')
    return -np.average(similarity)
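Worth noting: with both inputs reshaped to column vectors, paired 'manhattan' distances reduce to elementwise absolute differences, so this fitness is simply the negated mean absolute reconstruction error. A quick check with assumed toy values:

```python
# Quick check (toy values): column-vector inputs make paired manhattan
# distances identical to elementwise absolute differences.
import numpy as np
from sklearn.metrics.pairwise import paired_distances

a = np.array([1.0, 2.0, 5.0])
b = np.array([0.5, 4.0, 5.0])
d = paired_distances(a.reshape(-1, 1), b.reshape(-1, 1), metric='manhattan')
assert np.allclose(d, np.abs(a - b))  # [0.5, 2.0, 0.0]
```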
Example #10
def fitness_manhattan_similarity_sum(catalog_matrix, exposure_data,
                                     original_data):
    new_matrix = np.dot(np.array(catalog_matrix), np.array(exposure_data))
    similarity = paired_distances(new_matrix.reshape(1, -1),
                                  np.array(original_data).reshape(1, -1),
                                  metric='euclidean')
    return -np.sum(similarity)
Example #11
def getDistances(x, y):

    distances = {}
    distances['mae'] = mean_absolute_error(x, y)
    distances['mse'] = mean_squared_error(x, y)
    distances['euclidean'] = np.mean(paired_distances(x, y, metric='euclidean'))
    distances['manhattan'] = np.mean(paired_distances(x, y, metric='manhattan'))
    distances['cosine'] = np.mean(paired_distances(x, y, metric='cosine'))

    # round all metrics to 5 decimal places
    return {key: round(value, 5) for key, value in distances.items()}
Example #12
def findClosestCentroids(X, centroids):
    idx = np.zeros((len(X), ))
    for i in range(len(X)):
        x = np.repeat(X[i].reshape(-1, 1), len(centroids), axis=1).T
        eucl_dist = paired_distances(x, centroids)
        idx[i] = np.argmin(eucl_dist)
    return idx
Example #13
def delaunay_graph(X, weighted=False):
  '''Delaunay triangulation graph.
  '''
  e1, e2 = _delaunay_edges(X)
  pairs = np.column_stack((e1, e2))
  w = paired_distances(X[e1], X[e2]) if weighted else None
  return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0], symmetric=True,
                               weights=w)
Example #14
def test_paired_distances_callable():
    # Test the paired_distances helper function
    # with the callable implementation
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
Example #16
def gabriel_graph(X, metric='euclidean'):
  a,b = np.triu_indices(X.shape[0], k=1)
  midpoints = (X[a] + X[b]) / 2
  Dmid = pairwise_distances(midpoints, X, metric=metric).min(axis=1)
  Dedge = paired_distances(X[a], X[b], metric=metric)
  mask = (Dedge - Dmid * 2) < 1e-10
  pairs = np.transpose((a[mask],b[mask]))
  return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0], symmetric=True)
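The mask above encodes the Gabriel criterion: an edge (a, b) survives only if no third point lies inside the circle having ab as diameter, i.e. the nearest point to the edge's midpoint is at least half the edge length away. A small sketch with assumed points:

```python
# Small sketch of the Gabriel criterion on assumed points: a point closer
# to the midpoint than half the edge length sits inside the circle with
# that edge as diameter, so the edge is pruned.
import numpy as np

p, q = np.array([0.0, 0.0]), np.array([2.0, 0.0])
blocker = np.array([1.0, 0.1])            # inside the diameter circle
mid = (p + q) / 2
half_edge = np.linalg.norm(q - p) / 2     # 1.0
print(np.linalg.norm(blocker - mid) < half_edge)  # True -> edge (p, q) pruned
```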
Example #17
 def movement_df(self, other, metric="euclidean"):
     overlap = list(
         set(self.embeddings.keys()).union(set(other.embeddings.keys())))
     mat1 = np.array([w.vector for w in self[overlap]])
     mat2 = np.array([w.vector for w in other[overlap]])
     return pd.DataFrame({
         'name': overlap,
          'movement': paired_distances(mat1, mat2, metric=metric)
     }).sort_values(['movement'], ascending=False).reset_index()
Example #18
 def get_average_displacement(df):
     if len(df) > 1:
         test = paired_distances(
             df[[self.identifiers[0], self.identifiers[1]]].iloc[1:, :],
             df[[self.identifiers[0],
                 self.identifiers[1]]].shift().iloc[1:, :])
         return np.mean(test)
     else:
         pass
Example #19
def analogy_solver(man, woman, king, W, top_n=5, return_score=False):
    """
    In the famous "man is to woman as king is to queen" example, queen
    is the word w that maximizes: cos(w, king) - cos(w, man) + cos(w, woman).
    """
    A = np.array([king] * len(W))
    B = np.array([man] * len(W))
    Y = np.array([woman] * len(W))

    score = (paired_distances(W, A, metric='cosine')
             - paired_distances(W, B, metric='cosine')
             + paired_distances(W, Y, metric='cosine'))

    sorted_score = score.argsort()  # ascending: best analogy candidates first

    if not return_score:
        return sorted_score[:top_n]
    else:
        return sorted_score[:top_n], score[sorted_score][:top_n]
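A hedged usage sketch; `vocab` and the random matrix `W` are made-up stand-ins for a real embedding table, and only the call shape matches `analogy_solver` above.

```python
# Hypothetical usage of analogy_solver. `vocab` and `W` are assumed
# stand-ins: W stacks one embedding vector per vocabulary word.
import numpy as np

rng = np.random.RandomState(0)
vocab = ['man', 'woman', 'king', 'queen', 'apple']
W = rng.rand(len(vocab), 50)
emb = dict(zip(vocab, W))

top = analogy_solver(emb['man'], emb['woman'], emb['king'], W, top_n=3)
print([vocab[i] for i in top])
```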
Example #20
def gabriel_graph(X, metric='euclidean', weighted=False):
  n = X.shape[0]
  a, b = np.triu_indices(n, k=1)
  midpoints = (X[a] + X[b]) / 2
  _, Dmid = pairwise_distances_argmin_min(midpoints, X, metric=metric)
  Dedge = paired_distances(X[a], X[b], metric=metric)
  mask = (Dedge - Dmid * 2) < 1e-10
  pairs = np.column_stack((a[mask], b[mask]))
  w = Dedge[mask] if weighted else None
  return Graph.from_edge_pairs(pairs, num_vertices=n, symmetric=True, weights=w)
Example #21
    def _converged(self, old_centers, iteration):
        if old_centers is None:
            return False

        diff = np.sum(paired_distances(self.centers, old_centers))

        if self.verbose:
            print("Iteration %s - Convergence crit. = %s" % (iteration, diff))

        return diff < self.tol**2 or iteration >= self.max_iter
Example #22
def embedding_distance_bulk(
        embeddings1: Embedding,
        embeddings2: Embedding,
        distance_metric: DistanceMetric) -> np.ndarray:
    """Compares the distance between two arrays of embeddings
    """
    if distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
        return np.square(
            paired_distances(
                embeddings1,
                embeddings2,
                metric='euclidean'))
    elif distance_metric == DistanceMetric.ANGULAR_DISTANCE:
        # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
        similarity = 1 - paired_distances(
            embeddings1,
            embeddings2,
            metric='cosine')
        return np.arccos(similarity) / math.pi
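Because the arccos of the cosine similarity is normalized by pi, the angular distance lands in [0, 1]: identical directions give 0, opposite directions give 1. A quick numeric check with assumed vectors:

```python
# Quick check of the angular-distance normalization with toy vectors:
# orthogonal unit vectors have cosine similarity 0 -> arccos(0)/pi = 0.5.
import math
import numpy as np
from sklearn.metrics.pairwise import paired_distances

u = np.array([[1.0, 0.0]])
v = np.array([[0.0, 1.0]])
sim = 1 - paired_distances(u, v, metric='cosine')
print(np.arccos(sim) / math.pi)  # [0.5]
```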
Example #23
def delaunay_graph(X, weighted=False):
    '''Delaunay triangulation graph.'''
    e1, e2 = _delaunay_edges(X)
    pairs = np.column_stack((e1, e2))
    w = paired_distances(X[e1], X[e2]) if weighted else None
    return Graph.from_edge_pairs(pairs,
                                 num_vertices=X.shape[0],
                                 symmetric=True,
                                 weights=w)
Example #24
    def calculate(self, pairs: Iterable[Pair]) -> np.ndarray:
        embeddings1 = []
        embeddings2 = []
        for pair in pairs:
            embeddings1.append(pair.image1)
            embeddings2.append(pair.image2)
        if self._distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
            return np.square(
                paired_distances(embeddings1, embeddings2, metric='euclidean'))
        if self._distance_metric == DistanceMetric.ANGULAR_DISTANCE:
            # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
            similarity = 1 - paired_distances(
                embeddings1, embeddings2, metric='cosine')
            return np.arccos(similarity) / math.pi
        metrics = [str(metric) for metric in DistanceMetric]
        err = f"Undefined {DistanceMetric.__qualname__}. \
Choose from {metrics}"

        raise DistanceMetricException(err)
Example #25
def node_feature_similarities(node_features, sources, sinks):
    similarities = []
    row_features = node_features[sources]
    col_features = node_features[sinks]
    for metric in [cosine, euclidean, rbf]:
        similarities.append(
            paired_distances(row_features, col_features, metric=metric))
    similarities = np.array(similarities).T
    logger.info(f'node_feature_similarities generated: {similarities.shape}')
    return similarities
Example #26
 def vertex_connectivity(self, surface: Surface, mode: str="sparse", metric: Optional[str]=None,
                         symmetric: bool=False, verts_mask: Union[numpy.ndarray, list]=None) \
         -> Union[numpy.ndarray, scipy.sparse.csr_matrix]:
     """
     It computes a sparse matrix of the connectivity among the vertices of a surface.
     :param surface: input surface object
     :param mode: "sparse" by default or "2D"
     :param metric: None by default, could be "euclidean"
     :param symmetric: True for symmetric matrix output
     :param verts_mask: a mask to apply the method to a sub-surface of the original surface
     :return: the computed matrix.
     """
     if verts_mask is not None:
         (vertices, triangles) = self.extract_subsurf(
             surface, verts_mask, output='verts_triangls')[:2]
     else:
         vertices = surface.vertices
         triangles = surface.triangles
     # Get all pairs of vertex indexes (i.e., edges) that appear in each
     # face (triangle)
     edges = numpy.r_[triangles[:, [0, 1]],
                      triangles[:, [1, 2]], triangles[:, [2, 0]]]
     # Remove repetitions
     edges = numpy.vstack(list(set(map(tuple, edges))))
     # Mark all existing pairs to 1
     n_v = vertices.shape[0]
     n_e = edges.shape[0]
     # For symmetric output...
     if symmetric:
         # ...create for the moment the "double" edges
         edges2 = numpy.r_[edges, edges[:, [1, 0]]]
     if metric is None:
         # For symmetric output...
         if symmetric:
             # ...remove repetitions of edges2
             edges = numpy.vstack(list(set(map(tuple, edges2))))
             n_e = edges.shape[0]
         con = csr_matrix(
             (numpy.ones((n_e,)), (edges[:, 0], edges[:, 1])), shape=(n_v, n_v))
         if mode != "sparse":
             # Create non-sparse matrix
             con = con.todense()
     else:
         d = paired_distances(vertices[edges[:, 0]],
                              vertices[edges[:, 1]], metric=metric)
         # For symmetric output...
         if symmetric:
             # double also d...
             d = numpy.r_[d, d]
             edges = edges2
         if mode == "sparse":
             # Create sparse matrix
             con = csr_matrix(
                 (d, (edges[:, 0], edges[:, 1])), shape=(n_v, n_v))
     return con
Example #27
    def fit(self, attributes0, label0):
        attributes = np.array(attributes0)
        label = np.array(label0)

        self.clustering_model.fit(attributes)
        pred = self.clustering_model.predict(attributes)

        lab = pd.Series(label, name='lab', dtype=int)
        pre = pd.Series(pred, name='pre', dtype=int)
        kmcomp = pd.concat([lab, pre], axis=1)

        kmc1 = kmcomp[kmcomp['lab'] == 1]
        kmc0 = kmcomp[kmcomp['lab'] == 0]

        sta1 = kmc1['pre'].groupby(kmc1['pre']).count()
        sta0 = kmc0['pre'].groupby(kmc0['pre']).count()

        align = pd.concat([sta1 / len(kmc1), sta0 / len(kmc0)],
                          axis=1).fillna(0)
        align.columns = ['sta1a', 'sta0a']

        sta1a = align['sta1a']
        sta0a = align['sta0a']

        dif = sta1a / (sta1a + sta0a)
        result = pd.concat(
            [pd.concat([sta1, sta0, sta1a, sta0a], axis=1).fillna(0), dif],
            axis=1)
        result.columns = ['sta1', 'sta0', 'sta1a', 'sta0a', 'dif']
        resee = result.sort_values('dif', ascending=False)
        resee['cumsta1a'] = np.cumsum(resee['sta1a'])
        resee['cumsta0a'] = np.cumsum(resee['sta0a'])
        resee['cumsta1'] = np.cumsum(resee['sta1'])
        resee['cumsta0'] = np.cumsum(resee['sta0'])

        kmcomp['dis'] = pd.Series(
            map(
                lambda x1, x2: paired_distances(
                    x1.reshape(1, -1), self.clustering_model.cluster_centers_[
                        x2].reshape(1, -1))[0], attributes, pred))

        distance_max = kmcomp['dis'].groupby(
            kmcomp['pre']).max().rename('distance_max')
        distance_mean = kmcomp['dis'].groupby(
            kmcomp['pre']).mean().rename('distance_mean')
        distances = pd.concat([distance_max, distance_mean], axis=1)
        resee = pd.merge(resee,
                         pd.DataFrame(distances),
                         left_index=True,
                         right_index=True)

        self.comparison_summary = resee
        self.ordered_centers = pd.DataFrame(
            self.clustering_model.cluster_centers_).iloc[resee.index]
Example #28
    def getdd(self, tx, ty):
        # calculate distances and pad the shorter array
        # with its last value so both have equal length
        gap = tx.shape[0] - ty.shape[0]
        if gap > 0:
            ty = np.pad(ty, ((0, gap), (0, 0)), 'edge')
        elif gap < 0:
            tx = np.pad(tx, ((0, -gap), (0, 0)), 'edge')
        # use any distance metric that you would like
        return paired_distances(tx, ty, metric='l2').sum()
Example #29
def statistic(centers, labels, vectors):
    times = dict(zip(*np.unique(labels, return_counts=True)))
    for i in times:
        time = times[i]
        var = 0
        total = vectors[0] * 0  # running sum; avoids shadowing the builtin sum()
        num = 0
        for index, label in enumerate(labels):
            if label == i:
                num += 1
                total += vectors[index]
        mean = total / num
        for index, label in enumerate(labels):
            if label == i:
                var += paired_distances(vectors[index].reshape(1, -1),
                                        mean.reshape(1, -1))
        var /= num
        norm = paired_distances(centers[i].reshape(1, -1),
                                (centers[i] * 0).reshape(1, -1))
        times[i] = [time, var, norm]
    return times
Example #30
def getdd(tx, ty):
    # calculate distances and pad the shorter array
    # with its last value so both have equal length
    gap = tx.shape[0] - ty.shape[0]
    if gap > 0:
        ty = np.pad(ty, ((0, gap), (0, 0)), 'edge')
    elif gap < 0:
        tx = np.pad(tx, ((0, -gap), (0, 0)), 'edge')
    # use any distance metric that you would like
    return paired_distances(tx, ty, metric='l2').sum()
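A toy demonstration of the edge-padding used by `getdd`: the shorter trajectory is extended by repeating its final row so the paired L2 distances line up.

```python
# Toy demonstration of 'edge' padding: the shorter trajectory is extended
# by repeating its final row before paired L2 distances are summed.
import numpy as np

ty = np.array([[0.0, 0.0], [1.0, 1.0]])
padded = np.pad(ty, ((0, 2), (0, 0)), 'edge')
print(padded)
# [[0. 0.]
#  [1. 1.]
#  [1. 1.]
#  [1. 1.]]
```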
Example #31
def distance_cluster(image_feats, clusters):
    X = []
    len_clusters = clusters.shape[0]
    """
    For each row, calculate its distance to each k-cluster and determine the k-cluster with the smallest distance
    and build a matrix of these k-clusters
    """
    for e in image_feats:
        row_matrix = np.tile(e, (len_clusters, 1))
        distances = paired_distances(row_matrix, clusters)
        X.append(np.argmin(distances))
    return X
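The same assignment can be done without the explicit loop. A sketch (not from the original repo) using scikit-learn's `pairwise_distances_argmin`, which is equivalent under the default euclidean metric:

```python
# A loop-free sketch of the same nearest-cluster assignment, assuming
# image_feats and clusters are 2D arrays as in distance_cluster above.
from sklearn.metrics import pairwise_distances_argmin

def distance_cluster_vectorized(image_feats, clusters):
    # index of the nearest cluster centre for every feature row
    return pairwise_distances_argmin(image_feats, clusters).tolist()
```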
Example #32
def gabriel_graph(X, metric='euclidean', weighted=False):
    n = X.shape[0]
    a, b = np.triu_indices(n, k=1)
    midpoints = (X[a] + X[b]) / 2
    _, Dmid = pairwise_distances_argmin_min(midpoints, X, metric=metric)
    Dedge = paired_distances(X[a], X[b], metric=metric)
    mask = (Dedge - Dmid * 2) < 1e-10
    pairs = np.column_stack((a[mask], b[mask]))
    w = Dedge[mask] if weighted else None
    return Graph.from_edge_pairs(pairs,
                                 num_vertices=n,
                                 symmetric=True,
                                 weights=w)
Example #33
def urquhart_graph(X, weighted=False):
  '''Urquhart graph: made from the 2 shortest edges of each Delaunay triangle.
  '''
  e1, e2 = _delaunay_edges(X)
  w = paired_distances(X[e1], X[e2])
  mask = np.ones_like(w, dtype=bool)
  bad_inds = w.reshape((-1, 3)).argmax(axis=1) + np.arange(0, len(e1), 3)
  mask[bad_inds] = False

  weights = w[mask] if weighted else None
  pairs = np.column_stack((e1[mask], e2[mask]))
  return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0], symmetric=True,
                               weights=weights)
Example #34
def _distance_between_embeddings(embeddings1: np.ndarray,
                                 embeddings2: np.ndarray) -> np.ndarray:
    # if distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
    #     return np.square(
    #         paired_distances(
    #             embeddings1,
    #             embeddings2,
    #             metric='euclidean'))
    # elif distance_metric == DistanceMetric.ANGULAR_DISTANCE:
    # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
    similarity = 1 - paired_distances(
        embeddings1, embeddings2, metric='cosine')
    return np.arccos(similarity) / math.pi
Example #35
def K_initialize(vectors, k, zero_ini):
    n = vectors.shape[0]
    d = vectors.shape[1]
    labels_dist = np.zeros((n, 2))
    centers = np.zeros((k, d))
    # np.random.seed(0)
    # ini = np.random.randint(n)
    ini = zero_ini
    for i, item in enumerate(labels_dist):
        item[0] = 0
        item[1] = paired_distances(vectors[ini].reshape((1, -1)),
                                   vectors[i].reshape((1, -1)))[0]
    centers[0] = vectors[ini]
    return labels_dist, centers
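`K_initialize` only seeds the first center and each point's distance to it. A hypothetical follow-up step, under the same `labels_dist`/`centers` layout, would repeatedly promote the farthest point to the next center; `add_next_center` below is an assumed helper name, not part of the original code.

```python
# Hypothetical next seeding step, assuming the labels_dist/centers layout
# produced by K_initialize: promote the farthest point to center c and
# refresh each point's distance to its nearest center so far.
import numpy as np
from sklearn.metrics.pairwise import paired_distances

def add_next_center(vectors, labels_dist, centers, c):
    nxt = int(np.argmax(labels_dist[:, 1]))       # farthest point so far
    centers[c] = vectors[nxt]
    d_new = paired_distances(vectors, np.tile(centers[c], (len(vectors), 1)))
    closer = d_new < labels_dist[:, 1]
    labels_dist[closer, 0] = c                    # reassign label
    labels_dist[closer, 1] = d_new[closer]        # keep nearest distance
    return labels_dist, centers
```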
Example #36
    def evaluate(self, sess, X_test, Y_test, n_test, dropout):

        total_pred = np.zeros(Y_test.shape)
        if self.n_neurons_aleat_unc > 1:
            total_aleat_unc_pred = np.zeros(Y_test.shape)
        else:
            total_aleat_unc_pred = np.zeros(len(Y_test))
        total_loss = []
        for batch_idx, batch in enumerate(range(0, n_test, self.batch_size)):
            start_idx = batch
            end_idx = batch + self.batch_size

            x = X_test[start_idx:end_idx]
            y = Y_test[start_idx:end_idx]

            exclude = 0
            if len(x) < self.batch_size:
                exclude = self.batch_size - len(x)
                x = np.pad(x, ((0, exclude), (0, 0), (0, 0)), 'constant')
                y = np.pad(y, ((0, exclude), (0, 0)), 'constant')

            p, al_un, l = sess.run(
                [self.out_layer, self.noise_out_layer, self.loss],
                feed_dict={
                    self.input_pl: x,
                    self.output_pl: y,
                    self.dropout_pl: dropout
                })

            if self.n_neurons_aleat_unc <= 1:
                al_un = al_un.flatten()

            if exclude > 0:
                total_pred[start_idx:end_idx] = p[:-exclude]
                #total_loss += l
                total_aleat_unc_pred[start_idx:end_idx] = al_un[:-exclude]
            else:
                total_pred[start_idx:end_idx] = p
                total_loss.append(l)
                total_aleat_unc_pred[start_idx:end_idx] = al_un

        mse = np.mean(np.square(total_pred - Y_test))
        print('Test MSE Loss {:5.8f} '.format(mse))

        if not self.use_aleat_unc:
            total_aleat_unc_pred = None

        distances = paired_distances(Y_test, total_pred)
        return total_pred, total_aleat_unc_pred, distances
Example #37
def urquhart_graph(X, weighted=False):
    '''Urquhart graph: made from the 2 shortest edges of each Delaunay triangle.'''
    e1, e2 = _delaunay_edges(X)
    w = paired_distances(X[e1], X[e2])
    mask = np.ones_like(w, dtype=bool)
    bad_inds = w.reshape((-1, 3)).argmax(axis=1) + np.arange(0, len(e1), 3)
    mask[bad_inds] = False

    weights = w[mask] if weighted else None
    pairs = np.column_stack((e1[mask], e2[mask]))
    return Graph.from_edge_pairs(pairs,
                                 num_vertices=X.shape[0],
                                 symmetric=True,
                                 weights=weights)
Example #38
 def reweight_by_distance(self, coords, metric='l2', copy=False):
   '''Replaces existing edge weights by distances between connected vertices.
   The new weight of edge (i,j) is given by: metric(coords[i], coords[j]).
   coords : (num_vertices x d) array of coordinates, in vertex order
   metric : str or callable, see sklearn.metrics.pairwise.paired_distances'''
   if not self.is_weighted():
     warnings.warn('Cannot supply weights for unweighted graph; '
                   'ignoring call to reweight_by_distance')
     return self
   # TODO: take advantage of symmetry of metric function
   ii, jj = self.pairs().T
   if metric == 'precomputed':
     assert coords.ndim == 2 and coords.shape[0] == coords.shape[1]
     d = coords[ii,jj]
   else:
     d = paired_distances(coords[ii], coords[jj], metric=metric)
   return self._update_edges(d, copy=copy)
Example #39
def extract_feature(net, transformer, ImagePath1, ImagePath2, layer_name, image_as_grey=False):
    """
    Extracts features for a pair of images and returns their cosine similarity.

    Input
    net: the loaded network.
    transformer: preprocessing transformer matching the network's data blob.
    ImagePath1, ImagePath2: paths of the two images to compare.
    layer_name: the name of the layer whose output is extracted.
    """
    net.blobs['data'].reshape(2, 3, 128, 128)
    img = cv2.imread(ImagePath1)
    img1 = cv2.imread(ImagePath2)
    shape0 = img.shape
    shape1 = img1.shape
    # cv2.resize returns a new image; the result must be assigned
    if shape0[0] != 128 or shape0[1] != 128:
        img = cv2.resize(img, (128, 128))
    if shape1[0] != 128 or shape1[1] != 128:
        img1 = cv2.resize(img1, (128, 128))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

    gray = gray / 256.0
    gray1 = gray1 / 256.0
    net.blobs['data'].data[0] = transformer.preprocess('data', gray)
    net.blobs['data'].data[1] = transformer.preprocess('data', gray1)

    out = net.forward()
    a = net.blobs[layer_name].data[0].copy()
    b = net.blobs[layer_name].data[1].copy()

    # paired_distances expects 2D arrays: one row per sample
    dst = pw.paired_distances(a.reshape(1, -1), b.reshape(1, -1), metric='cosine')
    return 1 - dst
Example #40
def test_paired_distances(metric, func):
    # Test the paired_distances helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric=metric)
    S2 = func(X, Y)
    assert_array_almost_equal(S, S2)
    S3 = func(csr_matrix(X), csr_matrix(Y))
    assert_array_almost_equal(S, S3)
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        # Check the pairwise_distances implementation
        # gives the same value
        distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
        distances = np.diag(distances)
        assert_array_almost_equal(distances, S)
Example #41
def _prune_edges(G, X, traj_lengths, pruning_thresh=0.1, verbose=False):
  '''Prune edges in graph G via cosine distance with trajectory edges.'''
  W = G.matrix(dense=True).copy()
  degree = G.degree(kind='out', weighted=False)
  i = 0
  num_bad = 0
  for n in traj_lengths:
    s, t = np.nonzero(W[i:i+n-1])
    graph_edges = X[t] - X[s+i]
    traj_edges = np.diff(X[i:i+n], axis=0)
    traj_edges = np.repeat(traj_edges, degree[i:i+n-1], axis=0)
    theta = paired_distances(graph_edges, traj_edges, metric='cosine')
    bad_edges = theta > pruning_thresh
    s, t = s[bad_edges], t[bad_edges]
    if verbose:  # pragma: no cover
      num_bad += np.count_nonzero(W[s,t])
    W[s,t] = 0
    i += n
  if verbose:  # pragma: no cover
    print('removed %d bad edges' % num_bad)
  return Graph.from_adj_matrix(W)
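The pruning test compares directions: a cosine distance near 0 means a graph edge points the same way as the local trajectory edge, while anything above `pruning_thresh` flags it as inconsistent. A toy check with assumed direction vectors:

```python
# Toy check of the direction test used above: an aligned pair gives
# cosine distance 0, a perpendicular pair gives 1.
import numpy as np
from sklearn.metrics.pairwise import paired_distances

graph_edges = np.array([[1.0, 0.0], [0.0, 1.0]])
traj_edges = np.array([[2.0, 0.0], [2.0, 0.0]])
theta = paired_distances(graph_edges, traj_edges, metric='cosine')
print(theta)  # [0. 1.] -> the second edge would be pruned at thresh 0.1
```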
Example #42
econ = econ.suburb_econ()
geo = geography.suburb_geo()

suburbs = []
for item in geo:
    suburbs.append(item)

cos_values = []
ecld_dist = []
level_dist = []
gnlz_dist = []

for (suburb_one, suburb_two) in combinations(suburbs, 2):
    # economy test
    cos_values.append(1 - paired_distances(econ[suburb_one], econ[suburb_two], metric="cosine")[0])

    # population test
    # cos_values.append(1 - paired_distances(pop[suburb_one], pop[suburb_two], metric="cosine")[0])

    # euclidean distance
    # hypot(x2 - x1, y2 - y1)
    (x_one, y_one) = geo[suburb_one][0]
    (x_two, y_two) = geo[suburb_two][0]
    dist = hypot(x_one - x_two, y_one - y_two)
    ecld_dist.append(dist)

    '''
    # generalized distance based on euclidean distance
    # polar distance to GPO
    polar_dist_one = geo[suburb_one][1][0]
Example #43
def linkage_tree(X, connectivity=None, n_components=None,
                 n_clusters=None, linkage='complete', affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from
        the connectivity matrix and will be removed in 0.18

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criteria to use. The linkage criterion determines which
        distance to use between sets of observation.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean".
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see metric.pairwise).

    return_distance : bool, default False
        whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified; otherwise 'None' is returned.

    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError(
            'Unknown linkage option, linkage should be one '
            'of %s, but %s was given' % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy     # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == 'precomputed':
        distances = X[connectivity.row, connectivity.col]
    else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
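A hedged usage sketch of the structured branch of `linkage_tree`; it assumes the sklearn-internal imports used by the function above are in scope, and uses `kneighbors_graph` as a stand-in connectivity structure over toy data.

```python
# Hypothetical call into linkage_tree's structured branch; toy data, with
# a k-NN connectivity graph standing in for real structure.
import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(20, 3)
connectivity = kneighbors_graph(X, n_neighbors=4, include_self=False)
children, n_components, n_leaves, parents, distances = linkage_tree(
    X, connectivity=connectivity, linkage='average',
    affinity='euclidean', return_distance=True)
print(children.shape)  # (n_nodes - n_samples, 2)
```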