def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        question_vec = question_to_vec(question, self.word_embeddings,
                                       self.embeddings_dim)

        n = len(thread_ids) // 2
        best_thread1, dist1 = pairwise_distances_argmin_min(
            question_vec.reshape((1, self.embeddings_dim)),
            thread_embeddings[:n, :],
            metric='cosine')
        best_thread2, dist2 = pairwise_distances_argmin_min(
            question_vec.reshape((1, self.embeddings_dim)),
            thread_embeddings[n:, :],
            metric='cosine')

        if dist1[0] <= dist2[0]:
            best_thread = best_thread1[0]
        else:
            best_thread = best_thread2[0] + n

        return thread_ids[best_thread]
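A quick self-check of the split-search trick above (toy data; the shapes and seed are arbitrary stand-ins): keeping the smaller of the two per-half minima recovers the same index as a single argmin over the full candidate matrix, while only half of it is in flight at once.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
question_vec = rng.randn(1, 25)            # one embedded question
thread_embeddings = rng.randn(1000, 25)    # candidate thread embeddings

n = len(thread_embeddings) // 2
i1, d1 = pairwise_distances_argmin_min(question_vec,
                                       thread_embeddings[:n], metric='cosine')
i2, d2 = pairwise_distances_argmin_min(question_vec,
                                       thread_embeddings[n:], metric='cosine')
best = i1[0] if d1[0] <= d2[0] else i2[0] + n

i_full, _ = pairwise_distances_argmin_min(question_vec, thread_embeddings,
                                          metric='cosine')
assert best == i_full[0]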
Example 2
def _labels_inertia_precompute_dense(norm, X, sample_weight, centers,
                                     distances):
    """
    Computes labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    norm : 'l1' or 'l2'

    X : numpy array, shape (n_samples, n_features)
        Input data.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=numpy.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    if norm == 'l2':
        labels, mindist = pairwise_distances_argmin_min(
            X=X,
            Y=centers,
            metric='euclidean',
            metric_kwargs={'squared': True})
    elif norm == 'l1':
        labels, mindist = pairwise_distances_argmin_min(X=X,
                                                        Y=centers,
                                                        metric='manhattan')
    else:  # pragma: no cover
        raise NotImplementedError(
            "Not implemented for norm '{}'.".format(norm))
    # cython k-means code assumes int32 inputs
    labels = labels.astype(numpy.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
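Hypothetical usage sketch of the helper above, assuming it is in scope together with its imports (numpy and pairwise_distances_argmin_min): three 2-D points, two centers, and the L1 branch.

import numpy

X = numpy.array([[0., 0.], [1., 1.], [9., 9.]])
centers = numpy.array([[0., 0.], [10., 10.]])
sample_weight = numpy.ones(len(X))
distances = numpy.zeros(len(X))

labels, inertia = _labels_inertia_precompute_dense(
    'l1', X, sample_weight, centers, distances)
print(labels)   # [0 0 1]: nearest center per sample
print(inertia)  # 4.0: manhattan distances 0 + 2 + 2, unit weights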
Example 3
def test_pairwise_distances_argmin_min():
    """ Check pairwise minimum distances computation for any metric"""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example 4
    def predict(self, tracks):
        """
        Predict the closest cluster each trajectory in tracks belongs to
        for each descriptor type.

        In the vector quantization literature, centroids is called the
        code book and each value returned by predict is the index of
        the closest code in the code book.

        Parameters
        ----------
        tracks : structured ndarray
            Trajectories to predict.

        Returns
        -------
        labels : dict of array
            Index of the cluster each trajectory belongs to for each descriptor type.
        """
        labels = dict()

        for desc in self.descriptors:
            if len(self.codebooks[desc]) > 0:
                cb_indices, dist = pairwise_distances_argmin_min(
                    tracks[desc],
                    self.codebooks[desc]['cluster_centers'],
                    metric='euclidean')
            else:
                cb_indices = np.full(len(tracks), fill_value=-1, dtype=int)

            labels[desc] = cb_indices

        return labels
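Toy sketch of the codebook lookup inside predict (names and shapes made up): a single descriptor type with a four-word code book; each trajectory gets the index of its nearest code.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

codebook = np.array([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
tracks = np.array([[0.1, 0.0], [0.9, 0.9], [0.0, 0.8]])

cb_indices, dist = pairwise_distances_argmin_min(tracks, codebook,
                                                 metric='euclidean')
print(cb_indices)  # [0 3 2]: index of the closest code per trajectory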
Example 5
def kmeans_plot(X, y, cluster_centers, ax=None):
    import matplotlib.patheffects as path_effects
    from sklearn.metrics.pairwise import pairwise_distances_argmin_min

    if ax is None:
        ax = plt.gca()

    colors = cm.nipy_spectral(y.astype(float) / len(cluster_centers))
    ax.scatter(*list(zip(*X)), lw=0, c=colors, s=30)

    offset = max(list(zip(*cluster_centers))[0]) * 0.2

    for i, cluster in enumerate(cluster_centers):
        index, _ = pairwise_distances_argmin_min(cluster.reshape(1, -1), Y=X)
        cluster_color = colorConverter.to_rgb(colors[index[0]])

        if is_luminous(cluster_color) is False:
            cluster_color = darken_rgb(cluster_color, 0.35)

        label = ax.text(x=cluster[0] + offset,
                        y=cluster[1],
                        s='{:d}'.format(i + 1),
                        color=cluster_color)
        label.set_path_effects([path_effects.Stroke(lw=2, foreground='white'),
                                path_effects.Normal()])

    limit = max(*ax.get_xlim(), *ax.get_ylim())

    ax.set_xlim(0, limit)
    ax.set_ylim(0, limit)

    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
    return ax
Example 6
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.
        #### YOUR CODE HERE ####
        question_vec = question_to_vec(question, self.word_embeddings, dim=self.embeddings_dim)
        #### YOUR CODE HERE ####
        # best_thread = pairwise_distances_argmin(question_vec.reshape(1, -1), thread_embeddings, metric='cosine')[0]

        # Due to memory errors we load the threads in chunks
        scores_list = []
        k = 10
        n = len(thread_ids) // k
        for i in range(k):
            if i == k - 1:
                break # Due to memory error (AWS 1GB Free Tier) we do not include the last chunk of threads
                # best_thread, dist = pairwise_distances_argmin_min(question_vec.reshape((1, self.embeddings_dim)),
                #                                                   thread_embeddings[i * n:, :], metric='cosine')
            else:
                best_thread, dist = pairwise_distances_argmin_min(question_vec.reshape((1, self.embeddings_dim)),
                                                                  thread_embeddings[i * n: (i + 1) * n, :],
                                                                  metric='cosine')

            scores_list.append({'thread': i * n + best_thread[0], 'dist': dist[0]})

        df = pd.DataFrame(scores_list).sort_values(by='dist')
        best_thread = int(df.iloc[0]['thread'])
        return thread_ids[best_thread]
Example 7
def project_cells_to_epg(adata):
    input_data = adata.obsm['X_dr']
    epg = adata.uns['epg']
    dict_nodes_pos = nx.get_node_attributes(epg, 'pos')
    nodes_pos = np.empty((0, input_data.shape[1]))
    nodes = np.empty((0, 1), dtype=int)
    for key in dict_nodes_pos.keys():
        nodes_pos = np.vstack((nodes_pos, dict_nodes_pos[key]))
        nodes = np.append(nodes, key)
    indices = pairwise_distances_argmin_min(input_data,
                                            nodes_pos,
                                            axis=1,
                                            metric='euclidean')[0]
    x_node = nodes[indices]
    adata.obs['node'] = x_node
    #update the projection info for each cell
    flat_tree = adata.uns['flat_tree']
    dict_branches_nodes = nx.get_edge_attributes(flat_tree, 'nodes')
    dict_branches_id = nx.get_edge_attributes(flat_tree, 'id')
    dict_node_state = nx.get_node_attributes(flat_tree, 'label')
    list_x_br_id = list()
    list_x_br_id_alias = list()
    list_x_lam = list()
    list_x_dist = list()
    for ix, xp in enumerate(input_data):
        list_br_id = [
            flat_tree.edges[br_key]['id']
            for br_key, br_value in dict_branches_nodes.items()
            if x_node[ix] in br_value
        ]
        dict_br_matrix = dict()
        for br_id in list_br_id:
            dict_br_matrix[br_id] = np.array(
                [dict_nodes_pos[i] for i in flat_tree.edges[br_id]['nodes']])
        dict_results = dict()
        list_dist_xp = list()
        for br_id in list_br_id:
            dict_results[br_id] = project_point_to_line_segment_matrix(
                dict_br_matrix[br_id], xp)
            list_dist_xp.append(dict_results[br_id][2])
        x_br_id = list_br_id[np.argmin(list_dist_xp)]
        x_br_id_alias = dict_node_state[x_br_id[0]], dict_node_state[
            x_br_id[1]]
        br_len = flat_tree.edges[x_br_id]['len']
        results = dict_results[x_br_id]
        x_dist = results[2]
        x_lam = results[3]
        if (x_lam > br_len):
            x_lam = br_len
        list_x_br_id.append(x_br_id)
        list_x_br_id_alias.append(x_br_id_alias)
        list_x_lam.append(x_lam)
        list_x_dist.append(x_dist)
    adata.obs['branch_id'] = list_x_br_id
    adata.obs['branch_id_alias'] = list_x_br_id_alias
    #     adata.uns['branch_id'] = list(set(adata.obs['branch_id'].tolist()))
    adata.obs['branch_lam'] = list_x_lam
    adata.obs['branch_dist'] = list_x_dist
    return None
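A reduced sketch of the first step above (toy graph; the real function reads node positions from adata.uns['epg']): each cell is mapped to its nearest principal-graph node via pairwise_distances_argmin_min.

import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

epg = nx.Graph()
epg.add_node(0, pos=np.array([0.0, 0.0]))
epg.add_node(1, pos=np.array([1.0, 0.0]))
epg.add_node(2, pos=np.array([0.0, 1.0]))

dict_nodes_pos = nx.get_node_attributes(epg, 'pos')
nodes = np.array(list(dict_nodes_pos.keys()))
nodes_pos = np.vstack(list(dict_nodes_pos.values()))

cells = np.array([[0.1, 0.1], [0.9, 0.2], [0.2, 0.8]])
indices = pairwise_distances_argmin_min(cells, nodes_pos,
                                        metric='euclidean')[0]
print(nodes[indices])  # nearest node id per cell: [0 1 2]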
Example 8
def select_instance(
        X_training: modALinput,
        X_pool: modALinput,
        X_uncertainty: np.ndarray,
        mask: np.ndarray,
        metric: Union[str, Callable],
        n_jobs: Union[int, None]
) -> Tuple[np.ndarray, modALinput, np.ndarray]:
    """
    Core iteration strategy for selecting another record from our unlabeled records.

    Given a set of labeled records (X_training) and unlabeled records (X_pool) with uncertainty scores (X_uncertainty),
    we'd like to identify the instance in X_pool that best balances uncertainty and dissimilarity.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    TODO:
        - Add notebook for Active Learning bake-off (passive vs interactive vs batch vs ranked batch)

    Args:
        X_training: Mix of both labeled and unlabeled records.
        X_pool: Unlabeled records to be selected for labeling.
        X_uncertainty: Uncertainty scores for unlabeled records to be selected for labeling.
        mask: Mask to exclude previously selected instances from the pool.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        Index of the best instance from X_pool chosen to be labelled; the record itself reshaped to (1, -1); and the
        updated mask excluding it from future selection.
    """
    # Extract the number of labeled and unlabeled records.
    n_labeled_records, _ = X_training.shape
    n_unlabeled, _ = X_pool[mask].shape

    # Determine our alpha parameter as |U| / (|U| + |D|). Note that because we
    # append to X_training and remove from X_pool within `ranked_batch`,
    # :alpha: is not fixed throughout our model's lifetime.
    alpha = n_unlabeled / (n_unlabeled + n_labeled_records)

    # Compute pairwise distance (and then similarity) scores from every unlabeled record
    # to every record in X_training. The result is an array of shape (n_samples, ).
    if n_jobs == 1 or n_jobs is None:
        _, distance_scores = pairwise_distances_argmin_min(X_pool[mask], X_training, metric=metric)
    else:
        distance_scores = pairwise_distances(X_pool[mask], X_training, metric=metric, n_jobs=n_jobs).min(axis=1)

    similarity_scores = 1 / (1 + distance_scores)

    # Compute our final scores, which are a balance between how dissimilar a given record
    # is with the records in X_uncertainty and how uncertain we are about its class.
    scores = alpha * (1 - similarity_scores) + (1 - alpha) * X_uncertainty[mask]

    # Isolate and return our best instance for labeling as the one with the
    # largest score. `scores` indexes into X_pool[mask], so map the winner
    # back to the full pool before mutating the mask.
    best_instance_index_in_unlabeled = np.argmax(scores)
    best_instance_index = np.where(mask)[0][best_instance_index_in_unlabeled]
    mask[best_instance_index] = 0
    return best_instance_index, X_pool[best_instance_index].reshape(1, -1), mask
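A toy walk-through of the scoring rule above (all arrays invented): similarity to the nearest labeled record is traded off against uncertainty with alpha = |U| / (|U| + |D|), and the distant, uncertain record wins.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

X_training = np.array([[0.0, 0.0], [1.0, 1.0]])          # labeled
X_pool = np.array([[0.1, 0.1], [5.0, 5.0], [0.9, 1.0]])  # unlabeled
X_uncertainty = np.array([0.2, 0.9, 0.4])

alpha = len(X_pool) / (len(X_pool) + len(X_training))
_, distance_scores = pairwise_distances_argmin_min(X_pool, X_training,
                                                   metric='euclidean')
similarity_scores = 1 / (1 + distance_scores)
scores = alpha * (1 - similarity_scores) + (1 - alpha) * X_uncertainty
print(np.argmax(scores))  # 1: far from everything labeled, and uncertain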
Example 9
    def weights(self, split):
        if self.distanceMatrix is None:
            validActive = self.features[(split == 0)
                                        & (self.labels == 1)].astype(bool)
            validDecoy = self.features[(split == 0)
                                       & (self.labels == 0)].astype(bool)
            trainActive = self.features[(split == 1)
                                        & (self.labels == 1)].astype(bool)
            trainDecoy = self.features[(split == 1)
                                       & (self.labels == 0)].astype(bool)
            # pairwise_distances_argmin_min returns (indices, distances);
            # keep the distances with [1]
            actActDistances = pairwise_distances_argmin_min(validActive,
                                                            trainActive,
                                                            metric='jaccard')[1]
            actDecDistances = pairwise_distances_argmin_min(validActive,
                                                            trainDecoy,
                                                            metric='jaccard')[1]
            decActDistances = pairwise_distances_argmin_min(validDecoy,
                                                            trainActive,
                                                            metric='jaccard')[1]
            decDecDistances = pairwise_distances_argmin_min(validDecoy,
                                                            trainDecoy,
                                                            metric='jaccard')[1]
            decWeights = decDecDistances / decActDistances
            actWeights = actActDistances / actDecDistances
        else:
            actActDistances = self.distanceMatrix[(split == 0) & (
                self.labels == 1), :][:, (split == 1) & (self.labels == 1)]
            actDecDistances = self.distanceMatrix[(split == 0) & (
                self.labels == 1), :][:, (split == 1) & (self.labels == 0)]
            decActDistances = self.distanceMatrix[(split == 0) & (
                self.labels == 0), :][:, (split == 1) & (self.labels == 1)]
            decDecDistances = self.distanceMatrix[(split == 0) & (
                self.labels == 0), :][:, (split == 1) & (self.labels == 0)]
            decWeights = np.amin(decDecDistances, axis=1) / np.amin(
                decActDistances, axis=1)
            actWeights = np.amin(actActDistances, axis=1) / np.amin(
                actDecDistances, axis=1)
        holdWeights = np.zeros(self.size)
        validActiveIndices = np.where((split == 0) & (self.labels == 1))[0]
        for i in range(len(validActiveIndices)):
            holdWeights[validActiveIndices[i]] = actWeights[i]
        validDecoyIndices = np.where((split == 0) & (self.labels == 0))[0]
        for i in range(len(validDecoyIndices)):
            holdWeights[validDecoyIndices[i]] = decWeights[i]
        return holdWeights
Example 10
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example 11
def test_pairwise_distances_argmin_min():
    """ Check pairwise minimum distances computation for any metric"""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example 12
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example 13
def gabriel_graph(X, metric='euclidean', weighted=False):
  n = X.shape[0]
  a, b = np.triu_indices(n, k=1)
  midpoints = (X[a] + X[b]) / 2
  _, Dmid = pairwise_distances_argmin_min(midpoints, X, metric=metric)
  Dedge = paired_distances(X[a], X[b], metric=metric)
  mask = (Dedge - Dmid * 2) < 1e-10
  pairs = np.column_stack((a[mask], b[mask]))
  w = Dedge[mask] if weighted else None
  return Graph.from_edge_pairs(pairs, num_vertices=n, symmetric=True, weights=w)
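The geometric test above, spelled out on four points (Graph.from_edge_pairs belongs to the surrounding library, so this sketch stops at the edge mask): an edge (a, b) survives iff no third point falls inside the circle with diameter ab, i.e. iff the nearest point to the midpoint is no closer than Dedge / 2.

import numpy as np
from sklearn.metrics.pairwise import (pairwise_distances_argmin_min,
                                      paired_distances)

X = np.array([[0., 0.], [2., 0.], [1., 0.1], [5., 5.]])
a, b = np.triu_indices(len(X), k=1)
midpoints = (X[a] + X[b]) / 2
_, Dmid = pairwise_distances_argmin_min(midpoints, X, metric='euclidean')
Dedge = paired_distances(X[a], X[b], metric='euclidean')
mask = (Dedge - Dmid * 2) < 1e-10
print(np.column_stack((a, b))[mask])  # kept: (0,2), (1,2), (1,3);
                                      # point 2 blocks the (0, 1) edge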
Example 14
    def computeScore(self, split):
        if not self.validSplit(split):
            return 2.0,
        else:
            if self.distanceMatrix is None:
                validActive = self.features[(split == 0) & (self.labels == 1)]
                validDecoy = self.features[(split == 0) & (self.labels == 0)]
                trainActive = self.features[(split == 1) & (self.labels == 1)]
                trainDecoy = self.features[(split == 1) & (self.labels == 0)]
                actActDistances = pairwise_distances_argmin_min(
                    validActive, trainActive, metric='jaccard')[1]
                actDecoyDistances = pairwise_distances_argmin_min(
                    validActive, trainDecoy, metric='jaccard')[1]
                decoyActDistances = pairwise_distances_argmin_min(
                    validDecoy, trainActive, metric='jaccard')[1]
                decoyDecoyDistances = pairwise_distances_argmin_min(
                    validDecoy, trainDecoy, metric='jaccard')[1]
            else:
                actActDistances = np.amin(self.distanceMatrix[
                    (split == 0) & (self.labels == 1), :][:, (split == 1) &
                                                          (self.labels == 1)],
                                          axis=1)
                actDecoyDistances = np.amin(self.distanceMatrix[
                    (split == 0) & (self.labels == 1), :][:, (split == 1) &
                                                          (self.labels == 0)],
                                            axis=1)
                decoyActDistances = np.amin(self.distanceMatrix[
                    (split == 0) & (self.labels == 0), :][:, (split == 1) &
                                                          (self.labels == 1)],
                                            axis=1)
                decoyDecoyDistances = np.amin(self.distanceMatrix[
                    (split == 0) & (self.labels == 0), :][:, (split == 1) &
                                                          (self.labels == 0)],
                                              axis=1)
            activeMeanDistance = np.mean(actDecoyDistances - actActDistances)
            decoyMeanDistance = np.mean(decoyActDistances -
                                        decoyDecoyDistances)
            if self.AVE:
                score = activeMeanDistance + decoyMeanDistance
            else:
                score = np.sqrt(activeMeanDistance**2 + decoyMeanDistance**2)
            return score,
Example 15
def _centroids(n_clusters: int,
               points: List[List[float]]) -> List[List[float]]:
    """ Return n_clusters centroids of points
    """

    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)

    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_,
                                               points)

    return list(map(list, np.array(points)[closest.tolist()]))
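Usage sketch for _centroids, assuming the snippet's own imports (KMeans, numpy, pairwise_distances_argmin_min) are in scope: the returned representatives are actual input points, one per cluster, nearest to each fitted center.

points = [[0., 0.], [1., 0.], [0., 1.], [10., 10.], [11., 10.], [10., 11.]]
print(_centroids(2, points))  # [[0.0, 0.0], [10.0, 10.0]] in some cluster order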
Example 16
def gabriel_graph(X, metric='euclidean', weighted=False):
    n = X.shape[0]
    a, b = np.triu_indices(n, k=1)
    midpoints = (X[a] + X[b]) / 2
    _, Dmid = pairwise_distances_argmin_min(midpoints, X, metric=metric)
    Dedge = paired_distances(X[a], X[b], metric=metric)
    mask = (Dedge - Dmid * 2) < 1e-10
    pairs = np.column_stack((a[mask], b[mask]))
    w = Dedge[mask] if weighted else None
    return Graph.from_edge_pairs(pairs,
                                 num_vertices=n,
                                 symmetric=True,
                                 weights=w)
Example 17
    def score(self, X):
        """
        Mean squared distance from each sample in X to its nearest cluster center.

        :param X: array-like or sparse matrix of shape = [n_samples, n_features]
        :return: mean squared distance from each sample to its nearest cluster center
        """
        cluster_centers_ = self.model[[1, 3]].values
        if self.encode_type == 2:
            cluster_centers_ = self.model[[1, 3, 5]].values
        labels, mindist = pairwise_distances_argmin_min(
            X=X,
            Y=cluster_centers_,
            metric='euclidean',
            metric_kwargs={'squared': True})
        return np.array(mindist).mean()
Example 18
    def _predict_l1(self, X, sample_weight=None, return_distances=False):
        """
        Assigns each point in *X* to its nearest fitted cluster
        under the manhattan distance.

        :param X: features
        :param sample_weight: (unused)
        :param return_distances: returns distances as well
        :return: labels or `labels, distances`
        """
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=self.cluster_centers_, metric='manhattan')
        labels = labels.astype(numpy.int32, copy=False)
        if return_distances:
            return labels, mindist
        return labels
Example 19
def _labels_inertia_precompute_dense(X, sample_weight, x_squared_norms,
                                     centers, distances):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Input data.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.

    """
    n_samples = X.shape[0]

    # Breakup nearest neighbor distance computation into batches to prevent
    # memory blowup in the case of a large number of samples and clusters.
    # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
    labels, mindist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
Example 20
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4,
                  tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)

    new_labels, new_inertia, new_centers = None, None, None

    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape, d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(
            X=X_values, Y=centers_c, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[:][labels == it] - centers[:][it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, _ = np.linalg.eigh(Sigma)
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[:][labels == j] - centers[:][j],
                                 squared=True).sum()

        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break

    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms, new_centers,
                            precompute_distances=False,
                            distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
Example 21
    def score(self, X, y=None):
        """Opposite of the value of X on the K-centers objective.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data.
        Returns
        -------
        score : float
            Opposite of the value of X on the K-centers objective.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return \
            -pairwise_distances_argmin_min(
                self.cluster_centers_, X, metric=self.metric,
                metric_kwargs=self.metric_kw)[1].sum()
Example 22
def _assign_labels(X, V, centers_subspace, P_subspace):
    """
    Assign each point in each subspace to its nearest cluster center.
    :param X: input data
    :param V: orthogonal rotation matrix
    :param centers_subspace: cluster centers of the subspace
    :param P_subspace: projections of the subspace
    :return: list with cluster assignments
    """
    cropped_X = np.matmul(X, V[:, P_subspace])
    cropped_centers = np.matmul(centers_subspace, V[:, P_subspace])
    # Find nearest center
    labels, _ = pairwise_distances_argmin_min(X=cropped_X,
                                              Y=cropped_centers,
                                              metric='euclidean',
                                              metric_kwargs={'squared': True})
    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32)
    return labels
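Deterministic toy run of _assign_labels above, assuming the function is in scope (identity rotation so the result is readable; real callers pass the learned V): with P_subspace selecting only the first rotated axis, assignment ignores the second coordinate entirely.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

X = np.array([[0.0, 5.0], [0.1, -3.0], [4.0, 0.0], [4.1, 9.0]])
centers = np.array([[0.0, 0.0], [4.0, 0.0]])
V = np.eye(2)               # no rotation, for a readable result
P_subspace = np.array([0])  # keep only the first axis

print(_assign_labels(X, V, centers, P_subspace))  # [0 0 1 1]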
Example 23
    def predict(self, data, std_threshold=2.0, min_cluster_size=3):
        """
        calculate abnormal state of data

        :param data: array-like or sparse matrix of shape = [n_samples, n_features]
        :param std_threshold: threshold of distance between data and cluster mean
        :param min_cluster_size: minimum cluster size; groups smaller than this are flagged abnormal as a whole

        :return:
            - predict_list : predict abnormal state

                - -1 : abnormal data

                - 1 : normal data

            - labels: group id of data
        """
        assert len(data) > 0, 'X is empty'
        assert isinstance(self.model, pd.DataFrame), 'not fit'
        if self.encode_type == 2:
            cluster_centers_ = self.model[[1, 3, 5]].values
        else:
            cluster_centers_ = self.model[[1, 3]].values
        labels, mindist = pairwise_distances_argmin_min(
            X=data,
            Y=cluster_centers_,
            metric='euclidean',
            metric_kwargs={'squared': True})
        cluster_centers_ = self.normal_model.iloc[labels]
        predict_list = []
        for index in range(len(data)):
            cluster_centers_data = cluster_centers_.iloc[index]
            if cluster_centers_data[0] + 1 <= min_cluster_size:
                predict_list.append(-1)
            elif math.fabs(data[index][0] - cluster_centers_data[1]) > cluster_centers_data[2] * std_threshold and \
                    math.fabs(data[index][1] - cluster_centers_data[3]) > cluster_centers_data[4] * std_threshold:
                predict_list.append(-1)
            else:
                predict_list.append(1)
        predict_list = np.array(predict_list)
        return predict_list, labels
Example 24
    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.
        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return \
            pairwise_distances_argmin_min(
                X, self.cluster_centers_, metric=self.metric,
                metric_kwargs=self.metric_kw)[0]
Example 25
    def hsv_method(img, n_clusters, n_colors):
        img = rgb2hsv(img)

        # make img array for kmeans
        X = img.reshape(-1, 3)

        n_clusters = n_clusters if n_clusters > n_colors else n_colors
        km = MiniBatchKMeans(n_clusters=n_clusters,
                             init='k-means++',
                             n_init=3,
                             random_state=0)
        labels = km.fit_predict(X)
        cluster_centers = km.cluster_centers_
        bincount = np.bincount(labels)

        # find index of data point closest to each center
        closest, _ = pairwise_distances_argmin_min(cluster_centers, X)

        # sort colors by frequency
        dominants = 1. * X[closest][np.argsort(bincount,
                                               axis=0)[::-1]][:n_colors]

        colors = [hsv2rgb([[x]])[0][0] for x in dominants]
        return colors
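A toy version of the routine above without the HSV conversion (random stand-in "image"; parameters arbitrary): cluster the pixels, then snap each center to the nearest real pixel so the reported dominant colors actually occur in the input.

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
img = rng.rand(16, 16, 3)    # stand-in RGB image in [0, 1]
X = img.reshape(-1, 3)

km = MiniBatchKMeans(n_clusters=4, n_init=3, random_state=0)
labels = km.fit_predict(X)
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, X)
dominants = X[closest][np.argsort(np.bincount(labels))[::-1]]
print(dominants[:2])         # the two most frequent (real) pixel colors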
Example 26
def _labels_inertia(X, sample_weight, x_squared_norms, centers, distances, same_cluster_size=False):
    """E step of the K-means EM algorithm.
    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.
    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    centers : float array, shape (k, n_features)
        The cluster centers.
    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.
    Returns
    -------
    labels : int array of shape(n)
        The resulting assignment
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    sample_weight = _check_sample_weight(X, sample_weight)
    n_samples = X.shape[0]
    n_clusters = centers.shape[0]

    # See http://jmonlong.github.io/Hippocamplus/2018/06/09/cluster-same-size/#same-size-k-means-variation
    if same_cluster_size:
        cluster_size = n_samples // n_clusters
        labels = np.zeros(n_samples, dtype=np.int32)
        mindist = np.zeros(n_samples, dtype=np.float32)
        # count how many samples have been labeled in a cluster
        counters = np.zeros(n_clusters, dtype=np.int32)
        # dist: (n_samples, n_clusters)
        dist = euclidean_distances(X, centers, squared=False)
        closeness = dist.min(axis=-1) - dist.max(axis=-1)
        ranking = np.argsort(closeness)
        for r in ranking:
            while True:
                label = dist[r].argmin()
                if counters[label] < cluster_size:
                    labels[r] = label
                    counters[label] += 1
                    # squared distances are used for inertia in this function
                    mindist[r] = dist[r, label] ** 2
                    break
                else:
                    dist[r, label] = np.inf
    else:
        # Breakup nearest neighbor distance computation into batches to prevent
        # memory blowup in the case of a large number of samples and clusters.
        # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
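A cut-down, standalone sketch of the same_cluster_size branch above (toy points): samples are ranked by min - max distance, then greedily assigned to their nearest center that still has capacity, so the third near-origin point overflows into the far cluster.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.array([[0.0, 0.0], [0.1, 0.0], [0.2, 0.0], [9.0, 9.0]])
centers = np.array([[0.0, 0.0], [9.0, 9.0]])
n_samples, n_clusters = len(X), len(centers)
cluster_size = n_samples // n_clusters          # 2 per cluster
labels = np.zeros(n_samples, dtype=np.int32)
counters = np.zeros(n_clusters, dtype=np.int32)
dist = euclidean_distances(X, centers)
for r in np.argsort(dist.min(axis=-1) - dist.max(axis=-1)):
    while True:
        label = dist[r].argmin()
        if counters[label] < cluster_size:
            labels[r] = label
            counters[label] += 1
            break
        dist[r, label] = np.inf
print(labels)  # [0 0 1 1]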
Example 27
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-2], [3]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    expected_idx = [0, 1]
    expected_vals = [2, 2]
    expected_vals_sq = [4, 4]

    # euclidean metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    idx2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)
    # We don't want np.matrix here
    assert_equal(type(idxsp), np.ndarray)
    assert_equal(type(valssp), np.ndarray)

    # euclidean metric squared
    idx, vals = pairwise_distances_argmin_min(X,
                                              Y,
                                              metric="euclidean",
                                              metric_kwargs={"squared": True})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals_sq)

    # Non-euclidean scikit-learn metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    idx2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)

    # Non-euclidean Scipy distance (callable)
    idx, vals = pairwise_distances_argmin_min(X,
                                              Y,
                                              metric=minkowski,
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Non-euclidean Scipy distance (string)
    idx, vals = pairwise_distances_argmin_min(X,
                                              Y,
                                              metric="minkowski",
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)

    # Test batch_size deprecation warning
    assert_warns_message(DeprecationWarning,
                         "version 0.22",
                         pairwise_distances_argmin_min,
                         X,
                         Y,
                         batch_size=500,
                         metric='euclidean')
Example 28
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-2], [3]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    expected_idx = [0, 1]
    expected_vals = [2, 2]
    expected_vals_sq = [4, 4]

    # euclidean metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    idx2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)
    # We don't want np.matrix here
    assert_equal(type(idxsp), np.ndarray)
    assert_equal(type(valssp), np.ndarray)

    # euclidean metric squared
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean",
                                              metric_kwargs={"squared": True})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals_sq)

    # Non-euclidean scikit-learn metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    idx2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)

    # Non-euclidean Scipy distance (callable)
    idx, vals = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Non-euclidean Scipy distance (string)
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example 29
def subspace_kmeans_single(X,
                           sample_weight,
                           n_clusters,
                           init='k-means++',
                           max_iter=300,
                           tol=1e-4,
                           tol_eig=-1e-10,
                           verbose=False,
                           x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===

        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)

        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        # === Beginning of original implementation of M-step of EM ===

        S = np.zeros((d, d))
        for c in range(n_clusters):
            X_c = X[:][labels == c] - centers[:][c]
            S += np.dot(X_c.T, X_c)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for c in range(n_clusters):
            inertia += row_norms(X[:][labels == c] - centers[:][c],
                                 squared=True).sum()

        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
Example 30
np.argmax(cs_1)  ### 0
from sklearn.metrics.pairwise import _argmin_min_reduce
cs5 = _argmin_min_reduce(x.values.reshape(1, -1), df1)
cs6 = _argmin_min_reduce(x.values.reshape(1, -1), df2)
cs5[0][0] = 0
print(cs5)
print(cs6)
print(np.argmax(cs5))
print(np.argmax(cs6))
r = [0, 0]
cs_2 = [cs5[0][0], cs6[0][0]]
print(cs_2)  ###################  [0,15]
np.argmax(cs_2)  ### 1
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
cs7 = pairwise_distances_argmin_min(x.values.reshape(1, -1),
                                    df1,
                                    metric="euclidean")
cs8 = pairwise_distances_argmin_min(x.values.reshape(1, -1),
                                    df2,
                                    metric="euclidean")
cs7[0][0] = 1
print(cs7)
print(cs8)
print(np.argmax(cs7))
print(np.argmax(cs8))
r = [0, 0]
cs_3 = [cs7[0][0], cs8[0][0]]
print(cs_3)  ###################  [1,85]
np.argmax(cs_3)  ### 1
cs9 = pairwise_distances_argmin_min(x.values.reshape(1, -1),
                                    df1,
Example 31
def _labels_inertia_precompute_dense(X,
                                     sample_weight,
                                     x_squared_norms,
                                     centers,
                                     distances,
                                     group=None):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Input data.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.

    """
    n_samples = X.shape[0]

    # Breakup nearest neighbor distance computation into batches to prevent
    # memory blowup in the case of a large number of samples and clusters.
    # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.

    if group is None:
        labels, mindist = pairwise_distances_argmin_min(
            X=X,
            Y=centers,
            metric='euclidean',
            metric_kwargs={'squared': True})
    else:
        dists = pairwise_distances(X=X, Y=centers, metric='l2')

        if isinstance(group, int):
            dists = mix_utils.const_grouped_mean(dists, group)
        else:
            import mix_utils.mixture.mix_utils._utils as _utils
            assert X.shape[0] == group.shape[0]

            # for g in range(group.min(), group.max()):
            #     mask = group == g
            #     dists[mask] = dists[mask].mean(axis=0)
            # dists = mix_utils.grouped_mean(dists, group)

            dists = _utils.grouped_mean(dists, group)

        labels = dists.argmin(axis=1)
        mindist = dists[np.arange(dists.shape[0]), labels]

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()

    # print(inertia)

    return labels, inertia
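Sketch of the group branch above without the mix_utils dependency (toy groups): distance rows are averaged within each group before the argmin, so every member of a group receives the same label.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [0.5, 0.0], [9.0, 9.0], [8.5, 9.0]])
centers = np.array([[0.0, 0.0], [9.0, 9.0]])
group = np.array([0, 0, 1, 1])  # two groups of two samples

dists = pairwise_distances(X, Y=centers, metric='l2')
for g in np.unique(group):
    m = group == g
    dists[m] = dists[m].mean(axis=0)
labels = dists.argmin(axis=1)
mindist = dists[np.arange(len(X)), labels]
print(labels)   # [0 0 1 1]: one shared label per group
print(mindist)  # [0.25 0.25 0.25 0.25]: group-averaged distances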
Example 32
def _labels_inertia(X, x_squared_norms, centers):
    labels, distances = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
    labels = labels.astype(np.int32, copy=False)
    inertia = distances.sum()
    return labels, inertia
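Quick usage sketch of the helper above, assuming it and its imports (np, pairwise_distances_argmin_min) are in scope. Note that metric_kwargs={'squared': True} makes the returned distances squared, so inertia is a sum of squares, and the x_squared_norms argument is accepted but unused.

import numpy as np

X = np.array([[0.0, 0.0], [3.0, 4.0]])
centers = np.array([[0.0, 0.0]])
labels, inertia = _labels_inertia(X, None, centers)
print(labels, inertia)  # [0 0] 25.0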
Example 33
def mcbc(data, n_clusters, m, max_iter=100, tol=1e-4):
    n_samples, n_dims = data.shape
    km = KMeans(n_clusters=n_clusters, max_iter=1)
    centers = km.fit(data).cluster_centers_

    for iteration in range(max_iter):
        # Assign points to clusters
        nearest_center, dists = pairwise_distances_argmin_min(data, centers)
        is_unfree = np.zeros([n_samples])

        mh = np.zeros([n_clusters])
        iter_cluster_assignments = np.zeros([n_samples], dtype=np.uint)

        unsatisified = 0
        for cluster in range(n_clusters):
            members = np.where(nearest_center == cluster)[0]

            if len(members) > m:
                tree = KDTree(
                    np.insert(data[[members]],
                              0,
                              centers[cluster].reshape(1, -1),
                              axis=0))
                # Potential bug that needs fixing tree.query may return distances
                # to other centers rather than datapoints
                dist, ind = tree.query(centers[cluster].reshape(1, -1),
                                       k=m + 1)
                is_unfree[members[ind[0][1:] - 1]] = 1
                mh[cluster] = 0
            else:
                is_unfree[[members]] = 1
                mh[cluster] = m - len(members)

            unsatisified = unsatisified + mh[cluster]
            iter_cluster_assignments[[members]] = cluster

        hneari = np.zeros([n_samples], dtype=np.uint)
        hsupporti = np.zeros([n_samples], dtype=np.uint)
        dists = euclidean_distances(data, centers)

        print("Iteration %d, unsatisfied: %d" % (iter, unsatisified))
        while unsatisified > 0:

            #for cluster in xrange(n_clusters):
            #    members = np.where(iter_cluster_assignments == cluster)[0]
            #    print("Cluster %d:" % cluster)
            #    print("Size: %d, unfree: %d, mh: %d" %
            #          (len(members),
            #           np.sum(is_unfree[[members]]),
            #           mh[cluster]))

            xi0 = -1
            min_xi0 = np.inf
            for sample in range(n_samples):
                if not is_unfree[sample]:
                    dneari = np.min(dists[sample])
                    hneari[sample] = np.argmin(dists[sample])

                    mh_candidates_ind = mh.nonzero()[0]
                    dsupporti = np.min(dists[sample][mh_candidates_ind])
                    mh_min_ind = np.argmin(dists[sample][mh_candidates_ind])
                    hsupporti[sample] = mh_candidates_ind[mh_min_ind]

                    diff = dsupporti**2 - dneari**2
                    if diff < min_xi0:
                        min_xi0 = diff
                        xi0 = sample

            is_unfree[xi0] = 1
            iter_cluster_assignments[xi0] = hsupporti[xi0]
            if mh[hsupporti[xi0]] > 0:
                mh[hsupporti[xi0]] -= 1
            unsatisfied -= 1

        # Update centers
        new_centers = []
        for cluster in range(n_clusters):
            members = np.where(iter_cluster_assignments == cluster)[0]
            new_centers.append(np.mean(data[members], axis=0))
        new_centers = np.array(new_centers)
        # Per-center Euclidean shift, summed over all centers.
        center_shift = np.sqrt(np.sum((new_centers - centers)**2, axis=1))
        center_shift_total = np.sum(center_shift)
        if center_shift_total**2 < tol:
            print("center shift %e within tolerance %e" %
                  (center_shift_total, tol))
            break
        centers = new_centers
    return iter_cluster_assignments
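A usage sketch for mcbc, assuming numpy as np, sklearn.cluster.KMeans, sklearn.neighbors.KDTree, and the sklearn pairwise helpers are imported as the snippet requires:

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(200, 2)

# Ask for 4 clusters with at least m=30 members each; the while-loop above
# keeps reassigning free points until every cluster deficit mh reaches zero.
assignments = mcbc(data, n_clusters=4, m=30)
print(np.unique(assignments, return_counts=True))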
def select_instance(X_training,
                    X_pool,
                    X_training_feat,
                    X_pool_feat,
                    n_annotated,
                    X_uncertainty: np.ndarray,
                    mask: np.ndarray,
                    metric: Union[str, Callable],
                    n_jobs: Union[int, None] = -1):
    """
    Core iteration strategy for selecting another record from our unlabeled records.

    Given a set of labeled records (X_training) and unlabeled records (X_pool) with uncertainty scores (X_uncertainty),
    we'd like to identify the instance in X_pool that best balances uncertainty and dissimilarity.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    TODO:
        - Add notebook for Active Learning bake-off (passive vs interactive vs batch vs ranked batch)

    Args:
        X_training: Mix of both labeled and unlabeled records.
        X_pool: Unlabeled records to be selected for labeling.
        X_training_feat: Feature vectors for the training data.
        X_pool_feat: Feature vectors for the unlabeled data.
        n_annotated: Number of records already annotated in addition to X_training.
        X_uncertainty: Uncertainty scores for unlabeled records to be selected for labeling.
        mask: Mask to exclude previously selected instances from the pool.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        Index of the best record in X_pool chosen for labeling, that record itself (with a leading batch axis), and
        the updated mask; the selected record is the most informative incremental addition to our query set.
    """
    # Extract the number of labeled and unlabeled records.
    n_labeled_records = X_training.shape[0] + n_annotated
    n_unlabeled = X_pool[mask].shape[0]

    # Determine our alpha parameter as |U| / (|U| + |D|). Note that because we
    # append to X_training and remove from X_pool within `ranked_batch`,
    # :alpha: is not fixed throughout our model's lifetime.
    alpha = n_unlabeled / (n_unlabeled + n_labeled_records)

    # Compute pairwise distance (and then similarity) scores from every unlabeled record
    # to every record in X_training. The result is an array of shape (n_samples, ).

    # TODO: replace this block with a better similarity computation that,
    # given unlabeled data X_u and labeled data X_l, returns the pairwise
    # distances between them.
    if X_pool_feat is None or X_training_feat is None:
        X_pool_features = X_pool[mask].reshape((len(X_pool[mask]), -1))
        X_training_features = X_training.reshape((len(X_training), -1))
    else:
        X_pool_features = X_pool_feat[mask]
        X_training_features = X_training_feat

    if n_jobs == 1 or n_jobs is None:
        _, distance_scores = pairwise_distances_argmin_min(X_pool_features,
                                                           X_training_features,
                                                           metric=metric)
    else:
        # Compute the full distance matrix and take the row-wise minimum;
        # note that scipy's cdist itself ignores n_jobs.
        distance_scores = cdist(X_pool_features,
                                X_training_features,
                                metric=metric).min(axis=1)

    similarity_scores = 1 / (1 + distance_scores)

    # Compute our final scores, which are a balance between how dissimilar a given record
    # is with the records in X_uncertainty and how uncertain we are about its class.
    scores = (alpha * (1 - similarity_scores)
              + (1 - alpha) * X_uncertainty[mask])

    # Isolate and return our best instance for labeling as the one with the largest score.
    best_instance_index_in_unlabeled = np.argmax(scores)
    n_pool, *rest = X_pool.shape
    unlabeled_indices = [i for i in range(n_pool) if mask[i]]
    best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled]
    mask[best_instance_index] = 0
    return best_instance_index, np.expand_dims(X_pool[best_instance_index],
                                               axis=0), mask
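A toy run of select_instance with hypothetical values, assuming numpy and the function's own dependencies (typing, scipy, sklearn) are imported; the far-away, highly uncertain pool point should win:

import numpy as np

X_training = np.array([[0.0, 0.0], [1.0, 1.0]])
X_pool = np.array([[0.1, 0.0], [5.0, 5.0], [1.0, 0.9]])
uncertainty = np.array([0.2, 0.9, 0.5])
mask = np.ones(len(X_pool), dtype=bool)

idx, record, mask = select_instance(
    X_training, X_pool, None, None, n_annotated=0,
    X_uncertainty=uncertainty, mask=mask, metric='euclidean', n_jobs=1)
print(idx, record)  # 1 [[5. 5.]] -- both dissimilar and uncertain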