Example #1
    def fit_transform(self, X, init=None, y=None):
        """
        Fit the data from X, and return the embedded coordinates.

        Parameters
        ----------
        X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
                if dissimilarity='precomputed'
            Input data.

        init : {None or ndarray, shape (n_samples, n_components)}, optional
            If None, randomly chooses the initial configuration;
            if ndarray, initialize the SMACOF algorithm with this array.

        """
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn("The MDS API has changed. ``fit`` now constructs a"
                          " dissimilarity matrix from data. To use a custom "
                          "dissimilarity matrix, set "
                          "``dissimilarity='precomputed'``.")

        if self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        else:
            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                             " Got %s instead" % str(self.dissimilarity))

        self.embedding_, self.stress_ = smacof(
            self.dissimilarity_matrix_, metric=self.metric,
            n_components=self.n_components, init=init, n_init=self.n_init,
            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
            eps=self.eps, random_state=self.random_state)

        return self.embedding_
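
A minimal usage sketch for this method, assuming it is the fit_transform of scikit-learn's MDS estimator (the surrounding class and its constructor are not shown above, so those names are assumptions):

import numpy as np
from sklearn.manifold import MDS  # assumed host class of the method above
from sklearn.metrics import euclidean_distances

X = np.random.RandomState(0).rand(10, 4)  # 10 samples, 4 features

# Euclidean mode: the estimator builds the dissimilarity matrix itself.
mds = MDS(n_components=2, dissimilarity='euclidean', random_state=0)
embedding = mds.fit_transform(X)  # shape (10, 2)

# Precomputed mode: pass an (n_samples, n_samples) dissimilarity matrix.
D = euclidean_distances(X)
mds_pre = MDS(n_components=2, dissimilarity='precomputed', random_state=0)
embedding_pre = mds_pre.fit_transform(D)  # also shape (10, 2)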
Example #2
    def predict(self, features, ignore_first=False):
        """Predict from features, a numpy array of size (n_samples, n_features) Use the
        training data to predict labels on the test features. For each testing sample, compare it
        to the training samples. Look at the self.n_neighbors closest samples to the
        test sample by comparing their feature vectors. The label for the test sample
        is the determined by aggregating the K nearest neighbors in the training data.

        Note that when using KNN for imputation, the predicted labels are the imputed testing data
        and the shape is (n_samples, n_features).

        Arguments:
            features {np.ndarray} -- Features of each data point, shape of (n_samples,
                n_features).
            ignore_first {bool} -- If True, ignore the closest point when
                aggregating. This is used for collaborative filtering, where
                the closest point is the sample itself and thus not a true
                neighbor; in that case we use the slice 1:(n_neighbors + 1).

        Returns:
            labels {np.ndarray} -- Labels for each data point, of shape
                (n_samples, n_dimensions), where n_dimensions matches the
                n_dimensions of the targets passed to fit.
        """
        labels = np.zeros([len(features), self.n_dimensions])

        # Loop over each test sample and find its nearest neighbors.
        for i in range(len(features)):
            # Distances between all training samples and test sample i; the
            # slice keeps features[i] two-dimensional, shape (1, n_features).
            if self.distance_measure == 'euclidean':
                distances = euclidean_distances(self.features,
                                                features[i:i + 1])
            elif self.distance_measure == 'manhattan':
                distances = manhattan_distances(self.features,
                                                features[i:i + 1])
            else:
                distances = cosine_distances(self.features,
                                             features[i:i + 1])
            """
            print('Rows of self.features = %d' % len(self.features))
            print('Columns of self.features = %d' % len(self.features[0]))
            print('Columns of features[%d] = %d' % (i, len(features[i])))
            """

            # Build one matrix whose first column holds the distances and
            # whose remaining columns hold the corresponding targets, then
            # sort its rows by distance in ascending order.
            new_list = np.append(distances, self.targets, axis=1)
            new_list = new_list[np.argsort(new_list[:, 0])]

            if ignore_first:
                # Skip the closest point (the sample itself) and take the
                # next n_neighbors points.
                close_neighbors = new_list[1:self.n_neighbors + 1]
            else:
                close_neighbors = new_list[:self.n_neighbors]

            char = np.empty(self.n_dimensions)
            """
            if self.aggregator == 'mean':
                char = statistics.mean(close_neighbors[:, 1:]) # finds mean of first column of closest neighbors
            elif self.aggregator == 'mode':
                char = statistics.mode(close_neighbors[:, 1:]) # finds mode of first column of closest neighbors
            else:
                char = statistics.median(close_neighbors[:, 1:]) # finds median of first column of closest neighbors
            """

            # Aggregate each target dimension over the K nearest neighbors
            # (column 0 holds the distances, so targets start at column 1).
            for j in range(self.n_dimensions):
                if self.aggregator == 'mean':
                    char[j] = statistics.mean(close_neighbors[:, j + 1])
                elif self.aggregator == 'mode':
                    char[j] = statistics.mode(close_neighbors[:, j + 1])
                else:
                    char[j] = statistics.median(close_neighbors[:, j + 1])

            labels[i, :] = char

        return labels
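
To see the core of predict in isolation, here is a self-contained sketch of one iteration of the loop above (names such as train_X, train_y, and query are illustrative, not taken from the snippet):

import statistics

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
train_X = rng.rand(50, 4)  # training features
train_y = rng.rand(50, 2)  # training targets, i.e. n_dimensions = 2
query = rng.rand(1, 4)     # one test sample, kept two-dimensional

n_neighbors = 3
distances = euclidean_distances(train_X, query)  # shape (50, 1)
stacked = np.append(distances, train_y, axis=1)  # distance | targets
stacked = stacked[np.argsort(stacked[:, 0])]     # ascending by distance
closest = stacked[:n_neighbors]                  # the K nearest rows

# Aggregate each target column over the K nearest neighbors.
label = [statistics.mean(closest[:, j + 1]) for j in range(train_y.shape[1])]
print(label)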
Example #3
def _smacof_single(similarities, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF algorithm.

    Parameters
    ----------
    similarities: symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric: boolean, optional, default: True
        whether to run the metric or the nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimensions in which to immerse the similarities;
        overwritten if an initial array is provided via ``init``.

    init: {None or ndarray}, optional
        if None, randomly choose the initial configuration;
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-3
        relative tolerance w.r.t. stress at which to declare convergence

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X: ndarray, shape (n_samples, n_components)
        coordinates of the n_samples points in an n_components-dimensional
        space

    stress_: float
        The final value of the stress (sum of squared differences between
        the disparities and the distances over all constrained points)

    """
    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    if similarities.shape[0] != similarities.shape[1]:
        raise ValueError("similarities must be a square array (shape=%d)" %
                         n_samples)
    if not np.allclose(similarities, similarities.T):
        raise ValueError("similarities must be symmetric")

    # Flatten the strict upper triangle of the similarity matrix; zero
    # entries are treated as missing values throughout.
    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # an explicit init overrides the requested n_components
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Update X using the Guttman transform
        dis[dis == 0] = 1e-5  # avoid division by zero in the ratio below
        ratio = disparities / dis
        B = - ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1. / n_samples * np.dot(B, X)

        # Normalization factor for the stress: sum of the point norms
        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (it,
                                                                       stress))
                break
        old_stress = stress / dis

    return X, stress
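
A direct-call sketch, assuming the module-level imports this helper needs (numpy as np, plus scikit-learn's check_random_state, IsotonicRegression, and euclidean_distances) are already in scope as the body requires:

import numpy as np
from sklearn.metrics import euclidean_distances

rng = np.random.RandomState(42)
points = rng.rand(20, 3)
D = euclidean_distances(points)  # symmetric (20, 20) dissimilarity matrix

# Metric SMACOF: the input dissimilarities are used directly as disparities.
X, stress = _smacof_single(D, metric=True, n_components=2,
                           max_iter=300, eps=1e-3, random_state=42)
print(X.shape, stress)  # (20, 2) and the final raw stress value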