def fit_transform(self, X, init=None, y=None):
    """
    Fit the data from X, and return the embedded coordinates.

    Parameters
    ----------
    X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
            if dissimilarity='precomputed'
        Input data.

    init : {None or ndarray, shape (n_samples, n_components)}, optional
        If None, randomly chooses the initial configuration.
        If ndarray, initializes the SMACOF algorithm with this array.

    Returns
    -------
    embedding_ : ndarray, shape (n_samples, n_components)
        Coordinates of the points in the embedding space.
    """
    if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
        warnings.warn("The MDS API has changed. ``fit`` now constructs a"
                      " dissimilarity matrix from data. To use a custom "
                      "dissimilarity matrix, set "
                      "``dissimilarity='precomputed'``.")

    if self.dissimilarity == "precomputed":
        self.dissimilarity_matrix_ = X
    elif self.dissimilarity == "euclidean":
        self.dissimilarity_matrix_ = euclidean_distances(X)
    else:
        raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                         " Got %s instead" % str(self.dissimilarity))

    self.embedding_, self.stress_ = smacof(
        self.dissimilarity_matrix_, metric=self.metric,
        n_components=self.n_components, init=init, n_init=self.n_init,
        n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
        eps=self.eps, random_state=self.random_state)

    return self.embedding_
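# Usage sketch for ``fit_transform`` above. A minimal illustration only,
# assuming the method lives on scikit-learn's ``MDS`` estimator (or a
# compatible class exposing the same constructor parameters); the data and
# parameter values below are arbitrary.
def _example_mds_fit_transform():
    import numpy as np
    from sklearn.manifold import MDS
    from sklearn.metrics.pairwise import euclidean_distances

    X = np.random.RandomState(0).rand(10, 4)

    # Euclidean mode: the dissimilarity matrix is built from X internally.
    embedding = MDS(n_components=2, dissimilarity="euclidean",
                    random_state=0).fit_transform(X)
    print(embedding.shape)  # (10, 2)

    # Precomputed mode: pass a symmetric (n_samples, n_samples) matrix.
    D = euclidean_distances(X)
    embedding_pre = MDS(n_components=2, dissimilarity="precomputed",
                        random_state=0).fit_transform(D)
    print(embedding_pre.shape)  # (10, 2)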
def predict(self, features, ignore_first=False):
    """Predict labels from features, a numpy array of shape
    (n_samples, n_features).

    Use the training data to predict labels for the test features. For
    each test sample, find the self.n_neighbors closest training samples
    by comparing feature vectors, then aggregate those neighbors' targets
    to produce the label. Note that when using KNN for imputation, the
    predicted labels are the imputed testing data, so the output shape is
    (n_samples, n_features).

    Arguments:
        features {np.ndarray} -- Features of each data point, shape of
            (n_samples, n_features).
        ignore_first {bool} -- If True, ignore the closest point during
            aggregation. This is used for collaborative filtering, where
            the closest point is the sample itself and thus not a true
            neighbor; in that case the slice 1:(n_neighbors + 1) is used.

    Returns:
        labels {np.ndarray} -- Labels for each data point, of shape
            (n_samples, n_dimensions). This n_dimensions should be the
            same as the n_dimensions of the targets passed to fit.
    """
    labels = np.zeros([len(features), self.n_dimensions])
    # Loop through each test sample to find its closest neighbors.
    for i in range(len(features)):
        # Distances between every training sample and the test sample.
        if self.distance_measure == 'euclidean':
            distances = euclidean_distances(self.features, features[i])
        elif self.distance_measure == 'manhattan':
            distances = manhattan_distances(self.features, features[i])
        else:
            distances = cosine_distances(self.features, features[i])

        # One matrix whose first column holds the distances and whose
        # remaining columns hold the corresponding targets.
        new_list = np.append(distances, self.targets, axis=1)
        # Sort rows by the first column, ascending (closest first).
        new_list = new_list[np.argsort(new_list[:, 0])]

        if ignore_first:
            # Skip the first row: it is the sample itself.
            close_neighbors = new_list[1:self.n_neighbors + 1]
        else:
            close_neighbors = new_list[:self.n_neighbors]

        char = np.empty(self.n_dimensions)
        # Aggregate each target dimension (column j + 1 of the sorted
        # matrix) over the selected neighbors.
        for j in range(self.n_dimensions):
            if self.aggregator == 'mean':
                char[j] = statistics.mean(close_neighbors[:, j + 1])
            elif self.aggregator == 'mode':
                char[j] = statistics.mode(close_neighbors[:, j + 1])
            else:
                char[j] = statistics.median(close_neighbors[:, j + 1])
        labels[i, :] = char

    return labels
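# Usage sketch for ``predict`` above; a minimal illustration only. The
# ``KNearestNeighbor`` class name and its ``fit(features, targets)`` method
# are assumptions about the surrounding code: ``predict`` reads
# ``self.features``, ``self.targets``, ``self.n_neighbors``,
# ``self.n_dimensions``, ``self.distance_measure`` and ``self.aggregator``,
# which ``fit`` is assumed to set.
def _example_knn_predict():
    import numpy as np
    train_features = np.array([[0., 0.], [1., 0.], [0., 1.], [5., 5.]])
    train_targets = np.array([[0.], [0.], [0.], [1.]])  # n_dimensions == 1
    test_features = np.array([[0.1, 0.1], [4.9, 5.1]])

    knn = KNearestNeighbor(n_neighbors=3, distance_measure='euclidean',
                           aggregator='mean')
    knn.fit(train_features, train_targets)
    labels = knn.predict(test_features)  # shape (2, 1)
    # Expected: the first test point aggregates the three points near the
    # origin (mean target 0.0); the second is dominated by [5, 5].
    print(labels)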
def _smacof_single(similarities, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF algorithm.

    Parameters
    ----------
    similarities : symmetric ndarray, shape (n_samples, n_samples)
        Dissimilarities between the points.

    metric : boolean, optional, default: True
        Whether to run the metric or nonmetric SMACOF algorithm.

    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities;
        overwritten if an initial array is provided.

    init : {None or ndarray}, optional
        If None, randomly chooses the initial configuration.
        If ndarray, initializes the SMACOF algorithm with this array.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a
        single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the configuration. If an integer
        is given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the n_samples points in an n_components-space.

    stress_ : float
        The final value of the stress (sum of squared distances between
        the disparities and the distances over all constrained points).
    """
    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    if similarities.shape[0] != similarities.shape[1]:
        raise ValueError("similarities must be a square array (shape=%d)" %
                         n_samples)
    if not np.allclose(similarities, similarities.T):
        raise ValueError("similarities must be symmetric")

    # Flatten the strict upper triangle; zeros are treated as missing.
    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose the initial configuration.
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # Overrides the n_components parameter.
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distances and, in the nonmetric case, a monotonic
        # regression of the distances on the dissimilarities.
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # Similarities of 0 are considered missing values.
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression.
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())

        # Compute stress.
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Update X using the Guttman transform.
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = - ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1. / n_samples * np.dot(B, X)

        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis

    return X, stress
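# Usage sketch for ``_smacof_single`` above: run a single SMACOF pass on a
# small symmetric dissimilarity matrix. A minimal illustration only; the
# input matrix is arbitrary, and the call assumes ``_smacof_single`` (with
# its dependencies ``check_random_state``, ``IsotonicRegression`` and
# ``euclidean_distances``) is available in the enclosing module.
def _example_smacof_single():
    import numpy as np
    from sklearn.metrics.pairwise import euclidean_distances

    rng = np.random.RandomState(0)
    points = rng.rand(6, 3)
    D = euclidean_distances(points)  # symmetric, zero diagonal

    X, stress = _smacof_single(D, metric=True, n_components=2,
                               max_iter=300, eps=1e-3, random_state=0)
    print(X.shape, stress)  # (6, 2) and the final stress value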