def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        for csr_index_dtype in [np.int32, np.int64]:
            Xcsr = sparse.csr_matrix(X, dtype=dtype)
            # csr_matrix will use int32 indices by default,
            # up-casting those to int64 when necessary
            if csr_index_dtype is np.int64:
                Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
                Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
            assert Xcsr.indices.dtype == csr_index_dtype
            assert Xcsr.indptr.dtype == csr_index_dtype
            assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                      precision)
            assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                      precision)
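A minimal, standalone sanity check of the same API (not part of the test above), assuming only that `row_norms` is `sklearn.utils.extmath.row_norms`: the per-row norms should match `np.linalg.norm` along axis 1.

import numpy as np
from sklearn.utils.extmath import row_norms

X = np.random.RandomState(0).randn(5, 3)
# Per-row Euclidean norms, squared and unsquared.
assert np.allclose(row_norms(X), np.linalg.norm(X, axis=1))
assert np.allclose(row_norms(X, squared=True), (X ** 2).sum(axis=1))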
Example #2
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    sq_norm = (X ** 2).sum(axis=1)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X))

    Xcsr = sparse.csr_matrix(X, dtype=np.float32)
    assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr))
def euclidean_distances(X, Y=None):
    if Y is None:
        Y = X
    YY = row_norms(Y, squared=True)[np.newaxis, :]
    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
        XX = YY.T
    else:
        XX = row_norms(X, squared=True)[:, np.newaxis]

    distances = np.dot(X, Y.T)
    distances *= -2
    distances += XX
    distances += YY
    np.maximum(distances, 0, out=distances)

    return distances
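The helper above relies on the expansion ||x - y||^2 = ||x||^2 - 2 x·y + ||y||^2 and returns squared distances. A hedged check against a direct computation, assuming it runs in the same namespace as the helper (which needs `numpy` as `np` and sklearn's `row_norms`):

import numpy as np
from sklearn.utils.extmath import row_norms  # used by the helper above

rng = np.random.RandomState(0)
A, B = rng.randn(4, 3), rng.randn(6, 3)
# Direct squared distances, computed without the expansion trick.
direct = ((A[:, np.newaxis, :] - B[np.newaxis, :, :]) ** 2).sum(axis=2)
assert np.allclose(euclidean_distances(A, B), direct)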
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for fit_intercept in (True, False):
        step_size_sqr = 1.0 / (max_squared_sum + alpha + int(fit_intercept))
        step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                               int(fit_intercept))

        step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha, "squared",
                                            fit_intercept)
        step_size_log_ = get_auto_step_size(max_squared_sum_, alpha, "log",
                                            fit_intercept)

        assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
        assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
    def fit(self, X, y):
        """Fit factorization machine to training data.

        Parameters
        ----------
        X : array-like or sparse, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : Estimator
            Returns self.
        """
        if self.degree > 3:
            raise ValueError("FMs with degree >3 not yet supported.")

        X, y = self._check_X_y(X, y)
        X = self._augment(X)
        n_features = X.shape[1]  # augmented
        X_col_norms = row_norms(X.T, squared=True)
        dataset = get_dataset(X, order="fortran")
        rng = check_random_state(self.random_state)
        loss_obj = self._get_loss(self.loss)

        if not (self.warm_start and hasattr(self, 'w_')):
            self.w_ = np.zeros(n_features, dtype=np.double)

        if self.fit_lower == 'explicit':
            n_orders = self.degree - 1
        else:
            n_orders = 1

        if not (self.warm_start and hasattr(self, 'P_')):
            self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)

        if not (self.warm_start and hasattr(self, 'lams_')):
            if self.init_lambdas == 'ones':
                self.lams_ = np.ones(self.n_components)
            elif self.init_lambdas == 'random_signs':
                self.lams_ = np.sign(rng.randn(self.n_components))
            else:
                raise ValueError("Lambdas must be initialized as ones "
                                 "(init_lambdas='ones') or as random "
                                 "+/- 1 (init_lambdas='random_signs').")

        y_pred = self._get_output(X)

        converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms, y,
                                  y_pred, self.lams_, self.degree, self.alpha,
                                  self.beta, self.fit_linear,
                                  self.fit_lower == 'explicit', loss_obj,
                                  self.max_iter, self.tol, self.verbose)
        if not converged:
            warnings.warn("Objective did not converge. Increase max_iter.")

        return self
Example #6
    def compute_distances(self, x1, x2=None):
        """
        The method
        - extracts normalized continuous attributes and then uses `row_norms`
          and `safe_sparse_dot` to compute the distance as x^2 - 2xy + y^2
          (the trick from sklearn);
        - calls a function in Cython that adds the contributions of discrete
          columns
        """
        if self.normalize:
            x1 = x1 - self.means
            x1 /= np.sqrt(2 * self.vars)

        # adapted from sklearn.metric.euclidean_distances
        xx = row_norms(x1.T, squared=True)[:, np.newaxis]
        distances = safe_sparse_dot(x1.T, x1, dense_output=True)
        distances *= -2
        distances += xx
        distances += xx.T
        with np.errstate(invalid="ignore"):  # Nans are fixed below
            np.maximum(distances, 0, out=distances)
        distances.flat[::distances.shape[0] + 1] = 0.0

        fixer = _distance.fix_euclidean_cols_normalized if self.normalize \
            else _distance.fix_euclidean_cols
        fixer(distances, x1, self.means, self.vars)
        return np.sqrt(distances)
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = -np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
Example #8
    def compute_distances(self, x1, x2=None):
        """
        The method
        - extracts normalized continuous attributes and then uses `row_norms`
          and `safe_sparse_dot` to compute the distance as x^2 - 2xy + y^2
          (the trick from sklearn);
        - calls a function in Cython that recomputes the distances between pairs
          of rows that yielded nan
        - calls a function in Cython that adds the contributions of discrete
          columns
        """
        if self.continuous.any():
            data1, data2 = self.continuous_columns(
                x1, x2, self.means, np.sqrt(2 * self.vars))

            # adapted from sklearn.metric.euclidean_distances
            xx = row_norms(data1, squared=True)[:, np.newaxis]
            if x2 is not None:
                yy = row_norms(data2, squared=True)[np.newaxis, :]
            else:
                yy = xx.T
            distances = safe_sparse_dot(data1, data2.T, dense_output=True)
            distances *= -2
            distances += xx
            distances += yy
            with np.errstate(invalid="ignore"):  # Nans are fixed below
                np.maximum(distances, 0, out=distances)
            if x2 is None:
                distances.flat[::distances.shape[0] + 1] = 0.0
            fixer = _distance.fix_euclidean_rows_normalized if self.normalize \
                else _distance.fix_euclidean_rows
            fixer(distances, data1, data2,
                  self.means, self.vars, self.dist_missing2_cont,
                  x2 is not None)
        else:
            distances = np.zeros((x1.shape[0],
                                  (x2 if x2 is not None else x1).shape[0]))

        if np.any(self.discrete):
            data1, data2 = self.discrete_columns(x1, x2)
            _distance.euclidean_rows_discrete(
                distances, data1, data2, self.dist_missing_disc,
                self.dist_missing2_disc, x2 is not None)

        if x2 is None:
            _distance.lower_to_symmetric(distances)
        return np.sqrt(distances)
def get_kpp_init(X, n_clusters, random_state=None):
    random_state = check_random_state(random_state)
    x_squared_norms = row_norms(X, squared=True)
    centers = sklearn.cluster.k_means_._k_init(X, n_clusters, random_state=random_state,
                                               x_squared_norms=x_squared_norms)  # n_clusters x D
    W = np.transpose(centers)  # D x D^(1)
    W_tf = tf.constant(W)
    return centers, W, W_tf
Example #10
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10, init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]'%worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    #  x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    #  x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms)

    bs = X.shape[0] // worker_nums
    data_temp = []
    for i in range(worker_nums-1):
        data_temp.append(X[i*bs:(i+1)*bs])
    data_temp.append(X[(worker_nums-1)*bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()


    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        for j in range(1, worker_nums):
            temp_all_distances = np.hstack((temp_all_distances, all_distances[j]))
        all_distances = temp_all_distances

        #  all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        #  # reshape, from (1, n_samples, k) to (k, n_samples)
        #  all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers, all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels  = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
Example #12
def get_auto_step_size(X, alpha, loss, gamma=None, sample_weight=None):
    """Compute automatic step size for SAG solver
    Stepsize computed using the following objective:
        minimize_w  1 / n_samples * \sum_i loss(w^T x_i, y_i)
                    + alpha * 0.5 * ||w||^2_2
    Parameters
    ----------
    X : ndarray
        Array of samples x_i.
    alpha : float
        Constant that multiplies the l2 penalty term.
    loss : string, in {"log", "squared"}
        The loss function used in SAG solver.

    Returns
    -------
    step_size : float
        Step size used in SAG/SAGA solver.
    """
    if sample_weight is None:
        weighted_norms = row_norms(X, squared=True)
    else:
        weighted_norms = sample_weight * row_norms(X, squared=True)
    L = np.max(weighted_norms)
    n_samples = X.shape[0]

    if loss == 'log':
        # inverse Lipschitz constant for log loss
        lipschitz_constant = 0.25 * L + alpha
    elif loss == 'squared':
        lipschitz_constant = L + alpha
    elif loss == 'modified_huber':
        lipschitz_constant = 2 * L + alpha
    elif loss == 'smooth_hinge':
        lipschitz_constant = L + gamma + alpha
    elif loss == 'squared_hinge':
        lipschitz_constant = 2 * L + alpha
    else:
        raise ValueError("`auto` stepsize is only available for `squared` or "
                         "`log` losses (got `%s` loss). Please specify a "
                         "stepsize." % loss)
    return 1.0 / lipschitz_constant
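A small usage sketch for the variant defined above. Note that it takes the raw sample matrix X rather than a precomputed maximum squared row norm, unlike the get_auto_step_size exercised by the tests elsewhere on this page; it assumes numpy and sklearn's row_norms are importable.

import numpy as np
from sklearn.utils.extmath import row_norms  # used by the function above

X = np.array([[1., 2., 3.], [2., 3., 4.], [2., 3., 2.]])
alpha = 1.2
# For squared loss the step size is 1 / (max_i ||x_i||^2 + alpha) = 1 / (29 + 1.2).
step = get_auto_step_size(X, alpha, "squared")
assert np.isclose(step, 1.0 / (29 + 1.2))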
Example #13
    def fit(self, X):
        x_squared_norms = row_norms(X, squared=True)
        rng = np.random.RandomState(self.random_state)

        if self.init == "kmeans++":
            # Private function of sklearn.cluster.k_means_, to get the initial centers.
            init_centers = _k_init(X, self.n_clusters, x_squared_norms, rng)
        elif self.init == "random":
            random_samples = rng.randint(0, X.shape[0], size=self.n_clusters)
            init_centers = X[random_samples, :]
        else:
            raise ValueError("init should be either kmeans++ or random")

        # Assign initial labels; the ||x||**2 term is omitted since it does not change the argmin.
        init_distances = np.sum(init_centers**2, axis=1) - 2 * np.dot(X, init_centers.T)
        init_labels = np.argmin(init_distances, axis=1)
        self.labels_ = init_labels

        self.centers_ = init_centers
        self.n_samples_ = np.zeros(self.n_clusters)

        # Count the number of samples in each cluster.
        for i in range(self.n_clusters):
            self.n_samples_[i] = np.sum(self.labels_ == i)

        for i, (sample, label) in enumerate(zip(X, self.labels_)):
            curr_label = label
            max_cost = np.inf
            while max_cost > 0:
                distances = x_squared_norms[i] - 2 * np.dot(sample, self.centers_.T) + np.sum(self.centers_**2, axis=1)

                curr_distance = distances[curr_label]
                other_distance = np.delete(distances, curr_label)
                curr_n_samples = self.n_samples_[curr_label]
                other_n_samples = np.delete(self.n_samples_, curr_label)
                cost = (curr_n_samples / (curr_n_samples - 1) * curr_distance) - (other_n_samples / (other_n_samples + 1) * other_distance)
                max_cost_ind = np.argmax(cost)
                max_cost = cost[max_cost_ind]

                if max_cost > 0:
                    # We deleted the label index from other_n_samples
                    if max_cost_ind > curr_label:
                        max_cost_ind += 1

                    # Reassign the clusters
                    self.labels_[i] = max_cost_ind

                    self.centers_[curr_label] = (curr_n_samples * self.centers_[curr_label] - sample) / (curr_n_samples - 1)
                    moved_n_samples = self.n_samples_[max_cost_ind]
                    self.centers_[max_cost_ind] = (moved_n_samples * self.centers_[max_cost_ind] + sample) / (moved_n_samples + 1)
                    self.n_samples_[curr_label] -= 1
                    self.n_samples_[max_cost_ind] += 1
                    curr_label = max_cost_ind
Example #14
    def prepare_data(x):
        if self.discrete.any():
            data = Cosine.discrete_to_indicators(x, self.discrete)
        else:
            data = x.copy()
        for col, mean in enumerate(self.means):
            column = data[:, col]
            column[np.isnan(column)] = mean
        if self.axis == 0:
            data = data.T
        data /= row_norms(data)[:, np.newaxis]
        return data
Example #15
def kmeans_subsample(X, n_clusters, random_state=None, n_local_trials=10):

    random_state = check_random_state(random_state)

    n_samples, n_features = X.shape
    x_squared_norms = row_norms(X, squared=True)
    centers = np.empty((n_clusters, n_features))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(centers[0].reshape(1, -1), X, Y_norm_squared=x_squared_norms, squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
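A minimal usage sketch for kmeans_subsample above on random data; the imports mirror what the function appears to rely on (sklearn's pairwise euclidean_distances, check_random_state and row_norms), which is an assumption about its original module.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils import check_random_state
from sklearn.utils.extmath import row_norms

X = np.random.RandomState(0).randn(200, 2)
centers = kmeans_subsample(X, n_clusters=4, random_state=0)
print(centers.shape)  # (4, 2)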
Example #16
def kmeanspp(X, k, seed):
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)
    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed, x_squared_norms=x_squared_norms)
    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution
    nns = NearestNeighbors()
    nns.fit(X)
    centroid_candidatess = nns.radius_neighbors(X=centroids, radius=0, return_distance=False)
    # Account for "degenerate" solutions: several voxels at distance 0, each becoming a centroid
    centroids = set()
    for centroid_candidates in centroid_candidatess:
        centroid_candidates = set(centroid_candidates) - centroids
        if len(set(centroid_candidates) - centroids) == 0:
            raise Exception('Cannot get an unambiguous set of centers; '
                            'theoretically this cannot happen, so check for bugs')
        centroids.add(centroid_candidates.pop())
    return np.array(sorted(centroids))
Example #17
def _init_centroids(X, k, init, random_state, x_squared_norms=None):
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)


    if n_samples < k:
        raise ValueError("n_samples=%d should be larger than k=%d"%(n_samples, k))

    if init == 'k-means++':
        centers = _k_init(X, k, random_state=random_state,
                            x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]

    return centers
Example #18
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    n_samples = X.shape[0]
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            if saga:
                L_sqr = (max_squared_sum + alpha + int(fit_intercept))
                L_log = (max_squared_sum + 4.0 * alpha +
                         int(fit_intercept)) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)
            else:
                step_size_sqr = 1.0 / (max_squared_sum +
                                       alpha + int(fit_intercept))
                step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                                       int(fit_intercept))

            step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "squared",
                                                fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)
            step_size_log_ = get_auto_step_size(max_squared_sum_, alpha, "log",
                                                fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)

            assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
            assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
Example #19
    def predict(self, X):
            """Predict the closest cluster each sample in X belongs to.

            In the vector quantization literature, `cluster_centers_` is called
            the code book and each value returned by `predict` is the index of
            the closest code in the code book.

            Parameters
            ----------
            X : {array-like, sparse matrix}, shape = [n_samples, n_features]
                New data to predict.

            Returns
            -------
            labels : array, shape [n_samples,]
                Index of the cluster each sample belongs to.
            """
            #check_is_fitted(self, 'cluster_centers_')

            X = self._check_test_data(X)
            x_squared_norms = row_norms(X, squared=True)
            return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
Example #20
    def run_step(self, run_number, step_size, howlong):
        df_slot = self.get_input_slot('df')
        df_slot.update(run_number, buffer_created=True, buffer_updated=True)
        if df_slot.has_deleted():
            self.reset()
            df_slot.reset()
            df_slot.update(run_number)
        input_df = df_slot.data()
        columns = self.get_columns(input_df)
        if input_df is None or len(input_df) == 0:
            return self._return_run_step(self.state_blocked, steps_run=0)
        indices = df_slot.next_created(step_size)
        steps = indices_len(indices)
        step_size -= steps
        steps_run = steps
        if steps != 0:
            indices = fix_loc(indices)
            self._buffer.append(input_df.loc[indices])
            self._df = self._buffer.df()
            self._df.loc[indices, self.UPDATE_COLUMN] = run_number
        if step_size > 0 and df_slot.has_updated():
            indices = df_slot.next_updated(step_size, as_slice=False)
            steps = indices_len(indices)
            if steps != 0:
                steps_run += steps
                indices = fix_loc(indices)  # no need, but stick to the stereotype
                updated = self.filter_columns(input_df, indices)
                df = self.filter_columns(self._df, indices)
                norms = row_norms(updated - df)
                selected = (norms > (self._delta * self.get_scale()))
                indices = indices[selected]
                if selected.any():
                    logger.debug('updating at %d', run_number)
                    self._df.loc[indices, self._columns] = updated.loc[indices, self._columns]
                    self._df.loc[indices, self.UPDATE_COLUMN] = run_number
                else:
                    logger.debug('Not updating at %d', run_number)
        return self._return_run_step(df_slot.next_state(), steps_run=steps_run)
Example #21
def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++', random_state=None, tol=1e-4):
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    x_squared_norms = row_norms(X, squared=True)
    centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms)

    #  distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers)

        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels  = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers)


    return best_centers, best_labels, best_inertia
Example #22
from sklearn.utils import check_random_state
from sklearn.cluster import KMeans as skKMeans
# %%

# data = pd.read_csv('s1.csv', sep=',')
data = pd.read_csv('s1.csv', sep=',')
# %%
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data))

# %%
# K-means++ für erste Cluster
N = 15
random_state = 42
fit_data = np.asarray(data_scaled)
x_squared_norms = row_norms(fit_data, squared=True)
random = check_random_state(random_state)
initial_clusters = k_init(fit_data, N, x_squared_norms, random)

max_norm = KMeans(N, initial_clusters, order=np.inf)
max_norm.fit(fit_data)
manhattan = KMeans(N, initial_clusters, order=1)
manhattan.fit(fit_data)
euclid = KMeans(N, initial_clusters, order=2)
euclid.fit(fit_data)
# %%
fig, axs = plt.subplots(2, 3)
data_scaled['max'] = max_norm.labels
center = max_norm.centroids
axs[0, 0].scatter(data_scaled[0],
                  data_scaled[1],
Example #23
data = data_original[idx]
labels = labels_original[idx]
x_train, x_test, y_train, y_test = train_test_split(data, labels,test_size=0.20)

#Scaling of features
scaler = StandardScaler() 
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.transform(x_test)
scaled_data_original = scaler.transform(data_original)


random_state = 0
K = 5896
gamma = 177.82
random_state = check_random_state(random_state)
x_squared_norms = row_norms(scaled_x_train, squared=True)

if not sp.issparse(scaled_x_train):
        scaled_x_train_mean = scaled_x_train.mean(axis=0)
        scaled_x_train -= scaled_x_train_mean
        
if not sp.issparse(scaled_x_test):
        scaled_x_test_mean = scaled_x_test.mean(axis=0)
        scaled_x_test -= scaled_x_test_mean
        
if not sp.issparse(scaled_data_original):
        scaled_data_original_mean = scaled_data_original.mean(axis=0)
        scaled_data_original -= scaled_data_original_mean
        
#Initializing the centers using k-means++ algorithm implementation of sklearn
centers = _k_init(scaled_x_train, K, random_state=random_state, x_squared_norms=x_squared_norms)
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from time import time
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.metrics.pairwise import euclidean_distances

k = 10
csv = np.genfromtxt('census_50k.csv', delimiter=",")
sh = csv.shape
mu = np.ones((k, sh[1]))
x_squared_norms = row_norms(csv, squared=True)

t0 = time()
for i in range(100):
    all_distances = euclidean_distances(mu, csv, x_squared_norms, squared=True)
    mu = mu + 1

t = time() - t0
print(t)
def convert_sklearn_kmeans(scope, operator, container):
    """
    Computation graph of distances to all centroids for a batch of examples.
    Note that a centroid is just the center of a cluster. We use ``[]`` to
    denote the dimension of a variable; for example, ``X[3, 2]`` means that
    *X* is a *3-by-2* tensor. In addition, for a matrix *X*, $X'$ denotes its
    transpose.

    Symbols:

    * *l*: # of examples.
    * *n*: # of features per input example.
    * *X*: input examples, l-by-n tensor.
    * *C*: centroids, k-by-n tensor.
    * $C^2$: squared 2-norm of all centroid vectors; its shape is ``[k]``.
    * *Y*: 2-norm of the difference between examples and centroids,
      an *l-by-k* tensor. The value at the i-th row and k-th column,
      ``Y[i, k]``, is the distance from example *i* to centroid *k*.
    * *L*: the id of the nearest centroid for each input example,
      its shape is ``[l]``.

    ::

     .------------------------------------------------------.
     |                                                      |
     |                                                      v
X [l, n] --> ReduceSumSquare -> X^2 [l]   Gemm (alpha=-2, transB=1) <- C [k, n]
                                 |                  |
                                 |                  v
                                 `------> Add <-- -2XC' [l, k]
                                           |
                                           v
             C^2 [k] --------> Add <----- Z [l, k]
                                |
                                v
         L [l] <-- ArgMin <--  Y2 [l, k] --> Sqrt --> Y [l, k]

    *scikit-learn* code:

    ::

        X = data
        Y = model.cluster_centers_
        XX = row_norms(X, squared=True)
        YY = row_norms(Y, squared=True)
        distances = safe_sparse_dot(X, Y.T, dense_output=True)
        distances *= -2
        distances += XX[:, numpy.newaxis]
        distances += YY[numpy.newaxis, :]
        numpy.sqrt(distances, out=distances)
    """
    op = operator.raw_operator
    variable = operator.inputs[0]
    N = variable.type.shape[0]

    # centroids
    shapeC = list(op.cluster_centers_.shape)
    nameC = scope.get_unique_variable_name('centroid')
    container.add_initializer(nameC, onnx_proto.TensorProto.FLOAT,
                              shapeC, op.cluster_centers_.flatten())

    nameX2 = scope.get_unique_variable_name('X2')
    nameX = operator.inputs[0].full_name
    container.add_node('ReduceSumSquare', [nameX], [nameX2], axes=[1],
                       keepdims=1,
                       name=scope.get_unique_operator_name('ReduceSumSquare'))

    # Compute -2XC'
    zero_name = scope.get_unique_variable_name('zero')
    zeros = np.zeros((N, ))
    container.add_initializer(zero_name, onnx_proto.TensorProto.FLOAT,
                              list(zeros.shape), zeros)
    nameXC2 = scope.get_unique_variable_name('XC2')
    apply_gemm(scope, [nameX, nameC, zero_name], [nameXC2], container,
               alpha=-2., transB=1)

    # Compute Z = X^2 - 2XC'
    nameZ = scope.get_unique_variable_name("Z")
    apply_add(scope, [nameXC2, nameX2], [nameZ], container)

    # centroids ^2
    nameC2 = scope.get_unique_variable_name('C2')
    c2 = row_norms(op.cluster_centers_, squared=True)
    container.add_initializer(nameC2, onnx_proto.TensorProto.FLOAT,
                              [1, shapeC[0]], c2.flatten())

    # Compute Y2 = Z + C^2
    nameY2 = scope.get_unique_variable_name('Y2')
    apply_add(scope, [nameZ, nameC2], [nameY2], container)

    # Compute Y = sqrt(Y2)
    nameY = operator.outputs[1].full_name
    apply_sqrt(scope, [nameY2], [nameY], container)

    # Compute the most-matched cluster index, L
    nameL = operator.outputs[0].full_name
    container.add_node('ArgMin', [nameY2], [nameL],
                       name=scope.get_unique_operator_name('ArgMin'),
                       axis=1, keepdims=0)
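To make the data flow described in the docstring concrete, here is a NumPy emulation of the same node sequence (ReduceSumSquare, Gemm, Add, Add, Sqrt, ArgMin), checked against a directly computed distance matrix. This is an illustrative sketch using a fitted scikit-learn KMeans, not the ONNX converter itself.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.utils.extmath import row_norms

X = np.random.RandomState(0).randn(50, 3)
C = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X).cluster_centers_

X2 = (X ** 2).sum(axis=1)              # ReduceSumSquare -> X^2 [l]
XC2 = -2.0 * X @ C.T                   # Gemm(alpha=-2, transB=1) -> -2XC' [l, k]
Z = XC2 + X2[:, np.newaxis]            # Add -> Z [l, k]
Y2 = Z + row_norms(C, squared=True)    # Add C^2 -> Y2 [l, k]
Y = np.sqrt(np.maximum(Y2, 0))         # Sqrt -> Y [l, k]
L = Y2.argmin(axis=1)                  # ArgMin -> L [l]

ref = np.sqrt(((X[:, np.newaxis, :] - C[np.newaxis, :, :]) ** 2).sum(axis=2))
assert np.allclose(Y, ref)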
Example #26
def row_norms(X, squared=False):
    if isinstance(X, np.ndarray):
        return skm.row_norms(X, squared=squared)
    return X.map_blocks(
        skm.row_norms, chunks=(X.chunks[0],), drop_axis=1, squared=squared
    )
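A possible usage of this wrapper, assuming `skm` was imported as `sklearn.utils.extmath` and that dask is installed; NumPy inputs go straight to sklearn, while dask arrays get lazy, chunk-wise norms.

import dask.array as da
import numpy as np
import sklearn.utils.extmath as skm

X_np = np.random.RandomState(0).randn(1000, 20)
X_da = da.from_array(X_np, chunks=(100, 20))
# The dask path returns a lazy 1-D array; compute() materialises it.
assert np.allclose(row_norms(X_da, squared=True).compute(),
                   skm.row_norms(X_np, squared=True))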
def k_means_gpu_sparsity(weight_vector,
                         n_clusters,
                         ratio=0.5,
                         verbosity=0,
                         seed=int(time.time()),
                         gpu_id=0):

    if ratio == 0:

        return k_means_gpu(weight_vector=weight_vector,
                           n_clusters=n_clusters,
                           verbosity=verbosity,
                           seed=seed,
                           gpu_id=gpu_id)

    if ratio == 1:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        else:
            weight_vector_1_mean = np.mean(weight_vector, axis=0)

            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean

            return weight_vector_compress

    else:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        elif weight_vector.shape[1] == 1:

            return k_means_sparsity(weight_vector,
                                    n_clusters,
                                    ratio,
                                    seed=seed)

        else:
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)

            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)

            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples)
                if i not in center_cluster_index
            ])

            weight_vector_train = weight_vector[remaining_cluster_index, :]
            init_centers = k_means_._k_init(X=weight_vector_train,
                                            n_clusters=n_clusters - 1,
                                            x_squared_norms=row_norms(
                                                weight_vector_train,
                                                squared=True),
                                            random_state=RandomState(seed))
            centers, labels = kmeans_cuda(samples=weight_vector_train,
                                          clusters=n_clusters - 1,
                                          init=init_centers,
                                          yinyang_t=0,
                                          seed=seed,
                                          device=gpu_id,
                                          verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean

            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            return weight_vector_compress
Example #28
def _initialize_nrkmeans_parameters(X, n_clusters, V, m, P, centers, max_iter,
                                    random_state):
    """
    Initialize the input parameters for NrKmeans. All input values that are None are given default values here,
    and all input parameters that are not None are checked to ensure a correct execution is possible.
    :param X: input data
    :param n_clusters: list containing number of clusters for each subspace
    :param V: orthogonal rotation matrix
    :param m: list containing number of dimensionalities for each subspace
    :param P: list containing projections for each subspace
    :param centers: list containing the cluster centers for each subspace
    :param max_iter: maximum number of iterations for the algorithm
    :param random_state: use a fixed random state to get a repeatable solution
    :return: checked V, m, P, centers, random_state, number of subspaces, labels, scatter_matrices
    """
    data_dimensionality = X.shape[1]
    random_state = check_random_state(random_state)
    # Check if n_clusters is a list
    if not type(n_clusters) is list:
        raise ValueError(
            "Number of clusters must be specified for each subspace and therefore be a list.\nYour input:\n"
            + str(n_clusters))
    # Check if n_clusters contains negative values
    if len([x for x in n_clusters if x < 1]) > 0:
        raise ValueError(
            "Number of clusters must not contain negative values or 0.\nYour input:\n"
            + str(n_clusters))
    # Check if n_clusters contains more than one noise space
    nr_noise_spaces = len([x for x in n_clusters if x == 1])
    if nr_noise_spaces > 1:
        raise ValueError(
            "Only one subspace can be the noise space (number of clusters = 1).\nYour input:\n"
            + str(n_clusters))
    # Check if noise space is not the last member in n_clusters
    if nr_noise_spaces != 0 and n_clusters[-1] != 1:
        raise ValueError(
            "Noise space (number of clusters = 1) must be the last entry in n_clusters.\nYour input:\n"
            + str(n_clusters))
    # Get number of subspaces
    subspaces = len(n_clusters)
    # Check if V is orthogonal
    if V is None:
        V = ortho_group.rvs(dim=data_dimensionality, random_state=random_state)
    if not _is_matrix_orthogonal(V):
        raise Exception("Your input matrix V is not orthogonal.\nV:\n" +
                        str(V))
    # Calculate dimensionalities m
    if m is None and P is None:
        m = [int(data_dimensionality / subspaces)] * subspaces
        if data_dimensionality % subspaces != 0:
            choices = random_state.choice(range(subspaces),
                                          data_dimensionality - sum(m))
            for choice in choices:
                m[choice] += 1
    # If m is None but P is defined use P's dimensionality
    elif m is None:
        m = [len(x) for x in P]
    if not isinstance(m, list) or len(m) != subspaces:
        raise ValueError(
            "A dimensionality list m must be specified for each subspace.\nYour input:\n"
            + str(m))
    # Calculate projections P
    if P is None:
        possible_projections = list(range(data_dimensionality))
        P = []
        for dimensionality in m:
            choices = random_state.choice(possible_projections,
                                          dimensionality,
                                          replace=False)
            P.append(choices)
            possible_projections = list(
                set(possible_projections) - set(choices))
    if not isinstance(P, list) or len(P) != subspaces:
        raise ValueError(
            "Projection lists must be specified for each subspace.\nYour input:\n"
            + str(P))
    else:
        # Check if the length of entries in P matches values of m
        used_dimensionalities = []
        for i, dimensionality in enumerate(m):
            used_dimensionalities.extend(P[i])
            if not len(P[i]) == dimensionality:
                raise ValueError(
                    "Values for dimensionality m and length of projection list P do not match.\nDimensionality m:\n"
                    + str(dimensionality) + "\nDimensionality P:\n" +
                    str(P[i]))
        # Check if every dimension is considered in P
        if sorted(used_dimensionalities) != list(range(data_dimensionality)):
            raise ValueError(
                "Projections P must include all dimensionalities.\nYour used dimensionalities:\n"
                + str(used_dimensionalities))
    # Define initial cluster centers with kmeans++ for each subspace
    if centers is None:
        centers = []
        for i in range(subspaces):
            k = n_clusters[i]
            if k > 1:
                P_subspace = P[i]
                cropped_X = np.matmul(X, V[:, P_subspace])
                centers_cropped = kpp(cropped_X, k,
                                      row_norms(cropped_X, squared=True),
                                      random_state)
                labels, _ = pairwise_distances_argmin_min(
                    X=cropped_X,
                    Y=centers_cropped,
                    metric='euclidean',
                    metric_kwargs={'squared': True})

                centers_sub = np.zeros((k, X.shape[1]))
                # Update cluster parameters
                for center_id, _ in enumerate(centers_sub):
                    # Get points in this cluster
                    points_in_cluster = np.where(labels == center_id)[0]

                    # Update center
                    centers_sub[center_id] = np.average(X[points_in_cluster],
                                                        axis=0)
                centers.append(centers_sub)
            else:
                centers.append(np.expand_dims(np.average(X, axis=0), 0))

    if not isinstance(centers, list) or len(centers) != subspaces:
        raise ValueError(
            "Cluster centers must be specified for each subspace.\nYour input:\n"
            + str(centers))
    else:
        # Check if number of centers for subspaces matches value in n_clusters
        for i, subspace_centers in enumerate(centers):
            if not n_clusters[i] == len(subspace_centers):
                raise ValueError(
                    "Values for number of clusters n_clusters and number of centers do not match.\nNumber of clusters:\n"
                    + str(n_clusters[i]) + "\nNumber of centers:\n" +
                    str(len(subspace_centers)))
    # Check max iter
    if max_iter is None or type(max_iter) is not int or max_iter <= 0:
        raise ValueError(
            "Max_iter must be an integer larger than 0. Your Max_iter:\n" +
            str(max_iter))
    # Initial labels and scatter matrices
    labels = [None] * subspaces
    scatter_matrices = [None] * subspaces
    return V, m, P, centers, random_state, subspaces, labels, scatter_matrices
Example #29
def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
                   regularization=None, copy_cov=True,
                   init=None, max_iter=1000):
    """Generic sparse coding

    Each column of the result is the solution to a Lasso problem.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Data matrix.

    dictionary: array of shape (n_components, n_features)
        The dictionary matrix against which to solve the sparse coding of
        the data. Some of the algorithms assume normalized rows.

    gram: None | array, shape=(n_components, n_components)
        Precomputed Gram matrix, dictionary * dictionary'
        gram can be None if method is 'threshold'.

    cov: array, shape=(n_components, n_samples)
        Precomputed covariance, dictionary * X'

    algorithm: {'lasso_admm', 'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}
        lars: uses the least angle regression method (linear_model.lars_path)
        lasso_lars: uses Lars to compute the Lasso solution
        lasso_cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). lasso_lars will be faster if
        the estimated components are sparse.
        omp: uses orthogonal matching pursuit to estimate the sparse solution
        threshold: squashes to zero all coefficients less than regularization
        from the projection dictionary * data'

    regularization : int | float
        The regularization parameter. It corresponds to alpha when
        algorithm is 'lasso_lars', 'lasso_cd' or 'threshold'.
        Otherwise it corresponds to n_nonzero_coefs.

    init: array of shape (n_samples, n_components)
        Initialization value of the sparse code. Only used if
        `algorithm='lasso_cd'`.

    max_iter: int, 1000 by default
        Maximum number of iterations to perform if `algorithm='lasso_cd'`.

    copy_cov: boolean, optional
        Whether to copy the precomputed covariance matrix; if False, it may be
        overwritten.

    Returns
    -------
    code: array of shape (n_samples, n_components)
        The sparse codes

    See also
    --------
    sklearn.linear_model.lars_path
    sklearn.linear_model.orthogonal_mp
    sklearn.linear_model.Lasso
    SparseCoder
    """
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_samples, n_features = X.shape
    if cov is None and algorithm != 'lasso_cd':
        # overwriting cov is safe
        copy_cov = False
        cov = np.dot(dictionary, X.T)

    if algorithm == 'lasso_admm':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')

            code, dictionary = lasso_admm(X.T, dictionary.T,
                                          gamma=alpha,
                                          gram=gram, cov=cov,
                                          max_iter=max_iter)

            new_code = code.T
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_lars':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')
            lasso_lars = LassoLars(alpha=alpha, fit_intercept=False,
                                   verbose=False, normalize=False,
                                   precompute=gram, fit_path=False)
            lasso_lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lasso_lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_cd':
        alpha = float(regularization) / n_features  # account for scaling
        clf = Lasso(alpha=alpha, fit_intercept=False, precompute=gram,
                    max_iter=max_iter, warm_start=True)
        clf.coef_ = init
        clf.fit(dictionary.T, X.T)
        new_code = clf.coef_

    elif algorithm == 'lars':
        try:
            err_mgt = np.seterr(all='ignore')
            lars = Lars(fit_intercept=False, verbose=False, normalize=False,
                        precompute=gram, n_nonzero_coefs=int(regularization),
                        fit_path=False)
            lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'threshold':
        new_code = ((np.sign(cov) *
                    np.maximum(np.abs(cov) - regularization, 0)).T)

    elif algorithm == 'omp':
        new_code = orthogonal_mp_gram(gram, cov, regularization, None,
                                      row_norms(X, squared=True),
                                      copy_Xy=copy_cov).T
    else:
        raise ValueError('Sparse coding method must be "lasso_admm", '
                         '"lasso_lars", "lasso_cd", "lars", "threshold" or '
                         '"omp", got %s.' % algorithm)
    return new_code
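For the 'threshold' branch documented above, the update is a plain soft-threshold of dictionary.dot(X.T). A tiny self-contained illustration (not the full solver), with a hypothetical cov standing in for that product:

import numpy as np

cov = np.array([[0.9, -0.2], [0.1, -1.5]])   # stands in for dictionary @ X.T
regularization = 0.5
code = (np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0)).T
# Entries with |cov| <= 0.5 are zeroed; the rest shrink toward zero by 0.5.
print(code)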
def k_means_gpu_sparsity(weight_vector,
                         n_clusters,
                         ratio=0.5,
                         verbosity=0,
                         seed=int(time.time()),
                         gpu_id=0):

    # print(n_clusters)
    if ratio == 0:

        return k_means_gpu(weight_vector=weight_vector,
                           n_clusters=n_clusters,
                           verbosity=verbosity,
                           seed=seed,
                           gpu_id=gpu_id)

    if ratio == 1:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        else:
            # mean_sample = np.mean(weight_vector, axis=0)
            weight_vector_1_mean = np.mean(weight_vector, axis=0)

            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean

            return weight_vector_compress

    else:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        elif weight_vector.shape[1] == 1:

            return k_means_sparsity(weight_vector,
                                    n_clusters,
                                    ratio,
                                    seed=seed)

        else:
            # print('n_clusters', n_clusters)
            # print('weight_vector.shape',weight_vector.shape)
            # print('kmeans++ init start')
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)

            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            # weight_vector_1 = weight_vector[min_index, :]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)

            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples)
                if i not in center_cluster_index
            ])

            weight_vector_train = weight_vector[remaining_cluster_index, :]
            # weight_vector_train = [element for i, element in enumerate(weight_vector) if i not in min_index]
            # weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            init_centers = sklearn.cluster.k_means_._k_init(
                X=weight_vector_train,
                n_clusters=n_clusters - 1,
                x_squared_norms=row_norms(weight_vector_train, squared=True),
                random_state=RandomState(seed))
            # # print('kmeans++ init finished')
            # # print('init_centers.shape',init_centers.shape)
            centers, labels = kmeans_cuda(samples=weight_vector_train,
                                          clusters=n_clusters - 1,
                                          init=init_centers,
                                          yinyang_t=0,
                                          seed=seed,
                                          device=gpu_id,
                                          verbosity=verbosity)
            # print(np.unique(labels, axis=0).shape[0]+1)
            # centers, labels = kmeans_cuda(samples = weight_vector, clusters = n_clusters, init="k-means++", yinyang_t=0, seed=seed, device=gpu_id, verbosity=verbosity)
            # centers, labels = kmeans_cuda(samples = weight_vector, clusters = n_clusters, init="random", yinyang_t=0, seed=seed, device=gpu_id, verbosity=verbosity)
            # centers, labels = kmeans_cuda(samples = weight_vector, clusters = n_clusters, init="afk-mc2", yinyang_t=0, seed=seed, device=gpu_id, verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean

            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            # weight_compress = np.reshape(weight_vector_compress, (filters_num, filters_channel, filters_size, filters_size))
            # print(np.unique(weight_vector_compress, axis=0).shape[0])
            # print(n_clusters, '\n')
            # assert np.unique(weight_vector_compress, axis=0).shape[0]==n_clusters, "cluster number mismatch"
            return weight_vector_compress
Example #31
    def fit(self, X):
        rng = np.random.RandomState(self.random_state)
        new_cluster_centers = np.zeros((self.n_clusters, X.shape[1]))
        n_samples_arrays = np.arange(X.shape[0])

        if self.return_cost_per_iteration:
            self.cost_array_ = np.zeros(self.max_iter)

        if self.n_clusters > 20:
            raise ValueError("Group clustering not supported yet")

        if self.init == "random":
            old_cluster_centers_ = X[rng.randint(0, X.shape[0], self.n_clusters), :]
        else:
            raise ValueError("wait till we support other initializations.")

        # Run K-Means for the first time.
        # Don't do cluster.KMeans().fit(X) because of input_validation etc.
        dot_product = 2 * np.dot(X, old_cluster_centers_.T)
        cluster_norms = row_norms(old_cluster_centers_, squared=True).reshape(1, -1)
        self.distances_ = row_norms(X, squared=True).reshape(-1, 1) - dot_product + cluster_norms

        # Find the closest and the second closest cluster for each sample.
        upper_and_lower_bounds = np.argpartition(self.distances_, 1, axis=1)
        self.labels_ = upper_and_lower_bounds[:, 0]
        self.almost_labels_ = upper_and_lower_bounds[:, 1]
        self.upper_and_lower_bounds_ = self.distances_[n_samples_arrays.reshape(-1, 1), upper_and_lower_bounds]

        # Update cluster centers
        for i in range(self.n_clusters):
            new_cluster_centers[i] = np.mean(X[self.labels_ == i], axis=0)
        self.cluster_centers_ = new_cluster_centers

        for n_iter in range(self.max_iter):

            if self.return_cost_per_iteration:
                self.cost_array_[n_iter] = _calculate_cost(X, self.labels_, self.cluster_centers_)

            # Calculate how much each center has drifted.
            drift = ((old_cluster_centers_ - self.cluster_centers_)**2).sum(axis=1)
            if np.sum(drift) < self.tol:
                break
            old_cluster_centers_ = np.copy(self.cluster_centers_)

            # Add the drift to the upper bounds and subtract the drift from the lower bounds.
            for i in range(self.n_clusters):
                mask = self.labels_ == i
                self.upper_and_lower_bounds_[:, 0][mask] += drift[i]
                self.upper_and_lower_bounds_[:, 1][mask] -= drift[i]

            # If the lower bound (distance to the second-closest center) has dropped
            # below the upper bound (distance to the assigned center), the bounds are
            # no longer conclusive, so recompute the exact distance to the assigned
            # center and tighten the upper bound. This follows from the triangle
            # inequality: d(old_center, new_center) + d(old_center, X) >= d(X, new_center)
            mask_changed_bounds = self.upper_and_lower_bounds_[:, 1] < self.upper_and_lower_bounds_[:, 0]

            #XXX: Vectorize?
            for i in range(self.n_clusters):
                cluster = self.cluster_centers_[i]
                new_mask = np.logical_and(mask_changed_bounds, self.labels_ == i)
                distances = np.sum((X[new_mask] - cluster)**2, axis=1)
                self.upper_and_lower_bounds_[:, 0][new_mask] = distances

            # Where the lower bound is still below the (now exact) upper bound,
            # the second-closest center is in fact the closest: swap the labels.
            mask_changed_bounds = self.upper_and_lower_bounds_[:, 1] < self.upper_and_lower_bounds_[:, 0]
            tmp = self.labels_[mask_changed_bounds]
            self.labels_[mask_changed_bounds] = self.almost_labels_[mask_changed_bounds]
            self.almost_labels_[mask_changed_bounds] = tmp

            self.upper_and_lower_bounds_[:, 1][mask_changed_bounds] = self.upper_and_lower_bounds_[:, 0][mask_changed_bounds]

            #XXX: Vectorize?
            for i in range(self.n_clusters):
                cluster = self.cluster_centers_[i]
                new_mask = np.logical_and(mask_changed_bounds, self.labels_ == i)
                distances = np.sum((X[new_mask] - cluster)**2, axis=1)
                self.upper_and_lower_bounds_[:, 0][new_mask] = distances

            # TODO: Optimize this step.
            for i in range(self.n_clusters):
                mask = self.labels_ == i
                self.cluster_centers_[i] = np.mean(X[mask], axis=0)

        self.n_iter_ = n_iter
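
A minimal numpy check, separate from the class above, of the triangle-inequality reasoning behind these bound updates: after a center moves, the old distance from a sample to that center plus the center drift still upper-bounds the new distance (the class applies the same idea to squared quantities, which is an approximation).

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(5)                        # one sample
c_old = rng.randn(5)                    # center before the update
c_new = c_old + 0.1 * rng.randn(5)      # center after a small drift

upper_bound = np.linalg.norm(x - c_old)     # d(x, c_old)
drift = np.linalg.norm(c_new - c_old)       # d(c_old, c_new)
true_distance = np.linalg.norm(x - c_new)   # d(x, c_new)

# Triangle inequality: d(x, c_new) <= d(x, c_old) + d(c_old, c_new)
assert true_distance <= upper_bound + drift
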
def spherical_k_means(X,
                      n_clusters,
                      init='k-means++',
                      n_init=10,
                      max_iter=300,
                      verbose=False,
                      tol=1e-4,
                      random_state=None,
                      copy_x=True,
                      n_jobs=1,
                      algorithm="auto",
                      return_n_iter=False):
    """Modified from sklearn.cluster.k_means_.k_means.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d' %
                n_init,
                RuntimeWarning,
                stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                random_state=random_state)

            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed) for seed in seeds)

        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
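
As a hedged, self-contained illustration of the spherical k-means idea (not a drop-in replacement for the function above, which relies on _spherical_kmeans_single_lloyd and the other module helpers): L2-normalizing the samples puts them on the unit sphere, where Euclidean k-means behaves like clustering by cosine similarity, and centers can be re-normalized after fitting.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

X = np.random.RandomState(0).randn(200, 16)
X_unit = normalize(X, norm='l2')                  # project samples onto the unit sphere
km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X_unit)
centers_unit = normalize(km.cluster_centers_)     # keep the centers on the sphere too
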
Exemple #33
0
    def mbkmean(self, options, n_clusters, n_init, batch_size, n_iter,
                n_samples, labels_true, k_means, X):
        # TODO: handle the online MiniBatchKMeans variant
        # Compute clustering with MiniBatchKMeans
        mbk = cluster.MiniBatchKMeans(init=self.init,
                                      n_clusters=n_clusters,
                                      batch_size=batch_size,
                                      n_init=10,
                                      max_no_improvement=n_iter,
                                      verbose=0)

        #INIT THREADs
        try:
            if options[2] == '-pp' or options[3] == '-pp':
                thread_1 = afficheur('starting threads', labels_true, mbk,
                                     k_means, X, n_clusters)
                thread_1.start()
        except IndexError:
            pass

        try:
            if options[2] == '-s':

                #init state
                n_batches = int(np.ceil(float(n_samples) / batch_size))
                max_iter = 100

                tol = 0
                _, n_features = X.shape
                old_center_buffer = np.zeros(n_features, dtype=X.dtype)
                random_state = check_random_state(None)
                init_size = 3 * batch_size
                if init_size > n_samples:
                    init_size = n_samples

                validation_indices = random_state.randint(
                    0, n_samples, init_size)
                X_valid = X[validation_indices]
                x_squared_norms = row_norms(X, squared=True)
                x_squared_norms_valid = x_squared_norms[validation_indices]
                counts = np.zeros(n_clusters, dtype=np.int32)
                best_inertia = None
                cluster_centers = None

                for init_idx in range(n_init):

                    cluster_centers = cluster._init_centroids(
                        X,
                        n_clusters,
                        self.init,
                        random_state=random_state,
                        x_squared_norms=x_squared_norms,
                        init_size=init_size)
                    batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                        X_valid,
                        x_squared_norms[validation_indices],
                        cluster_centers,
                        counts,
                        old_center_buffer,
                        False,
                        distances=None,
                        verbose=False)
                    _, inertia = cluster._labels_inertia(
                        X_valid, x_squared_norms_valid, cluster_centers)
                    if best_inertia is None or inertia < best_inertia:
                        mbk.cluster_centers_ = cluster_centers
                        mbk.counts_ = counts
                        best_inertia = inertia
                        print('best inertia %d' % best_inertia)

                while (True):
                    thread_1 = afficheur('starting threads', labels_true, mbk,
                                         k_means, X, n_clusters)
                    thread_1.start()
                    t0 = time.time()

                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        thread_1.update(mbk)

                    t_mini_batch = time.time() - t0
                    thread_1.stop()
                    thread_1.join()

                    n_iter = self.input_num("Next number of iterations: ")

                    if n_iter == "stop":
                        return mbk, t_mini_batch
                    if not isinstance(n_iter, int):
                        print('error: an integer is required, got type %s' %
                              type(n_iter))
                        break

        except IndexError:
            pass

        try:
            if options[2] == '-pp':

                random_state = check_random_state(None)
                t0 = time.time()
                # Sample a minibatch from the full dataset
                for iteration_idx in range(n_iter - 1):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])

                    thread_1.update(mbk)
                t_mini_batch = time.time() - t0
                thread_1.stop()
                thread_1.join()
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] == '-p':

                random_state = check_random_state(None)
                t0 = time.time()
                for iteration_idx in range(n_iter):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])

                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] == '-n':
                t0 = time.time()
                mbk = mbk.fit(X)
                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] == None:
                random_state = check_random_state(None)
                # Sample a minibatch from the full dataset
                t0 = time.time()
                for iteration_idx in range(n_iter - 1):
                    minibatch_indices = random_state.randint(
                        0, n_samples, self.batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])
                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch
        except IndexError:
            pass

        try:
            if options[2] == '-o':
                n_batches = int(np.ceil(float(n_samples) / batch_size))
                max_iter = 100

                n_iter = int(max_iter * n_batches)
                tol = 0
                _, n_features = X.shape
                old_center_buffer = np.zeros(n_features, dtype=X.dtype)
                try:
                    #  print('self.max_iter %d , n_batches %d '%(n_iter,n_batches))
                    if options[3] == '-pp':
                        #init state

                        random_state = check_random_state(None)
                        init_size = 3 * batch_size

                        if init_size > n_samples:
                            init_size = n_samples

                        validation_indices = random_state.randint(
                            0, n_samples, init_size)
                        X_valid = X[validation_indices]
                        x_squared_norms = row_norms(X, squared=True)
                        x_squared_norms_valid = x_squared_norms[
                            validation_indices]
                        counts = np.zeros(n_clusters, dtype=np.int32)
                        best_inertia = None
                        cluster_centers = None

                        #Random init with minimum inertia
                        for init_idx in range(n_init):
                            cluster_centers = cluster._init_centroids(
                                X,
                                n_clusters,
                                self.init,
                                random_state=random_state,
                                x_squared_norms=x_squared_norms,
                                init_size=init_size)
                            batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                                X_valid,
                                x_squared_norms[validation_indices],
                                cluster_centers,
                                counts,
                                old_center_buffer,
                                False,
                                distances=None,
                                verbose=False)
                            _, inertia = cluster._labels_inertia(
                                X_valid, x_squared_norms_valid,
                                cluster_centers)
                            if best_inertia is None or inertia < best_inertia:
                                mbk.cluster_centers_ = cluster_centers
                                mbk.counts_ = counts
                                best_inertia = inertia
                                print('best inertia %d' % best_inertia)

                        convergence_context = {}
                        mbk.batch_inertia = batch_inertia
                        mbk.centers_squared_diff = centers_squared_diff
                        t0 = time.time()
                        for iteration_idx in range(n_iter):
                            minibatch_indices = random_state.randint(
                                0, n_samples, batch_size)
                            mbk = mbk.partial_fit(X[minibatch_indices])
                            tol = self._tolerance(X, tol)
                            thread_1.update(mbk)

                            # Monitor convergence and do early stopping if necessary
                            if cluster._mini_batch_convergence(
                                    mbk,
                                    iteration_idx,
                                    n_iter,
                                    tol,
                                    n_samples,
                                    mbk.centers_squared_diff,
                                    mbk.batch_inertia,
                                    convergence_context,
                                    verbose=mbk.verbose):
                                t_mini_batch = time.time() - t0
                                thread_1.stop()
                                thread_1.join()
                                return mbk, t_mini_batch

                    elif options[3] == '-p':
                        random_state = check_random_state(None)
                        convergence_context = {}
                        t0 = time.time()
                        for iteration_idx in range(n_iter):
                            minibatch_indices = random_state.randint(
                                0, n_samples, batch_size)
                            mbk = mbk.partial_fit(X[minibatch_indices])
                            tol = self._tolerance(X, tol)

                            # Monitor convergence and do early stopping if necessary
                            if cluster._mini_batch_convergence(
                                    mbk,
                                    iteration_idx,
                                    n_iter,
                                    tol,
                                    n_samples,
                                    mbk.centers_squared_diff,
                                    mbk.batch_inertia,
                                    convergence_context,
                                    verbose=False):
                                t_mini_batch = time.time() - t0
                                return mbk, t_mini_batch
                except IndexError:
                    pass
        except IndexError:
            pass
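
Stripped of the threading and option parsing, the '-p' branch above reduces to a manual partial_fit loop over random minibatches; a compact, self-contained sketch of that pattern might look like this (sizes are illustrative).

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.utils import check_random_state

X = np.random.RandomState(0).randn(1000, 8)
batch_size, n_iter = 100, 30
rng = check_random_state(0)

mbk = MiniBatchKMeans(n_clusters=5, batch_size=batch_size, n_init=10)
for _ in range(n_iter):
    idx = rng.randint(0, X.shape[0], batch_size)   # sample a minibatch from the full dataset
    mbk.partial_fit(X[idx])
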
 def __init__(self,K,X,weights):
     self.K = K
     self.x_squared_norms = row_norms(X, squared=True)
     self.X = X
     self.weights = weights
def k_means_constrained(X,
                        n_clusters,
                        size_min=None,
                        size_max=None,
                        init='k-means++',
                        n_init=10,
                        max_iter=300,
                        verbose=False,
                        tol=1e-4,
                        random_state=None,
                        copy_x=True,
                        n_jobs=1,
                        return_n_iter=False):
    """K-Means clustering with minimum and maximum cluster size constraints.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    verbose : boolean, optional
        Verbosity mode.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.

    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # Validate init array
    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d' %
                n_init,
                RuntimeWarning,
                stacklevel=2)
            n_init = 1

    # subtract of mean of x for more accurate distance computations
    if not sp.issparse(X):
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

        if hasattr(init, '__array__'):
            init -= X_mean

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_constrained_single(
                X,
                n_clusters,
                size_min=size_min,
                size_max=size_max,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(kmeans_constrained_single)(
                X,
                n_clusters,
                size_min=size_min,
                size_max=size_max,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed) for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
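
A hedged usage sketch for k_means_constrained as defined above (it assumes the surrounding module also provides kmeans_constrained_single and the other helpers it calls); the returned labels can then be checked against the requested size bounds.

import numpy as np

X = np.random.RandomState(0).randn(90, 4)
centers, labels, inertia = k_means_constrained(
    X, n_clusters=3, size_min=20, size_max=40, random_state=0)

sizes = np.bincount(labels, minlength=3)
assert sizes.min() >= 20 and sizes.max() <= 40   # every cluster respects the constraints
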
Exemple #36
0
def k_means(X,
            n_clusters,
            init='k-means++',
            precompute_distances='auto',
            n_init=10,
            max_iter=300,
            verbose=False,
            tol=1e-4,
            random_state=None,
            copy_x=True,
            n_jobs=1,
            return_n_iter=False,
            sample_weight=None):
    """K-means clustering algorithm.
    Read more in the :ref:`User Guide <k_means>`.
    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    return_n_iter : bool, optional
        Whether or not to return the number of iterations.
    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    best_n_iter: int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # subtract of mean of x for more accurate distance computations
    if not sp.issparse(X) or hasattr(init, '__array__'):
        X_mean = X.mean(axis=0)
    if not sp.issparse(X):
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d' %
                n_init,
                RuntimeWarning,
                stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                precompute_distances=precompute_distances,
                tol=tol,
                x_squared_norms=x_squared_norms,
                random_state=random_state,
                sample_weight=sample_weight)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                precompute_distances=precompute_distances,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed) for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
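
A quick arithmetic check of the precompute_distances='auto' heuristic used above: 12 million double-precision entries occupy a little under 100 MB, which is where the threshold in the docstring comes from.

n_entries = 12e6
bytes_per_double = 8
approx_mb = n_entries * bytes_per_double / 1e6   # 96.0 MB, "a little under 100MB"
assert approx_mb == 96.0
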
 def setup(self):
     self.X = _china_dataset()
     self.n_clusters = 64
     self.x_squared_norms = row_norms(self.X, squared=True)
Exemple #38
0
def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    means : array-like, shape (n_components, n_features)

    precisions_chol : array-like
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)

    covariance_type : {'full', 'tied', 'diag', 'spherical'}

    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape
    # det(precision_chol) is half of det(precision)
    log_det = _compute_log_det_cholesky(
        precisions_chol, covariance_type, n_features)

    # print(log_det)
    if covariance_type == 'full':
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            # import time
            # t0 = time.time()
            y = torch.mm(torch.Tensor(X).cuda(), torch.Tensor(prec_chol).cuda()) - \
                torch.mv(torch.Tensor(prec_chol).t().cuda(), torch.Tensor(mu).cuda())
            # t1 = time.time()
            log_prob[:, k] = torch.sum(y**2, dim=1).cpu().numpy()

            # t00 = time.time()
            # y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
            # t11 = time.time()
            # log_prob[:, k] = np.sum(np.square(y), axis=1)
            # print(time.time() - t11, t11-t00, 'cpus', t00 - t1, t1 - t0, 'gpus')

    elif covariance_type == 'tied':
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == 'diag':
        precisions = precisions_chol ** 2
        log_prob = (np.sum((means ** 2 * precisions), 1) -
                    2. * np.dot(X, (means * precisions).T) +
                    np.dot(X ** 2, precisions.T))

    elif covariance_type == 'spherical':
        precisions = precisions_chol ** 2
        log_prob = (np.sum(means ** 2, 1) * precisions -
                    2 * np.dot(X, means.T * precisions) +
                    np.outer(row_norms(X, squared=True), precisions))
    return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
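
A small numpy check, separate from the function above, that the vectorized 'spherical' expression equals precision * ||x - mu||^2 computed directly for each sample/component pair.

import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.randn(6, 4)
means = rng.randn(3, 4)
precisions = rng.rand(3) + 0.5            # one scalar precision per component

vectorized = (np.sum(means ** 2, 1) * precisions
              - 2 * np.dot(X, means.T * precisions)
              + np.outer(row_norms(X, squared=True), precisions))
direct = np.array([[p * np.sum((x - mu) ** 2) for mu, p in zip(means, precisions)]
                   for x in X])
assert np.allclose(vectorized, direct)
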
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)

    centers_old = centers + rng.normal(size=centers.shape)
    centers_old_csr = centers_old.copy()

    centers_new = np.zeros_like(centers_old)
    centers_new_csr = np.zeros_like(centers_old_csr)

    weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype)
    weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]
    sample_weight_mb = sample_weight[:10]

    # step 1: compute the dense minibatch update
    old_inertia = _mini_batch_step(
        X_mb,
        x_mb_squared_norms,
        sample_weight_mb,
        centers_old,
        centers_new,
        weight_sums,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, centers_new
    )
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # step 2: compute the sparse minibatch update
    old_inertia_csr = _mini_batch_step(
        X_mb_csr,
        x_mb_squared_norms_csr,
        sample_weight_mb,
        centers_old_csr,
        centers_new_csr,
        weight_sums_csr,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr
    )
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_allclose(centers_new, centers_new_csr)
    assert_allclose(old_inertia, old_inertia_csr)
    assert_allclose(new_inertia, new_inertia_csr)
def _daal4py_check(self, X, y_, check_input):
    #convert to 2d format
    X = make2d(X)
    y = make2d(y_)

    #convert from list type
    if isinstance(X, list):
        X = np.asarray(X, np.float64)
    if isinstance(y, list):
        y = np.asarray(y, np.float64)

    _fptype = getFPType(X)

    #check alpha
    if self.alpha == 0:
        warnings.warn(
            "With alpha=0, this algorithm does not converge "
            "well. You are advised to use the LinearRegression "
            "estimator",
            stacklevel=2)

    #check precompute
    if isinstance(self.precompute, np.ndarray):
        if check_input:
            check_array(self.precompute, dtype=_fptype)
        self.precompute = make2d(self.precompute)
        #only for compliance with Sklearn
        if self.fit_intercept:
            X_offset = np.average(X, axis=0, weights=None)
            if self.normalize:
                X_scale = row_norms(X)
                if np.isscalar(X_scale):
                    if X_scale == .0:
                        X_scale = 1.
                elif isinstance(X_scale, np.ndarray):
                    X_scale[X_scale == 0.0] = 1.0
            else:
                X_scale = np.ones(X.shape[1], dtype=_fptype)
        else:
            X_offset = np.zeros(X.shape[1], dtype=_fptype)
            X_scale = np.ones(X.shape[1], dtype=_fptype)
        if (self.fit_intercept
                and not np.allclose(X_offset, np.zeros(X.shape[1]))
                or self.normalize
                and not np.allclose(X_scale, np.ones(X.shape[1]))):
            warnings.warn(
                "Gram matrix was provided but X was centered"
                " to fit intercept, "
                "or X was normalized : recomputing Gram matrix.", UserWarning)
    else:
        if self.precompute not in [False, True, 'auto']:
            raise ValueError("precompute should be one of True, False, "
                             "'auto' or array-like. Got %r" % self.precompute)
    #check X and y
    if check_input:
        X, y = check_X_y(X,
                         y,
                         dtype=[np.float64, np.float32],
                         multi_output=True,
                         y_numeric=True)
    else:
        #only for compliance with Sklearn, this assert is not required for DAAL
        if not X.flags['F_CONTIGUOUS']:
            raise ValueError("ndarray is not Fortran contiguous")

    #check selection
    if self.selection not in ['random', 'cyclic']:
        raise ValueError("selection should be either random or cyclic.")

    return X, y
Exemple #41
0
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(X_mb,
                                                     x_mb_squared_norms,
                                                     new_centers,
                                                     counts,
                                                     buffer,
                                                     1,
                                                     None,
                                                     random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms,
                                          new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers)**2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr,
        x_mb_squared_norms_csr,
        new_centers_csr,
        counts_csr,
        buffer_csr,
        1,
        None,
        random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr,
                                                  x_mb_squared_norms_csr,
                                                  new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers)**2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
Exemple #42
0
def get_data(dataname, verbose=True):

    assert dataname in data_bank, 'Dataset name not recognized!'

    # meta information about the data
    meta_dict = {
        'isSparse': False,
        'n_true_classes': None,
        'xsn_train': None,
        'xsn_test': None
    }

    train_lb = None
    if dataname == 'rcv1':
        print('Fetching RCV1 data from sklearn')
        train = fetch_rcv1(subset='test')
        train = train.data
        test = fetch_rcv1(subset='train')
        test = test.data
        meta_dict['n_true_classes'] = 103

    elif dataname == 'mnist':
        print('Fetching MNIST data from sklearn')
        mnist = fetch_mldata('MNIST original')
        data_ind = list(range(mnist.data.shape[0]))
        random.shuffle(data_ind)
        train_ind = data_ind[:60000]
        test_ind = data_ind[-10000:]

        train = mnist.data[train_ind, :]
        test = mnist.data[test_ind, :]
        meta_dict['n_true_classes'] = 10

    elif dataname == 'gauss':
        """ Synthetic Gaussian """
        print('Generating Gaussian blobs dataset')
        #centers = [[2, 2], [-2, -2], [2, -2]]
        #centers = np.asarray(centers)
        n_s = 7000
        gauss, _ = make_blobs(n_samples=n_s,
                              n_features=10,
                              centers=50,
                              center_box=(-30, 30),
                              cluster_std=10.0)

        data_ind = list(range(gauss.shape[0]))
        random.shuffle(data_ind)
        train_ind = data_ind[:6 * n_s // 7]
        test_ind = data_ind[-n_s // 7:]

        train = gauss[train_ind, :]
        test = gauss[test_ind, :]
        meta_dict['n_true_classes'] = 50

    elif dataname == 'covtype':
        """ Forest covertype """
        print('Fetching forest covertype datasets')
        cov = fetch_covtype()
        train = cov.data[:500000]
        train_lb = cov.target[:500000]
        test = cov.data[500000:]
        test_lb = cov.target[500000:]
        meta_dict['n_true_classes'] = 7

    elif dataname == 'cifar10_raw':
        """
		This will make over 4 million data points for training, each with
		dim 8*8*3
		"""
        # take user input to know which batch to preprocess
        usr_input = input('Which batch to preprocess: ')
        # get data home
        data_home = '/Users/tangch/scikit_learn_data/cifar10'  # make this portable in the future

        os.chdir(data_home)

        prefix = 'data_batch_'
        # combine training batches
        if usr_input != 'test':
            curr_batch = prefix + str(usr_input)
            fname = curr_batch
            dataname = dataname + 'batch_' + str(usr_input)
        else:
            fname = data_home + '/test_batch'
            dataname = dataname + 'test_batch'
            #print 'Opening '+fname + 'in directory' + os.getcwd()
        with open(fname, 'rb') as f:
            dict = pickle.load(f)

        train = dict['data']
        train_lb = dict['labels']

        print('processing batch %s' % fname)

        print('%d by %d training data with %d labels' % (train.shape[0],
              train.shape[1], len(train_lb)))
        os.chdir(
            '/Users/tangch/Documents/Python_projects/myprojects/mbkm_2016/mbkm'
        )

        # Reduce the dimension of data by random sampling
        train = train.reshape(train.shape[0], 32, 32, 3)
        width = 8
        height = width

        # subsampling patches u.a.r.
        train, train_lb = convsubsample(train,
                                        1,
                                        width,
                                        height,
                                        labels=train_lb)
        # randomly shuffle training data and labels with a single permutation
        print('shuffling dataset randomly')
        ind = np.random.permutation(train.shape[0])
        train = np.array(train)[ind]
        train_lb = np.array(train_lb)[ind]
        test = None

        # get test data/labels
        #fname = data_home + 'test_batch'
        #with open(fname,'rb') as f:
        #	dict = pickle.load(f)
        #test = dict['data']
        #test_lb = dict['labels']

        # subsampling patches u.a.r.
        #test, test_lb = convsubsample(test, 1, width,height, labels = test_lb)

        meta_dict['n_true_classes'] = 10
        print('finished preprocessing current batch')

    elif dataname == 'cifar10_norm':
        """ 
		  Only works if we have cifar10_raw
		"""
        usr_input = input('Which batch to preprocess: ')
        ### load cifar10_raw
        fname = 'cifar10_raw'
        if usr_input != 'test':
            fname = fname + 'batch_' + str(usr_input)
            dataname = dataname + 'batch_' + str(usr_input)
        else:
            fname = fname + 'test_batch'
            dataname = dataname + 'test_batch'

        try:
            with open(fname, 'rb') as f:
                train, test, meta = pickle.load(f)
        except Exception as e:
            print('Cannot open file ' + fname)

        ### Normalization
        train = normalize(train, norm='l2')
        test = normalize(test, norm='l2')

        ### meta info extraction
        meta_dict['n_true_classes'] = meta['n_true_classes']
        train_lb = meta['train_lb']
        test_lb = meta['test_lb']

    elif dataname == 'cifar10_white_norm':
        """ 
		  Only works if we have cifar10_norm
		"""
        ### load cifar10_norm
        fname = 'cifar10_norm'
        try:
            with open(fname, 'rb') as f:
                train_old, test_old, meta = pickle.load(f)
        except Exception as e:
            print('Cannot find file ' + fname)

        train_test = np.vstack((train_old, test_old))
        ### Whitening
        pca = RandomizedPCA(whiten=True)  #use approx PCA to save computation
        train_test = pca.fit_transform(train_test)
        train = train_test[:train_old.shape[0]]
        test = train_test[train_old.shape[0]:]
        ### Extract meta info
        meta_dict['n_true_classes'] = meta['n_true_classes']
        train_lb = meta['train_lb']
        test_lb = meta['test_lb']

    else:
        print('nothing')

    meta_dict['dataname'] = dataname

    # add true labels if exists
    if train_lb is not None:
        meta_dict['train_lb'] = train_lb
        #meta_dict['test_lb'] = test_lb

    # Check if data is sparse
    if sp.issparse(train):
        meta_dict['isSparse'] = True
        print('The %s data is sparse' % dataname)

    print('%d training data' % train.shape[0])
    print('data dimension is %d' % train.shape[1])
    if len(train.shape) == 3:
        print('data has %d channels' % train.shape[2])

    if test is None:
        print('No test data')
    else:
        print('%d test data' % test.shape[0])
    print('The number of true classes is %d' % meta_dict['n_true_classes'])

    # Convert to float arrays, copying so the original data is not modified
    train = as_float_array(train, copy=True)
    if test is not None:
        test = as_float_array(test, copy=True)

    # precompute squared norms for faster computation
    x_squared_norms_tr = row_norms(train, squared=True)
    meta_dict['xsn_train'] = x_squared_norms_tr
    if test is not None:
        x_squared_norms_tt = row_norms(test, squared=True)
        meta_dict['xsn_test'] = x_squared_norms_tt

    return train, test, meta_dict
def constraint_kmeans(
        X,
        labels,
        sample_weight,
        centers,
        inertia,
        iter,
        max_iter,  # pylint: disable=W0622
        strategy='gain',
        verbose=0,
        state=None,
        learning_rate=1.,
        history=False,
        fLOG=None):
    """
    Completes the constraint :epkg:`k-means`.

    @param      X               features
    @param      labels          initialized labels (unused)
    @param      sample_weight   sample weight
    @param      centers         initialized centers
    @param      inertia         initialized inertia (unused)
    @param      iter            number of iteration already done
    @param      max_iter        maximum of number of iteration
    @param      strategy        strategy used to sort observations before
                                mapping them to clusters
    @param      verbose         verbose
    @param      state           random state
    @param      learning_rate   used by strategy `'weights'`
    @param      history         return list of centers across iterations
    @param      fLOG            logging function (needs to be specified otherwise
                                verbose has no effects)
    @return                     tuple (best_labels, best_centers, best_inertia,
                                None, iter, all_centers)
    """
    if labels.dtype != numpy.int32:
        raise TypeError("Labels must be an array of int not '{0}'".format(
            labels.dtype))

    if strategy == 'weights':
        return _constraint_kmeans_weights(X,
                                          labels,
                                          sample_weight,
                                          centers,
                                          inertia,
                                          iter,
                                          max_iter,
                                          verbose=verbose,
                                          state=state,
                                          learning_rate=learning_rate,
                                          history=history,
                                          fLOG=fLOG)
    else:
        if isinstance(X, DataFrame):
            X = X.values
        x_squared_norms = row_norms(X, squared=True)
        counters = numpy.empty((centers.shape[0], ), dtype=numpy.int32)
        limit = X.shape[0] // centers.shape[0]
        leftover = X.shape[0] - limit * centers.shape[0]
        leftclose = numpy.empty((centers.shape[0], ), dtype=numpy.int32)
        n_clusters = centers.shape[0]
        distances_close = numpy.empty((X.shape[0], ), dtype=X.dtype)
        best_inertia = None
        best_iter = None
        all_centers = []

        # association
        _constraint_association(leftover,
                                counters,
                                labels,
                                leftclose,
                                distances_close,
                                centers,
                                X,
                                x_squared_norms,
                                limit,
                                strategy,
                                state=state)

        if sample_weight is None:
            sw = numpy.ones((X.shape[0], ))
        else:
            sw = sample_weight

        if scipy.sparse.issparse(X):
            _centers_fct = _centers_sparse
        else:
            _centers_fct = _centers_dense

        while iter < max_iter:
            # compute new clusters
            centers = _centers_fct(X, sw, labels, n_clusters, distances_close)

            if history:
                all_centers.append(centers)

            # association
            _constraint_association(leftover,
                                    counters,
                                    labels,
                                    leftclose,
                                    distances_close,
                                    centers,
                                    X,
                                    x_squared_norms,
                                    limit,
                                    strategy,
                                    state=state)

            # inertia
            _, inertia = _labels_inertia_skl(X=X,
                                             sample_weight=sw,
                                             x_squared_norms=x_squared_norms,
                                             centers=centers,
                                             distances=distances_close)

            iter += 1
            if verbose and fLOG:
                fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

            # best option so far?
            if best_inertia is None or inertia < best_inertia:
                best_inertia = inertia
                best_centers = centers.copy()
                best_labels = labels.copy()
                best_iter = iter

            # early stop
            if (best_inertia is not None and inertia >= best_inertia
                    and iter > best_iter + 5):
                break

        return (best_labels, best_centers, best_inertia, None, iter,
                all_centers)
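# Illustrative sketch: the labels/inertia step used by the loop above boils
# down to the squared-norm expansion ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2.
# The helper below is a minimal restatement of that idea, not the actual
# _labels_inertia_skl implementation.
import numpy
from sklearn.utils.extmath import row_norms


def _labels_inertia_sketch(X, centers):
    x_sq = row_norms(X, squared=True)[:, numpy.newaxis]
    c_sq = row_norms(centers, squared=True)[numpy.newaxis, :]
    d2 = x_sq - 2.0 * X @ centers.T + c_sq
    numpy.maximum(d2, 0, out=d2)  # clip tiny negatives caused by rounding
    labels = d2.argmin(axis=1)
    inertia = d2[numpy.arange(X.shape[0]), labels].sum()
    return labels, inertia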
def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
                      max_iter=300, verbose=False, tol=1e-4, random_state=None,
                      copy_x=True, n_jobs=1, algorithm="auto",
                      return_n_iter=False):
    """Modified from sklearn.cluster.k_means_.k_means.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.inf
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state)

            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(X, n_clusters,
                                   max_iter=max_iter, init=init,
                                   verbose=verbose, tol=tol,
                                   x_squared_norms=x_squared_norms,
                                   # Change seed to ensure variety
                                   random_state=seed)
            for seed in seeds)

        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
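# Minimal sketch of the idea behind spherical k-means: clustering directions
# rather than positions, which is roughly what L2-normalising the rows and
# running ordinary k-means gives. This does not call spherical_k_means itself,
# since its private single-run helper is not shown here.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 16)
X_unit = normalize(X_demo)  # every row now lies on the unit sphere
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X_unit)
print(km.inertia_, np.bincount(km.labels_))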
Exemple #45
0
 def calc_sampling_distribution(self):
     x_squared_norms = row_norms(self.X, squared=True)
     centers = _init_centroids(self.X, self.n_clusters, self.init, random_state=self.random_state,
                               x_squared_norms=x_squared_norms)
     sens = sensitivity.kmeans_sensitivity(self.X, self.w, centers, max(np.log(self.n_clusters), 1))
     self.p = sens / np.sum(sens)
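# Hedged follow-up sketch: once self.p holds the normalised sensitivities, a
# weighted coreset is typically drawn as below. The names are illustrative and
# not taken from the class this method belongs to.
import numpy as np


def sample_coreset(X, p, m, random_state=0):
    rng = np.random.RandomState(random_state)
    idx = rng.choice(X.shape[0], size=m, replace=True, p=p)
    weights = 1.0 / (m * p[idx])  # inverse-probability weights
    return X[idx], weights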
Exemple #46
0
 def func(dat_matrix):
     x_squared_norms = row_norms(dat_matrix, squared=True)
     inertias = _labels_inertia(dat_matrix, x_squared_norms,
                                km.cluster_centers_)[1]
     return inertias
Exemple #47
0
def _k_init(X, n_clusters, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++
    Parameters
    -----------
    X: array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=np.float64).
    n_clusters: integer
        The number of seeds to choose
    x_squared_norms: array, shape (n_samples,)
        Squared Euclidean norm of each data point.
    random_state: numpy.RandomState
        The generator used to initialize the centers.
    n_local_trials: integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.
    Notes
    -----
    Selects initial cluster centers for k-mean clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007
    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape

    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    # Modified from the original helper: x_squared_norms is computed here
    # instead of being passed in (the upstream assert is therefore dropped).
    x_squared_norms = row_norms(X, squared=True)
    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    if sp.issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(centers[0, np.newaxis],
                                          X,
                                          Y_norm_squared=x_squared_norms,
                                          squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        if sp.issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
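# Minimal usage sketch for the modified _k_init above (it still needs the
# module-level imports it references: sp, euclidean_distances, row_norms).
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.randn(500, 8)
seeds = _k_init(X_demo, n_clusters=5, random_state=rng)
print(seeds.shape)  # (5, 8): one seed per requested cluster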
Exemple #48
0
def convert_sklearn_kmeans(scope, operator, container):
    """
    Computation graph of distances to all centroids for a batch of examples.
    Note that a centroid is just the center of a cluster. We use ``[]`` to
    denote the dimension of a variable; for example, ``X[3, 2]`` means that
    *X* is a *3-by-2* tensor. In addition, for a matrix *X*, $X'$ denotes its
    transpose.

    Symbols:

    * *l*: # of examples.
    * *n*: # of features per input example.
    * *X*: input examples, l-by-n tensor.
    * *C*: centroids, k-by-n tensor.
    * :math:`C^2`: squared 2-norm of every centroid vector, its shape is ``[k]``.
    * *Y*: 2-norm of the difference between examples and centroids,
      an *l-by-k* tensor. The value at the i-th row and k-th column,
      ``Y[i, k]``, is the distance from example *i* to centroid *k*.
    * *L*: the id of the nearest centroid for each input example,
      its shape is ``[l]``.

    ::

         .------------------------------------------------------.
         |                                                      |
         |                                                      v
        X [l, n] --> ReduceSumSquare -> X^2 [l]   Gemm (alpha=-2, transB=1)
                                         |                  |  |- C [k, n]
                                         |                  |
                                         |                  v
                                         `------> Add <-- -2XC' [l, k]
                                                   |
                                                   v
                     C^2 [k] --------> Add <----- Z [l, k]
                                        |
                                        v
                 L [l] <-- ArgMin <--  Y2 [l, k] --> Sqrt --> Y2 [l, k]

    *scikit-learn* code:

    ::

        X = data
        Y = model.cluster_centers_
        XX = row_norms(X, squared=True)
        YY = row_norms(Y, squared=True)
        distances = safe_sparse_dot(X, Y.T, dense_output=True)
        distances *= -2
        distances += XX[:, numpy.newaxis]
        distances += YY[numpy.newaxis, :]
        numpy.sqrt(distances, out=distances)
    """
    X = operator.inputs[0]
    out = operator.outputs
    op = operator.raw_operator
    opv = container.target_opset
    C = op.cluster_centers_
    input_name = X
    dtype = guess_numpy_type(X.type)
    if dtype != np.float64:
        dtype = np.float32

    if type(X.type) == Int64TensorType:
        x_cast = OnnxCast(X, to=onnx_proto.TensorProto.FLOAT, op_version=opv)
        input_name = x_cast

    C2 = row_norms(C, squared=True).astype(dtype)
    C = C.astype(dtype)
    rs = OnnxReduceSumSquare(input_name, axes=[1], keepdims=1, op_version=opv)

    N = X.type.shape[0]
    if isinstance(N, int):
        zeros = np.zeros((N, ), dtype=dtype)
    else:
        zeros = OnnxMul(rs, np.array([0], dtype=dtype),
                        op_version=opv)

    z = OnnxAdd(rs, OnnxGemm(input_name, C, zeros, alpha=-2.,
                             transB=1, op_version=opv),
                op_version=opv)
    y2 = OnnxAdd(C2, z, op_version=opv)
    ll = OnnxArgMin(y2, axis=1, keepdims=0, output_names=out[:1],
                    op_version=opv)
    y2s = OnnxSqrt(y2, output_names=out[1:], op_version=opv)
    ll.add_to(scope, container)
    y2s.add_to(scope, container)
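# Hedged check of the distance decomposition quoted in the docstring above:
# ||X - C||^2 = ||X||^2 - 2 X C' + ||C||^2, compared against a reference
# pairwise-distance routine. Purely illustrative and independent of ONNX.
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X_demo = rng.randn(20, 3)
C_demo = rng.randn(4, 3)

d2 = (row_norms(X_demo, squared=True)[:, None]
      - 2.0 * X_demo @ C_demo.T
      + row_norms(C_demo, squared=True)[None, :])
np.testing.assert_allclose(np.sqrt(np.maximum(d2, 0)), cdist(X_demo, C_demo),
                           rtol=1e-7)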
Exemple #49
0
 def normalize(self, X, norm='l2', axis=1, copy=True):
     """Normalize a dataset along any axis
 
     Parameters
     ----------
     X : array or scipy.sparse matrix with shape [n_samples, n_features]
         The data to normalize, element by element.
         scipy.sparse matrices should be in CSR format to avoid an
         unnecessary copy.
 
     norm : 'l1' or 'l2', optional ('l2' by default)
         The norm to use to normalize each non-zero sample (or each non-zero
         feature if axis is 0).
 
     axis : 0 or 1, optional (1 by default)
         axis used to normalize the data along. If 1, independently normalize
         each sample, otherwise (if 0) normalize each feature.
 
     copy : boolean, optional, default is True
         set to False to perform inplace row normalization and avoid a
         copy (if the input is already a numpy array or a scipy.sparse
         CSR matrix and if axis is 1).
 
     See also
     --------
     :class:`sklearn.preprocessing.Normalizer` to perform normalization
     using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
     if norm not in ('l1', 'l2'):
         raise ValueError("'%s' is not a supported norm" % norm)
 
     if axis == 0:
         sparse_format = 'csc'
     elif axis == 1:
         sparse_format = 'csr'
     else:
         raise ValueError("'%d' is not a supported axis" % axis)
 
     X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0]
     warn_if_not_float(X, 'The normalize function')
     if axis == 0:
         X = X.T
 
     if sparse.issparse(X):
         if norm == 'l1':
             inplace_csr_row_normalize_l1(X)
         elif norm == 'l2':
             inplace_csr_row_normalize_l2(X)
     else:
         if norm == 'l1':
             norms = np.abs(X).sum(axis=1)
             norms[norms == 0.0] = 1.0
         elif norm == 'l2':
             norms = row_norms(X)
             norms[norms == 0.0] = 1.0
         X /= norms[:, np.newaxis]
 
     if axis == 0:
         X = X.T
 
     return X
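# Tiny worked example of the dense l1/l2 branches above, using NumPy only
# (the zero-norm guard keeps all-zero rows unchanged, as in the function).
import numpy as np

X_demo = np.array([[3.0, 4.0], [0.0, 0.0], [1.0, 1.0]])

l2 = np.sqrt((X_demo ** 2).sum(axis=1))
l2[l2 == 0.0] = 1.0
print(X_demo / l2[:, np.newaxis])  # [[0.6, 0.8], [0, 0], [0.707..., 0.707...]]

l1 = np.abs(X_demo).sum(axis=1)
l1[l1 == 0.0] = 1.0
print(X_demo / l1[:, np.newaxis])  # [[0.428..., 0.571...], [0, 0], [0.5, 0.5]]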
Exemple #50
0
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        X = check_array(X, accept_sparse=sparse_format, dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
            norms[norms == 0.0] = 1.0
        elif norm == 'l2':
            norms = row_norms(X)
            norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X
    def fit(self, X, y, sample_weight=None):

        if not isinstance(self.C, numbers.Number) or self.C < 0:
            raise ValueError("Penalty term must be positive; got (C=%r)"
                             % self.C)
        if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
            raise ValueError("Maximum number of iteration must be positive;"
                             " got (max_iter=%r)" % self.max_iter)
        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
            raise ValueError("Tolerance for stopping criteria must be "
                             "positive; got (tol=%r)" % self.tol)

        solver = _check_solver(self.solver, self.penalty, self.dual)

        if solver in ['newton-cg']:
            _dtype = [np.float64, np.float32]
        else:
            _dtype = np.float64

        X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
                         accept_large_sparse=solver != 'liblinear')

        self.X_fit_ = X
        X_k = self._get_kernel(X)

        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_samples, n_features = X_k.shape

        multi_class = _check_multi_class(self.multi_class, solver,
                                         len(self.classes_))

        if solver == 'liblinear':
            if effective_n_jobs(self.n_jobs) != 1:
                warnings.warn("'n_jobs' > 1 does not have any effect when"
                              " 'solver' is set to 'liblinear'. Got 'n_jobs'"
                              " = {}.".format(effective_n_jobs(self.n_jobs)))
            self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                X_k, y, self.C, self.fit_intercept, self.intercept_scaling,
                self.class_weight, self.penalty, self.dual, self.verbose,
                self.max_iter, self.tol, self.random_state,
                sample_weight=sample_weight)
            self.n_iter_ = np.array([n_iter_])
            return self

        if solver in ['sag', 'saga']:
            max_squared_sum = row_norms(X_k, squared=True).max()
        else:
            max_squared_sum = None

        n_classes = len(self.classes_)
        classes_ = self.classes_
        if n_classes < 2:
            raise ValueError("This solver needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % classes_[0])

        if len(self.classes_) == 2:
            n_classes = 1
            classes_ = classes_[1:]

        if self.warm_start:
            warm_start_coef = getattr(self, 'coef_', None)
        else:
            warm_start_coef = None
        if warm_start_coef is not None and self.fit_intercept:
            warm_start_coef = np.append(warm_start_coef,
                                        self.intercept_[:, np.newaxis],
                                        axis=1)

        self.coef_ = list()
        self.intercept_ = np.zeros(n_classes)

        # Hack so that we iterate only once for the multinomial case.
        if multi_class == 'multinomial':
            classes_ = [None]
            warm_start_coef = [warm_start_coef]
        if warm_start_coef is None:
            warm_start_coef = [None] * n_classes

        path_func = delayed(logistic_regression_path)

        # The SAG solver releases the GIL so it's more efficient to use
        # threads for this solver.
        if solver in ['sag', 'saga']:
            prefer = 'threads'
        else:
            prefer = 'processes'
        fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                               **_joblib_parallel_args(prefer=prefer))(
            path_func(X_k, y, pos_class=class_, Cs=[self.C],
                      fit_intercept=self.fit_intercept, tol=self.tol,
                      verbose=self.verbose, solver=solver,
                      multi_class=multi_class, max_iter=self.max_iter,
                      class_weight=self.class_weight, check_input=False,
                      random_state=self.random_state, coef=warm_start_coef_,
                      penalty=self.penalty,
                      max_squared_sum=max_squared_sum,
                      sample_weight=sample_weight)
            for class_, warm_start_coef_ in zip(classes_, warm_start_coef))

        fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
        self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]

        if multi_class == 'multinomial':
            self.coef_ = fold_coefs_[0][0]
        else:
            self.coef_ = np.asarray(fold_coefs_)
            self.coef_ = self.coef_.reshape(n_classes, n_features +
                                            int(self.fit_intercept))

        if self.fit_intercept:
            self.intercept_ = self.coef_[:, -1]
            self.coef_ = self.coef_[:, :-1]

        return self
Exemple #52
0
    def fit(self, X, y=None, sample_weight=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).

        Returns
        -------
        self
            Fitted estimator.
        """
        random_state = check_random_state(self.random_state)

        n_init = self.n_init
        if n_init <= 0:
            raise ValueError("Invalid number of initializations."
                             " n_init=%d must be bigger than zero." % n_init)

        if self.max_iter <= 0:
            raise ValueError(
                'Number of iterations should be a positive number,'
                ' got %d instead' % self.max_iter)

        # avoid forcing order when copy_x=False
        order = "C" if self.copy_x else None
        X = check_array(X,
                        accept_sparse='csr',
                        dtype=[np.float64, np.float32],
                        order=order,
                        copy=self.copy_x)
        # verify that the number of samples given is at least n_clusters
        if _num_samples(X) < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" %
                             (_num_samples(X), self.n_clusters))

        tol = _tolerance(X, self.tol)

        # If the distances are precomputed every job will create a matrix of
        # shape (n_clusters, n_samples). To stop KMeans from eating up memory
        # we only activate this if the created matrix is guaranteed to be
        # under 100MB. 12 million entries consume a little under 100MB if they
        # are of type double.
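        # (12e6 float64 entries * 8 bytes is roughly 96 MB, hence the 12e6 cap.)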
        precompute_distances = self.precompute_distances
        if precompute_distances == 'auto':
            n_samples = X.shape[0]
            precompute_distances = (self.n_clusters * n_samples) < 12e6
        elif isinstance(precompute_distances, bool):
            pass
        else:
            raise ValueError(
                "precompute_distances should be 'auto' or True/False"
                ", but a value of %r was passed" % precompute_distances)

        # Validate init array
        init = self.init
        if hasattr(init, '__array__'):
            init = check_array(init, dtype=X.dtype.type, copy=True)
            _validate_center_shape(X, self.n_clusters, init)

            if n_init != 1:
                warnings.warn(
                    'Explicit initial center position passed: '
                    'performing only one init in k-means instead of n_init=%d'
                    % n_init,
                    RuntimeWarning,
                    stacklevel=2)
                n_init = 1

        # subtract the mean of X for more accurate distance computations
        if not sp.issparse(X):
            X_mean = X.mean(axis=0)
            # The copy was already done above
            X -= X_mean

            if hasattr(init, '__array__'):
                init -= X_mean

        # precompute squared norms of data points
        x_squared_norms = row_norms(X, squared=True)

        best_labels, best_inertia, best_centers = None, None, None
        algorithm = self.algorithm
        if self.n_clusters == 1:
            # elkan doesn't make sense for a single cluster, full will produce
            # the right result.
            algorithm = "full"
        if algorithm == "auto":
            algorithm = "full" if sp.issparse(X) else 'elkan'
        if algorithm == "full":
            kmeans_single = _fuzzykmeans_single_lloyd
        elif algorithm == "elkan":
            kmeans_single = _fuzzykmeans_single_elkan
        else:
            raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                             " %s" % str(algorithm))

        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        if effective_n_jobs(self.n_jobs) == 1:
            # For a single thread, less memory is needed if we just store one
            # set of the best results (as opposed to one set per run per
            # thread).
            for seed in seeds:
                # run a k-means once
                fuzzy_labels, labels, inertia, centers, n_iter_ = kmeans_single(
                    X,
                    self.m,
                    sample_weight,
                    self.n_clusters,
                    max_iter=self.max_iter,
                    init=init,
                    verbose=self.verbose,
                    precompute_distances=precompute_distances,
                    tol=tol,
                    x_squared_norms=x_squared_norms,
                    random_state=seed)
                # determine if these results are the best so far
                if best_inertia is None or inertia < best_inertia:
                    best_fuzzy_labels = fuzzy_labels.copy()
                    best_labels = labels.copy()
                    best_centers = centers.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter_
        else:
            # parallelisation of k-means runs
            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
                delayed(kmeans_single)(
                    X,
                    self.m,
                    sample_weight,
                    self.n_clusters,
                    max_iter=self.max_iter,
                    init=init,
                    verbose=self.verbose,
                    tol=tol,
                    precompute_distances=precompute_distances,
                    x_squared_norms=x_squared_norms,
                    # Change seed to ensure variety
                    random_state=seed) for seed in seeds)
            # Get results with the lowest inertia
            fuzzy_labels, labels, inertia, centers, n_iters = zip(*results)
            best = np.argmin(inertia)
            best_fuzzy_labels = fuzzy_labels[best]
            best_labels = labels[best]
            best_inertia = inertia[best]
            best_centers = centers[best]
            best_n_iter = n_iters[best]

        if not sp.issparse(X):
            if not self.copy_x:
                X += X_mean
            best_centers += X_mean

        distinct_clusters = len(set(best_labels))
        if distinct_clusters < self.n_clusters:
            warnings.warn(
                "Number of distinct clusters ({}) found smaller than "
                "n_clusters ({}). Possibly due to duplicate points "
                "in X.".format(distinct_clusters, self.n_clusters),
                ConvergenceWarning,
                stacklevel=2)

        self.cluster_centers_ = best_centers
        self.fuzzy_labels_ = best_fuzzy_labels
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter
        return self
    assert indices.shape[0] == n_clusters
    assert (indices >= 0).all()
    assert (indices <= data.shape[0]).all()

    # Check for the correct number of seeds and that they are bound by the data
    assert centers.shape[0] == n_clusters
    assert (centers.max(axis=0) <= data.max(axis=0)).all()
    assert (centers.min(axis=0) >= data.min(axis=0)).all()

    # Check that indices correspond to reported centers
    # Use X for comparison rather than data, test still works against centers
    # calculated with sparse data.
    assert_allclose(X[indices].astype(dtype), centers)


@pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None])
def test_kmeans_plusplus_norms(x_squared_norms):
    # Check that defining x_squared_norms returns the same as default=None.
    centers, indices = kmeans_plusplus(X,
                                       n_clusters,
                                       x_squared_norms=x_squared_norms)

    assert_allclose(X[indices], centers)


def test_kmeans_plusplus_dataorder():
    # Check that memory layout does not affect the result
    centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=0)

    X_fortran = np.asfortranarray(X)
Exemple #54
0
def _estimate_log_gaussian_prob(x, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.

    Parameters
    ----------
    x : array-like or csr_matrix, shape (n_samples, n_features)

    means : array-like, shape (n_components, n_features)

    precisions_chol : array-like,
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)

    covariance_type : {'full', 'tied', 'diag', 'spherical'}

    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = x.shape
    n_components, _ = means.shape
    # log det(precision_chol) is half of log det(precision)
    log_det = _compute_log_det_cholesky(precisions_chol, covariance_type,
                                        n_features)

    if covariance_type == 'full':
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            if issparse(x):
                y = x.dot(prec_chol) - np.dot(mu, prec_chol)
            else:
                y = np.matmul(x, prec_chol) - np.dot(mu, prec_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == 'tied':
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            if issparse(x):
                y = x.dot(precisions_chol) - np.dot(mu, precisions_chol)
            else:
                y = np.dot(x, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == 'diag':
        precisions = precisions_chol**2
        if issparse(x):
            log_prob = (np.sum((means**2 * precisions), 1) - 2. *
                        (x * (means * precisions).T) +
                        x.multiply(x).dot(precisions.T))
        else:
            log_prob = (np.sum((means**2 * precisions), 1) -
                        2. * np.dot(x, (means * precisions).T) +
                        np.dot(x**2, precisions.T))

    elif covariance_type == 'spherical':
        precisions = precisions_chol**2
        if issparse(x):
            log_prob = (np.sum(means**2, 1) * precisions - 2 *
                        (x * (means.T * precisions)) +
                        np.outer(row_norms(x, squared=True), precisions))
        else:
            log_prob = (np.sum(means**2, 1) * precisions -
                        2 * np.dot(x, means.T * precisions) +
                        np.outer(row_norms(x, squared=True), precisions))
    else:  # pragma: no cover
        raise ValueError("Invalid covariance_type: %r" % covariance_type)
    return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
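# Hedged sanity check for the 'spherical' branch above: the vectorised
# expression should match the naive precision * ||x - mu||^2, component by
# component. Shapes and values here are illustrative only.
import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
x_demo = rng.randn(6, 4)            # (n_samples, n_features)
means_demo = rng.randn(3, 4)        # (n_components, n_features)
prec_demo = rng.rand(3) + 0.5       # one spherical precision per component

vectorised = (np.sum(means_demo ** 2, 1) * prec_demo
              - 2 * np.dot(x_demo, means_demo.T * prec_demo)
              + np.outer(row_norms(x_demo, squared=True), prec_demo))
naive = np.array([[p * np.sum((xi - mu) ** 2)
                   for mu, p in zip(means_demo, prec_demo)]
                  for xi in x_demo])
np.testing.assert_allclose(vectorised, naive)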
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts,
        buffer, 1, None, random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1, None, random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
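# Rough sketch of the per-centre minibatch update that the test above
# exercises: each centre is an incrementally weighted mean driven by a
# per-centre count. This mirrors the idea, not the exact _mini_batch_step API.
import numpy as np


def minibatch_update_sketch(centers, counts, X_mb, labels_mb):
    for c in np.unique(labels_mb):
        pts = X_mb[labels_mb == c]
        new_count = counts[c] + len(pts)
        centers[c] = (centers[c] * counts[c] + pts.sum(axis=0)) / new_count
        counts[c] = new_count
    return centers, counts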
Exemple #56
0
 def _init_w(self, V, X):
     """
     Initialize the topics W.
     If self.init='k-means++', we use the init method of
     sklearn.cluster.KMeans.
     If self.init='random', topics are initialized with a Gamma
     distribution.
     If self.init='k-means', topics are initialized with a KMeans on the
     n-grams counts.
     """
     if self.init == 'k-means++':
         if LooseVersion(sklearn_version) < LooseVersion('0.24'):
             W = _k_init(V,
                         self.n_components,
                         x_squared_norms=row_norms(V, squared=True),
                         random_state=self.random_state,
                         n_local_trials=None) + .1
         else:
             W, _ = kmeans_plusplus(V,
                                    self.n_components,
                                    x_squared_norms=row_norms(V,
                                                              squared=True),
                                    random_state=self.random_state,
                                    n_local_trials=None)
             W = W + .1  # To avoid restricting topics to few n-grams only
     elif self.init == 'random':
         W = self.random_state.gamma(shape=self.gamma_shape_prior,
                                     scale=self.gamma_scale_prior,
                                     size=(self.n_components, self.n_vocab))
     elif self.init == 'k-means':
         prototypes = get_kmeans_prototypes(X,
                                            self.n_components,
                                            analyzer=self.analyzer,
                                            random_state=self.random_state)
         W = self.ngrams_count_.transform(prototypes).A + .1
         if self.add_words:
             W2 = self.word_count_.transform(prototypes).A + .1
             W = np.hstack((W, W2))
         # if k-means doesn't find the exact number of prototypes
         if W.shape[0] < self.n_components:
             if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                 W2 = _k_init(V,
                              self.n_components - W.shape[0],
                              x_squared_norms=row_norms(V, squared=True),
                              random_state=self.random_state,
                              n_local_trials=None) + .1
             else:
                 W2, _ = kmeans_plusplus(V,
                                         self.n_components - W.shape[0],
                                         x_squared_norms=row_norms(
                                             V, squared=True),
                                         random_state=self.random_state,
                                         n_local_trials=None)
                 W2 = W2 + .1
             W = np.concatenate((W, W2), axis=0)
     else:
         raise AttributeError('Initialization method %s does not exist.' %
                              self.init)
     W /= W.sum(axis=1, keepdims=True)
     A = np.ones((self.n_components, self.n_vocab)) * 1e-10
     B = A.copy()
     return W, A, B
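# Hedged usage sketch of the k-means++ seeding call used above for
# scikit-learn >= 0.24, where kmeans_plusplus is public. V_demo is a stand-in
# for the n-gram count matrix; the smoothing and row normalisation mirror the
# steps in _init_w.
import numpy as np
from sklearn.cluster import kmeans_plusplus
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
V_demo = np.abs(rng.randn(100, 30))
W_demo, _ = kmeans_plusplus(V_demo, n_clusters=10,
                            x_squared_norms=row_norms(V_demo, squared=True),
                            random_state=0)
W_demo = W_demo + .1                         # avoid zero entries in topics
W_demo /= W_demo.sum(axis=1, keepdims=True)  # rows sum to one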