def _generate_repeated_sample_indices(random_state, sample_imbalance, y,
                                      verbose):
    """Draw randomly repeated samples, return an array of arrays of indeces to train models on
    along with a last sample array as the last one may not be the same size as the others.
    """
    class_idxs = _generate_class_indices(y)
    class_len = [len(class_idx) for class_idx in class_idxs]
    majority_class_idx = np.argmax(class_len)
    minority_class_idx = int(not majority_class_idx)
    tot_min_samples = class_len[minority_class_idx]
    tot_maj_samples = class_len[majority_class_idx]
    maj_samples_per_sample = int(tot_min_samples / sample_imbalance)
    estimators = math.ceil(tot_maj_samples / maj_samples_per_sample)
    maj_indices = class_idxs[majority_class_idx]
    # maj_samples is a table of (estimators - 1) rows by maj_samples_per_sample columns
    maj_samples = choice(maj_indices,
                         size=(estimators - 1, maj_samples_per_sample),
                         replace=False,
                         random_state=random_state)
    # last_maj_sample has a different length than each row of maj_samples so that every example ends up in a sample
    last_maj_sample = np.setxor1d(maj_samples, maj_indices)
    min_indices = class_idxs[minority_class_idx]
    samples = np.hstack((maj_samples, np.tile(min_indices,
                                              (estimators - 1, 1))))
    last_sample = np.hstack((last_maj_sample, min_indices))
    if verbose > 0:
        print(
            "generating {} samples of indices to use to train multiple estimators, "
            "sized {} elements with last being {} elements".format(
                len(samples) + 1, len(samples[0]), len(last_sample)))
    return samples, last_sample
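A short self-contained sketch of the partitioning idea above, using numpy's own RandomState directly instead of the choice wrapper; the label array and imbalance ratio below are invented purely for illustration.
# Standalone illustration only; the data and ratio are made up.
import math
import numpy as np

rng = np.random.RandomState(0)
y = np.array([0] * 90 + [1] * 10)      # imbalanced binary labels
sample_imbalance = 0.5                 # desired minority/majority ratio per sample

maj_idx = np.flatnonzero(y == 0)
min_idx = np.flatnonzero(y == 1)
maj_per_sample = int(len(min_idx) / sample_imbalance)     # 20 majority rows per sample
n_estimators = math.ceil(len(maj_idx) / maj_per_sample)   # 5 samples here

shuffled = rng.permutation(maj_idx)
cut = (n_estimators - 1) * maj_per_sample
samples = np.hstack((shuffled[:cut].reshape(n_estimators - 1, maj_per_sample),
                     np.tile(min_idx, (n_estimators - 1, 1))))
last_sample = np.hstack((shuffled[cut:], min_idx))
# each row of `samples` (and `last_sample`) indexes one re-balanced training subset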
    def fit(self, X, y=None):
        """Fit the model with X.
        Samples a couple of random vectors to approximate a Gaussian
        random projection matrix to generate n_components features.
        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        Returns
        -------
        self : object
            Returns the transformer.
        """
        X = check_array(X)

        d_orig = X.shape[1]  # Initial number of features

        # n_components (self.n) is the final number of features
        # times_to_stack_v is the integer division of n by d
        # we use times_to_stack_v according to the paper FastFood:
        # "When n > d, we replicate (7) for n/d independent random matrices
        # Vi, and stack them via Vt = [V_1, V_2, ..., V_(n/d)]t until we have
        # enough dimensions."
        self.d, self.n, self.times_to_stack_v = \
            Fastfood.enforce_dimensionality_constraints(d_orig,
                                                        self.n_components)

        self.number_of_features_to_pad_with_zeros = self.d - d_orig

        if self.d != d_orig:
            warn(
                "Dimensionality of the input space as been changed (zero padding) from {} to {}."
                .format(d_orig, self.d))

        self.G = self.rng.normal(size=(self.times_to_stack_v, self.d))
        # G is a random matrix following normal distribution

        self.B = choice([-1, 1],
                        size=(self.times_to_stack_v, self.d),
                        replace=True,
                        random_state=self.random_state)
        # B is a random matrix of -1 and 1

        self.P = np.hstack([(i * self.d) + self.rng.permutation(self.d)
                            for i in range(self.times_to_stack_v)])
        # P is a matrix of size d*n/d = n -> the dimension of the embedding space
        # P is for the permutation and respects the V stacks (see FastFood paper)

        self.S = np.multiply(
            1 / self.l2norm_along_axis1(self.G).reshape((-1, 1)),
            chi.rvs(self.d, size=(self.times_to_stack_v, self.d)))

        self.H = scipy.linalg.hadamard(self.d)

        self.U = self.uniform_vector()

        return self
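The matrices sampled above are the factors of the Fastfood approximation V = S H G Pi H B of a Gaussian random projection matrix (Le, Sarlos and Smola, 2013). As a rough sketch only, with normalisation constants omitted and every name below chosen for illustration (this is not the estimator's own transform), one stacked block could act on a single zero-padded row like this:
import numpy as np
import scipy.linalg
from scipy.stats import chi

rng = np.random.RandomState(0)
d = 8                                   # Hadamard transforms need d to be a power of two
x = rng.normal(size=d)                  # one (already zero-padded) input row

H = scipy.linalg.hadamard(d)
B = rng.choice([-1, 1], size=d)         # diagonal sign flips
G = rng.normal(size=d)                  # diagonal Gaussian scaling
P = rng.permutation(d)                  # coordinate permutation
S = chi.rvs(d, size=d, random_state=rng) / np.linalg.norm(G)

# V x = S H G Pi H B x, applied right to left
v_x = S * (H @ (G * (H @ (B * x))[P]))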
Example #3
def _generate_sample_indices(random_state,
                             y,
                             target_imbalance_ratio,
                             verbose=0):
    """Private function used to _parallel_build_trees function."""
    random_instance = check_random_state(random_state)

    class_idxs = _generate_class_indices(y)
    class_len = [len(class_idx) for class_idx in class_idxs]
    minority_class_idx = np.argmin(class_len)
    majority_class_idx = np.argmax(class_len)
    min_samples = class_len[minority_class_idx]
    maj_samples = int(min_samples / target_imbalance_ratio)
    n_samples = min_samples + maj_samples
    if verbose > 1:
        print(
            "len(y):{} target_imbalance_ratio:{} minorities:{} majorities:{} "
            "n_samples:{}".format(len(y), target_imbalance_ratio, min_samples,
                                  maj_samples, n_samples))

    maj_indices = choice(class_idxs[majority_class_idx],
                         size=maj_samples,
                         replace=False,
                         random_state=random_instance)
    min_indices = class_idxs[minority_class_idx]
    indices_to_choose_from = np.hstack((min_indices, maj_indices))
    if verbose > 99:
        print("possible indicies to choose from: {}".format(
            indices_to_choose_from))

    sample_indices = choice(indices_to_choose_from,
                            size=n_samples,
                            replace=True,
                            random_state=random_instance)
    if verbose > 99:
        print("chosen indicies: {}".format(sample_indices))

    return sample_indices
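For intuition, a standalone sketch of the same balanced bootstrap using numpy's RandomState.choice directly; the labels and ratio below are made up for the example.
import numpy as np

rng = np.random.RandomState(42)
y = np.array([0] * 95 + [1] * 5)
target_imbalance_ratio = 1.0            # keep as many majority rows as minority rows

min_idx = np.flatnonzero(y == 1)
maj_idx = np.flatnonzero(y == 0)
n_maj = int(len(min_idx) / target_imbalance_ratio)
pool = np.hstack((min_idx, rng.choice(maj_idx, size=n_maj, replace=False)))
sample_indices = rng.choice(pool, size=len(pool), replace=True)   # bootstrap from the balanced pool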
Example #4
def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_dict = dict()
        words = choice(vocab_words, size=5, replace=False, random_state=rng)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
Example #5
def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_set = set(choice(vocab_words, size=5, replace=False,
                               random_state=rng))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
Example #8
    def fit(self, X, y=None):
        """Fit the model with X.
        Samples a couple of random vectors to approximate a Gaussian
        random projection matrix to generate n_components features.
        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        Returns
        -------
        self : object
            Returns the transformer.
        """
        X = check_array(X)

        d_orig = X.shape[1]

        self.d, self.n, self.times_to_stack_v = \
            Fastfood.enforce_dimensionality_constraints(d_orig,
                                                        self.n_components)
        self.number_of_features_to_pad_with_zeros = self.d - d_orig

        self.G = self.rng.normal(size=(self.times_to_stack_v, self.d))
        self.B = choice([-1, 1],
                        size=(self.times_to_stack_v, self.d),
                        replace=True,
                        random_state=self.random_state)
        self.P = np.hstack([(i * self.d) + self.rng.permutation(self.d)
                            for i in range(self.times_to_stack_v)])
        self.S = np.multiply(
            1 / self.l2norm_along_axis1(self.G).reshape((-1, 1)),
            chi.rvs(self.d, size=(self.times_to_stack_v, self.d)))

        self.U = self.uniform_vector()

        return self
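The call to enforce_dimensionality_constraints above adjusts d and n so the Hadamard and stacking steps line up: scipy.linalg.hadamard needs d to be a power of two, and n/d stacked blocks only cover the target dimension if n is a multiple of d. The helper below is a hypothetical illustration of that kind of rule, not the estimator's own implementation.
import math


def pad_dimensions(d_orig, n_components):
    # hypothetical sketch: pad d up to the next power of two, then round the
    # target dimension n up to a whole number of d-sized stacks
    d = 2 ** int(math.ceil(math.log2(d_orig)))
    times_to_stack_v = int(math.ceil(n_components / d))
    n = times_to_stack_v * d
    return d, n, times_to_stack_v


print(pad_dimensions(d_orig=10, n_components=100))   # (16, 112, 7)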
def MB_step(X,
            x_squared_norms,
            centers,
            counts,
            curr_iter,
            old_center_buffer,
            compute_squared_diff,
            distances,
            random_reassign=False,
            random_state=None,
            reassignment_ratio=.01,
            verbose=False,
            learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE.
    counts : array, shape (k,)
        The vector in which we keep track of the number of elements in a
        cluster. This array is MODIFIED IN PLACE.
    distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.
    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.
    verbose : bool, optional, default False
        Controls the verbosity.
    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.
    old_center_buffer : array
        Copy of old centers for monitoring convergence.
    learn_rate : float
        Learning rate; if 0.0, a per-center rate of
        (new_count - old_count) / new_count is used.
    Returns
    -------
    centers : array, shape (k, n_features)
        Updated centers.
    inertia : float
        Sum of distances of samples to their closest cluster center.
    squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X,
                                                       x_squared_norms,
                                                       centers,
                                                       distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = choice(X.shape[0],
                                 replace=False,
                                 size=n_reassigns,
                                 random_state=random_state)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers." %
                      n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, astype(new_centers, np.intp),
                                astype(np.where(to_reassign)[0], np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0
    # implementation for the sparse CSR representation, written entirely in
    # Cython
    if sp.issparse(X):
        if compute_squared_diff:
            old_center_buffer = centers
        #rand_vec = make_rand_vector(X.shape[1])
        #learn_rate = 0.0
        centers = _MB_step._mini_batch_update_csr(X, x_squared_norms, centers,
                                                  counts, nearest_center,
                                                  old_center_buffer,
                                                  compute_squared_diff,
                                                  curr_iter, learn_rate)

        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()

        return centers, squared_diff, inertia

    # dense variant, mostly in numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()
        counts[center_idx] += this_count  # update counts

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            #centers[center_idx] *= counts[center_idx]

            # inplace sum with new points members of this cluster
            #centers[center_idx] += np.sum(X[center_mask], axis=0)

            # update the count statistics for this center
            #counts[center_idx] += count

            # inplace rescale to compute mean of all points (old and new)
            #centers[center_idx] /= counts[center_idx]
            new_center = np.sum(X[center_mask], axis=0)
            if learn_rate == 0.0:
                learn_rate = (new_count - old_count) / float(new_count)

            centers[center_idx] = centers[center_idx] + learn_rate * (
                new_center / (new_count - old_count) - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
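A tiny worked illustration of the dense per-center update above: each center moves towards the mean of the mini-batch points assigned to it, c <- c + eta * (batch_mean - c), and when no explicit learn_rate is given, eta = (new_count - old_count) / new_count, which makes the update an exact running mean over all points seen so far. The numbers below are invented.
import numpy as np

center = np.array([0.0, 0.0])
old_count = 8
batch_points = np.array([[1.0, 1.0], [3.0, 1.0]])   # mini-batch points assigned to this center

new_count = old_count + len(batch_points)           # 10
eta = (new_count - old_count) / float(new_count)    # 0.2
batch_mean = batch_points.sum(axis=0) / len(batch_points)   # [2.0, 1.0]
center = center + eta * (batch_mean - center)       # -> [0.4, 0.2]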
Example #10
def test_non_p_array():
    keeper = choice(np.array(1), size=())
    assert_equal(keeper[()], 0)
Example #11
def test_returns_1d_scalar_object_based_off_distribution():
    keeper = choice(3, replace=False, p=np.array([0, 0, 1.0]))
    assert_equal(keeper, 2)
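The last test relies on the fact that a probability vector with all of its mass on index 2 makes the draw deterministic. For comparison, numpy's own RandomState.choice behaves the same way with an explicit p argument:
import numpy as np

rng = np.random.RandomState(0)
print(rng.choice(3, p=[0.0, 0.0, 1.0]))          # always 2
print(rng.choice(3, size=5, p=[0.1, 0.3, 0.6]))  # draws biased towards index 2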