def _generate_repeated_sample_indices(random_state, sample_imbalance, y, verbose):
    """Draw randomly repeated samples; return an array of arrays of indices to
    train models on, along with a last sample array, as the last sample may not
    be the same size as the others.
    """
    class_idxs = _generate_class_indices(y)
    class_len = [len(class_idx) for class_idx in class_idxs]
    majority_class_idx = np.argmax(class_len)
    minority_class_idx = int(not majority_class_idx)
    tot_min_samples = class_len[minority_class_idx]
    tot_maj_samples = class_len[majority_class_idx]
    maj_samples_per_sample = int(tot_min_samples / sample_imbalance)
    estimators = math.ceil(tot_maj_samples / maj_samples_per_sample)
    maj_indices = class_idxs[majority_class_idx]

    # maj_samples is a table of (estimators - 1) rows by maj_samples_per_sample columns
    maj_samples = choice(maj_indices,
                         size=(estimators - 1, maj_samples_per_sample),
                         replace=False,
                         random_state=random_state)
    # last_maj_sample is a different length than each row of maj_samples so that
    # every example ends up in a sample
    last_maj_sample = np.setxor1d(maj_samples, maj_indices)
    min_indices = class_idxs[minority_class_idx]
    samples = np.hstack((maj_samples, np.tile(min_indices, (estimators - 1, 1))))
    last_sample = np.hstack((last_maj_sample, min_indices))
    if verbose > 0:
        print(
            "generating {} samples of indices to use to train multiple estimators, "
            "sized {} elements with last being {} elements".format(
                len(samples) + 1, len(samples[0]), len(last_sample)))
    return samples, last_sample
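# Illustrative sketch (not part of the module): what _generate_repeated_sample_indices
# produces for a toy binary label vector. The label array, the sample_imbalance value
# and the integer seed are assumptions for illustration; the call relies on the
# surrounding module's helpers (_generate_class_indices and the choice wrapper)
# being in scope.
import numpy as np

y_toy = np.array([0] * 90 + [1] * 10)
samples, last_sample = _generate_repeated_sample_indices(
    random_state=0, sample_imbalance=0.5, y=y_toy, verbose=0)
# maj_samples_per_sample = int(10 / 0.5) = 20 and estimators = ceil(90 / 20) = 5,
# so four full samples of 20 majority + 10 minority indices are returned, plus a
# last sample holding the 10 leftover majority indices and the 10 minority ones.
assert samples.shape == (4, 30)
assert last_sample.shape == (20,)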
def fit(self, X, y=None):
    """Fit the model with X.

    Samples a couple of random vectors to approximate a Gaussian random
    projection matrix in order to generate n_components features.

    Parameters
    ----------
    X : {array-like}, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the transformer.
    """
    X = check_array(X)

    d_orig = X.shape[1]  # initial number of features

    # n_components (self.n) is the final number of features and
    # times_to_stack_v is the integer division of n by d.
    # We use times_to_stack_v according to the Fastfood paper:
    # "When n > d, we replicate (7) for n/d independent random matrices
    # V_i, and stack them via V^T = [V_1, V_2, ..., V_(n/d)]^T until we
    # have enough dimensions."
    self.d, self.n, self.times_to_stack_v = \
        Fastfood.enforce_dimensionality_constraints(d_orig, self.n_components)
    self.number_of_features_to_pad_with_zeros = self.d - d_orig
    if self.d != d_orig:
        warn(
            "Dimensionality of the input space has been changed (zero "
            "padding) from {} to {}.".format(d_orig, self.d))

    # G is a random matrix with entries drawn from a normal distribution
    self.G = self.rng.normal(size=(self.times_to_stack_v, self.d))
    # B is a random matrix of -1 and 1
    self.B = choice([-1, 1],
                    size=(self.times_to_stack_v, self.d),
                    replace=True,
                    random_state=self.random_state)
    # P is a permutation of size d * n/d = n, the dimension of the embedding
    # space; it respects the V stacks (see the Fastfood paper)
    self.P = np.hstack([(i * self.d) + self.rng.permutation(self.d)
                        for i in range(self.times_to_stack_v)])
    self.S = np.multiply(
        1 / self.l2norm_along_axis1(self.G).reshape((-1, 1)),
        chi.rvs(self.d, size=(self.times_to_stack_v, self.d)))
    self.H = scipy.linalg.hadamard(self.d)
    self.U = self.uniform_vector()

    return self
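# Illustrative sketch (not part of the class, and not its transform()): how the
# factors built in fit() combine in the Fastfood product V = S H G P H B from
# the paper, for a single stack. The dimension d, the seed, the toy input x and
# the cos/sin feature map at the end are assumptions for illustration; the
# 1/(sigma*sqrt(d)) scaling from the paper is omitted for brevity.
import numpy as np
import scipy.linalg
from scipy.stats import chi

rng = np.random.RandomState(0)
d = 8                                    # must be a power of two for hadamard()
x = rng.normal(size=d)                   # toy, already zero-padded, input row

H = scipy.linalg.hadamard(d)
B = rng.choice([-1, 1], size=d)          # random sign flips
G = rng.normal(size=d)                   # Gaussian scaling factors
P = rng.permutation(d)                   # random permutation of the d coordinates
S = chi.rvs(d, size=d, random_state=rng) / np.linalg.norm(G)  # row-norm correction

# apply V to x factor by factor, right to left
v = B * x                                # B x
v = H @ v                                # H B x
v = v[P]                                 # P H B x
v = G * v                                # G P H B x
v = H @ v                                # H G P H B x
v = S * v                                # S H G P H B x
phi = np.concatenate([np.cos(v), np.sin(v)]) / np.sqrt(d)  # one common RBF feature map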
def _generate_sample_indices(random_state, y, target_imbalance_ratio, verbose=0):
    """Private function used by the _parallel_build_trees function."""
    random_instance = check_random_state(random_state)
    class_idxs = _generate_class_indices(y)
    class_len = [len(class_idx) for class_idx in class_idxs]
    minority_class_idx = np.argmin(class_len)
    majority_class_idx = np.argmax(class_len)
    min_samples = class_len[minority_class_idx]
    maj_samples = int(min_samples / target_imbalance_ratio)
    n_samples = min_samples + maj_samples
    if verbose > 1:
        print(
            "len(y):{} target_imbalance_ratio:{} minorities:{} majorities:{} "
            "n_samples:{}".format(len(y), target_imbalance_ratio, min_samples,
                                  maj_samples, n_samples))

    maj_indices = choice(class_idxs[majority_class_idx],
                         size=maj_samples,
                         replace=False,
                         random_state=random_instance)
    min_indices = class_idxs[minority_class_idx]
    indices_to_choose_from = np.hstack((min_indices, maj_indices))
    if verbose > 99:
        print("possible indices to choose from: {}".format(
            indices_to_choose_from))
    sample_indices = choice(indices_to_choose_from,
                            size=n_samples,
                            replace=True,
                            random_state=random_instance)
    if verbose > 99:
        print("chosen indices: {}".format(sample_indices))

    return sample_indices
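# Illustrative sketch (not part of the module): the size of the bootstrap drawn by
# _generate_sample_indices for a toy binary label vector. The label array, the
# target_imbalance_ratio and the seed are assumptions for illustration; the call
# relies on the module's _generate_class_indices and choice helpers being in scope.
import numpy as np

y_toy = np.array([0] * 90 + [1] * 10)
idx = _generate_sample_indices(random_state=0, y=y_toy,
                               target_imbalance_ratio=0.5, verbose=0)
# min_samples = 10 and maj_samples = int(10 / 0.5) = 20, so 30 indices are drawn
# with replacement from the 10 minority indices plus 20 downsampled majority
# indices, i.e. roughly a 1:2 class balance per bootstrap.
assert idx.shape == (30,)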
def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_dict = dict()
        words = choice(vocab_words, size=5, replace=False, random_state=rng)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(),
                     unpickled_cv.get_feature_names())
def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_set = set(choice(vocab_words, size=5, replace=False,
                               random_state=rng))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(),
                     unpickled_cv.get_feature_names())
def fit(self, X, y=None):
    """Fit the model with X.

    Samples a couple of random vectors to approximate a Gaussian random
    projection matrix in order to generate n_components features.

    Parameters
    ----------
    X : {array-like}, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the transformer.
    """
    X = check_array(X)

    d_orig = X.shape[1]

    self.d, self.n, self.times_to_stack_v = \
        Fastfood.enforce_dimensionality_constraints(d_orig, self.n_components)
    self.number_of_features_to_pad_with_zeros = self.d - d_orig

    self.G = self.rng.normal(size=(self.times_to_stack_v, self.d))
    self.B = choice([-1, 1],
                    size=(self.times_to_stack_v, self.d),
                    replace=True,
                    random_state=self.random_state)
    self.P = np.hstack([(i * self.d) + self.rng.permutation(self.d)
                        for i in range(self.times_to_stack_v)])
    self.S = np.multiply(
        1 / self.l2norm_along_axis1(self.G).reshape((-1, 1)),
        chi.rvs(self.d, size=(self.times_to_stack_v, self.d)))

    self.U = self.uniform_vector()

    return self
def MB_step(X, x_squared_norms, centers, counts, curr_iter, old_center_buffer,
            compute_squared_diff, distances, random_reassign=False,
            random_state=None, reassignment_ratio=.01, verbose=False,
            learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE.

    counts : array, shape (k,)
        The vector in which we keep track of the numbers of elements in a
        cluster. This array is MODIFIED IN PLACE.

    curr_iter : int
        Current iteration number (passed through to the sparse CSR update).

    distances : array, dtype float64, shape (n_samples,), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.

    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge to a
        better clustering.

    verbose : bool, optional, default False
        Controls the verbosity.

    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.

    old_center_buffer : array, shape (n_features,)
        Copy of the old centers, used for monitoring convergence.

    learn_rate : float, optional
        Learning rate for the center updates. If 0.0, the rate is derived
        from the cluster counts.

    Returns
    -------
    centers : array, shape (k, n_features)
        Updated centers.

    squared_diff : float
        Sum of squared distances between the previous and the updated
        cluster centers.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X, x_squared_norms,
                                                       centers,
                                                       distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = choice(X.shape[0], replace=False, size=n_reassigns,
                                 random_state=random_state)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers."
                      % n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, astype(new_centers, np.intp),
                                astype(np.where(to_reassign)[0], np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0
    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        if compute_squared_diff:
            old_center_buffer = centers
        centers = _MB_step._mini_batch_update_csr(
            X, x_squared_norms, centers, counts, nearest_center,
            old_center_buffer, compute_squared_diff, curr_iter, learn_rate)
        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()
        return centers, squared_diff, inertia

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from the minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()
        counts[center_idx] += this_count  # update counts

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # move the center towards the mean of its newly assigned points;
            # with the default learn_rate this is the exact streaming mean of
            # all points ever assigned to the center
            new_center = np.sum(X[center_mask], axis=0)
            if learn_rate == 0.0:
                learn_rate = (new_count - old_count) / float(new_count)
            centers[center_idx] = centers[center_idx] + learn_rate * (
                new_center / (new_count - old_count) - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
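# Small self-contained check (illustration only, with made-up numbers): with the
# default learn_rate = (new_count - old_count) / new_count, the dense update rule
# in MB_step reduces to the exact streaming mean of all points ever assigned to
# the center.
import numpy as np

center = np.array([1.0, 1.0])            # current center, built from old_count points
old_count = 4
batch_points = np.array([[3.0, 5.0], [5.0, 3.0]])
new_count = old_count + len(batch_points)

new_sum = batch_points.sum(axis=0)
learn_rate = (new_count - old_count) / float(new_count)
updated = center + learn_rate * (new_sum / (new_count - old_count) - center)

exact = (center * old_count + new_sum) / new_count  # mean of all six points
assert np.allclose(updated, exact)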
def test_non_p_array():
    keeper = choice(np.array(1), size=())
    assert_equal(keeper[()], 0)
def test_returns_1d_scalar_object_based_off_distribution():
    keeper = choice(3, replace=False, p=np.array([0, 0, 1.0]))
    assert_equal(keeper, 2)
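# Hypothetical sketch (an assumption, not the project's actual helper) of the kind
# of choice wrapper the two tests above exercise: numpy-style choice with an extra
# random_state keyword normalised through check_random_state. The name choice_sketch
# is made up to avoid shadowing the real helper.
import numpy as np
from sklearn.utils import check_random_state


def choice_sketch(a, size=None, replace=True, p=None, random_state=None):
    """Draw samples like numpy.random.choice, seeded via random_state."""
    rng = check_random_state(random_state)
    return rng.choice(a, size=size, replace=replace, p=p)


# e.g. choice_sketch(3, replace=False, p=np.array([0, 0, 1.0])) always returns 2,
# matching test_returns_1d_scalar_object_based_off_distribution above.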