def test_random_choice_csc(n_samples=10000, random_state=24):
    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    got = random_choice_csc(n_samples, classes, class_probabilities,
                            random_state)
    assert_true(sp.issparse(got))
    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1. / 2, 1. / 2])]
    got = random_choice_csc(n_samples=n_samples, classes=classes,
                            random_state=random_state)
    assert_true(sp.issparse(got))
    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]
    got = random_choice_csc(n_samples, classes, class_probabilities,
                            random_state)
    assert_true(sp.issparse(got))
    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel(),
                        minlength=len(class_probabilities[k])) / n_samples
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
    got = random_choice_csc(n_samples=n_samples, classes=classes,
                            random_state=random_state)
    assert_true(sp.issparse(got))
    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / n_samples
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)
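For context, here is a minimal usage sketch of the function this test exercises, under the same assumptions as the test (the historical import path sklearn.utils.random and the keyword name class_probability used elsewhere in this code); it is illustrative only, not part of the test suite:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.random import random_choice_csc

# Draw 5 samples for a single output with two equiprobable classes.
y = random_choice_csc(n_samples=5,
                      classes=[np.array([0, 1])],
                      class_probability=[np.array([0.5, 0.5])],
                      random_state=0)
assert sp.issparse(y)     # returns a CSC sparse matrix
assert y.shape == (5, 1)  # one column per output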
def predict(self, X):
    """
    Perform classification on test vectors X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Input vectors, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    y : array, shape = [n_samples] or [n_samples, n_outputs]
        Predicted target values for X.
    """
    if not hasattr(self, "classes_"):
        raise ValueError("DummyClassifier not fitted.")

    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

    # numpy random_state expects Python int and not long as size argument
    # under Windows
    n_samples = int(X.shape[0])
    rs = check_random_state(self.random_state)

    n_classes_ = self.n_classes_
    classes_ = self.classes_
    class_prior_ = self.class_prior_
    constant = self.constant
    if self.n_outputs_ == 1:
        # Get same type even for self.n_outputs_ == 1
        n_classes_ = [n_classes_]
        classes_ = [classes_]
        class_prior_ = [class_prior_]
        constant = [constant]

    # Compute probability only once
    if self.strategy == "stratified":
        proba = self.predict_proba(X)
        if self.n_outputs_ == 1:
            proba = [proba]

    if self.sparse_output_:
        class_prob = None
        if self.strategy == "most_frequent":
            classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

        elif self.strategy == "stratified":
            class_prob = class_prior_

        elif self.strategy == "uniform":
            raise ValueError("Sparse target prediction is not "
                             "supported with the uniform strategy")

        elif self.strategy == "constant":
            classes_ = [np.array([c]) for c in constant]

        y = random_choice_csc(n_samples, classes_, class_prob,
                              self.random_state)
    else:
        if self.strategy == "most_frequent":
            y = np.tile([classes_[k][class_prior_[k].argmax()]
                         for k in range((self.n_outputs_))], [n_samples, 1])

        elif self.strategy == "stratified":
            # Pass a list, not a bare generator: np.vstack on a generator
            # is deprecated and fails on recent numpy.
            y = np.vstack([classes_[k][proba[k].argmax(axis=1)]
                           for k in range(self.n_outputs_)]).T

        elif self.strategy == "uniform":
            ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                   for k in range(self.n_outputs_)]
            y = np.vstack(ret).T

        elif self.strategy == "constant":
            y = np.tile(self.constant, (n_samples, 1))

        if self.n_outputs_ == 1 and not self.output_2d_:
            y = np.ravel(y)

    return y
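A short usage sketch of this method: DummyClassifier sets sparse_output_ when fitted on a sparse target, so predict takes the random_choice_csc branch above and returns a sparse matrix. A minimal illustration, not part of the class:

import numpy as np
import scipy.sparse as sp
from sklearn.dummy import DummyClassifier

X = np.zeros((4, 2))                   # features are ignored by the dummy
y = sp.csc_matrix(np.array([[0, 1],    # sparse multilabel target
                            [1, 0],
                            [1, 1],
                            [0, 0]]))
clf = DummyClassifier(strategy="stratified", random_state=0)
clf.fit(X, y)
pred = clf.predict(X)
assert sp.issparse(pred)               # sparse because y was sparse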
def test_random_choice_csc():
    with pytest.warns(DeprecationWarning, match="removed in version 0.24"):
        random_choice_csc(10, [[2]])
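Since random_choice_csc was removed in 0.24 (which is what this test pins down), code that depended on it needs a stand-in. A rough sketch of an equivalent built from plain numpy and scipy; the name random_choice_csc_like is hypothetical and this is not the library's replacement API:

import numpy as np
import scipy.sparse as sp

def random_choice_csc_like(n_samples, classes, class_probability=None,
                           random_state=None):
    """Hypothetical stand-in: one CSC column per output, each sampled
    independently from its class distribution (uniform when omitted)."""
    rng = np.random.RandomState(random_state)
    cols = []
    for k, cls in enumerate(classes):
        cls = np.asarray(cls)
        p = None if class_probability is None else class_probability[k]
        col = rng.choice(cls, size=n_samples, p=p)
        cols.append(sp.csc_matrix(col.reshape(-1, 1)))
    return sp.hstack(cols, format='csc')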
def streaming_file_projections(
        train_dir='train', test_dir='test', file_ext='bytes',
        dim=256, percentile=75, n_jobs=-1):
    """Convert all files in given folder with given file extension to
    grayscale images and save them back to the same directory as png files.

    Each file is trimmed or padded to a common cutoff length, projected
    with a sparse random matrix R, and written as a dim x dim image.
    """
    train_paths = file_paths(train_dir, file_ext)
    test_paths = file_paths(test_dir, file_ext)
    logging.info('converting %d %s training files to png files' % (
        len(train_paths), file_ext))
    logging.info('converting %d %s testing files to png files' % (
        len(test_paths), file_ext))

    # Determine normalized file length; balance trade-off on padding vs.
    # loss of info from file trimming.
    sizes = np.array([os.path.getsize(path) for path in train_paths],
                     dtype=np.int)
    cutoff = int(dim * np.round(np.percentile(sizes, percentile) / dim))
    logging.info('using cutoff of %d' % cutoff)

    # Log some info on what kind of tradeoff is being made.
    MB = 1024 ** 2  # bytes per megabyte, to match the MB units logged below
    diff = sizes - cutoff
    trimmed = float(diff[diff > 0].sum())
    padded = float(abs(diff[diff < 0].sum()))
    logging.info('%.2fMB will be trimmed' % (trimmed / MB))
    logging.info('%.2fMB will be padded' % (padded / MB))
    logging.info('trim-to-pad ratio: %d / 1000' % (1000 * (trimmed / padded)))

    # Build random projection matrix R.
    # We interpret the cutoff as the number of features ("pixels").
    # s = 1 / density, where density = 1 / sqrt(n_features).
    n_components = dim * dim  # reduced dimension/rank after projection
    logging.info('constructing random projection matrix R (%d x %d)' % (
        cutoff, n_components))
    s = np.sqrt(cutoff)
    val = np.sqrt(s / n_components)
    vals = np.array([-1, 0, 1], dtype=np.int8)
    probs = np.array([1 / (2 * s), 1 - (1 / s), 1 / (2 * s)])
    probs = probs / probs.sum()  # remove rounding error

    # Now create R as a sparse csc matrix. This function from sklearn
    # requires "classes" for each column and class probability
    # distributions for each.
    # https://github.com/scikit-learn/scikit-learn/blob/51a765acfa4c5d1ec05fc4b406968ad233c75162/sklearn/utils/random.py#L205
    k = vals.shape[0]
    classes = np.tile(vals, n_components).reshape(n_components, k)
    class_probs = np.tile(probs, n_components).reshape(n_components, k)
    R = skrandom.random_choice_csc(
        n_samples=cutoff, classes=classes, class_probability=class_probs)
    R = R.tocsr()  # especially suitable for fast matrix vector products
    R_nbytes = R.data.nbytes + R.indices.nbytes + R.indptr.nbytes
    R_mb = float(R_nbytes) / MB
    logging.info('done building projection matrix R (%.2fMB)' % R_mb)

    # Map work across all files, distributed based on n_jobs.
    n_jobs = n_jobs if n_jobs > 0 else (mp.cpu_count() - 2)
    logging.info('converting files using %d processes' % n_jobs)

    # Convert both train and test sets.
    all_paths = itertools.chain(train_paths, test_paths)
    if n_jobs == 1:
        arg_iter = itertools.izip(
            all_paths,
            itertools.repeat(cutoff),
            itertools.repeat(dim),
            itertools.repeat(R),
            itertools.repeat(file_ext))
        map(mappable_convert_and_project, arg_iter)
        save_csr_matrix(R, file_ext, percentile, dim)
        return

    # If we have more than one job, we'll want to share R.
    # Create shared memory space for the projection matrix R.
    global shared_data
    global shared_indices
    global shared_indptr
    global shared_shape
    shared_data = mp.Array(ctypes.c_double, R.data.shape[0], lock=False)
    shared_indices = mp.Array(ctypes.c_int32, R.indices.shape[0], lock=False)
    shared_indptr = mp.Array(ctypes.c_int32, R.indptr.shape[0], lock=False)
    shared_shape = mp.Array(ctypes.c_int32, len(R.shape), lock=False)

    # Fill shared memory with R data.
    shared_data[:] = R.data
    shared_indices[:] = R.indices
    shared_indptr[:] = R.indptr
    shared_shape[:] = R.shape

    arg_iter = itertools.izip(
        all_paths,
        itertools.repeat(cutoff),
        itertools.repeat(dim),
        itertools.repeat(file_ext))
    pool = mp.Pool(processes=n_jobs)
    pool.map(convert_project_shared, arg_iter)
    save_csr_matrix(R, file_ext, percentile, dim)
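The worker convert_project_shared is not shown above. A plausible sketch of its shared-memory side, assuming each worker rebuilds R from the module-level arrays before projecting a file; rebuild_shared_R is a hypothetical helper, and np.frombuffer wraps the lock-free buffers without copying:

import numpy as np
import scipy.sparse as sp

def rebuild_shared_R():
    """Hypothetical helper: reconstruct the CSR matrix R inside a worker
    process from the shared arrays populated by the parent. The dtypes
    mirror the ctypes used when the mp.Array buffers were allocated."""
    data = np.frombuffer(shared_data, dtype=np.float64)
    indices = np.frombuffer(shared_indices, dtype=np.int32)
    indptr = np.frombuffer(shared_indptr, dtype=np.int32)
    shape = tuple(shared_shape)
    return sp.csr_matrix((data, indices, indptr), shape=shape)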