def split(self, x, y=None):
    """Generates K-fold splits.

    The samples are cut into ``self.n_splits`` consecutive sections. When
    the number of samples is not a multiple of ``self.n_splits``, the
    leading sections receive one extra sample each. If ``self.shuffle`` is
    set, the data is shuffled with ``utils.shuffle`` (using
    ``self.random_state``) before being cut.

    Parameters
    ----------
    x : ds-array
        Samples array.
    y : ds-array, optional (default=None)
        Corresponding labels or values.

    Yields
    ------
    train_data : train_x, train_y
        The training ds-arrays for that split. If y is None, train_y is
        None.
    test_data : test_x, test_y
        The testing ds-arrays data for that split. If y is None, test_y is
        None.
    """
    n_folds = self.n_splits

    if self.shuffle:
        permuted = utils.shuffle(x, y, self.random_state)
        # utils.shuffle is unpacked as a pair only when labels were given.
        if y is None:
            x = permuted
        else:
            x, y = permuted

    n_samples = x.shape[0]
    base_size, n_larger = divmod(n_samples, n_folds)

    # Every fold holds base_size samples; the first n_larger hold one more.
    fold_sizes = np.full((n_folds,), base_size, dtype=int)
    fold_sizes[:n_larger] += 1
    fold_ends = np.cumsum(fold_sizes)

    fold_start = 0
    for fold_end in fold_ends:
        yield get_kfold_partition(x, y, fold_start, fold_end)
        fold_start = fold_end
def test_shuffle_xy_sparse(self):
    """ Tests shuffle for given sparse x and sparse y, and random_state.

    Verifies that shuffling keeps every original row, moves at least one
    row, and applies the same permutation to x and y.
    """
    np.random.seed(0)
    x = sparse.random(8, 10, density=0.5).tocsr()
    x_ds = ds.array(x, (3, 5))
    y = sparse.random(8, 1, density=0.5).tocsr()
    y_ds = ds.array(y, (4, 1))

    out_x_ds, out_y_ds = shuffle(x_ds, y_ds, random_state=0)
    out_x = out_x_ds.collect()
    out_y = out_y_ds.collect()

    # At least one of the first 2 samples must have moved.
    head_diff = x[0:2] != out_x[0:2]
    self.assertFalse(head_diff.nnz == 0)

    # The shuffled data keeps the original shape.
    self.assertEqual(out_x.shape, x.shape)
    self.assertEqual(out_y.shape[0], y.shape[0])

    # Every row of x must appear in the shuffled x, and the label found at
    # the matching position must be the label of the original row.
    for row_idx, row in enumerate(x):
        match_idx = next(
            (pos for pos, candidate in enumerate(out_x)
             if (candidate != row).nnz == 0),  # no differing entry: equal
            None)
        found = match_idx is not None
        if found:
            self.assertEqual(y[row_idx, 0], out_y[match_idx, 0])
        self.assertTrue(found)
def test_shuffle_xy(self):
    """ Tests shuffle for given x, y and random_state.

    Verifies that shuffling keeps every original row, moves at least one
    row, and applies the same permutation to x and y.
    """
    np.random.seed(0)
    x = np.random.rand(8, 3)
    y = np.random.rand(8, 1)
    x_ds = ds.array(x, (3, 2))
    y_ds = ds.array(y, (4, 1))

    out_x_ds, out_y_ds = shuffle(x_ds, y_ds, random_state=0)
    out_x = out_x_ds.collect()
    out_y = out_y_ds.collect()

    # At least one of the first 2 samples must have moved.
    head_unchanged = np.array_equal(x[0:2], out_x[0:2])
    self.assertFalse(head_unchanged)

    # The shuffled data keeps the original shape.
    self.assertEqual(out_x.shape, x.shape)
    self.assertEqual(out_y.shape[0], y.shape[0])

    # Every row of x must appear in the shuffled x, and the label found at
    # the matching position must be the label of the original row.
    for row_idx, row in enumerate(x):
        match_idx = next(
            (pos for pos, candidate in enumerate(out_x)
             if (candidate == row).all()),
            None)
        found = match_idx is not None
        if found:
            self.assertEqual(y[row_idx], out_y[match_idx])
        self.assertTrue(found)
def main():
    """Measures CascadeSVM.fit on the KDD99 training file.

    The text file is loaded in blocks of 11482 x 122 and shuffled; column
    121 is used as the labels and columns 0..120 as the samples.
    """
    dataset = ds.load_txt_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv",
        block_size=(11482, 122))
    dataset = shuffle(dataset)

    # Split the shuffled array into its label column and feature columns.
    labels = dataset[:, 121:122]
    samples = dataset[:, :121]

    estimator = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", estimator.fit, samples, labels)
def main():
    """Measures CascadeSVM.fit on the KDD99 and ijcnn1 training files.

    KDD99 is read as text in blocks of 11482 x 122 and shuffled; column 121
    is used as the labels and columns 0..120 as the samples. ijcnn1 is read
    in SVMLight format with 22 features, stored sparse, in blocks of
    5000 x 22. The same estimator instance is fitted on both data sets.
    """
    kdd = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122))
    kdd = shuffle(kdd)

    # Split the shuffled array into its label column and feature columns.
    kdd_labels = kdd[:, 121:122]
    kdd_samples = kdd[:, :121]

    ij_samples, ij_labels = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/ijcnn1/train",
        block_size=(5000, 22), n_features=22, store_sparse=True)

    estimator = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", estimator.fit, kdd_samples,
                        kdd_labels)
    performance.measure("CSVM", "ijcnn1", estimator.fit, ij_samples,
                        ij_labels)
def test_refit_false(self):
    """Tests GridSearchCV fit() with refit=False."""
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))

    seed = 0
    x, y = shuffle(x, y, random_state=seed)

    estimator = CascadeSVM(check_convergence=False)
    grid = {'max_iter': range(1, 5)}
    searcher = GridSearchCV(estimator, grid, cv=3, refit=False)
    searcher.fit(x, y)

    # With refit=False no estimator is refitted, so best_estimator_ must be
    # absent while the remaining search results are still exposed.
    self.assertFalse(hasattr(searcher, 'best_estimator_'))
    for attribute in ('best_score_', 'best_params_', 'best_index_',
                      'scorer_'):
        self.assertTrue(hasattr(searcher, attribute))
    self.assertEqual(searcher.n_splits_, 3)
def test_shuffle_x(self):
    """ Tests shuffle for given x and random_state.

    Tests that the shuffled array contains the same rows as the original
    data, and that the position has changed for some row.
    """
    # Seed the global generator so the input data is reproducible, as the
    # other shuffle tests do.
    np.random.seed(0)
    x = np.random.rand(8, 3)
    x_ds = ds.array(x, (3, 2))

    shuffled_x = shuffle(x_ds, random_state=0)
    shuffled_x = shuffled_x.collect()

    # Assert that at least one of the first 2 samples has changed
    self.assertFalse(np.array_equal(x[0:2], shuffled_x[0:2]))

    # Assert that the shuffled data has the same shape.
    self.assertEqual(shuffled_x.shape, x.shape)

    # Assert that all rows from x are found in the shuffled_x.
    for x_row in x:
        found = any((shuffled_row == x_row).all()
                    for shuffled_row in shuffled_x)
        self.assertTrue(found)
def test_shuffle_x_sparse(self):
    """ Tests shuffle for given sparse x, and random_state.

    Verifies that shuffling keeps every original row and moves at least one
    row.
    """
    np.random.seed(0)
    x = sparse.random(8, 10, density=0.5).tocsr()
    x_ds = ds.array(x, (3, 5))

    out_x = shuffle(x_ds, random_state=0).collect()

    # At least one of the first 2 samples must have moved.
    head_diff = x[0:2] != out_x[0:2]
    self.assertFalse(head_diff.nnz == 0)

    # The shuffled data keeps the original shape.
    self.assertEqual(out_x.shape, x.shape)

    # Every row of x must appear somewhere in the shuffled x. Two sparse
    # rows are equal when their inequality mask has no stored entries.
    for row in x:
        found = any((candidate != row).nnz == 0 for candidate in out_x)
        self.assertTrue(found)
def main():
    """Command-line driver that fits a CascadeSVM and reports the run.

    Parses the command line, loads the training data (CSV or SVMLight),
    optionally shuffles it, fits a CascadeSVM and builds a result row with
    the kernel, arity, block size, gamma, C, iterations, convergence flag,
    read time and fit time. When the training path is a directory, the
    number of entries in it is appended; when a test file is given, the
    score on it is appended. The row is appended to the output CSV file if
    one was requested, otherwise it is printed.

    For CSV input the last column holds the labels and all preceding
    columns are the samples; this applies to both training and test files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. \n"
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    # A missing gamma falls back to the estimator's "auto" setting.
    gamma = args.gamma if args.gamma else "auto"

    # Named store_sparse so it cannot shadow a module called `sparse`.
    store_sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    def load_dataset(path):
        """Load samples and labels from path in the selected input format."""
        if args.svmlight:
            return ds.load_svmlight_file(path, block_size, args.features,
                                         store_sparse)

        data = ds.load_txt_file(path, block_size)
        n_columns = data.shape[1]
        # The label is the last column; training and test files share this
        # loader so both are always split the same way.
        labels = data[:, n_columns - 1: n_columns]
        samples = data[:, :n_columns - 1]
        return samples, labels

    x, y = load_dataset(train_data)

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        # Wait for the pending work so reading is timed apart from fitting.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # The kernel is passed explicitly so the fitted model matches the
    # kernel reported in the output row.
    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence,
                      verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    # The block size is reported as given on the command line.
    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations,
           csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        out.append(len(os.listdir(train_data)))

    if args.test_file:
        x_test, y_test = load_dataset(args.test_file)
        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # csv.writer needs a text-mode file; newline="" lets the csv module
        # control the line endings itself.
        with open(args.output_file, "a", newline="") as f:
            csv.writer(f).writerow(out)
    else:
        print(out)