def test_load_csv_file(self):
    """Check that text files load into arrays equal to numpy's own parse."""
    # Comma-separated file: verify the block layout and then the contents.
    path = "tests/files/csv/1"
    loaded = ds.load_txt_file(path, block_size=(300, 50))
    expected = np.loadtxt(path, delimiter=",")

    self.assertEqual(loaded._top_left_shape, (300, 50))
    self.assertEqual(loaded._reg_shape, (300, 50))
    self.assertEqual(loaded.shape, (4235, 122))
    self.assertEqual(loaded._n_blocks, (15, 3))
    self.assertTrue(np.array_equal(loaded.collect(), expected))

    # Space-separated file read with an explicit delimiter.
    path = "tests/files/other/4"
    loaded = ds.load_txt_file(path, block_size=(1000, 122), delimiter=" ")
    expected = np.loadtxt(path, delimiter=" ")
    self.assertTrue(np.array_equal(loaded.collect(), expected))

    # Comma-separated file split into very small (1, 2) blocks.
    path = "tests/files/csv/4"
    loaded = ds.load_txt_file(path, block_size=(1, 2))
    expected = np.loadtxt(path, delimiter=",")
    self.assertTrue(_equal_arrays(loaded.collect(), expected))
def main():
    """Load the gaia text file and hand ``DBSCAN.fit`` to ``performance.measure``."""
    gaia_path = ("/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/gaia"
                 "/dbscan/data_scaled.csv")
    points = ds.load_txt_file(gaia_path, block_size=(10000, 5))

    estimator = DBSCAN(
        eps=0.19,
        min_samples=5,
        max_samples=5000,
        n_regions=17,
        dimensions=[0, 1],
    )
    performance.measure("DBSCAN", "gaia", estimator.fit, points)
def main():
    """Load the KDD99 training file and pass ``PCA.fit`` to ``performance.measure``."""
    train_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv"
    full_table = ds.load_txt_file(train_path, block_size=(11482, 122))

    # Only the first 121 columns are given to the estimator.
    samples = full_table[:, :121]

    reducer = PCA(arity=48)
    performance.measure("PCA", "KDD99", reducer.fit, samples)
def main():
    """Load KDD99 and pass ``RandomForestClassifier.fit`` to ``performance.measure``."""
    train_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv"
    full_table = ds.load_txt_file(train_path, block_size=(11482, 122))

    # Column 121 is used as the target; columns 0..120 as the samples.
    targets = full_table[:, 121:122]
    samples = full_table[:, :121]

    forest = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "KDD99", forest.fit, samples, targets)
def main():
    """Load the scaled gaia file and hand ``DBSCAN.fit`` to ``performance.measure``."""
    points = ds.load_txt_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/data_scaled.csv",
        block_size=(10000, 5),
    )

    estimator = DBSCAN(
        eps=0.19,
        min_samples=5,
        max_samples=5000,
        n_regions=17,
        dimensions=[0, 1],
    )
    performance.measure("DBSCAN", "gaia", estimator.fit, points)
def main():
    """Load KDD99 and pass ``LinearRegression.fit`` to ``performance.measure``."""
    train_path = "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv"
    full_table = ds.load_txt_file(train_path, block_size=(11482, 122))

    # Column 121 is used as the target; columns 0..120 as the samples.
    targets = full_table[:, 121:122]
    samples = full_table[:, :121]

    model = LinearRegression(arity=48)
    performance.measure("LR", "KDD99", model.fit, samples, targets)
def main():
    """Load and shuffle KDD99, then pass ``CascadeSVM.fit`` to ``performance.measure``."""
    train_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv"
    full_table = ds.load_txt_file(train_path, block_size=(11482, 122))

    # Shuffle whole rows first so samples and targets stay aligned when split.
    shuffled = shuffle(full_table)
    targets = shuffled[:, 121:122]
    samples = shuffled[:, :121]

    svm = CascadeSVM(c=10000, gamma=0.01)
    performance.measure("CSVM", "KDD99", svm.fit, samples, targets)
def main():
    """Pass ``CascadeSVM.fit`` to ``performance.measure`` for KDD99 and ijcnn1."""
    kdd_table = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122),
    )

    # Shuffle whole rows first so samples and targets stay aligned when split.
    kdd_table = shuffle(kdd_table)
    kdd_targets = kdd_table[:, 121:122]
    kdd_samples = kdd_table[:, :121]

    ijcnn_samples, ijcnn_targets = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/ijcnn1/train",
        block_size=(5000, 22),
        n_features=22,
        store_sparse=True,
    )

    # The same estimator instance is reused for both measurements.
    svm = CascadeSVM(c=10000, gamma=0.01)
    performance.measure("CSVM", "KDD99", svm.fit, kdd_samples, kdd_targets)
    performance.measure("CSVM", "ijcnn1", svm.fit, ijcnn_samples, ijcnn_targets)
def main():
    """Pass ``RandomForestClassifier.fit`` to ``performance.measure`` for KDD99 and mnist."""
    kdd_table = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122),
    )
    # Column 121 is used as the target; columns 0..120 as the samples.
    kdd_targets = kdd_table[:, 121:122]
    kdd_samples = kdd_table[:, :121]

    mnist_samples, mnist_targets = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/mnist/train.scaled",
        block_size=(5000, 780),
        n_features=780,
        store_sparse=False,
    )

    # A fresh forest is built for each dataset.
    kdd_forest = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "KDD99", kdd_forest.fit, kdd_samples, kdd_targets)

    mnist_forest = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "mnist", mnist_forest.fit, mnist_samples,
                        mnist_targets)
def main():
    """Command-line driver that fits a RandomForestClassifier and prints timings.

    Reads the training data (CSV by default, SVMLight with ``--svmlight``),
    fits the forest, optionally scores it on ``--test-file``, and prints a
    list with: n_estimators, distr_depth, max_depth, read time, fit time and,
    when a test file is given, the score.

    For text input the LAST column is taken as the label and every other
    column as a feature, for both the training and the test file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--estimators", metavar="N_ESTIMATORS",
                        type=int, help="default is 10", default=10)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-md", "--max_depth", metavar="MAX_DEPTH",
                        type=int, help="default is np.inf", required=False)
    parser.add_argument("-dd", "--dist_depth", metavar="DIST_DEPTH", type=int,
                        help="default is auto", required=False)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="use dense data structures",
                        action="store_true")
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        n_cols = x.shape[1]
        # The label is the last column. The previous slice took the
        # second-to-last column, which also remained inside ``x`` as a
        # feature and disagreed with how the test file is split below.
        y = x[:, n_cols - 1:n_cols]
        x = x[:, :n_cols - 1]

    if args.detailed_times:
        # Wait for the loading tasks so read and fit are timed separately.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    dist_depth = args.dist_depth if args.dist_depth else "auto"
    max_depth = args.max_depth if args.max_depth else np.inf

    forest = RandomForestClassifier(n_estimators=args.estimators,
                                    max_depth=max_depth,
                                    distr_depth=dist_depth)
    forest.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [forest.n_estimators, forest.distr_depth, forest.max_depth,
           read_time, fit_time]

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            n_test_cols = x_test.shape[1]
            y_test = x_test[:, n_test_cols - 1:n_test_cols]
            x_test = x_test[:, :n_test_cols - 1]

        out.append(compss_wait_on(forest.score(x_test, y_test)))

    print(out)
def main():
    """Command-line driver that fits a CascadeSVM and reports timings.

    Reads the training data (CSV by default, SVMLight with ``--svmlight``),
    optionally shuffles it, fits the model and optionally scores it on
    ``--test-file``. The result row is appended as CSV to ``--output_file``
    when given, otherwise printed.

    The row contains: kernel, arity, block size (as passed on the command
    line), gamma, C, iterations, converged flag, read time, fit time, then
    optionally the number of entries in the training directory and the test
    score.

    For text input the LAST column is taken as the label and every other
    column as a feature, for both the training and the test file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    gamma = args.gamma if args.gamma else "auto"

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        n_cols = x.shape[1]
        # The label is the last column. The previous slice took the
        # second-to-last column, which also remained inside ``x`` as a
        # feature and disagreed with how the test file is split below.
        y = x[:, n_cols - 1:n_cols]
        x = x[:, :n_cols - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        # Wait for the loading tasks so read and fit are timed separately.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # ``--kernel`` was parsed and reported but never handed to the model.
    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence,
                      verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    # ``args.part_size`` never existed on this parser; report the block size.
    # NOTE(review): ``.get`` assumes ``_clf_params`` is a dict that may lack
    # "gamma" for the linear kernel -- confirm against CascadeSVM.
    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params.get("gamma"), args.c, csvm.iterations,
           csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        n_files = os.listdir(train_data)
        out.append(len(n_files))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            n_test_cols = x_test.shape[1]
            y_test = x_test[:, n_test_cols - 1:n_test_cols]
            x_test = x_test[:, :n_test_cols - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # csv.writer needs a text-mode file on Python 3; newline="" lets the
        # csv module control line endings itself.
        with open(args.output_file, "a", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
def main():
    """Command-line driver that fits KMeans and prints timings.

    Reads the input data (text by default, SVMLight with ``--svmlight``),
    drops the last column when ``--labeled`` is given for text input, fits
    KMeans and prints a list with: number of centers, arity, block size (as
    passed on the command line), read time and fit time.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        # Labels from the SVMLight file are not used by this driver.
        x, _ = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        # Drop the trailing label column from text input.
        x = x[:, :n_features - 1]

    if args.detailed_times:
        # Wait for the loading tasks so read and fit are timed separately.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # The option is "--centers", so the parsed attribute is ``args.centers``;
    # ``args.clusters`` never existed and raised AttributeError.
    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    # ``args.part_size`` never existed on this parser; report the block size.
    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
def main():
    """Command-line driver that fits DBSCAN and prints timings.

    Reads the input data (text by default, SVMLight with ``--svmlight``),
    drops the last column when ``--labeled`` is given for text input, fits
    DBSCAN and prints a list with: eps, min_samples, max_samples, n_regions,
    number of grid dimensions, block size (as passed on the command line),
    number of clusters, read time and fit time.

    When ``--dimensions`` is omitted, every column of the loaded data (after
    removing the label column, if any) is used as a grid dimension.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read file in SVMlLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--epsilon", metavar="EPSILON", type=float,
                        help="default is 0.5", default=0.5)
    parser.add_argument("-r", "--regions", metavar="N_REGIONS", type=int,
                        help="number of regions to create", default=1)
    parser.add_argument("-d", "--dimensions", metavar="DIMENSIONS", type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x", "--max_samples", metavar="MAX_SAMPLES", type=int,
                        help="maximum samples to process per task ("
                             "default is 1000)",
                        default=1000)
    parser.add_argument("-m", "--min_samples", metavar="MIN_SAMPLES",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        # Labels from the SVMLight file are not used by this driver.
        x, _ = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        # Drop the trailing label column from text input.
        x = x[:, :n_features - 1]

    if args.detailed_times:
        # Wait for the loading tasks so read and fit are timed separately.
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    if args.dimensions:
        dims = np.array(args.dimensions.split(","), dtype=int)
    else:
        # ``--features`` is only meaningful for SVMLight input and defaults
        # to None, so ``range(args.features)`` raised TypeError for text
        # files. Derive the dimensions from the data actually loaded.
        dims = np.array(range(x.shape[1]), dtype=int)

    dbscan = DBSCAN(eps=args.epsilon, min_samples=args.min_samples,
                    max_samples=args.max_samples, n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)

    compss_barrier()
    fit_time = time.time() - s_time

    # ``args.part_size`` never existed on this parser; report the block size.
    out = [dbscan.eps, dbscan.min_samples, dbscan.max_samples,
           dbscan.n_regions, len(dims), args.block_size, dbscan.n_clusters,
           read_time, fit_time]

    print(out)