Example 1
    def test_arrange_sparse(self):
        """ Tests that arrange produces the same results with sparse and
        dense data structures."""
        file_ = "tests/files/libsvm/2"

        sparse, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
        dense, _ = ds.load_svmlight_file(file_, (10, 200), 780, False)

        arranged_d, sort_d, _ = _arrange_samples(dense, 3, [128, 184])
        arranged_sp, sort_sp, _ = _arrange_samples(sparse, 3, [128, 184])

        arranged_sp = compss_wait_on(arranged_sp)
        arranged_d = compss_wait_on(arranged_d)
        sort_d = compss_wait_on(sort_d)
        sort_sp = compss_wait_on(sort_sp)

        self.assertEqual(len(arranged_sp), len(arranged_d))
        self.assertFalse(issparse(arranged_d[0]))
        self.assertTrue(issparse(arranged_sp[0]))

        self.assertTrue(
            np.array_equal(np.concatenate(np.concatenate(sort_sp).flatten()),
                           np.concatenate(np.concatenate(sort_d).flatten())))

        for index in range(len(arranged_sp)):
            samples_sp = arranged_sp[index].toarray()
            samples_d = arranged_d[index]
            self.assertTrue(np.array_equal(samples_sp, samples_d))
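Note: these snippets are test-method and script excerpts and omit their module headers. A sketch of the imports they appear to assume, inferred from usage rather than copied from the source files (_arrange_samples is an internal dislib helper whose import path is not shown in the excerpts):

import numpy as np
import dislib as ds
from scipy.sparse import issparse
from sklearn.datasets import load_svmlight_file
from pycompss.api.api import compss_wait_on, compss_barrier
# some of the scripts below use the older alias:
# from pycompss.api.api import barrier

from dislib.classification import CascadeSVM, RandomForestClassifier
from dislib.cluster import DBSCAN, GaussianMixture, KMeans
from dislib.decomposition import PCA
from dislib.recommendation import ALS
from dislib.utils import shuffle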
Example 2
    def test_sparse(self):
        """ Tests GaussianMixture produces the same results using dense and
        sparse data structures """
        file_ = "tests/files/libsvm/2"

        x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True)
        x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False)

        covariance_types = 'full', 'tied', 'diag', 'spherical'

        for cov_type in covariance_types:
            gm = GaussianMixture(n_components=4, random_state=0,
                                 covariance_type=cov_type)
            labels_sparse = gm.fit_predict(x_sparse).collect()
            labels_dense = gm.fit_predict(x_dense).collect()
            self.assertTrue(np.array_equal(labels_sparse, labels_dense))
Example 3
    def test_fit(self):
        seed = 666
        file_ = "tests/files/libsvm/2"

        x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=5,
                          tol=1e-4,
                          kernel='linear',
                          c=2,
                          gamma=0.1,
                          check_convergence=True,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)

        self.assertTrue(csvm.converged)

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=1,
                          tol=1e-4,
                          kernel='linear',
                          c=2,
                          gamma=0.1,
                          check_convergence=False,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)
        self.assertFalse(csvm.converged)
        self.assertEqual(csvm.iterations, 1)
Example 4
    def test_fit_default_gamma(self):
        """ Tests that the fit method converges when using gamma=auto on a
        toy dataset """
        seed = 666
        file_ = "tests/files/libsvm/2"

        x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=5,
                          tol=1e-4,
                          kernel='linear',
                          c=2,
                          check_convergence=True,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)

        self.assertTrue(csvm.converged)

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=1,
                          tol=1e-4,
                          kernel='linear',
                          c=2,
                          gamma=0.1,
                          check_convergence=False,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)
        self.assertFalse(csvm.converged)
        self.assertEqual(csvm.iterations, 1)
Example 5
def main():
    x_ij, y_ij = ds.load_svmlight_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train",
        block_size=(5000, 22), n_features=22, store_sparse=True)

    csvm = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "ijcnn1", csvm.fit, x_ij, y_ij)
Example 6
    def test_load_svmlight_file(self):
        """ Tests loading a LibSVM file  """
        file_ = "tests/files/libsvm/1"

        x_np, y_np = load_svmlight_file(file_, n_features=780)

        # Load SVM and store in sparse
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=True)

        self.assertTrue(_equal_arrays(x.collect(), x_np))
        self.assertTrue(_equal_arrays(y.collect(), y_np))

        # Load SVM and store in dense
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=False)

        self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
        self.assertTrue(_equal_arrays(y.collect(), y_np))
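_equal_arrays is a test helper that is not shown in the excerpt. A plausible sketch, assuming it densifies sparse operands before comparing (the real helper may differ):

import numpy as np
from scipy.sparse import issparse


def _equal_arrays(a, b):
    # Densify sparse operands so sparse and dense results compare as equal.
    a = a.toarray() if issparse(a) else np.asarray(a)
    b = b.toarray() if issparse(b) else np.asarray(b)
    # Ravel so (n,) and (n, 1) label vectors compare as equal.
    return a.size == b.size and np.allclose(a.ravel(), b.ravel())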
Example 7
def main():

    x_mn, y_mn = ds.load_svmlight_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.scaled",
        block_size=(5000, 780),
        n_features=780,
        store_sparse=False)

    rf = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "mnist", rf.fit, x_mn, y_mn)
Example 8
    def test_sparse(self):
        """ Tests K-means produces the same results using dense and sparse
        data structures. """
        file_ = "tests/files/libsvm/2"

        x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
        x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False)

        kmeans = KMeans(random_state=170)

        y_sparse = kmeans.fit_predict(x_sp).collect()
        sparse_c = kmeans.centers.toarray()

        kmeans = KMeans(random_state=170)

        y_dense = kmeans.fit_predict(x_ds).collect()
        dense_c = kmeans.centers

        self.assertTrue(np.allclose(sparse_c, dense_c))
        self.assertTrue(np.array_equal(y_sparse, y_dense))
Example 9
    def test_sparse(self):
        """ Tests PCA produces the same results using dense and sparse
        data structures. """
        file_ = "tests/files/libsvm/2"
        x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
        x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False)

        pca = PCA()
        transform_dense = pca.fit_transform(x_ds).collect()
        dense_components = pca.components_
        dense_variance = pca.explained_variance_

        pca = PCA()
        transform_sparse = pca.fit_transform(x_sp).collect()
        sparse_components = pca.components_
        sparse_variance = pca.explained_variance_

        self.assertTrue(np.array_equal(transform_sparse, transform_dense))
        self.assertTrue(np.allclose(sparse_components, dense_components))
        self.assertTrue(np.allclose(sparse_variance, dense_variance))
Example 10
    def test_sparse(self):
        """ Tests PCA produces the same results using dense and sparse
        data structures. """
        file_ = "tests/files/libsvm/2"
        x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
        x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False)

        pca = PCA()
        transform_dense = pca.fit_transform(x_ds).collect()
        dense_variance = pca.explained_variance_.collect()

        pca = PCA()
        transform_sparse = pca.fit_transform(x_sp).collect()
        sparse_variance = pca.explained_variance_.collect()

        self.assertTrue(np.allclose(transform_sparse, transform_dense))
        self.assertTrue(np.allclose(sparse_variance, dense_variance))

        # Test error for sparse data and method=svd
        with self.assertRaises(NotImplementedError):
            pca = PCA(method='svd')
            pca.fit(x_sp)
Example 11
    def test_load_libsvm_file(self):
        """ Tests loading a LibSVM file in dense mode.
        """
        file_ = "tests/files/libsvm/1"

        x, y = load_svmlight_file(file_, n_features=780)

        bn, bm = 25, 100

        # Load SVM and store in sparse
        arr_x, arr_y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                             store_sparse=True)

        _validate_arrays(self, arr_x, x, (bn, bm))
        _validate_arrays(self, arr_y, y, (bn, 1))

        # Load SVM and store in dense
        arr_x, arr_y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                             store_sparse=False)

        _validate_arrays(self, arr_x, x.toarray(), (bn, bm))
        _validate_arrays(self, arr_y, y, (bn, 1))
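_validate_arrays is likewise not shown. A sketch of what it plausibly checks, given how it is called: the collected contents match the reference array, and the ds-array was partitioned with the requested block shape. The attribute name _reg_shape is an assumption about dislib's ds-array internals:

import numpy as np
from scipy.sparse import issparse


def _validate_arrays(test, ds_arr, expected, block_shape):
    # Compare collected contents with the reference array, densifying first.
    actual = ds_arr.collect()
    actual = actual.toarray() if issparse(actual) else np.asarray(actual)
    expected = expected.toarray() if issparse(expected) else np.asarray(expected)
    # Ravel so (n,) and (n, 1) label vectors compare as equal.
    test.assertTrue(np.allclose(actual.ravel(), expected.ravel()))
    # Assumed internal attribute holding the regular block shape.
    test.assertEqual(ds_arr._reg_shape, block_shape)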
Example 12
    def test_sparse(self):
        """ Tests that C-SVM produces the same results with sparse and dense
        data"""
        seed = 666
        train = "tests/files/libsvm/3"

        x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True)
        x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False)

        csvm_sp = CascadeSVM(random_state=seed)
        csvm_sp.fit(x_sp, y_sp)

        csvm_d = CascadeSVM(random_state=seed)
        csvm_d.fit(x_d, y_d)

        sv_d = csvm_d._clf.support_vectors_
        sv_sp = csvm_sp._clf.support_vectors_.toarray()

        self.assertTrue(np.array_equal(sv_d, sv_sp))

        coef_d = csvm_d._clf.dual_coef_
        coef_sp = csvm_sp._clf.dual_coef_.toarray()

        self.assertTrue(np.array_equal(coef_d, coef_sp))
Example 13
def main():
    n_blocks = 384
    data = "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/recommendation" \
           "/netflix/netflix_data_libsvm.txt"
    n_factors = 100
    n_features = 480189

    block_size = (int(ceil(17770 / n_blocks)),
                  int(ceil(n_features / n_blocks)))

    x, y = ds.load_svmlight_file(data, block_size=block_size,
                                 n_features=n_features, store_sparse=True)

    als = ALS(tol=0.0001, random_state=676, n_f=n_factors, max_iter=10,
              verbose=False)

    performance.measure("ALS", "Netflix", als, x)
Example 14
def main():
    x_kdd = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122))

    x_kdd = shuffle(x_kdd)
    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    x_ij, y_ij = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/ijcnn1/train",
        block_size=(5000, 22), n_features=22, store_sparse=True)

    csvm = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", csvm.fit, x_kdd, y_kdd)
    performance.measure("CSVM", "ijcnn1", csvm.fit, x_ij, y_ij)
Example 15
def main():
    x_kdd = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122))

    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    x_mn, y_mn = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/mnist/train.scaled",
        block_size=(5000, 780),
        n_features=780,
        store_sparse=False)

    rf = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "KDD99", rf.fit, x_kdd, y_kdd)

    rf = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "mnist", rf.fit, x_mn, y_mn)
Example 16
    def test_fit_private_params(self):
        kernel = 'rbf'
        c = 2
        gamma = 0.1
        seed = 666
        file_ = "tests/files/libsvm/2"

        x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)
        csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed)
        csvm.fit(x, y)

        self.assertEqual(csvm._clf_params['kernel'], kernel)
        self.assertEqual(csvm._clf_params['C'], c)
        self.assertEqual(csvm._clf_params['gamma'], gamma)

        kernel, c = 'linear', 0.3
        csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed)
        csvm.fit(x, y)
        self.assertEqual(csvm._clf_params['kernel'], kernel)
        self.assertEqual(csvm._clf_params['C'], c)
Example 17
def main():
    n_blocks = 384
    data = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/" \
           "netflix_data_libsvm.txt"
    n_factors = 100
    n_features = 480189

    block_size = (int(ceil(17770 / n_blocks)),
                  int(ceil(n_features / n_blocks)))

    x, y = ds.load_svmlight_file(data,
                                 block_size=block_size,
                                 n_features=n_features,
                                 store_sparse=True)

    als = ALS(tol=0.0001,
              random_state=676,
              n_f=n_factors,
              max_iter=10,
              verbose=False)

    performance.measure("ALS", "Netflix", als.fit, x)
Example 18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight",
                        help="read file in SVMlLight format",
                        action="store_true")
    parser.add_argument("-dt",
                        "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e",
                        "--epsilon",
                        metavar="EPSILON",
                        type=float,
                        help="default is 0.5",
                        default=0.5)
    parser.add_argument("-r",
                        "--regions",
                        metavar="N_REGIONS",
                        type=int,
                        help="number of regions to create",
                        default=1)
    parser.add_argument("-d",
                        "--dimensions",
                        metavar="DIMENSIONS",
                        type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x",
                        "--max_samples",
                        metavar="MAX_SAMPLES",
                        type=int,
                        help="maximum samples to process per task ("
                        "default is 1000)",
                        default=1000)
    parser.add_argument("-m",
                        "--min_samples",
                        metavar="MIN_SAMPLES",
                        type=int,
                        help="default is 5",
                        default=5)
    parser.add_argument("-b",
                        "--block_size",
                        metavar="BLOCK_SIZE",
                        type=str,
                        help="two comma separated ints that represent the "
                        "size of the blocks in which to divide the input "
                        "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f",
                        "--features",
                        metavar="N_FEATURES",
                        help="number of features of the input data "
                        "(only for SVMLight files)",
                        type=int,
                        default=None,
                        required=False)
    parser.add_argument("--dense",
                        help="store data in dense format (only "
                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled",
                        help="the last column of the input file "
                        "represents labels (only for text "
                        "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    dims = range(x.shape[1])

    if args.dimensions:
        dims = args.dimensions.split(",")
        dims = np.array(dims, dtype=int)

    dbscan = DBSCAN(eps=args.epsilon,
                    min_samples=args.min_samples,
                    max_samples=args.max_samples,
                    n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)

    compss_barrier()
    fit_time = time.time() - s_time

    out = [
        dbscan.eps, dbscan.min_samples, dbscan.max_samples, dbscan.n_regions,
        len(dims), args.block_size, dbscan.n_clusters, read_time, fit_time
    ]

    print(out)
Example 19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    if not args.gamma:
        gamma = "auto"
    else:
        gamma = args.gamma

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        y = x[:, x.shape[1] - 1: x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence, verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations, csvm.converged,
           read_time, fit_time]

    if os.path.isdir(train_data):
        out.append(len(os.listdir(train_data)))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features,
                                                   sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1: x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        with open(args.output_file, "ab") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
Example 20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight",
                        help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt",
                        "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e",
                        "--estimators",
                        metavar="N_ESTIMATORS",
                        type=int,
                        help="default is 10",
                        default=10)
    parser.add_argument("-b",
                        "--block_size",
                        metavar="BLOCK_SIZE",
                        type=str,
                        help="two comma separated ints that represent the "
                        "size of the blocks in which to divide the input "
                        "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-md",
                        "--max_depth",
                        metavar="MAX_DEPTH",
                        type=int,
                        help="default is np.inf",
                        required=False)
    parser.add_argument("-dd",
                        "--dist_depth",
                        metavar="DIST_DEPTH",
                        type=int,
                        help="default is auto",
                        required=False)
    parser.add_argument("-f",
                        "--features",
                        metavar="N_FEATURES",
                        help="number of features of the input data "
                        "(only for SVMLight files)",
                        type=int,
                        default=None,
                        required=False)
    parser.add_argument("--dense",
                        help="use dense data structures",
                        action="store_true")
    parser.add_argument("-t",
                        "--test-file",
                        metavar="TEST_FILE_PATH",
                        help="test file path",
                        type=str,
                        required=False)
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        y = x[:, x.shape[1] - 1:x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    if args.dist_depth:
        dist_depth = args.dist_depth
    else:
        dist_depth = "auto"

    if args.max_depth:
        max_depth = args.max_depth
    else:
        max_depth = np.inf

    forest = RandomForestClassifier(n_estimators=args.estimators,
                                    max_depth=max_depth,
                                    distr_depth=dist_depth)
    forest.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [
        forest.n_estimators, forest.distr_depth, forest.max_depth, read_time,
        fit_time
    ]

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1:x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(forest.score(x_test, y_test)))

    print(out)
Example 21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
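For reference, a hypothetical invocation of a script like this last one, using PyCOMPSs' runcompss launcher (script name, input path, and values are placeholders, not taken from the source):

runcompss kmeans_benchmark.py --svmlight -f 780 -b 100,100 -i 10 -c 4 /path/to/train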