def test_fit(self):
    """Fitting on four symmetric toy points must yield the two
    expected cluster centers."""
    samples = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]])
    data = ds.array(samples, block_size=(2, 2))

    model = KMeans(n_clusters=2, random_state=666, verbose=False)
    model.fit(data)

    # The two clusters are the mirrored point pairs, so the centers
    # are their means.
    wanted = np.array([[1.5, 1.5], [-1.5, -1.5]])
    self.assertTrue((model.centers == wanted).all())
def test_predict(self):
    """Labels predicted for unseen points must match the clusters
    learned from the toy training data."""
    train_points = [[1, 2], [2, 1], [-1, -2], [-2, -1]]
    train = ds.array(np.array(train_points), block_size=(2, 2))

    model = KMeans(n_clusters=2, random_state=666)
    model.fit(train)

    # Re-predict the training points plus two far-away outliers.
    test_points = train_points + [[10, 10], [-10, -10]]
    test = ds.array(np.array(test_points), block_size=(2, 2))
    labels = model.predict(test).collect()

    expected = np.array([0, 0, 1, 1, 0, 1])
    self.assertTrue(np.array_equal(labels, expected))
def test_init(self):
    """A user-supplied ``init`` must be stored verbatim in ``_init``
    and must not be mutated by ``fit`` (dense and sparse inputs)."""
    # Build an imbalanced three-blob dataset.
    samples, blob_ids = make_blobs(n_samples=1500, random_state=170)
    filtered = np.vstack((samples[blob_ids == 0][:500],
                          samples[blob_ids == 1][:100],
                          samples[blob_ids == 2][:10]))

    # Dense input.
    dense_data = ds.array(filtered, block_size=(300, 2))
    seed_centers = np.random.random((5, 2))
    model = KMeans(n_clusters=5, init=seed_centers)
    model.fit(dense_data)

    self.assertTrue(np.array_equal(model._init, seed_centers))
    # Fitting should move the centers away from the initial guess.
    self.assertFalse(np.array_equal(model.centers, seed_centers))

    # Sparse input.
    sparse_data = ds.array(csr_matrix(filtered), block_size=(300, 2))
    seed_centers = csr_matrix(np.random.random((5, 2)))
    model = KMeans(n_clusters=5, init=seed_centers)
    model.fit(sparse_data)

    self.assertTrue(np.array_equal(model._init.toarray(),
                                   seed_centers.toarray()))
    self.assertFalse(np.array_equal(model.centers.toarray(),
                                    seed_centers.toarray()))
def main():
    """Run K-means on an input file and print run parameters and timings.

    Parses the command line, loads the training data in SVMLight or
    text/CSV format as a distributed array, fits a ``KMeans`` model,
    and prints ``[centers, arity, block_size, read_time, fit_time]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)

    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    # Drop the trailing label column when the text input is labeled.
    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # BUG FIX: the parser defines "--centers" and "--block_size", but the
    # original code read the nonexistent attributes "args.clusters" and
    # "args.part_size", which raised AttributeError at runtime.
    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)