def test_fit_iris(self): # get some test data iris = ht.load("heat/datasets/data/iris.csv", sep=";") # fit the clusters k = 3 kmeans = ht.cluster.KMeans(n_clusters=k) kmeans.fit(iris) # check whether the results are correct self.assertIsInstance(kmeans.cluster_centers_, ht.DNDarray) self.assertEqual(kmeans.cluster_centers_.shape, (k, iris.shape[1])) # same test with init=kmeans++ kmeans = ht.cluster.KMeans(n_clusters=k, init="kmeans++") kmeans.fit(iris) # check whether the results are correct self.assertIsInstance(kmeans.cluster_centers_, ht.DNDarray) self.assertEqual(kmeans.cluster_centers_.shape, (k, iris.shape[1])) iris_split = ht.load("heat/datasets/data/iris.csv", sep=";", split=1) kmeans = ht.cluster.KMeans(n_clusters=k) with self.assertRaises(NotImplementedError): kmeans.fit(iris_split) kmeans = ht.cluster.KMeans(n_clusters=k, init="random_number") with self.assertRaises(ValueError): kmeans.fit(iris_split)
def test_load(self):
    # HDF5
    if ht.io.supports_hdf5():
        iris = ht.load(self.HDF5_PATH, dataset=self.HDF5_DATASET)
        self.assertIsInstance(iris, ht.DNDarray)
        # shape invariant
        self.assertEqual(iris.shape, self.IRIS.shape)
        self.assertEqual(iris.larray.shape, self.IRIS.shape)
        # data type
        self.assertEqual(iris.dtype, ht.float32)
        self.assertEqual(iris.larray.dtype, torch.float32)
        # content
        self.assertTrue((self.IRIS == iris.larray).all())
    else:
        with self.assertRaises(ValueError):
            _ = ht.load(self.HDF5_PATH, dataset=self.HDF5_DATASET)

    # netCDF
    if ht.io.supports_netcdf():
        iris = ht.load(self.NETCDF_PATH, variable=self.NETCDF_VARIABLE)
        self.assertIsInstance(iris, ht.DNDarray)
        # shape invariant
        self.assertEqual(iris.shape, self.IRIS.shape)
        self.assertEqual(iris.larray.shape, self.IRIS.shape)
        # data type
        self.assertEqual(iris.dtype, ht.float32)
        self.assertEqual(iris.larray.dtype, torch.float32)
        # content
        self.assertTrue((self.IRIS == iris.larray).all())
    else:
        with self.assertRaises(ValueError):
            _ = ht.load(self.NETCDF_PATH, variable=self.NETCDF_VARIABLE)
def test_fit_iris(self): if ht.MPI_WORLD.size <= 4: # todo: fix tests with >7 processes, NaNs appearing in spectral._spectral_embedding # get some test data iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=0) m = 10 # fit the clusters spectral = ht.cluster.Spectral(n_clusters=3, gamma=1.0, metric="rbf", laplacian="fully_connected", n_lanczos=m) spectral.fit(iris) self.assertIsInstance(spectral.labels_, ht.DNDarray) spectral = ht.cluster.Spectral( metric="euclidean", laplacian="eNeighbour", threshold=0.5, boundary="upper", n_lanczos=m, ) labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) spectral = ht.cluster.Spectral( gamma=0.1, metric="rbf", laplacian="eNeighbour", threshold=0.5, boundary="upper", n_lanczos=m, ) labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) kmeans = {"kmeans++": "kmeans++", "max_iter": 30, "tol": -1} spectral = ht.cluster.Spectral(n_clusters=3, gamma=1.0, normalize=True, n_lanczos=m, params=kmeans) labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) # Errors with self.assertRaises(NotImplementedError): spectral = ht.cluster.Spectral(metric="ahalanobis", n_lanczos=m) iris_split = ht.load("heat/datasets/data/iris.csv", sep=";", split=1) spectral = ht.cluster.Spectral(n_lanczos=20) with self.assertRaises(NotImplementedError): spectral.fit(iris_split)
def test_fit_iris_unsplit(self):
    split = 0
    # get some test data
    iris = ht.load("heat/datasets/iris.csv", sep=";", split=split)
    ht.random.seed(1)

    # fit the clusters
    k = 3
    kmedoid = ht.cluster.KMedoids(n_clusters=k, random_state=1)
    kmedoid.fit(iris)

    # check whether the results are correct
    self.assertIsInstance(kmedoid.cluster_centers_, ht.DNDarray)
    self.assertEqual(kmedoid.cluster_centers_.shape, (k, iris.shape[1]))

    # same test with init="kmedoids++"
    kmedoid = ht.cluster.KMedoids(n_clusters=k, init="kmedoids++")
    kmedoid.fit(iris)

    # check whether the results are correct
    self.assertIsInstance(kmedoid.cluster_centers_, ht.DNDarray)
    self.assertEqual(kmedoid.cluster_centers_.shape, (k, iris.shape[1]))

    # check whether the result is actually a data point
    for i in range(kmedoid.cluster_centers_.shape[0]):
        self.assertTrue(
            ht.any(ht.sum(ht.abs(kmedoid.cluster_centers_[i, :] - iris), axis=1) == 0)
        )
def test_load_csv(self):
    csv_file_length = 150
    csv_file_cols = 4
    first_value = torch.tensor(
        [5.1, 3.5, 1.4, 0.2], dtype=torch.float32, device=self.device.torch_device
    )
    tenth_value = torch.tensor(
        [4.9, 3.1, 1.5, 0.1], dtype=torch.float32, device=self.device.torch_device
    )

    a = ht.load_csv(self.CSV_PATH, sep=";")
    self.assertEqual(len(a), csv_file_length)
    self.assertEqual(a.shape, (csv_file_length, csv_file_cols))
    self.assertTrue(torch.equal(a._DNDarray__array[0], first_value))
    self.assertTrue(torch.equal(a._DNDarray__array[9], tenth_value))

    a = ht.load_csv(self.CSV_PATH, sep=";", split=0)
    rank = a.comm.Get_rank()
    expected_gshape = (csv_file_length, csv_file_cols)
    self.assertEqual(a.gshape, expected_gshape)

    counts, _, _ = a.comm.counts_displs_shape(expected_gshape, 0)
    expected_lshape = (counts[rank], csv_file_cols)
    self.assertEqual(a.lshape, expected_lshape)

    if rank == 0:
        self.assertTrue(torch.equal(a._DNDarray__array[0], first_value))

    a = ht.load_csv(self.CSV_PATH, sep=";", header_lines=9, dtype=ht.float32, split=0)
    expected_gshape = (csv_file_length - 9, csv_file_cols)
    counts, _, _ = a.comm.counts_displs_shape(expected_gshape, 0)
    expected_lshape = (counts[rank], csv_file_cols)

    self.assertEqual(a.gshape, expected_gshape)
    self.assertEqual(a.lshape, expected_lshape)
    self.assertEqual(a.dtype, ht.float32)
    if rank == 0:
        self.assertTrue(torch.equal(a._DNDarray__array[0], tenth_value))

    a = ht.load_csv(self.CSV_PATH, sep=";", split=1)
    self.assertEqual(a.shape, (csv_file_length, csv_file_cols))
    self.assertEqual(a.lshape[0], csv_file_length)

    a = ht.load_csv(self.CSV_PATH, sep=";", split=0)
    b = ht.load(self.CSV_PATH, sep=";", split=0)
    self.assertTrue(ht.equal(a, b))

    # test for a csv where the header is longer than the first process's share of lines
    a = ht.load_csv(self.CSV_PATH, sep=";", header_lines=100, split=0)
    self.assertEqual(a.shape, (50, 4))

    with self.assertRaises(TypeError):
        ht.load_csv(12314)
    with self.assertRaises(TypeError):
        ht.load_csv(self.CSV_PATH, sep=11)
    with self.assertRaises(TypeError):
        ht.load_csv(self.CSV_PATH, header_lines="3", sep=";", split=0)
def test_balance_and_lshape_map(self):
    data = ht.zeros((70, 20), split=0, device=ht_device)
    data = data[:50]
    lshape_map = data.create_lshape_map()
    self.assertEqual(sum(lshape_map[..., 0]), 50)
    if sum(data.lshape) == 0:
        self.assertTrue(all(lshape_map[data.comm.rank] == 0))
    data.balance_()
    self.assertTrue(data.is_balanced())

    data = ht.zeros((4, 120), split=1, device=ht_device)
    data = data[:, 40:70]
    lshape_map = data.create_lshape_map()
    self.assertEqual(sum(lshape_map[..., 1]), 30)
    if sum(data.lshape) == 0:
        self.assertTrue(all(lshape_map[data.comm.rank] == 0))
    data.balance_()
    self.assertTrue(data.is_balanced())

    data = ht.zeros((70, 20), split=0, dtype=ht.float64, device=ht_device)
    data = data[:50]
    data.balance_()
    self.assertTrue(data.is_balanced())

    data = ht.zeros((4, 120), split=1, dtype=ht.int64, device=ht_device)
    data = data[:, 40:70]
    data.balance_()
    self.assertTrue(data.is_balanced())

    data = np.loadtxt("heat/datasets/data/iris.csv", delimiter=";")
    htdata = ht.load("heat/datasets/data/iris.csv", sep=";", split=0, device=ht_device)
    self.assertTrue(
        ht.equal(htdata, ht.array(data, split=0, dtype=ht.float, device=ht_device))
    )

    if ht.MPI_WORLD.size > 4:
        rank = ht.MPI_WORLD.rank
        if rank == 2:
            arr = torch.tensor([0, 1])
        elif rank == 3:
            arr = torch.tensor([2, 3, 4, 5])
        elif rank == 4:
            arr = torch.tensor([6, 7, 8, 9])
        else:
            arr = torch.empty([0], dtype=torch.int64)
        a = ht.array(arr, is_split=0, device=ht_device)
        a.balance_()
        comp = ht.arange(10, split=0, device=ht_device)
        self.assertTrue(ht.equal(a, comp))
def test_exceptions(self):
    # get some test data
    iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1)

    # build a clusterer
    k = 3
    kmedoid = ht.cluster.KMedoids(n_clusters=k)

    with self.assertRaises(NotImplementedError):
        kmedoid.fit(iris_split)
    with self.assertRaises(ValueError):
        kmedoid.set_params(foo="bar")
    with self.assertRaises(ValueError):
        kmedoid = ht.cluster.KMedoids(n_clusters=k, init="random_number")
        kmedoid.fit(iris_split)
def test_mean(self):
    array_0_len = 5
    array_1_len = 5
    array_2_len = 5

    x = ht.zeros((2, 3, 4))
    with self.assertRaises(ValueError):
        x.mean(axis=10)
    with self.assertRaises(ValueError):
        x.mean(axis=[4])
    with self.assertRaises(ValueError):
        x.mean(axis=[-4])
    with self.assertRaises(TypeError):
        ht.mean(x, axis="01")
    with self.assertRaises(ValueError):
        ht.mean(x, axis=(0, "10"))
    with self.assertRaises(ValueError):
        ht.mean(x, axis=(0, 0))
    with self.assertRaises(ValueError):
        ht.mean(x, axis=torch.Tensor([0, 0]))

    a = ht.arange(1, 5)
    self.assertEqual(a.mean(), 2.5)

    # ones
    dimensions = []
    for d in [array_0_len, array_1_len, array_2_len]:
        dimensions.extend([d])
        hold = list(range(len(dimensions)))
        hold.append(None)
        for split in hold:  # loop over the split dimensions of the test array
            z = ht.ones(dimensions, split=split)
            res = z.mean()
            total_dims_list = list(z.shape)
            self.assertTrue((res == 1).all())

            for it in range(len(z.shape)):  # loop over the single dimensions for mean
                res = z.mean(axis=it)
                self.assertTrue((res == 1).all())
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q != it
                ]
                if not target_dims:
                    target_dims = ()
                self.assertEqual(res.gshape, tuple(target_dims))
                if z.split is None:
                    sp = None
                else:
                    sp = z.split if it > z.split else z.split - 1
                if it == split:
                    sp = None
                self.assertEqual(res.split, sp)

            loop_list = [
                ",".join(map(str, comb)) for comb in combinations(list(range(len(z.shape))), 2)
            ]
            for it in loop_list:  # loop over the combinations of dimensions for mean
                lp_split = [int(q) for q in it.split(",")]
                res = z.mean(axis=lp_split)
                self.assertTrue((res == 1).all())
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q not in lp_split
                ]
                if not target_dims:
                    target_dims = (1,)
                if res.gshape:
                    self.assertEqual(res.gshape, tuple(target_dims))
                if res.split is not None:
                    if any([split >= x for x in lp_split]):
                        self.assertEqual(res.split, len(target_dims) - 1)
                    else:
                        self.assertEqual(res.split, z.split)

    # values for the iris dataset mean measured by libreoffice calc
    ax0 = ht.array([5.84333333333333, 3.054, 3.75866666666667, 1.19866666666667])
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
        self.assertTrue(ht.allclose(ht.mean(iris), 3.46366666666667))
        self.assertTrue(ht.allclose(ht.mean(iris, axis=0), ax0))
def test_var(self):
    array_0_len = ht.MPI_WORLD.size * 2
    array_1_len = ht.MPI_WORLD.size * 2
    array_2_len = ht.MPI_WORLD.size * 2

    # test raises
    x = ht.zeros((2, 3, 4), device=ht_device)
    with self.assertRaises(TypeError):
        ht.var(x, axis=0, bessel=1)
    with self.assertRaises(ValueError):
        ht.var(x, axis=10)
    with self.assertRaises(TypeError):
        ht.var(x, axis="01")

    a = ht.arange(1, 5, device=ht_device)
    self.assertEqual(a.var(), 1.666666666666666)

    # ones
    dimensions = []
    for d in [array_0_len, array_1_len, array_2_len]:
        dimensions.extend([d])
        hold = list(range(len(dimensions)))
        hold.append(None)
        for split in hold:  # loop over the split dimensions of the test array
            z = ht.ones(dimensions, split=split, device=ht_device)
            res = z.var()
            total_dims_list = list(z.shape)
            self.assertTrue((res == 0).all())

            # loop over the single dimensions for var
            for it in range(len(z.shape)):
                res = z.var(axis=it)
                self.assertTrue(ht.allclose(res, 0))
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q != it
                ]
                if not target_dims:
                    target_dims = ()
                self.assertEqual(res.gshape, tuple(target_dims))
                if z.split is None:
                    sp = None
                else:
                    sp = z.split if it > z.split else z.split - 1
                if it == split:
                    sp = None
                self.assertEqual(res.split, sp)
                if split == it:
                    res = z.var(axis=it)
                    self.assertTrue(ht.allclose(res, 0))

            z = ht.ones(dimensions, split=split, device=ht_device)
            res = z.var(bessel=False)
            self.assertTrue(ht.allclose(res, 0))

    # values for the iris dataset var measured by libreoffice calc
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp, device=ht_device)
        self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147))
def test_var(self):
    array_0_len = ht.MPI_WORLD.size * 2
    array_1_len = ht.MPI_WORLD.size * 2
    array_2_len = ht.MPI_WORLD.size * 2

    # test raises
    x = ht.zeros((2, 3, 4))
    with self.assertRaises(ValueError):
        x.var(axis=10)
    with self.assertRaises(ValueError):
        x.var(axis=[4])
    with self.assertRaises(ValueError):
        x.var(axis=[-4])
    with self.assertRaises(TypeError):
        ht.var(x, axis="01")
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, "10"))
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, 0))
    with self.assertRaises(NotImplementedError):
        ht.var(x, ddof=2)
    with self.assertRaises(ValueError):
        ht.var(x, ddof=-2)
    with self.assertRaises(ValueError):
        ht.var(x, axis=torch.Tensor([0, 0]))

    a = ht.arange(1, 5)
    self.assertEqual(a.var(ddof=1), 1.666666666666666)

    # ones
    dimensions = []
    for d in [array_0_len, array_1_len, array_2_len]:
        dimensions.extend([d])
        hold = list(range(len(dimensions)))
        hold.append(None)
        for split in hold:  # loop over the split dimensions of the test array
            z = ht.ones(dimensions, split=split)
            res = z.var(ddof=0)
            total_dims_list = list(z.shape)
            self.assertTrue((res == 0).all())

            # loop over the single dimensions for var
            for it in range(len(z.shape)):
                res = z.var(axis=it)
                self.assertTrue(ht.allclose(res, 0))
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q != it
                ]
                if not target_dims:
                    target_dims = ()
                self.assertEqual(res.gshape, tuple(target_dims))
                if z.split is None:
                    sp = None
                else:
                    sp = z.split if it > z.split else z.split - 1
                if it == split:
                    sp = None
                self.assertEqual(res.split, sp)
                if split == it:
                    res = z.var(axis=it)
                    self.assertTrue(ht.allclose(res, 0))

            loop_list = [
                ",".join(map(str, comb)) for comb in combinations(list(range(len(z.shape))), 2)
            ]
            for it in loop_list:  # loop over the combinations of dimensions for var
                lp_split = [int(q) for q in it.split(",")]
                res = z.var(axis=lp_split)
                self.assertTrue((res == 0).all())
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q not in lp_split
                ]
                if not target_dims:
                    target_dims = (1,)
                if res.gshape:
                    self.assertEqual(res.gshape, tuple(target_dims))
                if res.split is not None:
                    if any([split >= x for x in lp_split]):
                        self.assertEqual(res.split, len(target_dims) - 1)
                    else:
                        self.assertEqual(res.split, z.split)

    # values for the iris dataset var measured by libreoffice calc
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
        self.assertTrue(ht.allclose(ht.var(iris, ddof=1), 3.90318519755147))
def test_partial_h5_dataset(self):
    # load h5 data and get the total shape
    full_data = ht.load("heat/datasets/iris.h5", dataset="data", split=None)
    target_shape = full_data.shape

    class TestDataset(ht.utils.data.partial_dataset.PartialH5Dataset):
        def __init__(self, file, comm, load, load_len, use_gpus=False):
            super(TestDataset, self).__init__(
                file, comm=comm, initial_load=load, load_length=load_len, use_gpu=use_gpus
            )

        def __getitem__(self, item):
            return self.data[item]

    partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20)
    dl = ht.utils.data.DataLoader(dataset=partial_dset, batch_size=7)
    first_epoch = None
    second_epoch = None
    for epoch in range(2):
        elems = 0
        last_batch = None
        for batch in dl:
            elems += batch.shape[0]
            if last_batch is not None:
                self.assertFalse(torch.allclose(last_batch, batch))
            self.assertEqual(batch.shape, (7, 4))
            last_batch = batch
            if epoch == 0:
                if first_epoch is None:
                    first_epoch = batch
                else:
                    first_epoch = torch.cat((first_epoch, batch), dim=0)
            else:
                if second_epoch is None:
                    second_epoch = batch
                else:
                    second_epoch = torch.cat((second_epoch, batch), dim=0)
        self.assertTrue(elems >= (target_shape[0] - 7) // full_data.comm.size)
    self.assertFalse(torch.allclose(first_epoch, second_epoch))

    partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20, True)
    dl = ht.utils.data.DataLoader(
        dataset=partial_dset, batch_size=7, pin_memory=torch.cuda.is_available()
    )
    first_epoch = None
    second_epoch = None
    for epoch in range(2):
        elems = 0
        last_batch = None
        for batch in dl:
            elems += batch.shape[0]
            if last_batch is not None:
                self.assertFalse(torch.allclose(last_batch, batch))
            self.assertEqual(batch.shape, (7, 4))
            last_batch = batch
            if epoch == 0:
                if first_epoch is None:
                    first_epoch = batch
                else:
                    first_epoch = torch.cat((first_epoch, batch), dim=0)
            else:
                if second_epoch is None:
                    second_epoch = batch
                else:
                    second_epoch = torch.cat((second_epoch, batch), dim=0)
        self.assertTrue(elems >= (target_shape[0] - 7) // full_data.comm.size)
    self.assertFalse(torch.allclose(first_epoch, second_epoch))
#!/usr/bin/env python

import argparse
import time

import heat as ht

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HeAT lasso gpu benchmark")
    parser.add_argument("--file", type=str, help="file to benchmark")
    parser.add_argument("--dataset", type=str, help="dataset within file to benchmark")
    parser.add_argument("--labels", type=str, help="dataset within file pointing to the labels")
    parser.add_argument("--trials", type=int, help="number of benchmark trials")
    parser.add_argument("--iterations", type=int, help="number of lasso iterations")
    args = parser.parse_args()

    ht.use_device("gpu")

    print("Loading data... {}[{}]".format(args.file, args.dataset), end="")
    data = ht.load(args.file, args.dataset, split=0)
    labels = ht.load(args.file, args.labels, split=0)
    print("\t[OK]")

    for trial in range(args.trials):
        print("Trial {}...".format(trial), end="")
        lasso = ht.regression.Lasso(max_iter=args.iterations, tol=-1.0)

        start = time.perf_counter()
        lasso.fit(data, labels)
        end = time.perf_counter()
        print("\t{}s".format(end - start))
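# A hypothetical invocation of the benchmark above; the script name, file name,
# dataset names, and process count are placeholders, assuming an MPI setup and
# an HDF5 file that contains both the feature matrix and the labels:
#
#   mpirun -np 4 python lasso_gpu_benchmark.py --file data.h5 --dataset data \
#       --labels labels --trials 3 --iterations 100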
def test_load_exception(self):
    # correct extension, file does not exist
    if ht.io.supports_hdf5():
        with self.assertRaises(IOError):
            ht.load("foo.h5", "data")
    else:
        with self.assertRaises(ValueError):
            ht.load("foo.h5", "data")

    if ht.io.supports_netcdf():
        with self.assertRaises(IOError):
            ht.load("foo.nc", "data")
    else:
        with self.assertRaises(ValueError):
            ht.load("foo.nc", "data")

    # unknown file extension
    with self.assertRaises(ValueError):
        ht.load(os.path.join(os.getcwd(), "heat/datasets/iris.json"), "data")
    with self.assertRaises(ValueError):
        ht.load("iris", "data")
import argparse
import time

import heat as ht

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HeAT statistical moments cpu benchmark")
    parser.add_argument("--file", type=str, help="file to benchmark")
    parser.add_argument("--dataset", type=str, help="dataset within file to benchmark")
    parser.add_argument("--trials", type=int, help="number of benchmark trials")
    args = parser.parse_args()

    ht.use_device("cpu")

    print("Loading data... {}[{}]".format(args.file, args.dataset), end="")
    data = ht.load(args.file, dataset=args.dataset, split=0)
    print("\t[OK]")

    for function in [ht.mean, ht.std]:
        for axis in [None, 0, 1]:
            print("{} axis={}".format(function.__name__, axis))
            for trial in range(args.trials):
                print("Trial {}...".format(trial), end="")

                start = time.perf_counter()
                function(data, axis=axis)
                end = time.perf_counter()
                print("\t{}s".format(end - start))
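# A hypothetical invocation of the benchmark above; the script name, file name,
# dataset name, and process count are placeholders, assuming an MPI setup and
# an HDF5/netCDF file readable by ht.load:
#
#   mpirun -np 4 python moments_cpu_benchmark.py --file data.h5 --dataset data --trials 3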
def test_load_exception(self):
    # correct extension, file does not exist
    if ht.io.supports_hdf5():
        with self.assertRaises(IOError):
            ht.load('foo.h5', 'data')
    else:
        with self.assertRaises(ValueError):
            ht.load('foo.h5', 'data')

    if ht.io.supports_netcdf():
        with self.assertRaises(IOError):
            ht.load('foo.nc', 'data')
    else:
        with self.assertRaises(ValueError):
            ht.load('foo.nc', 'data')

    # unknown file extension
    with self.assertRaises(ValueError):
        ht.load(os.path.join(os.getcwd(), 'heat/datasets/data/iris.csv'), 'data')
    with self.assertRaises(ValueError):
        ht.load('iris', 'data')
def test_fit_iris(self):
    # load sklearn train/test sets and resulting probabilities
    X_train = ht.load(
        "heat/datasets/data/iris_X_train.csv", sep=";", dtype=ht.float64, device=ht_device
    )
    X_test = ht.load(
        "heat/datasets/data/iris_X_test.csv", sep=";", dtype=ht.float64, device=ht_device
    )
    y_train = ht.load(
        "heat/datasets/data/iris_y_train.csv", sep=";", dtype=ht.int64, device=ht_device
    ).squeeze()
    y_test = ht.load(
        "heat/datasets/data/iris_y_test.csv", sep=";", dtype=ht.int64, device=ht_device
    ).squeeze()
    y_pred_proba_sklearn = ht.load(
        "heat/datasets/data/iris_y_pred_proba.csv", sep=";", dtype=ht.float64, device=ht_device
    )

    # test ht.GaussianNB
    from heat.naive_bayes import GaussianNB

    gnb_heat = GaussianNB()
    self.assertEqual(gnb_heat.priors, None)
    with self.assertRaises(AttributeError):
        gnb_heat.classes_
    with self.assertRaises(AttributeError):
        gnb_heat.class_prior_
    with self.assertRaises(AttributeError):
        gnb_heat.epsilon_

    # test GaussianNB locally, no weights
    local_fit = gnb_heat.fit(X_train, y_train)
    self.assert_array_equal(gnb_heat.classes_, np.array([0, 1, 2]))
    local_fit_no_classes = gnb_heat.partial_fit(X_train, y_train, classes=None)
    y_pred_local = local_fit_no_classes.predict(X_test)
    y_pred_proba_local = local_fit.predict_proba(X_test)
    sklearn_class_prior = np.array([0.38666667, 0.26666667, 0.34666667])
    sklearn_epsilon = np.array([3.6399040000000003e-09])
    sklearn_theta = ht.array(
        [
            [4.97586207, 3.35862069, 1.44827586, 0.23448276],
            [5.935, 2.71, 4.185, 1.3],
            [6.77692308, 3.09230769, 5.73461538, 2.10769231],
        ],
        dtype=X_train.dtype,
        device=ht_device,
    )
    sklearn_sigma = ht.array(
        [
            [0.10321047, 0.13208086, 0.01629013, 0.00846612],
            [0.256275, 0.0829, 0.255275, 0.046],
            [0.38869823, 0.10147929, 0.31303255, 0.04763314],
        ],
        dtype=X_train.dtype,
        device=ht_device,
    )
    self.assertIsInstance(y_pred_local, ht.DNDarray)
    self.assertEqual((y_pred_local != y_test).sum(), ht.array(4))
    self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
    self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
    self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
    self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
    self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local, atol=1e-1).all())

    # test GaussianNB when sample_weight is not None, sample_weight not distributed
    sample_weight = ht.ones((y_train.gshape[0]), dtype=ht.float32, split=None)
    local_fit_weight = gnb_heat.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_local_weight = local_fit_weight.predict(X_test)
    y_pred_proba_local_weight = local_fit_weight.predict_proba(X_test)
    self.assertIsInstance(y_pred_local_weight, ht.DNDarray)
    self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
    self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
    self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
    self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
    self.assert_array_equal(y_pred_local_weight, y_pred_local.numpy())
    self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local_weight).all())

    # test GaussianNB, data and labels distributed along split axis 0
    X_train_split = ht.resplit(X_train, axis=0)
    X_test_split = ht.resplit(X_test, axis=0)
    y_train_split = ht.resplit(y_train, axis=0)
    y_test_split = ht.resplit(y_test, axis=0)
    y_pred_split = gnb_heat.fit(X_train_split, y_train_split).predict(X_test_split)
    self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
    self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
    self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
    self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
    self.assert_array_equal(y_pred_split, y_pred_local.numpy())
    self.assertEqual((y_pred_split != y_test_split).sum(), ht.array(4))

    sample_weight_split = ht.ones(y_train_split.gshape[0], dtype=ht.float32, split=0)
    y_pred_split_weight = gnb_heat.fit(
        X_train_split, y_train_split, sample_weight=sample_weight_split
    ).predict(X_test_split)
    self.assertIsInstance(y_pred_split_weight, ht.DNDarray)
    self.assert_array_equal(y_pred_split_weight, y_pred_split.numpy())

    # test exceptions
    X_torch = torch.ones(75, 4)
    y_np = np.zeros(75)
    y_2D = ht.ones((75, 1), split=None, device=ht_device)
    weights_torch = torch.zeros(75)
    X_3D = ht.ones((75, 4, 4), split=None, device=ht_device)
    X_wrong_size = ht.ones((75, 5), split=None, device=ht_device)
    y_wrong_size = ht.zeros(76, device=ht_device)
    X_train_split = ht.resplit(X_train, axis=0)
    y_train_split = ht.resplit(y_train, axis=0)
    weights_2D_split = ht.ones((75, 1), split=0, device=ht_device)
    weights_wrong_size = ht.ones(76, device=ht_device)
    priors_wrong_shape = ht.random.randn(4, device=ht_device)
    priors_wrong_sum = ht.random.randn(3, dtype=ht.float32, device=ht_device)
    priors_wrong_sign = ht.array([-0.3, 0.7, 0.6])
    wrong_classes = ht.array([3, 4, 5])

    with self.assertRaises(ValueError):
        gnb_heat.fit(X_torch, y_train)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_np)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_2D)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_train, sample_weight=weights_torch)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_3D, y_train)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_wrong_size)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_train)
        gnb_heat.predict(X_torch)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_train)
        gnb_heat.partial_fit(X_wrong_size, y_train)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_train)
        gnb_heat.partial_fit(X_train, y_train, classes=wrong_classes)
    with self.assertRaises(ValueError):
        gnb_heat.classes_ = None
        gnb_heat.partial_fit(X_train, y_train, classes=None)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train_split, y_train_split, sample_weight=weights_2D_split)
    with self.assertRaises(ValueError):
        gnb_heat.fit(X_train, y_train, sample_weight=weights_wrong_size)
    with self.assertRaises(ValueError):
        gnb_heat.priors = priors_wrong_shape
        gnb_heat.fit(X_train, y_train)
    with self.assertRaises(ValueError):
        gnb_heat.priors = priors_wrong_sum
        gnb_heat.fit(X_train, y_train)
    with self.assertRaises(ValueError):
        gnb_heat.priors = priors_wrong_sign
        gnb_heat.fit(X_train, y_train)
def test_cov(self):
    x = ht.array([[0, 2], [1, 1], [2, 0]], dtype=ht.float, split=1).T
    if x.comm.size < 3:
        cov = ht.cov(x)
        actual = ht.array([[1, -1], [-1, 1]], split=0)
        self.assertTrue(ht.equal(cov, actual))

    data = np.loadtxt("heat/datasets/data/iris.csv", delimiter=";")
    np_cov = np.cov(data[:, 0], data[:, 1:3], rowvar=False)

    htdata = ht.load("heat/datasets/data/iris.csv", sep=";", split=0)
    ht_cov = ht.cov(htdata[:, 0], htdata[:, 1:3], rowvar=False)
    comp = ht.array(np_cov, dtype=ht.float)
    self.assertTrue(ht.allclose(comp - ht_cov, 0, atol=1e-4))

    np_cov = np.cov(data, rowvar=False)
    ht_cov = ht.cov(htdata, rowvar=False)
    self.assertTrue(ht.allclose(ht.array(np_cov, dtype=ht.float) - ht_cov, 0, atol=1e-4))

    np_cov = np.cov(data, rowvar=False, ddof=1)
    ht_cov = ht.cov(htdata, rowvar=False, ddof=1)
    self.assertTrue(ht.allclose(ht.array(np_cov, dtype=ht.float) - ht_cov, 0, atol=1e-4))

    np_cov = np.cov(data, rowvar=False, bias=True)
    ht_cov = ht.cov(htdata, rowvar=False, bias=True)
    self.assertTrue(ht.allclose(ht.array(np_cov, dtype=ht.float) - ht_cov, 0, atol=1e-4))

    if 1 < x.comm.size < 5:
        htdata = ht.load("heat/datasets/data/iris.csv", sep=";", split=1)
        np_cov = np.cov(data, rowvar=False)
        ht_cov = ht.cov(htdata, rowvar=False)
        self.assertTrue(ht.allclose(ht.array(np_cov, dtype=ht.float), ht_cov, atol=1e-4))

        np_cov = np.cov(data, data, rowvar=True)
        htdata = ht.load("heat/datasets/data/iris.csv", sep=";", split=0)
        ht_cov = ht.cov(htdata, htdata, rowvar=True)
        self.assertTrue(ht.allclose(ht.array(np_cov, dtype=ht.float), ht_cov, atol=1e-4))

    htdata = ht.load("heat/datasets/data/iris.csv", sep=";", split=0)
    with self.assertRaises(RuntimeError):
        ht.cov(htdata[1:], rowvar=False)
    with self.assertRaises(RuntimeError):
        ht.cov(htdata, htdata[1:], rowvar=False)
    with self.assertRaises(TypeError):
        ht.cov(np_cov)
    with self.assertRaises(TypeError):
        ht.cov(htdata, np_cov)
    with self.assertRaises(TypeError):
        ht.cov(htdata, ddof="str")
    with self.assertRaises(ValueError):
        ht.cov(ht.zeros((1, 2, 3)))
    with self.assertRaises(ValueError):
        ht.cov(htdata, ht.zeros((1, 2, 3)))
    with self.assertRaises(ValueError):
        ht.cov(htdata, ddof=10000)
ht.save(a, 'data.h5', 'DATA', mode='w')
comm.Barrier()

print0(rank, a.split)
print0(rank, type(a.larray), a.larray.shape)
print0(rank, a.gshape, a.lshape, a.split)
print(rank, a.larray.dtype, a.larray.device, a.larray.layout)

# replace the local chunk with random data; the column count is scaled by argv[1]
# cache = np.fromfile('')
cache = np.random.rand(1000, 1000 * int(sys.argv[1]))
a.larray = torch.from_numpy(cache).cuda(rank)
print0(rank, a.gshape, a.lshape, a.split)
print(rank, a.larray.dtype, a.larray.device, a.larray.layout)

# time loading the saved file back in on rank 0
t1 = time.time()
if rank == 0:
    b = ht.load('data.h5', dataset='DATA')
t2 = time.time()
comm.Barrier()
print0(
    rank,
    "the loading time is {:.2f} s for a {:.2f} MB matrix.".format(
        t2 - t1, size * size / 1024. / 1024.
    ),
)
if rank == 0:
    print(rank, b.split)

# read a lattice configuration of big-endian doubles and recombine the
# trailing real/imaginary axis into a complex-valued array
x = np.fromfile('rbc_conf_3264_m0.004_0.03_000290', dtype='>f8')
x = x.reshape(4, 3, 3, 2, 64, 32, 32, 32)
x = x.transpose((4, 5, 6, 7, 0, 2, 1, 3))
x = x[..., 0] + x[..., 1] * 1j
print(x.shape)