Example #1
0
    def test_fit_iris(self):
        # fits KMeans on the iris CSV and checks the type and shape of the resulting
        # cluster centers, then checks the two error paths exercised on split=1 data
        n_clusters = 3
        samples = ht.load("heat/datasets/data/iris.csv", sep=";")

        # first pass uses the default initialisation, second pass init="kmeans++"
        for extra_kwargs in ({}, {"init": "kmeans++"}):
            model = ht.cluster.KMeans(n_clusters=n_clusters, **extra_kwargs)
            model.fit(samples)

            centers = model.cluster_centers_
            self.assertIsInstance(centers, ht.DNDarray)
            self.assertEqual(centers.shape, (n_clusters, samples.shape[1]))

        # fitting data loaded with split=1 is expected to raise NotImplementedError
        samples_split_1 = ht.load("heat/datasets/data/iris.csv", sep=";", split=1)
        model = ht.cluster.KMeans(n_clusters=n_clusters)
        with self.assertRaises(NotImplementedError):
            model.fit(samples_split_1)

        # an unknown init value is expected to raise ValueError on fit
        model = ht.cluster.KMeans(n_clusters=n_clusters, init="random_number")
        with self.assertRaises(ValueError):
            model.fit(samples_split_1)
Example #2
0
    def test_load(self):
        # loads the iris data through ht.load from HDF5 and netCDF; when a backend is
        # reported as unsupported, ht.load is expected to raise ValueError instead

        def check_against_reference(loaded):
            # result type
            self.assertIsInstance(loaded, ht.DNDarray)
            # global and local shape both equal the reference shape
            reference_shape = self.IRIS.shape
            self.assertEqual(loaded.shape, reference_shape)
            self.assertEqual(loaded.larray.shape, reference_shape)
            # heat dtype and the dtype of the local torch tensor
            self.assertEqual(loaded.dtype, ht.float32)
            self.assertEqual(loaded.larray.dtype, torch.float32)
            # element-wise content
            self.assertTrue((self.IRIS == loaded.larray).all())

        # HDF5
        if not ht.io.supports_hdf5():
            with self.assertRaises(ValueError):
                _ = ht.load(self.HDF5_PATH, dataset=self.HDF5_DATASET)
        else:
            check_against_reference(ht.load(self.HDF5_PATH, dataset="data"))

        # netCDF
        if not ht.io.supports_netcdf():
            with self.assertRaises(ValueError):
                _ = ht.load(self.NETCDF_PATH, variable=self.NETCDF_VARIABLE)
        else:
            check_against_reference(ht.load(self.NETCDF_PATH, variable=self.NETCDF_VARIABLE))
Example #3
0
    def test_fit_iris(self):
        # runs ht.cluster.Spectral on the iris CSV (split=0) with several configurations
        # and checks that the produced labels are DNDarrays, then checks two error paths
        #
        # todo: fix tests with >7 processes, NaNs appearing in spectral._spectral_embedding
        # NOTE(review): the todo above mentions ">7 processes" but the guard below skips
        # the whole test for more than 4 processes -- confirm which limit is intended
        if ht.MPI_WORLD.size > 4:
            return

        lanczos_steps = 10
        samples = ht.load("heat/datasets/data/iris.csv", sep=";", split=0)

        # rbf metric on a fully connected graph; labels are read from the fitted object
        model = ht.cluster.Spectral(
            n_clusters=3,
            gamma=1.0,
            metric="rbf",
            laplacian="fully_connected",
            n_lanczos=lanczos_steps,
        )
        model.fit(samples)
        self.assertIsInstance(model.labels_, ht.DNDarray)

        # euclidean metric with the "eNeighbour" laplacian
        model = ht.cluster.Spectral(
            metric="euclidean",
            laplacian="eNeighbour",
            threshold=0.5,
            boundary="upper",
            n_lanczos=lanczos_steps,
        )
        self.assertIsInstance(model.fit_predict(samples), ht.DNDarray)

        # rbf metric with the "eNeighbour" laplacian
        model = ht.cluster.Spectral(
            gamma=0.1,
            metric="rbf",
            laplacian="eNeighbour",
            threshold=0.5,
            boundary="upper",
            n_lanczos=lanczos_steps,
        )
        self.assertIsInstance(model.fit_predict(samples), ht.DNDarray)

        # normalized variant with an explicit params dictionary
        kmeans_params = {"kmeans++": "kmeans++", "max_iter": 30, "tol": -1}
        model = ht.cluster.Spectral(
            n_clusters=3,
            gamma=1.0,
            normalize=True,
            n_lanczos=lanczos_steps,
            params=kmeans_params,
        )
        self.assertIsInstance(model.fit_predict(samples), ht.DNDarray)

        # Errors: this metric name is expected to be rejected at construction
        with self.assertRaises(NotImplementedError):
            ht.cluster.Spectral(metric="ahalanobis", n_lanczos=lanczos_steps)

        # Errors: fitting data loaded with split=1 is expected to be rejected
        samples_split_1 = ht.load("heat/datasets/data/iris.csv", sep=";", split=1)
        model = ht.cluster.Spectral(n_lanczos=20)
        with self.assertRaises(NotImplementedError):
            model.fit(samples_split_1)
Example #4
0
    def test_fit_iris_unsplit(self):
        # fits KMedoids on the iris CSV, checks type and shape of the cluster centers and
        # finally that every center coincides with one of the loaded samples
        # NOTE(review): despite "unsplit" in the name the data is loaded with split=0 --
        # confirm whether split=None was intended
        samples = ht.load("heat/datasets/iris.csv", sep=";", split=0)
        ht.random.seed(1)
        n_clusters = 3

        # first pass with a fixed random_state, second pass with init="kmedoids++"
        for extra_kwargs in ({"random_state": 1}, {"init": "kmedoids++"}):
            model = ht.cluster.KMedoids(n_clusters=n_clusters, **extra_kwargs)
            model.fit(samples)

            self.assertIsInstance(model.cluster_centers_, ht.DNDarray)
            self.assertEqual(model.cluster_centers_.shape, (n_clusters, samples.shape[1]))

        # each center of the last fitted model must have zero L1 distance to some sample
        for row in range(model.cluster_centers_.shape[0]):
            l1_distance = ht.sum(ht.abs(model.cluster_centers_[row, :] - samples), axis=1)
            self.assertTrue(ht.any(l1_distance == 0))
Example #5
0
File: test_io.py Project: mtar/heat
    def test_load_csv(self):
        # exercises ht.load_csv on self.CSV_PATH: shapes, content of the first and tenth
        # row, header skipping, both split axes, agreement with ht.load and type errors
        n_rows, n_cols = 150, 4
        full_shape = (n_rows, n_cols)
        torch_device = self.device.torch_device
        row_0 = torch.tensor([5.1, 3.5, 1.4, 0.2], dtype=torch.float32, device=torch_device)
        row_9 = torch.tensor([4.9, 3.1, 1.5, 0.1], dtype=torch.float32, device=torch_device)

        # without split
        table = ht.load_csv(self.CSV_PATH, sep=";")
        self.assertEqual(len(table), n_rows)
        self.assertEqual(table.shape, full_shape)
        self.assertTrue(torch.equal(table._DNDarray__array[0], row_0))
        self.assertTrue(torch.equal(table._DNDarray__array[9], row_9))

        # split=0: the local shape is compared with the communicator's counts
        table = ht.load_csv(self.CSV_PATH, sep=";", split=0)
        rank = table.comm.Get_rank()
        self.assertEqual(table.gshape, full_shape)

        counts, _, _ = table.comm.counts_displs_shape(full_shape, 0)
        self.assertEqual(table.lshape, (counts[rank], n_cols))

        if rank == 0:
            self.assertTrue(torch.equal(table._DNDarray__array[0], row_0))

        # skipping nine header lines makes the tenth row the first one
        table = ht.load_csv(self.CSV_PATH, sep=";", header_lines=9, dtype=ht.float32, split=0)
        shortened_shape = (n_rows - 9, n_cols)
        counts, _, _ = table.comm.counts_displs_shape(shortened_shape, 0)

        self.assertEqual(table.gshape, shortened_shape)
        self.assertEqual(table.lshape, (counts[rank], n_cols))
        self.assertEqual(table.dtype, ht.float32)
        if rank == 0:
            self.assertTrue(torch.equal(table._DNDarray__array[0], row_9))

        # split=1: every process holds all rows
        table = ht.load_csv(self.CSV_PATH, sep=";", split=1)
        self.assertEqual(table.shape, full_shape)
        self.assertEqual(table.lshape[0], n_rows)

        # ht.load must give the same result as ht.load_csv
        via_load_csv = ht.load_csv(self.CSV_PATH, sep=";", split=0)
        via_load = ht.load(self.CSV_PATH, sep=";", split=0)
        self.assertTrue(ht.equal(via_load_csv, via_load))

        # Test for csv where the header is longer than the first process's share of lines
        table = ht.load_csv(self.CSV_PATH, sep=";", header_lines=100, split=0)
        self.assertEqual(table.shape, (50, 4))

        # wrongly typed path, separator and header_lines
        with self.assertRaises(TypeError):
            ht.load_csv(12314)
        with self.assertRaises(TypeError):
            ht.load_csv(self.CSV_PATH, sep=11)
        with self.assertRaises(TypeError):
            ht.load_csv(self.CSV_PATH, header_lines="3", sep=";", split=0)
Example #6
0
    def test_balance_and_lshape_map(self):
        # slices distributed arrays, inspects create_lshape_map() and verifies that
        # balance_() leaves the array in a state where is_balanced() is true

        # split=0: keep the first 50 of 70 entries along axis 0
        rows = ht.zeros((70, 20), split=0, device=ht_device)[:50]
        row_map = rows.create_lshape_map()
        self.assertEqual(sum(row_map[..., 0]), 50)
        if sum(rows.lshape) == 0:
            self.assertTrue(all(row_map[rows.comm.rank] == 0))
        rows.balance_()
        self.assertTrue(rows.is_balanced())

        # split=1: keep entries 40..69 of 120 along axis 1
        cols = ht.zeros((4, 120), split=1, device=ht_device)[:, 40:70]
        col_map = cols.create_lshape_map()
        self.assertEqual(sum(col_map[..., 1]), 30)
        if sum(cols.lshape) == 0:
            self.assertTrue(all(col_map[cols.comm.rank] == 0))
        cols.balance_()
        self.assertTrue(cols.is_balanced())

        # same slices with explicit dtypes, only the balancing itself is checked
        rows64 = ht.zeros((70, 20), split=0, dtype=ht.float64, device=ht_device)[:50]
        rows64.balance_()
        self.assertTrue(rows64.is_balanced())

        cols64 = ht.zeros((4, 120), split=1, dtype=ht.int64, device=ht_device)[:, 40:70]
        cols64.balance_()
        self.assertTrue(cols64.is_balanced())

        # the CSV loaded by heat must equal the numpy-loaded data wrapped in ht.array
        reference = np.loadtxt("heat/datasets/data/iris.csv", delimiter=";")
        loaded = ht.load("heat/datasets/data/iris.csv", sep=";", split=0, device=ht_device)
        expected = ht.array(reference, split=0, dtype=ht.float, device=ht_device)
        self.assertTrue(ht.equal(loaded, expected))

        # with more than four processes: only ranks 2, 3 and 4 hold data before balancing
        if ht.MPI_WORLD.size > 4:
            chunk_by_rank = {2: [0, 1], 3: [2, 3, 4, 5], 4: [6, 7, 8, 9]}
            rank = ht.MPI_WORLD.rank
            if rank in chunk_by_rank:
                local = torch.tensor(chunk_by_rank[rank])
            else:
                local = torch.empty([0], dtype=torch.int64)
            uneven = ht.array(local, is_split=0, device=ht_device)
            uneven.balance_()
            expected = ht.arange(10, split=0, device=ht_device)

            self.assertTrue(ht.equal(uneven, expected))
Example #7
0
    def test_exceptions(self):
        # checks the error paths of KMedoids using iris data loaded with split=1
        n_clusters = 3
        samples_split_1 = ht.load("heat/datasets/iris.csv", sep=";", split=1)
        model = ht.cluster.KMedoids(n_clusters=n_clusters)

        # fitting the split=1 data
        with self.assertRaises(NotImplementedError):
            model.fit(samples_split_1)
        # setting an unknown parameter
        with self.assertRaises(ValueError):
            model.set_params(foo="bar")
        # unknown init value; construction and fit both stay inside the context so that
        # a ValueError from either step satisfies the assertion
        with self.assertRaises(ValueError):
            ht.cluster.KMedoids(n_clusters=n_clusters, init="random_number").fit(samples_split_1)
Example #8
0
    def test_fit_iris(self):
        # fits KMeans on the iris CSV twice and checks the type and shape of the
        # resulting cluster centers each time
        n_clusters = 3
        samples = ht.load("heat/datasets/data/iris.csv", sep=";")

        # first pass uses the default initialisation, second pass init="kmeans++"
        for extra_kwargs in ({}, {"init": "kmeans++"}):
            model = ht.cluster.KMeans(n_clusters=n_clusters, **extra_kwargs)
            model.fit(samples)

            centers = model.cluster_centers_
            self.assertIsInstance(centers, ht.DNDarray)
            self.assertEqual(centers.shape, (n_clusters, samples.shape[1]))
Example #9
0
    def test_mean(self):
        # checks mean(): rejected axis arguments, results/shapes/splits on arrays of ones
        # for every split and axis choice, and reference values on the iris CSV
        extent = 5

        # invalid axis arguments, first through the method, then through ht.mean
        probe = ht.zeros((2, 3, 4))
        for bad_axis in (10, [4], [-4]):
            with self.assertRaises(ValueError):
                probe.mean(axis=bad_axis)
        for expected_error, bad_axis in (
            (TypeError, "01"),
            (ValueError, (0, "10")),
            (ValueError, (0, 0)),
            (ValueError, torch.Tensor([0, 0])),
        ):
            with self.assertRaises(expected_error):
                ht.mean(probe, axis=bad_axis)

        self.assertEqual(ht.arange(1, 5).mean(), 2.5)

        # ones: grow the test array from one to three dimensions
        dimensions = []
        for _ in range(3):
            dimensions.append(extent)
            # every axis of the current array plus "no split"
            split_choices = list(range(len(dimensions))) + [None]
            for split in split_choices:
                z = ht.ones(dimensions, split=split)
                res = z.mean()
                shape_list = list(z.shape)
                self.assertTrue((res == 1).all())

                # mean over each single axis
                for axis in range(len(z.shape)):
                    res = z.mean(axis=axis)
                    self.assertTrue((res == 1).all())
                    remaining = tuple(dim for pos, dim in enumerate(shape_list) if pos != axis)
                    self.assertEqual(res.gshape, remaining)
                    # expected split: gone when reducing over the split axis, shifted
                    # down by one when the reduced axis is not behind the split axis
                    if z.split is None or axis == split:
                        expected_split = None
                    elif axis > z.split:
                        expected_split = z.split
                    else:
                        expected_split = z.split - 1
                    self.assertEqual(res.split, expected_split)

                # mean over each pair of axes
                for pair in combinations(range(len(z.shape)), 2):
                    axes = list(pair)
                    res = z.mean(axis=axes)
                    self.assertTrue((res == 1).all())
                    remaining = [dim for pos, dim in enumerate(shape_list) if pos not in axes]
                    if not remaining:
                        remaining = (1, )
                    if res.gshape:
                        self.assertEqual(res.gshape, tuple(remaining))
                    if res.split is not None:
                        if any(split >= reduced for reduced in axes):
                            self.assertEqual(res.split, len(remaining) - 1)
                        else:
                            self.assertEqual(res.split, z.split)

        # values for the iris dataset mean measured by libreoffice calc
        column_means = ht.array([5.84333333333333, 3.054, 3.75866666666667, 1.19866666666667])
        for sp in (None, 0, 1):
            iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
            self.assertTrue(ht.allclose(ht.mean(iris), 3.46366666666667))
            self.assertTrue(ht.allclose(ht.mean(iris, axis=0), column_means))
Example #10
0
    def test_var(self):
        # checks var(): rejected arguments, results/shapes/splits on arrays of ones for
        # every split and axis choice, and a reference value on the iris CSV
        extent = ht.MPI_WORLD.size * 2

        # test raises
        probe = ht.zeros((2, 3, 4), device=ht_device)
        for expected_error, bad_kwargs in (
            (TypeError, {"axis": 0, "bessel": 1}),
            (ValueError, {"axis": 10}),
            (TypeError, {"axis": "01"}),
        ):
            with self.assertRaises(expected_error):
                ht.var(probe, **bad_kwargs)

        self.assertEqual(ht.arange(1, 5, device=ht_device).var(), 1.666666666666666)

        # ones: grow the test array from one to three dimensions
        dimensions = []
        for _ in range(3):
            dimensions.append(extent)
            # every axis of the current array plus "no split"
            split_choices = list(range(len(dimensions))) + [None]
            for split in split_choices:
                z = ht.ones(dimensions, split=split, device=ht_device)
                res = z.var()
                shape_list = list(z.shape)
                self.assertTrue((res == 0).all())

                # var over each single axis
                for axis in range(len(z.shape)):
                    res = z.var(axis=axis)
                    self.assertTrue(ht.allclose(res, 0))
                    remaining = tuple(dim for pos, dim in enumerate(shape_list) if pos != axis)
                    self.assertEqual(res.gshape, remaining)
                    # expected split: gone when reducing over the split axis, shifted
                    # down by one when the reduced axis is not behind the split axis
                    if z.split is None or axis == split:
                        expected_split = None
                    elif axis > z.split:
                        expected_split = z.split
                    else:
                        expected_split = z.split - 1
                    self.assertEqual(res.split, expected_split)
                    # the reduction over the split axis is computed a second time
                    if split == axis:
                        res = z.var(axis=axis)
                        self.assertTrue(ht.allclose(res, 0))

                # full reduction with bessel=False on a fresh array
                z = ht.ones(dimensions, split=split, device=ht_device)
                res = z.var(bessel=False)
                self.assertTrue(ht.allclose(res, 0))

        # values for the iris dataset var measured by libreoffice calc
        for sp in (None, 0, 1):
            iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp, device=ht_device)
            self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147))
Example #11
0
    def test_var(self):
        # checks var(): rejected arguments, results/shapes/splits on arrays of ones for
        # every split and axis choice (single axes and pairs), and a reference value on
        # the iris CSV
        extent = ht.MPI_WORLD.size * 2

        # test raises, first through the method, then through the module functions
        probe = ht.zeros((2, 3, 4))
        for bad_axis in (10, [4], [-4]):
            with self.assertRaises(ValueError):
                probe.var(axis=bad_axis)
        for expected_error, bad_kwargs in (
            (TypeError, {"axis": "01"}),
            (ValueError, {"axis": (0, "10")}),
            (ValueError, {"axis": (0, 0)}),
            (NotImplementedError, {"ddof": 2}),
            (ValueError, {"ddof": -2}),
        ):
            with self.assertRaises(expected_error):
                ht.var(probe, **bad_kwargs)
        # NOTE(review): this check calls ht.mean inside the var test, which looks like a
        # copy/paste leftover -- confirm whether ht.var was intended here
        with self.assertRaises(ValueError):
            ht.mean(probe, axis=torch.Tensor([0, 0]))

        self.assertEqual(ht.arange(1, 5).var(ddof=1), 1.666666666666666)

        # ones: grow the test array from one to three dimensions
        dimensions = []
        for _ in range(3):
            dimensions.append(extent)
            # every axis of the current array plus "no split"
            split_choices = list(range(len(dimensions))) + [None]
            for split in split_choices:
                z = ht.ones(dimensions, split=split)
                res = z.var(ddof=0)
                shape_list = list(z.shape)
                self.assertTrue((res == 0).all())

                # var over each single axis
                for axis in range(len(z.shape)):
                    res = z.var(axis=axis)
                    self.assertTrue(ht.allclose(res, 0))
                    remaining = tuple(dim for pos, dim in enumerate(shape_list) if pos != axis)
                    self.assertEqual(res.gshape, remaining)
                    # expected split: gone when reducing over the split axis, shifted
                    # down by one when the reduced axis is not behind the split axis
                    if z.split is None or axis == split:
                        expected_split = None
                    elif axis > z.split:
                        expected_split = z.split
                    else:
                        expected_split = z.split - 1
                    self.assertEqual(res.split, expected_split)
                    # the reduction over the split axis is computed a second time
                    if split == axis:
                        res = z.var(axis=axis)
                        self.assertTrue(ht.allclose(res, 0))

                # var over each pair of axes
                for pair in combinations(range(len(z.shape)), 2):
                    axes = list(pair)
                    res = z.var(axis=axes)
                    self.assertTrue((res == 0).all())
                    remaining = [dim for pos, dim in enumerate(shape_list) if pos not in axes]
                    if not remaining:
                        remaining = (1, )
                    if res.gshape:
                        self.assertEqual(res.gshape, tuple(remaining))
                    if res.split is not None:
                        if any(split >= reduced for reduced in axes):
                            self.assertEqual(res.split, len(remaining) - 1)
                        else:
                            self.assertEqual(res.split, z.split)

        # values for the iris dataset var measured by libreoffice calc
        for sp in (None, 0, 1):
            iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
            self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147))
Example #12
0
    def test_partial_h5_dataset(self):
        # iterates a PartialH5Dataset subclass through ht.utils.data.DataLoader for two
        # epochs, once with the default use_gpu=False and once with use_gpu=True

        # load h5 data and get the total number of entries along axis 0
        full_data = ht.load("heat/datasets/iris.h5", dataset="data", split=None)
        total_rows = full_data.shape[0]

        class TestDataset(ht.utils.data.partial_dataset.PartialH5Dataset):
            def __init__(self, file, comm, load, load_len, use_gpus=False):
                super().__init__(
                    file, comm=comm, initial_load=load, load_length=load_len, use_gpu=use_gpus
                )

            def __getitem__(self, item):
                return self.data[item]

        def run_two_epochs(loader):
            # per_epoch[e] accumulates all batches seen during epoch e
            per_epoch = [None, None]
            for epoch in range(2):
                seen = 0
                previous = None
                for batch in loader:
                    seen += batch.shape[0]
                    # consecutive batches must differ and every batch has shape (7, 4)
                    if previous is not None:
                        self.assertFalse(torch.allclose(previous, batch))
                    self.assertEqual(batch.shape, (7, 4))
                    previous = batch
                    if per_epoch[epoch] is None:
                        per_epoch[epoch] = batch
                    else:
                        per_epoch[epoch] = torch.cat((per_epoch[epoch], batch), dim=0)
                # lower bound on the number of elements delivered in this epoch
                self.assertTrue(seen >= (total_rows - 7) // full_data.comm.size)
            # the two epochs must not deliver identical data
            self.assertFalse(torch.allclose(per_epoch[0], per_epoch[1]))

        # default use_gpus=False
        partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20)
        run_two_epochs(ht.utils.data.DataLoader(dataset=partial_dset, batch_size=7))

        # use_gpus=True, pinning memory only when CUDA is available
        partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20, True)
        run_two_epochs(
            ht.utils.data.DataLoader(
                dataset=partial_dset,
                batch_size=7,
                pin_memory=bool(torch.cuda.is_available()),
            )
        )
Example #13
0
#!/usr/bin/env python

import argparse
import heat as ht
import time

if __name__ == "__main__":
    # command line interface: (flag, type, help text) for each option, in display order
    cli = argparse.ArgumentParser(description="HeAT lasso gpu benchmark")
    for flag, kind, text in (
        ("--file", str, "file to benchmark"),
        ("--dataset", str, "dataset within file to benchmark"),
        ("--labels", str, "dataset within file pointing to the labels"),
        ("--trials", int, "number of benchmark trials"),
        ("--iterations", int, "iterations"),
    ):
        cli.add_argument(flag, type=kind, help=text)
    args = cli.parse_args()

    ht.use_device("gpu")

    # load the samples and the labels from the same file, both with split=0
    print("Loading data... {}[{}]".format(args.file, args.dataset), end="")
    data = ht.load(args.file, args.dataset, split=0)
    labels = ht.load(args.file, args.labels, split=0)
    print("\t[OK]")

    # time only the fit call of a freshly constructed estimator in every trial
    for trial in range(args.trials):
        print("Trial {}...".format(trial), end="")
        lasso = ht.regression.Lasso(max_iter=args.iterations, tol=-1.0)
        started = time.perf_counter()
        lasso.fit(data, labels)
        elapsed = time.perf_counter() - started
        print("\t{}s".format(elapsed))
Example #14
0
    def test_load_exception(self):
        # checks the exceptions raised by ht.load for missing files and unknown extensions

        # correct extension, file does not exist: IOError when the backend is supported,
        # ValueError otherwise
        for backend_supported, missing_file in (
            (ht.io.supports_hdf5, "foo.h5"),
            (ht.io.supports_netcdf, "foo.nc"),
        ):
            expected_error = IOError if backend_supported() else ValueError
            with self.assertRaises(expected_error):
                ht.load(missing_file, "data")

        # unknown file extension
        json_path = os.path.join(os.getcwd(), "heat/datasets/iris.json")
        with self.assertRaises(ValueError):
            ht.load(json_path, "data")
        # no extension at all
        with self.assertRaises(ValueError):
            ht.load("iris", "data")
Example #15
0
import heat as ht
import time

if __name__ == "__main__":
    # Command-line options for the benchmark run.
    cli = argparse.ArgumentParser(description="HeAT statistical moments cpu benchmark")
    cli.add_argument("--file", type=str, help="file to benchmark")
    cli.add_argument("--dataset", type=str, help="dataset within file to benchmark")
    cli.add_argument("--trials", type=int, help="number of benchmark trials")
    options = cli.parse_args()

    ht.use_device("cpu")

    print("Loading data... {}[{}]".format(options.file, options.dataset), end="")
    data = ht.load(options.file, dataset=options.dataset, split=0)
    print("\t[OK]")

    # Every reduction is timed for every axis setting, `--trials` times each.
    reductions = [ht.mean, ht.std]
    axis_choices = [None, 0, 1]
    for function in reductions:
        for axis in axis_choices:
            print("{} axis={}".format(function, axis))
            for trial in range(options.trials):
                print("Trial {}...".format(trial), end="")
                began = time.perf_counter()
                function(data, axis=axis)
                elapsed = time.perf_counter() - began
                print("\t{}s".format(elapsed))
Example #16
0
    def test_load_exception(self):
        # Checks which exception ht.load raises for missing files and for
        # paths it does not accept.

        # correct extension, file does not exist: each entry pairs a missing
        # file with the probe telling whether its backend is available
        missing_files = (
            ('foo.h5', ht.io.supports_hdf5),
            ('foo.nc', ht.io.supports_netcdf),
        )
        for file_name, backend_available in missing_files:
            expected_error = IOError if backend_available() else ValueError
            with self.assertRaises(expected_error):
                ht.load(file_name, 'data')

        # unknown file extension
        rejected_paths = (
            os.path.join(os.getcwd(), 'heat/datasets/data/iris.csv'),
            'iris',
        )
        for rejected in rejected_paths:
            with self.assertRaises(ValueError):
                ht.load(rejected, 'data')
Example #17
0
    def test_fit_iris(self):
        # Fits GaussianNB on the iris train split (non-distributed, weighted,
        # and distributed along axis 0), compares the fitted attributes and
        # predictions with the stored scikit-learn reference values, and then
        # checks that invalid inputs raise ValueError.

        # load sklearn train/test sets and resulting probabilities
        X_train = ht.load(
            "heat/datasets/data/iris_X_train.csv", sep=";", dtype=ht.float64, device=ht_device
        )
        X_test = ht.load(
            "heat/datasets/data/iris_X_test.csv", sep=";", dtype=ht.float64, device=ht_device
        )
        y_train = ht.load(
            "heat/datasets/data/iris_y_train.csv", sep=";", dtype=ht.int64, device=ht_device
        ).squeeze()
        y_test = ht.load(
            "heat/datasets/data/iris_y_test.csv", sep=";", dtype=ht.int64, device=ht_device
        ).squeeze()
        y_pred_proba_sklearn = ht.load(
            "heat/datasets/data/iris_y_pred_proba.csv", sep=";", dtype=ht.float64, device=ht_device
        )

        # test ht.GaussianNB
        from heat.naive_bayes import GaussianNB

        # a freshly constructed estimator has no priors and no fitted attributes
        gnb_heat = GaussianNB()
        self.assertIsNone(gnb_heat.priors)
        with self.assertRaises(AttributeError):
            gnb_heat.classes_
        with self.assertRaises(AttributeError):
            gnb_heat.class_prior_
        with self.assertRaises(AttributeError):
            gnb_heat.epsilon_

        # test GaussianNB locally, no weights
        local_fit = gnb_heat.fit(X_train, y_train)
        self.assert_array_equal(gnb_heat.classes_, np.array([0, 1, 2]))
        local_fit_no_classes = gnb_heat.partial_fit(X_train, y_train, classes=None)
        y_pred_local = local_fit_no_classes.predict(X_test)
        y_pred_proba_local = local_fit.predict_proba(X_test)
        sklearn_class_prior = np.array([0.38666667, 0.26666667, 0.34666667])
        sklearn_epsilon = np.array([3.6399040000000003e-09])
        sklearn_theta = ht.array(
            [
                [4.97586207, 3.35862069, 1.44827586, 0.23448276],
                [5.935, 2.71, 4.185, 1.3],
                [6.77692308, 3.09230769, 5.73461538, 2.10769231],
            ],
            dtype=X_train.dtype,
            device=ht_device,
        )
        sklearn_sigma = ht.array(
            [
                [0.10321047, 0.13208086, 0.01629013, 0.00846612],
                [0.256275, 0.0829, 0.255275, 0.046],
                [0.38869823, 0.10147929, 0.31303255, 0.04763314],
            ],
            dtype=X_train.dtype,
            device=ht_device,
        )
        self.assertIsInstance(y_pred_local, ht.DNDarray)
        self.assertEqual((y_pred_local != y_test).sum(), ht.array(4))
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local, atol=1e-1).all())

        # test GaussianNB when sample_weight is not None, sample_weight not distributed
        sample_weight = ht.ones((y_train.gshape[0]), dtype=ht.float32, split=None)
        local_fit_weight = gnb_heat.fit(X_train, y_train, sample_weight=sample_weight)
        y_pred_local_weight = local_fit_weight.predict(X_test)
        y_pred_proba_local_weight = local_fit_weight.predict_proba(X_test)
        self.assertIsInstance(y_pred_local_weight, ht.DNDarray)
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assert_array_equal(y_pred_local_weight, y_pred_local.numpy())
        self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local_weight).all())

        # test GaussianNB, data and labels distributed along split axis 0
        X_train_split = ht.resplit(X_train, axis=0)
        X_test_split = ht.resplit(X_test, axis=0)
        y_train_split = ht.resplit(y_train, axis=0)
        y_test_split = ht.resplit(y_test, axis=0)
        y_pred_split = gnb_heat.fit(X_train_split, y_train_split).predict(X_test_split)
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assert_array_equal(y_pred_split, y_pred_local.numpy())
        self.assertEqual((y_pred_split != y_test_split).sum(), ht.array(4))
        sample_weight_split = ht.ones(y_train_split.gshape[0], dtype=ht.float32, split=0)
        y_pred_split_weight = gnb_heat.fit(
            X_train_split, y_train_split, sample_weight=sample_weight_split
        ).predict(X_test_split)
        self.assertIsInstance(y_pred_split_weight, ht.DNDarray)
        self.assert_array_equal(y_pred_split_weight, y_pred_split.numpy())

        # test exceptions
        X_torch = torch.ones(75, 4)
        y_np = np.zeros(75)
        y_2D = ht.ones((75, 1), split=None, device=ht_device)
        weights_torch = torch.zeros(75)
        X_3D = ht.ones((75, 4, 4), split=None, device=ht_device)
        X_wrong_size = ht.ones((75, 5), split=None, device=ht_device)
        y_wrong_size = ht.zeros(76, device=ht_device)
        X_train_split = ht.resplit(X_train, axis=0)
        y_train_split = ht.resplit(y_train, axis=0)
        # Kept separate from y_2D: a chained assignment here would overwrite
        # the non-distributed y_2D defined above with a split=0 array.
        weights_2D_split = ht.ones((75, 1), split=0, device=ht_device)
        weights_wrong_size = ht.ones(76, device=ht_device)
        priors_wrong_shape = ht.random.randn(4, device=ht_device)
        priors_wrong_sum = ht.random.randn(3, dtype=ht.float32, device=ht_device)
        # NOTE(review): the next two arrays are created without device=ht_device,
        # unlike their neighbours - confirm this is intentional.
        priors_wrong_sign = ht.array([-0.3, 0.7, 0.6])
        wrong_classes = ht.array([3, 4, 5])

        with self.assertRaises(ValueError):
            gnb_heat.fit(X_torch, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_np)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_2D)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train, sample_weight=weights_torch)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_3D, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_wrong_size)

        # From here on the setup statement stays OUTSIDE the assertRaises
        # context, so that only the call under test can satisfy the assertion;
        # an unexpected ValueError during setup now fails the test.
        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.predict(X_torch)

        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_wrong_size, y_train)

        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_train, y_train, classes=wrong_classes)

        gnb_heat.classes_ = None
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_train, y_train, classes=None)

        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train_split, y_train_split, sample_weight=weights_2D_split)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train, sample_weight=weights_wrong_size)

        # invalid priors: each is installed first, then fit() must reject it
        gnb_heat.priors = priors_wrong_shape
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)

        gnb_heat.priors = priors_wrong_sum
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)

        gnb_heat.priors = priors_wrong_sign
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)
Example #18
0
    def test_cov(self):
        # Compares ht.cov with np.cov on a small hand-written matrix and on the
        # iris data set, then checks the exceptions raised for invalid input.
        x = ht.array([[0, 2], [1, 1], [2, 0]], dtype=ht.float, split=1).T
        n_procs = x.comm.size
        if n_procs < 3:
            small_cov = ht.cov(x)
            self.assertTrue(ht.equal(small_cov, ht.array([[1, -1], [-1, 1]], split=0)))

        iris_path = "heat/datasets/data/iris.csv"
        iris_np = np.loadtxt(iris_path, delimiter=";")
        reference = np.cov(iris_np[:, 0], iris_np[:, 1:3], rowvar=False)

        # two-argument form: first column against columns 1 and 2
        iris_ht = ht.load(iris_path, sep=";", split=0)
        result = ht.cov(iris_ht[:, 0], iris_ht[:, 1:3], rowvar=False)
        reference_ht = ht.array(reference, dtype=ht.float)
        self.assertTrue(ht.allclose(reference_ht - result, 0, atol=1e-4))

        # whole data set, once per set of extra keyword arguments
        for extra in ({}, {"ddof": 1}, {"bias": True}):
            reference = np.cov(iris_np, rowvar=False, **extra)
            result = ht.cov(iris_ht, rowvar=False, **extra)
            difference = ht.array(reference, dtype=ht.float) - result
            self.assertTrue(ht.allclose(difference, 0, atol=1e-4))

        # these checks only run for 2 to 4 processes
        if 1 < n_procs < 5:
            iris_ht = ht.load(iris_path, sep=";", split=1)
            reference = np.cov(iris_np, rowvar=False)
            result = ht.cov(iris_ht, rowvar=False)
            self.assertTrue(ht.allclose(ht.array(reference, dtype=ht.float), result, atol=1e-4))

            reference = np.cov(iris_np, iris_np, rowvar=True)

            iris_ht = ht.load(iris_path, sep=";", split=0)
            result = ht.cov(iris_ht, iris_ht, rowvar=True)
            self.assertTrue(ht.allclose(ht.array(reference, dtype=ht.float), result, atol=1e-4))

            iris_ht = ht.load(iris_path, sep=";", split=0)
            with self.assertRaises(RuntimeError):
                ht.cov(iris_ht[1:], rowvar=False)
            with self.assertRaises(RuntimeError):
                ht.cov(iris_ht, iris_ht[1:], rowvar=False)

        # invalid arguments; the calls are wrapped in lambdas so that their
        # arguments are only built inside the assertRaises context
        invalid_calls = (
            (TypeError, lambda: ht.cov(reference)),
            (TypeError, lambda: ht.cov(iris_ht, reference)),
            (TypeError, lambda: ht.cov(iris_ht, ddof="str")),
            (ValueError, lambda: ht.cov(ht.zeros((1, 2, 3)))),
            (ValueError, lambda: ht.cov(iris_ht, ht.zeros((1, 2, 3)))),
            (ValueError, lambda: ht.cov(iris_ht, ddof=10000)),
        )
        for expected_error, invalid_call in invalid_calls:
            with self.assertRaises(expected_error):
                invalid_call()
Example #19
0
# NOTE(review): `a`, `comm`, `rank`, `size` and `print0` are defined before
# this excerpt; the comments below describe only what the visible lines do.

# Write `a` to dataset 'DATA' of data.h5 (mode='w'), then wait on the barrier.
ht.save(a, 'data.h5', 'DATA', mode='w')
comm.Barrier()

# Report the split axis, the local array's type/shape and the global/local shapes.
print0(rank, a.split)
print0(rank, type(a.larray), a.larray.shape)
print0(rank, a.gshape, a.lshape, a.split)
print(rank, a.larray.dtype, a.larray.device, a.larray.layout)
# cache = np.from_file('')
# Replace the local array with random data whose second dimension is scaled by
# the first command-line argument, moved to the CUDA device indexed by `rank`.
cache = np.random.rand(1000, 1000 * int(sys.argv[1]))
a.larray = torch.from_numpy(cache).cuda(rank)
print0(rank, a.gshape, a.lshape, a.split)
print(rank, a.larray.dtype, a.larray.device, a.larray.layout)

# Time the load. Only rank 0 calls ht.load, but t1/t2 are taken on every rank,
# so the measured interval is only meaningful on rank 0.
t1 = time.time()
if rank == 0:
    b = ht.load('data.h5', dataset='DATA')
t2 = time.time()
comm.Barrier()
# NOTE(review): the size printed as "MB" is size * size / 1024 / 1024; whether
# that is really megabytes depends on what `size` holds - confirm.
print0(
    rank, "the loading time is {:.2f} s for a {:.2f} MB matrix.".format(
        (t2 - t1), size * size / 1024. / 1024.))

# `b` exists only on rank 0, hence the guard.
if rank == 0:
    print(rank, b.split)

# Read a raw file of big-endian 8-byte floats ('>f8'), give it an 8-dimensional
# shape, reorder the axes, and combine the two entries of the last axis into
# the real and imaginary parts of a complex array.
x = np.fromfile('rbc_conf_3264_m0.004_0.03_000290', dtype='>f8')
x = x.reshape(4, 3, 3, 2, 64, 32, 32, 32)
x = x.transpose((4, 5, 6, 7, 0, 2, 1, 3))
x = x[..., 0] + x[..., 1] * 1j
print(x.shape)