Exemple #1
0
    def test4():
        """Print column "a" of a one-column table, formatted with "%.4f".

        The column holds the numbers 1 to 10 with a NaN at each end.
        """
        from dazzle.core.dataset import DataSet
        from dazzle.core.table import Table

        values = np.array([np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, np.nan])
        dataset = DataSet(os.path.join("/temp", "dazzle-test"), force_create=True)
        column = Table("t", dataset, [("a", values)]).get_column("a")
        print(column.str_values(format="%.4f"))
Exemple #2
0
    def test2():
        """Create a table with a single integer column "a" and print the column's attributes."""
        from dazzle.core.dataset import DataSet
        from dazzle.core.table import Table

        test_dir = os.path.join("/temp", "dazzle-test")
        ds = DataSet(test_dir, force_create=True)
        # ``np.int`` was a plain alias of the builtin ``int``; it was removed in
        # NumPy 1.24, so the builtin is used directly.
        t = Table("t", ds, [("a", int)])
        ca = t.get_column("a")
        #t.append({'a': np.random.randint(10000000, size=5*(10**8)).astype(np.int32)})
        print(vars(ca))
Exemple #3
0
    def test_rebuild01(self):
        """Rebuild the Category table with int8 columns and check its first row.

        After the rebuild the table is expected to hold 69 rows and every
        field of row 0 is expected to be -128 (the int8 minimum).
        """
        csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
        category = Table.from_csv("Category", self.ds, csv_path, delimiter='\t',
                                  usecols=['CategoryID', 'ParentCategoryID', 'Level'], verbose=False)

        int8_types = {name: np.int8 for name in ("CategoryID", "Level", "ParentCategoryID")}
        category.rebuild(int8_types)

        self.assertEqual(len(category[:]), 69)
        self.assertEqual(category['CategoryID'].dtype, np.int8)
        int8_min = -128
        for field in ('CategoryID', 'Level', 'ParentCategoryID'):
            self.assertEqual(category[0][field], int8_min)
Exemple #4
0
    def open(data_dir):
        """Open the existing DataSet stored in *data_dir* and return it.

        The ``dataset.json`` descriptor is read to restore the compression
        parameters and to open every table it lists; each opened table is
        registered with its dataset. The dataset is saved before being returned.

        Raises:
            DazzleError: if *data_dir* contains no ``dataset.json`` file.
        """
        descriptor_path = os.path.join(data_dir, "dataset.json")
        if not os.path.exists(descriptor_path):
            raise DazzleError("No 'dataset.json' file found in %s" % data_dir)

        ds = DataSet(data_dir, mode='open')

        # NOTE(review): ``open`` below is presumably the builtin (this function
        # looks like a method defined in a class body) - confirm.
        with open(descriptor_path, 'rb') as stream:
            descriptor = json.loads(stream.read().decode('ascii'))
            cp = descriptor["compression_params"]
            ds._compression_params = bcolz.cparams(
                clevel=cp["_clevel"], shuffle=cp["_shuffle"], cname=cp["_cname"])
            for entry in descriptor["tables"]:
                opened = Table(entry["name"], ds, [], mode='open')
                opened._ctable = bcolz.open(opened.data_dir)
                opened._build_columns_from_ctable()
                opened._dataset._add_table(opened)

        ds.save()
        return ds
Exemple #5
0
 def setUp(self):
     """Create a fresh dataset holding an empty table "t" and a two-row table "u".

     Both tables have an integer column "a" and a float column "b".
     """
     self.a = [6, 4, 7, 4, 6, 9]
     self.test_dir = os.path.join("/temp", "dazzle-test")
     self.ds = DataSet(self.test_dir, force_create=True)
     # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
     self.t = Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], force_create=True)
     self.u = Table("u", self.ds, [("a", np.array([1, 2], int)), ("b", np.array([1.1, 2.2], float))], force_create=True)
Exemple #6
0
 def test_copy02(self):
     """Copy a table named "t" into the test dataset from the directory "/bim/bam"."""
     source_dir = "/bim/bam"
     Table.copy("t", self.ds, source_dir)
Exemple #7
0
def load_dataset():
    """Load the CSV source files into the 'raw' dazzle dataset and return it.

    The 'raw' dataset is the result of loading the CSV source data into dazzle
    tables, only filtering out data that we don't want to process further.

    The function is non-destructive: a table is imported only when the dataset
    does not contain it yet, so it can be launched several times before the
    whole job is done.
    """
    import os
    from dazzle.core.dataset import DataSet

    ds = DataSet.open(raw_dir) if DataSet.exists(raw_dir) else DataSet(raw_dir, force_create=True)

    # One entry per table: (table name, CSV file, column dtypes, extra from_csv() options).
    # The columns read from each file are exactly the keys of its dtype mapping, in order.
    #
    # Notes:
    # - many of the following attributes should be unsigned int instead of signed int, but numexpr
    #   works only on signed data.
    # - Similarly to pandas, we use the types required to contain the existing data, not the types
    #   we desire to use.
    # - The string fields of AdsInfo and SearchInfo are not loaded. We will see this problem later
    #   with Don.
    table_specs = [
        ("Category", "Category.tsv",
         {'CategoryID': 'i4', 'ParentCategoryID': 'i1', 'Level': 'i1'},
         {}),
        # Notice the filter option, which does not exist in pandas.read_csv(). It makes it possible
        # to skip some rows based on a numexpr expression. IsClick == IsClick is true iff IsClick
        # is not na.
        ("TrainSearchStream", "trainSearchStream.tsv",
         {'SearchID':'i4', 'AdID':'i4', 'Position':'i1', 'ObjectType':'i1', 'HistCTR':'f4', 'IsClick':'f1'},
         {'filter': '(ObjectType == 3) & (IsClick == IsClick)'}),
        ("AdsInfo", "AdsInfo.tsv",
         {'AdID':'i4', 'LocationID':'f4', 'CategoryID':'f4', 'Price': 'f4', 'IsContext': 'f1'},
         {}),
        ("SearchInfo", "SearchInfo.tsv",
         {'SearchID':'i4', 'IPID':'i4', 'UserID':'f4', 'IsUserLoggedOn':'f1',
          'LocationID':'f4', 'CategoryID':'f4'},
         {}),
        ("userInfo", "userInfo.tsv",
         {'UserID':'i4', 'UserAgentID':'i4', 'UserAgentOSID':'i4',
          'UserDeviceID':'i4', 'UserAgentFamilyID':'i4'},
         {}),
        ("Location", "Location.tsv",
         {'LocationID': 'i4', 'CityID':'f4', 'RegionID': 'f4'},
         {}),
        ("PhoneRequestsStream", "PhoneRequestsStream.tsv",
         {'UserID':'i4', 'IPID':'i4', 'AdID':'i4', 'PhoneRequestDate': 'object'},
         {}),
        ("VisitsStream", "VisitsStream.tsv",
         {'UserID':'i4', 'IPID':'i4', 'AdID':'i4', 'ViewDate': 'object'},
         {}),
    ]

    for table_name, file_name, dtypes, extra_options in table_specs:
        if ds.get_table(table_name) is not None:
            # Already imported by a previous run.
            continue
        # The returned table is deliberately not kept in a local variable.
        Table.from_csv(table_name, ds, os.path.join(csv_dir, file_name), delimiter='\t', chunksize=10**7,
                       usecols=list(dtypes), dtype=dtypes, **extra_options)

    return ds
Exemple #8
0
class TestColumn(unittest.TestCase):
    """Tests for the column objects returned by ``Table.get_column``.

    ``setUp`` builds a table "t" with an int8 column "a" ([1, 3]) and a float
    column "x" ([2, 4]). The aggregation tests (sum/mean/min/max) build their
    own single-column table through ``_column_of``.
    """

    # Minimum int8 value; the int8 aggregation tests below expect it to be
    # handled as a missing value.
    INT8_NAN = np.iinfo(np.int8).min

    def setUp(self):
        self.a = [6, 4, 7, 4, 6, 9]
        self.test_dir = os.path.join("/temp", "dazzle-test")
        ds = DataSet(self.test_dir, force_create=True)
        # Builtin float replaces the np.float alias removed in NumPy 1.24.
        self.t = Table("t", ds, [("a", np.array([1, 3], dtype=np.int8)), ("x", np.array([2, 4], dtype=float))], force_create=True)
        self.ca = self.t.get_column("a")

    def _column_of(self, data):
        """Return column "a" of a new table "t" whose only column holds *data*.

        A new dataset is created in ``self.test_dir`` with ``force_create=True``
        for every call.
        """
        ds = DataSet(self.test_dir, force_create=True)
        return Table("t", ds, [("a", data)], force_create=True).get_column("a")

    @raises(DazzleError)
    def test_data_dir01(self):
        """no table associated"""
        print(LiteralColumn("a", np.array([], int)).data_dir)

    def test_data_dir02(self):
        self.assertEqual(self.ca.data_dir, os.path.join(self.test_dir, "t", "a"))

    @raises(DazzleError)
    def test_carray01(self):
        """no table associated"""
        print(LiteralColumn("a", np.array([], int)).carray)

    def test_carray02(self):
        assert_array_equal(self.ca.carray[:], [1, 3])

    @raises(DazzleError)
    def test_init01(self):
        LiteralColumn("", [])

    @raises(DazzleError)
    def test_init02(self):
        LiteralColumn("1a", [])

    @raises(DazzleError)
    def test_init03(self):
        LiteralColumn("_a", [])

    @raises(DazzleError)
    def test_init04(self):
        LiteralColumn("a", "XX")

    @raises(DazzleError)
    def test_init05(self):
        LiteralColumn("a", self)

    def test_init06(self):
        assert_array_equal(self.ca.carray[:], [1, 3])

    def test_len01(self):
        self.assertEqual(len(self.ca), 2)

    def test_position01(self):
        self.t.append({'a': self.a, 'x': self.a})
        self.assertEqual(self.ca.position, 0)

    def test_position02(self):
        self.t.append({'a': self.a, 'x': self.a})
        self.assertEqual(self.t.get_column("x").position, 1)

    def test_getitem01(self):
        self.assertEqual(self.ca[0], 1)

    def test_getitem02(self):
        self.assertEqual(self.ca[1], 3)

    @raises(IndexError)
    def test_getitem03(self):
        self.t.append({'a': self.a, 'x': self.a})
        _ = self.ca[10]

    def test_getitem04(self):
        self.t.append({'a': self.a, 'x': self.a})
        assert_array_equal(self.ca[:], self.ca.carray[:])

    def test_getitem05(self):
        self.t.append({'a': self.a, 'x': self.a})
        assert_array_equal(self.ca[0:5], [1, 3, 6, 4, 7])

    def test_setitem01(self):
        self.ca[0] = 2
        self.assertEqual(self.ca[0], 2)
        assert_array_equal(self.ca.carray[:], [2, 3])

    def test_append01(self):
        self.t.append({'a': self.a, 'x': self.a})
        self.ca.append(self.a)
        assert_array_equal(self.ca.carray[:], [1, 3, 6, 4, 7, 4, 6, 9, 6, 4, 7, 4, 6, 9])

    @raises(DazzleError)
    def test_rename01(self):
        self.t.append({'a': self.a, 'x': self.a})
        self.ca.rename("")

    @raises(DazzleError)
    def test_rename02(self):
        self.t.append({'a': self.a, 'x': self.a})
        self.ca.rename("x")

    def test_rename03(self):
        self.ca.rename("b")
        self.assertTrue(os.path.exists(os.path.join(self.test_dir, "t", "b")), "'b' dir should exist")
        self.assertFalse(os.path.exists(os.path.join(self.test_dir, "t", "a")), "'a' dir should not exist")
        self.assertEqual(self.ca.data_dir, os.path.join(self.test_dir, "t", "b"))
        self.assertEqual(self.ca._name, "b", "column should be named 'b'")
        self.assertEqual(self.t.get_column("b").position, 0, "column should be at position 0")

    # --- sum -------------------------------------------------------------

    def test_sum01(self):
        ca = self._column_of(np.array([], int))
        assert_close(ca.sum(skipna=True), 0.0)

    def test_sum02(self):
        ca = self._column_of(np.array([], int))
        assert_close(ca.sum(skipna=False), 0.0)

    def test_sum03(self):
        ca = self._column_of(np.array([6, 4, 7, 4, 6, 9], int))
        assert_close(ca.sum(skipna=True), 36.0)

    def test_sum04(self):
        ca = self._column_of(np.array([6, 4, 7, 4, 6, 9], int))
        assert_close(ca.sum(skipna=False), 36.0)

    def test_sum05(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        assert_close(ca.sum(skipna=True), 29.0)

    def test_sum06(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        self.assertTrue(ca.isnan(ca.sum(skipna=False)))

    def test_sum07(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        assert_close(ca.sum(skipna=True), 0.0)

    def test_sum08(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.sum(skipna=False)))

    def test_sum09(self):
        ca = self._column_of(np.array([3, self.INT8_NAN, 2], np.int8))
        self.assertEqual(ca.sum(skipna=True), 5)

    def test_sum10(self):
        ca = self._column_of(np.array([self.INT8_NAN, 2], np.int8))
        self.assertTrue(ca.isnan(ca.sum(skipna=False)))

    def test_sum11(self):
        ca = self._column_of(np.array([self.INT8_NAN, self.INT8_NAN], np.int8))
        self.assertEqual(ca.sum(skipna=True), 0)

    # --- mean ------------------------------------------------------------

    def test_mean01(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.mean(skipna=True)))

    def test_mean02(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.mean(skipna=False)))

    def test_mean03(self):
        # The data is deliberately passed as a list wrapping the array, as in the original test.
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.mean(skipna=True), 6.0)

    def test_mean04(self):
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.mean(skipna=False), 6.0)

    def test_mean05(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        assert_close(ca.mean(skipna=True), 5.0)

    def test_mean06(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        self.assertTrue(ca.isnan(ca.mean(skipna=False)))

    def test_mean07(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.mean(skipna=True)))

    def test_mean08(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.mean(skipna=False)))

    # --- min -------------------------------------------------------------

    def test_min01(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.min(skipna=True)))

    def test_min02(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.min(skipna=False)))

    def test_min03(self):
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.min(skipna=True), 4)

    def test_min04(self):
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.min(skipna=False), 4)

    def test_min05(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        assert_close(ca.min(skipna=True), 4)

    def test_min06(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        self.assertTrue(ca.isnan(ca.min(skipna=False)))

    def test_min07(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.min(skipna=True)))

    def test_min08(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.min(skipna=False)))

    def test_min09(self):
        ca = self._column_of(np.array([3, self.INT8_NAN, 2], np.int8))
        self.assertEqual(ca.min(skipna=True), 2)

    def test_min10(self):
        ca = self._column_of(np.array([self.INT8_NAN, 2], np.int8))
        self.assertTrue(ca.isnan(ca.min(skipna=False)))

    def test_min11(self):
        ca = self._column_of(np.array([self.INT8_NAN, self.INT8_NAN], np.int8))
        self.assertTrue(ca.isnan(ca.min(skipna=False)))

    # --- max -------------------------------------------------------------

    def test_max01(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.max(skipna=True)))

    def test_max02(self):
        ca = self._column_of(np.array([], int))
        self.assertTrue(ca.isnan(ca.max(skipna=False)))

    def test_max03(self):
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.max(skipna=True), 9)

    def test_max04(self):
        ca = self._column_of([np.array([6, 4, 7, 4, 6, 9], int)])
        assert_close(ca.max(skipna=False), 9)

    def test_max05(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        assert_close(ca.max(skipna=True), 9)

    def test_max06(self):
        ca = self._column_of(np.array([6, 4, np.nan, 4, 6, 9], float))
        self.assertTrue(ca.isnan(ca.max(skipna=False)))

    def test_max07(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.max(skipna=True)))

    def test_max08(self):
        ca = self._column_of(np.array([np.nan, np.nan], float))
        self.assertTrue(ca.isnan(ca.max(skipna=False)))

    def test_max09(self):
        ca = self._column_of(np.array([3, self.INT8_NAN, 2], np.int8))
        self.assertEqual(ca.max(skipna=True), 3)

    def test_max10(self):
        ca = self._column_of(np.array([self.INT8_NAN, 2], np.int8))
        self.assertTrue(ca.isnan(ca.max(skipna=False)))

    def test_max11(self):
        ca = self._column_of(np.array([self.INT8_NAN, self.INT8_NAN], np.int8))
        self.assertTrue(ca.isnan(ca.max(skipna=False)))
Exemple #9
0
 def setUp(self):
     """Create table "t" (int8 column "a", float column "x") and keep a handle on column "a"."""
     self.a = [6, 4, 7, 4, 6, 9]
     self.test_dir = os.path.join("/temp", "dazzle-test")
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin float replaces the np.float alias removed in NumPy 1.24.
     self.t = Table("t", ds, [("a", np.array([1, 3], dtype=np.int8)), ("x", np.array([2, 4], dtype=float))], force_create=True)
     self.ca = self.t.get_column("a")
Exemple #10
0
 def test_from_csv04(self):
     """Load two columns of Category.tsv and check the row and column counts (68 and 2)."""
     csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
     wanted = ['CategoryID', 'ParentCategoryID']
     category = Table.from_csv("Category", self.ds, csv_path, delimiter='\t',
                               usecols=wanted, verbose=False)
     self.assertEqual(len(category.ctable), 68)
     self.assertEqual(len(category.columns), 2)
Exemple #11
0
 def test_sum07(self):
     """Summing an all-NaN float column with skipna=True is expected to give 0.0."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin float replaces the np.float alias removed in NumPy 1.24.
     ca = Table("t", ds, [("a", np.array([np.nan, np.nan], float))], force_create=True).get_column("a")
     assert_close(ca.sum(skipna=True), 0.0)
Exemple #12
0
 def test_from_csv02(self):
     """Call ``Table.from_csv`` on the dataset's JSON descriptor instead of a CSV file."""
     not_a_csv = "/temp/dazzle-test/dataset.json"
     wanted = ['CategoryID', 'ParentCategoryID']
     Table.from_csv("Category", self.ds, not_a_csv, usecols=wanted, verbose=False)
Exemple #13
0
 def test_from_csv03(self):
     """Call ``Table.from_csv`` on Category.tsv without a delimiter or a column selection."""
     csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
     Table.from_csv("Category", self.ds, csv_path, verbose=False)
Exemple #14
0
 def test_from_csv01(self):
     """Call ``Table.from_csv`` with the path "/bim/bam/test.csv"."""
     csv_path = "/bim/bam/test.csv"
     wanted = ['CategoryID', 'ParentCategoryID']
     Table.from_csv("Category", self.ds, csv_path, usecols=wanted, verbose=False)
Exemple #15
0
 def test_copy04(self):
     """Copy table "t" into a second dataset and compare the copy with the original."""
     target = DataSet(os.path.join("/temp/dazzle-test2"), force_create=True)
     copied = Table.copy("t", target, "/temp/dazzle-test/t")
     original = self.ds.get_table("t")
     assert_equal_table(copied, original)
Exemple #16
0
 def test_copy03(self):
     """Copy table "t" into a second dataset under the name "_"."""
     target = DataSet(os.path.join("/temp/dazzle-test2"), force_create=True)
     source_dir = "/temp/dazzle-test/t"
     Table.copy("_", target, source_dir)
Exemple #17
0
 def test_sum03(self):
     """Summing the integer column [6, 4, 7, 4, 6, 9] with skipna=True is expected to give 36.0."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin int replaces the np.int alias removed in NumPy 1.24.
     ca = Table("t", ds, [("a", np.array([6, 4, 7, 4, 6, 9], int))], force_create=True).get_column("a")
     assert_close(ca.sum(skipna=True), 36.0)
Exemple #18
0
 def test_from_dataframe01(self):
     """Build table "v" from a two-row pandas DataFrame and check that it has two rows."""
     frame = pd.DataFrame({'a': [1,2], 'b': [3., 4.]})
     table = Table.from_dataframe("v", self.ds, frame)
     row_count = len(table.ctable)
     self.assertEqual(row_count, 2)
Exemple #19
0
 def test_sum06(self):
     """Summing a float column containing a NaN with skipna=False is expected to give NaN."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin float replaces the np.float alias removed in NumPy 1.24.
     ca = Table("t", ds, [("a", np.array([6, 4, np.nan, 4, 6, 9], float))], force_create=True).get_column("a")
     self.assertTrue(ca.isnan(ca.sum(skipna=False)))
Exemple #20
0
class TestTable(unittest.TestCase):

    def assert_string_equal(self, s1, s2):
        """Assert that *s1* and *s2* are equal once all whitespace has been removed."""
        squeezed_first = ''.join(s1.split())
        squeezed_second = ''.join(s2.split())
        return self.assertEqual(squeezed_first, squeezed_second)

    def assert_table_content(self, table, to_check):
        """Assert the properties of *table* that are listed in *to_check*.

        Supported keys of *to_check*:
            'data_dir': expected value of ``table.data_dir``
            'len':      expected length of ``table.ctable``
            'type':     expected exact type of *table*
            'columns':  sequence of ``(column name, checks)`` pairs, passed to
                        ``assert_column_content`` together with their position

        Raises:
            DazzleError: if *to_check* contains any other key.
        """
        for key, expected in to_check.items():
            if key == 'columns':
                for position, (col_name, attrs) in enumerate(expected):
                    self.assert_column_content(table, col_name, position, attrs)
                continue

            if key == 'data_dir':
                actual = table.data_dir
            elif key == 'len':
                actual = len(table.ctable)
            elif key == 'type':
                actual = type(table)
            else:
                raise DazzleError("Invalid key: %s" % key)
            self.assertEqual(actual, expected)

    def assert_column_content(self, table, col_name, index, to_check):
        """Assert that the column at position *index* of *table* is consistent.

        The column must be a ``LiteralColumn`` bound to *table*, be named
        *col_name* (both in the table and in its ctable) and wrap the bcolz
        carray stored in the ctable. *to_check* may add checks with the keys
        'len', 'content' and 'type'.

        Raises:
            DazzleError: if *to_check* contains any other key.
        """
        col = table._columns[index]
        self.assertTrue(isinstance(col, LiteralColumn))
        self.assertTrue(col._table == table)
        self.assertTrue(col._name == col_name)
        self.assertTrue(table.ctable.names[index] == col_name)

        backing = table.ctable.cols._cols[col_name]
        self.assertEqual(col.carray, backing)
        self.assertTrue(isinstance(backing, bcolz.carray))

        for key, expected in to_check.items():
            if key == 'content':
                assert_array_equal(backing[:], expected)
            elif key == 'len':
                self.assertEqual(backing.len, expected)
            elif key == 'type':
                self.assertEqual(col.dtype, expected)
            else:
                raise DazzleError("Invalid key: %s" % key)

    def setUp(self):
        """Create a fresh dataset holding an empty table "t" and a two-row table "u".

        Both tables have an integer column "a" and a float column "b".
        """
        self.a = [6, 4, 7, 4, 6, 9]
        self.test_dir = os.path.join("/temp", "dazzle-test")
        self.ds = DataSet(self.test_dir, force_create=True)
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        self.t = Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], force_create=True)
        self.u = Table("u", self.ds, [("a", np.array([1, 2], int)), ("b", np.array([1.1, 2.2], float))], force_create=True)

    def test_init01(self):
        """Check the directory, length, type and first column of the empty table "t"."""
        expected = {
            'data_dir': os.path.join(self.test_dir, self.t._name),
            'len': 0,
            'type': Table,
            # Builtin int replaces the np.int alias removed in NumPy 1.24.
            'columns': [('a', {'type': int, 'content': []})]}
        self.assert_table_content(self.t, expected)

    @raises(DazzleError)
    def test_init02(self):
        """Creating a table named "_" is expected to fail."""
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        Table("_", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], force_create=True)

    @raises(DazzleError)
    def test_init03(self):
        """Combining mode='open' with force_create=True is expected to fail."""
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], mode='open', force_create=True)

    @raises(DazzleError)
    def test_init04(self):
        """Creating table "t" a second time (setUp already created it) is expected to fail."""
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], force_create=True)

    @raises(ValueError)
    def test_init05(self):
        """Opening table "t" with mode='open' while passing column data is expected to fail."""
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float))], mode='open')

    @raises(ValueError)
    def test_init06(self):
        """Describing a column with a dict instead of a (name, data) tuple is expected to fail."""
        # Builtin int replaces the np.int alias removed in NumPy 1.24.
        Table("t", self.ds, [{"a": np.array([], int)}], force_create=True)

    @raises(DazzleError)
    def test_init07(self):
        """Creating a table with an empty column list is expected to fail."""
        no_columns = []
        Table("t", self.ds, no_columns, force_create=True)

    @raises(ValueError)
    def test_init08(self):
        """Giving the scalar 3 as the data of column "a" is expected to fail."""
        scalar_column = ("a", 3)
        Table("t", self.ds, [scalar_column], force_create=True)

    @raises(ValueError)
    def test_init09(self):
        """Describing a boolean column with a dict is expected to fail."""
        # Builtin bool replaces the np.bool alias removed in NumPy 1.24.
        Table("t", self.ds, [{"a": np.array([True, False], bool)}], force_create=True)

    @raises(ValueError)
    def test_init10(self):
        """Passing a bare (name, data) tuple instead of a list of columns is expected to fail."""
        # Builtin int replaces the np.int alias removed in NumPy 1.24.
        Table("t", self.ds, ("a", np.array([], int)), force_create=True)

    @raises(ValueError)
    def test_init11(self):
        """A column described by a 3-tuple instead of a (name, data) pair is expected to fail."""
        # Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
        Table("t", self.ds, [("a", np.array([], int)), ("b", np.array([], float), 'oops')], force_create=True)

    @raises(DazzleError)
    def test_init13(self):
        """Creating a table with a boolean column "a" is expected to fail.

        Renamed from ``test_init11``: that name was already used by the test
        defined just above, which this definition silently replaced in the
        class, so the earlier test never ran.
        """
        # Builtin bool/float replace the np.bool/np.float aliases removed in NumPy 1.24.
        Table("t", self.ds, [("a", np.array([], bool)), ("b", np.array([], float))], force_create=True)

    def test_init12(self):
        """Create table "v" from a plain Python list and check its content."""
        v = Table("v", self.ds,  [("a", [3])])
        expected = {
            'data_dir': os.path.join(self.test_dir, "v"),
            'len': 1,
            'type': Table,
            # Builtin int replaces the np.int alias removed in NumPy 1.24.
            'columns': [('a', {'type': int, 'content': [3]})]}
        self.assert_table_content(v, expected)

    def test_dataset01(self):
        """The ``dataset`` attribute of table "t" equals the dataset it was created with."""
        expected = self.ds
        self.assertEqual(expected, self.t.dataset)

    @raises(DazzleError)
    def test_data_dir01(self):
        """no table associated"""
        orphan = LiteralColumn("a", None)
        print(orphan.data_dir)

    @raises(DazzleError)
    def test_copy01(self):
        """Copy a table from "/temp/dazzle-test" (the dataset directory) into the test dataset."""
        source_dir = "/temp/dazzle-test"
        Table.copy("t", self.ds, source_dir)

    @raises(DazzleError)
    def test_copy02(self):
        """Copy a table named "t" into the test dataset from the directory "/bim/bam"."""
        source_dir = "/bim/bam"
        Table.copy("t", self.ds, source_dir)

    @raises(DazzleError)
    def test_copy03(self):
        """Copy table "t" into a second dataset under the name "_"."""
        target = DataSet(os.path.join("/temp/dazzle-test2"), force_create=True)
        source_dir = "/temp/dazzle-test/t"
        Table.copy("_", target, source_dir)

    def test_copy04(self):
        """Copy table "t" into a second dataset and compare the copy with the original."""
        target = DataSet(os.path.join("/temp/dazzle-test2"), force_create=True)
        copied = Table.copy("t", target, "/temp/dazzle-test/t")
        original = self.ds.get_table("t")
        assert_equal_table(copied, original)

    @raises(FileNotFoundError)
    def test_from_csv01(self):
        """Call ``Table.from_csv`` with the path "/bim/bam/test.csv"."""
        csv_path = "/bim/bam/test.csv"
        wanted = ['CategoryID', 'ParentCategoryID']
        Table.from_csv("Category", self.ds, csv_path, usecols=wanted, verbose=False)

    @raises(ValueError)
    def test_from_csv02(self):
        """Call ``Table.from_csv`` on the dataset's JSON descriptor instead of a CSV file."""
        not_a_csv = "/temp/dazzle-test/dataset.json"
        wanted = ['CategoryID', 'ParentCategoryID']
        Table.from_csv("Category", self.ds, not_a_csv, usecols=wanted, verbose=False)

    @raises(DazzleError)
    def test_from_csv03(self):
        """Call ``Table.from_csv`` on Category.tsv without a delimiter or a column selection."""
        csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
        Table.from_csv("Category", self.ds, csv_path, verbose=False)

    def test_from_csv04(self):
        """Load two columns of Category.tsv and check the row and column counts (68 and 2)."""
        csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
        wanted = ['CategoryID', 'ParentCategoryID']
        category = Table.from_csv("Category", self.ds, csv_path, delimiter='\t',
                                  usecols=wanted, verbose=False)
        self.assertEqual(len(category.ctable), 68)
        self.assertEqual(len(category.columns), 2)

    def test_from_dataframe01(self):
        """Build table "v" from a two-row pandas DataFrame and check that it has two rows."""
        frame = pd.DataFrame({'a': [1,2], 'b': [3., 4.]})
        table = Table.from_dataframe("v", self.ds, frame)
        row_count = len(table.ctable)
        self.assertEqual(row_count, 2)

    def test_get_column01(self):
        """``get_column`` returns None for the name "x", which table "t" does not have."""
        missing = self.t.get_column("x")
        self.assertTrue(missing is None)

    def test_get_column02(self):
        """``get_column("a")`` returns a column whose name is "a"."""
        column = self.t.get_column("a")
        self.assertEqual(column.name, "a")

    @raises(ValueError)
    def test_remove_column01(self):
        """Remove the column "x", which table "t" does not have."""
        unknown = "x"
        self.t.remove_column(unknown)

    def test_remove_column02(self):
        """After removing "a", column "b" is first both in the table and in its ctable."""
        self.t.remove_column("a")
        removed = self.t.get_column("a")
        self.assertTrue(removed is None)
        first_column = self.t.columns[0]
        self.assertEqual(first_column, self.t.get_column("b"))
        first_name = self.t.ctable.names[0]
        self.assertEqual(first_name, "b")

    def test_to_dataframe01(self):
        """The DataFrame built from the two-row table "u" has length 2."""
        frame = self.u.to_dataframe()
        self.assertEqual(len(frame), 2)

    def test_append01(self):
        """Append two rows to the empty table "t" and check the content of both columns."""
        new_rows = {'a': [1,2], 'b': [3., 4.]}
        self.t.append(new_rows)
        expected = {
            'len': 2,
            'columns': [('a', {'content': [1,2]}), ('b', {'content': [3., 4.]})]}
        self.assert_table_content(self.t, expected)

    def test_append02(self):
        """Append two rows with the dict keys given as "b" then "a"; the columns stay "a", "b"."""
        new_rows = {'b': [3., 4.], 'a': [1,2]}
        self.t.append(new_rows)
        expected = {
            'len': 2,
            'columns': [('a', {'content': [1,2]}), ('b', {'content': [3., 4.]})]}
        self.assert_table_content(self.t, expected)

    def test_append03(self):
        """Append the float value 5.4 to the integer column "a"; no assertion is made."""
        new_rows = {'a': [5.4, 2], 'b': [3., 4.]}
        self.t.append(new_rows)

    @raises(ValueError)
    def test_append04(self):
        """Append the string "bla" to the integer column "a"."""
        new_rows = {'a': ["bla", 2], 'b': [3., 4.]}
        self.t.append(new_rows)

    @raises(ValueError)
    def test_append05(self):
        """Append columns of different lengths (none for "a", two values for "b")."""
        new_rows = {'a': [], 'b': [3., 4.]}
        self.t.append(new_rows)

    @raises(ValueError)
    def test_append06(self):
        """Append a dict that only mentions column "a", leaving out column "b"."""
        new_rows = {'a': []}
        self.t.append(new_rows)

    @raises(ValueError)
    def test_append07(self):
        """Append a list of lists instead of a dict.

        Renamed from ``test_append05``: that name was already used by an
        earlier test in this class, which this definition silently replaced,
        so the earlier test never ran.
        """
        self.t.append([[], [3., 4.]])

    def test_get_item01(self):
        """Row 0 of table "u" holds a == 1 and b == 1.1."""
        for field, expected in (('a', 1), ('b', 1.1)):
            self.assertEqual(self.u[0][field], expected)

    def test_get_item02(self):
        """Indexing table "u" with the column name "a" gives the values [1, 2]."""
        values = self.u['a']
        assert_array_equal(values, np.array([1,2]))

    @raises(IndexError)
    def test_get_item03(self):
        """Index table "u" with the tuple (0, 1)."""
        selection = self.u[0,1]
        print(selection)

    def test_get_item04(self):
        """Select rows [0, 1] of table "u", then a field: row selection first."""
        expectations = (('a', np.array([1, 2])), ('b', np.array([1.1, 2.2])))
        for field, expected in expectations:
            assert_array_equal(self.u[[0,1]][field], expected)

    def test_get_item05(self):
        """Select a field of table "u", then rows [0, 1]: field selection first."""
        expectations = (('a', np.array([1, 2])), ('b', np.array([1.1, 2.2])))
        for field, expected in expectations:
            assert_array_equal(self.u[field][[0,1]], expected)

    def test_set_item01(self):
        """Overwrite row 0 of table "u" with a tuple and read both fields back."""
        new_row = (10, 20.2)
        self.u[0] = new_row
        for field, expected in (('a', 10), ('b', 20.2)):
            self.assertEqual(self.u[0][field], expected)

    def test_set_item02(self):
        """Assign two rows of ``u`` at once through a list of row positions."""
        row_ids = [0, 1]
        new_rows = [(10, 20.2), (190, 32.4)]
        self.u[row_ids] = new_rows

        # Spot-check one field from each replaced row.
        value_b = self.u[0]['b']
        self.assertEqual(value_b, 20.2)
        value_a = self.u[1]['a']
        self.assertEqual(value_a, 190)

    # def test_set_item03(self):
    #     self.u[[0, 1]]['a'] = 40 # makes a copy; u is not modified
    #     self.assertEqual(self.u[0]['a'], 40)

    # def test_set_item04(self):
    #     self.u[0]['a'] = 14  # makes a copy; u is not modified
    #     self.assertEqual(self.u[0]['a'], 14)

    def test_str01(self):
        """Compare ``u.__str__()`` with the expected header and two-row grid."""
        # The fragments are joined with no separator, exactly as adjacent
        # string literals would be.
        # NOTE(review): presumably ``assert_string_equal`` ignores line breaks
        # and whitespace differences -- confirm against its definition.
        expected = "".join([
            "u(a: int32, b: float64)",
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00",
            "+---+-------+",
            "| a |     b |",
            "+---+-------+",
            "| 1 | 1.100 |",
            "| 2 | 2.200 |",
            "+---+-------+",
        ])
        rendered = self.u.__str__()
        self.assert_string_equal(rendered, expected)

    def test_str02(self):
        """Compare ``u.__str__(head=20)`` with the expected two-row grid."""
        # The fragments are joined with no separator, exactly as adjacent
        # string literals would be.
        expected = "".join([
            "u(a: int32, b: float64)",
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00",
            "+---+-------+",
            "| a |     b |",
            "+---+-------+",
            "| 1 | 1.100 |",
            "| 2 | 2.200 |",
            "+---+-------+",
        ])
        rendered = self.u.__str__(head=20)
        self.assert_string_equal(rendered, expected)

    def test_str03(self):
        """Set the format of column 'b' to "%.1f" and check the rendered grid."""
        # The fragments are joined with no separator, exactly as adjacent
        # string literals would be.
        expected = "".join([
            "u(a: int32, b: float64)",
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00",
            "+---+-----+",
            "| a |   b |",
            "+---+-----+",
            "| 1 | 1.1 |",
            "| 2 | 2.2 |",
            "+---+-----+",
        ])
        # One decimal place for 'b' instead of the three expected by the
        # other rendering tests.
        column_b = self.u.get_column("b")
        column_b.format = "%.1f"
        rendered = self.u.__str__(head=20)
        self.assert_string_equal(rendered, expected)

    def test_head01(self):
        """Compare ``u.head(1)`` with a grid showing row 1 followed by a "..." row."""
        # The fragments are joined with no separator, exactly as adjacent
        # string literals would be.
        expected = "".join([
            "u(a: int32, b: float64)",
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00",
            "+-----+-------+",
            "| a   |     b |",
            "+-----+-------+",
            "|   1 | 1.100 |",
            "| ... |   ... |",
            "+-----+-------+",
        ])
        rendered = self.u.head(1)
        self.assert_string_equal(rendered, expected)

    def test_tail01(self):
        """Compare ``u.tail(1)`` with a grid showing a "..." row followed by row 2."""
        # The fragments are joined with no separator, exactly as adjacent
        # string literals would be.
        expected = "".join([
            "u(a: int32, b: float64)",
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00",
            "+-----+-------+",
            "| a   |     b |",
            "+-----+-------+",
            "| ... |   ... |",
            "|   2 | 2.200 |",
            "+-----+-------+",
        ])
        rendered = self.u.tail(1)
        self.assert_string_equal(rendered, expected)

    def test_rebuild01(self):
        """Load Category.tsv, rebuild its three columns as int8 and check the result."""
        csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
        cat = Table.from_csv(
            "Category", self.ds, csv_path,
            delimiter='\t',
            usecols=['CategoryID', 'ParentCategoryID', 'Level'],
            verbose=False)

        int8_columns = ("CategoryID", "Level", "ParentCategoryID")
        cat.rebuild({name: np.int8 for name in int8_columns})

        self.assertEqual(len(cat[:]), 69)
        self.assertEqual(cat['CategoryID'].dtype, np.int8)

        # Row 0 is expected to hold int8.min (-128) in every column.
        # NOTE(review): presumably this is the NA row added by ``rebuild`` --
        # confirm against the Table implementation.
        for name in int8_columns:
            self.assertEqual(cat[0][name], -128)

    @raises(DazzleError)
    def test_rebuild02(self):
        """Rebuilding the Category columns as uint8 must raise DazzleError."""
        csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
        cat = Table.from_csv(
            "Category", self.ds, csv_path,
            delimiter='\t',
            usecols=['CategoryID', 'ParentCategoryID', 'Level'],
            verbose=False)

        uint8_columns = ("CategoryID", "Level", "ParentCategoryID")
        cat.rebuild({name: np.uint8 for name in uint8_columns})

    def test_add_join_column(self):
        """Join column 'c' of table ``t`` into table ``u`` through the key column 'a'.

        Builds ``t`` (columns 'a', 'c') and ``u`` (columns 'a', 'y'), rebuilds
        both with small integer dtypes, adds a reference column from ``u.a`` to
        ``t.a`` and then a join column "result" that follows ``u.a_ref`` to
        ``t.c``. The final comparison uses ``assert_array_equal`` so that it
        is not stripped when Python runs with ``-O`` and so that a failure
        reports the mismatching elements.
        """
        ds = DataSet("/temp/dazzle-test", force_create=True)

        t_keys = np.array([10, 2, 3, 5, 4, 7, 1, 8, 6, 9])
        t_values = np.array([100, 20, 30, 50, 40, 70, 10, 80, 60, np.nan])
        t = Table("t", ds, [('a', t_keys), ('c', t_values)])

        a_ref = np.array([1, 5, 4, 5, 6, 4, 1, 1, 9, 7, 8, 4, 5, 5, 2, 2, 8, 5, 4, 20])
        u = Table("u", ds, [('a', a_ref), ("y", a_ref * 10)])

        u.get_column("a").ref_column = t.get_column("a")
        t.rebuild({'a': np.int8, 'c': np.int8})
        u.rebuild({'a': np.int8, 'y': np.int16})

        # Columns are looked up again after the rebuilds rather than reusing
        # objects fetched before them.
        u.add_reference_column(u.get_column("a"), t.get_column("a"))
        u.add_join_column("result", [u.get_column("a_ref"), t.get_column("c")])

        # NOTE(review): 21 expected values for 20 input rows, and -128
        # (int8.min) at the first position, where the 'c' value was NaN and
        # where key 20 has no match in ``t`` -- presumably ``rebuild``
        # prepends an NA row; confirm against the Table implementation.
        expected = [-128, 10, 50, 40, 50, 60, 40, 10, 10, -128, 70, 80, 40, 50, 50, 20, 20, 80, 50, 40, -128]
        assert_array_equal(u['result'], expected)
Exemple #21
0
 def test_sum11(self):
     """Check that ``sum(skipna=True)`` is 0 for an int8 column holding only int8.min."""
     ds = DataSet(self.test_dir, force_create=True)
     # NOTE(review): int8.min is presumably the library's NA marker for int8
     # (the original local was named ``nan``) -- confirm.
     int8_min = np.iinfo(np.int8).min
     values = np.array([int8_min, int8_min], np.int8)
     table = Table("t", ds, [("a", values)], force_create=True)
     ca = table.get_column("a")
     self.assertEqual(ca.sum(skipna=True), 0)
Exemple #22
0
 def test_max05(self):
     """Check that ``max(skipna=True)`` is 9 for a float column containing one NaN."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin ``float`` replaces the ``np.float`` alias (the same type),
     # which NumPy deprecated in 1.20 and removed in 1.24.
     values = np.array([6, 4, np.nan, 4, 6, 9], float)
     ca = Table("t", ds, [("a", values)], force_create=True).get_column("a")
     assert_close(ca.max(skipna=True), 9)
Exemple #23
0
 def test_min02(self):
     """Check that ``min(skipna=False)`` of an empty integer column satisfies ``ca.isnan``."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin ``int`` replaces the ``np.int`` alias (the same type), which
     # NumPy deprecated in 1.20 and removed in 1.24.
     values = np.array([], int)
     ca = Table("t", ds, [("a", values)], force_create=True).get_column("a")
     self.assertTrue(ca.isnan(ca.min(skipna=False)))
Exemple #24
0
 def test_max09(self):
     """Check that ``max(skipna=True)`` is 3 for the int8 values ``[3, int8.min, 2]``."""
     ds = DataSet(self.test_dir, force_create=True)
     # NOTE(review): int8.min is presumably the library's NA marker for int8
     # (the original local was named ``nan``) -- confirm.
     int8_min = np.iinfo(np.int8).min
     values = np.array([3, int8_min, 2], np.int8)
     table = Table("t", ds, [("a", values)], force_create=True)
     ca = table.get_column("a")
     self.assertEqual(ca.max(skipna=True), 3)
Exemple #25
0
 def test_max04(self):
     """Check that ``max(skipna=False)`` is 9 for an integer column without missing values."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin ``int`` replaces the ``np.int`` alias (the same type), which
     # NumPy deprecated in 1.20 and removed in 1.24.
     # NOTE(review): the array is passed wrapped in a one-element list; this
     # is kept as in the original -- confirm that the wrapping is intentional.
     values = [np.array([6, 4, 7, 4, 6, 9], int)]
     ca = Table("t", ds, [("a", values)], force_create=True).get_column("a")
     assert_close(ca.max(skipna=False), 9)
Exemple #26
0
 def test_copy01(self):
     """Call ``Table.copy`` for table "t" with ``self.ds`` and a fixed directory path."""
     # NOTE(review): nothing is asserted; the test only requires that the
     # call completes without raising.
     destination = "/temp/dazzle-test"
     Table.copy("t", self.ds, destination)
Exemple #27
0
 def test_max08(self):
     """Check that ``max(skipna=False)`` of an all-NaN float column satisfies ``ca.isnan``."""
     ds = DataSet(self.test_dir, force_create=True)
     # Builtin ``float`` replaces the ``np.float`` alias (the same type),
     # which NumPy deprecated in 1.20 and removed in 1.24.
     values = np.array([np.nan, np.nan], float)
     ca = Table("t", ds, [("a", values)], force_create=True).get_column("a")
     self.assertTrue(ca.isnan(ca.max(skipna=False)))
Exemple #28
0
 def test_rebuild02(self):
     """Load Category.tsv and rebuild its three columns as uint8."""
     # NOTE(review): this copy carries no ``@raises`` decorator and makes no
     # assertion; if ``rebuild`` is expected to reject uint8, the expected
     # exception should be declared -- confirm.
     csv_path = os.path.join(AVITO_DATA_DIR, "Category.tsv")
     cat = Table.from_csv(
         "Category", self.ds, csv_path,
         delimiter='\t',
         usecols=['CategoryID', 'ParentCategoryID', 'Level'],
         verbose=False)

     uint8_columns = ("CategoryID", "Level", "ParentCategoryID")
     cat.rebuild({name: np.uint8 for name in uint8_columns})
Exemple #29
0
 def test_max11(self):
     """Check that ``max(skipna=False)`` of an int8 column holding only int8.min satisfies ``ca.isnan``."""
     ds = DataSet(self.test_dir, force_create=True)
     # NOTE(review): int8.min is presumably the library's NA marker for int8
     # (the original local was named ``nan``) -- confirm.
     int8_min = np.iinfo(np.int8).min
     values = np.array([int8_min, int8_min], np.int8)
     table = Table("t", ds, [("a", values)], force_create=True)
     ca = table.get_column("a")
     largest = ca.max(skipna=False)
     self.assertTrue(ca.isnan(largest))
Exemple #30
0
    def test_add_join_column(self):
        """Join column 'c' of table ``t`` into table ``u`` through the key column 'a'.

        Builds ``t`` (columns 'a', 'c') and ``u`` (columns 'a', 'y'), rebuilds
        both with small integer dtypes, adds a reference column from ``u.a`` to
        ``t.a`` and then a join column "result" that follows ``u.a_ref`` to
        ``t.c``.
        """
        ds = DataSet("/temp/dazzle-test", force_create=True)

        t_keys = np.array([10, 2, 3, 5, 4, 7, 1, 8, 6, 9])
        t_values = np.array([100, 20, 30, 50, 40, 70, 10, 80, 60, np.nan])
        t = Table("t", ds, [('a', t_keys), ('c', t_values)])

        a_ref = np.array([1, 5, 4, 5, 6, 4, 1, 1, 9, 7, 8, 4, 5, 5, 2, 2, 8, 5, 4, 20])
        u = Table("u", ds, [('a', a_ref), ("y", a_ref * 10)])

        u.get_column("a").ref_column = t.get_column("a")
        t.rebuild({'a': np.int8, 'c': np.int8})
        u.rebuild({'a': np.int8, 'y': np.int16})

        # Columns are looked up again after the rebuilds rather than reusing
        # objects fetched before them.
        source_column = u.get_column("a")
        target_column = t.get_column("a")
        u.add_reference_column(source_column, target_column)

        join_path = [u.get_column("a_ref"), t.get_column("c")]
        u.add_join_column("result", join_path)

        # NOTE(review): 21 expected values for 20 input rows, and -128
        # (int8.min) at the first position, where the 'c' value was NaN and
        # where key 20 has no match in ``t`` -- presumably ``rebuild``
        # prepends an NA row; confirm against the Table implementation.
        expected = [-128, 10, 50, 40, 50, 60, 40, 10, 10, -128, 70, 80, 40, 50, 50, 20, 20, 80, 50, 40, -128]
        assert np.array_equal(u['result'], expected)