def compute_category_similarity():
    ds = DataSet.open(dataset_dir)
    TrainSearchStream = ds.get_table("TrainSearchStream")
    AdsInfo = ds.get_table("AdsInfo")
    Category = ds.get_table("Category")
    SearchInfo = ds.get_table("SearchInfo")

    # Join columns describing the category of the displayed ad ...
    TrainSearchStream.add_join_column("AdCategoryID",
                                      [TrainSearchStream.get_column("AdID_ref"),
                                       AdsInfo.get_column("CategoryID_ref")])
    TrainSearchStream.add_join_column("AdCategoryLevel",
                                      [TrainSearchStream.get_column("AdID_ref"),
                                       AdsInfo.get_column("CategoryID_ref"),
                                       Category.get_column("Level")])
    TrainSearchStream.add_join_column("AdCategoryParentID",
                                      [TrainSearchStream.get_column("AdID_ref"),
                                       AdsInfo.get_column("CategoryID_ref"),
                                       Category.get_column("ParentCategoryID")])

    # ... and the category of the search it was displayed in
    TrainSearchStream.add_join_column("SearchCategoryID",
                                      [TrainSearchStream.get_column("SearchID_ref"),
                                       SearchInfo.get_column("CategoryID_ref")])
    TrainSearchStream.add_join_column("SearchCategoryLevel",
                                      [TrainSearchStream.get_column("SearchID_ref"),
                                       SearchInfo.get_column("CategoryID_ref"),
                                       Category.get_column("Level")])
    TrainSearchStream.add_join_column("SearchCategoryParentID",
                                      [TrainSearchStream.get_column("SearchID_ref"),
                                       SearchInfo.get_column("CategoryID_ref"),
                                       Category.get_column("ParentCategoryID")])

    print(TrainSearchStream)
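# A minimal sketch, in plain NumPy, of what the chained add_join_column() calls above amount
# to (the data below is made up for illustration): each *_ref column stores row indices into
# the next table, with 0 pointing at the NA head row, so a multi-hop join reduces to chained
# fancy indexing.
def _join_column_sketch():
    import numpy as np
    ad_id_ref = np.array([0, 2, 1, 3])        # TrainSearchStream.AdID_ref -> rows of AdsInfo
    category_id_ref = np.array([0, 2, 1, 1])  # AdsInfo.CategoryID_ref -> rows of Category
    level = np.array([-128, 1, 2], np.int8)   # Category.Level; row 0 is the NA head row

    # Equivalent of add_join_column("AdCategoryLevel", [AdID_ref, CategoryID_ref, Level]):
    ad_category_level = level[category_id_ref[ad_id_ref]]
    assert ad_category_level.tolist() == [-128, 1, 2, 1]  # null refs land on the NA row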
def test_sum():
    import time

    ds = DataSet.open(dataset_dir)
    tss = ds.get_table("TrainSearchStream")
    p = tss.get_column("Position")
    t = time.time()
    print(p.sum())
    print(time.time() - t)
def check_raw_dataset():
    # Expected output:
    #
    # Category(CategoryID: int32, Level: int8, ParentCategoryID: int8): 68 row(s) - compressed: 0.06 MB - comp. ratio: 0.01
    # TrainSearchStream(SearchID: int32, AdID: int32, Position: int8, ObjectType: int8, HistCTR: float32, IsClick: float32): 190,157,735 row(s) - compressed: 1479.89 MB - comp. ratio: 2.21
    # AdsInfo(AdID: int32, LocationID: float32, CategoryID: float32, Price: float32, IsContext: float32): 36,893,298 row(s) - compressed: 280.61 MB - comp. ratio: 2.51
    # SearchInfo(SearchID: int32, IPID: int32, UserID: float32, IsUserLoggedOn: float32, LocationID: float32, CategoryID: float32): 91,019,228 row(s) - compressed: 1043.73 MB - comp. ratio: 2.00
    # userInfo(UserID: int32, UserAgentID: int32, UserAgentOSID: int32, UserDeviceID: int32, UserAgentFamilyID: int32): 4,284,823 row(s) - compressed: 20.32 MB - comp. ratio: 4.02
    # Location(LocationID: int32, RegionID: float32, CityID: float32): 4,080 row(s) - compressed: 0.38 MB - comp. ratio: 0.12
    # PhoneRequestsStream(UserID: int32, IPID: int32, AdID: int32, PhoneRequestDate: bytes168): 13,717,580 row(s) - compressed: 139.27 MB - comp. ratio: 3.10
    # VisitsStream(UserID: int32, IPID: int32, AdID: int32, ViewDate: bytes168): 286,821,375 row(s) - compressed: 2548.20 MB - comp. ratio: 3.54
    ds = DataSet.open(raw_dir)
    for table in ds.tables:
        print(table.short_descr())
class TestTable(unittest.TestCase):

    def assert_string_equal(self, s1, s2):
        return self.assertEqual(''.join(s1.split()), ''.join(s2.split()))

    def assert_table_content(self, table, to_check):
        for check, val in to_check.items():
            if check == 'data_dir':
                self.assertEqual(table.data_dir, val)
            elif check == 'len':
                self.assertEqual(len(table.ctable), val)
            elif check == 'type':
                self.assertEqual(type(table), val)
            elif check == 'columns':
                index = 0
                for col_name, attrs in val:
                    self.assert_column_content(table, col_name, index, attrs)
                    index += 1
            else:
                raise DazzleError("Invalid key: %s" % check)

    def assert_column_content(self, table, col_name, index, to_check):
        self.assertTrue(isinstance(table._columns[index], LiteralColumn))
        col = table._columns[index]
        self.assertTrue(col._table == table)
        self.assertTrue(col._name == col_name)
        self.assertTrue(table.ctable.names[index] == col_name)
        bz_col = table.ctable.cols._cols[col_name]
        self.assertEqual(col.carray, bz_col)
        self.assertTrue(isinstance(bz_col, bcolz.carray))
        for check, val in to_check.items():
            if check == 'len':
                self.assertEqual(bz_col.len, val)
            elif check == 'content':
                assert_array_equal(bz_col[:], val)
            elif check == 'type':
                self.assertEqual(col.dtype, val)
            else:
                raise DazzleError("Invalid key: %s" % check)

    def setUp(self):
        self.a = [6, 4, 7, 4, 6, 9]
        self.test_dir = os.path.join("/temp", "dazzle-test")
        self.ds = DataSet(self.test_dir, force_create=True)
        self.t = Table("t", self.ds,
                       [("a", np.array([], np.int)), ("b", np.array([], np.float))],
                       force_create=True)
        self.u = Table("u", self.ds,
                       [("a", np.array([1, 2], np.int)), ("b", np.array([1.1, 2.2], np.float))],
                       force_create=True)

    def test_init01(self):
        self.assert_table_content(self.t, {
            'data_dir': os.path.join(self.test_dir, self.t._name),
            'len': 0,
            'type': Table,
            'columns': [('a', {'type': np.int, 'content': []})]})

    @raises(DazzleError)
    def test_init02(self):
        Table("_", self.ds, [("a", np.array([], np.int)), ("b", np.array([], np.float))],
              force_create=True)

    @raises(DazzleError)
    def test_init03(self):
        Table("t", self.ds, [("a", np.array([], np.int)), ("b", np.array([], np.float))],
              mode='open', force_create=True)

    @raises(DazzleError)
    def test_init04(self):
        Table("t", self.ds, [("a", np.array([], np.int)), ("b", np.array([], np.float))],
              force_create=True)

    @raises(ValueError)
    def test_init05(self):
        Table("t", self.ds, [("a", np.array([], np.int)), ("b", np.array([], np.float))],
              mode='open')

    @raises(ValueError)
    def test_init06(self):
        Table("t", self.ds, [{"a": np.array([], np.int)}], force_create=True)

    @raises(DazzleError)
    def test_init07(self):
        Table("t", self.ds, [], force_create=True)

    @raises(ValueError)
    def test_init08(self):
        Table("t", self.ds, [("a", 3)], force_create=True)

    @raises(ValueError)
    def test_init09(self):
        Table("t", self.ds, [{"a": np.array([True, False], np.bool)}], force_create=True)

    @raises(ValueError)
    def test_init10(self):
        Table("t", self.ds, ("a", np.array([], np.int)), force_create=True)

    @raises(ValueError)
    def test_init11(self):
        Table("t", self.ds, [("a", np.array([], np.int)), ("b", np.array([], np.float), 'oops')],
              force_create=True)

    @raises(DazzleError)
    def test_init13(self):
        Table("t", self.ds, [("a", np.array([], np.bool)), ("b", np.array([], np.float))],
              force_create=True)

    def test_init12(self):
        v = Table("v", self.ds, [("a", [3])])
        self.assert_table_content(v, {
            'data_dir': os.path.join(self.test_dir, "v"),
            'len': 1,
            'type': Table,
            'columns': [('a', {'type': np.int, 'content': [3]})]})

    def test_dataset01(self):
        self.assertEqual(self.ds, self.t.dataset)
    @raises(DazzleError)
    def test_data_dir01(self):
        """no table associated"""
        print(LiteralColumn("a", None).data_dir)

    @raises(DazzleError)
    def test_copy01(self):
        Table.copy("t", self.ds, "/temp/dazzle-test")

    @raises(DazzleError)
    def test_copy02(self):
        Table.copy("t", self.ds, "/bim/bam")

    @raises(DazzleError)
    def test_copy03(self):
        test_dir = os.path.join("/temp/dazzle-test2")
        ds2 = DataSet(test_dir, force_create=True)
        Table.copy("_", ds2, "/temp/dazzle-test/t")

    def test_copy04(self):
        test_dir = os.path.join("/temp/dazzle-test2")
        ds2 = DataSet(test_dir, force_create=True)
        t = Table.copy("t", ds2, "/temp/dazzle-test/t")
        assert_equal_table(t, self.ds.get_table("t"))

    @raises(FileNotFoundError)
    def test_from_csv01(self):
        Table.from_csv("Category", self.ds, "/bim/bam/test.csv",
                       usecols=['CategoryID', 'ParentCategoryID'], verbose=False)

    @raises(ValueError)
    def test_from_csv02(self):
        Table.from_csv("Category", self.ds, "/temp/dazzle-test/dataset.json",
                       usecols=['CategoryID', 'ParentCategoryID'], verbose=False)

    @raises(DazzleError)
    def test_from_csv03(self):
        cat = Table.from_csv("Category", self.ds, os.path.join(AVITO_DATA_DIR, "Category.tsv"),
                             verbose=False)

    def test_from_csv04(self):
        cat = Table.from_csv("Category", self.ds, os.path.join(AVITO_DATA_DIR, "Category.tsv"),
                             delimiter='\t', usecols=['CategoryID', 'ParentCategoryID'],
                             verbose=False)
        self.assertEqual(len(cat.ctable), 68)
        self.assertEqual(len(cat.columns), 2)

    def test_from_dataframe01(self):
        df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.]})
        v = Table.from_dataframe("v", self.ds, df)
        self.assertEqual(len(v.ctable), 2)

    def test_get_column01(self):
        self.assertTrue(self.t.get_column("x") is None)

    def test_get_column02(self):
        self.assertEqual(self.t.get_column("a").name, "a")

    @raises(ValueError)
    def test_remove_column01(self):
        self.t.remove_column("x")

    def test_remove_column02(self):
        self.t.remove_column("a")
        self.assertTrue(self.t.get_column("a") is None)
        self.assertEqual(self.t.columns[0], self.t.get_column("b"))
        self.assertEqual(self.t.ctable.names[0], "b")

    def test_to_dataframe01(self):
        self.assertEqual(len(self.u.to_dataframe()), 2)

    def test_append01(self):
        self.t.append({'a': [1, 2], 'b': [3., 4.]})
        self.assert_table_content(self.t, {
            'len': 2,
            'columns': [('a', {'content': [1, 2]}), ('b', {'content': [3., 4.]})]})

    def test_append02(self):
        self.t.append({'b': [3., 4.], 'a': [1, 2]})
        self.assert_table_content(self.t, {
            'len': 2,
            'columns': [('a', {'content': [1, 2]}), ('b', {'content': [3., 4.]})]})

    def test_append03(self):
        self.t.append({'a': [5.4, 2], 'b': [3., 4.]})

    @raises(ValueError)
    def test_append04(self):
        self.t.append({'a': ["bla", 2], 'b': [3., 4.]})

    @raises(ValueError)
    def test_append05(self):
        self.t.append({'a': [], 'b': [3., 4.]})

    @raises(ValueError)
    def test_append06(self):
        self.t.append({'a': []})

    @raises(ValueError)
    def test_append07(self):
        self.t.append([[], [3., 4.]])

    def test_get_item01(self):
        self.assertEqual(self.u[0]['a'], 1)
        self.assertEqual(self.u[0]['b'], 1.1)

    def test_get_item02(self):
        assert_array_equal(self.u['a'], np.array([1, 2]))

    @raises(IndexError)
    def test_get_item03(self):
        print(self.u[0, 1])

    def test_get_item04(self):
        assert_array_equal(self.u[[0, 1]]['a'], np.array([1, 2]))
        assert_array_equal(self.u[[0, 1]]['b'], np.array([1.1, 2.2]))

    def test_get_item05(self):
        assert_array_equal(self.u['a'][[0, 1]], np.array([1, 2]))
        assert_array_equal(self.u['b'][[0, 1]], np.array([1.1, 2.2]))

    def test_set_item01(self):
        self.u[0] = (10, 20.2)
        self.assertEqual(self.u[0]['a'], 10)
        self.assertEqual(self.u[0]['b'], 20.2)
    def test_set_item02(self):
        self.u[[0, 1]] = [(10, 20.2), (190, 32.4)]
        self.assertEqual(self.u[0]['b'], 20.2)
        self.assertEqual(self.u[1]['a'], 190)

    # def test_set_item03(self):
    #     self.u[[0, 1]]['a'] = 40  # makes a copy; u is not modified
    #     self.assertEqual(self.u[0]['a'], 40)

    # def test_set_item04(self):
    #     self.u[0]['a'] = 14  # makes a copy; u is not modified
    #     self.assertEqual(self.u[0]['a'], 14)

    def test_str01(self):
        s = \
            "u(a: int32, b: float64)" \
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00" \
            "+---+-------+" \
            "| a | b |" \
            "+---+-------+" \
            "| 1 | 1.100 |" \
            "| 2 | 2.200 |" \
            "+---+-------+"
        self.assert_string_equal(self.u.__str__(), s)

    def test_str02(self):
        s = \
            "u(a: int32, b: float64)" \
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00" \
            "+---+-------+" \
            "| a | b |" \
            "+---+-------+" \
            "| 1 | 1.100 |" \
            "| 2 | 2.200 |" \
            "+---+-------+"
        self.assert_string_equal(self.u.__str__(head=20), s)

    def test_str03(self):
        s = \
            "u(a: int32, b: float64)" \
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00" \
            "+---+-----+" \
            "| a | b |" \
            "+---+-----+" \
            "| 1 | 1.1 |" \
            "| 2 | 2.2 |" \
            "+---+-----+"
        self.u.get_column("b").format = "%.1f"
        self.assert_string_equal(self.u.__str__(head=20), s)

    def test_head01(self):
        s = \
            "u(a: int32, b: float64)" \
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00" \
            "+-----+-------+" \
            "| a | b |" \
            "+-----+-------+" \
            "| 1 | 1.100 |" \
            "| ... | ... |" \
            "+-----+-------+"
        self.assert_string_equal(self.u.head(1), s)

    def test_tail01(self):
        s = \
            "u(a: int32, b: float64)" \
            "2 row(s) - compressed: 2.00 MB - comp. ratio: 0.00" \
            "+-----+-------+" \
            "| a | b |" \
            "+-----+-------+" \
            "| ... | ... |" \
            "| 2 | 2.200 |" \
            "+-----+-------+"
        self.assert_string_equal(self.u.tail(1), s)

    def test_rebuild01(self):
        cat = Table.from_csv("Category", self.ds, os.path.join(AVITO_DATA_DIR, "Category.tsv"),
                             delimiter='\t', usecols=['CategoryID', 'ParentCategoryID', 'Level'],
                             verbose=False)
        cat.rebuild({"CategoryID": np.int8, "Level": np.int8, "ParentCategoryID": np.int8})
        self.assertEqual(len(cat[:]), 69)  # 68 rows + the NA head row
        self.assertEqual(cat['CategoryID'].dtype, np.int8)
        self.assertEqual(cat[0]['CategoryID'], -128)        # int8.min
        self.assertEqual(cat[0]['Level'], -128)             # int8.min
        self.assertEqual(cat[0]['ParentCategoryID'], -128)  # int8.min

    @raises(DazzleError)
    def test_rebuild02(self):
        cat = Table.from_csv("Category", self.ds, os.path.join(AVITO_DATA_DIR, "Category.tsv"),
                             delimiter='\t', usecols=['CategoryID', 'ParentCategoryID', 'Level'],
                             verbose=False)
        cat.rebuild({"CategoryID": np.uint8, "Level": np.uint8, "ParentCategoryID": np.uint8})

    def test_add_join_column(self):
        ds = DataSet("/temp/dazzle-test", force_create=True)
        t = Table("t", ds, [('a', np.array([10, 2, 3, 5, 4, 7, 1, 8, 6, 9])),
                            ('c', np.array([100, 20, 30, 50, 40, 70, 10, 80, 60, np.nan]))])
        a_ref = np.array([1, 5, 4, 5, 6, 4, 1, 1, 9, 7, 8, 4, 5, 5, 2, 2, 8, 5, 4, 20])
        u = Table("u", ds, [('a', a_ref), ("y", a_ref * 10)])
        u.get_column("a").ref_column = t.get_column("a")
        t.rebuild({'a': np.int8, 'c': np.int8})
        u.rebuild({'a': np.int8, 'y': np.int16})
        u.add_reference_column(u.get_column("a"), t.get_column("a"))
        # print(t.head(20))
        # print(u.head(30))
        u.add_join_column("result", [u.get_column("a_ref"), t.get_column("c")])
        # print(u.head(30))
        assert np.array_equal(u['result'],
                              [-128, 10, 50, 40, 50, 60, 40, 10, 10, -128, 70, 80, 40,
                               50, 50, 20, 20, 80, 50, 40, -128])
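    # A NumPy-only sketch of the join semantics exercised by test_add_join_column above,
    # using the same data: after rebuild(), both tables get a NA head row (int8 NA = -128),
    # a_ref maps each row of u to the matching row of t (0 when unmatched, e.g. a == 20),
    # and the joined column is a plain take() on t's "c" column. Helper names here are
    # hypothetical, not part of the dazzle API.
    @staticmethod
    def _add_join_column_sketch():
        t_a = np.array([-128, 10, 2, 3, 5, 4, 7, 1, 8, 6, 9], np.int8)           # head = NA
        t_c = np.array([-128, 100, 20, 30, 50, 40, 70, 10, 80, 60, -128], np.int8)  # nan -> -128
        u_a = np.array([-128, 1, 5, 4, 5, 6, 4, 1, 1, 9, 7, 8, 4, 5, 5, 2, 2, 8, 5, 4, 20],
                       np.int8)

        row_of = {v: i for i, v in enumerate(t_a) if i > 0}  # value of t.a -> row index in t
        a_ref = np.array([row_of.get(v, 0) for v in u_a])    # 0 = null / unmatched reference
        result = t_c[a_ref]                                  # the join itself is a take()

        expected = [-128, 10, 50, 40, 50, 60, 40, 10, 10, -128, 70, 80, 40,
                    50, 50, 20, 20, 80, 50, 40, -128]
        assert result.tolist() == expected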
def load_dataset():
    """The 'raw' dataset is the result of loading the CSV source data into dazzle tables,
    filtering out only the data that we don't want to process further. The function is
    written in a non-destructive way, so that it can be launched several times until the
    job is done.
    """
    import os
    from dazzle.core.dataset import DataSet

    if DataSet.exists(raw_dir):
        ds = DataSet.open(raw_dir)
    else:
        ds = DataSet(raw_dir, force_create=True)

    # Notes:
    # - Many of the following attributes should be unsigned int instead of signed int, but
    #   numexpr works only on signed data.
    # - Similarly to pandas, we use the types required to contain the existing data, not the
    #   types we would like to use.
    if ds.get_table("Category") is None:
        t = Table.from_csv("Category", ds, os.path.join(csv_dir, "Category.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['CategoryID', 'ParentCategoryID', 'Level'],
                           dtype={'CategoryID': 'i4', 'ParentCategoryID': 'i1', 'Level': 'i1'})
        t = None

    # Note the filter attribute, which has no equivalent in pandas.read_csv(). It makes it
    # possible to skip rows based on a numexpr expression. "IsClick == IsClick" is true iff
    # IsClick is not NA.
    if ds.get_table("TrainSearchStream") is None:
        t = Table.from_csv("TrainSearchStream", ds, os.path.join(csv_dir, "trainSearchStream.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['SearchID', 'AdID', 'Position', 'ObjectType', 'HistCTR', 'IsClick'],
                           dtype={'SearchID': 'i4', 'AdID': 'i4', 'Position': 'i1',
                                  'ObjectType': 'i1', 'HistCTR': 'f4', 'IsClick': 'f1'},
                           filter='(ObjectType == 3) & (IsClick == IsClick)')
        t = None

    # We avoid loading the string fields; we will come back to this problem later with Don.
    if ds.get_table("AdsInfo") is None:
        t = Table.from_csv("AdsInfo", ds, os.path.join(csv_dir, "AdsInfo.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['AdID', 'LocationID', 'CategoryID', 'Price', 'IsContext'],
                           dtype={'AdID': 'i4', 'LocationID': 'f4', 'CategoryID': 'f4',
                                  'Price': 'f4', 'IsContext': 'f1'})
        t = None

    # We avoid loading the string fields; we will come back to this problem later with Don.
    if ds.get_table("SearchInfo") is None:
        t = Table.from_csv("SearchInfo", ds, os.path.join(csv_dir, "SearchInfo.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['SearchID', 'IPID', 'UserID', 'IsUserLoggedOn', 'LocationID', 'CategoryID'],
                           dtype={'SearchID': 'i4', 'IPID': 'i4', 'UserID': 'f4',
                                  'IsUserLoggedOn': 'f1', 'LocationID': 'f4', 'CategoryID': 'f4'})
        t = None

    if ds.get_table("userInfo") is None:
        t = Table.from_csv("userInfo", ds, os.path.join(csv_dir, "userInfo.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['UserID', 'UserAgentID', 'UserAgentOSID', 'UserDeviceID', 'UserAgentFamilyID'],
                           dtype={'UserID': 'i4', 'UserAgentID': 'i4', 'UserAgentOSID': 'i4',
                                  'UserDeviceID': 'i4', 'UserAgentFamilyID': 'i4'})
        t = None

    if ds.get_table("Location") is None:
        t = Table.from_csv("Location", ds, os.path.join(csv_dir, "Location.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['LocationID', 'CityID', 'RegionID'],
                           dtype={'LocationID': 'i4', 'CityID': 'f4', 'RegionID': 'f4'})
        t = None

    if ds.get_table("PhoneRequestsStream") is None:
        t = Table.from_csv("PhoneRequestsStream", ds, os.path.join(csv_dir, "PhoneRequestsStream.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['UserID', 'IPID', 'AdID', 'PhoneRequestDate'],
                           dtype={'UserID': 'i4', 'IPID': 'i4', 'AdID': 'i4',
                                  'PhoneRequestDate': 'object'})
        t = None

    if ds.get_table("VisitsStream") is None:
        t = Table.from_csv("VisitsStream", ds, os.path.join(csv_dir, "VisitsStream.tsv"),
                           delimiter='\t', chunksize=10**7,
                           usecols=['UserID', 'IPID', 'AdID', 'ViewDate'],
                           dtype={'UserID': 'i4', 'IPID': 'i4', 'AdID': 'i4', 'ViewDate': 'object'})
        t = None

    return ds
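# The filter expression used for TrainSearchStream above relies on the IEEE-754 rule that
# NaN != NaN: "IsClick == IsClick" is False exactly on the rows where IsClick is missing.
# A minimal numexpr check of that behaviour (the arrays are made up for illustration):
def _filter_expression_sketch():
    import numexpr as ne
    import numpy as np
    ObjectType = np.array([3, 3, 1, 3], np.int8)
    IsClick = np.array([1.0, np.nan, 1.0, 0.0], np.float32)
    mask = ne.evaluate('(ObjectType == 3) & (IsClick == IsClick)')
    assert mask.tolist() == [True, False, False, True]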
def preprocess_dataset():
    # This step takes around 3 mins (6 mins if the dataset must be copied).
    #
    # Table.add_reference_column(), which uses pandas, is partly responsible for this. In
    # addition, it consumes a lot of RAM.

    # 1. Make a copy of the raw dataset, if this has not already been done: we don't want
    #    to reload the whole CSV stuff if something goes wrong
    if not DataSet.exists(dataset_dir):
        print("Copying dataset ...")
        raw_ds = DataSet.open(raw_dir)
        ds = raw_ds.copy(dataset_dir)  # almost 3 mins!
    else:
        ds = DataSet.open(dataset_dir)

    # 2. Rebuild each table. This means:
    #
    # 2.1 inserting a NA row at the head of each table. This is necessary because we use
    #     index=0 in each RefColumn to indicate a null reference
    #
    # 2.2 assigning the desired dtype to each column
    #
    # 2.3 setting the data in each column using that dtype
    #
    # 2.4 replacing numpy NA values by those of the corresponding column class (Ref/Literal)
    #     and dtype
    Category = ds.get_table("Category")
    Location = ds.get_table("Location")
    userInfo = ds.get_table("userInfo")
    AdsInfo = ds.get_table("AdsInfo")
    SearchInfo = ds.get_table("SearchInfo")
    TrainSearchStream = ds.get_table("TrainSearchStream")

    print("Re-building tables with given dtypes ...")
    Category.rebuild({"CategoryID": np.int32, "Level": np.int8, "ParentCategoryID": np.int32})
    Location.rebuild({"LocationID": np.int16, "RegionID": np.int8, "CityID": np.int16})
    userInfo.rebuild({"UserID": np.int32, "UserAgentID": np.int32, "UserAgentOSID": np.int8,
                      "UserDeviceID": np.int16, "UserAgentFamilyID": np.int8})
    AdsInfo.rebuild({"AdID": np.int32, "LocationID": np.int16, "CategoryID": np.int32,
                     "Price": np.float32, "IsContext": np.int8})
    SearchInfo.rebuild({"SearchID": np.int32, "IPID": np.int32, "UserID": np.int32,
                        "IsUserLoggedOn": np.int8, "LocationID": np.int16, "CategoryID": np.int32})
    TrainSearchStream.rebuild({"SearchID": np.int32, "AdID": np.int32, "Position": np.int8,
                               "ObjectType": np.int8, "HistCTR": np.float32, "IsClick": np.int8})

    # 3. Add references between columns: foreign keys (like LocationID in AdsInfo) are kept,
    #    but an additional column (xxx_ref) is added with the index of the row containing
    #    the referenced value
    print("Building references from AdsInfo ...")
    AdsInfo.add_reference_column(AdsInfo.get_column("LocationID"), Location.get_column("LocationID"))
    AdsInfo.add_reference_column(AdsInfo.get_column("CategoryID"), Category.get_column("CategoryID"))
    print(AdsInfo)

    print("Building references from SearchInfo ...")
    SearchInfo.add_reference_column(SearchInfo.get_column("UserID"), userInfo.get_column("UserID"))
    SearchInfo.add_reference_column(SearchInfo.get_column("LocationID"), Location.get_column("LocationID"))
    SearchInfo.add_reference_column(SearchInfo.get_column("CategoryID"), Category.get_column("CategoryID"))

    print("Building references from TrainSearchStream ...")
    TrainSearchStream.add_reference_column(TrainSearchStream.get_column("SearchID"),
                                           SearchInfo.get_column("SearchID"))
    TrainSearchStream.add_reference_column(TrainSearchStream.get_column("AdID"),
                                           AdsInfo.get_column("AdID"))
    print(TrainSearchStream)

    print("Done")
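# A rough sketch of what add_reference_column() computes, using pandas for the lookup as
# the comment above notes (column names and values are illustrative, not the actual
# implementation): for each foreign-key value, the row index of the referenced table where
# that key lives, with 0 (the NA head row) for unmatched keys.
def _reference_column_sketch():
    import numpy as np
    import pandas as pd
    location_id = np.array([-32768, 12, 7, 99], np.int16)  # Location.LocationID; head = NA
    fk = np.array([7, 12, 55], np.int16)                   # AdsInfo.LocationID (55 unmatched)

    lookup = pd.Series(np.arange(len(location_id)), index=location_id)
    location_id_ref = lookup.reindex(fk).fillna(0).astype(np.int64).values
    assert location_id_ref.tolist() == [2, 1, 0]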