def test_join_multiple2(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_join_multiple2")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    where = {"bucket": ("==", "bu###1")}
    n1 = db.JoinTreeNode("profile_QSSH", where=where,
                         parent_key="query", key="query")
    n2 = db.JoinTreeNode("url_QSSH", where=where,
                         parent_key=('url', 'pos'), key=('url', 'pos'))
    n1.append(n2)

    sql, fields = db.inner_joins(n1, execute=False, create_index=False)
    view = db.execute_view(sql)
    assert view == [('facebbooklogin', 1, 0, 'bu###1', 86, 0,
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     1, 0, 1, 1, 0, 0, 0, 0)]
    assert "WHERE" in sql
    db.close()

def test_unicode(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__", LogFile="temp_hal_log2.txt")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_unicode")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    file = os.path.join(os.path.split(__file__)[0], "data", "unicode.txt")
    assert os.path.exists(file)

    def filter_case(s):
        return s.replace(" ", "")

    db.import_table_from_flat_file(
        file, "uni", columns=None, header=True, filter_case=filter_case)
    sql = "select * from uni limit 2"
    view = db.execute_view(sql)
    exp = [
        [(b'.\u904a\u6232\u6a5f\u5730'.decode("utf8"),
          b'.\u904a\u6232\u57fa\u5730'.decode("utf8"),
          0.66666666666700003, 4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0, 0,
          0.25, 1, 999999999, 999999999, 0, 1, 3, 1, 0.66666666666700003),
         (b'0401\u5f71\u97f3live\u79c0'.decode("utf8"),
          b'0401\u5f71\u97f3\u8996\u8a0a'.decode("utf8"),
          0.41666666666699997, 5, 12, 0.0, 0.45454545454500001, 5,
          0.27272727272699998, 1, 0.5, 4, 0.5, 2, 999999999, 999999999,
          0, 1, 2, 0, 1.0)],
        [(b'.\xe9\x81\x8a\xe6\x88\xb2\xe6\xa9\x9f\xe5\x9c\xb0'.decode("utf8"),
          b'.\xe9\x81\x8a\xe6\x88\xb2\xe5\x9f\xba\xe5\x9c\xb0'.decode("utf8"),
          0.666666666666667, 4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0, 0,
          0.25, 1, 999999999, 999999999, 0, 1, 3, 1, 0.666666666666667),
         (b'0401\xe5\xbd\xb1\xe9\x9f\xb3live\xe7\xa7\x80'.decode("utf8"),
          b'0401\xe5\xbd\xb1\xe9\x9f\xb3\xe8\xa6\x96\xe8\xa8\x8a'.decode("utf8"),
          0.416666666666667, 5, 12, 0.0, 0.454545454545455, 5,
          0.272727272727273, 1, 0.5, 4, 0.5, 2, 999999999, 999999999,
          0, 1, 2, 0, 1.0)],
    ]
    if view not in exp:
        print("2", str(view).encode("utf8"))
        i = 0
        for a, b in zip(exp[1], view):
            if a != b:
                print("i", i)
                if isinstance(a, tuple):
                    for c, d in zip(a, b):
                        print("1", c)
                        print("2", d)
                else:
                    print("1", a)
                    print("2", b)
            i += 1
        raise Exception("problem")
    db.close()

def test_join_multiple3(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_join_multiple3")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    where = {"bucket": ("==", "bu###1")}
    root = db.JoinTreeNode("query_QSSH", where=where)
    n1 = db.JoinTreeNode("profile_QSSH", where=where,
                         parent_key="query", key="query")
    n2 = db.JoinTreeNode("url_QSSH", where=where,
                         parent_key=('url', 'pos'), key=('url', 'pos'))
    root.append(n1)
    n1.append(n2)

    sql, fields = db.inner_joins(root, execute=False, create_index=False)
    view = db.execute_view(sql)
    assert view == [('facebbooklogin', 'bu###1', 86, 157, 520, 0, 63, 0, 503,
                     0, 619, 1, 3906365, 'facebbooklogin', 1, 0, 'bu###1',
                     86, 0,
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     1, 0, 1, 1, 0, 0, 0, 0)]
    assert fields == [('query', 'query_QSSH', 'query'),
                      ('bucket', 'query_QSSH', 'bucket'),
                      ('nbq', 'query_QSSH', 'nbq'),
                      ('sum_num', 'query_QSSH', 'sum_num'),
                      ('sum_view_url', 'query_QSSH', 'sum_view_url'),
                      ('sum_click_url', 'query_QSSH', 'sum_click_url'),
                      ('sum_rewrite', 'query_QSSH', 'sum_rewrite'),
                      ('sum_click_ads', 'query_QSSH', 'sum_click_ads'),
                      ('sum_max_pos_view', 'query_QSSH', 'sum_max_pos_view'),
                      ('sum_max_pos_click', 'query_QSSH', 'sum_max_pos_click'),
                      ('sum_duration', 'query_QSSH', 'sum_duration'),
                      ('sum_unknown', 'query_QSSH', 'sum_unknown'),
                      ('sum_daysec', 'query_QSSH', 'sum_daysec'),
                      ('aquery', 'profile_QSSH', 'query'),
                      ('apos', 'profile_QSSH', 'pos'),
                      ('atype', 'profile_QSSH', 'type'),
                      ('abucket', 'profile_QSSH', 'bucket'),
                      ('amax_nb', 'profile_QSSH', 'max_nb'),
                      ('asum_difftime', 'profile_QSSH', 'sum_difftime'),
                      ('aurl', 'profile_QSSH', 'url'),
                      ('aaurl', 'url_QSSH', 'url'),
                      ('aapos', 'url_QSSH', 'pos'),
                      ('aaco', 'url_QSSH', 'co'),
                      ('aanb_view', 'url_QSSH', 'nb_view'),
                      ('aasum_nb_view', 'url_QSSH', 'sum_nb_view'),
                      ('aasum_difftime_view', 'url_QSSH', 'sum_difftime_view'),
                      ('aanb_click', 'url_QSSH', 'nb_click'),
                      ('aasum_nb_click', 'url_QSSH', 'sum_nb_click'),
                      ('aasum_difftime_click', 'url_QSSH', 'sum_difftime_click')]
    assert "WHERE" in sql
    db.close()

def test_summary(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_summary")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    res = db.summary()
    assert len(res) > 0
    db.close()

def prepare_cresus_data(dbfile, outfold=None, fLOG=fLOG):
    """
    Prepare the data for the challenge.

    @param      dbfile      database file
    @param      outfold     output folder
    @param      fLOG        logging function
    @return                 dictionary of table files
    """
    db = Database(dbfile)
    db.connect()
    if outfold is None:
        outfold = "."

    remove_column = ['nom', 'prenom', 'tel_fixe', 'tel_mobile',
                     'email', 'adresse', 'rdv1', 'rdv2', 'rdv3',
                     'fichier_suivi', 'fichier_suivi2', 'media',
                     'indicateur_suivi', 'memo', 'num_dossier',
                     'etat_old', 'orientation_old',
                     'indicateur_suivi_old', 'transfert', 'plan_bdf',
                     'effacement_dett', 'etat',
                     # 'tel_fixe', 'tel_port',
                     ]
    new_mapping = {'': 'nul1', None: 'nul2',
                   'Sur-endettement': 'Surendettement', '0': 'nul'}

    res = {}
    tables = db.get_table_list()
    for table in tables:
        fLOG("[prepare_cresus_data] exporting", table)
        df = pandas.read_sql("select * from " + table, db._connection)
        cols = [_ for _ in df.columns if _ not in remove_column]
        cols.sort()
        if "orientation" in cols:
            cols = [_ for _ in cols if _ not in ("orientation", "nature")]
            cols += ["orientation", "nature"]
            df["nature"] = df.nature.apply(
                lambda x: new_mapping.get(x, x).replace("é", "e").lower())
            fLOG(set(df["nature"]))
        df = df[cols]
        name = os.path.join(outfold, "tbl_" + table + ".txt")
        df.to_csv(name, sep="\t", encoding="utf-8", index=False)
        res[table] = name
    db.close()
    return res

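# A minimal usage sketch for prepare_cresus_data (illustrative only): the
# database path "cresus.db3" and the output folder "cresus_export" below are
# assumptions, not files shipped with this repository.
def example_prepare_cresus_data(dbfile="cresus.db3", outfold="cresus_export"):
    "Exports every table of *dbfile* into tab-separated files in *outfold*."
    if not os.path.exists(outfold):
        os.mkdir(outfold)
    exported = prepare_cresus_data(dbfile, outfold=outfold)
    for table, path in sorted(exported.items()):
        fLOG("exported", table, "->", path)
    return exported
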
def test_histogram(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_histogram")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()

    sql = db.histogram("url_QRW2", col_sums=["sum_nb_click"],
                       columns=("pos", "url"))
    view = db.execute_view(sql)
    assert len(view) == 38216

    sql = db.histogram("url_QRW2", col_sums=["sum_nb_click"], columns="url")
    view = db.execute_view(sql)
    assert len(view) == 28436

    sql = db.histogram("url_QRW2", col_sums=["sum_nb_click"],
                       columns="pos", values=[1, 2, 3, 4, 5])
    view = db.execute_view(sql)
    assert view == [(1, 2370, 87049), (2, 5734, 11522), (3, 4009, 5383),
                    (4, 4304, 1778), (5, 21799, 3588)]

    sql = db.histogram("url_QRW2", col_sums=["sum_nb_click"],
                       columns="pos",
                       values={"pos123": [1, 2, 3],
                               "others": [4, 5, 6, 7, 8, 9, 10]})
    view = db.execute_view(sql)
    assert view == [('none', 21, 0), ('others', 26082, 5366),
                    ('pos123', 12113, 103954)]
    db.close()

def test_join_multiple2bis(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_join_multiple2bis")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()

    # where clause given as a (table, column) key
    where = {("profile_QSSH", "bucket"): ("==", "bu###1")}
    n1 = db.JoinTreeNode("profile_QSSH", where=where,
                         parent_key="query", key="query")
    n2 = db.JoinTreeNode("url_QSSH", where=where,
                         parent_key=('url', 'pos'), key=('url', 'pos'))
    n1.append(n2)

    sql, fields = db.inner_joins(n1, execute=False, create_index=False)
    view = db.execute_view(sql)
    assert "WHERE" in sql
    assert view == [('facebbooklogin', 1, 0, 'bu###1', 86, 0,
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     1, 0, 1, 1, 0, 0, 0, 0)]

    # where clause given as a "table.column" key
    where = {"profile_QSSH.bucket": ("==", "bu###1")}
    n1 = db.JoinTreeNode("profile_QSSH", where=where,
                         parent_key="query", key="query")
    n2 = db.JoinTreeNode("url_QSSH", where=where,
                         parent_key=('url', 'pos'), key=('url', 'pos'))
    n1.append(n2)

    sql, fields = db.inner_joins(n1, execute=False, create_index=False)
    view = db.execute_view(sql)
    assert "WHERE" in sql
    assert view == [('facebbooklogin', 1, 0, 'bu###1', 86, 0,
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                     1, 0, 1, 1, 0, 0, 0, 0)]
    db.close()

def test_histogram2(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_histogram2")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    sql = db.histogram("url_QRW2",
                       values={"cat1": [(1, 1), (1, 0)],
                               "cat2": [(1, 10), (2, 10), (2, 1)]},
                       col_sums=["sum_nb_click"],
                       columns=("pos", "co"))
    view = db.execute_view(sql)
    assert view == [('cat1', 1115, 15), ('cat2', 3792, 411),
                    ('none', 33309, 108894)]
    db.close()

def test_attach_database(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__", LogFile="temp_hal_log2.txt")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_attach_database")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    file2 = os.path.join(os.path.split(__file__)[0],
                         "data", "database_linked_cor.zip")
    temp = get_temp_folder(__file__, "temp_attach_database")
    file2 = unzip(file2, temp)
    assert os.path.exists(file2)

    # method 1
    attach = {"seco": file2}
    db = Database(filename, attach=attach, LOG=fLOG)
    db.connect()
    sql = "SELECT COUNT(*) FROM seco.word_QSSH"
    vie = db.execute_view(sql)
    self.assertEqual(len(vie), 1)
    db.close()

    # method 2
    all = filename + " ; seco , " + file2
    db = Database(all, LOG=fLOG)
    db.connect()
    sql = "SELECT COUNT(*) FROM seco.word_QSSH"
    vi2 = db.execute_view(sql)
    self.assertEqual(len(vi2), 1)
    self.assertEqual(vi2, vie)
    att = db.get_attached_database_list()
    self.assertEqual(att, ['seco'])
    ts = db.get_table_list(True)
    self.assertEqual(ts, ['seco.query', 'seco.idx_query_query', 'seco.qtok',
                          'seco.idx_qtok_qtok', 'seco.pairs',
                          'seco.pairs_query___q1', 'seco.bucket',
                          'seco.idx_bucket_bucket', 'seco.url',
                          'seco.idx_url_url', 'seco.profile',
                          'seco.profile_query___', 'seco.profile_QRW2',
                          'seco.profile_QRW2_query___', 'seco.profile_QSSH',
                          'seco.profile_QSSH_query___', 'seco.query_QRW2',
                          'seco.query_QRW2_query___', 'seco.query_QSSH',
                          'seco.query_QSSH_query___', 'seco.url_QRW2',
                          'seco.url_QRW2_url___', 'seco.url_QSSH',
                          'seco.url_QSSH_url___', 'seco.word',
                          'seco.idx_word_word', 'seco.word_QRW2',
                          'seco.word_QRW2_word___', 'seco.word_QSSH',
                          'seco.word_QSSH_word___'])
    assert "db.attach_database" in db.get_python_code()[1]
    assert db.has_table("seco.word_QSSH_word___")

    files = db.get_file(True)
    file = db.get_file()
    assert file in files
    for alias, file in db.get_attached_database_list(True):
        assert alias in files
        assert file in files

    db2 = Database(files, LOG=fLOG)
    db2.connect()
    db2.close()
    db.close()

def test_cross_1(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__", LogFile="temp_hal_log2.txt")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_cross_1")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()

    sql = """CROSS pos PLACE nb_click,url
             FROM url_QSSH
             ORDER BY nb_click -- comment
             DESC
             LIMIT 100"""
    cur = db.execute(sql)
    mat = list(cur)
    view = db.execute_view(sql, add_column_name=True)
    self.assertEqual(len(view), len(mat) + 1)
    self.assertEqual(view[0],
                     ['1;nb_click', '1;url', '2;nb_click', '2;url',
                      '3;nb_click', '3;url', '4;nb_click', '4;url',
                      '5;nb_click', '5;url', '6;nb_click', '6;url',
                      '7;nb_click', '7;url', '8;nb_click', '8;url',
                      '9;nb_click', '9;url', '10;nb_click', '10;url',
                      '11;nb_click', '11;url', '12;nb_click', '12;url',
                      '13;nb_click', '13;url', '14;nb_click', '14;url',
                      '15;nb_click', '15;url', '16;nb_click', '16;url',
                      '18;nb_click', '18;url'])
    self.assertEqual(len(view[0]), len(view[1]))

    exp = {
        1: [268.0, 'www.facebook.com/login.php',
            97.0, 'www.friendster.com/login.php',
            29.0, 'https://login.facebook.com/login.php',
            15.0, 'https://login.yahoo.com/',
            10.0, 'https://login.facebook.com/login.php',
            10.0, 'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
            13.0, 'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
            5.0, 'https://login.verizonwireless.com/amserver/UI/Login',
            6.0, 'https://login.facebook.com/login.php',
            2.0, 'https://login.comcast.net/',
            0.0, 'HottieMatchUp.com/matchcomlogin',
            0.0, 'FreshPCFix.com/loginscreen',
            0.0, 'FreshPCFix.com/loginscreen',
            0.0, 'login.marketingblacksmith.com',
            0.0, 'sites.managerslogin.com',
            0.0, 'sites.managerslogin.com',
            0.0, 'sites.managerslogin.com'],
        -1: [3.0, 'https://sso.uboc.com/obc/forms/login.fcc?user_type=R',
             3.0, 'https://www.atitesting.com/login.aspx',
             1.0, 'askabouttech.com/how-to-bypass-login-password-on-windows-vista/',
             1.0, 'homegrownfreaks.net/forums/login.html',
             1.0, 'https://trade.htsc.com.cn/webtrade/login/loginAction1.do?method=preLogin2&opType=TD',
             1.0, 'moodle.dist113.org/login/in/in.php?p=addicting+games+the+impossible+quiz',
             1.0, 'www.edweek.org/login.html?source=http://www.edweek.org/ew/articles/2009/08/25/02sat.h29.html&destina',
             1.0, 'www.gifs.net/image/Holidays/Birthday/Cake/9451?n=login.php3',
             1.0, 'www.grazeit.com/pages/blackplanet-com-login-1814/',
             1.0, 'www.myspace-login.us/',
             None, None, None, None, None, None, None,
             None, None, None, None, None, None, None],
    }
    for k, v in exp.items():
        if view[k] != v:
            if len(view[k]) != len(v):
                raise Exception(
                    "exp[%d] and view [%d] have different lengths" % (k, k))
            for a, b in zip(v, view[k]):
                if a != b:
                    raise Exception(
                        "k=%d, different values\nexp %s\n != %s" % (k, str(a), str(b)))
    self.assertEqual(len(view[0]), len(view[-1]))

    sql = """CROSS pos PLACE nb_click
             FROM url_QSSH
             ORDER BY nb_click -- comment
             DESC
             LIMIT 100"""
    cur = db.execute(sql)
    mat = list(cur)
    view = db.execute_view(sql, add_column_name=True)
    self.assertEqual(len(view), len(mat) + 1)
    self.assertEqual(len(view[0]), len(view[1]))

    sql = """CROSS pos,pos PLACE nb_click
             FROM url_QSSH
             ORDER BY nb_click -- comment
             DESC
             LIMIT 100"""
    cur = db.execute(sql)
    mat = list(cur)
    view = db.execute_view(sql, add_column_name=True)
    self.assertEqual(len(view), len(mat) + 1)
    self.assertEqual(len(view[0]), len(view[1]))
    db.close()

def test_join_bis(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    filename = os.path.join(os.path.split(__file__)[0],
                            "data", "database_linked.zip")
    temp = get_temp_folder(__file__, "temp_join_bis")
    filename = unzip(filename, temp)
    assert os.path.exists(filename)

    db = Database(filename, LOG=fLOG)
    db.connect()
    sql = "SELECT COUNT(*) FROM profile_QSSH"
    exe = db.execute_view(sql)
    assert exe[0][0] == 16

    sql, fields = db.inner_join("profile_QSSH", "url_QSSH", "url", None,
                                execute=False, create_index=False,
                                unique=False)
    sql = sql.strip(" \n\r\t")
    tep = TestDatabaseJoin._memo_SQL1.strip(" \n\r\t")
    if sql.replace(" ", "") != tep.replace(" ", ""):
        print(sql)
        raise Exception("sql queries should be identical")
    assert fields == [('query', 'query'),
                      ('profile_QSSH.pos', 'profile_QSSH_pos'),
                      ('type', 'type'), ('bucket', 'bucket'),
                      ('max_nb', 'max_nb'), ('sum_difftime', 'sum_difftime'),
                      ('profile_QSSH.url', 'url'),
                      ('url_QSSH.pos', 'url_QSSH_pos'), ('co', 'co'),
                      ('nb_view', 'nb_view'), ('sum_nb_view', 'sum_nb_view'),
                      ('sum_difftime_view', 'sum_difftime_view'),
                      ('nb_click', 'nb_click'),
                      ('sum_nb_click', 'sum_nb_click'),
                      ('sum_difftime_click', 'sum_difftime_click')]
    view = db.execute_view(sql)
    assert len(view) == 2

    sql, fields = db.inner_join("profile_QSSH", "url_QSSH", ("url", "pos"),
                                None, execute=False, create_index=False,
                                where="bucket == 'bu###1'")
    sql = sql.strip(" \n\r\t")
    tep = TestDatabaseJoin._memo_SQL2.strip(" \n\r\t")
    if sql.replace(" ", "") != tep.replace(" ", ""):
        for a, b in zip(sql.split("\n"), tep.split("\n")):
            print("res", a)
            print("exp", b)
            print(a == b)
    assert sql.replace(" ", "") == tep.replace(" ", "")
    assert fields == [('query', 'query'), ('profile_QSSH.pos', 'pos'),
                      ('type', 'type'), ('bucket', 'bucket'),
                      ('max_nb', 'max_nb'), ('sum_difftime', 'sum_difftime'),
                      ('profile_QSSH.url', 'url'), ('co', 'co'),
                      ('nb_view', 'nb_view'), ('sum_nb_view', 'sum_nb_view'),
                      ('sum_difftime_view', 'sum_difftime_view'),
                      ('nb_click', 'nb_click'),
                      ('sum_nb_click', 'sum_nb_click'),
                      ('sum_difftime_click', 'sum_difftime_click')]
    view = db.execute_view(sql)
    assert len(view) == 1
    db.close()
