def test_join_multiple2(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_join_multiple2")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        where = {"bucket": ("==", "bu###1")}
        n1 = db.JoinTreeNode("profile_QSSH", where=where,
                             parent_key="query", key="query")
        n2 = db.JoinTreeNode("url_QSSH", where=where,
                             parent_key=('url', 'pos'), key=('url', 'pos'))
        n1.append(n2)

        sql, fields = db.inner_joins(n1, execute=False, create_index=False)

        view = db.execute_view(sql)
        assert view == [('facebbooklogin', 1, 0, 'bu###1', 86, 0,
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         1, 0, 1, 1, 0, 0, 0, 0)]
        assert "WHERE" in sql

        db.close()
    def test_unicode(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ ==
             "__main__", LogFile="temp_hal_log2.txt")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_unicode")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        file = os.path.join(os.path.split(__file__)[0], "data", "unicode.txt")
        assert os.path.exists(file)

        def filter_case(s):
            return s.replace(" ", "")

        db.import_table_from_flat_file(
            file, "uni", columns=None, header=True, filter_case=filter_case)

        sql = "select * from uni limit 2"
        view = db.execute_view(sql)
        exp = [[(b'.\u904a\u6232\u6a5f\u5730'.decode("utf8"),
                 b'.\u904a\u6232\u57fa\u5730'.decode("utf8"),
                 0.66666666666700003, 4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0, 0,
                 0.25, 1, 999999999, 999999999, 0, 1, 3, 1,
                 0.66666666666700003),
                (b'0401\u5f71\u97f3live\u79c0'.decode("utf8"),
                 b'0401\u5f71\u97f3\u8996\u8a0a'.decode("utf8"),
                 0.41666666666699997, 5, 12, 0.0, 0.45454545454500001,
                 5, 0.27272727272699998, 1, 0.5, 4, 0.5, 2, 999999999,
                 999999999, 0, 1, 2, 0, 1.0)],
               [(b'.\xe9\x81\x8a\xe6\x88\xb2\xe6\xa9\x9f\xe5\x9c\xb0'.decode("utf8"),
                 b'.\xe9\x81\x8a\xe6\x88\xb2\xe5\x9f\xba\xe5\x9c\xb0'.decode(
                   "utf8"),
                   0.666666666666667, 4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0, 0, 0.25, 1, 999999999, 999999999, 0, 1, 3, 1, 0.666666666666667),
                (b'0401\xe5\xbd\xb1\xe9\x9f\xb3live\xe7\xa7\x80'.decode("utf8"),
                   b'0401\xe5\xbd\xb1\xe9\x9f\xb3\xe8\xa6\x96\xe8\xa8\x8a'.decode(
                    "utf8"),
                   0.416666666666667, 5, 12, 0.0, 0.454545454545455, 5, 0.272727272727273,
                   1, 0.5, 4, 0.5, 2, 999999999, 999999999, 0, 1, 2, 0, 1.0)],
               ]
        if view not in exp:
            print("2", str(view).encode("utf8"))
            i = 0
            for a, b in zip(exp[1], view):
                if a != b:
                    print("i", i)
                    if isinstance(a, tuple):
                        for c, d in zip(a, b):
                            print("1", c)
                            print("2", d)
                    else:
                        print("1", a)
                        print("2", b)
                i += 1
            raise Exception("problem")

        db.close()
    def test_join_multiple3(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_join_multiple3")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        where = {"bucket": ("==", "bu###1")}
        root = db.JoinTreeNode("query_QSSH", where=where)
        n1 = db.JoinTreeNode("profile_QSSH", where=where,
                             parent_key="query", key="query")
        n2 = db.JoinTreeNode("url_QSSH", where=where,
                             parent_key=('url', 'pos'), key=('url', 'pos'))
        root.append(n1)
        n1.append(n2)

        sql, fields = db.inner_joins(root,
                                     execute=False,
                                     create_index=False)

        view = db.execute_view(sql)
        assert view == [('facebbooklogin', 'bu###1', 86, 157, 520, 0, 63, 0, 503, 0, 619, 1, 3906365, 'facebbooklogin', 1, 0, 'bu###1', 86, 0,
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs', 1, 0, 1, 1, 0, 0, 0, 0)]
        assert fields == [('query', 'query_QSSH', 'query'), ('bucket', 'query_QSSH', 'bucket'),
                          ('nbq', 'query_QSSH', 'nbq'), ('sum_num',
                                                         'query_QSSH', 'sum_num'),
                          ('sum_view_url', 'query_QSSH', 'sum_view_url'),
                          ('sum_click_url', 'query_QSSH',
                           'sum_click_url'), ('sum_rewrite', 'query_QSSH', 'sum_rewrite'),
                          ('sum_click_ads', 'query_QSSH', 'sum_click_ads'), (
                              'sum_max_pos_view', 'query_QSSH', 'sum_max_pos_view'),
                          ('sum_max_pos_click', 'query_QSSH', 'sum_max_pos_click'), (
                              'sum_duration', 'query_QSSH', 'sum_duration'),
                          ('sum_unknown', 'query_QSSH',
                           'sum_unknown'), ('sum_daysec', 'query_QSSH', 'sum_daysec'),
                          ('aquery', 'profile_QSSH', 'query'),
                          ('apos', 'profile_QSSH', 'pos'), ('atype',
                                                            'profile_QSSH', 'type'),
                          ('abucket', 'profile_QSSH',
                           'bucket'), ('amax_nb', 'profile_QSSH', 'max_nb'),
                          ('asum_difftime', 'profile_QSSH',
                           'sum_difftime'), ('aurl', 'profile_QSSH', 'url'),
                          ('aaurl', 'url_QSSH', 'url'), ('aapos',
                                                         'url_QSSH', 'pos'), ('aaco', 'url_QSSH', 'co'),
                          ('aanb_view', 'url_QSSH', 'nb_view'), ('aasum_nb_view',
                                                                 'url_QSSH', 'sum_nb_view'),
                          ('aasum_difftime_view', 'url_QSSH',
                           'sum_difftime_view'), ('aanb_click', 'url_QSSH', 'nb_click'),
                          ('aasum_nb_click', 'url_QSSH', 'sum_nb_click'), ('aasum_difftime_click', 'url_QSSH', 'sum_difftime_click')]
        assert "WHERE" in sql
        db.close()
Exemple #4
0
    def test_summary(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_summary")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        res = db.summary()
        assert len(res) > 0
        db.close()
def prepare_cresus_data(dbfile, outfold=None, fLOG=fLOG):
    """
    Prepare the data for the challenge.

    @param      dbfile      database file
    @param      outfold     output folder
    @param      fLOG        logging function
    @return                 dictionary of table files
    """
    db = Database(dbfile)
    db.connect()

    if outfold is None:
        outfold = "."

    remove_column = ['nom', 'prenom',
                     'tel_fixe', 'tel_mobile', 'email',
                     'adresse', 'rdv1', 'rdv2', 'rdv3',
                     'fichier_suivi', 'fichier_suivi2', 'media',
                     'indicateur_suivi', 'memo', 'num_dossier',
                     'etat_old', 'orientation_old', 'indicateur_suivi_old',
                     'transfert', 'plan_bdf', 'effacement_dett', "etat",
                     #
                     'tel_fixe', 'tel_port',
                     ]

    new_mapping = {'': 'nul1', None: 'nul2',
                   'Sur-endettement': 'Surendettement', '0': 'nul'}

    res = {}
    tables = db.get_table_list()
    for table in tables:
        fLOG("[prepare_cresus_data] exporting", table)
        df = pandas.read_sql("select * from " + table, db._connection)
        cols = [_ for _ in df.columns if _ not in remove_column]
        cols.sort()
        if "orientation" in cols:
            cols = [_ for _ in cols if _ not in ("orientation", "nature")]
            cols += ["orientation", "nature"]
            df["nature"] = df.nature.apply(
                lambda x: new_mapping.get(x, x).replace("é", "e").lower())
            fLOG(set(df["nature"]))
        df = df[cols]
        name = os.path.join(outfold, "tbl_" + table + ".txt")
        df.to_csv(name, sep="\t", encoding="utf-8", index=False)
        res[table] = name
    db.close()
    return res
Exemple #6
0
    def test_histogram(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        filename = os.path.join(
            os.path.split(__file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_histogram")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns=("pos", "url"))
        view = db.execute_view(sql)
        assert len(view) == 38216

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="url")
        view = db.execute_view(sql)
        assert len(view) == 28436

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="pos",
                           values=[1, 2, 3, 4, 5])
        view = db.execute_view(sql)
        assert view == [(1, 2370, 87049), (2, 5734, 11522), (3, 4009, 5383),
                        (4, 4304, 1778), (5, 21799, 3588)]

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="pos",
                           values={
                               "pos123": [1, 2, 3],
                               "others": [4, 5, 6, 7, 8, 9, 10]
                           })
        view = db.execute_view(sql)
        assert view == [('none', 21, 0), ('others', 26082, 5366),
                        ('pos123', 12113, 103954)]

        db.close()
    def test_join_multiple2bis(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_join_multiple2bis")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        where = {("profile_QSSH", "bucket"): ("==", "bu###1")}
        n1 = db.JoinTreeNode("profile_QSSH", where=where,
                             parent_key="query", key="query")
        n2 = db.JoinTreeNode("url_QSSH", where=where,
                             parent_key=('url', 'pos'), key=('url', 'pos'))
        n1.append(n2)

        sql, fields = db.inner_joins(n1, execute=False, create_index=False)

        view = db.execute_view(sql)
        assert "WHERE" in sql
        assert view == [('facebbooklogin', 1, 0, 'bu###1', 86, 0,
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         1, 0, 1, 1, 0, 0, 0, 0)]

        where = {("profile_QSSH.bucket"): ("==", "bu###1")}
        n1 = db.JoinTreeNode("profile_QSSH", where=where,
                             parent_key="query", key="query")
        n2 = db.JoinTreeNode("url_QSSH", where=where,
                             parent_key=('url', 'pos'), key=('url', 'pos'))
        n1.append(n2)

        sql, fields = db.inner_joins(n1, execute=False, create_index=False)

        view = db.execute_view(sql)
        assert "WHERE" in sql
        assert view == [('facebbooklogin', 1, 0,
                         'bu###1', 86, 0,
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         'digg.com/security/Hackers_Put_Social_Networks_In_Crosshairs',
                         1, 0, 1, 1, 0, 0, 0, 0)]

        db.close()
    def test_histogram(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_histogram")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns=("pos", "url"))
        view = db.execute_view(sql)
        assert len(view) == 38216

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="url")
        view = db.execute_view(sql)
        assert len(view) == 28436

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="pos",
                           values=[1, 2, 3, 4, 5])
        view = db.execute_view(sql)
        assert view == [(1, 2370, 87049), (2, 5734, 11522),
                        (3, 4009, 5383), (4, 4304, 1778), (5, 21799, 3588)]

        sql = db.histogram("url_QRW2",
                           col_sums=["sum_nb_click"],
                           columns="pos",
                           values={"pos123": [1, 2, 3], "others": [4, 5, 6, 7, 8, 9, 10]})
        view = db.execute_view(sql)
        assert view == [('none', 21, 0), ('others', 26082,
                                          5366), ('pos123', 12113, 103954)]

        db.close()
    def test_histogram2(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_histogram2")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = db.histogram("url_QRW2",
                           values={"cat1": [(1, 1), (1, 0)], "cat2": [
                               (1, 10), (2, 10), (2, 1)]},
                           col_sums=["sum_nb_click"],
                           columns=("pos", "co"))
        view = db.execute_view(sql)
        assert view == [('cat1', 1115, 15), ('cat2', 3792,
                                             411), ('none', 33309, 108894)]

        db.close()
    def test_attach_database(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ ==
             "__main__", LogFile="temp_hal_log2.txt")

        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_attach_database")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        file2 = os.path.join(os.path.split(__file__)[
                             0], "data", "database_linked_cor.zip")
        temp = get_temp_folder(__file__, "temp_attach_database")
        file2 = unzip(file2, temp)
        assert os.path.exists(file2)

        # method 1
        attach = {"seco": file2}

        db = Database(filename, attach=attach, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM seco.word_QSSH"
        vie = db.execute_view(sql)
        self.assertEqual(len(vie), 1)
        db.close()

        # method 2
        all = filename + " ; seco , " + file2

        db = Database(all, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM seco.word_QSSH"
        vi2 = db.execute_view(sql)
        self.assertEqual(len(vi2), 1)
        self.assertEqual(vi2, vie)

        att = db.get_attached_database_list()
        self.assertEqual(att, ['seco'])
        ts = db.get_table_list(True)
        self.assertEqual(ts, ['seco.query',
                              'seco.idx_query_query', 'seco.qtok', 'seco.idx_qtok_qtok',
                              'seco.pairs', 'seco.pairs_query___q1', 'seco.bucket', 'seco.idx_bucket_bucket',
                              'seco.url', 'seco.idx_url_url', 'seco.profile', 'seco.profile_query___',
                              'seco.profile_QRW2', 'seco.profile_QRW2_query___', 'seco.profile_QSSH',
                              'seco.profile_QSSH_query___', 'seco.query_QRW2', 'seco.query_QRW2_query___',
                              'seco.query_QSSH', 'seco.query_QSSH_query___', 'seco.url_QRW2', 'seco.url_QRW2_url___',
                              'seco.url_QSSH', 'seco.url_QSSH_url___', 'seco.word', 'seco.idx_word_word',
                              'seco.word_QRW2', 'seco.word_QRW2_word___', 'seco.word_QSSH', 'seco.word_QSSH_word___'])

        assert "db.attach_database" in db.get_python_code()[1]
        assert db.has_table("seco.word_QSSH_word___")
        files = db.get_file(True)

        file = db.get_file()
        assert file in files
        for alias, file in db.get_attached_database_list(True):
            assert alias in files
            assert file in files

        db2 = Database(files, LOG=fLOG)
        db2.connect()
        db2.close()

        db.close()
    def test_cross_1(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ ==
             "__main__", LogFile="temp_hal_log2.txt")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_cross_1")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = """CROSS pos PLACE nb_click,url FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = list(cur)
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(view[0], ['1;nb_click', '1;url', '2;nb_click', '2;url', '3;nb_click', '3;url',
                                   '4;nb_click', '4;url', '5;nb_click', '5;url', '6;nb_click', '6;url', '7;nb_click',
                                   '7;url', '8;nb_click', '8;url', '9;nb_click',
                                   '9;url', '10;nb_click', '10;url', '11;nb_click', '11;url', '12;nb_click',
                                   '12;url', '13;nb_click', '13;url', '14;nb_click',
                                   '14;url', '15;nb_click', '15;url', '16;nb_click', '16;url', '18;nb_click', '18;url'])
        self.assertEqual(len(view[0]), len(view[1]))

        exp = {
            1: [268.0, 'www.facebook.com/login.php', 97.0, 'www.friendster.com/login.php',
                29.0, 'https://login.facebook.com/login.php', 15.0,
                'https://login.yahoo.com/', 10.0, 'https://login.facebook.com/login.php', 10.0,
                'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
                13.0, 'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
                5.0, 'https://login.verizonwireless.com/amserver/UI/Login', 6.0, 'https://login.facebook.com/login.php', 2.0,
                'https://login.comcast.net/', 0.0, 'HottieMatchUp.com/matchcomlogin', 0.0,
                'FreshPCFix.com/loginscreen', 0.0, 'FreshPCFix.com/loginscreen', 0.0,
                'login.marketingblacksmith.com', 0.0, 'sites.managerslogin.com', 0.0,
                'sites.managerslogin.com', 0.0, 'sites.managerslogin.com'],
            -1: [3.0, 'https://sso.uboc.com/obc/forms/login.fcc?user_type=R', 3.0,
                 'https://www.atitesting.com/login.aspx', 1.0, 'askabouttech.com/how-to-bypass-login-password-on-windows-vista/',
                 1.0, 'homegrownfreaks.net/forums/login.html',
                 1.0, 'https://trade.htsc.com.cn/webtrade/login/loginAction1.do?method=preLogin2&opType=TD',
                 1.0, 'moodle.dist113.org/login/in/in.php?p=addicting+games+the+impossible+quiz',
                 1.0, 'www.edweek.org/login.html?source=http://www.edweek.org/ew/articles/2009/08/25/02sat.h29.html&destina',
                 1.0, 'www.gifs.net/image/Holidays/Birthday/Cake/9451?n=login.php3',
                 1.0, 'www.grazeit.com/pages/blackplanet-com-login-1814/', 1.0, 'www.myspace-login.us/',
                 None, None, None, None, None, None, None, None, None, None, None, None, None, None],
        }

        for k, v in exp.items():
            if view[k] != v:
                if len(view[k]) != len(v):
                    raise Exception(
                        "exp[%d] and view [%d] have different lengths" % (k, k))
                for a, b in zip(v, view[k]):
                    if a != b:
                        raise Exception(
                            "k=%d, different values\nexp %s\n != %s" % (k, str(a), str(b)))

        self.assertEqual(len(view[0]), len(view[-1]))

        sql = """CROSS pos PLACE nb_click FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = list(cur)
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(len(view[0]), len(view[1]))

        sql = """CROSS pos,pos PLACE nb_click FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = list(cur)
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(len(view[0]), len(view[1]))

        db.close()
Exemple #12
0
    def test_join_bis(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")
        filename = os.path.join(
            os.path.split(__file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_join_bis")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM profile_QSSH"
        exe = db.execute_view(sql)
        assert exe[0][0] == 16

        sql, fields = db.inner_join("profile_QSSH",
                                    "url_QSSH",
                                    "url",
                                    None,
                                    execute=False,
                                    create_index=False,
                                    unique=False)
        sql = sql.strip(" \n\r\t")
        tep = TestDatabaseJoin._memo_SQL1.strip(" \n\r\t")
        if sql.replace(" ", "") != tep.replace(" ", ""):
            print(sql)
            raise Exception("sql queries should be identifical")

        assert fields == [('query', 'query'),
                          ('profile_QSSH.pos', 'profile_QSSH_pos'),
                          ('type', 'type'), ('bucket', 'bucket'),
                          ('max_nb', 'max_nb'),
                          ('sum_difftime', 'sum_difftime'),
                          ('profile_QSSH.url', 'url'),
                          ('url_QSSH.pos', 'url_QSSH_pos'), ('co', 'co'),
                          ('nb_view', 'nb_view'),
                          ('sum_nb_view', 'sum_nb_view'),
                          ('sum_difftime_view', 'sum_difftime_view'),
                          ('nb_click', 'nb_click'),
                          ('sum_nb_click', 'sum_nb_click'),
                          ('sum_difftime_click', 'sum_difftime_click')]
        view = db.execute_view(sql)
        assert len(view) == 2

        sql, fields = db.inner_join("profile_QSSH",
                                    "url_QSSH", ("url", "pos"),
                                    None,
                                    execute=False,
                                    create_index=False,
                                    where="bucket == 'bu###1'")
        sql = sql.strip(" \n\r\t")
        tep = TestDatabaseJoin._memo_SQL2.strip(" \n\r\t")
        if sql.replace(" ", "") != tep.replace(" ", ""):
            for a, b in zip(sql.split("\n"), tep.split("\n")):
                print("res", a)
                print("exp", b)
                print(a == b)
        assert sql.replace(" ", "") == tep.replace(" ", "")
        assert fields == [('query', 'query'), ('profile_QSSH.pos', 'pos'),
                          ('type', 'type'), ('bucket', 'bucket'),
                          ('max_nb', 'max_nb'),
                          ('sum_difftime', 'sum_difftime'),
                          ('profile_QSSH.url', 'url'), ('co', 'co'),
                          ('nb_view', 'nb_view'),
                          ('sum_nb_view', 'sum_nb_view'),
                          ('sum_difftime_view', 'sum_difftime_view'),
                          ('nb_click', 'nb_click'),
                          ('sum_nb_click', 'sum_nb_click'),
                          ('sum_difftime_click', 'sum_difftime_click')]
        view = db.execute_view(sql)
        assert len(view) == 1

        db.close()
    def test_attach_database(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ ==
             "__main__", LogFile="temp_hal_log2.txt")

        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_attach_database")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        file2 = os.path.join(os.path.split(__file__)[
                             0], "data", "database_linked_cor.zip")
        temp = get_temp_folder(__file__, "temp_attach_database")
        file2 = unzip(file2, temp)
        assert os.path.exists(file2)

        # method 1
        attach = {"seco": file2}

        db = Database(filename, attach=attach, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM seco.word_QSSH"
        vie = db.execute_view(sql)
        self.assertEqual(len(vie), 1)
        db.close()

        # method 2
        all = filename + " ; seco , " + file2

        db = Database(all, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM seco.word_QSSH"
        vi2 = db.execute_view(sql)
        self.assertEqual(len(vi2), 1)
        self.assertEqual(vi2, vie)

        att = db.get_attached_database_list()
        self.assertEqual(att, ['seco'])
        ts = db.get_table_list(True)
        self.assertEqual(ts, ['seco.query',
                              'seco.idx_query_query', 'seco.qtok', 'seco.idx_qtok_qtok',
                              'seco.pairs', 'seco.pairs_query___q1', 'seco.bucket', 'seco.idx_bucket_bucket',
                              'seco.url', 'seco.idx_url_url', 'seco.profile', 'seco.profile_query___',
                              'seco.profile_QRW2', 'seco.profile_QRW2_query___', 'seco.profile_QSSH',
                              'seco.profile_QSSH_query___', 'seco.query_QRW2', 'seco.query_QRW2_query___',
                              'seco.query_QSSH', 'seco.query_QSSH_query___', 'seco.url_QRW2', 'seco.url_QRW2_url___',
                              'seco.url_QSSH', 'seco.url_QSSH_url___', 'seco.word', 'seco.idx_word_word',
                              'seco.word_QRW2', 'seco.word_QRW2_word___', 'seco.word_QSSH', 'seco.word_QSSH_word___'])

        assert "db.attach_database" in db.get_python_code()[1]
        assert db.has_table("seco.word_QSSH_word___")
        files = db.get_file(True)

        file = db.get_file()
        assert file in files
        for alias, file in db.get_attached_database_list(True):
            assert alias in files
            assert file in files

        db2 = Database(files, LOG=fLOG)
        db2.connect()
        db2.close()

        db.close()
    def test_cross_1(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ ==
             "__main__", LogFile="temp_hal_log2.txt")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_cross_1")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = """CROSS pos PLACE nb_click,url FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = [line for line in cur]
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(view[0], ['1;nb_click', '1;url', '2;nb_click', '2;url', '3;nb_click', '3;url',
                                   '4;nb_click', '4;url', '5;nb_click', '5;url', '6;nb_click', '6;url', '7;nb_click',
                                   '7;url', '8;nb_click', '8;url', '9;nb_click',
                                   '9;url', '10;nb_click', '10;url', '11;nb_click', '11;url', '12;nb_click',
                                   '12;url', '13;nb_click', '13;url', '14;nb_click',
                                   '14;url', '15;nb_click', '15;url', '16;nb_click', '16;url', '18;nb_click', '18;url'])
        self.assertEqual(len(view[0]), len(view[1]))

        exp = {
            1: [268.0, 'www.facebook.com/login.php', 97.0, 'www.friendster.com/login.php',
                29.0, 'https://login.facebook.com/login.php', 15.0,
                'https://login.yahoo.com/', 10.0, 'https://login.facebook.com/login.php', 10.0,
                'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
                13.0, 'lite.facebook.com/login/?next=http%3A%2F%2Flite.facebook.com%2Ftheamloong%2Fvideo%2Fof%2FTNT-sanshou',
                5.0, 'https://login.verizonwireless.com/amserver/UI/Login', 6.0, 'https://login.facebook.com/login.php', 2.0,
                'https://login.comcast.net/', 0.0, 'HottieMatchUp.com/matchcomlogin', 0.0,
                'FreshPCFix.com/loginscreen', 0.0, 'FreshPCFix.com/loginscreen', 0.0,
                'login.marketingblacksmith.com', 0.0, 'sites.managerslogin.com', 0.0,
                'sites.managerslogin.com', 0.0, 'sites.managerslogin.com'],
            -1: [3.0, 'https://sso.uboc.com/obc/forms/login.fcc?user_type=R', 3.0,
                 'https://www.atitesting.com/login.aspx', 1.0, 'askabouttech.com/how-to-bypass-login-password-on-windows-vista/',
                 1.0, 'homegrownfreaks.net/forums/login.html',
                 1.0, 'https://trade.htsc.com.cn/webtrade/login/loginAction1.do?method=preLogin2&opType=TD',
                 1.0, 'moodle.dist113.org/login/in/in.php?p=addicting+games+the+impossible+quiz',
                 1.0, 'www.edweek.org/login.html?source=http://www.edweek.org/ew/articles/2009/08/25/02sat.h29.html&destina',
                 1.0, 'www.gifs.net/image/Holidays/Birthday/Cake/9451?n=login.php3',
                 1.0, 'www.grazeit.com/pages/blackplanet-com-login-1814/', 1.0, 'www.myspace-login.us/',
                 None, None, None, None, None, None, None, None, None, None, None, None, None, None],
        }

        for k, v in exp.items():
            if view[k] != v:
                if len(view[k]) != len(v):
                    raise Exception(
                        "exp[%d] and view [%d] have different lengths" % (k, k))
                for a, b in zip(v, view[k]):
                    if a != b:
                        raise Exception(
                            "k=%d, different values\nexp %s\n != %s" % (k, str(a), str(b)))

        self.assertEqual(len(view[0]), len(view[-1]))

        sql = """CROSS pos PLACE nb_click FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = [line for line in cur]
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(len(view[0]), len(view[1]))

        sql = """CROSS pos,pos PLACE nb_click FROM url_QSSH ORDER BY nb_click -- comment
                    DESC LIMIT 100"""
        cur = db.execute(sql)
        mat = [line for line in cur]
        view = db.execute_view(sql, add_column_name=True)
        self.assertEqual(len(view), len(mat) + 1)
        self.assertEqual(len(view[0]), len(view[1]))

        db.close()
    def test_join_bis(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        filename = os.path.join(os.path.split(
            __file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_join_bis")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        sql = "SELECT COUNT(*) FROM profile_QSSH"
        exe = db.execute_view(sql)
        assert exe[0][0] == 16

        sql, fields = db.inner_join("profile_QSSH", "url_QSSH",
                                    "url", None,
                                    execute=False,
                                    create_index=False,
                                    unique=False)
        sql = sql.strip(" \n\r\t")
        tep = TestDatabaseJoin._memo_SQL1.strip(" \n\r\t")
        if sql.replace(" ", "") != tep.replace(" ", ""):
            print(sql)
            raise Exception("sql queries should be identifical")

        assert fields == [('query', 'query'), ('profile_QSSH.pos', 'profile_QSSH_pos'), ('type', 'type'),
                          ('bucket', 'bucket'), ('max_nb',
                                                 'max_nb'), ('sum_difftime', 'sum_difftime'),
                          ('profile_QSSH.url', 'url'), ('url_QSSH.pos', 'url_QSSH_pos'),
                          ('co', 'co'), ('nb_view',
                                         'nb_view'), ('sum_nb_view', 'sum_nb_view'),
                          ('sum_difftime_view', 'sum_difftime_view'),
                          ('nb_click', 'nb_click'), ('sum_nb_click', 'sum_nb_click'), ('sum_difftime_click', 'sum_difftime_click')]
        view = db.execute_view(sql)
        assert len(view) == 2

        sql, fields = db.inner_join("profile_QSSH", "url_QSSH",
                                    ("url", "pos"), None,
                                    execute=False,
                                    create_index=False,
                                    where="bucket == 'bu###1'")
        sql = sql.strip(" \n\r\t")
        tep = TestDatabaseJoin._memo_SQL2.strip(" \n\r\t")
        if sql.replace(" ", "") != tep.replace(" ", ""):
            for a, b in zip(sql.split("\n"), tep.split("\n")):
                print("res", a)
                print("exp", b)
                print(a == b)
        assert sql.replace(" ", "") == tep.replace(" ", "")
        assert fields == [('query', 'query'), ('profile_QSSH.pos', 'pos'), ('type', 'type'),
                          ('bucket', 'bucket'), ('max_nb',
                                                 'max_nb'), ('sum_difftime', 'sum_difftime'),
                          ('profile_QSSH.url', 'url'), ('co',
                                                        'co'), ('nb_view', 'nb_view'),
                          ('sum_nb_view', 'sum_nb_view'), ('sum_difftime_view',
                                                           'sum_difftime_view'),
                          ('nb_click', 'nb_click'), ('sum_nb_click', 'sum_nb_click'),
                          ('sum_difftime_click', 'sum_difftime_click')]
        view = db.execute_view(sql)
        assert len(view) == 1

        db.close()
Exemple #16
0
    def test_unicode(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__",
             LogFile="temp_hal_log2.txt")
        filename = os.path.join(
            os.path.split(__file__)[0], "data", "database_linked.zip")
        temp = get_temp_folder(__file__, "temp_unicode")
        filename = unzip(filename, temp)
        assert os.path.exists(filename)

        db = Database(filename, LOG=fLOG)
        db.connect()

        file = os.path.join(os.path.split(__file__)[0], "data", "unicode.txt")
        assert os.path.exists(file)

        def filter_case(s):
            return s.replace(" ", "")

        db.import_table_from_flat_file(file,
                                       "uni",
                                       columns=None,
                                       header=True,
                                       filter_case=filter_case)

        sql = "select * from uni limit 2"
        view = db.execute_view(sql)
        exp = [
            [(b'.\u904a\u6232\u6a5f\u5730'.decode("utf8"),
              b'.\u904a\u6232\u57fa\u5730'.decode("utf8"), 0.66666666666700003,
              4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0, 0, 0.25, 1, 999999999, 999999999,
              0, 1, 3, 1, 0.66666666666700003),
             (b'0401\u5f71\u97f3live\u79c0'.decode("utf8"),
              b'0401\u5f71\u97f3\u8996\u8a0a'.decode("utf8"),
              0.41666666666699997, 5, 12, 0.0, 0.45454545454500001, 5,
              0.27272727272699998, 1, 0.5, 4, 0.5, 2, 999999999, 999999999, 0,
              1, 2, 0, 1.0)],
            [(b'.\xe9\x81\x8a\xe6\x88\xb2\xe6\xa9\x9f\xe5\x9c\xb0'.decode(
                "utf8"),
              b'.\xe9\x81\x8a\xe6\x88\xb2\xe5\x9f\xba\xe5\x9c\xb0'.decode(
                  "utf8"), 0.666666666666667, 4, 6, 0.0, 0.0, 0, 0.0, 1, 0.0,
              0, 0.25, 1, 999999999, 999999999, 0, 1, 3, 1, 0.666666666666667),
             (b'0401\xe5\xbd\xb1\xe9\x9f\xb3live\xe7\xa7\x80'.decode("utf8"),
              b'0401\xe5\xbd\xb1\xe9\x9f\xb3\xe8\xa6\x96\xe8\xa8\x8a'.decode(
                  "utf8"), 0.416666666666667, 5, 12, 0.0, 0.454545454545455, 5,
              0.272727272727273, 1, 0.5, 4, 0.5, 2, 999999999, 999999999, 0, 1,
              2, 0, 1.0)],
        ]
        if view not in exp:
            print("2", str(view).encode("utf8"))
            i = 0
            for a, b in zip(exp[1], view):
                if a != b:
                    print("i", i)
                    if isinstance(a, tuple):
                        for c, d in zip(a, b):
                            print("1", c)
                            print("2", d)
                    else:
                        print("1", a)
                        print("2", b)
                i += 1
            raise Exception("problem")

        db.close()