Ejemplo n.º 1
0
    def testSillyCleaningOpalyticsOne(self):
        tdf = TicDatFactory(**sillyMeSchema())
        tdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])
        ticDat = tdf.TicDat(**sillyMeData())

        input_set = create_inputset_mock(tdf, ticDat)

        pdf = PanDatFactory(**sillyMeSchema())
        pdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.c.pop()
        ticDat.c.pop(0)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Ejemplo n.º 2
0
    def testDietCleaningOpalytisThree(self):
        tdf = TicDatFactory(**dietSchema())
        tdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(tdf)
        ticDat = tdf.copy_tic_dat(dietData())

        pdf = PanDatFactory(**tdf.schema())
        pdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(pdf)

        input_set = create_inputset_mock(tdf, ticDat)

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.categories.pop("fat")
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Ejemplo n.º 3
0
    def testDietCleaningOpalyticsTwo(self):
        tdf = TicDatFactory(**dietSchema())
        addDietForeignKeys(tdf)
        tdf.set_data_type("categories",
                          "maxNutrition",
                          min=66,
                          inclusive_max=True)
        ticDat = tdf.copy_tic_dat(dietData())

        input_set = create_inputset_mock(tdf, ticDat)
        pdf = PanDatFactory(**dietSchema())
        addDietForeignKeys(pdf)
        pdf.set_data_type("categories",
                          "maxNutrition",
                          min=66,
                          inclusive_max=True)

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.categories.pop("fat")
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Ejemplo n.º 4
0
    def testDietCleaningOpalytics(self):
        sch = dietSchema()
        sch["categories"][-1].append("_active")
        tdf1 = TicDatFactory(**dietSchema())
        tdf2 = TicDatFactory(**sch)

        ticDat2 = tdf2.copy_tic_dat(dietData())
        for v in ticDat2.categories.values():
            v["_active"] = True
        ticDat2.categories["fat"]["_active"] = False
        ticDat1 = tdf1.copy_tic_dat(dietData())

        input_set = create_inputset_mock_with_active_hack(tdf2, ticDat2)
        pdf1 = PanDatFactory(**tdf1.schema())
        panDat = pdf1.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDat), ticDat1))

        panDatPurged = pdf1.opalytics.create_pan_dat(input_set)
        self.assertFalse(
            tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))

        ticDat1.categories.pop("fat")
        tdf1.remove_foreign_key_failures(ticDat1)
        self.assertTrue(
            tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))
Ejemplo n.º 5
0
    def testDietOpalytics(self):
        if not self.can_run:
            return
        for hack, raw_data, activeEnabled in list(
                itertools.product(*(([True, False], ) * 3))):
            tdf = TicDatFactory(**dietSchema())
            ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
            inputset = create_inputset_mock(tdf, ticDat, hack, activeEnabled)

            pdf = PanDatFactory(**dietSchema())
            panDat = pdf.opalytics.create_pan_dat(inputset)
            self.assertFalse(pdf.find_duplicates(panDat))
            ticDat2 = pdf.copy_to_tic_dat(panDat)
            self.assertTrue(tdf._same_data(ticDat, ticDat2))

            tdf2 = TicDatFactory(
                **{
                    k: [pks, list(dfs) + ["dmy"]]
                    for k, (pks, dfs) in tdf.schema().items()
                })
            _dat = tdf2.copy_tic_dat(ticDat)
            panDat = pdf.opalytics.create_pan_dat(
                create_inputset_mock(tdf2, _dat, hack))

            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))

            pdf2 = PanDatFactory(**tdf2.schema())
            ex = self.firesException(lambda: pdf2.opalytics.create_pan_dat(
                inputset, raw_data=raw_data))
            self.assertTrue(
                all(_ in ex for _ in ["(table, field) pairs missing"] +
                    ["'%s', 'dmy'" % _ for _ in pdf2.all_tables]))
Ejemplo n.º 6
0
 def testDupsOpalytics(self):
     if not self.can_run:
         return
     for hack in [True, False]:
         tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                             two=[["a", "b"], ["c"]],
                             three=[["a", "b", "c"], []])
         tdf2 = TicDatFactory(
             **{t: [[], ["a", "b", "c"]]
                for t in tdf.all_tables})
         td = tdf2.TicDat(
             **{
                 t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2],
                     ["new", 1, 2]]
                 for t in tdf.all_tables
             })
         inputset = create_inputset_mock(tdf2, td, hack)
         pdf = PanDatFactory(**tdf.schema())
         panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=True)
         self.assertTrue(
             all(len(getattr(panDat, t)) == 6 for t in tdf.all_tables))
         panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=False)
         self.assertTrue(
             all(len(getattr(panDat, t)) < 6 for t in tdf.all_tables))
         td_1 = tdf.TicDat(
             **{
                 t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2],
                     ["new", 1, 2]]
                 for t in tdf.all_tables
             })
         td_2 = pdf.copy_to_tic_dat(panDat)
         self.assertTrue(
             all(
                 set(getattr(td_1, t)) == set(getattr(td_2, t))
                 for t in tdf.all_tables))
Ejemplo n.º 7
0
    def test_datetime(self):
        core_path = os.path.join(_scratchDir, "parameters")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        pdf.add_parameter("p2", None, datetime=True, nullable=True)
        pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                               [
                                   datetime.datetime.now(),
                                   dateutil.parser.parse("Sept 11 2011")
                               ]],
            parameters=[["p1", "7/11/1911"], ["p2", None]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(
            pdf.find_data_type_failures(dat)
            or pdf.find_data_row_failures(dat))

        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            func = "write_directory" if attr == "csv" else "write_file"
            getattr(getattr(pdf, attr), func)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            self.assertFalse(pdf._same_data(dat, dat_1))
            self.assertFalse(
                pdf.find_data_type_failures(dat_1)
                or pdf.find_data_row_failures(dat_1))
            dat_1 = pdf.copy_to_tic_dat(dat_1)
            self.assertTrue(set(dat_1.parameters) == {'p1', 'p2'})
            self.assertTrue(
                isinstance(dat_1.parameters["p1"]["b"],
                           (datetime.datetime, numpy.datetime64))
                and not pd.isnull(dat_1.parameters["p1"]["b"]))
            self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime,
                                   numpy.datetime64)) and not pd.isnull(_)
                    for _ in dat_1.table_with_stuffs))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime, numpy.datetime64))
                    or _ is None or utils.safe_apply(math.isnan)(_)
                    for v in dat_1.table_with_stuffs.values()
                    for _ in v.values()))
            self.assertTrue({
                pd.isnull(_)
                for v in dat_1.table_with_stuffs.values() for _ in v.values()
            } == {True, False})
Ejemplo n.º 8
0
    def testNetflowOpalytics(self):
        if not self.can_run:
            return
        for hack, raw_data in list(itertools.product(*(([True, False], ) *
                                                       2))):
            tdf = TicDatFactory(**netflowSchema())
            ticDat = tdf.copy_tic_dat(netflowData())
            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))

            ticDat.nodes[12] = {}
            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))
Ejemplo n.º 9
0
    def testSillyTwoTablesOpalytics(self):
        if not self.can_run:
            return
        for hack, raw_data in list(itertools.product(*(([True, False], ) *
                                                       2))):
            tdf = TicDatFactory(**sillyMeSchema())
            ticDat = tdf.TicDat(**sillyMeData())

            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))

            ticDat = tdf.TicDat(**sillyMeDataTwoTables())
            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))
Ejemplo n.º 10
0
    def testSillyCleaningOpalyticsTwo(self):
        tdf = TicDatFactory(**sillyMeSchema())
        tdf.add_data_row_predicate("c", lambda row: row["cData4"] == 'd')
        ticDat = tdf.TicDat(**sillyMeData())

        input_set = create_inputset_mock(tdf, ticDat)

        pdf = PanDatFactory(**sillyMeSchema())
        pdf.add_data_row_predicate("c", lambda row: row["cData4"] == 'd')

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.c.pop()
        ticDat.c.pop(0)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Ejemplo n.º 11
0
    def testMissingOpalyticsTable(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
        inputset = create_inputset_mock(tdf, ticDat)

        pdf = PanDatFactory(
            **(dict(dietSchema(), missing_table=[["a"], ["b"]])))
        panDat = pdf.opalytics.create_pan_dat(inputset)
        ticDat2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(tdf._same_data(ticDat, ticDat2))
        self.assertFalse(ticDat2.missing_table)
Ejemplo n.º 12
0
    def testRoundTrips(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        tdf.enable_foreign_key_links()
        oldDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        pdf = PanDatFactory.create_from_full_schema(
            tdf.schema(include_ancillary_info=True))
        pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
        self.assertTrue(pdf.good_pan_dat_object(pan_dat))
        tic_dat = pdf.copy_to_tic_dat(pan_dat)
        self.assertTrue(tdf._same_data(oldDat, tic_dat))

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        oldDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        pdf = PanDatFactory.create_from_full_schema(
            tdf.schema(include_ancillary_info=True))
        pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
        self.assertTrue(pdf.good_pan_dat_object(pan_dat))
        tic_dat = pdf.copy_to_tic_dat(pan_dat)
        self.assertTrue(tdf._same_data(oldDat, tic_dat))

        pdf = PanDatFactory(table=[["a", "b"], ["c"]])
        pan_dat = pdf.PanDat(table=utils.DataFrame({
            "a": [1, 2, 1, 1],
            "b": [10, 10, 10, 11],
            "c": [101, 102, 103, 104]
        }))
        self.assertTrue(
            len(pdf.find_duplicates(pan_dat, keep=False)["table"]) == 2)
        tic_dat = pdf.copy_to_tic_dat(pan_dat)
        self.assertTrue(len(tic_dat.table) == len(pan_dat.table) - 1)

        tdf = TicDatFactory(**pdf.schema())
        tic_dat = tdf.TicDat(table=[[1, 2, 3], [None, 2, 3], [2, 1, None]])
        self.assertTrue(len(tic_dat.table) == 3)
        tic_dat_two = pdf.copy_to_tic_dat(
            tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
        self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
        tic_dat3 = tdf.TicDat(
            table=[[1, 2, 3], [float("nan"), 2, 3], [2, 1, float("nan")]])
        # this fails because _same_data isn't smart enough to check against nan in the keys,
        # because float("nan") != float("nan")
        self.assertFalse(tdf._same_data(tic_dat3, tic_dat_two))

        pdf = PanDatFactory(table=[["a"], ["b", "c"]])
        tdf = TicDatFactory(**pdf.schema())
        tic_dat = tdf.TicDat(table=[[1, 2, 3], [2, None, 3], [2, 1, None]])
        tic_dat_two = pdf.copy_to_tic_dat(
            tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
        self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
        tic_dat3 = tdf.TicDat(
            table=[[1, 2, 3], [2, float("nan"), 3], [2, 1, float("nan")]])
        # _same_data works fine in checking nan equivalence in data rows - which maybe
        self.assertTrue(
            tdf._same_data(tic_dat3,
                           tic_dat_two,
                           nans_are_same_for_data_rows=True))