def testDietCleaningOpalytics(self):
        """Check PanDat reads of an input set carrying an ``_active`` flag.

        A copy of the diet schema gets ``"_active"`` appended to the last
        element of its "categories" definition. Every category row is flagged
        ``True`` except "fat". Reading the mocked input set with
        ``raw_data=True`` must reproduce the unflagged diet data; the default
        read must differ from it until "fat" is popped from the reference data
        and ``remove_foreign_key_failures`` has been applied to it.
        """
        flagged_schema = dietSchema()
        flagged_schema["categories"][-1].append("_active")
        plain_tdf = TicDatFactory(**dietSchema())
        flagged_tdf = TicDatFactory(**flagged_schema)

        flagged_dat = flagged_tdf.copy_tic_dat(dietData())
        for row in flagged_dat.categories.values():
            row["_active"] = True
        flagged_dat.categories["fat"]["_active"] = False
        reference_dat = plain_tdf.copy_tic_dat(dietData())

        input_set = create_inputset_mock_with_active_hack(flagged_tdf,
                                                          flagged_dat)
        pdf = PanDatFactory(**plain_tdf.schema())

        def matches_reference(pan_dat):
            # Compare in TicDat form, always against the current reference.
            return plain_tdf._same_data(pdf.copy_to_tic_dat(pan_dat),
                                        reference_dat)

        raw_pan_dat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(matches_reference(raw_pan_dat))

        purged_pan_dat = pdf.opalytics.create_pan_dat(input_set)
        self.assertFalse(matches_reference(purged_pan_dat))

        reference_dat.categories.pop("fat")
        plain_tdf.remove_foreign_key_failures(reference_dat)
        self.assertTrue(matches_reference(purged_pan_dat))
    def testDiet(self):
        """Round-trip the diet data through mocked Opalytics input sets.

        Runs for every combination of the ``hack``, ``raw_data`` and
        ``activeEnabled`` flags. Checks that no duplicates are reported, that
        the data read back equals the source, that an unfrozen result can be
        edited while a ``freeze_it=True`` result cannot, that an extra "dmy"
        field in the input set is ignored by the narrower factory, and that a
        factory requiring "dmy" reports the missing field.
        """
        if not self.can_run:
            return
        flags = [True, False]
        for hack, raw_data, activeEnabled in product(flags, flags, flags):
            tdf = TicDatFactory(**dietSchema())
            frozen_dat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
            inputset = create_inputset_mock(tdf, frozen_dat, hack,
                                            activeEnabled)
            duplicates = tdf.opalytics.find_duplicates(inputset,
                                                       raw_data=raw_data)
            self.assertFalse(duplicates)
            loaded_dat = tdf.opalytics.create_tic_dat(inputset,
                                                      raw_data=raw_data)
            self.assertTrue(tdf._same_data(frozen_dat, loaded_dat))

            def edit_loaded():
                # ``loaded_dat`` is resolved at call time, so this always
                # targets the most recently created object.
                loaded_dat.categories["calories"]["minNutrition"] = 12

            # The unfrozen result accepts the edit and then stops matching.
            self.assertFalse(firesException(edit_loaded))
            self.assertFalse(tdf._same_data(frozen_dat, loaded_dat))

            # The frozen result rejects the edit and keeps matching.
            loaded_dat = tdf.opalytics.create_tic_dat(inputset,
                                                      freeze_it=True,
                                                      raw_data=raw_data)
            self.assertTrue(tdf._same_data(frozen_dat, loaded_dat))
            self.assertTrue(firesException(edit_loaded))
            self.assertTrue(tdf._same_data(frozen_dat, loaded_dat))

            # Same tables, each with an additional "dmy" data field.
            wide_schema = {}
            for table, (pks, dfs) in tdf.schema().items():
                wide_schema[table] = [pks, list(dfs) + ["dmy"]]
            wide_tdf = TicDatFactory(**wide_schema)
            wide_dat = wide_tdf.copy_tic_dat(frozen_dat)
            self.assertTrue(
                tdf._same_data(
                    frozen_dat,
                    tdf.opalytics.create_tic_dat(
                        create_inputset_mock(wide_tdf, wide_dat, hack),
                        raw_data=raw_data)))

            ex = self.firesException(
                lambda: wide_tdf.opalytics.create_tic_dat(inputset,
                                                          raw_data=raw_data))
            self.assertTrue("field dmy can't be found" in ex)
    def testDietOpalytics(self):
        """Round-trip the diet data through ``PanDatFactory.opalytics``.

        For every combination of ``hack``, ``raw_data`` and ``activeEnabled``:
        the PanDat built from the mocked input set has no duplicates and
        converts back to the source TicDat; an input set with an extra "dmy"
        field still yields the source data; and a PanDatFactory that requires
        "dmy" raises an error naming every missing (table, field) pair.
        """
        if not self.can_run:
            return
        for hack, raw_data, activeEnabled in itertools.product([True, False],
                                                               repeat=3):
            tdf = TicDatFactory(**dietSchema())
            frozen_dat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
            inputset = create_inputset_mock(tdf, frozen_dat, hack,
                                            activeEnabled)

            pdf = PanDatFactory(**dietSchema())
            pan_dat = pdf.opalytics.create_pan_dat(inputset)
            self.assertFalse(pdf.find_duplicates(pan_dat))
            round_tripped = pdf.copy_to_tic_dat(pan_dat)
            self.assertTrue(tdf._same_data(frozen_dat, round_tripped))

            # Same tables, each with an additional "dmy" data field.
            wide_schema = {}
            for table, (pks, dfs) in tdf.schema().items():
                wide_schema[table] = [pks, list(dfs) + ["dmy"]]
            wide_tdf = TicDatFactory(**wide_schema)
            wide_dat = wide_tdf.copy_tic_dat(frozen_dat)
            pan_dat = pdf.opalytics.create_pan_dat(
                create_inputset_mock(wide_tdf, wide_dat, hack))

            self.assertTrue(
                tdf._same_data(frozen_dat, pdf.copy_to_tic_dat(pan_dat)))

            wide_pdf = PanDatFactory(**wide_tdf.schema())
            ex = self.firesException(
                lambda: wide_pdf.opalytics.create_pan_dat(inputset,
                                                          raw_data=raw_data))
            expected_fragments = ["(table, field) pairs missing"]
            expected_fragments += [
                "'%s', 'dmy'" % table for table in wide_pdf.all_tables
            ]
            self.assertTrue(
                all(fragment in ex for fragment in expected_fragments))
Example #4
0
    def testMissingTable(self):
        """Read xlsx files that lack some of the tables in the full schema.

        A file is written by a factory covering only part of the diet schema
        and read back with the full factory. The result must match the written
        data (as judged by the partial factory), expose an attribute for every
        table of the full schema, and leave the absent tables empty.
        """
        if not self.can_run:
            return
        full_tdf = TicDatFactory(**dietSchema())

        def write_partial_and_reload(keep_table):
            # Build a factory from the tables selected by ``keep_table``,
            # write its data to a clean path and read it with the full schema.
            partial_tdf = TicDatFactory(**{
                name: defn
                for name, defn in dietSchema().items() if keep_table(name)
            })
            written = partial_tdf.copy_tic_dat(dietData())
            xlsx_path = makeCleanPath(
                os.path.join(_scratchDir, "diet_missing.xlsx"))
            partial_tdf.xls.write_file(written, xlsx_path)
            reloaded = full_tdf.xls.create_tic_dat(xlsx_path)
            self.assertTrue(partial_tdf._same_data(written, reloaded))
            self.assertTrue(
                all(hasattr(reloaded, x) for x in full_tdf.all_tables))
            return reloaded

        reloaded = write_partial_and_reload(
            lambda name: name != "nutritionQuantities")
        self.assertFalse(reloaded.nutritionQuantities)
        self.assertTrue(reloaded.categories and reloaded.foods)

        reloaded = write_partial_and_reload(lambda name: name == "categories")
        self.assertFalse(reloaded.nutritionQuantities or reloaded.foods)
        self.assertTrue(reloaded.categories)
Example #5
0
    def _test_generic_free_copy(self, ticDat, tdf, skip_tables=None):
        """Check ``utils.create_generic_free`` against a generic-table copy.

        Every table not listed in ``skip_tables`` is declared generic ('*')
        and populated from the pandas copy of ``ticDat``. The generic-free
        result is rebuilt into a regular TicDat and must equal a plain copy of
        ``ticDat``. Primary key fields called "name" are renamed "name_" in
        the schema used for the comparison.

        :param ticDat: data object to copy
        :param tdf: factory describing ``ticDat``; every table must have
                    primary key fields
        :param skip_tables: optional table names kept non-generic
        """
        assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
        kept_tables = skip_tables or []

        def rename_field(field):
            return "name_" if field == "name" else field

        clean_schema = {}
        for table, (pks, dfs) in tdf.schema().items():
            clean_schema[table] = [list(map(rename_field, pks)), dfs]
        clean_tdf = TicDatFactory(**clean_schema)

        temp_schema = {}
        for table, defn in clean_tdf.schema().items():
            temp_schema[table] = defn if table in kept_tables else '*'
        temp_tdf = TicDatFactory(**temp_schema)

        temp_dat = temp_tdf.TicDat(
            **{table: getattr(ticDat, table) for table in kept_tables})
        for table in temp_tdf.generic_tables:
            pandas_copy = clean_tdf.copy_to_pandas(ticDat,
                                                   drop_pk_columns=False)
            setattr(temp_dat, table, getattr(pandas_copy, table))

        generic_free_dat, _ = utils.create_generic_free(temp_dat, temp_tdf)
        check_dat = clean_tdf.TicDat()
        for table in temp_tdf.generic_tables:
            for row in getattr(generic_free_dat, table):
                pk_fields = clean_tdf.primary_key_fields[table]
                # Single-field keys are scalars, multi-field keys are tuples.
                if len(pk_fields) == 1:
                    key = row[pk_fields[0]]
                else:
                    key = tuple(row[f] for f in pk_fields)
                getattr(check_dat, table)[key] = {
                    df: row[df]
                    for df in clean_tdf.data_fields.get(table, [])
                }
        for table in kept_tables:
            for key, value in getattr(generic_free_dat, table).items():
                getattr(check_dat, table)[key] = value

        expected = clean_tdf.copy_tic_dat(ticDat)
        self.assertTrue(clean_tdf._same_data(check_dat, expected))
Example #6
0
 def testDietWithInfFlagging(self):
     """Check the infinity I/O flag across the sql, csv, json and xls writers.

     The diet data is written with the flag set to 999999999. Reading it back
     with the flagged factory, or with a clone of it, must reproduce the data.
     A fresh factory built from the same schema must instead see 999999999 as
     the protein ``maxNutrition`` and match only after that cell is set to
     infinity.
     """
     diet_pdf = PanDatFactory(**dietSchema())
     addDietDataTypes(diet_pdf)
     tdf = TicDatFactory(**dietSchema())
     dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()),
                              drop_pk_columns=False)
     diet_pdf.set_infinity_io_flag(999999999)

     core_path = os.path.join(_scratchDir, "diet_with_inf_flagging")
     db_path = core_path + ".db"
     csv_dir = core_path + "_csv"
     json_path = core_path + ".json"
     xlsx_path = core_path + ".xlsx"
     diet_pdf.sql.write_file(dat, db_path)
     diet_pdf.csv.write_directory(dat, csv_dir)
     diet_pdf.json.write_file(dat, json_path)
     diet_pdf.xls.write_file(dat, xlsx_path)

     targets = (("sql", db_path), ("csv", csv_dir), ("json", json_path),
                ("xls", xlsx_path))
     for attr, target in targets:
         reloaded = getattr(diet_pdf, attr).create_pan_dat(target)
         self.assertTrue(diet_pdf._same_data(dat, reloaded, epsilon=1e-5))

         pdf = diet_pdf.clone()
         reloaded = getattr(pdf, attr).create_pan_dat(target)
         self.assertTrue(pdf._same_data(dat, reloaded, epsilon=1e-5))

         # A factory rebuilt from the schema alone reads the raw flag value.
         pdf = PanDatFactory(**diet_pdf.schema())
         reloaded = getattr(pdf, attr).create_pan_dat(target)
         self.assertFalse(pdf._same_data(dat, reloaded, epsilon=1e-5))
         is_protein = reloaded.categories["name"] == "protein"
         protein_max = list(
             reloaded.categories[is_protein]["maxNutrition"])[0]
         self.assertTrue(protein_max == 999999999)
         reloaded.categories.loc[is_protein, "maxNutrition"] = float("inf")
         self.assertTrue(pdf._same_data(dat, reloaded, epsilon=1e-5))
Example #7
0
    def testTryCreateSpace(self):
        """Exercise the tlingo space/case mapping helpers.

        For the diet, netflow and sillyMe data sets, applying the inverted
        mapping followed by the mapping itself must restore the data, while the
        intermediate remapped data must differ. Then keys that differ only in
        case or in space versus underscore are added to the diet data and the
        reported ``"failures"`` are compared with the expected collisions.
        """
        def check_round_trip(schema_factory, data_factory):
            tdf = TicDatFactory(**schema_factory())
            dat = tdf.copy_tic_dat(data_factory())
            result = tlingo._try_create_space_case_mapping(tdf, dat)
            mapping = result["mapping"]
            inverse = {value: key for key, value in mapping.items()}
            remapped = tlingo._apply_space_case_mapping(tdf, dat, inverse)
            restored = tlingo._apply_space_case_mapping(
                tdf, remapped, mapping)
            self.assertTrue(tdf._same_data(dat, restored))
            self.assertFalse(tdf._same_data(dat, remapped))

        check_round_trip(dietSchema, dietData)
        check_round_trip(netflowSchema, netflowData)
        check_round_trip(
            sillyMeSchema,
            lambda: TicDatFactory(**sillyMeSchema()).TicDat(**sillyMeData()))

        tdf = TicDatFactory(**dietSchema())
        dat = tdf.copy_tic_dat(dietData())
        # Introduce keys that collide once case and space/underscore
        # differences are ignored.
        dat.foods["ice_cream"] = dat.foods["ice cream"]
        dat.categories["ICE CREAM"] = {}
        dat.categories["fAt"] = dat.categories["fat"]
        result = tlingo._try_create_space_case_mapping(tdf, dat)
        failures = result["failures"]
        expected_failures = {
            'ICE_CREAM': ('ICE CREAM', 'ice cream', 'ice_cream'),
            'FAT': ('fAt', 'fat')
        }
        self.assertTrue(failures == expected_failures)
    def testDietCleaningOpalytisThree(self):
        """Check PanDat cleaning driven by a data row predicate.

        Both factories require ``maxNutrition >= 66`` on "categories" and have
        the diet foreign keys. A ``raw_data=True`` read must equal the source
        data; a ``raw_data=False`` read must differ from it, still differ after
        "fat" is popped from the source, and match once foreign key failures
        are removed from the source.
        """
        def max_nutrition_at_least_66(row):
            return row["maxNutrition"] >= 66

        tdf = TicDatFactory(**dietSchema())
        tdf.add_data_row_predicate("categories", max_nutrition_at_least_66)
        addDietForeignKeys(tdf)
        source_dat = tdf.copy_tic_dat(dietData())

        pdf = PanDatFactory(**tdf.schema())
        pdf.add_data_row_predicate("categories", max_nutrition_at_least_66)
        addDietForeignKeys(pdf)

        input_set = create_inputset_mock(tdf, source_dat)

        def matches_source(pan_dat):
            return tdf._same_data(pdf.copy_to_tic_dat(pan_dat), source_dat)

        raw_pan_dat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(matches_source(raw_pan_dat))

        purged_pan_dat = pdf.opalytics.create_pan_dat(input_set,
                                                      raw_data=False)
        self.assertFalse(matches_source(purged_pan_dat))

        source_dat.categories.pop("fat")
        self.assertFalse(matches_source(purged_pan_dat))
        tdf.remove_foreign_key_failures(source_dat)
        self.assertTrue(matches_source(purged_pan_dat))
    def testDietCleaningOpalyticsTwo(self):
        """Check PanDat cleaning driven by a data type restriction.

        Both factories restrict "categories"/"maxNutrition" with ``min=66,
        inclusive_max=True`` and have the diet foreign keys. A
        ``raw_data=True`` read must equal the source data; a ``raw_data=False``
        read must differ from it, still differ after "fat" is popped from the
        source, and match once foreign key failures are removed from the
        source.
        """
        type_limits = {"min": 66, "inclusive_max": True}

        tdf = TicDatFactory(**dietSchema())
        addDietForeignKeys(tdf)
        tdf.set_data_type("categories", "maxNutrition", **type_limits)
        source_dat = tdf.copy_tic_dat(dietData())

        input_set = create_inputset_mock(tdf, source_dat)
        pdf = PanDatFactory(**dietSchema())
        addDietForeignKeys(pdf)
        pdf.set_data_type("categories", "maxNutrition", **type_limits)

        def matches_source(pan_dat):
            return tdf._same_data(pdf.copy_to_tic_dat(pan_dat), source_dat)

        raw_pan_dat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(matches_source(raw_pan_dat))

        purged_pan_dat = pdf.opalytics.create_pan_dat(input_set,
                                                      raw_data=False)
        self.assertFalse(matches_source(purged_pan_dat))

        source_dat.categories.pop("fat")
        self.assertFalse(matches_source(purged_pan_dat))
        tdf.remove_foreign_key_failures(source_dat)
        self.assertTrue(matches_source(purged_pan_dat))
Example #10
0
    def _test_generic_copy(self, ticDat, tdf, skip_tables=None):
        """Round-trip generic tables through a .db file and two .sql files.

        Every table not listed in ``skip_tables`` is declared generic ('*')
        and populated from the pandas copy of ``ticDat``. The data is written
        as "f.db", "f1.sql" (no schema) and "f2.sql" (with schema); each file
        is read back, made generic-free, rebuilt into a regular TicDat and
        compared with a plain copy of ``ticDat``. Primary key fields called
        "name" are renamed "name_" in the schema used for the comparison.

        :param ticDat: data object to copy
        :param tdf: factory describing ``ticDat``; every table must have
                    primary key fields
        :param skip_tables: optional table names kept non-generic
        """
        assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
        out_dir = makeCleanDir(os.path.join(_scratchDir, "generic_copy"))
        kept_tables = skip_tables or []

        def rename_field(field):
            return "name_" if field == "name" else field

        clean_schema = {}
        for table, (pks, dfs) in tdf.schema().items():
            clean_schema[table] = [list(map(rename_field, pks)), dfs]
        clean_tdf = TicDatFactory(**clean_schema)

        temp_schema = {}
        for table, defn in clean_tdf.schema().items():
            temp_schema[table] = defn if table in kept_tables else '*'
        temp_tdf = TicDatFactory(**temp_schema)

        temp_dat = temp_tdf.TicDat(
            **{table: getattr(ticDat, table) for table in kept_tables})
        for table in temp_tdf.generic_tables:
            pandas_copy = clean_tdf.copy_to_pandas(ticDat,
                                                   drop_pk_columns=False)
            setattr(temp_dat, table, getattr(pandas_copy, table))

        temp_tdf.sql.write_db_data(temp_dat, os.path.join(out_dir, "f.db"))
        temp_tdf.sql.write_sql_file(temp_dat,
                                    os.path.join(out_dir, "f1.sql"),
                                    include_schema=False)
        temp_tdf.sql.write_sql_file(temp_dat,
                                    os.path.join(out_dir, "f2.sql"),
                                    include_schema=True)

        written_files = [("f.db", False), ("f1.sql", False), ("f2.sql", True)]
        for file_name, includes_schema in written_files:
            file_path = os.path.join(out_dir, file_name)
            if not file_path.endswith(".db"):
                read_dat = temp_tdf.sql.create_tic_dat_from_sql(
                    file_path, includes_schema)
            else:
                self.assertFalse(temp_tdf.sql.find_duplicates(file_path))
                read_dat = temp_tdf.sql.create_tic_dat(file_path)

            generic_free_dat, _ = utils.create_generic_free(read_dat,
                                                            temp_tdf)
            check_dat = clean_tdf.TicDat()
            for table in temp_tdf.generic_tables:
                for row in getattr(generic_free_dat, table):
                    pk_fields = clean_tdf.primary_key_fields[table]
                    # Single-field keys are scalars, multi-field keys tuples.
                    if len(pk_fields) == 1:
                        key = row[pk_fields[0]]
                    else:
                        key = tuple(row[f] for f in pk_fields)
                    getattr(check_dat, table)[key] = {
                        df: row[df]
                        for df in clean_tdf.data_fields.get(table, [])
                    }
            for table in kept_tables:
                for key, value in getattr(generic_free_dat, table).items():
                    getattr(check_dat, table)[key] = value

            expected = clean_tdf.copy_tic_dat(ticDat)
            self.assertTrue(clean_tdf._same_data(check_dat, expected))
Example #11
0
 def testDietWithInfFlagging(self):
     """Check the infinity I/O flag for .xls and .xlsx files.

     The diet data is written with the flag set to 999999999. The flagged
     factory and its clone must read the data back unchanged. A fresh factory
     must instead see 999999999 as the protein ``maxNutrition`` in both files
     and match only after that value is set to infinity.
     """
     flagged_tdf = TicDatFactory(**dietSchema())
     dat = flagged_tdf.copy_tic_dat(dietData())
     flagged_tdf.set_infinity_io_flag(999999999)
     xls_path = os.path.join(_scratchDir, "dietInfFlag.xls")
     xlsx_path = os.path.join(_scratchDir, "dietInfFlag.xlsx")
     flagged_tdf.xls.write_file(dat, xls_path)
     flagged_tdf.xls.write_file(dat, xlsx_path)
     from_xls = flagged_tdf.xls.create_tic_dat(xls_path)
     from_xlsx = flagged_tdf.xls.create_tic_dat(xlsx_path)
     self.assertTrue(flagged_tdf._same_data(dat, from_xls))
     self.assertTrue(flagged_tdf._same_data(dat, from_xlsx))

     cloned_tdf = flagged_tdf.clone()
     from_xls = cloned_tdf.xls.create_tic_dat(xls_path)
     self.assertTrue(cloned_tdf._same_data(dat, from_xls))

     # A fresh factory has no flag configured and reads the raw number.
     plain_tdf = TicDatFactory(**dietSchema())
     from_xls = plain_tdf.xls.create_tic_dat(xls_path)
     from_xlsx = plain_tdf.xls.create_tic_dat(xlsx_path)
     self.assertFalse(plain_tdf._same_data(dat, from_xls))
     self.assertFalse(plain_tdf._same_data(dat, from_xlsx))
     protein_maxima = {
         loaded.categories["protein"]["maxNutrition"]
         for loaded in [from_xls, from_xlsx]
     }
     self.assertTrue(protein_maxima == {999999999})
     for loaded in [from_xls, from_xlsx]:
         loaded.categories["protein"]["maxNutrition"] = float("inf")
     self.assertTrue(plain_tdf._same_data(dat, from_xls))
     self.assertTrue(plain_tdf._same_data(dat, from_xlsx))
Example #12
0
    def testDiet(self):
        """Round-trip the diet data through SQLite .db and .sql files.

        The nested checks run twice: once with a plain diet factory and once
        with a factory that has default values and the diet foreign keys.
        Between the two runs the test checks the foreign-key-driven table
        ordering and the detection and removal of foreign key failures.
        """
        def run_sql_checks(factory):
            source_tables = {
                t: getattr(dietData(), t)
                for t in factory.primary_key_fields
            }
            frozen_dat = factory.freeze_me(factory.TicDat(**source_tables))
            db_path = makeCleanPath(os.path.join(_scratchDir, "diet.db"))
            factory.sql.write_db_data(frozen_dat, db_path)
            loaded = factory.sql.create_tic_dat(db_path)
            self.assertTrue(factory._same_data(frozen_dat, loaded))

            def mutate_loaded():
                # ``loaded`` is resolved at call time, so this always targets
                # the most recently read object.
                loaded.categories["calories"]["minNutrition"] = 12

            mutate_loaded()
            self.assertFalse(factory._same_data(frozen_dat, loaded))

            # Writing to the existing file is expected to raise unless
            # allow_overwrite is passed.
            self.assertTrue(
                self.firesException(
                    lambda: factory.sql.write_db_data(frozen_dat, db_path)))
            factory.sql.write_db_data(frozen_dat, db_path,
                                      allow_overwrite=True)
            loaded = factory.sql.create_tic_dat(db_path, freeze_it=True)
            self.assertTrue(factory._same_data(frozen_dat, loaded))
            self.assertTrue(self.firesException(mutate_loaded))
            self.assertTrue(factory._same_data(frozen_dat, loaded))

            sql_path = makeCleanPath(os.path.join(_scratchDir, "diet.sql"))
            factory.sql.write_sql_file(frozen_dat, sql_path)
            loaded = factory.sql.create_tic_dat_from_sql(sql_path)
            self.assertTrue(factory._same_data(frozen_dat, loaded))
            mutate_loaded()
            self.assertFalse(factory._same_data(frozen_dat, loaded))

            factory.sql.write_sql_file(frozen_dat, sql_path,
                                       include_schema=True)
            loaded = factory.sql.create_tic_dat_from_sql(
                sql_path, includes_schema=True, freeze_it=True)
            self.assertTrue(factory._same_data(frozen_dat, loaded))
            self.assertTrue(self.firesException(mutate_loaded))
            self.assertTrue(factory._same_data(frozen_dat, loaded))

        run_sql_checks(TicDatFactory(**dietSchema()))

        tdf = TicDatFactory(**dietSchema())
        self.assertFalse(tdf.foreign_keys)
        tdf.set_default_values(
            categories={'maxNutrition': float("inf"), 'minNutrition': 0.0},
            foods={'cost': 0.0},
            nutritionQuantities={'qty': 0.0})
        addDietForeignKeys(tdf)
        # Both parent tables must be ordered ahead of the child table.
        table_order = tdf.sql._ordered_tables()
        self.assertTrue(
            table_order.index("categories") <
            table_order.index("nutritionQuantities"))
        self.assertTrue(
            table_order.index("foods") <
            table_order.index("nutritionQuantities"))

        ticDat = tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields})
        pristine_dat = tdf.copy_tic_dat(ticDat)
        self.assertTrue(tdf._same_data(ticDat, pristine_dat))
        self.assertFalse(tdf.find_foreign_key_failures(ticDat))
        # Add one row with an unknown category and one with an unknown food.
        ticDat.nutritionQuantities['hot dog', 'boger'] = \
            ticDat.nutritionQuantities['junk', 'protein'] = -12
        expected_failures = {
            ('nutritionQuantities', 'foods', ('food', 'name'), 'many-to-one'):
                (('junk',), (('junk', 'protein'),)),
            ('nutritionQuantities', 'categories', ('category', 'name'),
             'many-to-one'):
                (('boger',), (('hot dog', 'boger'),)),
        }
        self.assertTrue(
            tdf.find_foreign_key_failures(ticDat) == expected_failures)

        self.assertFalse(tdf._same_data(ticDat, pristine_dat))
        tdf.remove_foreign_keys_failures(ticDat)
        self.assertFalse(tdf.find_foreign_key_failures(ticDat))
        self.assertTrue(tdf._same_data(ticDat, pristine_dat))

        run_sql_checks(tdf)
    def testDietCleaningFive(self):
        """Check TicDat cleaning with a row predicate plus a data type rule.

        "categories" rows must satisfy ``maxNutrition >= 66`` and a
        ``minNutrition`` data type of ``max=0, inclusive_max=True``. A
        ``raw_data=True`` read must equal the source data; a ``raw_data=False``
        read must differ from it, still differ after "fat", "calories" and
        "protein" are popped from the source, and match once foreign key
        failures are removed from the source.
        """
        tdf = TicDatFactory(**dietSchema())
        tdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        tdf.set_data_type("categories", "minNutrition", max=0,
                          inclusive_max=True)
        addDietForeignKeys(tdf)
        source_dat = tdf.copy_tic_dat(dietData())

        input_set = create_inputset_mock(tdf, source_dat)

        raw_dat = tdf.opalytics.create_tic_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(raw_dat, source_dat))

        purged_dat = tdf.opalytics.create_tic_dat(input_set, raw_data=False)
        self.assertFalse(tdf._same_data(purged_dat, source_dat))

        for category in ("fat", "calories", "protein"):
            source_dat.categories.pop(category)

        self.assertFalse(tdf._same_data(purged_dat, source_dat))
        tdf.remove_foreign_keys_failures(source_dat)
        self.assertTrue(tdf._same_data(purged_dat, source_dat))
Example #14
0
    def testDataTypes(self):
        """Exercise ``find_data_type_failures`` / ``replace_data_type_failures``.

        First part: a small hand-built diet data set is checked with a
        PanDatFactory that has no data types (no failures expected), then with
        one that makes ``foods.cost`` non-nullable and restricts
        ``nutritionQuantities.qty``. Second part: netflow data whose Detroit
        arcs carry the destination name as capacity is checked against
        different ``strings_allowed`` settings.
        """
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [10,20]
        # Every (food, category) pair gets qty 5 ...
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5
        # ... plus one extra row, keyed by the integer 2, with qty 12.
        ticdat.nutritionQuantities['a', 2] = 12

        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        # No data types are set on this factory, so nothing may be reported
        # and the replacement must leave the data unchanged.
        self.assertFalse(pdf.find_data_type_failures(pandat))
        pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
        self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

        pdf = PanDatFactory(**dietSchema())
        pdf.set_data_type("foods", "cost", nullable=False)
        pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        # Food "b" was given a None cost above.
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        # The four rows with qty 5 are the ones expected to fail (min=5 is
        # exclusive here).
        self.assertTrue(set({(v["food"], v["category"])
                             for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

        # With as_table=False the result is indexed per row; four entries are
        # expected to be True.
        failed = pdf.find_data_type_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        # An explicit replacement value is supplied for qty only.
        fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat), {("nutritionQuantities", "qty"): 5.15})
        self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
        self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        # Overwrite each arc reached through Detroit's arcs_source link with
        # the link key itself.
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed="*")
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        # "New York" is not among the allowed strings.
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
        # The return value is ignored; the assertion below checks ``pandat``
        # itself.
        pdf.replace_data_type_failures(pandat)
        self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
Example #15
0
    def testMissingTable(self):
        """Read an input set that lacks a table declared in the schema.

        The reading factory declares an extra "missing_table" that the mocked
        input set does not contain. The diet tables must still match the
        source data and ``missing_table`` must be empty.
        """
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        frozen_dat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
        inputset = create_inputset_mock(tdf, frozen_dat)

        extended_schema = dict(dietSchema())
        extended_schema["missing_table"] = [["a"], ["b"]]
        extended_tdf = TicDatFactory(**extended_schema)
        loaded = extended_tdf.opalytics.create_tic_dat(inputset)
        self.assertTrue(tdf._same_data(frozen_dat, loaded))
        self.assertFalse(loaded.missing_table)
Example #16
0
 def test_(schema_factory, data_factory):
     """Check that the space/case mapping can be inverted and re-applied.

     ``self`` is not a parameter; it is taken from the enclosing scope.

     :param schema_factory: callable returning the TicDatFactory schema kwargs
     :param data_factory: callable returning the data to copy
     """
     factory = TicDatFactory(**schema_factory())
     original = factory.copy_tic_dat(data_factory())
     result = tlingo._try_create_space_case_mapping(factory, original)
     mapping = result["mapping"]
     inverse = {value: key for key, value in mapping.items()}
     remapped = tlingo._apply_space_case_mapping(factory, original, inverse)
     restored = tlingo._apply_space_case_mapping(factory, remapped, mapping)
     self.assertTrue(factory._same_data(original, restored))
     self.assertFalse(factory._same_data(original, remapped))
Example #17
0
    def testNetflow(self):
        """Round-trip the netflow data through mocked Opalytics input sets.

        For every combination of ``hack`` and ``raw_data`` the data read back
        must equal the source, both before and after an extra node keyed ``12``
        with no data is added.
        """
        if not self.can_run:
            return
        flags = [True, False]
        for hack, raw_data in product(flags, flags):
            tdf = TicDatFactory(**netflowSchema())
            netflow_dat = tdf.copy_tic_dat(netflowData())

            def reload_current():
                # Build a fresh mock from the current state of the data.
                inputset = create_inputset_mock(tdf, netflow_dat, hack)
                return tdf.opalytics.create_tic_dat(inputset,
                                                    raw_data=raw_data)

            self.assertTrue(tdf._same_data(netflow_dat, reload_current()))

            netflow_dat.nodes[12] = {}
            self.assertTrue(tdf._same_data(netflow_dat, reload_current()))
Example #18
0
    def testOne(self):
        """Exercise copying, freezing and validation of diet TicDat objects.

        Covers: ``copy_tic_dat`` with and without ``freeze_it``, rebuilding
        from ``as_dict``, rejection of edits on a frozen copy, and the messages
        reported by ``good_tic_dat_object`` for inconsistent key lengths and
        inconsistent data field names.
        """
        def _cleanIt(x) :
            x.foods['macaroni'] = {"cost": 2.09}
            x.foods['milk'] = {"cost":0.89}
            return x
        dataObj = dietData()
        tdf = TicDatFactory(**dietSchema())
        self.assertTrue(tdf.good_tic_dat_object(dataObj))
        dataObj2 = tdf.copy_tic_dat(dataObj)
        dataObj3 = tdf.copy_tic_dat(dataObj, freeze_it=True)
        dataObj4 = tdf.TicDat(**tdf.as_dict(dataObj3))
        # Every copy holds the same data but is a distinct object.
        self.assertTrue(all (tdf._same_data(dataObj, x) and dataObj is not x for x in (dataObj2, dataObj3, dataObj4)))
        dataObj = _cleanIt(dataObj)
        self.assertTrue(tdf.good_tic_dat_object(dataObj))
        self.assertTrue(all (tdf._same_data(dataObj, x) and dataObj is not x for x in (dataObj2, dataObj3)))
        def hackit(x) :
            x.foods["macaroni"] = 100
        # The frozen copy rejects the edit; the unfrozen copy accepts it and
        # then differs from the source.
        self.assertTrue(self.firesException(lambda :hackit(dataObj3)))
        hackit(dataObj2)
        self.assertTrue(not tdf._same_data(dataObj, dataObj2) and  tdf._same_data(dataObj, dataObj3))

        msg = []
        # A tuple key in a table otherwise keyed by single values.
        dataObj.foods[("milk", "cookies")] = {"cost": float("inf")}
        dataObj.boger = object()
        self.assertFalse(tdf.good_tic_dat_object(dataObj) or
                         tdf.good_tic_dat_object(dataObj, bad_message_handler =msg.append))
        self.assertTrue({"foods : Inconsistent key lengths"} == set(msg))
        self.assertTrue(all(tdf.good_tic_dat_table(getattr(dataObj, t), t)
                            for t in ("categories", "nutritionQuantities")))

        dataObj = dietData()
        # A category row using a field name that belongs to another table.
        # (This assignment used to be written twice; once is sufficient.)
        dataObj.categories["boger"] = {"cost":1}
        self.assertFalse(tdf.good_tic_dat_object(dataObj) or
                         tdf.good_tic_dat_object(dataObj, bad_message_handler=msg.append))
        # ``msg`` still holds the message collected by the earlier check.
        self.assertTrue({'foods : Inconsistent key lengths',
                         'categories : Inconsistent data field name keys.'} == set(msg))
        # NOTE(review): ``.message`` presumes firesException returns an
        # exception object exposing that attribute - confirm.
        ex = firesException(lambda : tdf.freeze_me(tdf.TicDat(**{t:getattr(dataObj,t)
                                                                for t in tdf.primary_key_fields}))).message
        self.assertTrue("categories cannot be treated as a ticDat table : Inconsistent data field name keys" in ex)
Example #19
0
    def testDietCleaningThree_2(self):
        """Check that a cleaned read drops foreign key failures.

        "fat" is popped from the categories of the source data before the
        input set is mocked. A ``raw_data=True`` read must equal that source;
        a ``raw_data=False`` read must differ from it until foreign key
        failures are removed from the source as well.
        """
        tdf = TicDatFactory(**dietSchema())
        addDietForeignKeys(tdf)
        source_dat = tdf.copy_tic_dat(dietData())
        source_dat.categories.pop("fat")
        input_set = create_inputset_mock(tdf, source_dat)

        raw_dat = tdf.opalytics.create_tic_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(raw_dat, source_dat))

        purged_dat = tdf.opalytics.create_tic_dat(input_set, raw_data=False)
        self.assertFalse(tdf._same_data(purged_dat, source_dat))
        tdf.remove_foreign_key_failures(source_dat)
        self.assertTrue(tdf._same_data(purged_dat, source_dat))
Example #20
0
 def testDietWithInfFlagging(self):
     """Check the infinity I/O flag for csv directories.

     The diet data is written with the flag set to 999999999. The flagged
     factory and its clone must read the data back unchanged, whereas a fresh
     factory must read something different.
     """
     flagged_tdf = TicDatFactory(**dietSchema())
     dat = flagged_tdf.copy_tic_dat(dietData())
     flagged_tdf.set_infinity_io_flag(999999999)
     csv_dir = os.path.join(_scratchDir, "dietInfFlag")
     flagged_tdf.csv.write_directory(dat, csv_dir)
     reloaded = flagged_tdf.csv.create_tic_dat(csv_dir)
     self.assertTrue(flagged_tdf._same_data(dat, reloaded))

     cloned_tdf = flagged_tdf.clone()
     reloaded = cloned_tdf.csv.create_tic_dat(csv_dir)
     self.assertTrue(cloned_tdf._same_data(dat, reloaded))

     plain_tdf = TicDatFactory(**dietSchema())
     reloaded = plain_tdf.csv.create_tic_dat(csv_dir)
     self.assertFalse(plain_tdf._same_data(dat, reloaded))
Example #21
0
    def testAdditionalFKs(self):
        """Check foreign key failure detection and removal on PanDat objects.

        Tables pt3, pt4 and pt5 each reference pt1 through F1 and pt2 through
        F2. Rows with unknown keys are added, and after
        ``remove_foreign_key_failures`` the data must be free of failures and
        match the original, first by content and then by row counts.
        """
        pdf = PanDatFactory(pt1=[["F1"], []],
                            pt2=[["F2"], []],
                            pt3=[["F1", "F2"], []],
                            pt4=[["F1"], ["F2"]],
                            pt5=[[], ["F1", "F2"]])
        for child in ["pt3", "pt4", "pt5"]:
            pdf.add_foreign_key(child, "pt1", ["F1", "F1"])
            pdf.add_foreign_key(child, "pt2", ["F2", "F2"])
        tdf = TicDatFactory(**pdf.schema())

        def to_pan_dat(tic_dat):
            # Convert to PanDat and insist that the result has no duplicates.
            converted = pdf.copy_pan_dat(
                copy_to_pandas_with_reset(tdf, tic_dat))
            self.assertFalse(pdf.find_duplicates(converted))
            return converted

        ticDat = tdf.TicDat(pt1=[1, 2, 3, 4], pt2=[5, 6, 7, 8])
        for f1, f2 in itertools.product(range(1, 5), range(5, 9)):
            ticDat.pt3[f1, f2] = {}
            ticDat.pt4[f1] = f2
            ticDat.pt5.append((f1, f2))
        origDat = tdf.copy_tic_dat(ticDat, freeze_it=True)
        self.assertFalse(pdf.find_foreign_key_failures(to_pan_dat(origDat)))

        def add_bad_keyed_rows():
            # Rows in pt3 and pt4 whose keys are absent from pt1 or pt2.
            ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
            ticDat.pt4["no"] = 6
            ticDat.pt4["nono"] = 6.01

        add_bad_keyed_rows()
        panDat = to_pan_dat(ticDat)
        first_failures = pdf.find_foreign_key_failures(panDat)
        self.assertTrue(first_failures)
        pdf.remove_foreign_key_failures(panDat)
        self.assertFalse(pdf.find_foreign_key_failures(panDat))
        self.assertTrue(pdf._same_data(panDat, to_pan_dat(origDat)))

        orig_lens = {t: len(getattr(origDat, t)) for t in tdf.all_tables}
        add_bad_keyed_rows()
        ticDat.pt5.append(("no", 6))
        ticDat.pt5.append((1, "no"))
        panDat = to_pan_dat(ticDat)
        second_failures = pdf.find_foreign_key_failures(panDat)
        # Adding the pt5 rows must strictly extend the set of failure keys.
        self.assertTrue(
            set(first_failures) != set(second_failures)
            and set(first_failures).issubset(second_failures))
        pdf.remove_foreign_key_failures(panDat)
        self.assertFalse(pdf.find_foreign_key_failures(panDat))
        remaining_lens = {t: len(getattr(panDat, t)) for t in tdf.all_tables}
        self.assertTrue(remaining_lens == orig_lens)
Example #22
0
 def testDietWithInfFlagging(self):
     """Check the infinity I/O flag for verbose and non-verbose json files.

     The diet data is written with the flag set to 999999999. The flagged
     factory must read both files back unchanged and its clone must do so for
     the verbose file, whereas a fresh factory must read something different
     from the verbose file.
     """
     flagged_tdf = TicDatFactory(**dietSchema())
     dat = flagged_tdf.copy_tic_dat(dietData())
     flagged_tdf.set_infinity_io_flag(999999999)
     verbose_path = os.path.join(_scratchDir, "dietInfFlag_1.json")
     compact_path = os.path.join(_scratchDir, "dietInfFlag_2.json")
     flagged_tdf.json.write_file(dat, verbose_path, verbose=True)
     flagged_tdf.json.write_file(dat, compact_path, verbose=False)
     from_verbose = flagged_tdf.json.create_tic_dat(verbose_path)
     from_compact = flagged_tdf.json.create_tic_dat(compact_path)
     self.assertTrue(flagged_tdf._same_data(dat, from_verbose))
     self.assertTrue(flagged_tdf._same_data(dat, from_compact))

     cloned_tdf = flagged_tdf.clone()
     from_verbose = cloned_tdf.json.create_tic_dat(verbose_path)
     self.assertTrue(cloned_tdf._same_data(dat, from_verbose))

     plain_tdf = TicDatFactory(**dietSchema())
     from_verbose = plain_tdf.json.create_tic_dat(verbose_path)
     self.assertFalse(plain_tdf._same_data(dat, from_verbose))
Example #23
0
 def testDietWithInfFlagging(self):
     """Check the infinity I/O flag for .sql text files and .db files.

     The diet data is written with the flag set to 999999999. The flagged
     factory must read both files back unchanged and its clone must do so for
     the .sql file, whereas a fresh factory must read something different
     from the .sql file.
     """
     flagged_tdf = TicDatFactory(**dietSchema())
     dat = flagged_tdf.copy_tic_dat(dietData())
     flagged_tdf.set_infinity_io_flag(999999999)
     sql_path = os.path.join(_scratchDir, "dietInfFlag.sql")
     db_path = os.path.join(_scratchDir, "dietInfFlag.db")
     flagged_tdf.sql.write_sql_file(dat, sql_path)
     flagged_tdf.sql.write_db_data(dat, db_path)
     from_sql = flagged_tdf.sql.create_tic_dat_from_sql(sql_path)
     from_db = flagged_tdf.sql.create_tic_dat(db_path)
     self.assertTrue(flagged_tdf._same_data(dat, from_sql))
     self.assertTrue(flagged_tdf._same_data(dat, from_db))

     cloned_tdf = flagged_tdf.clone()
     from_sql = cloned_tdf.sql.create_tic_dat_from_sql(sql_path)
     self.assertTrue(cloned_tdf._same_data(dat, from_sql))

     plain_tdf = TicDatFactory(**dietSchema())
     from_sql = plain_tdf.sql.create_tic_dat_from_sql(sql_path)
     self.assertFalse(plain_tdf._same_data(dat, from_sql))
Example #24
0
    def testNetflowOpalytics(self):
        """Check netflow data survives a trip through a mocked input set.

        For every combination of the ``hack`` and ``raw_data`` switches the
        data is pushed through ``create_inputset_mock`` and read back as a
        PanDat, first as-is and then again after an extra node is added.
        """
        if not self.can_run:
            return

        def assert_round_trip(factory, data, use_hack, use_raw):
            # Build the mock, read it with a fresh PanDatFactory and compare.
            mock_inputs = create_inputset_mock(factory, data, use_hack)
            pan_factory = PanDatFactory(**factory.schema())
            pan_data = pan_factory.opalytics.create_pan_dat(
                mock_inputs, raw_data=use_raw)
            self.assertTrue(
                factory._same_data(data, pan_factory.copy_to_tic_dat(pan_data)))

        for hack, raw_data in itertools.product([True, False], repeat=2):
            tdf = TicDatFactory(**netflowSchema())
            ticDat = tdf.copy_tic_dat(netflowData())
            assert_round_trip(tdf, ticDat, hack, raw_data)

            # Same check once a node with no data fields set has been added.
            ticDat.nodes[12] = {}
            assert_round_trip(tdf, ticDat, hack, raw_data)
Example #25
0
    def testDataRowPredicatesTwo(self):
        """Exercise data-row predicates that take extra keyword arguments.

        Predicates are registered with a ``predicate_kwargs_maker``; the test
        checks how often the maker is called, what is reported when the
        predicates fail, how an exception raised inside the maker is
        surfaced (raised vs. "Handled as Failure"), that cloning the factory
        keeps the predicates, and that a predicate may return an error
        message string.
        """
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        # One-element list so the nested pre_processor can bump the counter.
        num_calls=[0]
        # While this list is non-empty, pre_processor deliberately blows up.
        mess_it_up=[]
        def pre_processor(dat):
            """Count the call and return the row count of every table."""
            num_calls[0] += 1
            if mess_it_up:
                # The assertions below expect this to raise AttributeError.
                dat.messing_it_up+=1
            return {t:len(getattr(dat, t)) for t in tdf.all_tables}
        pdf.add_data_row_predicate("foods", lambda row, y: y==12, predicate_kwargs_maker=lambda dat: {"y":12})
        # "catfat" and "foodza" share the same kwargs maker (pre_processor).
        pdf.add_data_row_predicate("categories", lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "fat" or categories == 4,
                               predicate_name="catfat", predicate_kwargs_maker=pre_processor)
        pdf.add_data_row_predicate("foods", lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "pizza" or foods == 9,
                               predicate_name= "foodza", predicate_kwargs_maker=pre_processor)
        def dummy_kwargs_maker(dat):
            """Return ``{"x": 1}`` for a good pan dat object, else ``None`` implicitly."""
            if pdf.good_pan_dat_object(dat):
                return {"x":1}
        for t in tdf.all_tables:
            pdf.add_data_row_predicate(t, lambda row, x: x==1, predicate_name=f"dummy_{t}",
                                       predicate_kwargs_maker=dummy_kwargs_maker)
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
        self.assertFalse(pdf.find_data_row_failures(pandat))
        # Two predicates use pre_processor, yet it ran once for the whole search.
        self.assertTrue(num_calls[0] == 1)
        # Drop the "pizza" and "fat" rows: table sizes change, so the
        # remaining rows of both tables are expected to fail (asserted below).
        pandat.foods = pandat.foods[pandat.foods["name"] != "pizza"].copy()
        pandat.categories = pandat.categories[pandat.categories["name"] != "fat"].copy()
        fails = pdf.find_data_row_failures(pandat)
        self.assertTrue(num_calls[0] == 2)
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        self.assertTrue(set(fails['categories', 'catfat']["name"]) == set(dietData().categories).difference(["fat"]))
        self.assertTrue(set(fails['foods', 'foodza']["name"]) == set(dietData().foods).difference(["pizza"]))

        # From here on pre_processor raises.
        mess_it_up.append(1)
        ex = []
        try:
            pdf.find_data_row_failures(pandat)
        except Exception as e:
            ex[:] = [str(e.__class__)]
        # NOTE(review): if no exception is raised, ex[0] fails with IndexError
        # rather than a regular assertion failure.
        self.assertTrue("AttributeError" in ex[0])
        fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        # One call for the raising search above plus one for the handled search.
        self.assertTrue(num_calls[0] == 4)
        for v in fails.values():
            self.assertTrue(v.primary_key == '*' and "no attribute" in v.error_message)
        # The cloned factory reports the same failures.
        pdf = pdf.clone()
        fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        # Rebinding the name that pre_processor closes over: it now sees an
        # empty list again and stops raising.
        mess_it_up=[]
        def fail_on_bad_name(row, bad_name):
            """Return an error message for the row named ``bad_name``, else True."""
            if row["name"] == bad_name:
                return f"{bad_name} is bad"
            return True
        # The kwargs maker picks the alphabetically first food name as the bad one.
        pdf.add_data_row_predicate("foods", fail_on_bad_name, predicate_name="baddy",
                                   predicate_kwargs_maker=lambda dat: {"bad_name": sorted(dat.foods["name"])[0]},
                                   predicate_failure_response="Error Message")
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
        fails = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(map(tuple, fails)) == {('foods', 'baddy')})
        self.assertTrue(len(fails['foods', 'baddy']) == 1)
        self.assertTrue(list(fails['foods', 'baddy']["Error Message"])[0] == "chicken is bad")
Example #26
0
    def testBasicFKs(self):
        """Check foreign-key failure detection and removal on a PanDatFactory.

        The whole scenario is run three times: with the factory cloned,
        not cloned, and cloned with an explicit table list.
        """
        for cloning in [True, False, "*"]:
            # NOTE(review): `tdf` is looked up when the lambda is called, not
            # here. It is only evaluated when cloning == "*", which is the last
            # iteration, so `tdf` is the one left over from the previous pass.
            clone_me_maybe = lambda x : x.clone(tdf.all_tables if cloning == "*" else None) if cloning else x

            pdf = PanDatFactory(plants = [["name"], ["stuff", "otherstuff"]],
                                lines = [["name"], ["plant", "weird stuff"]],
                                line_descriptor = [["name"], ["booger"]],
                                products = [["name"],["gover"]],
                                production = [["line", "product"], ["min", "max"]],
                                pureTestingTable = [[], ["line", "plant", "product", "something"]],
                                extraProduction = [["line", "product"], ["extramin", "extramax"]],
                                weirdProduction = [["line1", "line2", "product"], ["weirdmin", "weirdmax"]])
            pdf.add_foreign_key("production", "lines", ("line", "name"))
            pdf.add_foreign_key("production", "products", ("product", "name"))
            pdf.add_foreign_key("lines", "plants", ("plant", "name"))
            pdf.add_foreign_key("line_descriptor", "lines", ("name", "name"))
            # Every pureTestingTable field except "something" points at the
            # table named "<field>s" (line -> lines, plant -> plants, ...).
            for f in set(pdf.data_fields["pureTestingTable"]).difference({"something"}):
                pdf.add_foreign_key("pureTestingTable", "%ss"%f, (f,"name"))
            pdf.add_foreign_key("extraProduction", "production", (("line", "line"), ("product","product")))
            pdf.add_foreign_key("weirdProduction", "production", (("line1", "line"), ("product","product")))
            pdf.add_foreign_key("weirdProduction", "extraProduction", (("line2","line"), ("product","product")))
            self._testPdfReproduction(pdf)
            pdf = clone_me_maybe(pdf)

            # A TicDatFactory with the same schema is used to build the data.
            tdf = TicDatFactory(**pdf.schema())
            goodDat = tdf.TicDat()
            goodDat.plants["Cleveland"] = ["this", "that"]
            goodDat.plants["Newark"]["otherstuff"] =1
            goodDat.products["widgets"] = goodDat.products["gadgets"] = "shizzle"

            for i,p in enumerate(goodDat.plants):
                goodDat.lines[i]["plant"] = p

            for i,(pl, pd) in enumerate(itertools.product(goodDat.lines, goodDat.products)):
                goodDat.production[pl, pd] = {"min":1, "max":10+i}

            # badDat1/badDat2 carry a production row for a line that does not exist.
            badDat1 = tdf.copy_tic_dat(goodDat)
            badDat1.production["notaline", "widgets"] = [0,1]
            badDat2 = tdf.copy_tic_dat(badDat1)


            def pan_dat_(_):
                """Convert a TicDat to a PanDat and assert it has no duplicates."""
                rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
                self.assertFalse(pdf.find_duplicates(rtn))
                return rtn
            fk, fkm = ForeignKey, ForeignKeyMapping
            fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
            fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))

            self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                            {fk('production', 'lines', fkm('line', 'name'), 'many-to-one')})
            self.assertTrue(set(pdf.find_foreign_key_failures(pan_dat_(badDat1), verbosity="Low")) ==
                            set(pdf.find_foreign_key_failures(pan_dat_(badDat2), verbosity="Low")) ==
                             {('production', 'lines', ('line', 'name'))})
            for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
                self.assertTrue(set(row_fails["line"]) == {"notaline"} and set(row_fails["product"]) == {"widgets"})

            # Adding the missing line moves the failure: the line now exists
            # but refers to a plant that does not.
            badDat1.lines["notaline"]["plant"] = badDat2.lines["notaline"]["plant"] = "notnewark"
            fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
            fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
            self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                            {fk('lines', 'plants', fkm('plant', 'name'), 'many-to-one')})
            for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
                self.assertTrue(set(row_fails["name"]) == {"notaline"} and set(row_fails["plant"]) == {"notnewark"})


            # Removing the failures must bring both data sets back to goodDat.
            for bad in [badDat1, badDat2]:
                bad_pan = pdf.remove_foreign_key_failures(pan_dat_(bad))
                self.assertFalse(pdf.find_foreign_key_failures(bad_pan))
                self.assertTrue(pdf._same_data(bad_pan, pan_dat_(goodDat)))


            # `_` holds the current number of lines so new keys do not collide.
            _ = len(goodDat.lines)
            for i,p in enumerate(list(goodDat.plants.keys()) + list(goodDat.plants.keys())):
                goodDat.lines[i+_]["plant"] = p
            # NOTE(review): `i` is the leftover index from the loop above and
            # does not depend on `l`, so the same test and value apply to every
            # line. Possibly `l` was intended -- confirm before changing.
            for l in goodDat.lines:
                if i%2:
                    goodDat.line_descriptor[l] = i+10

            for i,(l,pl,pdct) in enumerate(sorted(itertools.product(goodDat.lines, goodDat.plants, goodDat.products))):
                goodDat.pureTestingTable.append((l,pl,pdct,i))
            self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(goodDat)))
            # A single junk row is expected to fail all three pureTestingTable
            # foreign keys.
            badDat = tdf.copy_tic_dat(goodDat)
            badDat.pureTestingTable.append(("j", "u", "nk", "ay"))
            fk_fails = pdf.find_foreign_key_failures(pan_dat_(badDat))
            self.assertTrue(set(fk_fails) ==
                {fk('pureTestingTable', 'plants', fkm('plant', 'name'), 'many-to-one'),
                 fk('pureTestingTable', 'products', fkm('product', 'name'), 'many-to-one'),
                 fk('pureTestingTable', 'lines', fkm('line', 'name'), 'many-to-one')})

            # Transposing puts the failing row into the first column, whose
            # values are then compared with the junk row.
            for df in fk_fails.values():
                df = df.T
                c = df.columns[0]
                self.assertTrue({'ay', 'j', 'nk', 'u'} == set(df[c]))
Example #27
0
    def testDataPredicates(self):
        """Check data-row predicate failures on diet and netflow PanDat data."""
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        # Small hand-built data set: two foods, two categories, every
        # (food, category) pair with a quantity of 5.
        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        # No predicates registered yet, so no row failures are expected.
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # pandat_2 additionally holds category "3" (plus one extra quantity row).
        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            """Register three predicates on a factory built from ``sch`` and
            check the failures reported for ``pandat`` and ``pandat_2``."""
            # This local pdf shadows the outer one on purpose.
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            good_qty = lambda qty : 5 < qty <= 12
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")
            failed = pdf.find_data_row_failures(pandat)
            self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
            self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
            # Every quantity is 5, which is outside the 5 < qty <= 12 window.
            self.assertTrue(set({(v["food"], v["category"])
                                 for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                                {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
            # With as_table=False the result supports value_counts(); four
            # entries are expected to be True.
            failed = pdf.find_data_row_failures(pandat, as_table=False)
            self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
            failed = pdf.find_data_row_failures(pandat_2)
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})

        perform_predicate_checks(dietSchema())
        # Same checks with '*' as the schema entry for every table.
        perform_predicate_checks({t:'*' for t in dietSchema()})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        # Overwrite each Detroit arc with the destination name itself.
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # A predicate that always passes reports no failures.
        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # Only numbers and three listed strings are accepted as capacity;
        # the Detroit -> New York arc is expected to be the single failure.
        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
Example #28
0
    def testSix(self):
        """Check foreign-key failure handling and ``obfusimplify`` on a TicDatFactory.

        Covers finding foreign-key failures, removing them with and without
        propagation, and obfuscating a data set with default prefixes,
        explicit prefixes and skipped tables.
        """
        tdf = TicDatFactory(plants = [["name"], ["stuff", "otherstuff"]],
                            lines = [["name"], ["plant", "weird stuff"]],
                            line_descriptor = [["name"], ["booger"]],
                            products = [["name"],["gover"]],
                            production = [["line", "product"], ["min", "max"]],
                            pureTestingTable = [[], ["line", "plant", "product", "something"]],
                            extraProduction = [["line", "product"], ["extramin", "extramax"]],
                            weirdProduction = [["line1", "line2", "product"], ["weirdmin", "weirdmax"]])
        tdf.add_foreign_key("production", "lines", ("line", "name"))
        tdf.add_foreign_key("production", "products", ("product", "name"))
        tdf.add_foreign_key("lines", "plants", ("plant", "name"))
        tdf.add_foreign_key("line_descriptor", "lines", ("name", "name"))
        # Every pureTestingTable field except "something" points at the table
        # named "<field>s" (line -> lines, plant -> plants, ...).
        for f in set(tdf.data_fields["pureTestingTable"]).difference({"something"}):
            tdf.add_foreign_key("pureTestingTable", "%ss"%f, (f,"name"))
        tdf.add_foreign_key("extraProduction", "production", (("line", "line"), ("product","product")))
        tdf.add_foreign_key("weirdProduction", "production", (("line1", "line"), ("product","product")))
        tdf.add_foreign_key("weirdProduction", "extraProduction", (("line2","line"), ("product","product")))

        goodDat = tdf.TicDat()
        goodDat.plants["Cleveland"] = ["this", "that"]
        goodDat.plants["Newark"]["otherstuff"] =1
        goodDat.products["widgets"] = goodDat.products["gadgets"] = "shizzle"

        for i,p in enumerate(goodDat.plants):
            goodDat.lines[i]["plant"] = p

        for i,(pl, pd) in enumerate(itertools.product(goodDat.lines, goodDat.products)):
            goodDat.production[pl, pd] = {"min":1, "max":10+i}

        # badDat1/badDat2 carry a production row for a line that does not exist.
        badDat1 = tdf.copy_tic_dat(goodDat)
        badDat1.production["notaline", "widgets"] = [0,1]
        badDat2 = tdf.copy_tic_dat(badDat1)

        fk, fkm = _ForeignKey, _ForeignKeyMapping
        self.assertTrue(tdf.find_foreign_key_failures(badDat1) == tdf.find_foreign_key_failures(badDat2) ==
                        {fk('production', 'lines', fkm('line', 'name'), 'many-to-one'):
                             (('notaline',), (('notaline', 'widgets'),))})
        # Adding the missing line moves the failure: the line now exists but
        # refers to a plant that does not.
        badDat1.lines["notaline"]["plant"] = badDat2.lines["notaline"]["plant"] = "notnewark"
        self.assertTrue(tdf.find_foreign_key_failures(badDat1) == tdf.find_foreign_key_failures(badDat2) ==
                        {fk('lines', 'plants', fkm('plant', 'name'), 'many-to-one'):
                             (('notnewark',), ('notaline',))})
        # NOTE(review): other tests in this file call the singular
        # `remove_foreign_key_failures`; confirm which spelling the installed
        # ticdat exposes.
        tdf.remove_foreign_keys_failures(badDat1, propagate=False)
        tdf.remove_foreign_keys_failures(badDat2, propagate=True)
        self.assertTrue(tdf._same_data(badDat2, goodDat) and not tdf.find_foreign_key_failures(badDat2))
        # Without propagation the bad line is gone, which re-exposes the
        # production row that pointed at it.
        self.assertTrue(tdf.find_foreign_key_failures(badDat1) ==
                {fk('production', 'lines', fkm('line', 'name'), 'many-to-one'):
                     (('notaline',), (('notaline', 'widgets'),))})

        # A second non-propagating pass cleans up the remaining failure.
        tdf.remove_foreign_keys_failures(badDat1, propagate=False)
        self.assertTrue(tdf._same_data(badDat1, goodDat) and not tdf.find_foreign_key_failures(badDat1))

        # `_` holds the current number of lines so new keys do not collide.
        _ = len(goodDat.lines)
        # keys() is wrapped in list() because Python 3 key views do not
        # support `+`.
        for i,p in enumerate(list(goodDat.plants.keys()) + list(goodDat.plants.keys())):
            goodDat.lines[i+_]["plant"] = p
        # NOTE(review): `i` is the leftover index from the loop above and does
        # not depend on `l`, so the same test and value apply to every line.
        # Possibly `l` was intended -- confirm before changing.
        for l in goodDat.lines:
            if i%2:
                goodDat.line_descriptor[l] = i+10

        for i,(l,pl,pdct) in enumerate(sorted(itertools.product(goodDat.lines, goodDat.plants, goodDat.products))):
            goodDat.pureTestingTable.append((l,pl,pdct,i))
        self.assertFalse(tdf.find_foreign_key_failures(goodDat))
        # A single junk row is expected to fail all three pureTestingTable
        # foreign keys; `l` is its position in the table.
        badDat = tdf.copy_tic_dat(goodDat)
        badDat.pureTestingTable.append(("j", "u", "nk", "ay"))
        l = len(goodDat.pureTestingTable)
        self.assertTrue(tdf.find_foreign_key_failures(badDat) ==
         {fk('pureTestingTable', 'plants', fkm('plant', 'name'), 'many-to-one'): (('u',),(l,)),
          fk('pureTestingTable', 'products', fkm('product', 'name'), 'many-to-one'): (('nk',), (l,)),
          fk('pureTestingTable', 'lines', fkm('line', 'name'), 'many-to-one'): (('j',), (l,))})

        obfudat = tdf.obfusimplify(goodDat, freeze_it=True)
        self.assertTrue(all(len(getattr(obfudat.copy, t)) == len(getattr(goodDat, t))
                            for t in tdf.all_tables))
        # Every original key must appear as the second item of a renaming
        # value and never as a renaming key. chain() avoids concatenating
        # Python 3 key views.
        for n in itertools.chain(goodDat.plants.keys(), goodDat.lines.keys(), goodDat.products.keys()):
            self.assertTrue(n in {_[1] for _ in obfudat.renamings.values()})
            self.assertFalse(n in obfudat.renamings)
        self.assertTrue(obfudat.copy.plants['P2']['otherstuff'] == 1)
        self.assertFalse(tdf._same_data(obfudat.copy, goodDat))
        # values() is wrapped in list() because Python 3 views cannot be indexed.
        for r in obfudat.copy.line_descriptor.values():
            i = list(r.values())[0] - 10
            self.assertTrue(i%2 and (list(goodDat.line_descriptor[i].values())[0] == i+10))

        # Explicit prefixes are expected to reproduce the default obfuscation.
        obfudat2 = tdf.obfusimplify(goodDat, {"plants": "P", "lines" : "L", "products" :"PR"})
        self.assertTrue(tdf._same_data(obfudat.copy, obfudat2.copy))

        # Skipping all three renamed tables leaves the data unchanged.
        obfudat3 = tdf.obfusimplify(goodDat, skip_tables=["plants", "lines", "products"])
        self.assertTrue(tdf._same_data(obfudat3.copy, goodDat))

        # Skipping only some tables differs from both the original data and
        # the fully obfuscated copy.
        obfudat4 = tdf.obfusimplify(goodDat, skip_tables=["lines", "products"])
        self.assertFalse(tdf._same_data(obfudat4.copy, goodDat))
        self.assertFalse(tdf._same_data(obfudat4.copy, obfudat.copy))
Example #29
0
    def testDiet(self):
        """Round-trip the diet data through database and SQL text files.

        The same checks run twice: with a plain factory, and with a factory
        that has default values and foreign keys configured.
        """
        if not self.can_run:
            return
        def doTheTests(tdf) :
            """Write the diet data with ``tdf.sql``, read it back and compare,
            checking the overwrite protection and ``freeze_it`` on the way."""
            ticDat = tdf.freeze_me(tdf.TicDat(**{t:getattr(dietData(),t) for t in tdf.primary_key_fields}))
            filePath = makeCleanPath(os.path.join(_scratchDir, "diet.db"))
            tdf.sql.write_db_data(ticDat, filePath)
            self.assertFalse(tdf.sql.find_duplicates(filePath))
            sqlTicDat = tdf.sql.create_tic_dat(filePath)
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))
            def changeit() :
                """Edit one value of whatever ``sqlTicDat`` is currently bound to."""
                sqlTicDat.categories["calories"]["minNutrition"]=12
            # The unfrozen copy accepts the edit and so diverges from ticDat.
            changeit()
            self.assertFalse(tdf._same_data(ticDat, sqlTicDat))

            # Writing over the existing file must fail unless allow_overwrite is set.
            self.assertTrue(self.firesException(lambda : tdf.sql.write_db_data(ticDat, filePath)))
            tdf.sql.write_db_data(ticDat, filePath, allow_overwrite=True)
            # A frozen copy must reject the edit and stay equal to ticDat.
            sqlTicDat = tdf.sql.create_tic_dat(filePath, freeze_it=True)
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))
            self.assertTrue(self.firesException(changeit))
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))

            # Same sequence for the SQL text file format.
            filePath = makeCleanPath(os.path.join(_scratchDir, "diet.sql"))
            tdf.sql.write_sql_file(ticDat, filePath)
            sqlTicDat = tdf.sql.create_tic_dat_from_sql(filePath)
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))
            changeit()
            self.assertFalse(tdf._same_data(ticDat, sqlTicDat))

            # ...and once more with the schema included in the SQL file.
            tdf.sql.write_sql_file(ticDat, filePath, include_schema=True, allow_overwrite=True)
            sqlTicDat = tdf.sql.create_tic_dat_from_sql(filePath, includes_schema=True, freeze_it=True)
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))
            self.assertTrue(self.firesException(changeit))
            self.assertTrue(tdf._same_data(ticDat, sqlTicDat))

        doTheTests(TicDatFactory(**dietSchema()))

        tdf = TicDatFactory(**dietSchema())
        self.assertFalse(tdf.foreign_keys)
        tdf.set_default_values(categories =  {'maxNutrition': float("inf"), 'minNutrition': 0.0},
                               foods =  {'cost': 0.0},
                               nutritionQuantities =  {'qty': 0.0})
        addDietForeignKeys(tdf)
        # With the foreign keys in place, both parent tables must be ordered
        # ahead of nutritionQuantities.
        ordered = tdf.sql._ordered_tables()
        self.assertTrue(ordered.index("categories") < ordered.index("nutritionQuantities"))
        self.assertTrue(ordered.index("foods") < ordered.index("nutritionQuantities"))

        ticDat = tdf.TicDat(**{t:getattr(dietData(),t) for t in tdf.primary_key_fields})
        self._test_generic_copy(ticDat, tdf)
        self._test_generic_copy(ticDat, tdf, ["nutritionQuantities"])
        origTicDat = tdf.copy_tic_dat(ticDat)
        self.assertTrue(tdf._same_data(ticDat, origTicDat))
        self.assertFalse(tdf.find_foreign_key_failures(ticDat))
        # Two rows with dangling references: one bad category, one bad food.
        ticDat.nutritionQuantities['hot dog', 'boger'] = ticDat.nutritionQuantities['junk', 'protein'] = -12
        self.assertTrue(tdf.find_foreign_key_failures(ticDat) ==
        {('nutritionQuantities', 'foods', ('food', 'name'), 'many-to-one'): (('junk',), (('junk', 'protein'),)),
         ('nutritionQuantities', 'categories', ('category', 'name'), 'many-to-one'):
             (('boger',), (('hot dog', 'boger'),))})

        # Removing the failures must restore the original data.
        self.assertFalse(tdf._same_data(ticDat, origTicDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertFalse(tdf.find_foreign_key_failures(ticDat))
        self.assertTrue(tdf._same_data(ticDat, origTicDat))

        doTheTests(tdf)
Example #30
0
    def testEight(self):
        """Check data-type failure detection and replacement on TicDatFactory.

        Uses a small hand-built diet data set first and netflow data
        afterwards. ``tdf`` is rebound several times; the nested ``makeIt``
        helpers read it at call time, so each call uses the latest factory.
        """
        tdf = TicDatFactory(**dietSchema())
        def makeIt() :
            """Build and freeze a small diet data set with the current ``tdf``."""
            rtn = tdf.TicDat()
            rtn.foods["a"] = 12
            rtn.foods["b"] = None
            rtn.categories["1"] = {"maxNutrition":100, "minNutrition":40}
            rtn.categories["2"] = [10,20]
            for f, p in itertools.product(rtn.foods, rtn.categories):
                rtn.nutritionQuantities[f,p] = 5
            # Note the integer 2 here, unlike the string category key "2" above.
            rtn.nutritionQuantities['a', 2] = 12
            return tdf.freeze_me(rtn)
        dat = makeIt()
        # No data types configured yet, so nothing can fail.
        self.assertFalse(tdf.find_data_type_failures(dat))

        tdf = TicDatFactory(**dietSchema())
        tdf.set_data_type("foods", "cost", nullable=False)
        tdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        tdf.set_default_value("foods", "cost", 2)
        dat = makeIt()
        failed = tdf.find_data_type_failures(dat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        # The four rows with qty 5 fail the exclusive minimum of 5.
        self.assertTrue(set(failed['nutritionQuantities', 'qty'].pks) ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
        self.assertTrue(failed['nutritionQuantities', 'qty'].bad_values == (5,))
        # Replacing without an explicit value for qty must raise; the message
        # is checked for the offending table and field.
        ex = self.firesException(lambda : tdf.replace_data_type_failures(tdf.copy_tic_dat(dat)))
        self.assertTrue(all(_ in ex for _ in ("replacement value", "nutritionQuantities", "qty")))
        fixedDat = tdf.replace_data_type_failures(tdf.copy_tic_dat(dat),
                            replacement_values={("nutritionQuantities", "qty"):5.001})
        self.assertFalse(tdf.find_data_type_failures(fixedDat) or tdf._same_data(fixedDat, dat))
        self.assertTrue(all(fixedDat.nutritionQuantities[pk]["qty"] == 5.001 for pk in
                            failed['nutritionQuantities', 'qty'].pks))
        # The null cost of food "b" ended up as 2, the default value set above.
        self.assertTrue(fixedDat.foods["a"]["cost"] == 12 and fixedDat.foods["b"]["cost"] == 2 and
                        fixedDat.nutritionQuantities['a', 2]["qty"] == 12)

        # Same outcome when the cost replacement is passed explicitly rather
        # than coming from a default value.
        tdf = TicDatFactory(**dietSchema())
        tdf.set_data_type("foods", "cost", nullable=False)
        tdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        fixedDat2 = tdf.replace_data_type_failures(tdf.copy_tic_dat(dat),
                            replacement_values={("nutritionQuantities", "qty"):5.001, ("foods", "cost") : 2})
        self.assertTrue(tdf._same_data(fixedDat, fixedDat2))

        # With numbers disallowed for qty, every nutritionQuantities row fails.
        tdf = TicDatFactory(**dietSchema())
        tdf.set_data_type("foods", "cost", nullable=True)
        tdf.set_data_type("nutritionQuantities", "qty",number_allowed=False)
        failed = tdf.find_data_type_failures(dat)
        self.assertTrue(set(failed) == {('nutritionQuantities', 'qty')})
        self.assertTrue(set(failed['nutritionQuantities', 'qty'].pks) == set(dat.nutritionQuantities))
        ex = self.firesException(lambda : tdf.replace_data_type_failures(tdf.copy_tic_dat(dat)))
        self.assertTrue(all(_ in ex for _ in ("replacement value", "nutritionQuantities", "qty")))

        # With set_data_type called with no options, the null cost of "b" is
        # replaced by 0.
        tdf = TicDatFactory(**dietSchema())
        tdf.set_data_type("foods", "cost")
        fixedDat = tdf.replace_data_type_failures(tdf.copy_tic_dat(makeIt()))
        self.assertTrue(fixedDat.foods["a"]["cost"] == 12 and fixedDat.foods["b"]["cost"] == 0)

        # Node rows only gain an arcs_source attribute once foreign key links
        # are enabled on the factory.
        tdf = TicDatFactory(**netflowSchema())
        addNetflowForeignKeys(tdf)
        dat = tdf.copy_tic_dat(netflowData(), freeze_it=1)
        self.assertFalse(hasattr(dat.nodes["Detroit"], "arcs_source"))

        tdf = TicDatFactory(**netflowSchema())
        addNetflowForeignKeys(tdf)
        tdf.enable_foreign_key_links()
        dat = tdf.copy_tic_dat(netflowData(), freeze_it=1)
        self.assertTrue(hasattr(dat.nodes["Detroit"], "arcs_source"))

        tdf = TicDatFactory(**netflowSchema())
        def makeIt() :
            """Build frozen netflow data in which each Detroit arc holds its
            destination name, configuring foreign keys on ``tdf`` if needed."""
            # tdf is read at call time, so a freshly rebound factory gets its
            # links and foreign keys set up on first use.
            if not tdf.foreign_keys:
                tdf.enable_foreign_key_links()
                addNetflowForeignKeys(tdf)
            orig = netflowData()
            rtn = tdf.copy_tic_dat(orig)
            for n in rtn.nodes["Detroit"].arcs_source:
                rtn.arcs["Detroit", n] = n
            # Overwriting existing arcs must not change any table size.
            self.assertTrue(all(len(getattr(rtn, t)) == len(getattr(orig, t)) for t in tdf.all_tables))
            return tdf.freeze_me(rtn)
        dat = makeIt()
        self.assertFalse(tdf.find_data_type_failures(dat))

        # Any string is allowed as capacity: no failures.
        tdf = TicDatFactory(**netflowSchema())
        tdf.set_data_type("arcs", "capacity", strings_allowed="*")
        dat = makeIt()
        self.assertFalse(tdf.find_data_type_failures(dat))

        # Only three strings are allowed, so the "New York" capacity fails.
        tdf = TicDatFactory(**netflowSchema())
        tdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        dat = makeIt()
        failed = tdf.find_data_type_failures(dat)
        self.assertTrue(failed == {('arcs', 'capacity'):(("New York",), (("Detroit", "New York"),))})
        fixedDat = tdf.replace_data_type_failures(tdf.copy_tic_dat(makeIt()))
        netflowData_ = tdf.copy_tic_dat(netflowData())
        self.assertFalse(tdf.find_data_type_failures(fixedDat) or tdf._same_data(dat, netflowData_))
        # Replace the failing capacity with 80, then set the two remaining
        # Detroit arcs by hand; the result must equal the original data.
        fixedDat = tdf.copy_tic_dat(tdf.replace_data_type_failures(tdf.copy_tic_dat(makeIt()),
                                        {("arcs", "capacity"):80, ("cost","cost") :"imok"}))
        fixedDat.arcs["Detroit", "Boston"] = 100
        fixedDat.arcs["Detroit", "Seattle"] = 120
        self.assertTrue(tdf._same_data(fixedDat, netflowData_))
Example #31
0
    def testDataPredicates(self):
        # this test won't run properly if the -O flag is applied
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            """
            Check data row predicate failure detection for a PanDatFactory built from ``sch``.

            Two factories are exercised side by side: ``pdf``, whose predicates return
            booleans, and ``pdf2``, whose predicates wrap those of ``pdf`` so that a
            failure returns the string "<predicate name> failed!" and are registered
            with ``predicate_failure_response="Error Message"``.

            Reads ``pandat`` and ``pandat_2`` from the enclosing test method.

            :param sch: keyword arguments used to construct each PanDatFactory
            """
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: 5 < row["qty"] <= 12, "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")

            def make_error_message_predicate(f, name):
                # A factory function is used so that every wrapper binds its own
                # f and name, rather than sharing the loop variables below.
                def error_message_predicate(row):
                    return True if f(row) else f"{name} failed!"
                return error_message_predicate

            # pdf2 mirrors every predicate of pdf, but reports failures as messages.
            pdf2 = PanDatFactory(**sch)
            for t, preds in pdf._data_row_predicates.items():
                for p_name, rpi in preds.items():
                    pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                                predicate_name=p_name, predicate_failure_response="Error Message")

            failed = pdf.find_data_row_failures(pandat)
            failed2 = pdf2.find_data_row_failures(pandat)
            expected_failures = {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')}
            self.assertEqual(set(failed), expected_failures)
            self.assertEqual(set(failed2), expected_failures)
            for fails in [failed, failed2]:
                self.assertEqual(set(fails['foods', 'cost']["name"]), {'b'})
                bad_qty_keys = {(v["food"], v["category"])
                                for v in fails['nutritionQuantities', 'qty'].T.to_dict().values()}
                self.assertEqual(bad_qty_keys, {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
                self.assertEqual(set(fails['categories', 'minmax']["name"]), {'2'})
            for t, n in failed2:
                self.assertEqual(set(failed2[t, n]["Error Message"]), {f'{n} failed!'})

            for _pdf in [pdf, pdf2]:
                failed = _pdf.find_data_row_failures(pandat, as_table=False)
                self.assertEqual(4, failed['nutritionQuantities', 'qty'].value_counts()[True])
                # By default an exception raised inside a predicate is expected to
                # propagate. assertRaises reports a clean test failure if nothing is
                # raised, instead of an IndexError from probing an empty list.
                with self.assertRaises(TypeError):
                    _pdf.find_data_row_failures(pandat_2)
                failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
                self.assertEqual(set(failed['categories', 'minmax']["name"]), {'2', '3'})

            # With "Handled as Failure" the exception text is reported as the error message.
            failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
            df = failed['categories', 'minmax']
            err_str = list(df[df['name'] == '3']["Error Message"])[0]
            self.assertEqual(err_str, "Exception<'>=' not supported between instances of 'int' and 'str'>")

        perform_predicate_checks(dietSchema())
        perform_predicate_checks({t:'*' for t in dietSchema()})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})

        pdf = PanDatFactory(table=[[],["Field", "Error Message", "Error Message (1)"]])
        pdf.add_data_row_predicate("table", predicate=lambda row: f"Oops {row['Field']}" if row["Field"] > 1 else True,
                                   predicate_name="silly", predicate_failure_response="Error Message")
        df = DataFrame({"Field":[2, 1], "Error Message":["what", "go"], "Error Message (1)": ["now", "go"]})
        fails = pdf.find_data_row_failures(pdf.PanDat(table=df))
        df = fails["table", "silly"]
        self.assertTrue(list(df.columns) == ["Field", "Error Message", "Error Message (1)", "Error Message (2)"])
        self.assertTrue(set(df["Field"]) == {2} and set(df["Error Message (2)"]) == {'Oops 2'})