コード例 #1
0
ファイル: testpandat_utils.py プロジェクト: adampkehoe/ticdat
    def testDataTypes_two(self):
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**tdf.schema())

        def makeIt():
            rtn = tdf.TicDat()
            rtn.foods["a"] = 12
            rtn.foods["b"] = None
            rtn.foods[None] = 101
            rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
            rtn.categories["2"] = [10, 20]
            for f, p in itertools.product(rtn.foods, rtn.categories):
                rtn.nutritionQuantities[f, p] = 5
            rtn.nutritionQuantities['a', 2] = 12
            return tdf.copy_to_pandas(rtn, drop_pk_columns=False)

        dat = makeIt()
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 2 and not pdf.find_duplicates(dat))
        dat_copied = pdf.copy_pan_dat(dat)
        pdf.replace_data_type_failures(dat)
        self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        pdf2 = pdf.clone()
        pdf2.set_default_value("foods", "name", "a")
        pdf2.set_default_value("nutritionQuantities", "food", "a")
        pdf2.replace_data_type_failures(dat_copied)
        self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        self.assertFalse(pdf.find_data_type_failures(dat_copied))
        dups = pdf.find_duplicates(dat_copied)
        self.assertTrue(
            len(dups) == 2 and len(dups["foods"]) == 1
            and len(dups["nutritionQuantities"]) == 2)

        from pandas import isnull

        def noneify(iter_of_tuples):
            return {
                tuple(None if isnull(_) else _ for _ in tuple_)
                for tuple_ in iter_of_tuples
            }

        self.assertTrue(
            noneify(errs['nutritionQuantities', 'food'].itertuples(
                index=False)) == {(None, "1", 5), (None, "2", 5)})
        self.assertTrue(
            noneify(errs['foods',
                         'name'].itertuples(index=False)) == {(None, 101)})
        pdf = PanDatFactory(**tdf.schema())
        pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
        pdf.set_data_type("nutritionQuantities",
                          "food",
                          nullable=True,
                          strings_allowed='*')
        self.assertFalse(pdf.find_data_type_failures(dat))
        pdf.set_data_type("foods", "cost", nullable=False)
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 1)
        self.assertTrue(
            noneify(errs['foods',
                         'cost'].itertuples(index=False)) == {('b', None)})
コード例 #2
0
ファイル: testpandat_io.py プロジェクト: nandi6uc/ticdat
    def test_datetime(self):
        core_path = os.path.join(_scratchDir, "parameters")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        pdf.add_parameter("p2", None, datetime=True, nullable=True)
        pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                               [
                                   datetime.datetime.now(),
                                   dateutil.parser.parse("Sept 11 2011")
                               ]],
            parameters=[["p1", "7/11/1911"], ["p2", None]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(
            pdf.find_data_type_failures(dat)
            or pdf.find_data_row_failures(dat))

        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            func = "write_directory" if attr == "csv" else "write_file"
            getattr(getattr(pdf, attr), func)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            self.assertFalse(pdf._same_data(dat, dat_1))
            self.assertFalse(
                pdf.find_data_type_failures(dat_1)
                or pdf.find_data_row_failures(dat_1))
            dat_1 = pdf.copy_to_tic_dat(dat_1)
            self.assertTrue(set(dat_1.parameters) == {'p1', 'p2'})
            self.assertTrue(
                isinstance(dat_1.parameters["p1"]["b"],
                           (datetime.datetime, numpy.datetime64))
                and not pd.isnull(dat_1.parameters["p1"]["b"]))
            self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime,
                                   numpy.datetime64)) and not pd.isnull(_)
                    for _ in dat_1.table_with_stuffs))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime, numpy.datetime64))
                    or _ is None or utils.safe_apply(math.isnan)(_)
                    for v in dat_1.table_with_stuffs.values()
                    for _ in v.values()))
            self.assertTrue({
                pd.isnull(_)
                for v in dat_1.table_with_stuffs.values() for _ in v.values()
            } == {True, False})
コード例 #3
0
ファイル: testpandat_io.py プロジェクト: nandi6uc/ticdat
    def test_nullables(self):
        core_path = os.path.join(_scratchDir, "nullables")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
        pdf.set_data_type("table_with_stuffs", "field one")
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          number_allowed=False,
                          strings_allowed='*',
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(pdf.find_data_type_failures(dat))

        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            f_or_d = "directory" if attr == "csv" else "file"
            write_func, write_kwargs = utils._get_write_function_and_kwargs(
                pdf, path, f_or_d)
            write_func(dat, path, **write_kwargs)
            dat_1 = utils._get_dat_object(pdf, "create_pan_dat", path, f_or_d,
                                          False)
            self.assertTrue(
                pdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
コード例 #4
0
    def testDataTypes(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [10,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5
        ticdat.nutritionQuantities['a', 2] = 12

        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        self.assertFalse(pdf.find_data_type_failures(pandat))
        pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
        self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

        pdf = PanDatFactory(**dietSchema())
        pdf.set_data_type("foods", "cost", nullable=False)
        pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue(set({(v["food"], v["category"])
                             for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

        failed = pdf.find_data_type_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat), {("nutritionQuantities", "qty"): 5.15})
        self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
        self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed="*")
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
        pdf.replace_data_type_failures(pandat)
        self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
コード例 #5
0
ファイル: testpandat_utils.py プロジェクト: adampkehoe/ticdat
 def test_data_type_max_failures(self):
     pdf = PanDatFactory(table_one=[["Field"], []],
                         table_two=[[], ["Field"]])
     for t in ["table_one", "table_two"]:
         pdf.set_data_type(t, "Field")
     dat = pdf.PanDat(table_one=DataFrame(
         {"Field": list(range(1, 11)) + [-_ for _ in range(1, 11)]}),
                      table_two=DataFrame(
                          {"Field": [10.1] * 10 + [-2] * 10}))
     errs = pdf.find_data_type_failures(dat)
     self.assertTrue(
         len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
     errs = pdf.find_data_type_failures(dat, max_failures=11)
     self.assertTrue(len(errs) == 2)
     self.assertTrue(
         any(len(_) == 10 for _ in errs.values())
         and any(len(_) == 1 for _ in errs.values()))
     errs = pdf.find_data_type_failures(dat, max_failures=10)
     self.assertTrue(
         len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
     errs = pdf.find_data_type_failures(dat, max_failures=9)
     self.assertTrue(
         len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))