Ejemplo n.º 1
0
 def _read_cell(x, field):
     """Convert one raw xlrd cell value for (table, field) into its canonical ticdat value."""
     # reminder - data fields have a default default of zero, primary keys don't get a default default
     dv = self.tic_dat_factory.default_values.get(table, {}).get(
         field, ["LIST", "NOT", "POSSIBLE"])
     dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
     cell = x[field_indicies[field]]
     # blank reads as null when the type (or the default value) says it can
     if cell == "" and ((dt and dt.nullable) or (not dt and dv is None)):
         return None
     if treat_inf_as_infinity and utils.stringish(cell):
         lowered = cell.lower()
         if lowered in ("inf", "-inf"):
             return float(lowered)
     if dt and dt.must_be_int and utils.numericish(cell) and \
        utils.safe_apply(int)(cell) == cell:
         cell = int(cell)
     if cell == "":
         # None plays the role of an infinity flag for the general read-cell pathway
         flagged = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(flagged):
             return flagged
     if dt and dt.datetime and utils.numericish(cell):
         # numeric cells in datetime fields are Excel serial dates
         cell = utils.safe_apply(
             lambda: xlrd.xldate_as_tuple(cell, datemode))()
         if cell is not None:
             maker = utils.pd.Timestamp if utils.pd else datetime.datetime
             return maker(year=cell[0], month=cell[1], day=cell[2],
                          hour=cell[3], minute=cell[4], second=cell[5])
     return self.tic_dat_factory._general_read_cell(table, field, cell)
Ejemplo n.º 2
0
 def _read_cell(x, field):
     """Convert one raw xlrd cell value for (table, field) into its canonical ticdat value."""
     dv, dt = self._get_dv_dt(table, field)
     cell = x[field_indicies[field]]
     # blank reads as null when the type (or the default value) says it can
     if cell == "" and ((dt and dt.nullable) or (not dt and dv is None)):
         return None
     if treat_inf_as_infinity and utils.stringish(cell):
         lowered = cell.lower()
         if lowered in ("inf", "-inf"):
             return float(lowered)
     if dt and dt.must_be_int and utils.numericish(cell) and \
        utils.safe_apply(int)(cell) == cell:
         cell = int(cell)
     if cell == "":
         # None plays the role of an infinity flag for the general read-cell pathway
         flagged = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(flagged):
             return flagged
     if dt and dt.datetime and utils.numericish(cell):
         # numeric cells in datetime fields are Excel serial dates
         cell = utils.safe_apply(
             lambda: xlrd.xldate_as_tuple(cell, datemode))()
         if cell is not None:
             maker = utils.pd.Timestamp if utils.pd else datetime.datetime
             return maker(year=cell[0], month=cell[1], day=cell[2],
                          hour=cell[3], minute=cell[4], second=cell[5])
     return self.tic_dat_factory._general_read_cell(table, field, cell)
Ejemplo n.º 3
0
    def set_data_type(self, table, field, number_allowed = True,
                      inclusive_min = True, inclusive_max = False, min = 0, max = float("inf"),
                      must_be_int = False, strings_allowed= (), nullable = False):
        """
        sets the data type for a field. By default, fields don't have types. Adding a data type doesn't block
        data of the wrong type from being entered. Data types are useful for recognizing errant data entries
        with find_data_type_failures(). Errant data entries can be replaced with replace_data_type_failures().

        :param table: a table in the schema

        :param field: a data field for this table

        :param number_allowed: boolean does this field allow numbers?

        :param inclusive_min: boolean : if number allowed, is the min inclusive?

        :param inclusive_max: boolean : if number allowed, is the max inclusive?

        :param min: if number allowed, the minimum value

        :param max: if number allowed, the maximum value

        :param must_be_int: boolean : if number allowed, must the number be integral?

        :param strings_allowed: if a collection - then a list of the strings allowed.
                                The empty collection prohibits strings.
                                If a "*", then any string is accepted.
        :param nullable : boolean : can this value contain null (aka None aka nan (since pandas treats null as nan))

        :return:
        """
        verify(not self._has_been_used,
               "The data types can't be changed after a PanDatFactory has been used.")
        verify(table in self.all_tables, "Unrecognized table name %s"%table)
        verify(table not in self.generic_tables, "Cannot set data type for generic table")
        verify(field in self.data_fields[table] + self.primary_key_fields[table],
               "%s does not refer to a field for %s"%(field, table))

        verify((strings_allowed == '*') or
               (containerish(strings_allowed) and all(utils.stringish(x) for x in strings_allowed)),
"""The strings_allowed argument should be a container of strings, or the single '*' character.""")
        if utils.containerish(strings_allowed):
            strings_allowed = tuple(strings_allowed) # defensive copy
        if number_allowed:
            verify(utils.numericish(max), "max should be numeric")
            verify(utils.numericish(min), "min should be numeric")
            verify(max >= min, "max cannot be smaller than min")
            self._data_types[table][field] = TypeDictionary(number_allowed=True,
                strings_allowed=strings_allowed,  nullable = bool(nullable),
                min = min, max = max, inclusive_min= bool(inclusive_min), inclusive_max = bool(inclusive_max),
                must_be_int = bool(must_be_int))
        else :
            self._data_types[table][field] = TypeDictionary(number_allowed=False,
                strings_allowed=strings_allowed,  nullable = bool(nullable),
                min = 0, max = float("inf"), inclusive_min= True, inclusive_max = True,
                must_be_int = False)
Ejemplo n.º 4
0
 def add_var(self, lb=0, ub=float("inf"), type="continuous", name=""):
     """
     Add a variable to the model.
     :param lb: The lower bound of the variable.
     :param ub: The upper bound of the variable.
     :param type: either 'binary' or 'continuous'
     :param name: The name of the variable. (Ignored if falsey).
     :return: The variable object associated with the model_type engine API
     """
     verify(type in ["continuous", "binary"],
            "type needs to be 'continuous' or 'binary'")
     verify(utils.numericish(lb) and utils.numericish(ub),
            "lb, ub need to be numbers")
     verify(ub >= lb, "lb cannot be bigger than ub")
     verify(lb < float("inf"), "lb cannot be positive infinity")
     verify(ub > -float("inf"), "ub cannot be negative infinity")
     if type == "binary":
         # an unbounded binary variable is implicitly capped at 1
         if ub == float("inf"):
             ub = 1
         verify(lb in [0, 1] and ub in [0, 1],
                "lb,ub need to be 0 or 1 when type = 'binary'")
     name_dict = {"name": name} if name else {}
     if self.model_type == "gurobi":
         gurobi_types = {"continuous": gurobi.GRB.CONTINUOUS,
                         "binary": gurobi.GRB.BINARY}
         return self.core_model.addVar(lb=lb, ub=ub, vtype=gurobi_types[type],
                                       **name_dict)
     if self.model_type == "cplex":
         if type == "continuous":
             return self.core_model.continuous_var(lb=lb, ub=ub, **name_dict)
         var = self.core_model.binary_var(**name_dict)
         # binary bounds of ub==0 or lb==1 pin the variable to a constant
         fixed_value = ub if ub == 0 else (lb if lb == 1 else None)
         if utils.numericish(fixed_value):
             self.core_model.add_constraint(var == fixed_value)
         return var
     if self.model_type == "xpress":
         xpress_types = {"continuous": xpress.continuous,
                         "binary": xpress.binary}
         var = xpress.var(lb=lb, ub=ub, vartype=xpress_types[type], **name_dict)
         self.core_model.addVariable(var)
         return var
Ejemplo n.º 5
0
 def _read_cell(x, field):
     """Convert one raw spreadsheet cell value for (table, field) into its canonical ticdat value."""
     dv, dt = self._get_dv_dt(table, field)
     cell = x[field_indicies[field]]
     # blank reads as null when the type (or the default value) says it can
     if cell == "" and ((dt and dt.nullable) or (not dt and dv is None)):
         return None
     if treat_inf_as_infinity and utils.stringish(cell):
         lowered = cell.lower()
         if lowered in ("inf", "-inf"):
             return float(lowered)
     if dt and dt.must_be_int and utils.numericish(cell) and utils.safe_apply(int)(cell) == cell:
         cell = int(cell)
     if cell == "":
         # None plays the role of an infinity flag for the general read-cell pathway
         flagged = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(flagged):
             return flagged
     if dt and dt.datetime and utils.numericish(cell) and hasattr(sheet, "xldate_as_tuple_munge"):
         cell = sheet.xldate_as_tuple_munge(cell)
     return self.tic_dat_factory._general_read_cell(table, field, cell)
Ejemplo n.º 6
0
def _read_data(x):
    """Map mdb sentinel magnitudes back onto +/- infinity; pass everything else through."""
    if not utils.numericish(x):
        return x
    if x >= _mdb_inf:
        return float("inf")
    if x <= -_mdb_inf:
        return -float("inf")
    return x
Ejemplo n.º 7
0
 def _convert_float(x, field):
     """Return the cell for field, coercing integral floats to int when the field is typed must_be_int."""
     val = x[field_indicies[field]]
     if not utils.numericish(val) or utils.safe_apply(int)(val) != val:
         return val
     if table in data_types and field in data_types[table] and \
        data_types[table][field].must_be_int:
         return int(val)
     return val
Ejemplo n.º 8
0
 def _create_tic_dat_dict(self, xls_file_path, row_offsets,
                          headers_present):
     """Read every non-generator sheet of the workbook into a dict of table objects.

     :param xls_file_path: path of the Excel workbook to read
     :param row_offsets: dict mapping table names to a non-negative count of
                         leading rows to skip
     :param headers_present: boolean - does each sheet start with a header row?
     :return: dict mapping table name to table data - keyed-by-primary-key dict
              for pk tables, list of field dicts for generic tables, list of
              data tuples otherwise; generator tables map to generator objects
     """
     verify(
         utils.dictish(row_offsets)
         and set(row_offsets).issubset(self.tic_dat_factory.all_tables)
         and all(
             utils.numericish(x) and (x >= 0)
             for x in row_offsets.values()),
         "row_offsets needs to map from table names to non negative row offset"
     )
     # default every table to a zero offset, then overlay the caller's offsets
     row_offsets = dict({t: 0
                         for t in self.tic_dat_factory.all_tables},
                        **row_offsets)
     tdf = self.tic_dat_factory
     rtn = {}
     sheets, field_indicies = self._get_sheets_and_fields(
         xls_file_path,
         set(tdf.all_tables).difference(tdf.generator_tables),
         row_offsets,
         headers_present,
         print_missing_tables=True)
     # ho: the extra row consumed by the header, if there is one
     ho = 1 if headers_present else 0
     for tbl, sheet in sheets.items():
         fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(
             tbl, ())
         assert fields or tbl in self.tic_dat_factory.generic_tables
         indicies = field_indicies[tbl]
         # the shortest populated column bounds the usable row count
         table_len = min(
             len(sheet.col_values(indicies[field]))
             for field in (fields or indicies))
         if tdf.primary_key_fields.get(tbl, ()):
             # pk table: map primary-key tuple -> data-field tuple
             tableObj = {
                 self._sub_tuple(tbl, tdf.primary_key_fields[tbl],
                                 indicies)(x):
                 self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()),
                                 indicies)(x)
                 for x in (sheet.row_values(i)
                           for i in range(table_len)[row_offsets[tbl] +
                                                     ho:])
             }
         elif tbl in tdf.generic_tables:
             # generic table: list of {field name: value} dicts
             tableObj = [{
                 f: x[i]
                 for f, i in field_indicies[tbl].items()
             } for x in (sheet.row_values(i)
                         for i in range(table_len)[row_offsets[tbl] + ho:])]
         else:
             # no primary key: list of data-field tuples
             tableObj = [
                 self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()),
                                 indicies)(x)
                 for x in (sheet.row_values(i)
                           for i in range(table_len)[row_offsets[tbl] +
                                                     ho:])
             ]
         rtn[tbl] = tableObj
     # generator tables are represented by lazily-evaluated generator objects
     for tbl in tdf.generator_tables:
         rtn[tbl] = self._create_generator_obj(xls_file_path, tbl,
                                               row_offsets[tbl],
                                               headers_present)
     return rtn
Ejemplo n.º 9
0
 def _inner_rtn(x):
     """Convert one raw cell value for (table, field) into its canonical typed value."""
     if table == "parameters" and self.tic_dat_factory.parameters:
         # fully specified parameters tables are converted elsewhere
         return x
     dv, dt = self._get_dv_dt(table, field)
     if x == "":
         # blank reads as null when the type, default, or infinity-flagging says it can
         if (dt and dt.nullable) or (not dt and dv is None) or \
            numericish(self.tic_dat_factory._general_read_cell(table, field, None)):
             return None
     wants_number = (dt and dt.number_allowed) or (not dt and numericish(dv)) or \
                    (table in self.tic_dat_factory.generic_tables)
     if wants_number:
         try:
             x = float(x)
             if int(x) == x and dt and dt.must_be_int:
                 x = int(x)
         except:
             pass
     return x
Ejemplo n.º 10
0
 def data_type(t, f):
     """Infer a sqlite column type for (t, f) from the field's default value."""
     default_value = default_(t, f)
     if numericish(default_value):
         return "INT" if safe_apply(int)(default_value) == default_value else "FLOAT"
     # the TEXT data type doesn't seem to have much value for my purposes.
     return ""
Ejemplo n.º 11
0
 def data_type(t, f):
     """Infer a sqlite column type for (t, f) from the field's default value."""
     # the TEXT data type doesn't seem to have much value for my purposes,
     # so non-numeric cases resolve to the empty string
     if t == "parameters" and self.tic_dat_factory.parameters:
         return ""
     default_value = default_(t, f)
     if numericish(default_value):
         return "INT" if safe_apply(int)(default_value) == default_value else "FLOAT"
     return ""
Ejemplo n.º 12
0
 def find_duplicates(self,
                     xls_file_path,
                     row_offsets=None,
                     headers_present=True):
     """
     Find the row counts for duplicated rows.
     :param xls_file_path: An Excel file containing sheets whose names match
                           the table names in the schema (non primary key tables ignored).
     :param row_offsets: (optional) A mapping from table names to initial
                         number of rows to skip (non primary key tables ignored)
     :param headers_present: Boolean. Does the first row of data contain the
                             column headers?
     caveats: Missing sheets resolve to an empty table, but missing primary fields
              on matching sheets throw an Exception.
              Sheet names are considered case insensitive.
     :return: A dictionary whose keys are the table names for the primary key tables.
              Each value of the return dictionary is itself a dictionary.
              The inner dictionary is keyed by the primary key values encountered
              in the table, and the value is the count of records in the
              Excel sheet with this primary key.
              Row counts smaller than 2 are pruned off, as they aren't duplicates
     """
     # None (rather than {}) as the default avoids the shared-mutable-default pitfall
     row_offsets = {} if row_offsets is None else row_offsets
     self._verify_differentiable_sheet_names()
     verify(xlrd, "xlrd needs to be installed to use this subroutine")
     verify(
         utils.dictish(row_offsets)
         and set(row_offsets).issubset(self.tic_dat_factory.all_tables)
         and all(
             utils.numericish(x) and (x >= 0)
             for x in row_offsets.values()),
         "row_offsets needs to map from table names to non negative row offset"
     )
     # default every table to a zero offset, then overlay the caller's offsets
     row_offsets = dict({t: 0
                         for t in self.tic_dat_factory.all_tables},
                        **row_offsets)
     tdf = self.tic_dat_factory
     pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
     rtn = {t: defaultdict(int) for t in pk_tables}
     sheets, fieldIndicies = self._get_sheets_and_fields(
         xls_file_path, pk_tables, row_offsets, headers_present)
     # ho: the extra row consumed by the header, if there is one
     ho = 1 if headers_present else 0
     for table, sheet in sheets.items():
         fields = tdf.primary_key_fields[table] + tdf.data_fields.get(
             table, ())
         indicies = fieldIndicies[table]
         # the shortest populated column bounds the usable row count
         table_len = min(
             len(sheet.col_values(indicies[field])) for field in fields)
         for x in (sheet.row_values(i)
                   for i in range(table_len)[row_offsets[table] + ho:]):
             rtn[table][self._sub_tuple(table,
                                        tdf.primary_key_fields[table],
                                        indicies)(x)] += 1
     # prune singleton counts - only true duplicates are reported
     for t in list(rtn.keys()):
         rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
         if not rtn[t]:
             del (rtn[t])
     return rtn
Ejemplo n.º 13
0
 def perform_predicate_checks(sch):
     """Run the row-predicate failure checks against pandat for the given schema."""
     pdf = PanDatFactory(**sch)
     cost_ok = lambda row: numericish(row["cost"]) and not isnan(row["cost"])
     pdf.add_data_row_predicate("foods", cost_ok, "cost")
     good_qty = lambda qty: numericish(qty) and 5 < qty <= 12
     pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
     minmax_ok = lambda row: all(map(numericish, [row["minNutrition"], row["maxNutrition"]])) \
                             and row["maxNutrition"] >= row["minNutrition"]
     pdf.add_data_row_predicate("categories", minmax_ok, "minmax")
     failed = pdf.find_data_row_failures(pandat)
     self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'),
                                     ('categories', 'minmax')})
     self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
     qty_failures = {(v["food"], v["category"])
                     for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}
     self.assertTrue(qty_failures == {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
     self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
     failed = pdf.find_data_row_failures(pandat, as_table=False)
     self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
Ejemplo n.º 14
0
 def _inner_rtn(x):
     """Convert one raw cell value for (table, field) into its canonical typed value."""
     if table == "parameters" and self.tic_dat_factory.parameters:
         # fully specified parameters tables are converted elsewhere
         return x
     # reminder - data fields have a default default of zero, primary keys don't get a default default
     dv = self.tic_dat_factory.default_values.get(table, {}).get(field, ["LIST", "NOT", "POSSIBLE"])
     dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
     if x == "":
         # blank reads as null when the type, default, or infinity-flagging says it can
         if (dt and dt.nullable) or (not dt and dv is None) or \
            numericish(self.tic_dat_factory._general_read_cell(table, field, None)):
             return None
     wants_number = (dt and dt.number_allowed) or (not dt and numericish(dv)) or \
                    (table in self.tic_dat_factory.generic_tables)
     if wants_number:
         try:
             x = float(x)
             if int(x) == x and dt and dt.must_be_int:
                 x = int(x)
         except:
             pass
     return x
Ejemplo n.º 15
0
 def perform_predicate_checks(sch):
     """Run the row-predicate checks, in both plain and "Error Message" flavors, for schema sch."""
     pdf = PanDatFactory(**sch)
     pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
     good_qty = lambda qty : 5 < qty <= 12
     pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
     pdf.add_data_row_predicate("categories",
                                lambda row: row["maxNutrition"] >= row["minNutrition"],
                                "minmax")
     # pdf2 mirrors pdf, but its predicates report failure via "<name> failed!" strings
     pdf2 = PanDatFactory(**sch)
     def make_error_message_predicate(f, name):
         # wrap predicate f so a falsy result becomes an explanatory message
         def error_message_predicate(row):
             rtn = f(row)
             if rtn:
                 return True
             return f"{name} failed!"
         return error_message_predicate
     for t, preds in pdf._data_row_predicates.items():
         for p_name, rpi in preds.items():
             pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                         predicate_name=p_name, predicate_failure_response="Error Message")
     failed = pdf.find_data_row_failures(pandat)
     failed2 = pdf2.find_data_row_failures(pandat)
     # both flavors must flag the same (table, predicate-name) pairs and rows
     self.assertTrue(set(failed) == set(failed2) ==  {('foods', 'cost'),
                                     ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
     self.assertTrue(set(failed['foods', 'cost']["name"]) == set(failed2['foods', 'cost']["name"]) == {'b'})
     for f in [failed, failed2]:
         self.assertTrue(set({(v["food"], v["category"])
                              for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                             {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
         self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
     for t, n in failed2:
         self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
     for _pdf in [pdf, pdf2]:
         failed = _pdf.find_data_row_failures(pandat, as_table=False)
         self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
         # pandat_2 holds a categories row with a string value, so minmax raises TypeError...
         ex = []
         try:
             _pdf.find_data_row_failures(pandat_2)
         except Exception as e:
             ex[:] = [str(e.__class__)]
         self.assertTrue("TypeError" in ex[0])
         # ...unless exceptions are downgraded to row failures
         failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
         self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
     failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
     df = failed['categories', 'minmax']
     err_str = list(df[df['name'] == '3']["Error Message"])[0]
     self.assertTrue(err_str=="Exception<'>=' not supported between instances of 'int' and 'str'>")
Ejemplo n.º 16
0
    def testDataPredicates(self):
        """Exercise add_data_row_predicate / find_data_row_failures on diet and netflow data."""
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        # build a small diet instance; foods["b"] deliberately has a None cost
        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # pandat_2 tweaks one qty and adds a categories row containing a string value
        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            # run the same battery of predicate checks for schema sch
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            good_qty = lambda qty : 5 < qty <= 12
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")
            failed = pdf.find_data_row_failures(pandat)
            self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
            self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
            self.assertTrue(set({(v["food"], v["category"])
                                 for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                                {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
            failed = pdf.find_data_row_failures(pandat, as_table=False)
            self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
            failed = pdf.find_data_row_failures(pandat_2)
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})

        # both the explicit schema and the '*' generic form should behave the same
        perform_predicate_checks(dietSchema())
        perform_predicate_checks({t:'*' for t in dietSchema()})

        # netflow: the predicate machinery should coexist with foreign keys
        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # a predicate that always passes reports no failures
        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
Ejemplo n.º 17
0
 def _write_data_cell(self, t, f, x):
     """Apply infinity flagging, then normalize numerics - int when integral, float otherwise."""
     cell = self.tdf._infinity_flag_write_cell(t, f, x)
     if not numericish(cell):
         return cell
     return int(cell) if safe_apply(int)(cell) == cell else float(cell)
Ejemplo n.º 18
0
    def testDataPredicates(self):
        """Exercise add_data_row_predicate / find_data_row_failures, including the
        "Error Message" predicate_failure_response flavor, on diet and netflow data."""
        # this test won't run properly if the -O flag is applied
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        # build a small diet instance; foods["b"] deliberately has a None cost
        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # pandat_2 tweaks one qty and adds a categories row containing a string value
        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            # run the predicate battery for schema sch (explicit dict or '*' generic form)
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            good_qty = lambda qty : 5 < qty <= 12
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")
            # pdf2 mirrors pdf, but failures produce "<name> failed!" error strings
            pdf2 = PanDatFactory(**sch)
            def make_error_message_predicate(f, name):
                # wrap predicate f so a falsy result becomes an explanatory message
                def error_message_predicate(row):
                    rtn = f(row)
                    if rtn:
                        return True
                    return f"{name} failed!"
                return error_message_predicate
            for t, preds in pdf._data_row_predicates.items():
                for p_name, rpi in preds.items():
                    pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                                predicate_name=p_name, predicate_failure_response="Error Message")
            failed = pdf.find_data_row_failures(pandat)
            failed2 = pdf2.find_data_row_failures(pandat)
            # both flavors must flag the same (table, predicate-name) pairs and rows
            self.assertTrue(set(failed) == set(failed2) ==  {('foods', 'cost'),
                                            ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
            self.assertTrue(set(failed['foods', 'cost']["name"]) == set(failed2['foods', 'cost']["name"]) == {'b'})
            for f in [failed, failed2]:
                self.assertTrue(set({(v["food"], v["category"])
                                     for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
                self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
            for t, n in failed2:
                self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
            for _pdf in [pdf, pdf2]:
                failed = _pdf.find_data_row_failures(pandat, as_table=False)
                self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
                # pandat_2's string-valued categories row makes minmax raise TypeError...
                ex = []
                try:
                    _pdf.find_data_row_failures(pandat_2)
                except Exception as e:
                    ex[:] = [str(e.__class__)]
                self.assertTrue("TypeError" in ex[0])
                # ...unless exceptions are downgraded to row failures
                failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
                self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
            failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
            df = failed['categories', 'minmax']
            err_str = list(df[df['name'] == '3']["Error Message"])[0]
            self.assertTrue(err_str=="Exception<'>=' not supported between instances of 'int' and 'str'>")

        perform_predicate_checks(dietSchema())
        perform_predicate_checks({t:'*' for t in dietSchema()})

        # netflow: the predicate machinery should coexist with foreign keys
        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        # a predicate that always passes reports no failures
        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})

        # pre-existing "Error Message" columns get disambiguated with (1), (2), ... suffixes
        pdf = PanDatFactory(table=[[],["Field", "Error Message", "Error Message (1)"]])
        pdf.add_data_row_predicate("table", predicate=lambda row: f"Oops {row['Field']}" if row["Field"] > 1 else True,
                                   predicate_name="silly", predicate_failure_response="Error Message")
        df = DataFrame({"Field":[2, 1], "Error Message":["what", "go"], "Error Message (1)": ["now", "go"]})
        fails = pdf.find_data_row_failures(pdf.PanDat(table=df))
        df = fails["table", "silly"]
        self.assertTrue(list(df.columns) == ["Field", "Error Message", "Error Message (1)", "Error Message (2)"])
        self.assertTrue(set(df["Field"]) == {2} and set(df["Error Message (2)"]) == {'Oops 2'})
Ejemplo n.º 19
0
def _write_data(x):
    """Coerce a cell value into MDB-writable form.

    Numeric values are clamped into the interval [-_mdb_inf, _mdb_inf]
    (MDB has no representation for infinity); non-numeric values pass
    through unchanged.
    """
    if not numericish(x):
        return x
    if x > _mdb_inf:
        return _mdb_inf
    if x < -_mdb_inf:
        return -_mdb_inf
    return x
Ejemplo n.º 20
0
 def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present,
                          treat_inf_as_infinity):
     """Read the workbook at xls_file_path into a {table_name: table_obj} dict.

     :param xls_file_path: path to the .xls/.xlsx workbook being read
     :param row_offsets: dict mapping table names to non-negative row
                         offsets; tables absent from the dict default to 0
     :param headers_present: truthy when each sheet carries a header row,
                             in which case one extra row is skipped
     :param treat_inf_as_infinity: truthy when the strings "inf"/"-inf"
                                   should be read as float infinities
     :return: dict mapping each non-generic table to either a
              {pk_tuple: data_tuple} dict (tables with primary keys) or a
              list of data tuples; generator tables map to generator
              objects. Generic tables are omitted (read via PanDatFactory).
     """
     tiai = treat_inf_as_infinity
     verify(
         utils.dictish(row_offsets)
         and set(row_offsets).issubset(self.tic_dat_factory.all_tables)
         and all(
             utils.numericish(x) and (x >= 0)
             for x in row_offsets.values()),
         "row_offsets needs to map from table names to non negative row offset"
     )
     # fill in a zero offset for every table not explicitly mentioned
     row_offsets = dict({t: 0
                         for t in self.tic_dat_factory.all_tables},
                        **row_offsets)
     tdf = self.tic_dat_factory
     rtn = {}
     sheets, field_indicies, dm = self._get_sheets_and_fields(
         xls_file_path,
         set(tdf.all_tables).difference(tdf.generator_tables),
         row_offsets,
         headers_present,
         print_missing_tables=True)
     ho = 1 if headers_present else 0
     # .xlsx sheets are read via the iter_cols/iter_rows API, legacy .xls
     # sheets via col_values/row_values. Only the row/column access differs,
     # so both formats share one loop instead of two near-duplicate loops.
     use_iter_api = xls_file_path.endswith('.xlsx')
     for tbl, sheet in sheets.items():
         if tbl in tdf.generic_tables:
             continue  # generic tables will be read via PanDatFactory
         fields = tdf.primary_key_fields.get(tbl,
                                             ()) + tdf.data_fields.get(
                                                 tbl, ())
         assert fields, "non-generic tables must declare at least one field"
         indicies = field_indicies[tbl]
         # table_len: shortest populated column among the table's fields
         if use_iter_api:
             cols = list(self.iter_cols(sheet))
             table_len = min(
                 len(cols[indicies[field]]) for field in fields)
             row_list = list(self.iter_rows(sheet))
             get_row = row_list.__getitem__
         else:
             table_len = min(
                 len(sheet.col_values(indicies[field])) for field in fields)
             get_row = sheet.row_values
         # hoist the _sub_tuple factories out of the comprehensions - they
         # are loop-invariant and used to be re-created once per row
         data_maker = self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()),
                                      indicies, tiai, dm)
         rows = (get_row(i)
                 for i in range(table_len)[row_offsets[tbl] + ho:])
         if tdf.primary_key_fields.get(tbl, ()):
             pk_maker = self._sub_tuple(tbl, tdf.primary_key_fields[tbl],
                                        indicies, tiai, dm)
             rtn[tbl] = {pk_maker(x): data_maker(x) for x in rows}
         else:
             rtn[tbl] = [data_maker(x) for x in rows]
     for tbl in tdf.generator_tables:
         rtn[tbl] = self._create_generator_obj(xls_file_path, tbl,
                                               row_offsets[tbl],
                                               headers_present, tiai)
     return rtn