class XlsPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with panDat objects.
    Don't create this object explicitly. A XlsPanFactory will automatically be associated
    with the xls attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A XlsPanFactory will automatically be associated
        with the xls attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
        """
        Create a PanDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :return: a PanDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields on matching sheets
                 throw an Exception (unless fill_missing_fields is truthy).
                 Table names are matched to sheets with case-space insensitivity, but spaces and
                 case are respected for field names. (ticdat supports whitespace in field names
                 but not table names).
                 Note that if you save a DataFrame to excel and then recover it, the type of data
                 might change. For example

                    df = pd.DataFrame({"a":["100", "200", "300"]})
                    df.to_excel("something.xlsx")
                    df2 = pd.read_excel("something.xlsx")

                 results in a numeric column in df2. To address this, you need to
                 use set_data_type for your PanDatFactory. This problem is even worse with

                    df = pd.DataFrame({"a":["0100", "1200", "2300"]})
        """
        rtn = {}
        # _get_sheet_names has already resolved each matched table to a single sheet
        for t, s in self._get_sheet_names(xls_file_path).items():
            # dtype hints from the factory protect text-like columns from numeric coercion
            rtn[t] = pd.read_excel(xls_file_path, sheet_name=s,
                                   dtype=self.pan_dat_factory._dtypes_for_pandas_read(t))
        missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
        if missing_tables:
            # a missing sheet is a soft failure - it resolves to an empty table downstream
            print("The following table names could not be found in the %s file.\n%s\n" %
                  (xls_file_path, "\n".join(missing_tables)))
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s" %
               (xls_file_path, missing_fields))
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_sheet_names(self, xls_file_path):
        # Map each schema table to the single sheet whose name matches it with
        # case-space insensitivity; raise on ambiguous (duplicated) matches.
        sheets = defaultdict(list)
        try:
            xl = pd.ExcelFile(xls_file_path)
        except Exception as e:
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        for table, sheet in product(self.pan_dat_factory.all_tables, xl.sheet_names):
            # only the first _longest_sheet characters are compared because Excel
            # truncates long sheet names
            if table.lower()[:_longest_sheet] == sheet.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        return sheets

    def write_file(self, pan_dat, file_path, case_space_sheet_names=False):
        """
        write the panDat data to an excel file

        :param pan_dat: the PanDat object to write

        :param file_path: The file path of the excel file to create

        :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper case
                                       characters to sheet names

        :return:

        caveats: The row names (index) isn't written.
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        # only prettify sheet names when doing so cannot create name collisions
        case_space_sheet_names = case_space_sheet_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        # bug fix: the original called writer.save(), which was removed in pandas 2.0;
        # the context manager form (ExcelWriter.__exit__ -> close) works on both old
        # and new pandas and also releases the file handle on error
        with pd.ExcelWriter(file_path) as writer:
            for t in self.pan_dat_factory.all_tables:
                getattr(pan_dat, t).to_excel(writer,
                                             case_space_to_pretty(t) if case_space_sheet_names else t,
                                             index=False)
class JsonPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json data with PanDat objects.
    Don't create this object explicitly. A JsonPanFactory will automatically be associated
    with the json attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will automatically be associated
        with the json attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # probe the installed pandas: "modern" pandas accepts an "index" argument
        # on DataFrame.to_json, older versions do not (see write_file)
        to_json_args = inspect.getfullargspec(pd.DataFrame.to_json).args
        assert "orient" in to_json_args
        self._modern_pandas = "index" in to_json_args
        self._isFrozen = True

    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON file or string

        :param path_or_buf: a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json
                       for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always resolve to an empty table.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 Note that if you save a DataFrame to json and then recover it, the type of data
                 might change. Specifically, text that looks numeric might be recovered as a
                 number, to include the loss of leading zeros. To address this, you need to
                 either use set_data_type for your PanDatFactory, or specify "dtype" in kwargs.
                 (The former is obviously better).
        """
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf),
                   "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        rtn = {}
        for t, f in tbl_names.items():
            kwargs_ = dict(kwargs)
            if "dtype" not in kwargs_:
                # unless the caller overrides it, let the schema's data types guard
                # against unwanted numeric coercion
                kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
            rtn[t] = pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs_)
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" %
               [(t, f) for t, f in missing_fields])
        missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            # bug fix: this message previously said "SQLite database" (copy-paste from
            # SqlPanFactory) even though this is the json reader
            print("The following table names could not be found in the json file/string.\n%s\n" %
                  "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)

    def _get_table_names(self, loaded_dict):
        # Map each schema table to the unique dictionary key that matches it with
        # case-space insensitivity; raise on ambiguous matches.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [c for c in loaded_dict
                          if c.lower().replace(" ", "_") == table.lower()]
            verify(len(rtn[table]) <= 1,
                   "Multiple dictionary key choices found for table %s" % table)
            if rtn[table]:
                rtn[table] = rtn[table][0]
            else:
                rtn.pop(table)
        return rtn

    def write_file(self, pan_dat, json_file_path, case_space_table_names=False, orient='split',
                   index=False, indent=2, sort_keys=False, **kwargs):
        """
        write the PanDat data to a json file (or json string)

        :param pan_dat: the PanDat object to write

        :param json_file_path: the json file into which the data is to be written. If falsey, will
                               return a JSON string

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param orient: Indication of expected JSON string format. See pandas.to_json for more
                       details.

        :param index: boolean - whether or not to write the index.

        :param indent: 2. See json.dumps

        :param sort_keys: See json.dumps

        :param kwargs: additional named arguments to pass to pandas.to_json

        :return:
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        if self._modern_pandas:
            # FYI - pandas Exception: ValueError: 'index=False' is only valid when
            # 'orient' is 'split' or 'table'
            kwargs["index"] = index if orient in ("split", "table") else True
        # only prettify table names when doing so cannot create name collisions
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        rtn = {}
        from ticdat.pandatfactory import _faster_df_apply
        for t in self.pan_dat_factory.all_tables:
            # use inf/-inf sentinel strings so json round trips preserve infinities
            df = getattr(pan_dat, t).copy(deep=True).replace(float("inf"), "inf").\
                 replace(-float("inf"), "-inf")
            for f in df.columns:
                dt = self.pan_dat_factory.data_types.get(t, {}).get(f, None)
                if dt and dt.datetime:
                    # pandas can be a real PIA when trying to mix types in a column
                    def fixed(row):  # this might not always fix things
                        if isinstance(row[f], (pd.Timestamp, numpy.datetime64)):
                            return str(row[f])
                        if pd.isnull(row[f]):
                            return None
                        return row[f]
                    df[f] = _faster_df_apply(df, fixed)
            k = case_space_to_pretty(t) if case_space_table_names else t
            rtn[k] = json.loads(df.to_json(path_or_buf=None, orient=orient, **kwargs))
            if orient == 'split' and not index:
                rtn[k].pop("index", None)
        if json_file_path:
            with open(json_file_path, "w") as f:
                json.dump(rtn, f, indent=indent, sort_keys=sort_keys)
        else:
            return json.dumps(rtn, indent=indent, sort_keys=sort_keys)
class CsvPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with PanDat objects.
    Don't create this object explicitly. A CsvPanFactory will automatically be associated
    with the csv attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A CsvPanFactory will automatically be associated
        with the csv attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from a directory of csv files.

        :param dir_path: the directory containing the .csv files.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param kwargs: additional named arguments to pass to pandas.read_csv

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 Note that a csv round trip can change the type of data. For example,
                 a column of text like "100", "200", "300" (and, even worse, "0100",
                 "1200", "2300") is recovered as numbers. To address this, either use
                 set_data_type for your PanDatFactory, or specify "dtype" in kwargs.
                 (The former is obviously better).
        """
        verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
        table_files = self._get_table_names(dir_path)
        rtn = {}
        for table, csv_path in table_files.items():
            read_args = dict(kwargs)
            if "dtype" not in read_args:
                # unless the caller overrides it, let the schema's data types guard
                # against unwanted numeric coercion
                read_args["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(table)
            rtn[table] = pd.read_csv(csv_path, **read_args)
        missing_tables = set(self.pan_dat_factory.all_tables).difference(rtn)
        if missing_tables:
            print("The following table names could not be found in the %s directory.\n%s\n" %
                  (dir_path, "\n".join(missing_tables)))
        missing_fields = {(table, field) for table in rtn
                          for field in all_fields(self.pan_dat_factory, table)
                          if field not in rtn[table].columns}
        if fill_missing_fields:
            for table, field in missing_fields:
                rtn[table][field] = self.pan_dat_factory.default_values[table][field]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" %
               [(table, os.path.basename(table_files[table]), field)
                for table, field in missing_fields])
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_table_names(self, dir_path):
        # Resolve each schema table to the unique csv file in dir_path whose
        # (lower-cased, space-as-underscore) name matches "<table>.csv".
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            target_name = "%s.csv" % table.lower()
            candidates = []
            for file_name in os.listdir(dir_path):
                full_path = os.path.join(dir_path, file_name)
                if os.path.isfile(full_path) and \
                   file_name.lower().replace(" ", "_") == target_name:
                    candidates.append(full_path)
            verify(len(candidates) <= 1,
                   "Multiple possible csv files found for table %s" % table)
            if candidates:
                rtn[table] = candidates[0]
        return rtn

    def write_directory(self, pan_dat, dir_path, case_space_table_names=False, index=False,
                        **kwargs):
        """
        write the PanDat data to a collection of csv files

        :param pan_dat: the PanDat object to write

        :param dir_path: the directory in which to write the csv files

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param index: boolean - whether or not to write the index.

        :param kwargs: additional named arguments to pass to pandas.to_csv

        :return:

        caveats: The row names (index) isn't written (unless kwargs indicates it should be).
        """
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        kwargs["index"] = index
        # only prettify table names when doing so cannot create name collisions
        prettify = case_space_table_names and \
                   len(set(self.pan_dat_factory.all_tables)) == \
                   len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        for table in self.pan_dat_factory.all_tables:
            file_name = (case_space_to_pretty(table) if prettify else table) + ".csv"
            getattr(pan_dat, table).to_csv(os.path.join(dir_path, file_name), **kwargs)
class SqlPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing SQLite files with PanDat objects.
    Don't create this object explicitly. A SqlPanFactory will automatically be associated
    with the sql attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A SqlPanFactory will automatically be associated
        with the sql attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, db_file_path, con=None, fill_missing_fields=False):
        """
        Create a PanDat object from a SQLite database file

        :param db_file_path: A SQLite DB File. Set to falsey if using con argument

        :param con: A connection object that can be passed to pandas read_sql.
                    Set to falsey if using db_file_path argument.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always resolve to an empty table, but missing fields on matching
                 tables throw an exception (unless fill_missing_fields is truthy).
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
        """
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        if db_file_path:
            verify(os.path.exists(db_file_path) and not os.path.isdir(db_file_path),
                   "%s not a file path" % db_file_path)
        rtn = {}
        # either open (and later close) a connection to the file, or wrap the
        # caller-provided connection in a do-nothing context manager
        con_maker = lambda: _sql_con(db_file_path) if db_file_path else _DummyContextManager(con)
        with con_maker() as _:
            con_ = con or _
            for t, s in self._get_table_names(con_).items():
                rtn[t] = pd.read_sql(sql="Select * from [%s]" % s, con=con_)
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s" %
               (db_file_path, missing_fields))
        missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            print("The following table names could not be found in the SQLite database.\n%s\n" %
                  "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_table_names(self, con):
        # Resolve each schema table to the unique database table that matches one of
        # its underscore/space spelling variants; raise on ambiguous matches.
        rtn = {}
        def try_name(name):
            # probe for existence by attempting a select
            try:
                con.execute("Select * from [%s]" % name)
            # bug fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                return False
            return True
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
            verify(len(rtn[table]) <= 1,
                   "Multiple possible tables found for table %s" % table)
            if rtn[table]:
                rtn[table] = rtn[table][0]
            else:
                rtn.pop(table)
        return rtn

    def write_file(self, pan_dat, db_file_path, con=None, if_exists='replace',
                   case_space_table_names=False):
        """
        write the PanDat data to a SQLite database

        :param pan_dat: the PanDat object to write

        :param db_file_path: The file path of the SQLite file to create.
                             Set to falsey if using con argument.

        :param con: A connection object that can be passed to pandas to_sql.
                    Set to falsey if using db_file_path argument

        :param if_exists: ‘fail’, ‘replace’ or ‘append’. How to behave if the table already exists

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :return:

        caveats: The row names (index) isn't written. The default pandas schema generation is used,
                 and thus foreign key relationships aren't written.
        """
        # The code to generate foreign keys is written and tested as part of TicDatFactory, and
        # thus this shortcoming could be easily rectified if need be).
        # note - pandas has an unfortunate tendency to push types into SQLite columns. This can
        # result in writing-reading round trips converting your numbers to text if they are mixed
        # type columns.
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        if db_file_path:
            verify(not os.path.isdir(db_file_path), "A directory is not a valid SQLLite file path")
        # only prettify table names when doing so cannot create name collisions
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        con_maker = lambda: _sql_con(db_file_path) if db_file_path else _DummyContextManager(con)
        with con_maker() as _:
            con_ = con or _
            for t in self.pan_dat_factory.all_tables:
                getattr(pan_dat, t).to_sql(name=case_space_to_pretty(t)
                                           if case_space_table_names else t,
                                           con=con_, if_exists=if_exists, index=False)
class JsonTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json files with TicDat objects.
    You need the json package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A JsonTicFactory will
        automatically be associated with the json attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True

    def create_tic_dat(self, json_file_path, freeze_it=False, from_pandas=False):
        """
        Create a TicDat object from a json file

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys. Could also be an
                               actual JSON string

        :param freeze_it: boolean. should the returned object be frozen?

        :param from_pandas: boolean. If truthy, then use pandas json readers. See
                            PanDatFactory json readers for more details.

        :return: a TicDat object populated by the matching tables.

        caveats: Table names matches are case insensitive and also
                 underscore-space insensitive.
                 Tables that don't find a match are interpreted as an empty table.
                 Dictionary keys that don't match any table are ignored.
        """
        _standard_verify(self.tic_dat_factory)
        if from_pandas:
            # delegate to the pandas-based json reader, then convert back to TicDat
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            _rtn = pdf.json.create_pan_dat(json_file_path)
            return pdf.copy_to_tic_dat(_rtn)
        jdict = self._create_jdict(json_file_path)
        tic_dat_dict = self._create_tic_dat_dict(jdict)
        missing_tables = set(self.tic_dat_factory.all_tables).difference(tic_dat_dict)
        if missing_tables:
            # missing tables are a soft failure - they resolve to empty tables
            print("The following table names could not be found in the json file/string\n%s\n" %
                  "\n".join(missing_tables))
        rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
        # apply any infinity-flagging / parameter-table conventions after the raw read
        rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, json_file_path, from_pandas=False):
        """
        Find the row counts for duplicated rows.

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.

        :param from_pandas: boolean. If truthy, then use pandas json readers. See
                            PanDatFactory json readers for more details.

        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the json entry with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory)
        if from_pandas:
            # read via pandas and flatten each table into a list of row tuples so the
            # dict-based duplicate finder can be reused
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            _rtn = pdf.json.create_pan_dat(json_file_path)
            jdict = {t: [tuple(_) for _ in getattr(_rtn, t).itertuples(index=False)]
                     for t in pdf.all_tables}
        else:
            jdict = self._create_jdict(json_file_path)
        rtn = find_duplicates_from_dict_ticdat(self.tic_dat_factory, jdict)
        return rtn or {}

    def _create_jdict(self, path_or_buf):
        # Load json from either a file path or a raw json string, and verify the
        # result is a dictionary of containers. Returns the loaded dictionary.
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            reasonble_string = path_or_buf
            verify(os.path.isfile(path_or_buf), "json_file_path is not a valid file path.")
            try:
                with open(path_or_buf, "r") as fp:
                    jdict = json.load(fp)
            except Exception as e:
                raise TicDatError("Unable to interpret %s as json file : %s" %
                                  (path_or_buf, e))
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            # only echo a prefix of the string in error messages, since the full json
            # string could be arbitrarily long
            reasonble_string = path_or_buf[:10]
            try:
                jdict = json.loads(path_or_buf)
            except Exception as e:
                raise TicDatError("Unable to interpret %s as json string : %s" %
                                  (reasonble_string, e))
        verify(dictish(jdict), "%s failed to load a dictionary" % reasonble_string)
        verify(all(map(stringish, jdict)),
               "The dictionary loaded from %s isn't indexed by strings" % reasonble_string)
        verify(all(map(containerish, jdict.values())),
               "The dictionary loaded from %s doesn't have containers as values" %
               reasonble_string)
        return jdict

    def _create_tic_dat_dict(self, jdict):
        # Match json dictionary keys to schema tables (case and underscore-space
        # insensitive) and run every cell through the factory's read-cell adjustment.
        tdf = self.tic_dat_factory
        rtn = {}
        table_keys = defaultdict(list)
        for t in tdf.all_tables:
            for t2 in jdict:
                if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                    table_keys[t].append(t2)
            if len(table_keys[t]) >= 1:
                verify(len(table_keys[t]) < 2,
                       "Found duplicate matching keys for table %s" % t)
                rtn[t] = jdict[table_keys[t][0]]
        orig_rtn, rtn = rtn, {}
        for t, rows in orig_rtn.items():
            # rows may be dicts (verbose format, keyed by field name) or lists
            # (terse format, ordered primary key fields then data fields)
            all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
            rtn[t] = []
            for row in rows:
                if dictish(row):
                    rtn[t].append({f: tdf._general_read_cell(t, f, x)
                                   for f, x in row.items()})
                else:
                    rtn[t].append([tdf._general_read_cell(t, f, x)
                                   for f, x in zip(all_fields, row)])
        return rtn

    def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False,
                   to_pandas=False):
        """
        write the ticDat data to a json file (or json string)

        :param tic_dat: the data object to write (typically a TicDat)

        :param json_file_path: The file path of the json file to create. If empty string, then
                               return a JSON string.

        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

        :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field name.
                        Otherwise, they are lists.

        :param to_pandas: boolean. if truthy, then use the PanDatFactory method of writing to json.

        :return:
        """
        _standard_verify(self.tic_dat_factory)
        verify(not (to_pandas and verbose), "verbose argument is inconsistent with to_pandas")
        verify(not (json_file_path and os.path.exists(json_file_path) and not allow_overwrite),
               "%s exists and allow_overwrite is not enabled" % json_file_path)
        if to_pandas:
            # delegate to the pandas-based json writer
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            return pdf.json.write_file(self.tic_dat_factory.copy_to_pandas(
                tic_dat, drop_pk_columns=False), json_file_path)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose,
                               use_infinity_io_flag_if_provided=True)
        if not json_file_path:
            return json.dumps(jdict, sort_keys=True, indent=2)
        with open(json_file_path, "w") as fp:
            json.dump(jdict, fp, sort_keys=True, indent=2)
class OpalyticsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading ticDat objects from the Opalytics Cloud Platform.
    Not expected to be used outside of Opalytics Cloud hosted notebooks.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A OpalyticsTicFactory will automatically be associated
        with the opalytics attribute of the parent TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # a parallel schema used purely to detect duplicate primary keys
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(tic_dat_factory)
        self._isFrozen = True

    def _find_table_matchings(self, inputset):
        # map each schema table to the inputset.schema entries that match it
        # (case insensitive, spaces treated as underscores)
        rtn = defaultdict(list)
        for t, x in product(self.tic_dat_factory.all_tables, inputset.schema):
            if stringish(x) and t.lower() == x.lower().replace(" ", "_"):
                rtn[t].append(x)
        return rtn

    def _good_inputset(self, inputset, message_writer=lambda x: x):
        # Sanity check that inputset looks like an Opalytics inputset and that every
        # schema table resolves to exactly one inputset table.
        # bug fix: the original tested `not hasattr(...) and dictish(...)`, which never
        # flagged a bad attribute and raised AttributeError when the attribute was absent
        if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
            message_writer("Failed to find dictish schema attribute")
            return False
        if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
            message_writer("Failed to find calleable getTable attribute")
            return False
        table_matchings = self._find_table_matchings(inputset)
        badly_matched = {t for t, v in table_matchings.items() if len(v) != 1}
        if badly_matched:
            message_writer("Following tables could not be uniquely resolved in inputset.schema\n%s" %
                           badly_matched)
            return False
        return True

    def find_duplicates(self, inputset, raw_data=False):
        """
        Find the row counts for duplicated rows.

        :param inputset: An opalytics inputset consistent with this TicDatFactory

        :param raw_data: boolean. should data cleaning be skipped? See create_tic_dat.

        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates.
        """
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this TicDatFactory : %s" % (message or [None])[0])
        if not self._duplicate_focused_tdf:
            return {}
        tdf = self._duplicate_focused_tdf
        return find_duplicates(tdf.opalytics.create_tic_dat(inputset, raw_data=raw_data), tdf)

    def _table_as_lists(self, t, df):
        # Convert the DataFrame for table t into a list of row tuples ordered primary
        # key fields first, then data fields. Rows whose "_active" column is falsey
        # (when such a column is present) are skipped.
        verify(isinstance(df, DataFrame), "table %s isn't a DataFrame" % t)
        all_fields = set(self.tic_dat_factory.primary_key_fields[t]).\
                     union(self.tic_dat_factory.data_fields[t])
        # bug fix: the original never applied the %s substitution to this message
        verify("_active" not in all_fields,
               ("Table %s has a field named '_active'.\n" +
                "This conflicts with internal data processing.\n" +
                " Don't use '_active' for in your TicDatFactory definition if you want to use this reader.")
               % t)
        for f in all_fields:
            verify(f in df.columns, "field %s can't be found in the DataFrame for %s" % (f, t))
        # map each field to its column position for fast tuple indexing
        all_fields = {f: list(df.columns).index(f) for f in all_fields}
        has_active = "_active" in df.columns
        active_index = list(df.columns).index("_active") if has_active else None
        rtn = []
        for row in df.itertuples(index=False, name=None):
            if not has_active or row[active_index]:
                rtn.append(tuple(row[all_fields[f]] for f in
                                 self.tic_dat_factory.primary_key_fields[t] +
                                 self.tic_dat_factory.data_fields[t]))
        return rtn

    def create_tic_dat(self, inputset, raw_data=False, freeze_it=False):
        """
        Create a TicDat object from an opalytics inputset

        :param inputset: An opalytics inputset consistent with this TicDatFactory

        :param raw_data: boolean. should data cleaning be skipped? On the Opalytics Cloud Platform
                         cleaned data will be passed to instant apps. Data cleaning involves
                         removing data type failures, data row predicate failures, foreign key
                         failures and deactivated records.

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the tables as they are rendered by inputset
        """
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this TicDatFactory : %s" % (message or [None])[0])
        verify(DataFrame, "pandas needs to be installed to use the opalytics functionality")
        # bug fix: the original condition `not generator_tables or generic_tables` let
        # schemas with generic tables slip through, contradicting the message below
        verify(not (self.tic_dat_factory.generator_tables or self.tic_dat_factory.generic_tables),
               "The opalytics data reader is not yet working for generic tables nor generator tables")
        tms = {k: v[0] for k, v in self._find_table_matchings(inputset).items()}
        ia = {}
        # inspect.getargspec was removed in Python 3.11; getfullargspec is already used
        # elsewhere in this module and is a drop-in replacement here
        if "includeActive" in inspect.getfullargspec(inputset.getTable)[0]:
            ia = {"includeActive": not raw_data}
        tl = lambda t: self._table_as_lists(t, inputset.getTable(tms[t], **ia))
        rtn = self.tic_dat_factory.TicDat(**{t: tl(t) for t in self.tic_dat_factory.all_tables})
        if not raw_data:
            def removing():
                # one cleaning pass; returns truthy if anything was found (and removed),
                # so the caller loops until a fixed point is reached
                dtfs = self.tic_dat_factory.find_data_type_failures(rtn)
                for (t, f), (bvs, pks) in dtfs.items():
                    if pks is None:  # i.e. no primary keys
                        # bug fix: collect the bad rows first - the original removed rows
                        # while iterating the same list, which can skip adjacent bad rows
                        bad_drs = [dr for dr in getattr(rtn, t) if dr[f] in bvs]
                        for dr in bad_drs:
                            getattr(rtn, t).remove(dr)
                    else:
                        for k in pks:
                            getattr(rtn, t).pop(k, None)  # could be popped for two fields
                drfs = self.tic_dat_factory.find_data_row_failures(rtn)
                cant_remove_again = set()
                for (t, pn), row_posns in drfs.items():
                    if self.tic_dat_factory.primary_key_fields[t]:
                        for k in row_posns:
                            getattr(rtn, t).pop(k, None)  # could be popped for two predicates
                    elif t not in cant_remove_again:
                        bad_drs = [dr for i, dr in enumerate(getattr(rtn, t)) if i in row_posns]
                        for dr in bad_drs:
                            getattr(rtn, t).remove(dr)
                        # once we start removing data rows by row index, the remaining row
                        # indicies become invalid, so we need to ignore any more such
                        # indicies for this table
                        cant_remove_again.add(t)
                fkfs = self.tic_dat_factory.find_foreign_key_failures(rtn)
                if fkfs:
                    self.tic_dat_factory.remove_foreign_keys_failures(rtn)
                return dtfs or drfs or fkfs
            while removing():
                pass
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
class XlsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with TicDat objects.
    Your system will need the xlrd package to read .xls and .xlsx files,
    the xlwt package to write .xls files, and the xlsxwriter package to
    write .xlsx files.
    Don't create this object explicitly. A XlsTicDatFactory will
    automatically be associated with the xls attribute of the parent
    TicDatFactory.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A XlsTicDatFactory will
        automatically be associated with the xls attribute of the parent
        TicDatFactory.

        :param tic_dat_factory: the parent TicDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_tic_dat(self, xls_file_path, row_offsets=None, headers_present=True,
                       treat_inf_as_infinity=True, freeze_it=False):
        """
        Create a TicDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.
        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip
        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?
        :param treat_inf_as_infinity: Boolean. Treat the "inf" string
                                      (case insensitive) as infinity. Similar for "-inf"
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive, and white space is replaced
                 with underscore for table name matching.
                 Field names are considered case insensitive, but white space is respected.
                 (ticdat supports whitespace in field names but not table names).
                 The following two caveats apply only if data_types are used.
                 --> Any field for which an empty string is invalid data and None is
                     valid data will replace the empty string with None.
                 --> Any field for which must_be_int is true will replace numeric data
                     that satisfies int(x)==x with int(x). In other words, the ticdat
                     equivalent of pandas.read_excel convert_float is to set
                     must_be_int to true in data_types.
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(not (treat_inf_as_infinity and tdf.generator_tables),
               "treat_inf_as_infinity not implemented for generator tables")
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(utils.DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        if self.tic_dat_factory.generic_tables:
            # generic tables are delegated to pandas below, which supports none of
            # these customizations
            verify(headers_present and treat_inf_as_infinity and not row_offsets,
                   "headers_present, treat_inf_as_infinity and row_offsets must all be at default values\n" +
                   "to use generic tables")
        rtn = self._create_tic_dat_dict(xls_file_path, row_offsets or {},
                                        headers_present, treat_inf_as_infinity)
        if self.tic_dat_factory.generic_tables:
            if xls_file_path.endswith(".xls"):
                print("** Warning : pandas doesn't always play well with older Excel formats.")
            # read the generic (schema-less) tables via a throwaway PanDatFactory
            pdf = PanDatFactory(**{t: '*' for t in self.tic_dat_factory.generic_tables})
            pandat = pdf.xls.create_pan_dat(xls_file_path)
            for t in self.tic_dat_factory.generic_tables:
                rtn[t] = getattr(pandat, t)
        rtn = tdf._parameter_table_post_read_adjustment(tdf.TicDat(**rtn))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _verify_differentiable_sheet_names(self):
        # Excel truncates sheet names; fail loudly if two table names collide
        # once truncated to the maximum sheet-name length.
        rtn = defaultdict(set)
        for t in self.tic_dat_factory.all_tables:
            rtn[t[:_longest_sheet]].add(t)
        rtn = [v for k, v in rtn.items() if len(v) > 1]
        verify(not rtn,
               "The following tables collide when names are truncated to %s characters.\n%s" %
               (_longest_sheet, sorted(map(sorted, rtn))))
    def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets, headers_present,
                               print_missing_tables=False):
        # Resolve each table name to exactly one sheet (case-insensitive,
        # spaces->underscores, truncated comparison) and each field to exactly
        # one column index. Returns (sheets, field_indicies, book.datemode).
        verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
               "xls_file_path argument %s is not a valid file path." % xls_file_path)
        try:
            book = xlrd.open_workbook(xls_file_path)
        except Exception as e:
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        sheets = defaultdict(list)
        for table, sheet in product(all_tables, book.sheets()):
            if table.lower()[:_longest_sheet] == sheet.name.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        missing_tables = {t for t in all_tables if t not in sheets}
        if missing_tables and print_missing_tables:
            # missing sheets are a warning (empty table), not an error
            print("The following table names could not be found in the %s file.\n%s\n" %
                  (xls_file_path, "\n".join(missing_tables)))
        field_indicies, missing_fields, dup_fields = {}, {}, {}
        for table, sheet in sheets.items():
            field_indicies[table], missing_fields[table], dup_fields[table] = \
                self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
        verify(not any(_ for _ in missing_fields.values()),
               "The following field names could not be found : \n" +
               "\n".join("%s : " % t + ",".join(bf) for t, bf in missing_fields.items() if bf))
        verify(not any(_ for _ in dup_fields.values()),
               "The following field names were duplicated : \n" +
               "\n".join("%s : " % t + ",".join(bf) for t, bf in dup_fields.items() if bf))
        return sheets, field_indicies, book.datemode
    def _create_generator_obj(self, xlsFilePath, table, row_offset, headers_present,
                              treat_inf_as_infinity):
        # Returns a zero-argument callable that, when invoked, re-opens the
        # workbook and yields one data tuple per row (lazy, re-iterable).
        tdf = self.tic_dat_factory
        ho = 1 if headers_present else 0
        def tableObj():
            sheets, field_indicies, datemode = self._get_sheets_and_fields(
                xlsFilePath, (table,), {table: row_offset}, headers_present)
            if table in sheets:
                sheet = sheets[table]
                # shortest populated column bounds the logical table length
                table_len = min(len(sheet.col_values(field_indicies[table][field]))
                                for field in tdf.data_fields[table])
                for x in (sheet.row_values(i) for i in range(table_len)[row_offset + ho:]):
                    yield self._sub_tuple(table, tdf.data_fields[table], field_indicies[table],
                                          treat_inf_as_infinity, datemode)(x)
        return tableObj
    def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present,
                             treat_inf_as_infinity):
        # Build the {table_name: rows} dict that TicDat(**rtn) will consume.
        # Generic tables are left out (read via PanDatFactory by the caller);
        # generator tables become lazy callables.
        tiai = treat_inf_as_infinity
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
        tdf = self.tic_dat_factory
        rtn = {}
        sheets, field_indicies, dm = self._get_sheets_and_fields(
            xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables),
            row_offsets, headers_present, print_missing_tables=True)
        ho = 1 if headers_present else 0
        for tbl, sheet in sheets.items():
            fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(tbl, ())
            assert fields or tbl in self.tic_dat_factory.generic_tables
            indicies = field_indicies[tbl]
            table_len = min(len(sheet.col_values(indicies[field]))
                            for field in (fields or indicies))
            if tdf.primary_key_fields.get(tbl, ()):
                # primary-keyed table -> {pk_tuple: data_tuple}
                tableObj = {self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies, tiai, dm)(x):
                            self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies, tiai, dm)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])}
            elif tbl in tdf.generic_tables:
                tableObj = None  # will be read via PanDatFactory
            else:
                # keyless table -> list of data tuples
                tableObj = [self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies, tiai, dm)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            if tableObj is not None:
                rtn[tbl] = tableObj
        for tbl in tdf.generator_tables:
            rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl],
                                                  headers_present, tiai)
        return rtn
    def find_duplicates(self, xls_file_path, row_offsets={}, headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema (non primary key tables ignored).
        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip (non primary key tables ignored)
        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        caveats: Missing sheets resolve to an empty table, but missing primary fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive.

        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
        tdf = self.tic_dat_factory
        pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
        rtn = {t: defaultdict(int) for t in pk_tables}
        sheets, fieldIndicies, dm = self._get_sheets_and_fields(xls_file_path, pk_tables,
                                                                row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for table, sheet in sheets.items():
            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
            indicies = fieldIndicies[table]
            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
            for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[table] + ho:]):
                rtn[table][self._sub_tuple(table, tdf.primary_key_fields[table], indicies,
                                           treat_inf_as_infinity=True, datemode=dm)(x)] += 1
        for t in list(rtn.keys()):
            # prune the non-duplicates (count < 2) and then empty tables
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del (rtn[t])
        return rtn
    def _sub_tuple(self, table, fields, field_indicies, treat_inf_as_infinity, datemode):
        # Returns a row -> value(-tuple) converter applying the ticdat cell
        # coercion rules (nullable empties, inf strings, must_be_int, datetimes,
        # infinity_io_flag). A single-field tuple is unwrapped to a scalar.
        assert set(fields).issubset(field_indicies)
        if self.tic_dat_factory.infinity_io_flag != "N/A" or \
           (table == "parameters" and self.tic_dat_factory.parameters):
            # the infinity_io_flag machinery supersedes inf-string parsing
            treat_inf_as_infinity = False
        def _read_cell(x, field):
            # reminder - data fields have a default default of zero, primary keys don't get a default default
            dv = self.tic_dat_factory.default_values.get(table, {}).get(field,
                                                                        ["LIST", "NOT", "POSSIBLE"])
            dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
            rtn = x[field_indicies[field]]
            if rtn == "" and ((dt and dt.nullable) or (not dt and dv is None)):
                return None
            if treat_inf_as_infinity and utils.stringish(rtn) and rtn.lower() in ["inf", "-inf"]:
                return float(rtn.lower())
            if utils.numericish(rtn) and utils.safe_apply(int)(rtn) == rtn and dt and dt.must_be_int:
                rtn = int(rtn)
            if rtn == "":
                try_rtn = self.tic_dat_factory._general_read_cell(table, field, None)  # None as infinity flagging
                if utils.numericish(try_rtn):
                    return try_rtn
            if utils.numericish(rtn) and dt and dt.datetime:
                # Excel stores dates as numbers; decode with the workbook's datemode
                rtn = utils.safe_apply(lambda: xlrd.xldate_as_tuple(rtn, datemode))()
                if rtn is not None:
                    f = datetime.datetime
                    if utils.pd:
                        f = utils.pd.Timestamp
                    return f(year=rtn[0], month=rtn[1], day=rtn[2], hour=rtn[3],
                             minute=rtn[4], second=rtn[5])
            return self.tic_dat_factory._general_read_cell(table, field, rtn)
        def rtn(x):
            if len(fields) == 1:
                return _read_cell(x, fields[0])
            return tuple(_read_cell(x, field) for field in fields)
        return rtn
    def _get_field_indicies(self, table, sheet, row_offset, headers_present):
        # Returns (found {field: column_index}, missing fields, duplicated fields).
        fields = self.tic_dat_factory.primary_key_fields.get(table, ()) + \
                 self.tic_dat_factory.data_fields.get(table, ())
        if not headers_present:
            # positional matching: schema order defines the columns
            row_len = len(sheet.row_values(row_offset)) if sheet.nrows > 0 else len(fields)
            return ({f: i for i, f in enumerate(fields) if i < row_len},
                    [f for i, f in enumerate(fields) if i >= row_len], [])
        if sheet.nrows - row_offset <= 0:
            # entirely empty sheet -> every field is "missing"
            return {}, fields, []
        if table in self.tic_dat_factory.generic_tables:
            temp_rtn = defaultdict(list)
            for ind, val in enumerate(sheet.row_values(row_offset)):
                temp_rtn[val].append(ind)
        else:
            # header matching is exact-or-case-insensitive
            temp_rtn = {field: list() for field in fields}
            for field, (ind, val) in product(fields, enumerate(sheet.row_values(row_offset))):
                if field == val or (all(map(utils.stringish, (field, val))) and
                                    field.lower() == val.lower()):
                    temp_rtn[field].append(ind)
        return ({field: inds[0] for field, inds in temp_rtn.items() if len(inds) == 1},
                [field for field, inds in temp_rtn.items() if len(inds) == 0],
                [field for field, inds in temp_rtn.items() if len(inds) > 1])
    def write_file(self, tic_dat, file_path, allow_overwrite=False, case_space_sheet_names=False):
        """
        write the ticDat data to an excel file

        :param tic_dat: the data object to write (typically a TicDat)
        :param file_path: The file path of the excel file to create
                          Needs to end in either ".xls" or ".xlsx"
                          The latter is capable of writing out larger tables,
                          but the former handles infinity seamlessly.
                          If ".xlsx", then be advised that +/- float("inf") will be replaced
                          with "inf"/"-inf", unless infinity_io_flag is being applied.
        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?
        :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper
                                       case characters to sheet names
        :return:

        caveats: None may be written out as an empty string. This reflects the behavior of xlwt.
        """
        self._verify_differentiable_sheet_names()
        verify(utils.stringish(file_path) and
               (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
               "file_path argument needs to end in .xls or .xlsx")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        verify(allow_overwrite or not os.path.exists(file_path),
               "The %s path exists and overwrite is not allowed" % file_path)
        if self.tic_dat_factory.generic_tables:
            # re-route through a generic-free clone of the schema
            dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.xls.write_file(dat, file_path, allow_overwrite, case_space_sheet_names)
        # only prettify sheet names if the prettified names remain unique
        case_space_sheet_names = case_space_sheet_names and \
            len(set(self.tic_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.tic_dat_factory.all_tables)))
        tbl_name_mapping = {t: case_space_to_pretty(t) if case_space_sheet_names else t
                            for t in self.tic_dat_factory.all_tables}
        if file_path.endswith(".xls"):
            self._xls_write(tic_dat, file_path, tbl_name_mapping)
        else:
            self._xlsx_write(tic_dat, file_path, tbl_name_mapping)
    def _xls_write(self, tic_dat, file_path, tbl_name_mapping):
        # .xls writer (xlwt backend); infinity handled natively by xlwt.
        verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
        tdf = self.tic_dat_factory
        def clean_for_write(t, f, x):
            if isinstance(x, datetime.datetime):
                return str(x)
            return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
        book = xlwt.Workbook()
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + \
                       self.tic_dat_factory.data_fields.get(t, ())
            sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                # primary-keyed table: one row per {pk: data} entry
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                     tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
            else:
                # keyless (or generator) table
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
        if os.path.exists(file_path):
            os.remove(file_path)
        book.save(file_path)
    def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
        # .xlsx writer (xlsxwriter backend); +/-inf must be stringified unless
        # the infinity_io_flag machinery is in play.
        verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
        tdf = self.tic_dat_factory
        if os.path.exists(file_path):
            os.remove(file_path)
        book = xlsx.Workbook(file_path)
        def clean_for_write(t, f, x):
            if self.tic_dat_factory.infinity_io_flag != "N/A" or \
               (t == "parameters" and self.tic_dat_factory.parameters):
                return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
            if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime):
                return str(x)
            return x
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + \
                       self.tic_dat_factory.data_fields.get(t, ())
            sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                     tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
        book.close()
class CsvPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with PanDat objects.
    Don't create this object explicitly. A CsvPanFactory will
    automatically be associated with the csv attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A CsvPanFactory will
        automatically be associated with the csv attribute of the parent
        PanDatFactory.

        :param pan_dat_factory: the parent PanDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.
        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.
        :param kwargs: additional named arguments to pass to pandas.read_csv
        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
        """
        verify(os.path.isdir(dir_path), "%s not a directory path"%dir_path)
        tbl_names = self._get_table_names(dir_path)
        rtn = {t: pd.read_csv(f, **kwargs) for t,f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t,f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" %
               [(t, os.path.basename(tbl_names[t]), f) for t,f in missing_fields])
        rtn = self.pan_dat_factory.PanDat(**rtn)
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
    def _get_table_names(self, dir_path):
        # Map each schema table to exactly one csv file in dir_path, matching
        # file names case-insensitively with spaces treated as underscores.
        # Raises via verify if a table has zero or multiple candidate files.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
                          if os.path.isfile(path) and
                          f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
            verify(len(rtn[table]) >= 1, "Unable to recognize table %s" % table)
            verify(len(rtn[table]) <= 1, "Multiple possible csv files found for table %s" % table)
            rtn[table] = rtn[table][0]
        return rtn
    def write_directory(self, pan_dat, dir_path, case_space_table_names=False, index=False,
                        **kwargs):
        """
        write the PanDat data to a collection of csv files

        :param pan_dat: the PanDat object to write
        :param dir_path: the directory in which to write the csv files
                         Set to falsey if using con argument.
        :param case_space_table_names: boolean - make best guesses how to add spaces and upper
                                       case characters to table names
        :param index: boolean - whether or not to write the index.
        :param kwargs: additional named arguments to pass to pandas.to_csv
        :return:

        caveats: The row names (index) isn't written (unless kwargs indicates it should be).
        """
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s"%"\n".join(msg))
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        kwargs["index"] = index
        # only prettify table names if the prettified names remain unique
        case_space_table_names = case_space_table_names and \
            len(set(self.pan_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        if not os.path.isdir(dir_path) :
            os.mkdir(dir_path)
        for t in self.pan_dat_factory.all_tables :
            f = os.path.join(dir_path, (case_space_to_pretty(t) if case_space_table_names else t)
                             + ".csv")
            getattr(pan_dat, t).to_csv(f, **kwargs)
class CsvTicFactory(freezable_factory(object, "_isFrozen")) :
    """
    Primary class for reading/writing csv files with TicDat objects.
    Your system will need the csv package if you want to use this class.
    Don't create this object explicitly. A CsvTicFactory will
    automatically be associated with the csv attribute of the parent
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A CsvTicDatFactory will
        automatically be associated with the csv attribute of the parent
        TicDatFactory.

        :param tic_dat_factory: the parent TicDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_tic_dat(self, dir_path, dialect='excel', headers_present = True,
                       freeze_it = False, encoding=None):
        """
        Create a TicDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param headers_present: Boolean. Does the first row of data contain the column headers?
        :param encoding: see docstring for the Python.open function
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching files.

        caveats: Missing files resolve to an empty table, but missing fields on
                 matching files throw an Exception.
                 By default, data field values (but not primary key values) will be coerced
                 into floats if possible. This includes coercing "inf" and "-inf" into
                 +/- float("inf") The rules over which fields are/are-not coerced can be
                 controlled via hints from the default values or data types.
                 Default values and data types can also be used to control whether or not
                 the empty string should be coerced into None.
                 The infinity_io_flag rules are applied subsequent to this coercion.
                 Note - pandas doesn't really do a fantastic job handling date types either,
                 since it coerces all columns to be the same type. If that's the behavior you
                 want you can use PanDatFactory.
                 JSON is just a better file format than csv for a variety of reasons,
                 to include typing of data.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = self.tic_dat_factory.TicDat(**self._create_tic_dat(dir_path, dialect,
                                                                 headers_present, encoding))
        rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _read_cell(self, table, field, x):
        # Apply the ticdat csv coercion rules to a single raw string cell:
        # empty-string -> None when hinted nullable, float coercion when hinted
        # numeric, int demotion under must_be_int, then infinity_io handling.
        def _inner_rtn(x):
            if table == "parameters" and self.tic_dat_factory.parameters:
                # parameter tables defer all typing to the parameters machinery
                return x
            # reminder - data fields have a default default of zero, primary keys don't get a default default
            dv = self.tic_dat_factory.default_values.get(table, {}).get(field,
                                                                        ["LIST", "NOT", "POSSIBLE"])
            dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
            if x == "" and ((dt and dt.nullable) or (not dt and dv is None) or
                            numericish(self.tic_dat_factory._general_read_cell(table, field, None))):
                return None
            should_try_float = (dt and dt.number_allowed) or (not dt and numericish(dv)) or \
                               (table in self.tic_dat_factory.generic_tables)
            if should_try_float:
                try:
                    x = float(x)
                    if int(x) == x and dt and dt.must_be_int:
                        x = int(x)
                except:
                    # not parseable as a number (or int(inf) overflow) - keep as-is
                    return x
            return x
        return self.tic_dat_factory._general_read_cell(table, field, _inner_rtn(x))
    def _create_tic_dat(self, dir_path, dialect, headers_present, encoding):
        # Build the {table_name: rows} dict for TicDat(**...); tables with no
        # matching file are reported and omitted (i.e. resolve to empty).
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        rtn = {t : self._create_table(dir_path, t, dialect, headers_present, encoding)
               for t in self.tic_dat_factory.all_tables}
        missing_tables = {t for t in self.tic_dat_factory.all_tables if not rtn[t]}
        if missing_tables:
            print ("The following table names could not be found in the %s directory.\n%s\n"%
                   (dir_path,"\n".join(missing_tables)))
        return {k:v for k,v in rtn.items() if v}
    def find_duplicates(self, dir_path, dialect='excel', headers_present = True,
                        encoding=None):
        """
        Find the row counts for duplicated rows.

        :param dir_path: the directory containing .csv files.
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param headers_present: Boolean. Does the first row of data contain the column headers?
        :param encoding: see docstring for the Python.open function
        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates

        caveats: Missing files resolve to an empty table, but missing fields (data or primary key)
                 on matching files throw an Exception.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        tdf = self.tic_dat_factory
        # only primary-keyed tables with an actual file on disk are examined
        rtn = {t:defaultdict(int) for t,_ in tdf.primary_key_fields.items()
               if _ and self._get_file_path(dir_path, t)}
        for t in rtn:
            with open(self._get_file_path(dir_path, t), encoding=encoding) as csvfile:
                for r in self._get_data(csvfile, t, dialect, headers_present):
                    p_key = r[tdf.primary_key_fields[t][0]] \
                        if len(tdf.primary_key_fields[t])==1 else \
                        tuple(r[_] for _ in tdf.primary_key_fields[t])
                    rtn[t][p_key] += 1
        for t in list(rtn.keys()):
            # prune the non-duplicates (count < 2) and then empty tables
            rtn[t] = {k:v for k,v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del(rtn[t])
        return rtn
    def _get_file_path(self, dir_path, table):
        # Locate the (at most one) csv file matching table, case-insensitively
        # with spaces treated as underscores. Returns None if no match.
        rtn = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
               if os.path.isfile(path) and
               f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
        verify(len(rtn) <= 1, "duplicate .csv files found for %s"%table)
        if rtn:
            return rtn[0]
    def _get_data(self, csvfile, table, dialect, headers_present):
        # Yield one {field: coerced_value} dict per csv row. With headers,
        # field names are matched to header cells case-insensitively; without,
        # schema order defines the columns.
        tdf = self.tic_dat_factory
        fieldnames=tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
        assert fieldnames or table in self.tic_dat_factory.generic_tables
        for row in csv.DictReader(csvfile, dialect = dialect,
                                  **({"fieldnames":fieldnames} if not headers_present else {})):
            if not headers_present:
                verify(len(row) == len(fieldnames),
                       "Need %s columns for table %s"%(len(fieldnames), table))
                yield {f: self._read_cell(table, f, row[f]) for f in fieldnames}
            else:
                key_matching = defaultdict(list)
                for k,f in product(row.keys(), fieldnames or row.keys()):
                    if k.lower() ==f.lower():
                        key_matching[f].append(k)
                # generic tables (empty fieldnames) take their fields from the header
                fieldnames = fieldnames or row.keys()
                for f in fieldnames:
                    verify(f in key_matching,
                           "Unable to find field name %s for table %s"%(f, table))
                    verify(len(key_matching[f]) <= 1,
                           "Duplicate field names found for field %s table %s"%(f, table))
                yield {f: self._read_cell(table, f, row[key_matching[f][0]]) for f in fieldnames}
    def _create_table(self, dir_path, table, dialect, headers_present, encoding):
        # Read one table from its csv file. Returns a dict for primary-keyed
        # tables, a list otherwise, a generator-factory for generator tables,
        # or None when no file exists for the table.
        file_path = self._get_file_path(dir_path, table)
        if not (file_path and os.path.isfile(file_path)) :
            return
        tdf = self.tic_dat_factory
        if table in tdf.generator_tables:
            # lazy: the file is re-opened each time the generator is iterated
            def rtn() :
                with open(file_path, encoding=encoding) as csvfile:
                    for r in self._get_data(csvfile, table, dialect, headers_present):
                        yield tuple(r[_] for _ in tdf.data_fields[table])
        else:
            rtn = {} if tdf.primary_key_fields.get(table) else []
            with open(file_path, encoding=encoding) as csvfile:
                for r in self._get_data(csvfile, table, dialect, headers_present) :
                    if tdf.primary_key_fields.get(table) :
                        p_key = r[tdf.primary_key_fields[table][0]] \
                            if len(tdf.primary_key_fields[table]) == 1 else \
                            tuple(r[_] for _ in tdf.primary_key_fields[table])
                        rtn[p_key] = tuple(r[_] for _ in tdf.data_fields.get(table,()))
                    elif table in tdf.generic_tables:
                        rtn.append(r)
                    else:
                        rtn.append(tuple(r[_] for _ in tdf.data_fields[table]))
        return rtn
    def write_directory(self, tic_dat, dir_path, allow_overwrite = False, dialect='excel',
                        write_header = True, case_space_table_names = False):
        """
        write the ticDat data to a collection of csv files

        :param tic_dat: the data object
        :param dir_path: the directory in which to write the csv files
        :param allow_overwrite: boolean - are we allowed to overwrite existing files?
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param write_header: Boolean. Should the header information be written as the
                             first row?
        :param case_space_table_names: boolean - make best guesses how to add spaces and upper
                                       case characters to table names
        :return:
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        if self.tic_dat_factory.generic_tables:
            # NOTE(review): this recursion does not forward case_space_table_names,
            # unlike the analogous generic-tables re-route in XlsTicFactory.write_file
            # - confirm whether that is intentional.
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect, write_header)
        tdf = self.tic_dat_factory
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m : msg.append(m)) :
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        if not allow_overwrite:
            for t in tdf.all_tables :
                f = os.path.join(dir_path, t + ".csv")
                verify(not os.path.exists(f),
                       "The %s path exists and overwrite is not allowed"%f)
        if not os.path.isdir(dir_path) :
            os.mkdir(dir_path)
        # only prettify table names if the prettified names remain unique
        case_space_table_names = case_space_table_names and \
            len(set(self.tic_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.tic_dat_factory.all_tables)))
        for t in tdf.all_tables :
            f = os.path.join(dir_path, (case_space_to_pretty(t) if case_space_table_names else t)
                             + ".csv")
            with open(f, 'w', newline='') as csvfile:
                writer = csv.DictWriter(csvfile,dialect=dialect, fieldnames=
                    tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ()))
                writer.writeheader() if write_header else None
                # (sic) helper name is misspelled; applies infinity_io flagging per cell
                def infinty_io_dict(d):
                    return {f: self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
                            for f,x in d.items()}
                _t = getattr(tic_dat, t)
                if dictish(_t) :
                    # primary-keyed table: merge pk values back into the row dict
                    for p_key, data_row in _t.items() :
                        primaryKeyDict = {f:v for f,v in
                                          zip(tdf.primary_key_fields[t],
                                              p_key if containerish(p_key) else (p_key,))}
                        writer.writerow(infinty_io_dict(dict(data_row, **primaryKeyDict)))
                else :
                    for data_row in (_t if containerish(_t) else _t()) :
                        writer.writerow(infinty_io_dict(data_row))
class SQLiteTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing SQLite files with TicDat objects.
    Don't create this object explicitly. A SQLiteTicFactory will
    automatically be associated with the sql attribute of the parent
    TicDatFactory. You need the sqlite3 package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A SQLiteTicFactory will
        automatically be associated with the sql attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # schema restricted to the primary-keyed tables; used by find_duplicates
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(tic_dat_factory)
        self._isFrozen = True
    def _Rtn(self, freeze_it):
        # Returns a TicDat-building callable that applies the parameters-table
        # post-read adjustment and (optionally) freezes the result.
        def rtn(*args, **kwargs):
            rtn = self.tic_dat_factory.TicDat(*args, **kwargs)
            rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
            if freeze_it:
                return self.tic_dat_factory.freeze_me(rtn)
            return rtn
        return rtn
    def create_tic_dat(self, db_file_path, freeze_it=False):
        """
        Create a TicDat object from a SQLite database file

        :param db_file_path: A SQLite db with a consistent schema.

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching tables.

        caveats : "inf" and "-inf" (case insensitive) are read as floats, unless the
                  infinity_io_flag is being applied. "true"/"false" (case insensitive)
                  are read as booleans. Tables that don't find a match are interpreted
                  as an empty table. Missing fields on matching tables throw an exception.
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        return self._Rtn(freeze_it)(**self._create_tic_dat(db_file_path))
    def create_tic_dat_from_sql(self, sql_file_path, includes_schema=False, freeze_it=False):
        """
        Create a TicDat object from an SQLite sql text file

        :param sql_file_path: A text file containing SQLite compatible SQL statements
                              delimited by ;

        :param includes_schema: boolean - does the sql_file_path contain schema
                                generating SQL?

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the db created from the SQL.
                 See create_tic_dat for caveats.
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        return self._Rtn(freeze_it)(
            **self._create_tic_dat_from_sql(sql_file_path, includes_schema))
    def find_duplicates(self, db_file_path):
        """
        Find the row counts for duplicated rows.

        :param db_file_path: A SQLite db with a consistent schema.

        :return: A dictionary whose keys are table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the SQLite
                 table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        if not self._duplicate_focused_tdf:
            return {}
        return find_duplicates(
            self._duplicate_focused_tdf.sql.create_tic_dat(db_file_path),
            self._duplicate_focused_tdf)
    def _fks(self):
        # map each native table to the tuple of foreign keys it originates
        rtn = defaultdict(set)
        for fk in self.tic_dat_factory.foreign_keys:
            rtn[fk.native_table].add(fk)
        return FrozenDict({k: tuple(v) for k, v in rtn.items()})
    def _create_tic_dat_from_sql(self, sql_file_path, includes_schema):
        # Replay the SQL statements into an in-memory db and read the data back.
        verify(os.path.exists(sql_file_path), "%s isn't a valid file path" % sql_file_path)
        verify(not self.tic_dat_factory.generator_tables,
               "recovery of generator tables from sql files not yet implemented")
        tdf = self.tic_dat_factory
        with sql.connect(":memory:") as con:
            if not includes_schema:
                for stmt in self._get_schema_sql(
                        set(tdf.all_tables).difference(tdf.generic_tables)):
                    con.execute(stmt)
            with open(sql_file_path, "r") as f:
                for stmt in f.read().split(";"):
                    con.execute(stmt)
            return self._create_tic_dat_from_con(
                con, {t: t for t in self.tic_dat_factory.all_tables})
    def _get_table_names(self, db_file_path, tables):
        # Resolve each schema table name to the actual (space/underscore variant)
        # table name present in the db; tables with no match are omitted.
        rtn = {}
        with sql.connect(db_file_path) as con:
            def try_name(name):
                try:
                    con.execute("Select * from [%s]" % name)
                except Exception:  # a missing table raises sqlite3.OperationalError
                    return False
                return True
            for table in tables:
                rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
                verify(len(rtn[table]) <= 1,
                       "Duplicate tables found for table %s in SQLite file %s"
                       % (table, db_file_path))
                if rtn[table]:
                    rtn[table] = rtn[table][0]
                else:
                    rtn.pop(table)
        return rtn
    def _check_tables_fields(self, db_file_path, tables):
        # Verify the file opens as SQLite and that every expected field is
        # selectable from every matched table; returns the table-name mapping.
        tdf = self.tic_dat_factory
        TDE = TicDatError
        verify(os.path.exists(db_file_path), "%s isn't a valid file path" % db_file_path)
        try:
            with sql.connect(db_file_path) as _:
                pass
        except Exception as e:
            raise TDE("Unable to open %s as SQLite file : %s" % (db_file_path, e))
        table_names = self._get_table_names(db_file_path, tables)
        with sql.connect(db_file_path) as con:
            for table, sql_table in table_names.items():
                for field in tdf.primary_key_fields.get(table, ()) + \
                             tdf.data_fields.get(table, ()):
                    try:
                        con.execute("Select [%s] from [%s]" % (field, sql_table))
                    except Exception:
                        raise TDE("Unable to recognize field %s in table %s for file %s"
                                  % (field, table, db_file_path))
        return table_names
    def _read_data_cell(self, t, f, x):
        # Undo the stringified writes: "inf"/"-inf" -> float (unless the
        # infinity_io_flag applies), "true"/"false" -> bool, then the general
        # infinity-flag read conversion.
        if stringish(x) and x.lower() in ("inf", "-inf") and \
           self.tic_dat_factory.infinity_io_flag == "N/A" and \
           not (t == "parameters" and self.tic_dat_factory.parameters):
            return float(x)
        if stringish(x) and x.lower() == "true":
            return True
        if stringish(x) and x.lower() == "false":
            return False
        return self.tic_dat_factory._general_read_cell(t, f, x)
    def _create_gen_obj(self, db_file_path, table, table_name):
        # Build a callable returning a generator that streams the rows of a
        # generator table straight from the db.
        tdf = self.tic_dat_factory
        def tableObj():
            assert (not tdf.primary_key_fields.get(table)) and (tdf.data_fields.get(table))
            with sql.connect(db_file_path) as con:
                for row in con.execute("Select %s from [%s]" % (
                        ", ".join(_brackets(tdf.data_fields[table])), table_name)):
                    yield [self._read_data_cell(table, f, x)
                           for f, x in zip(tdf.data_fields[table], row)]
        return tableObj
    def _create_tic_dat(self, db_file_path):
        tdf = self.tic_dat_factory
        table_names = self._check_tables_fields(db_file_path, tdf.all_tables)
        with sql.connect(db_file_path) as con:
            rtn = self._create_tic_dat_from_con(con, table_names)
        for table in tdf.generator_tables:
            if table in table_names:
                rtn[table] = self._create_gen_obj(db_file_path, table, table_names[table])
        return rtn
    def _create_tic_dat_from_con(self, con, table_names):
        # Read every non-generator table from an open connection into the
        # dict/list structures the TicDat constructor expects.
        missing_tables = sorted(
            set(self.tic_dat_factory.all_tables).difference(table_names))
        if missing_tables:
            print("The following table names could not be found in the SQLite database.\n%s\n"
                  % "\n".join(missing_tables))
        tdf = self.tic_dat_factory
        rtn = {}
        for table in set(tdf.all_tables).difference(
                tdf.generator_tables).difference(missing_tables):
            fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
            if not fields:
                # generic tables take their field names from the db itself
                assert table in tdf.generic_tables
                fields = tuple(x[1] for x in con.execute("PRAGMA table_info(%s)" % table))
            rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
            for row in con.execute("Select %s from [%s]" % (
                    ", ".join(_brackets(fields)), table_names[table])):
                if table in tdf.generic_tables:
                    rtn[table].append({f: self._read_data_cell(table, f, d)
                                       for f, d in zip(fields, row)})
                else:
                    pkfs = tdf.primary_key_fields.get(table, ())
                    pk = tuple(self._read_data_cell(table, f, x)
                               for f, x in zip(pkfs, row[:len(pkfs)]))
                    data = [self._read_data_cell(table, f, x)
                            for f, x in zip(fields[len(pkfs):], row[len(pkfs):])]
                    if dictish(rtn[table]):
                        rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data
                    else:
                        rtn[table].append(data)
        return rtn
    def _ordered_tables(self):
        # topological sort so that foreign tables are created before native ones
        rtn = []
        fks = self._fks()
        def processTable(t):
            if t not in rtn:
                for fk in fks.get(t, ()):
                    processTable(fk.foreign_table)
                rtn.append(t)
        list(map(processTable, self.tic_dat_factory.all_tables))
        return tuple(rtn)
    def _get_schema_sql(self, tables):
        # Emit CREATE TABLE statements (with defaults, FKs and PKs) for `tables`.
        assert not set(self.tic_dat_factory.generic_tables).intersection(tables)
        rtn = []
        fks = self._fks()
        default_ = lambda t, f: self.tic_dat_factory.default_values[t][f]
        def data_type(t, f):
            if t == "parameters" and self.tic_dat_factory.parameters:
                return "" # the TEXT data type doesn't seem to have much value for my purposes.
            def_ = default_(t, f)
            if numericish(def_):
                if safe_apply(int)(def_) == def_:
                    return "INT"
                return "FLOAT"
            return "" # the TEXT data type doesn't seem to have much value for my purposes.
        for t in [_ for _ in self._ordered_tables() if _ in tables]:
            create_stmt = "Create TABLE [%s] (\n" % t
            strl = _brackets(self.tic_dat_factory.primary_key_fields.get(t, ())) + \
                   ["[%s] " % f + data_type(t, f) + " default [%s]" % default_(t, f)
                    for f in self.tic_dat_factory.data_fields.get(t, ())]
            for fk in fks.get(t, ()):
                nativefields, foreignfields = zip(*(fk.nativetoforeignmapping().items()))
                strl.append("FOREIGN KEY(%s) REFERENCES [%s](%s)" % (
                    ",".join(_brackets(nativefields)), fk.foreign_table,
                    ",".join(_brackets(foreignfields))))
            if self.tic_dat_factory.primary_key_fields.get(t):
                strl.append("PRIMARY KEY(%s)" % (",".join(
                    _brackets(self.tic_dat_factory.primary_key_fields[t]))))
            create_stmt += ",\n".join(strl) + "\n);"
            rtn.append(create_stmt)
        return tuple(rtn)
    def _write_data_cell(self, t, f, x):
        # booleans round-trip as the strings "True"/"False"
        if x is True or x is False:
            return str(x)
        return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
    def _get_data(self, tic_dat, as_sql):
        # Build the INSERT statements for every table; either fully rendered SQL
        # strings (as_sql) or (statement, parameters) pairs for executemany-style use.
        rtn = []
        for t in self.tic_dat_factory.all_tables:
            _t = getattr(tic_dat, t)
            if dictish(_t):
                primarykeys = tuple(self.tic_dat_factory.primary_key_fields[t])
                for pkrow, sqldatarow in _t.items():
                    _items = list(sqldatarow.items())
                    fields = primarykeys + tuple(x[0] for x in _items)
                    datarow = ((pkrow,) if len(primarykeys) == 1 else pkrow) + \
                              tuple(x[1] for x in _items)
                    assert len(datarow) == len(fields)
                    datarow = tuple(self._write_data_cell(t, f, x)
                                    for f, x in zip(fields, datarow))
                    stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % (
                        t, ",".join(_brackets(fields)),
                        ",".join("%s" if as_sql else "?" for _ in fields))
                    if as_sql:
                        rtn.append((stmt % tuple(map(_insert_format, datarow))) + ";")
                    else:
                        rtn.append((stmt, datarow))
            else:
                for sqldatarow in (_t if containerish(_t) else _t()):
                    k, v = zip(*sqldatarow.items())
                    stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % \
                           (t, ",".join(_brackets(k)),
                            ",".join(["%s" if as_sql else "?"] * len(sqldatarow)))
                    if as_sql:
                        rtn.append((stmt % tuple(map(_insert_format, v))) + ";")
                    else:
                        rtn.append((stmt, v))
        return tuple(rtn)
    def write_db_schema(self, db_file_path):
        """
        :param db_file_path: the file path of the SQLite database to create

        :return:
        """
        verify(not self.tic_dat_factory.generic_tables,
               "generic_tables are not compatible with write_db_schema. " +
               "Use write_db_data instead.")
        with _sql_con(db_file_path, foreign_keys=False) as con:
            for stmt in self._get_schema_sql(self.tic_dat_factory.all_tables):
                con.execute(stmt)
    def write_db_data(self, tic_dat, db_file_path, allow_overwrite=False):
        """
        write the ticDat data to an SQLite database file

        :param tic_dat: the data object to write

        :param db_file_path: the file path of the SQLite database to populate

        :param allow_overwrite: boolean - are we allowed to overwrite pre-existing data

        :return:

        caveats : True, False are written as "True", "False".
                  Also see infinity_io_flag __doc__
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(db_file_path), "A directory is not a valid SQLite file path")
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.sql.write_db_data(dat, db_file_path, allow_overwrite)
        if not os.path.exists(db_file_path):
            self.write_db_schema(db_file_path)
        table_names = self._check_tables_fields(db_file_path,
                                                self.tic_dat_factory.all_tables)
        with _sql_con(db_file_path, foreign_keys=False) as con:
            for t in self.tic_dat_factory.all_tables:
                verify(table_names.get(t) == t,
                       "Failed to find table %s in path %s" % (t, db_file_path))
                verify(allow_overwrite or
                       not any(True for _ in con.execute("Select * from [%s]" % t)),
                       "allow_overwrite is False, but there are already data records in %s" % t)
                if allow_overwrite:
                    con.execute("Delete from [%s]" % t)
            for sql_str, data in self._get_data(tic_dat, as_sql=False):
                con.execute(sql_str, list(data))
    def write_sql_file(self, tic_dat, sql_file_path, include_schema=False,
                       allow_overwrite=False):
        """
        write the sql for the ticDat data to a text file

        :param tic_dat: the data object to write

        :param sql_file_path: the path of the text file to hold the sql statements
                              for the data

        :param include_schema: boolean - should we write the schema sql first?

        :param allow_overwrite: boolean - are we allowed to overwrite pre-existing file

        :return:

        caveats : float("inf"), float("-inf") are written as "inf", "-inf" (unless
                  infinity_io_flag is being applied).
                  True/False are written as "True", "False"
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        verify(allow_overwrite or not os.path.exists(sql_file_path),
               "The %s path exists and overwrite is not allowed" % sql_file_path)
        must_schema = set(self.tic_dat_factory.all_tables if include_schema else [])
        if self.tic_dat_factory.generic_tables:
            gt = self.tic_dat_factory.generic_tables
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.sql._write_sql_file(dat, sql_file_path, must_schema.union(gt))
        return self._write_sql_file(tic_dat, sql_file_path, must_schema)
    def _write_sql_file(self, tic_dat, sql_file_path, schema_tables):
        with open(sql_file_path, "w") as f:
            for stmt in self._get_schema_sql(schema_tables) + \
                        self._get_data(tic_dat, as_sql=True):
                f.write(stmt + "\n")
class CsvTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with ticDat objects.
    Your system will need the csv package if you want to use this class.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A CsvTicDatFactory will
        automatically be associated with the csv attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True
    def create_tic_dat(self, dir_path, dialect='excel', headers_present=True,
                       freeze_it=False):
        """
        Create a TicDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param headers_present: Boolean. Does the first row of data contain
                                the column headers?

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching files.

        caveats: Missing files resolve to an empty table, but missing fields
                 on matching files throw an Exception.
                 Data field values (but not primary key values) will be coerced
                 into floats if possible.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = self.tic_dat_factory.TicDat(
            **self._create_tic_dat(dir_path, dialect, headers_present))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _create_tic_dat(self, dir_path, dialect, headers_present):
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        rtn = {t: self._create_table(dir_path, t, dialect, headers_present)
               for t in self.tic_dat_factory.all_tables}
        missing_tables = {t for t in self.tic_dat_factory.all_tables if not rtn[t]}
        if missing_tables:
            print("The following table names could not be found in the %s directory.\n%s\n"
                  % (dir_path, "\n".join(missing_tables)))
        # missing tables are simply omitted; TicDat() fills them in as empty
        return {k: v for k, v in rtn.items() if v}
    def find_duplicates(self, dir_path, dialect='excel', headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param dir_path: the directory containing .csv files.

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param headers_present: Boolean. Does the first row of data contain
                                the column headers?

        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates

        caveats: Missing files resolve to an empty table, but missing fields
                 (data or primary key) on matching files throw an Exception.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        tdf = self.tic_dat_factory
        rtn = {t: defaultdict(int) for t, _ in tdf.primary_key_fields.items()
               if _ and self._get_file_path(dir_path, t)}
        for t in rtn:
            with open(self._get_file_path(dir_path, t)) as csvfile:
                for r in self._get_data(csvfile, t, dialect, headers_present):
                    p_key = r[tdf.primary_key_fields[t][0]] \
                            if len(tdf.primary_key_fields[t]) == 1 else \
                            tuple(r[_] for _ in tdf.primary_key_fields[t])
                    rtn[t][p_key] += 1
        for t in list(rtn.keys()):
            # only keep genuine duplicates
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del rtn[t]
        return rtn
    def _get_file_path(self, dir_path, table):
        # case/space-insensitive match of "<table>.csv" within dir_path
        rtn = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
               if os.path.isfile(path) and
               f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
        verify(len(rtn) <= 1, "duplicate .csv files found for %s"%table)
        if rtn:
            return rtn[0]
    def _get_data(self, csvfile, table, dialect, headers_present):
        # Yield one {field: value} dict per csv row, float-coercing where possible.
        # With headers present, file column names are matched case-insensitively.
        tdf = self.tic_dat_factory
        fieldnames = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
        assert fieldnames or table in self.tic_dat_factory.generic_tables
        for row in csv.DictReader(csvfile, dialect=dialect,
                **({"fieldnames": fieldnames} if not headers_present else {})):
            if not headers_present:
                verify(len(row) == len(fieldnames),
                       "Need %s columns for table %s"%(len(fieldnames), table))
                yield {f: _try_float(row[f]) for f in fieldnames}
            else:
                key_matching = defaultdict(list)
                for k, f in product(row.keys(), fieldnames or row.keys()):
                    if k.lower() == f.lower():
                        key_matching[f].append(k)
                # generic tables take their fields from the file itself
                fieldnames = fieldnames or row.keys()
                for f in fieldnames:
                    verify(f in key_matching,
                           "Unable to find field name %s for table %s"%(f, table))
                    verify(len(key_matching[f]) <= 1,
                           "Duplicate field names found for field %s table %s"%(f, table))
                yield {f: _try_float(row[key_matching[f][0]]) for f in fieldnames}
    def _create_table(self, dir_path, table, dialect, headers_present):
        # Read one csv file into the structure appropriate for the table:
        # generator-returning callable, pk-keyed dict, or list of rows.
        # Returns None when no matching file exists.
        file_path = self._get_file_path(dir_path, table)
        if not (file_path and os.path.isfile(file_path)):
            return
        tdf = self.tic_dat_factory
        if table in tdf.generator_tables:
            def rtn():
                with open(file_path) as csvfile:
                    for r in self._get_data(csvfile, table, dialect, headers_present):
                        yield tuple(r[_] for _ in tdf.data_fields[table])
        else:
            rtn = {} if tdf.primary_key_fields.get(table) else []
            with open(file_path) as csvfile:
                for r in self._get_data(csvfile, table, dialect, headers_present):
                    if tdf.primary_key_fields.get(table):
                        p_key = r[tdf.primary_key_fields[table][0]] \
                                if len(tdf.primary_key_fields[table]) == 1 else \
                                tuple(r[_] for _ in tdf.primary_key_fields[table])
                        rtn[p_key] = tuple(r[_] for _ in tdf.data_fields.get(table, ()))
                    elif table in tdf.generic_tables:
                        rtn.append(r)
                    else:
                        rtn.append(tuple(r[_] for _ in tdf.data_fields[table]))
        return rtn
    def write_directory(self, tic_dat, dir_path, allow_overwrite=False, dialect='excel',
                        write_header=True):
        """
        write the ticDat data to a collection of csv files

        :param tic_dat: the data object

        :param dir_path: the directory in which to write the csv files

        :param allow_overwrite: boolean - are we allowed to overwrite existing files?

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param write_header: Boolean. Should the header information be written
                             as the first row?

        :return:
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect,
                                           write_header)
        tdf = self.tic_dat_factory
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        if not allow_overwrite:
            for t in tdf.all_tables:
                f = os.path.join(dir_path, t + ".csv")
                verify(not os.path.exists(f),
                       "The %s path exists and overwrite is not allowed"%f)
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        for t in tdf.all_tables:
            f = os.path.join(dir_path, t + ".csv")
            # newline='' per the csv module docs - otherwise the csv writer's own
            # "\r\n" line endings are doubled on Windows
            with open(f, 'w', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, dialect=dialect, fieldnames=
                            tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ()))
                if write_header:
                    writer.writeheader()
                _t = getattr(tic_dat, t)
                if dictish(_t):
                    for p_key, data_row in _t.items():
                        primaryKeyDict = {f: v for f, v in zip(tdf.primary_key_fields[t],
                                          p_key if containerish(p_key) else (p_key,))}
                        writer.writerow(dict(data_row, **primaryKeyDict))
                else:
                    for data_row in (_t if containerish(_t) else _t()):
                        writer.writerow(dict(data_row))
class XlsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with ticDat objects.
    Your system will need the xlrd package to read .xls and .xlsx files,
    the xlwt package to write .xls files, and the xlsxwriter package
    to write .xlsx files.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A XlsTicDatFactory will
        automatically be associated with the xls attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True
    def create_tic_dat(self, xls_file_path, row_offsets={}, headers_present=True,
                       treat_large_as_inf=False, freeze_it=False):
        """
        Create a TicDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip

        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        :param treat_large_as_inf: Boolean. Treat numbers >= 1e100 as infinity
                                   Generally only needed for .xlsx files that were
                                   themselves created by ticdat (see write_file docs)

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive, and white space is
                 replaced with underscore for table name matching.
                 Field names are considered case insensitive, but white space is
                 respected. (ticdat supports whitespace in field names but not
                 table names).
                 The following two caveats apply only if data_types are used.
                 --> Any field for which an empty string is invalid data and None
                     is valid data will replace the empty string with None.
                 --> Any field for which must_be_int is true will replace numeric
                     data that satisfies int(x)==x with int(x). In other words,
                     the ticdat equivalent of pandas.read_excel convert_float is
                     to set must_be_int to true in data_types.
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(not (treat_large_as_inf and tdf.generator_tables),
               "treat_large_as_inf not implemented for generator tables")
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(utils.DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = tdf.TicDat(**self._create_tic_dat_dict(
            xls_file_path, row_offsets, headers_present))
        # fields where '' is invalid but None is valid get '' replaced with None
        replaceable = defaultdict(dict)
        for t, dfs in tdf.data_types.items():
            replaceable[t] = {df for df, dt in dfs.items()
                              if (not dt.valid_data('')) and dt.valid_data(None)}
        for t in set(tdf.all_tables).difference(tdf.generator_tables,
                                                tdf.generic_tables):
            _t = getattr(rtn, t)
            for r in _t.values() if utils.dictish(_t) else _t:
                for f, v in r.items():
                    if f in replaceable[t] and v == '':
                        r[f] = None
                    elif treat_large_as_inf:
                        # undo the +/- 1e100 stand-ins written by _xlsx_write
                        if v >= _xlsx_hack_inf:
                            r[f] = float("inf")
                        if v <= -_xlsx_hack_inf:
                            r[f] = -float("inf")
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _verify_differentiable_sheet_names(self):
        # sheet names are truncated to _longest_sheet characters; table names
        # must remain distinct after truncation
        rtn = defaultdict(set)
        for t in self.tic_dat_factory.all_tables:
            rtn[t[:_longest_sheet]].add(t)
        rtn = [v for k, v in rtn.items() if len(v) > 1]
        verify(not rtn,
               "The following tables collide when names are truncated to %s characters.\n%s"
               % (_longest_sheet, sorted(map(sorted, rtn))))
    def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets,
                               headers_present):
        # Open the workbook and resolve, for each table, the matching sheet and
        # the column index of each field; complains about missing/duplicated names.
        verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
               "xls_file_path argument %s is not a valid file path." % xls_file_path)
        try:
            book = xlrd.open_workbook(xls_file_path)
        except Exception as e:
            # NOTE: was "e.message", which doesn't exist in Python 3 and masked
            # the real failure with an AttributeError
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        sheets = defaultdict(list)
        for table, sheet in product(all_tables, book.sheets()):
            if table.lower()[:_longest_sheet] == \
               sheet.name.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        field_indicies, missing_fields, dup_fields = {}, {}, {}
        for table, sheet in sheets.items():
            field_indicies[table], missing_fields[table], dup_fields[table] = \
                self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
        verify(not any(_ for _ in missing_fields.values()),
               "The following field names could not be found : \n" +
               "\n".join("%s : " % t + ",".join(bf)
                         for t, bf in missing_fields.items() if bf))
        verify(not any(_ for _ in dup_fields.values()),
               "The following field names were duplicated : \n" +
               "\n".join("%s : " % t + ",".join(bf)
                         for t, bf in dup_fields.items() if bf))
        return sheets, field_indicies
    def _create_generator_obj(self, xlsFilePath, table, row_offset, headers_present):
        # Build a callable returning a generator that re-reads the sheet on demand.
        tdf = self.tic_dat_factory
        ho = 1 if headers_present else 0
        def tableObj():
            sheets, field_indicies = self._get_sheets_and_fields(
                xlsFilePath, (table,), {table: row_offset}, headers_present)
            if table in sheets:
                sheet = sheets[table]
                table_len = min(len(sheet.col_values(field_indicies[table][field]))
                                for field in tdf.data_fields[table])
                for x in (sheet.row_values(i)
                          for i in range(table_len)[row_offset + ho:]):
                    yield self._sub_tuple(table, tdf.data_fields[table],
                                          field_indicies[table])(x)
        return tableObj
    def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present):
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables},
                           **row_offsets)
        tdf = self.tic_dat_factory
        rtn = {}
        sheets, field_indicies = self._get_sheets_and_fields(
            xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables),
            row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for tbl, sheet in sheets.items():
            fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(tbl, ())
            assert fields or tbl in self.tic_dat_factory.generic_tables
            indicies = field_indicies[tbl]
            # shortest populated column bounds the usable row range
            table_len = min(len(sheet.col_values(indicies[field]))
                            for field in (fields or indicies))
            if tdf.primary_key_fields.get(tbl, ()):
                tableObj = {self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies)(x):
                            self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])}
            elif tbl in tdf.generic_tables:
                tableObj = [{f: x[i] for f, i in field_indicies[tbl].items()}
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            else:
                tableObj = [self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            rtn[tbl] = tableObj
        for tbl in tdf.generator_tables:
            rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl],
                                                  headers_present)
        return rtn
    def find_duplicates(self, xls_file_path, row_offsets={}, headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema (non primary key tables
                              ignored).

        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip (non primary key tables ignored)

        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        caveats: Missing sheets resolve to an empty table, but missing primary
                 fields on matching sheets throw an Exception.
                 Sheet names are considered case insensitive.

        :return: A dictionary whose keys are the table names for the primary key
                 tables. Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables},
                           **row_offsets)
        tdf = self.tic_dat_factory
        pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
        rtn = {t: defaultdict(int) for t in pk_tables}
        sheets, fieldIndicies = self._get_sheets_and_fields(
            xls_file_path, pk_tables, row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for table, sheet in sheets.items():
            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
            indicies = fieldIndicies[table]
            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
            for x in (sheet.row_values(i)
                      for i in range(table_len)[row_offsets[table] + ho:]):
                rtn[table][self._sub_tuple(table, tdf.primary_key_fields[table],
                                           indicies)(x)] += 1
        for t in list(rtn.keys()):
            # only keep genuine duplicates
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del rtn[t]
        return rtn
    def _sub_tuple(self, table, fields, field_indicies):
        # Returns a function mapping a raw sheet row to either a bare value
        # (single field) or a tuple, applying the must_be_int coercion.
        assert set(fields).issubset(field_indicies)
        data_types = self.tic_dat_factory.data_types
        def _convert_float(x, field):
            rtn = x[field_indicies[field]]
            if utils.numericish(rtn) and utils.safe_apply(int)(rtn) == rtn and \
               table in data_types and field in data_types[table] and \
               data_types[table][field].must_be_int:
                return int(rtn)
            return rtn
        def rtn(x):
            if len(fields) == 1:
                return _convert_float(x, fields[0])
            return tuple(_convert_float(x, field) for field in fields)
        return rtn
    def _get_field_indicies(self, table, sheet, row_offset, headers_present):
        # Returns (found {field: col index}, missing fields, duplicated fields).
        fields = self.tic_dat_factory.primary_key_fields.get(table, ()) + \
                 self.tic_dat_factory.data_fields.get(table, ())
        if not headers_present:
            # positional matching: schema order determines the column of each field
            row_len = len(sheet.row_values(row_offset)) if sheet.nrows > 0 else len(fields)
            return ({f: i for i, f in enumerate(fields) if i < row_len},
                    [f for i, f in enumerate(fields) if i >= row_len], [])
        if sheet.nrows - row_offset <= 0:
            return {}, fields, []
        if table in self.tic_dat_factory.generic_tables:
            temp_rtn = defaultdict(list)
            for ind, val in enumerate(sheet.row_values(row_offset)):
                temp_rtn[val].append(ind)
        else:
            temp_rtn = {field: list() for field in fields}
            for field, (ind, val) in product(fields,
                                             enumerate(sheet.row_values(row_offset))):
                if field == val or (all(map(utils.stringish, (field, val))) and
                                    field.lower() == val.lower()):
                    temp_rtn[field].append(ind)
        return ({field: inds[0] for field, inds in temp_rtn.items() if len(inds) == 1},
                [field for field, inds in temp_rtn.items() if len(inds) == 0],
                [field for field, inds in temp_rtn.items() if len(inds) > 1])
    def write_file(self, tic_dat, file_path, allow_overwrite=False):
        """
        write the ticDat data to an excel file

        :param tic_dat: the data object to write (typically a TicDat)

        :param file_path: The file path of the excel file to create
                          Needs to end in either ".xls" or ".xlsx"
                          The latter is capable of writing out larger tables,
                          but the former handles infinity seamlessly.
                          If ".xlsx", then be advised that +/- float("inf") will
                          be replaced with +/- 1e+100

        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

        :return:

        caveats: None may be written out as an empty string. This reflects the
                 behavior of xlwt.
        """
        self._verify_differentiable_sheet_names()
        verify(utils.stringish(file_path) and
               (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
               "file_path argument needs to end in .xls or .xlsx")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        verify(allow_overwrite or not os.path.exists(file_path),
               "The %s path exists and overwrite is not allowed" % file_path)
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.xls.write_file(dat, file_path, allow_overwrite)
        if file_path.endswith(".xls"):
            self._xls_write(tic_dat, file_path)
        else:
            self._xlsx_write(tic_dat, file_path)
    def _xls_write(self, tic_dat, file_path):
        verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
        tdf = self.tic_dat_factory
        book = xlwt.Workbook()
        # secondary sort by pk-count keeps the sheet ordering deterministic
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            sheet = book.add_sheet(t[:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate(
                            (p_key if containerish(p_key) else (p_key,)) +
                            tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind, cell)
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(
                            tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind, cell)
        if os.path.exists(file_path):
            os.remove(file_path)
        book.save(file_path)
    def _xlsx_write(self, tic_dat, file_path):
        verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
        tdf = self.tic_dat_factory
        if os.path.exists(file_path):
            os.remove(file_path)
        book = xlsx.Workbook(file_path)
        def clean_inf(x):
            # xlsx can't represent infinity; use the +/- 1e100 stand-ins that
            # create_tic_dat(treat_large_as_inf=True) knows how to undo
            if x == float("inf"):
                return _xlsx_hack_inf
            if x == -float("inf"):
                return -_xlsx_hack_inf
            return x
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            sheet = book.add_worksheet(t)
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate(
                            (p_key if containerish(p_key) else (p_key,)) +
                            tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(
                            tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
        book.close()
class OpalyticsPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading PanDat objects from the Opalytics Cloud Platform.
    Not expected to be used outside of Opalytics Cloud hosted notebooks.
    Don't create this object explicitly. An OpalyticsPanFactory will
    automatically be associated with the opalytics attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will
        automatically be associated with the json attribute of the parent
        PanDatFactory.

        :param pan_dat_factory:
        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def _find_table_matchings(self, inputset):
        # Map each schema table name to the inputset.schema keys that match it
        # under case insensitivity (spaces in the inputset key become underscores).
        rtn = defaultdict(list)
        for t, x in product(self.pan_dat_factory.all_tables, inputset.schema):
            if stringish(x) and t.lower() == x.lower().replace(" ", "_"):
                rtn[t].append(x)
        return rtn

    def _good_inputset(self, inputset, message_writer=lambda x: x):
        # Sanity-check the duck-typed inputset; report problems via message_writer.
        # Bug fix: the original `not hasattr(...) and check(...)` could never flag a
        # present-but-malformed attribute, and raised AttributeError when absent.
        if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
            message_writer("Failed to find dictish schema attribute")
            return False
        if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
            message_writer("Failed to find callable getTable attribute")
            return False
        table_matchings = self._find_table_matchings(inputset)
        badly_matched = {t for t, v in table_matchings.items() if len(v) != 1}
        if badly_matched:
            message_writer(
                "Following tables could not be uniquely resolved in inputset.schema\n%s"
                % badly_matched)
            return False
        return True

    def create_pan_dat(self, inputset, raw_data=False, freeze_it=False):
        """
        Create a PanDat object from an opalytics inputset

        :param inputset: An opalytics inputset consistent with this PanDatFactory
        :param raw_data: boolean. should data cleaning be skipped? On the Opalytics Cloud Platform
                         cleaned data will be passed to instant apps. Data cleaning involves
                         removing data type failures, data row predicate failures, foreign key
                         failures, duplicated rows and deactivated records.
        :return: a PanDat object populated by the tables as they are rendered by inputset
        """
        # NOTE(review): freeze_it is accepted but not acted upon here — confirm intent.
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this PanDatFactory : %s"
               % (message or [None])[0])
        for t in self.pan_dat_factory.all_tables:
            all_fields = set(self.pan_dat_factory.primary_key_fields[t]).\
                         union(self.pan_dat_factory.data_fields[t])
            # bug fix: the message now actually interpolates the table name
            verify("_active" not in all_fields,
                   ("Table %s has a field named '_active'.\n" +
                    "This conflicts with internal data processing.\n" +
                    "Don't use '_active' in your PanDatFactory definition if you want to use this reader.")
                   % t)
        tms = {k: v[0] for k, v in self._find_table_matchings(inputset).items()}
        ia = {}
        # inspect.getargspec was removed in Python 3.11; prefer getfullargspec when present
        _argspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
        if "includeActive" in _argspec(inputset.getTable)[0]:
            ia = {"includeActive": not raw_data}
        rtn = self.pan_dat_factory.PanDat(
            **{t: inputset.getTable(tms[t], **ia) for t in tms})
        # strip the internal '_active' marker column, keeping only active rows
        for t in self.pan_dat_factory.all_tables:
            df = getattr(rtn, t)
            if "_active" in df.columns:
                df = df[df["_active"]].drop('_active', axis=1)
                setattr(rtn, t, df)
        if not raw_data:
            def removing():
                # One cleaning pass; returns truthy when any rows were removed so
                # the caller can iterate to a fixed point.
                removal_occured = set()
                for (t, _), brs in list(self.pan_dat_factory.find_data_type_failures(
                            rtn, as_table=False).items()) + \
                        list(self.pan_dat_factory.find_data_row_failures(
                            rtn, as_table=False).items()) + \
                        [((t, None), brs) for t, brs in self.pan_dat_factory.find_duplicates(
                            rtn, as_table=False).items()]:
                    if t not in removal_occured:
                        removal_occured.add(t)
                        setattr(rtn, t, getattr(rtn, t)[[not _ for _ in brs]])
                fkfs = self.pan_dat_factory.find_foreign_key_failures(rtn)
                if fkfs:
                    self.pan_dat_factory.remove_foreign_key_failures(rtn)
                return removal_occured or fkfs
            while removing():
                pass
        return rtn
class JsonTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json files with ticDat objects.
    You need the json package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A JsonTicFactory will
        automatically be associated with the json attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True

    def create_tic_dat(self, json_file_path, freeze_it=False):
        """
        Create a TicDat object from a json file

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching tables.
        caveats: Table names matches are case insensitive and also
                 underscore-space insensitive.
                 Tables that don't find a match are interpreted as an empty table.
                 Dictionary keys that don't match any table are ignored.
        """
        _standard_verify(self.tic_dat_factory)
        jdict = self._create_jdict(json_file_path)
        tic_dat_dict = self._create_tic_dat_dict(jdict)
        missing_tables = set(
            self.tic_dat_factory.all_tables).difference(tic_dat_dict)
        if missing_tables:
            # missing tables are tolerated (resolve to empty) but worth flagging
            print(
                "The following table names could not be found in the %s file.\n%s\n"
                % (json_file_path, "\n".join(missing_tables)))
        rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, json_file_path):
        """
        Find the row counts for duplicated rows.

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.
        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the json entry with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory)
        jdict = self._create_jdict(json_file_path)
        rtn = find_duplicates_from_dict_ticdat(self.tic_dat_factory, jdict)
        return rtn or {}

    def _create_jdict(self, json_file_path):
        # Load and validate the raw json payload: must be a dict of string keys
        # mapping to containers.
        verify(os.path.isfile(json_file_path),
               "json_file_path is not a valid file path.")
        try:
            with open(json_file_path, "r") as fp:
                jdict = json.load(fp)
        except Exception as e:
            # bug fix: Python 3 exceptions have no .message attribute
            raise TicDatError("Unable to interpret %s as json file : %s"
                              % (json_file_path, e))
        verify(dictish(jdict),
               "%s failed to load a dictionary" % json_file_path)
        verify(
            all(map(stringish, jdict)),
            "The dictionary loaded from %s isn't indexed by strings"
            % json_file_path)
        verify(
            all(map(containerish, jdict.values())),
            "The dictionary loaded from %s doesn't have containers as values"
            % json_file_path)
        return jdict

    def _create_tic_dat_dict(self, jdict):
        # Match json keys to schema tables (case and underscore-space insensitive),
        # rejecting ambiguous matches.
        tdf = self.tic_dat_factory
        rtn = {}
        table_keys = defaultdict(list)
        for t in tdf.all_tables:
            for t2 in jdict:
                if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                    table_keys[t].append(t2)
            if len(table_keys[t]) >= 1:
                verify(len(table_keys[t]) < 2,
                       "Found duplicate matching keys for table %s" % t)
                rtn[t] = jdict[table_keys[t][0]]
        return rtn

    def write_file(self, tic_dat, json_file_path, allow_overwrite=False,
                   verbose=False):
        """
        write the ticDat data to a json file

        :param tic_dat: the data object to write (typically a TicDat)
        :param json_file_path: The file path of the json file to create.
        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?
        :param verbose: boolean. Verbose mode writes the data rows as dicts
                        keyed by field name. Otherwise, they are lists.
        :return:
        """
        _standard_verify(self.tic_dat_factory)
        verify(not (os.path.exists(json_file_path) and not allow_overwrite),
               "%s exists and allow_overwrite is not enabled" % json_file_path)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(
                tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(msg))
        jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose)
        with open(json_file_path, "w") as fp:
            json.dump(jdict, fp, sort_keys=True, indent=2)
class MdbTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Access/MDB files with ticDat objects.
    Your system will need the required pypyodbc package if you want to actually
    do something with it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A MdbTicDatFactory will
        automatically be associated with the mdb attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(
            tic_dat_factory)
        self._isFrozen = True

    def create_tic_dat(self, mdb_file_path, freeze_it=False):
        """
        Create a TicDat object from an Access MDB file

        :param mdb_file_path: An Access db with a consistent schema.
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching tables.
        caveats : Numbers with absolute values larger than 1e+100 will
                  be read as float("inf") or float("-inf")
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        rtn = self.tic_dat_factory.TicDat(
            **self._create_tic_dat(mdb_file_path))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, mdb_file_path):
        """
        Find the row counts for duplicated rows.

        :param mdb_file_path: An Access db with a consistent schema.
        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the mdb table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        if not self._duplicate_focused_tdf:
            return {}
        return find_duplicates(
            self._duplicate_focused_tdf.mdb.create_tic_dat(mdb_file_path),
            self._duplicate_focused_tdf)

    def _get_table_names(self, db_file_path, tables):
        # Resolve each schema table to the actual Access table name, allowing
        # underscore/space variations; fail on zero or multiple matches.
        rtn = {}
        with _connect(_connection_str(db_file_path)) as con:
            def try_name(name):
                with con.cursor() as cur:
                    try:
                        cur.execute("Select * from [%s]" % name)
                    except Exception:  # narrowed from bare except
                        return False
                return True
            for table in tables:
                rtn[table] = [
                    t for t in all_underscore_replacements(table)
                    if try_name(t)
                ]
                verify(
                    len(rtn[table]) >= 1,
                    "Unable to recognize table %s in MS Access file %s"
                    % (table, db_file_path))
                verify(
                    len(rtn[table]) <= 1,
                    "Duplicate tables found for table %s in MS Access file %s"
                    % (table, db_file_path))
                rtn[table] = rtn[table][0]
        return rtn

    def _check_tables_fields(self, mdb_file_path, tables):
        # Verify the file opens and every schema field exists (case-insensitively)
        # in the corresponding Access table; returns the resolved table names.
        tdf = self.tic_dat_factory
        verify(os.path.exists(mdb_file_path),
               "%s isn't a valid file path" % mdb_file_path)
        try:
            with _connect(_connection_str(mdb_file_path)) as _:
                pass
        except Exception as e:
            # bug fix: Python 3 exceptions have no .message attribute
            raise TicDatError("Unable to open %s as MS Access file : %s"
                              % (mdb_file_path, e))
        table_names = self._get_table_names(mdb_file_path, tables)
        with _connect(_connection_str(mdb_file_path)) as con:
            for table in tables:
                with con.cursor() as cur:
                    cur.execute("Select * from [%s]" % table_names[table])
                    fields = set(_[0].lower() for _ in cur.description)
                    for field in tdf.primary_key_fields.get(
                            table, ()) + tdf.data_fields.get(table, ()):
                        verify(
                            field.lower() in fields,
                            "Unable to recognize field %s in table %s for file %s"
                            % (field, table, mdb_file_path))
        return table_names

    def _create_gen_obj(self, mdbFilePath, table, table_name):
        # Build a generator factory for a (keyless) generator table; rows are
        # read lazily each time the returned callable is invoked.
        tdf = self.tic_dat_factory
        def tableObj():
            assert (not tdf.primary_key_fields.get(table)) and (
                tdf.data_fields.get(table))
            with _connect(_connection_str(mdbFilePath)) as con:
                with con.cursor() as cur:
                    cur.execute("Select %s from [%s]" % (", ".join(
                        _brackets(tdf.data_fields[table])), table_name))
                    for row in cur.fetchall():
                        yield list(map(_read_data, row))
        return tableObj

    def _create_tic_dat(self, mdbFilePath):
        # Read every table into the dict-of-tables form TicDat(**...) expects.
        tdf = self.tic_dat_factory
        table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables)
        rtn = {}
        with _connect(_connection_str(mdbFilePath)) as con:
            for table in set(tdf.all_tables).difference(tdf.generator_tables):
                fields = tdf.primary_key_fields.get(table, ()) + \
                         tdf.data_fields.get(table, ())
                # primary-keyed tables become dicts; keyless tables become lists
                rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
                with con.cursor() as cur:
                    cur.execute(
                        "Select %s from [%s]"
                        % (", ".join(_brackets(fields)), table_names[table]))
                    for row in cur.fetchall():
                        pk = row[:len(tdf.primary_key_fields.get(table, ()))]
                        data = list(map(
                            _read_data,
                            row[len(tdf.primary_key_fields.get(table, ())):]))
                        if dictish(rtn[table]):
                            rtn[table][pk[0] if len(pk) == 1 else
                                       tuple(pk)] = data
                        else:
                            rtn[table].append(data)
        for table in tdf.generator_tables:
            rtn[table] = self._create_gen_obj(mdbFilePath, table,
                                              table_names[table])
        return rtn

    @property
    def can_write_new_file(self):
        """
        :return: True if this environment can write to a new mdb database
                 files, False otherwise
        """
        return _write_new_file_works

    def write_schema(self, mdb_file_path, **field_types):
        """
        :param mdb_file_path: The file path of the mdb database to create

        :param field_types: Named arguments are table names. Argument values
                            are mapping of field name to field type.
                            Allowable field types are text, double and int
                            If missing, primary key fields are text, and data
                            fields are double
        :return:
        """
        verify(
            not self.tic_dat_factory.generic_tables,
            "generic_tables are not compatible with write_schema. " +
            "Use write_file instead.")
        # NOTE(review): passing generic_tables here mirrors the other mdb methods,
        # but JsonTicFactory passes the factory itself — confirm _standard_verify's contract.
        _standard_verify(self.tic_dat_factory.generic_tables)
        verify(dictish(field_types), "field_types should be a dict")
        for k, v in field_types.items():
            verify(k in self.tic_dat_factory.all_tables,
                   "%s isn't a table name" % k)
            verify(dictish(v),
                   "Need a mapping from field names to field types for %s" % k)
            for fld, type_ in v.items():
                verify(
                    fld in self.tic_dat_factory.primary_key_fields.get(k, ()) +
                    self.tic_dat_factory.data_fields.get(k, ()),
                    "%s isn't a field name for table %s" % (fld, k))
                verify(
                    type_ in ("text", "double", "int"),
                    "For table %s, field %s, %s isn't one of (text, double, int)"
                    % (k, fld, type_))
        get_fld_type = lambda tbl, fld, default: field_types.get(tbl, {}).get(
            fld, default)
        if not os.path.exists(mdb_file_path):
            verify(
                mdb_file_path.endswith(".mdb")
                or mdb_file_path.endswith(".accdb"),
                "For file creation, specify either an .mdb or .accdb file name"
            )
            if mdb_file_path.endswith(".mdb"):
                verify(
                    py,
                    "pypyodbc needs to be installed to create a new .mdb file")
                verify(self.can_write_new_file,
                       "Creating a new file not enabled for this OS")
                py.win_create_mdb(mdb_file_path)
            else:
                # .accdb files are created by copying a bundled blank database
                blank_accdb = os.path.join(_code_dir(), "blank.accdb")
                verify(
                    os.path.exists(blank_accdb)
                    and os.path.isfile(blank_accdb),
                    "You need to run accdb_create_setup.py as a post pip install operation "
                    + "to configure writing to new .accdb files.")
                shutil.copy(blank_accdb, mdb_file_path)
        with _connect(_connection_str(mdb_file_path)) as con:
            for t in self.tic_dat_factory.all_tables:
                # renamed from `str` to avoid shadowing the builtin
                create_sql = "Create TABLE [%s] (\n" % t
                strl = ["[%s] %s" % (f, get_fld_type(t, f, "text"))
                        for f in self.tic_dat_factory.primary_key_fields.get(t, ())] + \
                       ["[%s] %s" % (f, get_fld_type(t, f, "double"))
                        for f in self.tic_dat_factory.data_fields.get(t, ())]
                if self.tic_dat_factory.primary_key_fields.get(t):
                    strl.append("PRIMARY KEY(%s)" % ",".join(
                        _brackets(self.tic_dat_factory.primary_key_fields[t])))
                create_sql += ",\n".join(strl) + "\n);"
                con.cursor().execute(create_sql).commit()

    def write_file(self, tic_dat, mdb_file_path, allow_overwrite=False):
        """
        write the ticDat data to an Access (MDB/ACCDB) database file

        :param tic_dat: the data object to write

        :param mdb_file_path: the file path of the Access database to populate

        :param allow_overwrite: boolean - are we allowed to overwrite
                                pre-existing data

        :return:

        caveats : Numbers with absolute values larger than 1e+100 will
                  be written as 1e+100 or -1e+100

        NB - thrown Exceptions of the form "Data type mismatch in criteria
        expression" generally result either from Access's inability to store
        different data types in the same field, or from a mismatch between the
        data object and the default field types ticdat uses when creating an
        Access schema. For the latter, feel free to call the write_schema
        function on the data file first with explicitly identified field types.
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(
                tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(msg))
        verify(not os.path.isdir(mdb_file_path),
               "A directory is not a valid Access file path")
        if not os.path.exists(mdb_file_path):
            self.write_schema(mdb_file_path)
        table_names = self._check_tables_fields(
            mdb_file_path, self.tic_dat_factory.all_tables)
        with _connect(_connection_str(mdb_file_path)) as con:
            for t in self.tic_dat_factory.all_tables:
                verify(
                    table_names[t] == t,
                    "Failed to find table %s in path %s" % (t, mdb_file_path))
                if not allow_overwrite:
                    with con.cursor() as cur:
                        cur.execute("Select * from %s" % t)
                        verify(
                            not any(True for _ in cur.fetchall()),
                            "allow_overwrite is False, but there are already data records in %s"
                            % t)
                if allow_overwrite:
                    con.cursor().execute("Delete from %s" % t).commit()
                _t = getattr(tic_dat, t)
                if dictish(_t):
                    primary_keys = tuple(
                        self.tic_dat_factory.primary_key_fields[t])
                    for pk_row, sql_data_row in _t.items():
                        _items = tuple(sql_data_row.items())
                        fields = _brackets(primary_keys +
                                           tuple(x[0] for x in _items))
                        data_row = ((pk_row,) if len(primary_keys) == 1 else pk_row) + \
                                   tuple(_write_data(x[1]) for x in _items)
                        assert len(data_row) == len(fields)
                        insert_sql = "INSERT INTO %s (%s) VALUES (%s)" % \
                                     (t, ",".join(fields),
                                      ",".join("?" for _ in fields))
                        con.cursor().execute(insert_sql, data_row).commit()
                else:
                    for sql_data_row in (_t if containerish(_t) else _t()):
                        insert_sql = "INSERT INTO %s (%s) VALUES (%s)" % (
                            t, ",".join(_brackets(sql_data_row.keys())),
                            ",".join(["?"] * len(sql_data_row)))
                        # bug fix: commit was missing on this branch, unlike the
                        # primary-keyed branch above
                        con.cursor().execute(
                            insert_sql,
                            tuple(map(_write_data,
                                      sql_data_row.values()))).commit()
class _PostgresFactory(freezable_factory(object, "_isFrozen")):
    """Shared base for the postgres readers/writers; wraps a ticdat factory."""
    def __init__(self, tdf):
        self.tdf = tdf
        self._isFrozen = True

    def _check_good_pgtd_compatible_table_field_names(self):
        # Postgres folds identifiers; insist on names that survive that folding.
        all_fields = lambda t: self.tdf.primary_key_fields.get(t, ()) + \
                               self.tdf.data_fields.get(t, ())
        for t in self.tdf.all_tables:
            # play nice with the table/field names or don't play at all
            verify(
                _pg_name(t) == t,
                f"Table {t} doesn't obey a postgres friendly naming convention."
                + f"It should have been named {_pg_name(t)}\n" +
                "This is a postgres specific requirement. See pgsql doc string for more info."
            )
            verify(
                len(all_fields(t)) == len(set(map(_pg_name, all_fields(t)))),
                f"Table {t} has field names that collide with each other under case/space insensitivity.\n"
                +
                "This is a postgres specific requirement. See pgsql doc string for more info."
            )
            # a little testing indicated that the problem is with reserved words
            # as fields, but not tables
            reserved_word_collision = {
                _ for _ in all_fields(t) if _.lower() in _the_reserved_words
            }
            verify(
                not reserved_word_collision,
                f"The following field names from table {t} collide with PostGres "
                + f"reserved words {reserved_word_collision}")

    def check_tables_fields(self, engine, schema, error_on_missing_table=False):
        '''
        throws a TicDatError if there there isn't a postgres schema in engine
        with the proper tables and fields.

        :param engine: has an .execute method

        :param schema: string that represents a postgres schema

        :param error_on_missing_table: boolean - should an error be thrown for
                                       missing tables? If falsey, then print a
                                       warning instead.

        :return: A list of missing tables. Will raise TicDatError if there are
                 missing tables and error_on_missing_table is truthy.
        '''
        tdf = self.tdf
        verify(
            schema in [
                row[0] for row in engine.execute(
                    "select schema_name from information_schema.schemata")
            ], f"Schema {schema} is missing from engine {engine}")
        pg_tables = [
            row[0] for row in engine.execute(
                f"select table_name from information_schema.tables where table_schema ='{schema}'"
            )
        ]
        missing_tables = []
        for table in tdf.all_tables:
            if table in pg_tables:
                pg_fields = [
                    row[0] for row in engine.execute(
                        f"""SELECT column_name FROM information_schema.columns
                            WHERE table_schema = '{schema}' AND table_name = '{table}'"""
                    )
                ]
                for field in tdf.primary_key_fields.get(table, ()) + \
                        tdf.data_fields.get(table, ()):
                    matches = [f for f in pg_fields if f == _pg_name(field)]
                    verify(
                        len(matches) == 1,
                        f"Unable to recognize {table}.{_pg_name(field)} in postgres schema {schema}"
                    )
            else:
                missing_tables.append(table)
        verify(
            not (missing_tables and error_on_missing_table),
            f"Unable to recognize tables {missing_tables} in postgres schema {schema}"
        )
        if missing_tables:
            print(
                "The following table names could not be found in the %s schema.\n%s\n"
                % (schema, "\n".join(missing_tables)))
        return missing_tables

    def _fks(self):
        # Group the factory's foreign keys by native table.
        rtn = defaultdict(set)
        for fk in self.tdf.foreign_keys:
            rtn[fk.native_table].add(fk)
        return FrozenDict({k: tuple(v) for k, v in rtn.items()})

    def _ordered_tables(self):
        # Topological order: every foreign table precedes its native tables, so
        # CREATE TABLE statements can be executed in sequence.
        rtn = []
        fks = self._fks()
        def processTable(t):
            if t not in rtn:
                for fk in fks.get(t, ()):
                    processTable(fk.foreign_table)
                rtn.append(t)
        list(map(processTable, self.tdf.all_tables))
        return tuple(rtn)

    def _get_schema_sql(self, tables, schema, forced_field_types):
        # Emit one CREATE TABLE statement per requested table, in FK-safe order.
        rtn = []
        fks = self._fks()

        def get_fld_type(t, f, default_type):
            # forced types win; the parameters table is always text; else infer
            # from the declared ticdat data type, falling back to default_type.
            if (t, f) in forced_field_types:
                return forced_field_types[t, f]
            if t == "parameters" and self.tdf.parameters:
                return "text"
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if not fld_type:
                return default_type
            if fld_type.datetime:
                return "timestamp"
            verify(
                not (fld_type.number_allowed and fld_type.strings_allowed),
                f"Select one of string or numeric for {t}.{f} if declaring type and using postgres"
            )
            if fld_type.strings_allowed:
                return 'text'
            if fld_type.number_allowed:
                if fld_type.must_be_int:
                    return 'integer'
                else:
                    return 'float'
            else:
                # bug fix: the exception was previously constructed but never
                # raised, silently returning None as the field type
                raise TicDatError(
                    f"Allow one of text or numeric for {t}.{f} if declaring type and using postgres"
                )

        def db_default(t, f):
            rtn = self.tdf.default_values[t][f]
            if forced_field_types.get((t, f)) in ("bool", "boolean"):
                return bool(rtn)
            if rtn is None:
                return "NULL"
            return rtn

        def nullable(t, f):
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if not fld_type:
                return True
            # with no infinity flag, numeric nulls stand in for +/-inf
            if fld_type.number_allowed and self.tdf.infinity_io_flag is None:
                return True
            return fld_type.nullable

        def default_sql_str(t, f):
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if fld_type and fld_type.datetime:
                return ""
            return f" DEFAULT {db_default(t, f)}"

        for t in [_ for _ in self._ordered_tables() if _ in tables]:
            # renamed from `str` to avoid shadowing the builtin
            create_sql = f"CREATE TABLE {schema}.{t} (\n"
            strl = [f"{_pg_name(f)} " + get_fld_type(t, f, 'text')
                    for f in self.tdf.primary_key_fields.get(t, ())] + \
                   [f"{_pg_name(f)} " + get_fld_type(t, f, 'float') +
                    (f"{' NOT NULL' if not nullable(t, f) else ''}") +
                    default_sql_str(t, f)
                    for f in self.tdf.data_fields.get(t, ())]
            if self.tdf.primary_key_fields.get(t):
                strl.append(
                    f"PRIMARY KEY ({','.join(map(_pg_name, self.tdf.primary_key_fields[t]))})"
                )
            for fk in fks.get(t, ()):
                nativefields, foreignfields = zip(
                    *(fk.nativetoforeignmapping().items()))
                strl.append(
                    f"FOREIGN KEY ({','.join(map(_pg_name, nativefields))}) REFERENCES "
                    +
                    f"{schema}.{fk.foreign_table} ({','.join(map(_pg_name, foreignfields))})"
                )
            create_sql += ",\n".join(strl) + "\n);"
            rtn.append(create_sql)
        return tuple(rtn)

    def write_schema(self, engine, schema, forced_field_types=None,
                     include_ancillary_info=True):
        """
        :param engine: typically a sqlalchemy database engine with drivertype
                       postgres (really just needs an .execute)

        :param schema: a string naming the postgres schema to populate (will
                       create if needed)

        :param forced_field_types: A dictionary mapping (table, field) to a
                                   field type. Absent forcing, types are
                                   inferred from tic_dat_factory.data_types if
                                   possible, and set via the assumption that PK
                                   fields are text and data fields are floats
                                   if not.

        :param include_ancillary_info: boolean. If False, no primary key or
                                       foreign key info will be written

        :return:
        """
        self._check_good_pgtd_compatible_table_field_names()
        forced_field_types = forced_field_types or {}
        all_fields = lambda t: self.tdf.primary_key_fields.get(t, ()) + \
                               self.tdf.data_fields.get(t, ())
        good_forced_field_type_entry = lambda k, v: isinstance(k, tuple) and len(k) == 2 \
            and k[1] in all_fields(k[0]) and v in \
            ["text", "integer", "float", "bool", "boolean", "timestamp"]
        verify(
            dictish(forced_field_types) and all(
                good_forced_field_type_entry(k, v)
                for k, v in forced_field_types.items()),
            "bad forced_field_types argument")
        if not include_ancillary_info:
            # collapse the schema to data-fields-only and force former PKs to text
            from ticdat import TicDatFactory
            tdf = TicDatFactory(**{
                t: [[], pks + dfs]
                for t, (pks, dfs) in self.tdf.schema().items()
            })
            for t, dts in self.tdf.data_types.items():
                for f, dt in dts.items():
                    tdf.set_data_type(t, f, *dt)
            forced_field_types_ = {
                (t, f): "text"
                for t, (pks, dfs) in self.tdf.schema().items() for f in pks
                if f not in tdf.data_types.get(t, {})
            }
            forced_field_types_.update(forced_field_types)
            return PostgresTicFactory(tdf).write_schema(
                engine, schema, forced_field_types_)
        verify(not getattr(self.tdf, "generic_tables", None),
               "TicDat for postgres does not yet support generic tables")
        if schema not in [
                row[0] for row in engine.execute(
                    "select schema_name from information_schema.schemata")
        ]:
            engine.execute(sa.schema.CreateSchema(schema))
        # renamed loop variable from `str` to avoid shadowing the builtin
        for create_sql in self._get_schema_sql(self.tdf.all_tables, schema,
                                               forced_field_types):
            engine.execute(create_sql)

    def _handle_prexisting_rows(self, engine, schema, pre_existing_rows):
        # Apply per-table 'delete'/'append' policy before writing new rows.
        verify(isinstance(pre_existing_rows, dict),
               "pre_existing_rows needs to dict")
        verify(
            set(pre_existing_rows).issubset(self.tdf.all_tables),
            "bad pre_existing_rows keys")
        verify(
            set(pre_existing_rows.values()).issubset({'delete', 'append'}),
            "bad pre_existing_rows values")
        pre_existing_rows = dict({t: "delete" for t in self.tdf.all_tables},
                                 **pre_existing_rows)
        # need to iterate from leaves (children) upwards to avoid breaking
        # foreign keys with delete
        for t in reversed(self._ordered_tables()):
            if pre_existing_rows[t] == "delete":
                try:
                    engine.execute(
                        f"truncate table {schema}.{t}"
                    )  # postgres truncate will fail on FKs re:less
                except Exception as e:
                    assert "foreign key" in str(
                        e), "truncate should only fail due to foreign key issues"
                    engine.execute(f"DELETE FROM {schema}.{t}")
class JsonPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json data with PanDat objects.
    Don't create this object explicitly. A JsonPanFactory will
    automatically be associated with the json attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will
        automatically be associated with the json attribute of the
        parent PanDatFactory.

        :param pan_dat_factory:

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # inspect.getargspec was deprecated and removed in Python 3.11; inspect.signature
        # also sees keyword-only parameters, which newer pandas uses for to_json arguments.
        to_json_args = set(inspect.signature(pd.DataFrame.to_json).parameters)
        assert "orient" in to_json_args
        # older pandas lacks the index argument to to_json entirely
        self._modern_pandas = "index" in to_json_args
        self._isFrozen = True
    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON string or file

        :param path_or_buf: a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 +- "inf", "-inf" strings will be converted to +-float("inf")
        """
        if os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "path_or_buf to json.load as a dict")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        # tbl_names maps each schema table name to the (possibly case-spaced) json key
        rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs)
               for t, f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" %
               [(t, f) for t, f in missing_fields])
        # undo the +-infinity stringification performed by write_file; replace matches
        # whole cell values (regex=False), so "inf" doesn't clobber "-inf"
        for v in rtn.values():
            v.replace("inf", float("inf"), inplace=True)
            v.replace("-inf", -float("inf"), inplace=True)
        rtn = self.pan_dat_factory.PanDat(**rtn)
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
    def _get_table_names(self, loaded_dict):
        # Match each schema table to exactly one key of loaded_dict, with
        # case-space insensitivity. Raises on missing or ambiguous matches.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [c for c in loaded_dict if c.lower().replace(" ", "_") == table.lower()]
            verify(len(rtn[table]) >= 1, "Unable to recognize table %s" % table)
            verify(len(rtn[table]) <= 1, "Multiple dictionary key choices found for table %s" % table)
            rtn[table] = rtn[table][0]
        return rtn
    def write_file(self, pan_dat, json_file_path, case_space_table_names=False, orient='split',
                   index=False, indent=None, sort_keys=False, **kwargs):
        """
        write the PanDat data to a json file (or json string)

        :param pan_dat: the PanDat object to write

        :param json_file_path: the json file into which the data is to be written.
                               If falsey, will return a JSON string

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param orient: Indication of expected JSON string format. See pandas.to_json for more details.

        :param index: boolean - whether or not to write the index.

        :param indent: None. See json.dumps

        :param sort_keys: See json.dumps

        :param kwargs: additional named arguments to pass to pandas.to_json

        :return:

        caveats: +-float("inf") will be converted to "inf", "-inf"
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s"%"\n".join(msg))
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        if self._modern_pandas:
            # FYI - pandas Exception: ValueError: 'index=False' is only valid when 'orient' is 'split' or 'table'
            kwargs["index"] = index if orient in ("split", "table") else True
        # only pretty-print table names if the prettified names remain collision-free
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        rtn = {}
        for t in self.pan_dat_factory.all_tables:
            # stringify +-infinity so the payload is strictly-valid JSON
            df = getattr(pan_dat, t).replace(float("inf"), "inf").replace(-float("inf"), "-inf")
            k = case_space_to_pretty(t) if case_space_table_names else t
            rtn[k] = json.loads(df.to_json(path_or_buf=None, orient=orient, **kwargs))
            if orient == 'split' and not index:
                rtn[k].pop("index", None)
        if json_file_path:
            with open(json_file_path, "w") as f:
                json.dump(rtn, f, indent=indent, sort_keys=sort_keys)
        else:
            return json.dumps(rtn, indent=indent, sort_keys=sort_keys)