Ejemplo n.º 1
0
    def apply_checks(
            self, tables, path=None, script_name=None,
            object_name="dict_checks", dictionary=None, **kwargs):
        """
        Apply every check of a checks dictionary to the supplied table(s).

        The checks dictionary is obtained from
        `import_attr(path, script_name, object_name)` when both `script_name`
        and `object_name` are not None, otherwise the `dictionary` argument
        is used and it has to be a `dict`.

        `tables` may be a dictionary of tables, in which case every check is
        applied to every table and the table key is handed on, or a single
        DataFrame, in which case `np.nan` is handed on in place of a key. Any
        other type of `tables` results in no checks being applied.

        All `kwargs` are forwarded to each individual check application.

        Raises a ValueError when no checks dictionary can be determined or
        when `dictionary` is not a `dict`.
        """
        module_logger.info("Starting `apply_checks`")

        # Work out where the checks are coming from
        if script_name is not None and object_name is not None:
            checks = import_attr(path, script_name, object_name)
        elif dictionary is None:
            var_msg = ("Either `dictionary` or both of `script_name` and "
                       "`path` need to be none null")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        elif type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        else:
            checks = dictionary

        tables_kind = type(tables).__name__
        if tables_kind == "dict":
            for table_name in tables.keys():
                for check_name in checks.keys():
                    # Values are looked up at the point of use, on purpose
                    self.__apply_the_check(
                        tables[table_name], checks[check_name], check_name,
                        table_name, **kwargs)
        elif tables_kind == "DataFrame":
            for check_name in checks.keys():
                self.__apply_the_check(
                    tables, checks[check_name], check_name, np.nan, **kwargs)

        module_logger.info("Completed `apply_checks`")
Ejemplo n.º 2
0
    def alter_tables(self,
                     path=None,
                     script_name=None,
                     object_name="dict_alter",
                     dictionary=None,
                     **kwargs):
        """
        Alter the stored table(s) using an alterations dictionary.

        The alterations dictionary comes from
        `import_attr(path, script_name, object_name)` when both `script_name`
        and `object_name` are not None, otherwise from the `dictionary`
        argument, which has to be a `dict`.

        A copy of each table is altered and the result is stored again with
        `self.set_table`. For a dictionary of tables this happens one table
        at a time, passing the table key along; for a single DataFrame
        `np.nan` is passed in place of a key. `kwargs` are forwarded to the
        column alteration step.

        Raises a ValueError when no alterations dictionary can be determined,
        when `dictionary` is not a `dict`, or when `self.tables` is neither a
        DataFrame nor a dictionary.
        """
        module_logger.info("Starting `alter_tables`")
        # TODO move this check to own function (applies to convert_columns too)
        if script_name is not None and object_name is not None:
            alterations = import_attr(path, script_name, object_name)
        elif dictionary is None:
            var_msg = ("Either `dictionary` or both of `script_name` and "
                       "`path` need to be none null")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        elif type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        else:
            alterations = dictionary

        tables_kind = type(self.tables).__name__
        if tables_kind == "DataFrame":
            table_copy = self.tables.copy()
            altered = self.__alter_cols(
                table_copy, alterations,
                [self.__key_1, self.__key_2, self.__key_3], np.nan, **kwargs)
            self.set_table(altered)
        elif tables_kind == "dict":
            # Hold on to the dictionary as it is at the start of the loop
            current_tables = self.tables
            for table_key in current_tables.keys():
                table_copy = current_tables[table_key].copy()
                # A fresh list of keys is built for every call
                altered = self.__alter_cols(
                    table_copy, alterations,
                    [self.__key_1, self.__key_2, self.__key_3], table_key,
                    **kwargs)
                self.set_table(altered, table_key)
        else:
            var_msg = ("The tables are in neither a DataFrame or dictionary "
                       "format, which means something is seriously wrong...")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        module_logger.info("Completed `alter_tables`")
Ejemplo n.º 3
0
    def link_headers(self,
                     path=None,
                     script_name=None,
                     func_name="link_headers",
                     function=None,
                     **kwargs):
        """
        Build the link between each table and its set of headers.

        The linking function is, in order of preference: the `function`
        argument, the attribute `func_name` obtained through
        `import_attr(path, script_name, func_name)`, or the fallback
        `self._link_headers`. It is called with `self.tables`, `self.headers`
        and any `kwargs`, and its output is stored as a plain dictionary.

        Raises a ValueError when `function` is not a function or when one or
        more table keys are missing from the output of the linking function,
        and an AttributeError when the linking function itself raises one.
        """
        # TODO Need to see if we can isolate just a set of new tables? Maybe
        #  have a list of dictionary keys that have had their headers
        #  done already?
        module_logger.info("Starting `link_headers`")

        if function is None:
            if script_name is not None:
                function = import_attr(path, script_name, func_name)
            else:
                function = self._link_headers
        elif type(function).__name__ != "function":
            var_msg = ("The function passed to `self.link_headers` is "
                       "not a function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        try:
            dict_link = function(self.tables, self.headers, **kwargs)
        except AttributeError:
            # The wording depends on whether any kwargs were handed over
            if kwargs:
                var_msg = (
                    f"Function link_headers, kwargs may have been passed when "
                    f"the function {func_name} in the script {script_name} does"
                    f" not take kwargs")
            else:
                var_msg = (f"Function link_headers: The {func_name} function "
                           f"does not exist in the {script_name} script.")
            module_logger.error(var_msg)
            raise AttributeError(var_msg)

        # Every table key has to be present in the output of the function
        unlinked_keys = set(self.tables.keys()).difference(dict_link.keys())
        if unlinked_keys:
            var_msg = (f"Not all the headers are linked, the unlinked tables "
                       f"are: {unlinked_keys}")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        self.__link_headers = dict(dict_link)

        module_logger.info("Completed `link_headers`")
Ejemplo n.º 4
0
    def form_summary_tables(self,
                            path=None,
                            script_name=None,
                            func_name="form_tables",
                            function=None,
                            **kwargs):
        """
        Create summary tables from the main table set with a function.

        The function is either the `function` argument or, when that is None,
        the attribute `func_name` obtained through
        `import_attr(path, script_name, func_name)`.

        The function is passed the arguments:
            self.tables, self.formed_tables, self.__grouping, self.__key_1,
            self.__key_2, self.__key_3, self.__key_separator, **kwargs

        Its output has to be a dictionary, which then replaces
        `self.formed_tables`.

        Raises a ValueError when `function` is not a function, when neither
        `function` nor `script_name` is given, or when the output of the
        function is not a dictionary.
        """
        module_logger.info("Starting `form_summary_tables`")

        if function is None:
            if script_name is None:
                var_msg = (
                    "One of the `function` or `script_name` arguments needs "
                    "to be completed. And if `script name is then `path` "
                    "needs to be too.")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
            function = import_attr(path, script_name, func_name)
        elif type(function).__name__ != "function":
            var_msg = ("The function passed to `self.form_summary_tables` "
                       "is not a function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        positional_args = (
            self.tables, self.formed_tables, self.__grouping, self.__key_1,
            self.__key_2, self.__key_3, self.__key_separator)
        new_formed_tables = function(*positional_args, **kwargs)

        if type(new_formed_tables).__name__ != 'dict':
            var_msg = ('The output of the function for `form_summary_table` '
                       'is not a dictionary and it needs to be')
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        self.formed_tables = new_formed_tables

        module_logger.info("Completed `form_summary_tables`")
Ejemplo n.º 5
0
    def convert_columns(self,
                        path=None,
                        script_name=None,
                        object_name="dict_convert",
                        dictionary=None,
                        **kwargs):
        """
        Convert the columns of the stored table(s) using a conversion
        dictionary.

        The conversion dictionary comes from
        `import_attr(path, script_name, object_name)` when both `script_name`
        and `object_name` are not None, otherwise from the `dictionary`
        argument, which has to be a `dict`.

        A copy of each table is converted. A single DataFrame is converted
        with an empty string in place of a table key; for a dictionary of
        tables each entry is replaced by its converted version. The outcome is
        stored with `self.set_table(..., overwrite=True)`. `kwargs` are
        forwarded to the column conversion step.

        Raises a ValueError when no conversion dictionary can be determined,
        when `dictionary` is not a `dict`, or when `self.tables` is neither a
        DataFrame nor a dictionary.
        """
        module_logger.info("Starting `convert_columns`")
        if script_name is not None and object_name is not None:
            conversions = import_attr(path, script_name, object_name)
        elif dictionary is None:
            var_msg = ("Either `dictionary` or both of `script_name` and "
                       "`path` need to be none null")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        elif type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        else:
            conversions = dictionary

        tables_kind = type(self.tables).__name__
        if tables_kind == "DataFrame":
            table_copy = self.tables.copy()
            converted = self.__convert_col(
                table_copy, conversions, "", **kwargs)
            self.set_table(converted, overwrite=True)
        elif tables_kind == "dict":
            all_tables = self.tables
            for table_key in all_tables.keys():
                table_copy = all_tables[table_key].copy()
                converted = self.__convert_col(
                    table_copy, conversions, table_key, **kwargs)
                # The entry of the stored dictionary itself is replaced
                all_tables[table_key] = converted.copy()
            self.set_table(all_tables, overwrite=True)
        else:
            var_msg = ("The tables are in neither a DataFrame or dictionary "
                       "format, which means something is seriously wrong...")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        module_logger.info("Completed `convert_columns`")
Ejemplo n.º 6
0
    def find_files(self,
                   path=None,
                   script_name=None,
                   func_name="list_the_files",
                   function=None,
                   files_path='.',
                   append=False,
                   **kwargs):
        """
        Acquire the list of files to be read in by calling a file listing
        function with `files_path` and any `kwargs`.

        When `script_name` is given the function is the attribute `func_name`
        obtained through `import_attr(path, script_name, func_name)`, and this
        takes precedence over the `function` argument. Otherwise the
        `function` argument is used.

        With `append` set the new files are added onto the existing
        `self.list_files`, which allows a list of files to be accumulated from
        different main paths. Without it `self.list_files` is replaced.

        Raises a ValueError when neither `script_name` nor `function` is
        given, or when `function` is not a function.
        """
        module_logger.info("Starting `find_files`")
        # TODO move this to an internal function as it's used so often!
        if script_name is not None:
            function = import_attr(path, script_name, func_name)
        elif function is None:
            var_msg = ("One of `script_name` or `function` needs to be not "
                       "None in the function `find_files`")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        elif type(function).__name__ != "function":
            var_msg = "The `function` argument needs to be a function"
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        found_files = function(files_path, **kwargs)
        # TODO move these to be calls on the self.set_file_list function instead
        #  of setting the value here
        if not append:
            self.list_files = found_files
        else:
            self.list_files += found_files
        module_logger.info(
            f"Completed `find_files`, the list of files is: {self.list_files}")
Ejemplo n.º 7
0
    def summary(self, path=None, script_name=None,
                object_name="dict_checks", dictionary=None):
        """
        Summarise a checks dictionary as a table.

        The checks dictionary comes from
        `import_attr(path, script_name, object_name)` when both `script_name`
        and `object_name` are not None, otherwise from the `dictionary`
        argument, which has to be a `dict`.

        On a deep copy of the checks, every check first has its missing
        standard keys filled from the stored check defaults, after which each
        value is passed through the internal summary function.

        Returns a dictionary with the key 'df' holding a DataFrame with one
        row per check (the check name in the column 'check') and the key
        'dict' holding the checks dictionary as it was obtained.

        Raises a ValueError when no checks dictionary can be determined or
        when `dictionary` is not a `dict`.
        """
        if script_name is not None and object_name is not None:
            dict_checks = import_attr(path, script_name, object_name)
        elif dictionary is None:
            var_msg = ("Either `dictionary` or both of `script_name` and "
                       "`path` need to be none null")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        elif type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        else:
            dict_checks = dictionary

        standard_keys = [
            'calc_condition', 'long_description', 'check_condition', 'columns',
            'count_condition', 'index_position', 'relevant_columns', 'idx_flag',
            'category'
        ]

        # Work on a deep copy so the checks handed back stay as they were
        summarised = deepcopy(dict_checks)

        # First pass, complete every check with the defaults
        for check_name in list(summarised.keys()):
            absent_keys = [
                name for name in standard_keys
                if name not in summarised[check_name].keys()
            ]
            for name in absent_keys:
                summarised[check_name][name] = self.__checks_defaults[name]

        # Second pass, summarise every value of every check
        for check_name in list(summarised.keys()):
            for name in list(summarised[check_name].keys()):
                summarised[check_name][name] = self.__func_summary_(
                    summarised[check_name][name])

        df_summary = pd.DataFrame(summarised).T
        df_summary = df_summary.reset_index()
        df_summary = df_summary.rename(columns={'index': 'check'})

        return {'df': df_summary, 'dict': dict_checks}
Ejemplo n.º 8
0
    def set_headers(self,
                    path=None,
                    script_name=None,
                    func_name=None,
                    list_cols=None,
                    function=None,
                    ideal_headers=None,
                    required_headers=None):
        """
        Set the column headers of the stored table(s).

        One option is applied, being the first of the following that is
        supplied:

        1. `list_cols`, a list which replaces the column names, its length
           has to match the number of columns
        2. `function`, or when that is None the attribute `func_name`
           obtained through `import_attr(path, script_name, func_name)`,
           which is called on every existing column name to produce the new
           name. For a dictionary of tables the new names are worked out from
           the first table and then applied to every table
        3. `ideal_headers`, a list of columns, the ones that are missing are
           added filled with `np.nan` and the table is then reduced to exactly
           these columns in this order
        4. `required_headers`, a list of columns, the ones that are missing
           are added filled with `np.nan` and all existing columns are kept

        The outcome is stored with `self.set_table(..., overwrite=True)`.

        Raises a ValueError when the supplied option is of the wrong type,
        when the tables in a dictionary do not all have the same number of
        columns, when the length of `list_cols` does not match the number of
        columns, or when `self.tables` is neither a dictionary nor a
        DataFrame.
        """
        module_logger.info("Starting `set_headers`")
        # Only the first supplied option is validated, which is also the only
        # option that ends up being applied further down
        if list_cols is not None:
            if type(list_cols).__name__ != "list":
                var_msg = (
                    "The argument `list_cols` of function `set_headers` "
                    "needs to be a list")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif function is not None:
            if type(function).__name__ != "function":
                var_msg = ("The argument `function` of function `set_headers` "
                           "needs to be a function")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif script_name is not None:
            function = import_attr(path, script_name, func_name)
        elif ideal_headers is not None:
            if type(ideal_headers).__name__ != 'list':
                var_msg = ("The argument `ideal_headers` of function "
                           "`set_headers` needs to be a list")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif required_headers is not None:
            if type(required_headers).__name__ != 'list':
                var_msg = ("The argument `required_headers` of function "
                           "`set_headers` needs to be a list")
                module_logger.error(var_msg)
                raise ValueError(var_msg)

        def _add_missing_columns(df, headers):
            # Add, in place, every header not yet in the table as a column of
            # nulls; what is missing is settled before any column is added
            present = df.columns.tolist()
            for col in [col for col in headers if col not in present]:
                df[col] = np.nan

        var_type = type(self.tables).__name__
        if var_type == "dict":
            # The copy of the dictionary is shallow, the tables themselves
            # are the stored ones and are modified in place
            dict_dfs = self.tables.copy()
            var_cond = len({df.shape[1] for df in dict_dfs.values()}) != 1
            if var_cond:
                var_msg = ("There are an inconsistent number of columns "
                           "present in the dictionary of tables")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
            # There is at least one table at this point, otherwise the
            # check on the number of columns would have raised
            df_first = next(iter(dict_dfs.values()))
            if list_cols is not None:
                if len(list_cols) != df_first.shape[1]:
                    var_msg = ("The length of `list_cols` is different to the "
                               "number of columns present in the table")
                    module_logger.error(var_msg)
                    raise ValueError(var_msg)
            elif function is not None:
                list_cols = [function(x) for x in df_first.columns.tolist()]
            for key in dict_dfs.keys():
                # `list_cols` also covers the `function` option by now
                if list_cols is not None:
                    dict_dfs[key].columns = list_cols
                elif ideal_headers is not None:
                    _add_missing_columns(dict_dfs[key], ideal_headers)
                    dict_dfs[key] = dict_dfs[key][ideal_headers].copy()
                elif required_headers is not None:
                    _add_missing_columns(dict_dfs[key], required_headers)
            self.set_table(dict_dfs, overwrite=True)
        elif var_type == "DataFrame":
            # The length can only be compared when `list_cols` is the option
            # in use, for the other options it is None
            if ((list_cols is not None)
                    and (len(list_cols) != self.tables.shape[1])):
                var_msg = ("The length of `list_cols` is different to the "
                           "number of columns present in the table")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
            df = self.tables.copy()
            if list_cols is not None:
                df.columns = list_cols
            elif function is not None:
                df.columns = [function(x) for x in df.columns.tolist()]
            elif ideal_headers is not None:
                _add_missing_columns(df, ideal_headers)
                df = df[ideal_headers].copy()
            elif required_headers is not None:
                _add_missing_columns(df, required_headers)
            self.set_table(df, overwrite=True)
        else:
            var_msg = (
                "Somehow the tables are not a dictionary or a DataFrame "
                "for function `set_headers`")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        module_logger.info("Completed `set_headers`")
Ejemplo n.º 9
0
    def set_comparison_headers(self,
                               path=None,
                               script_name=None,
                               func_name="read_headers",
                               function=None,
                               dictionary=None,
                               **kwargs):
        """
        Set the dictionary of headers that the tables are compared against.

        The headers come from, in order of preference: calling the `function`
        argument, calling the attribute `func_name` obtained through
        `import_attr(path, script_name, func_name)`, or the `dictionary`
        argument as it is. Functions are called with the `kwargs` only.

        The headers have to be a dictionary containing the key
        'ideal_headers' with a list as its value. Every other key has to map
        to something whose 'expected_headers', 'new_headers' and 'remove'
        entries are all not None. A plain dictionary copy of the headers is
        stored in `self.headers`.

        Raises a ValueError when `function` is not a function or when none of
        `function`, `script_name` and `dictionary` is given, an AttributeError
        when the headers function raises one, and an Exception when the
        headers fail any of the requirements above.
        """
        # TODO Need to see if we can isolate just a set of new tables? Maybe
        #  have a list of dictionary keys that have had their headers done
        #  already?
        module_logger.info("Starting `set_comparison_headers`")

        if function is not None:
            if type(function).__name__ != "function":
                var_msg = ("The function passed to "
                           "`self.set_comparison_headers` is not a function.")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif script_name is not None:
            function = import_attr(path, script_name, func_name)
        elif dictionary is None:
            var_msg = (
                "One of the `function` or `script_name` arguments needs "
                "to be completed. And if `script name is then `path` "
                "needs to be too.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        else:

            # Wrap the dictionary so it is obtained the same way as the rest
            def _return_dictionary(**_ignored):
                return dictionary

            function = _return_dictionary

        try:
            dict_headers = function(**kwargs)
        except AttributeError:
            if kwargs:
                var_msg = (
                    f"Function set_comparison_headers, kwargs may have been "
                    f"passed when the function {func_name} in the script "
                    f"{script_name} does not take kwargs")
            else:
                var_msg = (
                    f"Function set_comparison_headers: The {func_name} function"
                    f" does not exist in the {script_name} script.")
            module_logger.error(var_msg)
            raise AttributeError(var_msg)

        if type(dict_headers).__name__ != 'dict':
            var_msg = 'The headers output should be a dictionary'
            module_logger.error(var_msg)
            raise Exception(var_msg)

        # Every key other than the ideal headers needs all of these set
        needed_entries = ('expected_headers', 'new_headers', 'remove')
        incomplete_keys = [
            key for key in dict_headers.keys()
            if key != 'ideal_headers' and any(
                dict_headers[key].get(entry) is None
                for entry in needed_entries)
        ]
        if incomplete_keys:
            joined_keys = ", ".join([str(key) for key in incomplete_keys])
            var_msg = (
                f'There are dictionary keys that do not have all the required '
                f'values: {joined_keys}')
            module_logger.error(var_msg)
            raise Exception(var_msg)

        ideal = dict_headers.get('ideal_headers')
        if ideal is None:
            var_msg = ('There needs to be a key to the headers dictionary that'
                       ' is "ideal_headers"')
            module_logger.error(var_msg)
            raise Exception(var_msg)
        if type(ideal).__name__ != 'list':
            var_msg = 'The value of key "ideal_headers" needs to be a list'
            module_logger.error(var_msg)
            raise Exception(var_msg)

        self.headers = dict(dict_headers)

        joined_header_keys = ', '.join([key for key in dict_headers.keys()])
        module_logger.info(
            f"There are {len(dict_headers)} header keys and they are: "
            f"{joined_header_keys}")

        module_logger.info("Completed `set_comparison_headers`")
Ejemplo n.º 10
0
    def reading_in(self,
                   path=None,
                   script_name=None,
                   func_name="read_files",
                   function=None,
                   overwrite=True,
                   **kwargs):
        """
        Using an externally defined reading in function, and the internally
        defined list of files, read in each of the tables required.

        The reading in function is either the `function` argument or, when
        that is None, the attribute `func_name` obtained through
        `import_attr(path, script_name, func_name)`, with `path` being the
        relative script file path. It is called with `self.list_files` and
        any `kwargs`.

        What has been read in is handed to `self.set_table` together with
        `overwrite`, which has to be exactly `True` or `False`.

        Raises a ValueError when `self.tables` is not a dictionary, when
        `function` is not a function, when neither `function` nor
        `script_name` is given, or when `overwrite` is not a boolean, and an
        AttributeError when the reading in function raises one.
        """
        module_logger.info("Starting `reading_in`")
        if type(self.tables).__name__ != "dict":
            var_msg = ("The tables need to be in dictionary format for this "
                       "`self.reading_in` step")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        if function is not None:
            if type(function).__name__ != "function":
                var_msg = ("The function passed to `self.reading_in` is not a "
                           "function.")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif script_name is not None:
            function = import_attr(path, script_name, func_name)
        else:
            var_msg = (
                "One of the `function` or `script_name` arguments needs "
                "to be completed. And if `script name is then `path` "
                "needs to be too.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        # Checked ahead of the read so that an invalid value is reported
        # without first having to go through all of the files
        if not isinstance(overwrite, bool):
            var_msg = ("The attribute `overwrite` in the function "
                       "`reading_in` needs to be `True` or `False`")
            module_logger.error(var_msg)
            raise ValueError(var_msg)

        try:
            dfs = function(self.list_files, **kwargs)
        except AttributeError as err:
            if kwargs:
                var_msg = (f"Function reading_in, kwargs may have been passed "
                           f"when the function {func_name} in the script "
                           f"{script_name} does not take kwargs")
            else:
                var_msg = (f"Function reading in: The {func_name} function "
                           f"does not exist in the {script_name} script.")
            module_logger.error(var_msg)
            # Keep the original error attached as the explicit cause
            raise AttributeError(var_msg) from err

        self.set_table(dfs, overwrite=overwrite)
        if type(dfs).__name__ == "DataFrame":
            module_logger.info(f"The table has shape '{dfs.shape}'")
        else:
            for key in dfs:
                module_logger.info(
                    f"The table with key '{key}' has shape '{dfs[key].shape}'")

        module_logger.info("Completed `reading_in`")