コード例 #1
0
    def run_component_pipeline(self, canonical: pd.DataFrame=None, feature_names: [str, list]=None,
                               auto_connectors: bool=None, save: bool=None, reset_changed: bool=None,
                               has_changed: bool=None):
        """ Runs the intent pipeline for every feature in the catalog, or for an optional subset.

        :param canonical: (optional) a canonical to use in place of the source canonical
        :param feature_names: (optional) a single feature name or list of feature names to run
        :param auto_connectors: (optional) auto-add a versioned feature connector if missing. Default is True
        :param save: (optional) if True, persist changes to the property manager. Default is True
        :param reset_changed: (optional) resets the has_changed boolean to True
        :param has_changed: (optional) tests if the underlying canonical has changed since last load else error returned
        """
        if not isinstance(auto_connectors, bool):
            auto_connectors = True
        if isinstance(feature_names, (str, list)):
            run_features = Commons.list_formatter(feature_names)
        else:
            # no explicit selection: run every feature registered in the intent
            run_features = Commons.list_formatter(self.pm.get_intent())
        if not isinstance(canonical, (pd.DataFrame, str)):
            canonical = self.load_source_canonical(reset_changed=reset_changed, has_changed=has_changed)
        for feature in run_features:
            if not self.pm.has_connector(feature):
                if not auto_connectors:
                    # no connector and not allowed to create one: skip this feature
                    continue
                self.set_feature_bootstrap(feature_name=feature, versioned=True, save=save)
            result = self.intent_model.run_intent_pipeline(canonical, feature)
            self.save_catalog_feature(feature_name=feature, canonical=result)
        return
 def test_get_cols(self):
     """filter_headers selects columns by dtype family and by explicit dtype."""
     tools = self.tools
     cleaner = self.clean
     df = pd.DataFrame()
     df['int'] = tools.get_number(100, size=10)
     df['float'] = tools.get_number(0, 1.0, size=10)
     df['object'] = tools.get_category(list('abcdef'), size=10)
     df['date'] = tools.get_datetime('01/01/2010', '01/01/2018', size=10)
     df['category'] = tools.get_category(list('vwxyz'), size=10)
     df = cleaner.to_category_type(df, headers='category')
     df = cleaner.to_date_type(df, headers='date')
     expected = ['float', 'object', 'date', 'category', 'int']
     actual = Commons.filter_headers(df)
     self.assertTrue(set(actual).intersection(expected))
     expected = ['object']
     actual = Commons.filter_headers(df, dtype=[object])
     self.assertEqual(expected, actual)
     actual = Commons.filter_headers(df, dtype=['object'])
     self.assertEqual(expected, actual)
     expected = ['float', 'int']
     actual = Commons.filter_headers(df, dtype=['number'])
     self.assertTrue(set(actual).intersection(expected))
     expected = ['date']
     actual = Commons.filter_headers(df, dtype=['datetime'])
     self.assertEqual(expected, actual)
コード例 #3
0
    def _report(self, canonical: pd.DataFrame, index_header: str, bold: [str, list]=None, large_font: [str, list]=None):
        """ generates a stylised report from the given canonical

        :param canonical: the DataFrame to stylise
        :param index_header: the header acting as the index; repeated values are blanked for readability
        :param bold: (optional) a header or list of headers to render in bold
        :param large_font: (optional) a header or list of headers to render with a larger font
        :return: stylised report DataFrame (pandas Styler)
        """
        pd.set_option('max_colwidth', 200)
        pd.set_option('expand_frame_repr', True)
        # work on a copy so blanking duplicate values below does not mutate the caller's DataFrame
        canonical = canonical.copy()
        bold = Commons.list_formatter(bold)
        bold.append(index_header)
        large_font = Commons.list_formatter(large_font)
        large_font.append(index_header)
        style = [{'selector': 'th', 'props': [('font-size', "120%"), ("text-align", "center")]},
                 {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]
        # blank out repeated index_header values so consecutive rows read as grouped sections
        index = canonical[canonical[index_header].duplicated()].index.to_list()
        canonical.loc[index, index_header] = ''
        canonical = canonical.reset_index(drop=True)
        df_style = canonical.style.set_table_styles(style)
        _ = df_style.set_properties(**{'text-align': 'left'})
        if len(bold) > 0:
            _ = df_style.set_properties(subset=bold, **{'font-weight': 'bold'})
        if len(large_font) > 0:
            _ = df_style.set_properties(subset=large_font, **{'font-size': "120%"})
        return df_style
コード例 #4
0
 def test_associate_analysis_complex(self):
     """model_analysis should reproduce the association analysis of the clinical data."""
     builder = SyntheticBuilder.from_memory()
     uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
     builder.add_connector_uri('clinical_health', uri=uri)
     discover: DataDiscovery = Transition.from_memory().discover
     age_analysis = discover.analysis2dict(header='age', dtype='int', granularity=10.0, lower=21, upper=90)
     preg_analysis = discover.analysis2dict(header='pregnancies')
     df_clinical = builder.load_canonical('clinical_health')
     analysis_blob = discover.analyse_association(df_clinical, columns_list=[age_analysis, preg_analysis])
     canonical = pd.DataFrame(index=range(1973))
     df = builder.tools.model_analysis(canonical, analysis_blob=analysis_blob, column_name='clinical')
     self.assertEqual((1973, 2), df.shape)
     source_preg = Commons.list_standardize(Commons.list_formatter(df_clinical.pregnancies))
     low, high = discover.bootstrap_confidence_interval(pd.Series(source_preg), func=np.mean)
     synth_preg = Commons.list_standardize(Commons.list_formatter(df.pregnancies))
     # the synthetic mean should fall inside the bootstrap CI of the source data
     self.assertTrue(low <= np.mean(synth_preg) <= high)
コード例 #5
0
    def register_estimator(self,
                           canonical: pd.DataFrame,
                           target: str,
                           headers: list,
                           class_name: str,
                           module_name: str,
                           hyper_param: dict = None,
                           test_size: float = None,
                           random_state: int = None,
                           save_intent: bool = None,
                           model_name: str = None,
                           intent_order: int = None,
                           replace_intent: bool = None,
                           remove_duplicates: bool = None):
        """ registers and fits an estimator model returning the model fit

        :param canonical: the model canonical
        :param class_name: the name of the model class
        :param target: the model target
        :param headers: the model features header names
        :param hyper_param: (optional) hyper parameters for the model instance
        :param test_size:  (optional) the size of the test sample (default tp 0.33)
        :param random_state:  (optional) a random state value for the test sample
        :param module_name: (optional) the name of the module
        :param save_intent: (optional) if the intent contract should be saved to the property manager
        :param model_name: (optional) the name of the model
        :param intent_order: (optional) the order in which each intent should run.
                        If None: default's to -1
                        if -1: added to a level above any current instance of the intent section, level 0 if not found
                        if int: added to the level specified, overwriting any that already exist
        :param replace_intent: (optional) if the intent method exists at the level, or default level
                        True - replaces the current intent method with the new
                        False - leaves it untouched, disregarding the new intent
        :param remove_duplicates: (optional) removes any duplicate intent in any level that is identical
        :return: CatBoostClassifier.
        """
        # resolve intent persist options
        # record this call (method name + params) as parameterised intent in the property manager
        _method = inspect.currentframe().f_code.co_name
        self._set_intend_signature(self._intent_builder(method=_method,
                                                        params=locals()),
                                   model_name=model_name,
                                   intent_order=intent_order,
                                   replace_intent=replace_intent,
                                   remove_duplicates=remove_duplicates,
                                   save_intent=save_intent)
        # Code block for intent
        # if this intent was previously registered under model_name, recover its stored
        # parameters so an explicit module_name argument can fall back to the stored one
        local_intent = {}
        if model_name and self._pm.has_intent(model_name):
            local_intent = self._pm.get_intent(level=model_name,
                                               intent=_method)
        module_name = module_name if isinstance(
            module_name, str) else local_intent.get('module_name', None)
        # split the canonical into features (X) and target (y)
        X = Commons.filter_columns(canonical, headers=headers)
        y = Commons.filter_columns(canonical, headers=target)
        # NOTE(review): the docstring promises a fitted model return but the visible body
        # never returns, and X/y are unused; this method looks unfinished — confirm.
        # NOTE(review): 'ds_behavioral' is hard-coded here while the resolved module_name
        # is ignored — presumably the module_name should be used; verify intent.
        module = HandlerFactory.get_module(module_name='ds_behavioral')
    def report_canonical_schema(self,
                                schema: [str, dict] = None,
                                roots: [str, list] = None,
                                sections: [str, list] = None,
                                elements: [str, list] = None,
                                stylise: bool = True):
        """ presents the current canonical schema

        :param schema: (optional) the name of a stored schema or a schema dict. Defaults to REPORT_SCHEMA
        :param roots: (optional) one or more tree roots to filter on
        :param sections: (optional) the section(s) under the root to filter on
        :param elements: (optional) the element(s) in the section to filter on
        :param stylise: if True present the report stylised.
        :return: pd.DataFrame
        :raises ValueError: if the named schema or a requested root cannot be found
        """
        if not isinstance(schema, dict):
            schema = schema if isinstance(schema, str) else self.REPORT_SCHEMA
            if not self.pm.has_canonical_schema(name=schema):
                raise ValueError(
                    f"There is no Schema currently stored under the name '{schema}'"
                )
            schema = self.pm.get_canonical_schema(name=schema)
        root_list = DataAnalytics.get_tree_roots(analytics_blob=schema)
        if isinstance(roots, (str, list)):
            roots = Commons.list_formatter(roots)
            for root in roots:
                if root not in root_list:
                    raise ValueError(
                        f"The root '{root}' can not be found in the analytics tree roots"
                    )
            root_list = roots
        # collect rows then build the frame once: per-row DataFrame.append is quadratic
        # and was removed in pandas 2.0
        rows = []
        for root_items in root_list:
            data_analysis = DataAnalytics.from_root(analytics_blob=schema,
                                                    root=root_items)
            for section in data_analysis.section_names:
                if isinstance(sections, (str, list)) and section not in Commons.list_formatter(sections):
                    continue
                for element, value in data_analysis.get(section).items():
                    if isinstance(elements, (str, list)) and element not in Commons.list_formatter(elements):
                        continue
                    rows.append([root_items, section, element, value])
        df = pd.DataFrame(rows, columns=['root', 'section', 'element', 'value'])
        if stylise:
            return Commons.report(df,
                                  index_header=['root', 'section'],
                                  bold='element')
        return df
    def report_notes(self,
                     catalog: [str, list] = None,
                     labels: [str, list] = None,
                     regex: [str, list] = None,
                     re_ignore_case: bool = False,
                     stylise: bool = True,
                     drop_dates: bool = False):
        """ generates a report on the notes

        :param catalog: (optional) the catalog to filter on
        :param labels: (optional) a label or list of labels to filter on
        :param regex: (optional) a regular expression on the notes
        :param re_ignore_case: (optional) if the regular expression should be case sensitive
        :param stylise: (optional) returns a stylised dataframe with formatting
        :param drop_dates: (optional) excludes the 'date' column from the report
        :return: pd.Dataframe
        """
        report = self.pm.report_notes(catalog=catalog, labels=labels, regex=regex,
                                      re_ignore_case=re_ignore_case, drop_dates=drop_dates)
        df = pd.DataFrame.from_dict(data=report)
        if not stylise:
            return df
        return Commons.report(df, index_header='section', bold='label')
    def report_connectors(self,
                          connector_filter: [str, list] = None,
                          inc_pm: bool = None,
                          inc_template: bool = None,
                          stylise: bool = True):
        """ generates a report on the source contract connectors

        :param connector_filter: (optional) filters on the connector name.
        :param inc_pm: (optional) include the property manager connector
        :param inc_template: (optional) include the template connectors
        :param stylise: (optional) returns a stylised DataFrame with formatting
        :return: pd.DataFrame
        """
        report = self.pm.report_connectors(connector_filter=connector_filter, inc_pm=inc_pm,
                                           inc_template=inc_template)
        df = pd.DataFrame.from_dict(data=report)
        # strip a leading '$' from cell values as it throws the report formatting
        for column in df.columns:
            df[column] = [f"{cell[1:]}" if str(cell).startswith('$') else cell for cell in df[column]]
        if not stylise:
            return df
        return Commons.report(df, index_header='connector_name')
 def test_make_list(self):
     """list_formatter coerces any supported value into a list, and None into []."""
     samples = ['', 0, 0.0,
                pd.Timestamp(2018, 1, 1), [], (),
                pd.Series(dtype=str),
                list(),
                tuple(), 'name', ['list1', 'list2'], ('tuple1', 'tuple2'),
                pd.Series(['series1', 'series2']),
                {'key1': 'value1', 'key2': 'value2'}, {},
                dict()]
     for sample in samples:
         result = Commons.list_formatter(sample)
         self.assertTrue(isinstance(result, list), sample)
     self.assertEqual([], Commons.list_formatter(None))
コード例 #10
0
 def test_filter(self):
     """filter_headers selects columns by explicit name and by the 'number' dtype family."""
     tools = self.tools
     sample_size = 1000
     df = pd.DataFrame()
     df['normal_num'] = tools.get_number(1, 10, size=sample_size, seed=31)
     df['single num'] = tools.get_number(1, 2, quantity=0.8, size=sample_size, seed=31)
     df['weight_num'] = tools.get_number(1, 3, relative_freq=[90, 1], size=sample_size, seed=31)
     df['null'] = tools.get_number(1, 100, quantity=0, size=sample_size, seed=31)
     df['single cat'] = tools.get_category(['A'], quantity=0.6, size=sample_size, seed=31)
     df['weight_cat'] = tools.get_category(['A', 'B', 'C'], relative_freq=[80, 1, 1], size=sample_size, seed=31)
     df['normal_cat'] = tools.get_category(['A', 'B', 'C'], size=sample_size, seed=31)
     # select by explicit header names
     self.assertCountEqual(['normal_num', 'single num'],
                           Commons.filter_headers(df, headers=['normal_num', 'single num']))
     # select by numeric dtype family
     self.assertCountEqual(['null', 'weight_num', 'normal_num', 'single num'],
                           Commons.filter_headers(df, dtype=['number']))
    def report_intent(self,
                      levels: [str, int, list] = None,
                      stylise: bool = True):
        """ generates a report on all the intent

        :param levels: (optional) a filter on the levels. passing a single value will report a single parameterised view
        :param stylise: (optional) returns a stylised dataframe with formatting
        :return: pd.Dataframe
        """
        # a single level gets the detailed parameterised view of that level
        if isinstance(levels, (int, str)):
            df = pd.DataFrame.from_dict(data=self.pm.report_intent_params(
                level=levels))
            if stylise:
                return Commons.report(df, index_header='order')
        # NOTE(review): when a single level is given with stylise=False, the parameterised
        # view built above is discarded and the generic report below is returned instead —
        # confirm this fall-through is intentional
        df = pd.DataFrame.from_dict(data=self.pm.report_intent(levels=levels))
        if stylise:
            return Commons.report(df, index_header='level')
        return df
    def report_run_book(self, stylise: bool = True):
        """ generates a report of the registered run books

        :param stylise: returns a stylised dataframe with formatting
        :return: pd.Dataframe
        """
        df = pd.DataFrame.from_dict(data=self.pm.report_run_book())
        if not stylise:
            return df
        return Commons.report(df, index_header='name')
コード例 #13
0
 def _correlated_columns(self, canonical: pd.DataFrame, threshold: float=0.98):
     """returns the set of column names whose absolute pairwise correlation with an
     earlier column exceeds the threshold, making them candidates for removal.

     :param canonical: the DataFrame to analyse
     :param threshold: (optional) the absolute correlation coefficient cut-off. Default is 0.98
     :return: a set of highly correlated column names
     """
     pad = self.scratch_pad()
     # coerce columns to numeric-friendly types before correlating
     canonical = pad.auto_to_category(canonical, unique_max=1000, inplace=False)
     canonical = pad.to_category_type(canonical, dtype='category', as_num=True)
     for c in canonical.columns:
         if all(Commons.valid_date(x) for x in canonical[c].dropna()):
             canonical = pad.to_date_type(canonical, dtype='datetime', as_num=True)
     canonical = Commons.filter_columns(canonical, dtype=['number'], exclude=False)
     for c in canonical.columns:
         canonical[c] = Commons.fillna(canonical[c])
     col_corr = set()
     corr_matrix = canonical.corr()
     # walk the lower triangle only; each pair is considered once
     for i in range(len(corr_matrix.columns)):
         for j in range(i):
             if abs(corr_matrix.iloc[i, j]) > threshold:  # we are interested in absolute coeff value
                 col_corr.add(corr_matrix.columns[i])
     return col_corr
コード例 #14
0
    def report_run_book(self, stylise: bool=True):
        """ generates a report of the run books, with each book item expanded to a row

        :param stylise: returns a stylised dataframe with formatting
        :return: pd.Dataframe
        """
        report = pd.DataFrame(self.pm.report_run_book())
        # one row per book item: explode the list then flatten each item's dict into columns
        exploded = report.explode(column='run_book', ignore_index=True)
        detail = pd.json_normalize(exploded['run_book'])
        canonical = exploded.join(detail).drop(columns=['run_book']).replace(np.nan, '')
        if not stylise:
            return canonical
        return Commons.report(canonical, index_header='name')
コード例 #15
0
    def runbook2dict(task: str, source: [str, int]=None, persist: bool=None, end_source: bool=None) -> dict:
        """ a utility method to help build runbook entries by aligning method parameters with dictionary format.

        :param task: the task name (intent level) this runbook is applied to, or a number if synthetic generation
        :param source: (optional) a task name indicating where the source of this task will come from. Optionally:
                            '@' will use the source contract of this task as the source input.
                            '@<connector>' will use the connector contract that must exist in the task connectors
        :param persist: (optional) if true persist to an event book named after the intent. if False do nothing
        :param end_source: (optional) if true indicates the source canonical can be removed from in-memory
        :return: dictionary of the parameters
        """
        # pass the parameters explicitly (equivalent to the **locals() idiom used elsewhere)
        return Commons.param2dict(task=task, source=source, persist=persist, end_source=end_source)
    def report_environ(self, hide_not_set: bool = True, stylise: bool = True):
        """ generates a report of the environment variables used by the component

        :param hide_not_set: hide environ keys that are not set.
        :param stylise: returns a stylised dataframe with formatting
        :return: pd.Dataframe
        """
        data = super().report_environ(hide_not_set)
        # the parent returns a mapping: pivot it into an environ/value two-column frame
        df = pd.DataFrame.from_dict(data=data, orient='index').reset_index()
        df.columns = ["environ", "value"]
        if not stylise:
            return df
        return Commons.report(df, index_header='environ')
コード例 #17
0
    def action2dict(method: Any, **kwargs) -> dict:
        """ a utility method that aligns an action method and its parameters into dictionary format.

        :param method: the method to execute
        :param kwargs: name value pairs associated with the method
        :return: dictionary of the parameters

        Special method values
            @header: use a column as the value reference, expects the 'header' key
            @constant: use a value constant, expects the key 'value'
            @sample: use to get sample values, expected 'name' of the Sample method, optional 'shuffle' boolean
            @eval: evaluate a code string, expects the key 'code_str' and any locals() required
        """
        parameters = Commons.param2dict(method=method, **kwargs)
        return parameters
コード例 #18
0
    def report_provenance(self, as_dict: bool=None, stylise: bool=None):
        """ a report on the provenance set as part of the domain contract

        :param as_dict: (optional) if the result should be a dictionary. Default is False
        :param stylise: (optional) if as_dict is False, if the returned DataFrame should be stylised
        :return: a dict, or a (stylised) pd.DataFrame of the provenance
        """
        if not isinstance(as_dict, bool):
            as_dict = False
        if not isinstance(stylise, bool):
            stylise = True
        report = self.pm.report_provenance()
        if as_dict:
            return report
        # pivot the provenance mapping into a provenance/values two-column frame
        df = pd.DataFrame(report, index=['values']).transpose().reset_index()
        df.columns = ['provenance', 'values']
        if stylise:
            return Commons.report(df, index_header='provenance')
        return df
コード例 #19
0
    def canonical2dict(method: Any, **kwargs) -> dict:
        """ a utility method that aligns a canonical method and its parameters into dictionary format.
        The method parameter can be either a 'model_*' or 'frame_*' method with two special reserved options

        Special reserved method values
            @empty: returns an empty dataframe, optionally the key values size: int and headers: list
            @generate: generates a dataframe either from_env(task_name) or from a remote repo uri. params are
                task_name: the task name of the generator
                repo_uri: (optional) a remote repo to retrieve the domain contract
                size: (optional) the generated sample size
                seed: (optional) if seeding should be applied, the seed value
                run_book: (optional) a domain contract runbook to execute as part of the pipeline

        :param method: the method to execute
        :param kwargs: name value pairs associated with the method
        :return: dictionary of the parameters
        """
        parameters = Commons.param2dict(method=method, **kwargs)
        return parameters
    def run_intent_pipeline(self, canonical: pd.DataFrame, intent_levels: [int, str, list]=None, run_book: str=None,
                            **kwargs):
        """ Collectively runs all parameterised intent taken from the property manager against the code base as
        defined by the intent_contract.

        It is expected that all intent methods have the 'canonical' as the first parameter of the method signature
        and will contain 'inplace' and 'save_intent' as parameters.

        :param canonical: this is the iterative value all intent are applied to and returned.
        :param intent_levels: (optional) a single or list of levels to run, if list, run in order given
        :param run_book: (optional) a preset runbook of intent_level to run in order
        :param kwargs: additional kwargs to add to the parameterised intent, these will replace any that already exist
        :return: Canonical with parameterised intent applied or None if inplace is True
        """
        # test if there is any intent to run
        if self._pm.has_intent():
            # resolve the ordered list of levels to run: explicit levels win over run_book
            if isinstance(intent_levels, (int, str, list)):
                intent_levels = Commons.list_formatter(intent_levels)
            elif isinstance(run_book, str) and self._pm.has_run_book(book_name=run_book):
                intent_levels = self._pm.get_run_book(book_name=run_book)
            else:
                intent_levels = sorted(self._pm.get_intent().keys())
            for level in intent_levels:
                level_key = self._pm.join(self._pm.KEY.intent_key, level)
                for order in sorted(self._pm.get(level_key, {})):
                    for method, params in self._pm.get(self._pm.join(level_key, order), {}).items():
                        if method in self.__dir__():
                            # fail safe in case kwargs was stored as the reference
                            params.update(params.pop('kwargs', {}))
                            # add method kwargs to the params
                            if isinstance(kwargs, dict):
                                params.update(kwargs)
                            # remove the creator param
                            _ = params.pop('intent_creator', 'Unknown')
                            # add excluded params and set to False
                            params.update({'inplace': False, 'save_intent': False})
                            # call the intent method directly; the previous eval() round-tripped
                            # params through repr(), which breaks on any value whose repr is not
                            # valid Python (and needlessly re-parses every call)
                            canonical = getattr(self, method)(canonical, **params)
        return canonical
コード例 #21
0
    def select2dict(column: str, condition: str, expect: str=None, logic: str=None, date_format: str=None,
                    offset: int=None) -> dict:
        """ a utility method to help build feature conditions by aligning method parameters with dictionary format.

        :param column: the column name to apply the condition to
        :param condition: the condition string (special conditions are 'date.now' for current date
        :param expect: (optional) the data type to expect. If None then the data type is assumed from the dtype
        :param logic: (optional) the logic to provide, see below for options
        :param date_format: (optional) a format of the date if only a specific part of the date and time is required
        :param offset: (optional) a time delta in days (+/-) from the current date and time (minutes not supported)
        :return: dictionary of the parameters

        logic:
            AND: the intersect of the current state with the condition result (common to both)
            NAND: outside the intersect of the current state with the condition result (not common to both)
            OR: the union of the current state with the condition result (everything in both)
            NOR: outside the union of the current state with the condition result (everything not in both)
            NOT: the difference between the current state and the condition result
            XOR: the difference between the union and the intersect current state with the condition result
        extra logic:
            ALL: the intersect of the whole index with the condition result irrelevant of level or current state index
            ANY: the intersect of the level index with the condition result irrelevant of current state index
        """
        return Commons.param2dict(**locals())
コード例 #22
0
    def run_controller(self, run_book: [str, list, dict]=None, mod_tasks: [list, dict]=None, repeat: int=None,
                       sleep: int=None, run_time: int=None, source_check_uri: str=None, run_cycle_report: str=None):
        """ Runs the components pipeline based on the runbook instructions. The run_book can be a simple list of
        controller registered task name that will run in the given order passing the resulting outcome of one to the
        input of the next, a list of task dictionaries that contain more detailed run commands (see below) or a
        mixture of task names and task dictionaries. If no runbook is given,  all registered task names are taken from
        the intent list and run in no particular order and independent of each other using their connector source and
        persist as data input

        run book list elements can be a dictionary contain more detailed run commands for a particular task. if a
        dictionary is used it must contain the task_name as a minimum
        The dictionary keys are as follows:
            - task_name: The task name (intent level) this run detail is applied to
            - source: (optional) The task name of the source or '@<intent_name>' to reference a known event book
            - persist: (optional) if true persist to an event book named after the intent. if False do nothing
            - end_source (optional) if this task will be the last to use the source, remove it from memory on completion

        mod_tasks are a dictionary of modifications to tasks in the runbook. The run_book will still define the run
        order and modification tasks not found in the run_book will be ignored. The dictionary is indexed on the task
        name with the modifications a sub-dictionary of name value pairs.
            for example: mod_tasks = {'my_synth_gen': {source: 1000}}
            changes 'my_synth_gen' to now have a source reference of 1000 meaning it will generate 1000 synthetic rows.

        :param run_book: (optional) a run_book reference, a list of task names (intent levels)
        :param mod_tasks: (optional) a dict of modifications that override an existing task in the runbook
        :param repeat: (optional) the number of times this intent should be repeated. None or -1 -> never, 0 -> forever
        :param sleep: (optional) number of seconds to sleep before repeating
        :param run_time: (optional) number of seconds to run the controller using repeat and sleep cycles time is up
        :param source_check_uri: (optional) The source uri to check for change since last controller instance cycle
        :param run_cycle_report: (optional) The run cycle report name that provides the run cycle activities
        """
        _lock = threading.Lock()
        mod_tasks = mod_tasks if isinstance(mod_tasks, (list, dict)) else []
        if isinstance(run_cycle_report, str):
            self.add_connector_persist(connector_name='run_cycle_report', uri_file=run_cycle_report)
            df_report = pd.DataFrame(columns=['time', 'text'])
        if isinstance(mod_tasks, dict):
            mod_tasks = [mod_tasks]
        if not self.pm.has_intent():
            return
        if isinstance(run_book, str):
            if not self.pm.has_run_book(run_book) and run_book not in self.pm.get_intent().keys():
                raise ValueError(f"The run book or intent level '{run_book}' can not be found in the controller")
            if self.pm.has_run_book(run_book):
                intent_levels = self.pm.get_run_book(book_name=run_book)
            else:
                intent_levels = Commons.list_formatter(run_book)
        elif isinstance(run_book, list):
            intent_levels = run_book
        elif isinstance(run_book, dict):
            intent_levels = [run_book]
        elif self.pm.has_run_book(book_name=self.pm.PRIMARY_RUN_BOOK):
            intent_levels = self.pm.get_run_book(book_name=self.pm.PRIMARY_RUN_BOOK)
        else:
            intent_levels = Commons.list_formatter(self.pm.get_intent().keys())
            # always put the DEFAULT_INTENT_LEVEL first
            if self.pm.DEFAULT_INTENT_LEVEL in intent_levels:
                intent_levels.insert(0, intent_levels.pop(intent_levels.index(self.pm.DEFAULT_INTENT_LEVEL)))
        for idx in range(len(intent_levels)):
            if isinstance(intent_levels[idx], str):
                intent_levels[idx] = {'task': intent_levels[idx]}
            if 'end_source' not in intent_levels[idx].keys():
                intent_levels[idx].update({'end_source': False})
            if 'persist' not in intent_levels[idx].keys():
                _persist = True if idx == len(intent_levels) - 1 else False
                intent_levels[idx].update({'persist': _persist})
            if 'source' not in intent_levels[idx].keys():
                _level0 = self.pm.get_intent(intent_levels[idx].get('task')).get('0', {})
                if 'synthetic_builder' in _level0.keys():
                    _source = int(_level0.get('synthetic_builder', {}).get('size', 1000))
                else:
                    _source = f'@{self.CONNECTOR_SOURCE}' if idx == 0 else intent_levels[idx - 1].get('task')
                intent_levels[idx].update({'source': _source})
            if intent_levels[idx].get('source') == '@':
                intent_levels[idx].update({'source': f'@{self.CONNECTOR_SOURCE}'})
            for mod in mod_tasks:
                if intent_levels[idx].get('task') in mod.keys():
                    intent_levels[idx].update(mod.get(intent_levels[idx].get('task'), {}))
        handler = None
        if isinstance(source_check_uri, str):
            self.add_connector_uri(connector_name='source_checker', uri=source_check_uri)
            handler = self.pm.get_connector_handler(connector_name='source_checker')
        repeat = repeat if isinstance(repeat, int) and repeat > 0 else 1
        run_time = run_time if isinstance(run_time, int) else 0
        if run_time > 0 and not isinstance(sleep, int):
            sleep = 1
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=run_time)
        run_count = 0
        while True: # run_time always runs once
            if isinstance(run_cycle_report, str):
                df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'start run-cycle {run_count}']
            for count in range(repeat):
                if isinstance(run_cycle_report, str):
                    df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'start run count {count}']
                if handler and handler.exists():
                    if handler.has_changed():
                        handler.reset_changed(False)
                    else:
                        if isinstance(run_cycle_report, str):
                            df_report.loc[len(df_report.index)] = [datetime.datetime.now(), 'Source has not changed']
                        if isinstance(sleep, int) and count < repeat - 1:
                            time.sleep(sleep)
                        continue
                for intent in intent_levels:
                    task = intent.get('task')
                    source = intent.get('source', '')
                    to_persist = intent.get('persist')
                    end_source = intent.get('end_source', False)
                    if isinstance(run_cycle_report, str):
                        df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'running {task}']
                    if isinstance(source, int) or (isinstance(source, str) and source.startswith('@')):
                        canonical = source
                    elif isinstance(source, str) and source.isnumeric():
                        canonical = int(source)
                    else:
                        if self.eb_portfolio.is_active_book(source):
                            canonical = self.eb_portfolio.current_state(source)
                            if end_source:
                                self.eb_portfolio.remove_event_books(book_names=task)
                        else:
                            raise ValueError(f"The task '{task}' source event book '{source}' does not exist")
                    # get the result
                    canonical = self.intent_model.run_intent_pipeline(canonical=canonical, intent_level=task,
                                                                      persist_result=to_persist,
                                                                      controller_repo=self.URI_PM_REPO)
                    if isinstance(run_cycle_report, str):
                        df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f"canonical shape is "
                                                                                        f"{canonical.shape}"]
                    if to_persist:
                        continue
                    if self.eb_portfolio.is_event_book(task):
                        self.eb_portfolio.remove_event_books(task)
                    eb = self.eb_portfolio.intent_model.add_event_book(book_name=task, start_book=True)
                    self.eb_portfolio.add_book_to_portfolio(book_name=task, event_book=eb)
                    self.eb_portfolio.add_event(book_name=task, event=canonical)
                self.eb_portfolio.reset_portfolio()
                if isinstance(run_cycle_report, str):
                    df_report.loc[len(df_report.index)] = [datetime.datetime.now(), 'tasks complete']
                if isinstance(sleep, int) and count < repeat-1:
                    time.sleep(sleep)
            if isinstance(run_cycle_report, str):
                run_count += 1
            if end_time < datetime.datetime.now():
                break
            else:
                time.sleep(sleep)
        if isinstance(run_cycle_report, str):
            df_report.loc[len(df_report.index)] = [datetime.datetime.now(), 'end of report']
            self.save_canonical(connector_name='run_cycle_report', canonical=df_report)
        return
Code example #23
0
    def report_quality_summary(self, canonical: pd.DataFrame=None, as_dict: bool=None, stylise: bool=None):
        """ a summary quality report of the canonical

        :param canonical: (optional) the canonical to be summarised. If not passed then loads the canonical source
        :param as_dict: (optional) if the result should be a dictionary. Default is False
        :param stylise: (optional) if as_dict is False, if the return dataFrame should be stylised
        :return: a dict or pd.DataFrame
        """
        as_dict = as_dict if isinstance(as_dict, bool) else False
        stylise = stylise if isinstance(stylise, bool) else True
        if not isinstance(canonical, pd.DataFrame):
            canonical = self._auto_transition()
        # provenance: how many of the expected provenance headers have been supplied
        _provenance_headers = ['title', 'license', 'domain', 'description', 'provider',  'author', 'cost']
        _provenance_count = len(list(filter(lambda x: x in _provenance_headers, self.pm.provenance.keys())))
        _provenance_cost = self.pm.provenance.get('cost', {}).get('price', 'NA')
        # described: how many canonical columns have attribute knowledge recorded against them
        _described_keys = self.pm.get_knowledge(catalog='attributes').keys()
        _described_count = len(list(filter(lambda x: x in canonical.columns, _described_keys)))
        # dictionary report gives per-attribute null and predominance percentages
        _dictionary = self.canonical_report(canonical, stylise=False)
        _total_fields = _dictionary.shape[0]
        _null_total = _dictionary['%_Null'].sum()
        _dom_fields = _dictionary['%_Dom'].sum()
        # attributes that are mostly null or have a single predominant value (> 98%)
        _null_columns = _dictionary['%_Null'].where(_dictionary['%_Null'] > 0.98).dropna()
        _dom_columns = _dictionary['%_Dom'].where(_dictionary['%_Dom'] > 0.98).dropna()
        # use the Series index (one entry per attribute row) rather than the percentage
        # values, so two attributes that share the same percentage are not collapsed
        # into a single set entry and under-count the unusable fields
        _usable_fields = set(_null_columns.index)
        _usable_fields.update(_dom_columns.index)
        _numeric_fields = len(Commons.filter_headers(canonical, dtype='number'))
        _category_fields = len(Commons.filter_headers(canonical, dtype='category'))
        _date_fields = len(Commons.filter_headers(canonical, dtype='datetime'))
        _bool_fields = len(Commons.filter_headers(canonical, dtype='bool'))
        _other_fields = len(Commons.filter_headers(canonical, dtype=['category', 'datetime', 'bool',
                                                                     'number'],  exclude=True))
        # averages are taken over the column count; quality blends null and predominance
        _null_avg = _null_total / canonical.shape[1]
        _dom_avg = _dom_fields / canonical.shape[1]
        _quality_avg = int(round(100 - (((_null_avg + _dom_avg)/2)*100), 0))
        _correlated = self._correlated_columns(canonical)
        _usable = int(round(100 - (len(_usable_fields) / canonical.columns.size) * 100, 2))
        _field_avg = int(round(_described_count / canonical.shape[1] * 100, 0))
        _prov_avg = int(round(_provenance_count/len(_provenance_headers)*100, 0))
        _adjustments = self.report_intent(stylise=False).intent.size
        report = {'score': {'quality_avg': f"{_quality_avg}%", 'usability_avg': f"{_usable}%",
                            'provenance_complete': f"{_prov_avg}%", 'data_described': f"{_field_avg}%"},
                  'data_shape': {'rows': canonical.shape[0], 'columns': canonical.shape[1],
                                 'memory': Commons.bytes2human(canonical.memory_usage(deep=True).sum())},
                  'data_type': {'numeric': _numeric_fields, 'category': _category_fields,
                                'datetime': _date_fields, 'bool': _bool_fields,
                                'others': _other_fields},
                  'usability': {'mostly_null': len(_null_columns),
                                'predominance': len(_dom_columns),
                                'correlated': len(_correlated),
                                'adjustments': _adjustments},
                  'cost': {'price': _provenance_cost}}
        if as_dict:
            return report
        # flatten the nested report dict into a three-column frame for display
        df = pd.DataFrame(columns=['report', 'summary', 'result'])
        counter = 0
        for index, values in report.items():
            for summary, result in values.items():
                df.loc[counter] = [index, summary, result]
                counter += 1
        if stylise:
            # NOTE(review): presumably renders the styled frame as a side effect — confirm
            Commons.report(df, index_header='report', bold='summary')
        return df
Code example #24
0
 def list_formatter(value) -> list:
     """Delegate list formatting to Commons so Pandas types are also handled."""
     formatted = Commons.list_formatter(value=value)
     return formatted
Code example #25
0
 def report_quality(self, canonical: pd.DataFrame=None) -> dict:
     """A complete quality report of the component.

     :param canonical: (optional) the canonical to report on. If not passed, the canonical
             is obtained through the auto transition pipeline
     :return: a dict with meta-data, description, summary, connectors, provenance,
             attributes, dictionary and notes sections
     """
     if not isinstance(canonical, pd.DataFrame):
         canonical = self._auto_transition()
     # meta
     report = {'meta-data': {'uid': str(uuid.uuid4()),
                             'created': str(pd.Timestamp.now()),
                             'creator': self.pm.username},
               'description': self.pm.description,
               'summary': self.report_quality_summary(canonical, as_dict=True)}
     # connectors: report each connector contract, skipping internal and template connectors
     _connectors = {}
     for connector in self.pm.connector_contract_list:
         if connector.startswith('pm_transition') or connector.startswith('template_'):
             continue
         _connector = self.pm.get_connector_contract(connector_name=connector)
         _connector_dict = {}
         if isinstance(_connector, ConnectorContract):
             # flatten the raw kwargs and query dicts into double-space separated "k='v'" strings
             kwargs = ''
             if isinstance(_connector.raw_kwargs, dict):
                 kwargs = "  ".join(f"{k}='{v}'" for k, v in _connector.raw_kwargs.items())
             query = ''
             if isinstance(_connector.query, dict):
                 query = "  ".join(f"{k}='{v}'" for k, v in _connector.query.items())
             _connector_dict['uri'] = _connector.raw_uri
             _connector_dict['version'] = _connector.version
             if len(kwargs) > 0:
                 _connector_dict['kwargs'] = kwargs
             if len(query) > 0:
                 _connector_dict['query'] = query
         _connectors[connector] = _connector_dict
     report['connectors'] = _connectors
     # provenance (the summary section already carries the completeness percentage)
     report['provenance'] = self.pm.provenance
     # fields: attribute knowledge notes keyed by attribute label
     _fields = {}
     for label, items in self.pm.get_knowledge(catalog='attributes').items():
         _fields[label] = Commons.list_formatter(items.values())
     report['attributes'] = _fields
     # dictionary: one entry per canonical attribute keyed by the first cell of each
     # report row, excluding the 'Attribute*' columns from the per-attribute measures
     _data_dict = {}
     for _, row in self.canonical_report(canonical, stylise=False).iterrows():
         _data_dict[row.iloc[0]] = {index: row.loc[index] for index in row.index
                                    if not index.startswith('Attribute')}
     report['dictionary'] = _data_dict
     # notes: observation and action knowledge catalogs
     _observations = {}
     for label, items in self.pm.get_knowledge(catalog='observations').items():
         _observations[label] = Commons.list_formatter(items.values())
     _actions = {}
     for label, items in self.pm.get_knowledge(catalog='actions').items():
         _actions[label] = Commons.list_formatter(items.values())
     report['notes'] = {'observations': _observations,
                        'actions': _actions}
     return report