def run_component_pipeline(self, canonical: pd.DataFrame=None, feature_names: [str, list]=None,
                           auto_connectors: bool=None, save: bool=None, reset_changed: bool=None,
                           has_changed: bool=None):
    """runs all features within the feature catalog or an optional set of features

    :param canonical: (optional) a canonical if the source canonical isn't to be used
    :param feature_names: (optional) a single feature or list of features to run
    :param auto_connectors: (optional) adds a versioned feature connector if not yet added. Defaults to True
    :param save: (optional) if True, persist changes to the property manager. Default is True
    :param reset_changed: (optional) resets the has_changed boolean to True
    :param has_changed: (optional) tests if the underlying canonical has changed since last load, else an
            error is raised
    """
    auto_connectors = auto_connectors if isinstance(auto_connectors, bool) else True
    if isinstance(feature_names, (str, list)):
        feature_names = Commons.list_formatter(feature_names)
    else:
        feature_names = Commons.list_formatter(self.pm.get_intent())
    if not isinstance(canonical, (pd.DataFrame, str)):
        canonical = self.load_source_canonical(reset_changed=reset_changed, has_changed=has_changed)
    for feature in feature_names:
        if not self.pm.has_connector(feature):
            if not auto_connectors:
                continue
            self.set_feature_bootstrap(feature_name=feature, versioned=True, save=save)
        result = self.intent_model.run_intent_pipeline(canonical, feature)
        self.save_catalog_feature(feature_name=feature, canonical=result)
    return
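# A minimal usage sketch for run_component_pipeline, assuming a FeatureCatalog-style component
# instance 'fc'; the feature names 'customer_age' and 'spend_band' are hypothetical illustrations.
fc = FeatureCatalog.from_env('demo', has_contract=False)  # assumed factory method
fc.run_component_pipeline(feature_names=['customer_age', 'spend_band'])
# or run every feature in the catalog, auto-creating versioned feature connectors
fc.run_component_pipeline()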
def test_get_cols(self):
    tools = self.tools
    cleaner = self.clean
    df = pd.DataFrame()
    df['int'] = tools.get_number(100, size=10)
    df['float'] = tools.get_number(0, 1.0, size=10)
    df['object'] = tools.get_category(list('abcdef'), size=10)
    df['date'] = tools.get_datetime('01/01/2010', '01/01/2018', size=10)
    df['category'] = tools.get_category(list('vwxyz'), size=10)
    df = cleaner.to_category_type(df, headers='category')
    df = cleaner.to_date_type(df, headers='date')
    control = ['float', 'object', 'date', 'category', 'int']
    result = Commons.filter_headers(df)
    self.assertTrue(set(result).intersection(control))
    control = ['object']
    result = Commons.filter_headers(df, dtype=[object])
    self.assertEqual(control, result)
    result = Commons.filter_headers(df, dtype=['object'])
    self.assertEqual(control, result)
    control = ['float', 'int']
    result = Commons.filter_headers(df, dtype=['number'])
    self.assertTrue(set(result).intersection(control))
    control = ['date']
    result = Commons.filter_headers(df, dtype=['datetime'])
    self.assertEqual(control, result)
def _report(self, canonical: pd.DataFrame, index_header: str, bold: [str, list]=None,
            large_font: [str, list]=None):
    """ generates a stylised report

    :param canonical: the DataFrame to stylise
    :param index_header: the header to treat as the index when styling
    :param bold: (optional) a single header or list of headers to embolden
    :param large_font: (optional) a single header or list of headers to enlarge
    :return: stylised report DataFrame
    """
    pd.set_option('max_colwidth', 200)
    pd.set_option('expand_frame_repr', True)
    bold = Commons.list_formatter(bold)
    bold.append(index_header)
    large_font = Commons.list_formatter(large_font)
    large_font.append(index_header)
    style = [{'selector': 'th', 'props': [('font-size', "120%"), ("text-align", "center")]},
             {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]
    index = canonical[canonical[index_header].duplicated()].index.to_list()
    canonical.loc[index, index_header] = ''
    canonical = canonical.reset_index(drop=True)
    df_style = canonical.style.set_table_styles(style)
    _ = df_style.set_properties(**{'text-align': 'left'})
    if len(bold) > 0:
        _ = df_style.set_properties(subset=bold, **{'font-weight': 'bold'})
    if len(large_font) > 0:
        _ = df_style.set_properties(subset=large_font, **{'font-size': "120%"})
    return df_style
def test_associate_analysis_complex(self):
    builder = SyntheticBuilder.from_memory()
    clinical_health = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
    builder.add_connector_uri('clinical_health', uri=clinical_health)
    discover: DataDiscovery = Transition.from_memory().discover
    A = discover.analysis2dict(header='age', dtype='int', granularity=10.0, lower=21, upper=90)
    B = discover.analysis2dict(header='pregnancies')
    columns_list = [A, B]
    df_clinical = builder.load_canonical('clinical_health')
    analysis_blob = discover.analyse_association(df_clinical, columns_list=columns_list)
    canonical = pd.DataFrame(index=range(1973))
    df = builder.tools.model_analysis(canonical, analysis_blob=analysis_blob, column_name='clinical')
    self.assertEqual((1973, 2), df.shape)
    pregnancies = Commons.list_standardize(Commons.list_formatter(df_clinical.pregnancies))
    low, high = discover.bootstrap_confidence_interval(pd.Series(pregnancies), func=np.mean)
    pregnancies = Commons.list_standardize(Commons.list_formatter(df.pregnancies))
    self.assertTrue(low <= np.mean(pregnancies) <= high)
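# A standalone sketch of the percentile bootstrap confidence interval idea exercised in the test
# above; this is illustrative only, not the library's implementation. The helper name
# 'bootstrap_ci' and all numbers are assumptions.
import numpy as np

def bootstrap_ci(values, func=np.mean, n_boot=1000, alpha=0.05, seed=31):
    rng = np.random.default_rng(seed)
    values = np.asarray(values)
    # resample with replacement and collect the statistic for each resample
    stats = [func(rng.choice(values, size=values.size, replace=True)) for _ in range(n_boot)]
    return np.percentile(stats, 100 * alpha / 2), np.percentile(stats, 100 * (1 - alpha / 2))

low, high = bootstrap_ci(np.random.default_rng(0).integers(0, 15, size=700))
# 'low' and 'high' bracket the statistic of interest at the given confidence level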
def register_estimator(self, canonical: pd.DataFrame, target: str, headers: list, class_name: str,
                       module_name: str, hyper_param: dict=None, test_size: float=None,
                       random_state: int=None, save_intent: bool=None, model_name: str=None,
                       intent_order: int=None, replace_intent: bool=None, remove_duplicates: bool=None):
    """ registers and fits an estimator model, returning the model fit

    :param canonical: the model canonical
    :param class_name: the name of the model class
    :param target: the model target
    :param headers: the model features header names
    :param hyper_param: (optional) hyper parameters for the model instance
    :param test_size: (optional) the size of the test sample (default to 0.33)
    :param random_state: (optional) a random state value for the test sample
    :param module_name: (optional) the name of the module
    :param save_intent: (optional) if the intent contract should be saved to the property manager
    :param model_name: (optional) the name of the model
    :param intent_order: (optional) the order in which each intent should run.
            If None: defaults to -1
            if -1: added to a level above any current instance of the intent section, level 0 if not found
            if int: added to the level specified, overwriting any that already exist
    :param replace_intent: (optional) if the intent method exists at the level, or default level
            True - replaces the current intent method with the new
            False - leaves it untouched, disregarding the new intent
    :param remove_duplicates: (optional) removes any duplicate intent in any level that is identical
    :return: the fitted estimator model
    """
    # resolve intent persist options
    _method = inspect.currentframe().f_code.co_name
    self._set_intend_signature(self._intent_builder(method=_method, params=locals()),
                               model_name=model_name, intent_order=intent_order,
                               replace_intent=replace_intent, remove_duplicates=remove_duplicates,
                               save_intent=save_intent)
    # Code block for intent
    local_intent = {}
    if model_name and self._pm.has_intent(model_name):
        local_intent = self._pm.get_intent(level=model_name, intent=_method)
    module_name = module_name if isinstance(module_name, str) else local_intent.get('module_name', None)
    X = Commons.filter_columns(canonical, headers=headers)
    y = Commons.filter_columns(canonical, headers=target)
    module = HandlerFactory.get_module(module_name='ds_behavioral')
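# The register_estimator body above is truncated at the module lookup. A minimal sketch of how the
# remaining fit step might look, assuming the estimator class is resolved by name and fitted on a
# train split; sklearn's train_test_split and the helper name '_fit_named_estimator' are
# illustrative assumptions, not the library's code.
from sklearn.model_selection import train_test_split

def _fit_named_estimator(module, class_name, X, y, hyper_param=None, test_size=None, random_state=None):
    test_size = test_size if isinstance(test_size, float) and 0 < test_size < 1 else 0.33
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=random_state)
    estimator = getattr(module, class_name)(**(hyper_param or {}))  # resolve the class by name
    return estimator.fit(X_train, y_train.values.ravel())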
def report_canonical_schema(self, schema: [str, dict]=None, roots: [str, list]=None,
                            sections: [str, list]=None, elements: [str, list]=None,
                            stylise: bool=True):
    """ presents the current canonical schema

    :param schema: (optional) the name of the schema
    :param roots: (optional) one or more tree roots
    :param sections: (optional) the section under the root
    :param elements: (optional) the element in the section
    :param stylise: if True, present the report stylised.
    :return: pd.DataFrame
    """
    if not isinstance(schema, dict):
        schema = schema if isinstance(schema, str) else self.REPORT_SCHEMA
        if not self.pm.has_canonical_schema(name=schema):
            raise ValueError(f"There is no Schema currently stored under the name '{schema}'")
        schema = self.pm.get_canonical_schema(name=schema)
    df = pd.DataFrame(columns=['root', 'section', 'element', 'value'])
    root_list = DataAnalytics.get_tree_roots(analytics_blob=schema)
    if isinstance(roots, (str, list)):
        roots = Commons.list_formatter(roots)
        for root in roots:
            if root not in root_list:
                raise ValueError(f"The root '{root}' can not be found in the analytics tree roots")
        root_list = roots
    for root_items in root_list:
        data_analysis = DataAnalytics.from_root(analytics_blob=schema, root=root_items)
        for section in data_analysis.section_names:
            if isinstance(sections, (str, list)):
                if section not in Commons.list_formatter(sections):
                    continue
            for element, value in data_analysis.get(section).items():
                if isinstance(elements, (str, list)):
                    if element not in Commons.list_formatter(elements):
                        continue
                # append the row directly; DataFrame.append was removed in pandas 2.x
                df.loc[len(df)] = [root_items, section, element, value]
    if stylise:
        return Commons.report(df, index_header=['root', 'section'], bold='element')
    return df
def report_notes(self, catalog: [str, list]=None, labels: [str, list]=None, regex: [str, list]=None,
                 re_ignore_case: bool=False, stylise: bool=True, drop_dates: bool=False):
    """ generates a report on the notes

    :param catalog: (optional) the catalog to filter on
    :param labels: (optional) a label or list of labels to filter on
    :param regex: (optional) a regular expression on the notes
    :param re_ignore_case: (optional) if the regular expression should ignore case
    :param stylise: (optional) returns a stylised DataFrame with formatting
    :param drop_dates: (optional) excludes the 'date' column from the report
    :return: pd.DataFrame
    """
    report = self.pm.report_notes(catalog=catalog, labels=labels, regex=regex,
                                  re_ignore_case=re_ignore_case, drop_dates=drop_dates)
    df = pd.DataFrame.from_dict(data=report)
    if stylise:
        return Commons.report(df, index_header='section', bold='label')
    return df
def report_connectors(self, connector_filter: [str, list]=None, inc_pm: bool=None,
                      inc_template: bool=None, stylise: bool=True):
    """ generates a report on the source contract

    :param connector_filter: (optional) filters on the connector name.
    :param inc_pm: (optional) include the property manager connector
    :param inc_template: (optional) include the template connectors
    :param stylise: (optional) returns a stylised DataFrame with formatting
    :return: pd.DataFrame
    """
    report = self.pm.report_connectors(connector_filter=connector_filter, inc_pm=inc_pm,
                                       inc_template=inc_template)
    df = pd.DataFrame.from_dict(data=report)
    # strip any leading '$' as it throws the stylised formatting
    for c in df.columns:
        df[c] = [f"{x[1:]}" if str(x).startswith('$') else x for x in df[c]]
    if stylise:
        return Commons.report(df, index_header='connector_name')
    return df
def test_make_list(self):
    for value in ['', 0, 0.0, pd.Timestamp(2018, 1, 1), [], (), pd.Series(dtype=str), list(), tuple(),
                  'name', ['list1', 'list2'], ('tuple1', 'tuple2'), pd.Series(['series1', 'series2']),
                  {'key1': 'value1', 'key2': 'value2'}, {}, dict()]:
        result = Commons.list_formatter(value)
        self.assertTrue(isinstance(result, list), value)
    self.assertEqual([], Commons.list_formatter(None))
def test_filter(self):
    tools = self.tools
    sample_size = 1000
    df = pd.DataFrame()
    df['normal_num'] = tools.get_number(1, 10, size=sample_size, seed=31)
    df['single num'] = tools.get_number(1, 2, quantity=0.8, size=sample_size, seed=31)
    df['weight_num'] = tools.get_number(1, 3, relative_freq=[90, 1], size=sample_size, seed=31)
    df['null'] = tools.get_number(1, 100, quantity=0, size=sample_size, seed=31)
    df['single cat'] = tools.get_category(['A'], quantity=0.6, size=sample_size, seed=31)
    df['weight_cat'] = tools.get_category(['A', 'B', 'C'], relative_freq=[80, 1, 1], size=sample_size, seed=31)
    df['normal_cat'] = tools.get_category(['A', 'B', 'C'], size=sample_size, seed=31)
    result = Commons.filter_headers(df, headers=['normal_num', 'single num'])
    control = ['normal_num', 'single num']
    self.assertCountEqual(control, result)
    result = Commons.filter_headers(df, dtype=['number'])
    control = ['null', 'weight_num', 'normal_num', 'single num']
    self.assertCountEqual(control, result)
def report_intent(self, levels: [str, int, list]=None, stylise: bool=True):
    """ generates a report on all the intent

    :param levels: (optional) a filter on the levels. Passing a single value will report a single
            parameterised view
    :param stylise: (optional) returns a stylised DataFrame with formatting
    :return: pd.DataFrame
    """
    if isinstance(levels, (int, str)):
        df = pd.DataFrame.from_dict(data=self.pm.report_intent_params(level=levels))
        if stylise:
            return Commons.report(df, index_header='order')
        return df
    df = pd.DataFrame.from_dict(data=self.pm.report_intent(levels=levels))
    if stylise:
        return Commons.report(df, index_header='level')
    return df
def report_run_book(self, stylise: bool=True):
    """ generates a report on all the run books

    :param stylise: returns a stylised DataFrame with formatting
    :return: pd.DataFrame
    """
    df = pd.DataFrame.from_dict(data=self.pm.report_run_book())
    if stylise:
        return Commons.report(df, index_header='name')
    return df
def _correlated_columns(self, canonical: pd.DataFrame):
    """returns the set of column names that are highly correlated with another column"""
    threshold = 0.98
    pad = self.scratch_pad()
    canonical = pad.auto_to_category(canonical, unique_max=1000, inplace=False)
    canonical = pad.to_category_type(canonical, dtype='category', as_num=True)
    for c in canonical.columns:
        if all(Commons.valid_date(x) for x in canonical[c].dropna()):
            canonical = pad.to_date_type(canonical, dtype='datetime', as_num=True)
    canonical = Commons.filter_columns(canonical, dtype=['number'], exclude=False)
    for c in canonical.columns:
        canonical[c] = Commons.fillna(canonical[c])
    col_corr = set()
    corr_matrix = canonical.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # we are interested in the absolute coefficient value
                colname = corr_matrix.columns[i]  # the name of the correlated column
                col_corr.add(colname)
    return col_corr
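# A minimal standalone sketch of the same correlation screen using plain pandas; the toy
# DataFrame is illustrative only, with 'b' a perfect linear function of 'a'.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.arange(100.), 'b': np.arange(100.) * 2 + 1, 'c': np.random.rand(100)})
corr = df.corr().abs()
# keep the upper triangle so each pair is tested once, then flag columns above the threshold
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
correlated = {col for col in upper.columns if (upper[col] > 0.98).any()}
print(correlated)  # {'b'}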
def report_run_book(self, stylise: bool=True):
    """ generates a report on all the run books

    :param stylise: returns a stylised DataFrame with formatting
    :return: pd.DataFrame
    """
    report = pd.DataFrame(self.pm.report_run_book())
    explode = report.explode(column='run_book', ignore_index=True)
    canonical = explode.join(pd.json_normalize(explode['run_book'])).drop(columns=['run_book']).replace(np.nan, '')
    if stylise:
        return Commons.report(canonical, index_header='name')
    return canonical
def runbook2dict(task: str, source: [str, int]=None, persist: bool=None, end_source: bool=None) -> dict:
    """ a utility method to help build feature conditions by aligning method parameters with dictionary format.

    :param task: the task name (intent level) this runbook is applied to, or a number if synthetic generation
    :param source: (optional) a task name indicating where the source of this task will come from. Optionally:
            '@' will use the source contract of this task as the source input.
            '@<connector>' will use the connector contract that must exist in the task connectors
    :param persist: (optional) if True, persist to an event book named after the intent. If False, do nothing
    :param end_source: (optional) if True, indicates the source canonical can be removed from in-memory
    :return: dictionary of the parameters
    """
    return Commons.param2dict(**locals())
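# A minimal usage sketch, assuming runbook2dict is exposed as a static helper on the Controller
# component; the task names 'members_gen' and 'members_clean' are hypothetical illustrations.
run_book = [
    Controller.runbook2dict(task='members_gen', source=1000, persist=False),
    Controller.runbook2dict(task='members_clean', source='members_gen', persist=True, end_source=True),
]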
def report_environ(self, hide_not_set: bool=True, stylise: bool=True):
    """ generates a report on the environment variables

    :param hide_not_set: hide environ keys that are not set.
    :param stylise: returns a stylised DataFrame with formatting
    :return: pd.DataFrame
    """
    df = pd.DataFrame.from_dict(data=super().report_environ(hide_not_set), orient='index').reset_index()
    df.columns = ["environ", "value"]
    if stylise:
        return Commons.report(df, index_header='environ')
    return df
def action2dict(method: Any, **kwargs) -> dict:
    """ a utility method to help build feature conditions by aligning method parameters with dictionary format.

    :param method: the method to execute
    :param kwargs: name value pairs associated with the method
    :return: dictionary of the parameters

    Special method values
        @header: use a column as the value reference, expects the 'header' key
        @constant: use a value constant, expects the key 'value'
        @sample: used to get sample values, expects the 'name' of the Sample method, optional 'shuffle' boolean
        @eval: evaluate a code string, expects the key 'code_str' and any locals() required
    """
    return Commons.param2dict(method=method, **kwargs)
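# A minimal usage sketch of the special method values, assuming the helper is reachable through
# a component's intent tools (aliased here as 'tools'); the column name 'profession' and sample
# name 'us_professions' are hypothetical illustrations.
copy_action = tools.action2dict(method='@header', header='profession')  # copy another column
const_action = tools.action2dict(method='@constant', value='unknown')   # fill with a constant
sample_action = tools.action2dict(method='@sample', name='us_professions', shuffle=True)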
def report_provenance(self, as_dict: bool=None, stylise: bool=None):
    """ a report on the provenance set as part of the domain contract

    :param as_dict: (optional) if the result should be a dictionary. Default is False
    :param stylise: (optional) if as_dict is False, if the returned DataFrame should be stylised
    :return: a dict or pd.DataFrame
    """
    as_dict = as_dict if isinstance(as_dict, bool) else False
    stylise = stylise if isinstance(stylise, bool) else True
    report = self.pm.report_provenance()
    if as_dict:
        return report
    df = pd.DataFrame(report, index=['values'])
    df = df.transpose().reset_index()
    df.columns = ['provenance', 'values']
    if stylise:
        return Commons.report(df, index_header='provenance')
    return df
def canonical2dict(method: Any, **kwargs) -> dict:
    """ a utility method to help build feature conditions by aligning method parameters with dictionary
    format. The method parameter can be either a 'model_*' or 'frame_*' method with two special
    reserved options

    Special reserved method values
        @empty: returns an empty DataFrame, optionally with the key values size: int and headers: list
        @generate: generates a DataFrame either from_env(task_name) or from a remote repo uri. Params are
            task_name: the task name of the generator
            repo_uri: (optional) a remote repo to retrieve the domain contract
            size: (optional) the generated sample size
            seed: (optional) if seeding should be applied, the seed value
            run_book: (optional) a domain contract runbook to execute as part of the pipeline

    :param method: the method to execute
    :param kwargs: name value pairs associated with the method
    :return: dictionary of the parameters
    """
    return Commons.param2dict(method=method, **kwargs)
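# A minimal usage sketch of the reserved method values, again assuming the helper is reachable
# through 'tools'; the task name 'members_gen' is a hypothetical illustration.
empty_frame = tools.canonical2dict(method='@empty', size=100, headers=['id', 'name'])
generated = tools.canonical2dict(method='@generate', task_name='members_gen', size=1000, seed=31)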
def run_intent_pipeline(self, canonical: pd.DataFrame, intent_levels: [int, str, list]=None,
                        run_book: str=None, **kwargs):
    """ Collectively runs all parameterised intent taken from the property manager against the code base as
    defined by the intent_contract. It is expected that all intent methods have the 'canonical' as the first
    parameter of the method signature and will contain 'inplace' and 'save_intent' as parameters.

    :param canonical: this is the iterative value all intent are applied to and returned.
    :param intent_levels: (optional) a single level or list of levels to run; if a list, run in the order given
    :param run_book: (optional) a preset runbook of intent levels to run in order
    :param kwargs: additional kwargs to add to the parameterised intent, these will replace any that already exist
    :return: canonical with parameterised intent applied, or None if inplace is True
    """
    # test if there is any intent to run
    if self._pm.has_intent():
        # get the list of levels to run
        if isinstance(intent_levels, (int, str, list)):
            intent_levels = Commons.list_formatter(intent_levels)
        elif isinstance(run_book, str) and self._pm.has_run_book(book_name=run_book):
            intent_levels = self._pm.get_run_book(book_name=run_book)
        else:
            intent_levels = sorted(self._pm.get_intent().keys())
        for level in intent_levels:
            level_key = self._pm.join(self._pm.KEY.intent_key, level)
            for order in sorted(self._pm.get(level_key, {})):
                for method, params in self._pm.get(self._pm.join(level_key, order), {}).items():
                    if method in self.__dir__():
                        # fail-safe in case kwargs was stored as the reference
                        params.update(params.pop('kwargs', {}))
                        # add method kwargs to the params
                        if isinstance(kwargs, dict):
                            params.update(kwargs)
                        # remove the creator param
                        _ = params.pop('intent_creator', 'Unknown')
                        # add excluded params and set to False
                        params.update({'inplace': False, 'save_intent': False})
                        canonical = eval(f"self.{method}(canonical, **{params})", globals(), locals())
    return canonical
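# A minimal usage sketch, assuming a component whose intent model carries registered intent;
# the level names 'base' and 'augment' are hypothetical illustrations.
df = component.intent_model.run_intent_pipeline(pd.DataFrame(index=range(100)),
                                                intent_levels=['base', 'augment'])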
def select2dict(column: str, condition: str, expect: str=None, logic: str=None, date_format: str=None,
                offset: int=None) -> dict:
    """ a utility method to help build feature conditions by aligning method parameters with dictionary format.

    :param column: the column name to apply the condition to
    :param condition: the condition string (special conditions are 'date.now' for current date)
    :param expect: (optional) the data type to expect. If None then the data type is assumed from the dtype
    :param logic: (optional) the logic to provide, see below for options
    :param date_format: (optional) a format of the date if only a specific part of the date and time is required
    :param offset: (optional) a time delta in days (+/-) from the current date and time (minutes not supported)
    :return: dictionary of the parameters

    logic:
        AND: the intersect of the current state with the condition result (common to both)
        NAND: outside the intersect of the current state with the condition result (not common to both)
        OR: the union of the current state with the condition result (everything in both)
        NOR: outside the union of the current state with the condition result (everything not in both)
        NOT: the difference between the current state and the condition result
        XOR: the difference between the union and the intersect current state with the condition result
    extra logic:
        ALL: the intersect of the whole index with the condition result irrelevant of level or current state index
        ANY: the intersect of the level index with the condition result irrelevant of current state index
    """
    return Commons.param2dict(**locals())
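# A minimal usage sketch building a compound selection; the column names and condition strings
# are hypothetical illustrations of the documented logic options.
adults = tools.select2dict(column='age', condition='@ >= 18')
recent = tools.select2dict(column='joined', condition='date.now', offset=-30, logic='AND')
selection = [adults, recent]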
def run_controller(self, run_book: [str, list, dict]=None, mod_tasks: [list, dict]=None, repeat: int=None,
                   sleep: int=None, run_time: int=None, source_check_uri: str=None,
                   run_cycle_report: str=None):
    """ Runs the component pipeline based on the runbook instructions. The run_book can be a simple list of
    controller-registered task names that run in the given order, passing the resulting outcome of one to the
    input of the next, a list of task dictionaries that contain more detailed run commands (see below), or a
    mixture of task names and task dictionaries. If no runbook is given, all registered task names are taken
    from the intent list and run in no particular order, independent of each other, using their connector
    source and persist as data input.

    Run book list elements can be a dictionary containing more detailed run commands for a particular task.
    If a dictionary is used it must contain the task_name as a minimum. The dictionary keys are as follows:
        - task_name: the task name (intent level) this run detail is applied to
        - source: (optional) the task name of the source or '@<intent_name>' to reference a known event book
        - persist: (optional) if True, persist to an event book named after the intent. If False, do nothing
        - end_source: (optional) if this task will be the last to use the source, remove it from memory
          on completion

    mod_tasks are a dictionary of modifications to tasks in the runbook. The run_book still defines the run
    order, and modification tasks not found in the run_book are ignored. The dictionary is indexed on the
    task name with the modifications as a sub-dictionary of name value pairs. For example:
        mod_tasks = {'my_synth_gen': {'source': 1000}}
    changes 'my_synth_gen' to now have a source reference of 1000, meaning it will generate 1000 synthetic rows.

    :param run_book: (optional) a run_book reference or a list of task names (intent levels)
    :param mod_tasks: (optional) a dict of modifications that override an existing task in the runbook
    :param repeat: (optional) the number of times the task cycle should repeat within each run cycle.
            Defaults to 1 if not a positive integer
    :param sleep: (optional) number of seconds to sleep before repeating
    :param run_time: (optional) number of seconds to run the controller, using repeat and sleep cycles,
            until the time is up
    :param source_check_uri: (optional) the source uri to check for change since the last controller cycle
    :param run_cycle_report: (optional) the run cycle report name that provides the run cycle activities
    """
    _lock = threading.Lock()
    mod_tasks = mod_tasks if isinstance(mod_tasks, (list, dict)) else []
    if isinstance(run_cycle_report, str):
        self.add_connector_persist(connector_name='run_cycle_report', uri_file=run_cycle_report)
        df_report = pd.DataFrame(columns=['time', 'text'])
    if isinstance(mod_tasks, dict):
        mod_tasks = [mod_tasks]
    if not self.pm.has_intent():
        return
    if isinstance(run_book, str):
        if not self.pm.has_run_book(run_book) and run_book not in self.pm.get_intent().keys():
            raise ValueError(f"The run book or intent level '{run_book}' can not be found in the controller")
        if self.pm.has_run_book(run_book):
            intent_levels = self.pm.get_run_book(book_name=run_book)
        else:
            intent_levels = Commons.list_formatter(run_book)
    elif isinstance(run_book, list):
        intent_levels = run_book
    elif isinstance(run_book, dict):
        intent_levels = [run_book]
    elif self.pm.has_run_book(book_name=self.pm.PRIMARY_RUN_BOOK):
        intent_levels = self.pm.get_run_book(book_name=self.pm.PRIMARY_RUN_BOOK)
    else:
        intent_levels = Commons.list_formatter(self.pm.get_intent().keys())
    # always put the DEFAULT_INTENT_LEVEL first
    if self.pm.DEFAULT_INTENT_LEVEL in intent_levels:
        intent_levels.insert(0, intent_levels.pop(intent_levels.index(self.pm.DEFAULT_INTENT_LEVEL)))
    for idx in range(len(intent_levels)):
        if isinstance(intent_levels[idx], str):
            intent_levels[idx] = {'task': intent_levels[idx]}
        if 'end_source' not in intent_levels[idx].keys():
            intent_levels[idx].update({'end_source': False})
        if 'persist' not in intent_levels[idx].keys():
            _persist = True if idx == len(intent_levels) - 1 else False
            intent_levels[idx].update({'persist': _persist})
        if 'source' not in intent_levels[idx].keys():
            _level0 = self.pm.get_intent(intent_levels[idx].get('task')).get('0', {})
            if 'synthetic_builder' in _level0.keys():
                _source = int(_level0.get('synthetic_builder', {}).get('size', 1000))
            else:
                _source = f'@{self.CONNECTOR_SOURCE}' if idx == 0 else intent_levels[idx - 1].get('task')
            intent_levels[idx].update({'source': _source})
        if intent_levels[idx].get('source') == '@':
            intent_levels[idx].update({'source': f'@{self.CONNECTOR_SOURCE}'})
        for mod in mod_tasks:
            if intent_levels[idx].get('task') in mod.keys():
                intent_levels[idx].update(mod.get(intent_levels[idx].get('task'), {}))
    handler = None
    if isinstance(source_check_uri, str):
        self.add_connector_uri(connector_name='source_checker', uri=source_check_uri)
        handler = self.pm.get_connector_handler(connector_name='source_checker')
    repeat = repeat if isinstance(repeat, int) and repeat > 0 else 1
    run_time = run_time if isinstance(run_time, int) else 0
    if run_time > 0 and not isinstance(sleep, int):
        sleep = 1
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=run_time)
    run_count = 0
    while True:  # run_time always runs once
        if isinstance(run_cycle_report, str):
            df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'start run-cycle {run_count}']
        for count in range(repeat):
            if isinstance(run_cycle_report, str):
                df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'start run count {count}']
            if handler and handler.exists():
                if handler.has_changed():
                    handler.reset_changed(False)
                else:
                    if isinstance(run_cycle_report, str):
                        df_report.loc[len(df_report.index)] = [datetime.datetime.now(),
                                                               'Source has not changed']
                    if isinstance(sleep, int) and count < repeat - 1:
                        time.sleep(sleep)
                    continue
            for intent in intent_levels:
                task = intent.get('task')
                source = intent.get('source', '')
                to_persist = intent.get('persist')
                end_source = intent.get('end_source', False)
                if isinstance(run_cycle_report, str):
                    df_report.loc[len(df_report.index)] = [datetime.datetime.now(), f'running {task}']
                if isinstance(source, int) or (isinstance(source, str) and source.startswith('@')):
                    canonical = source
                elif isinstance(source, str) and source.isnumeric():
                    canonical = int(source)
                else:
                    if self.eb_portfolio.is_active_book(source):
                        canonical = self.eb_portfolio.current_state(source)
                        if end_source:
                            self.eb_portfolio.remove_event_books(book_names=task)
                    else:
                        raise ValueError(f"The task '{task}' source event book '{source}' does not exist")
                # get the result
                canonical = self.intent_model.run_intent_pipeline(canonical=canonical, intent_level=task,
                                                                  persist_result=to_persist,
                                                                  controller_repo=self.URI_PM_REPO)
                if isinstance(run_cycle_report, str):
                    df_report.loc[len(df_report.index)] = [datetime.datetime.now(),
                                                           f"canonical shape is {canonical.shape}"]
                if to_persist:
                    continue
                if self.eb_portfolio.is_event_book(task):
                    self.eb_portfolio.remove_event_books(task)
                eb = self.eb_portfolio.intent_model.add_event_book(book_name=task, start_book=True)
                self.eb_portfolio.add_book_to_portfolio(book_name=task, event_book=eb)
                self.eb_portfolio.add_event(book_name=task, event=canonical)
            self.eb_portfolio.reset_portfolio()
            if isinstance(run_cycle_report, str):
                df_report.loc[len(df_report.index)] = [datetime.datetime.now(), 'tasks complete']
            if isinstance(sleep, int) and count < repeat - 1:
                time.sleep(sleep)
        if isinstance(run_cycle_report, str):
            run_count += 1
        if end_time < datetime.datetime.now():
            break
        else:
            time.sleep(sleep)
    if isinstance(run_cycle_report, str):
        df_report.loc[len(df_report.index)] = [datetime.datetime.now(), 'end of report']
        self.save_canonical(connector_name='run_cycle_report', canonical=df_report)
    return
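# A minimal usage sketch, assuming a Controller domain contract with registered tasks; the task
# name 'my_synth_gen' mirrors the mod_tasks example in the docstring above.
controller = Controller.from_env()  # assumed factory method
controller.run_controller(run_book=['my_synth_gen'], mod_tasks={'my_synth_gen': {'source': 1000}})
# repeat the cycle every 60 seconds for an hour, logging activity to a run cycle report
controller.run_controller(repeat=1, sleep=60, run_time=3600, run_cycle_report='run_report.csv')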
def report_quality_summary(self, canonical: pd.DataFrame=None, as_dict: bool=None, stylise: bool=None):
    """ a summary quality report of the canonical

    :param canonical: (optional) the canonical to be summarised. If not passed then loads the canonical source
    :param as_dict: (optional) if the result should be a dictionary. Default is False
    :param stylise: (optional) if as_dict is False, if the returned DataFrame should be stylised
    :return: a dict or pd.DataFrame
    """
    as_dict = as_dict if isinstance(as_dict, bool) else False
    stylise = stylise if isinstance(stylise, bool) else True
    if not isinstance(canonical, pd.DataFrame):
        canonical = self._auto_transition()
    # provenance
    _provenance_headers = ['title', 'license', 'domain', 'description', 'provider', 'author', 'cost']
    _provenance_count = len(list(filter(lambda x: x in _provenance_headers, self.pm.provenance.keys())))
    _provenance_cost = self.pm.provenance.get('cost', {}).get('price', 'NA')
    # described
    _described_keys = self.pm.get_knowledge(catalog='attributes').keys()
    _described_count = len(list(filter(lambda x: x in canonical.columns, _described_keys)))
    # dictionary
    _dictionary = self.canonical_report(canonical, stylise=False)
    _total_fields = _dictionary.shape[0]
    _null_total = _dictionary['%_Null'].sum()
    _dom_fields = _dictionary['%_Dom'].sum()
    _null_columns = _dictionary['%_Null'].where(_dictionary['%_Null'] > 0.98).dropna()
    _dom_columns = _dictionary['%_Dom'].where(_dictionary['%_Dom'] > 0.98).dropna()
    _usable_fields = set(_null_columns)
    _usable_fields.update(_dom_columns)
    _numeric_fields = len(Commons.filter_headers(canonical, dtype='number'))
    _category_fields = len(Commons.filter_headers(canonical, dtype='category'))
    _date_fields = len(Commons.filter_headers(canonical, dtype='datetime'))
    _bool_fields = len(Commons.filter_headers(canonical, dtype='bool'))
    _other_fields = len(Commons.filter_headers(canonical, dtype=['category', 'datetime', 'bool', 'number'],
                                               exclude=True))
    _null_avg = _null_total / canonical.shape[1]
    _dom_avg = _dom_fields / canonical.shape[1]
    _quality_avg = int(round(100 - (((_null_avg + _dom_avg) / 2) * 100), 0))
    _correlated = self._correlated_columns(canonical)
    _usable = int(round(100 - (len(_usable_fields) / canonical.columns.size) * 100, 2))
    _field_avg = int(round(_described_count / canonical.shape[1] * 100, 0))
    _prov_avg = int(round(_provenance_count / len(_provenance_headers) * 100, 0))
    _adjustments = self.report_intent(stylise=False).intent.size
    report = {'score': {'quality_avg': f"{_quality_avg}%", 'usability_avg': f"{_usable}%",
                        'provenance_complete': f"{_prov_avg}%", 'data_described': f"{_field_avg}%"},
              'data_shape': {'rows': canonical.shape[0], 'columns': canonical.shape[1],
                             'memory': Commons.bytes2human(canonical.memory_usage(deep=True).sum())},
              'data_type': {'numeric': _numeric_fields, 'category': _category_fields,
                            'datetime': _date_fields, 'bool': _bool_fields, 'others': _other_fields},
              'usability': {'mostly_null': len(_null_columns), 'predominance': len(_dom_columns),
                            'correlated': len(_correlated), 'adjustments': _adjustments},
              'cost': {'price': _provenance_cost}}
    if as_dict:
        return report
    df = pd.DataFrame(columns=['report', 'summary', 'result'])
    counter = 0
    for index, values in report.items():
        for summary, result in values.items():
            df.loc[counter] = [index, summary, result]
            counter += 1
    if stylise:
        return Commons.report(df, index_header='report', bold='summary')
    return df
def list_formatter(value) -> list:
    """override of the list_formatter to include Pandas types"""
    return Commons.list_formatter(value=value)
def report_quality(self, canonical: pd.DataFrame=None) -> dict:
    """A complete quality report of the component"""
    if not isinstance(canonical, pd.DataFrame):
        canonical = self._auto_transition()
    # meta
    report = {'meta-data': {'uid': str(uuid.uuid4()),
                            'created': str(pd.Timestamp.now()),
                            'creator': self.pm.username},
              'description': self.pm.description,
              'summary': self.report_quality_summary(canonical, as_dict=True)}
    # connectors
    _connectors = {}
    for connector in self.pm.connector_contract_list:
        if connector.startswith('pm_transition') or connector.startswith('template_'):
            continue
        _connector = self.pm.get_connector_contract(connector_name=connector)
        _connector_dict = {}
        if isinstance(_connector, ConnectorContract):
            kwargs = ''
            if isinstance(_connector.raw_kwargs, dict):
                for k, v in _connector.raw_kwargs.items():
                    if len(kwargs) > 0:
                        kwargs += " "
                    kwargs += f"{k}='{v}'"
            query = ''
            if isinstance(_connector.query, dict):
                for k, v in _connector.query.items():
                    if len(query) > 0:
                        query += " "
                    query += f"{k}='{v}'"
            _connector_dict['uri'] = _connector.raw_uri
            _connector_dict['version'] = _connector.version
            if len(kwargs) > 0:
                _connector_dict['kwargs'] = kwargs
            if len(query) > 0:
                _connector_dict['query'] = query
        _connectors[connector] = _connector_dict
    report['connectors'] = _connectors
    # provenance
    report['provenance'] = self.pm.provenance
    _provenance_headers = ['title', 'license', 'domain', 'description', 'provider', 'author', 'cost']
    _provenance_count = len(list(filter(lambda x: x in _provenance_headers, self.pm.provenance.keys())))
    # fields
    _field_count = 0
    _fields = {}
    for label, items in self.pm.get_knowledge(catalog='attributes').items():
        _fields[label] = Commons.list_formatter(items.values())
        if label in canonical.columns:
            _field_count += 1
    report['attributes'] = _fields
    # dictionary
    _data_dict = {}
    for _, row in self.canonical_report(canonical, stylise=False).iterrows():
        _data_dict[row.iloc[0]] = {}
        _att_name = None
        for index in row.index:
            if index.startswith('Attribute'):
                _att_name = row.loc[index]
                continue
            _data_dict[row.iloc[0]].update({index: row.loc[index]})
    report['dictionary'] = _data_dict
    # notes
    _observations = {}
    for label, items in self.pm.get_knowledge(catalog='observations').items():
        _observations[label] = Commons.list_formatter(items.values())
    _actions = {}
    for label, items in self.pm.get_knowledge(catalog='actions').items():
        _actions[label] = Commons.list_formatter(items.values())
    report['notes'] = {'observations': _observations, 'actions': _actions}
    return report