def find_report_date(prev_dashboards, new_metric):
    """
    Function is used to look into a previous report and retrieve the
    date on which a data quality metric was first reported.

    Parameters
    ----------
    prev_dashboards (string): name of the 'old' dashboards that
        should reside in an Excel file in the current directory.
        these dashboards will be necessary to update the
        'first_reported' aspect of DataQualityMetric objects.

    new_metric (DataQualityMetric): object whose 'counterpart'
        in the 'dashboard' needs to be found in order to report
        out the date. the sheet named after new_metric.hpo is
        the one that gets searched.

    Returns
    -------
    report_date (pd.Timestamp): value of the 'First Reported' column
        for the row that matches new_metric. if several rows match,
        the last matching row wins.

    Raises
    ------
    AssertionError: if no matching row was found or the value found
        is not a pd.Timestamp.
    """
    # NOTE(review): another find_report_date with a wider signature is
    # defined later in this source. if both live in the same module the
    # later definition replaces this one - confirm which is intended.
    sheet = load_files(sheet_name=new_metric.hpo, file_name=prev_dashboards)

    # same standards as employed by cross_reference_old_metrics:
    # every one of these columns must equal the attribute of new_metric
    expected_values = {
        'HPO': new_metric.hpo,
        'Table/Class': new_metric.table_or_class,
        'Metric Type': new_metric.metric_type,
        'Data Quality Dimension': new_metric.data_quality_dimension,
        'Link': new_metric.link,
    }

    report_date = None  # default - should be changed to a timestamp

    for _, row in sheet.iterrows():
        correct_row = all(
            row[column] == expected
            for column, expected in expected_values.items())

        if correct_row:
            # should be a timestamp
            report_date = row['First Reported']

    # check that it is reassigned - just in case. raised explicitly
    # (rather than via 'assert') so the check survives 'python -O'
    if not isinstance(report_date, pd.Timestamp):
        raise AssertionError(
            "Date not found in the old dashboard. This applies to "
            "the following DataQualityMetric object: {dq}".format(
                dq=new_metric.print_dqd_attributes()))

    return report_date
def _build_dq_metric_for_column(sheet, row_num, metric, hpo_name,
                                column_for_table, date):
    """
    Create the DataQualityMetric for one HPO / metric / column
    combination, reading its value out of the sheet via get_err_rate.
    """
    # the value is fetched first; the remaining lookups follow after
    value = get_err_rate(sheet, row_num, metric, hpo_name, column_for_table)

    return DataQualityMetric(
        hpo=hpo_name,
        table_or_class=table_or_class_based_on_column_provided[
            column_for_table],
        metric_type=metric_type_to_english_dict[metric],
        value=value,
        first_reported=date,
        data_quality_dimension=data_quality_dimension_dict[metric],
        link=relevant_links[metric])


def populate_hpo_objects_with_dq_metrics(hpo_objects, metrics, file_name,
                                         date):
    """
    Function attaches DataQualityMetric objects to each of the HPO
    objects provided. One DataQualityMetric is created per
    (metric, HPO, documented column) combination.

    Parameters
    ----------
    hpo_objects (lst): list of HPO objects (see class_definitions.py)
        that will be populated with the data quality metrics.

    metrics (lst): names of the sheets to load; each one is also used
        as the key into the per-metric lookup dictionaries.

    file_name (str): name of the file passed to load_files when
        loading each sheet.

    date (datetime): stored as 'first_reported' on every
        DataQualityMetric that is created.

    Returns
    -------
    hpo_objects (lst): the same list that was passed in; its HPO
        objects have received the DataQualityMetric objects through
        add_attribute_with_string.
    """
    # outer loop over the metrics so each sheet is only loaded once
    for metric_name in metrics:
        analytics_sheet = load_files(
            sheet_name=metric_name, file_name=file_name)

        for site in hpo_objects:
            site_name = site.name
            site_row = find_hpo_row(analytics_sheet, site_name)

            # build every object for this site / metric before any of
            # them is attached to the site
            site_dq_metrics = [
                _build_dq_metric_for_column(
                    analytics_sheet, site_row, metric_name,
                    site_name, column, date)
                for column in columns_to_document_for_sheet[metric_name]
            ]

            for dq_metric in site_dq_metrics:
                site.add_attribute_with_string(
                    metric=dq_metric.metric_type, dq_object=dq_metric)

    return hpo_objects
def find_report_date(prev_dashboards, new_metric, new_hpo_ids,
                     excel_file_name):
    """
    Function is used to look into a previous report and determine the
    date on which a data quality metric was first reported.

    Parameters
    ----------
    prev_dashboards (string): name of the 'old' dashboards that
        should reside in an Excel file in the current directory.
        these dashboards will be necessary to update the
        'first_reported' aspect of DataQualityMetric objects.

    new_metric (DataQualityMetric): object whose 'counterpart'
        in the 'dashboard' needs to be found in order to report
        out the date.

    new_hpo_ids (list): contains the IDs of the HPOs that are
        new to the latest 'analytics report' and therefore are
        not contained in the previous 'panels'

    excel_file_name (str): the name of the most recent 'analytics'
        report. its last five characters are dropped and the rest is
        parsed with constants.date_format to get the 'date' that will
        be assigned to novel data quality issues.

    Returns
    -------
    report_date (pd.Timestamp): one of the following
        a. the date parsed from excel_file_name (HPO is new)
        b. new_metric.first_reported (metric type is listed in
           new_metric_types)
        c. the 'first reported' value of the matching row in the
           previous dashboard. if several rows match, the last
           matching row wins.

    Raises
    ------
    AssertionError: if the HPO and metric type are both not new and
        no matching row with a pd.Timestamp was found.
    """
    if new_metric.hpo in new_hpo_ids:
        # new HPO site - means that the issue must have
        # originated in the latest 'analytics report' and
        # did not exist in the previous sheet
        date_str = excel_file_name[:-5]  # take off the .xlsx
        file_date = datetime.datetime.strptime(
            date_str, constants.date_format)
        return pd.Timestamp(file_date)

    sheet = load_files(sheet_name=new_metric.hpo, file_name=prev_dashboards)

    # same standards as employed by cross_reference_old_metrics.
    # comparison is case-insensitive; the lowercase form of each
    # attribute of new_metric is computed once rather than per row
    targets = [
        (constants.hpo_col_name, new_metric.hpo.lower()),
        (constants.table_class_col_name, new_metric.table_or_class.lower()),
        (constants.metric_type_col_name, new_metric.metric_type.lower()),
        (constants.data_quality_dimension_col_name,
         new_metric.data_quality_dimension.lower()),
        (constants.link_col_name, new_metric.link.lower()),
    ]

    report_date = None  # default - should be changed to a timestamp

    for _, row in sheet.iterrows():
        correct_row = all(
            row[column].lower() == wanted for column, wanted in targets)

        if correct_row:
            report_date = row[constants.first_reported_col_name]

    if new_metric.metric_type in new_metric_types:
        # NOTE: the metric being investigated is new and therefore
        # should not be expected to appear on an old 'data_quality
        # _issues' panel. please be sure to delete a 'new metric'
        # in the corresponding list once it appears in old panels.
        # any date found in the loop above is not used in this case
        return new_metric.first_reported

    # ensure reassignment. raised explicitly (rather than via 'assert')
    # so the check survives 'python -O'
    if not isinstance(report_date, pd.Timestamp):
        raise AssertionError(
            f" Date not found in the old dashboard. This applies to "
            f"the following DataQualityMetric object: "
            f"{new_metric.print_dqd_attributes()}")

    return report_date