Example #1
0
def find_report_date(prev_dashboards, new_metric):
    """
    Function is used to look into a previous report and find the date
    on which a data quality metric was first reported.

    Parameters
    ----------
    prev_dashboards (string): name of the 'old' dashboards that
        should reside in an Excel file in the current directory.
        the value is handed to load_files as 'file_name'; the sheet
        named after new_metric.hpo is the one that gets searched.

    new_metric (DataQualityMetric): object whose 'counterpart'
        in the 'dashboard' needs to be found in order to
        report out the date. its hpo, table_or_class, metric_type,
        data_quality_dimension and link attributes are compared
        against each row.

    Returns
    -------
    report_date (pd.Timestamp): value of the 'First Reported' column
        of the matching row. if several rows match, the last one wins.

    Raises
    ------
    AssertionError: if no matching row supplied a pd.Timestamp.
    """
    sheet = load_files(sheet_name=new_metric.hpo, file_name=prev_dashboards)
    # now we have the sheet in question - should be easy to find the row

    report_date = None  # default - should be changed to a Timestamp

    for _, row in sheet.iterrows():

        # same standards as employed by cross_reference_old_metrics
        same_hpo = (row['HPO'] == new_metric.hpo)
        same_table = (row['Table/Class'] == new_metric.table_or_class)
        same_mt = (row['Metric Type'] == new_metric.metric_type)
        same_dqd = (
            row['Data Quality Dimension'] == new_metric.data_quality_dimension)
        same_link = (row['Link'] == new_metric.link)

        if same_hpo and same_table and same_mt and same_dqd and same_link:
            # should be a timestamp
            report_date = row['First Reported']

    # check that it is reassigned - just in case. raised explicitly
    # (rather than via 'assert') so the check survives 'python -O';
    # the exception type is kept for existing callers.
    if not isinstance(report_date, pd.Timestamp):
        raise AssertionError(
            "Date not found in the old dashboard. This applies to "
            "the following DataQualityMetric object: {dq}".format(
                dq=new_metric.print_dqd_attributes()))

    return report_date
Example #2
0
def populate_hpo_objects_with_dq_metrics(hpo_objects, metrics, file_name,
                                         date):
    """
    Function is used to attach DataQualityMetric objects to the HPO
    objects it is given. One sheet is loaded per metric; for every
    HPO and every column documented for that sheet, a
    DataQualityMetric is built and then added to the HPO.

    Parameters
    ----------
    hpo_objects (lst): list of HPO objects (see class_definitions.py)
        that will be populated with the data quality metrics.

    metrics (lst): list of the sheet names to load; each name is also
        the key used to look up the columns, metric type, data
        quality dimension and link for that sheet.

    file_name (str): handed to load_files as the file to read the
        sheets from.

    date (datetime): stored as 'first_reported' on every
        DataQualityMetric that is created.

    Returns
    -------
    hpo_objects (lst): the same list that was passed in, whose HPO
        objects have had the DataQualityMetric objects added.
    """

    def _build_dq_metric(sheet, row_num, metric, hpo_name, column):
        # the error rate is fetched first, then the object is assembled
        rate = get_err_rate(sheet, row_num, metric, hpo_name, column)
        return DataQualityMetric(
            hpo=hpo_name,
            table_or_class=table_or_class_based_on_column_provided[column],
            metric_type=metric_type_to_english_dict[metric],
            value=rate,
            first_reported=date,
            data_quality_dimension=data_quality_dimension_dict[metric],
            link=relevant_links[metric])

    # outer loop is over metrics so each sheet is only loaded once
    for sheet_key in metrics:
        analytics_sheet = load_files(sheet_name=sheet_key,
                                     file_name=file_name)

        for site in hpo_objects:
            site_name = site.name
            site_row = find_hpo_row(analytics_sheet, site_name)

            # columns worth documenting within this analytics sheet
            columns = columns_to_document_for_sheet[sheet_key]

            # build every object for this site/metric pair before any
            # of them is attached to the HPO
            built = [
                _build_dq_metric(analytics_sheet, site_row, sheet_key,
                                 site_name, column)
                for column in columns
            ]

            for dq_metric in built:
                site.add_attribute_with_string(metric=dq_metric.metric_type,
                                               dq_object=dq_metric)

    return hpo_objects
def find_report_date(prev_dashboards, new_metric, new_hpo_ids,
                     excel_file_name):
    """
    Function is used to look into a previous report and determine
    the date on which a data quality metric was first reported.

    Parameters
    ----------
    prev_dashboards (string): name of the 'old' dashboards that
        should reside in an Excel file in the current directory.
        the value is handed to load_files as 'file_name'; the sheet
        named after new_metric.hpo is the one that gets searched.

    new_metric (DataQualityMetric): object whose 'counterpart'
        in the 'dashboard' needs to be found in order to
        report out the date.

    new_hpo_ids (list): contains the IDs of the HPOs that are
        new to the latest 'analytics report' and therefore are
        not contained in the previous 'panels'

    excel_file_name (str): the name of the most recent 'analytics'
        report. with its extension removed it must parse with
        constants.date_format; that date is assigned to issues
        belonging to a new HPO.

    Returns
    -------
    report_date: new_metric.first_reported if the metric type is
        listed in new_metric_types; otherwise a pd.Timestamp, taken
        from the file name (new HPO) or from the matching row of the
        old dashboard. if several rows match, the last one wins.

    Raises
    ------
    AssertionError: if the metric type is not new and no
        pd.Timestamp could be determined.
    """
    # local import so the block does not rely on a file-level 'os' import
    import os

    if new_metric.metric_type in new_metric_types:
        # NOTE: the metric being investigated is new and therefore
        # should not be expected to appear on an old 'data_quality
        # _issues' panel. please be sure to delete a 'new metric'
        # in the corresponding list once it appears in old panels.
        # returning here skips loading a sheet whose result would
        # be discarded anyway.
        return new_metric.first_reported

    if new_metric.hpo in new_hpo_ids:
        # new HPO site - means that the issue must have
        # originated in the latest 'analytics report' and
        # did not exist in the previous sheet

        # take off the extension (e.g. '.xlsx') without assuming
        # how many characters it has
        date_str, _ = os.path.splitext(excel_file_name)
        date = datetime.datetime.strptime(date_str, constants.date_format)
        report_date = pd.Timestamp(date)

    else:
        sheet = load_files(
            sheet_name=new_metric.hpo, file_name=prev_dashboards)
        # now we have the sheet in question - should be easy to find the row

        report_date = None  # default - should be changed to a Timestamp

        for _, row in sheet.iterrows():

            # same standards as employed by cross_reference_old_metrics;
            # all comparisons are case-insensitive
            same_hpo = (
                row[constants.hpo_col_name].lower() == new_metric.hpo.lower())
            same_table = (row[constants.table_class_col_name].lower() ==
                          new_metric.table_or_class.lower())
            same_mt = (row[constants.metric_type_col_name].lower() ==
                       new_metric.metric_type.lower())
            same_dqd = (row[constants.data_quality_dimension_col_name].lower()
                        == new_metric.data_quality_dimension.lower())
            same_link = (row[constants.link_col_name].lower() ==
                         new_metric.link.lower())

            # get the date
            if (same_hpo and same_table and same_mt and same_dqd
                    and same_link):
                report_date = row[constants.first_reported_col_name]

    # ensure reassignment. raised explicitly (rather than via 'assert')
    # so the check survives 'python -O'; the exception type is kept
    # for existing callers.
    if not isinstance(report_date, pd.Timestamp):
        raise AssertionError(
            f"""
            Date not found in the old dashboard. This applies to
            the following DataQualityMetric object:
            {new_metric.print_dqd_attributes()}""")

    return report_date