def create_dqm_objects_for_sheet(dataframe, hpo_names, user_choice,
                                 metric_is_percent, target_low, date):
    """
    Build one DataQualityMetric object for every (HPO, column) pair
    that is collected from a single sheet on a single date.

    Parameters
    ----------
    dataframe (df): contains the information for a particular dimension
        of data quality on a particular date

    hpo_names (list): strings identifying the HPOs; each one is looked
        up in the dataframe via find_hpo_row

    user_choice (string): name of the sheet from the analysis reports.
        used as the key for the metric type, data quality dimension,
        and columns-to-document lookups

    metric_is_percent (bool): passed to get_info as 'percentage'.
        indicates whether the data is a 'percentage complete' or a
        count of instances of a particular error

    target_low (bool): passed to get_info. indicates whether the
        number should be considered desirable or undesirable

    date (datetime): the time the data quality metric was documented;
        stored on every DataQualityMetric that is created

    Returns
    -------
    dqm_objects (list): the DataQualityMetric objects. all of them
        share the same metric_type, data_quality_dimension, and date

    columns (list): the column names whose data was extracted for
        this sheet
    """
    # attributes shared by every object generated from this sheet
    shared_metric_type = metric_type_to_english_dict[user_choice]
    shared_dimension = data_quality_dimension_dict[user_choice]
    columns = columns_to_document_for_sheet[user_choice]

    dqm_objects = []

    for hpo_name in hpo_names:
        # locate the HPO's row, then pull the requested columns from it
        hpo_row = find_hpo_row(sheet=dataframe, hpo=hpo_name)
        collected = get_info(
            sheet=dataframe, row_num=hpo_row,
            percentage=metric_is_percent, sheet_name=user_choice,
            columns_to_collect=columns, target_low=target_low)

        # one object per collected column; the column name is
        # translated to its table / class name before being stored
        dqm_objects.extend(
            DataQualityMetric(
                hpo=hpo_name,
                table_or_class=table_based_on_column_provided[column],
                metric_type=shared_metric_type,
                value=column_value,
                data_quality_dimension=shared_dimension,
                date=date)
            for column, column_value in collected.items())

    return dqm_objects, columns
# Example #2
def add_number_total_rows_for_hpo_and_date(hpos, date_names, date):
    """
    Function is used to add further attributes to the HPO
    objects. These are the attributes pertaining to the number
    of rows in each of the tables. These row counts should be
    stored in the 'concept' sheet.

    Parameters
    ----------
    hpos (list): list of the HPO objects. these should
        already have the name and date established at
        the minimum.

    date_names (list): list of the strings that indicate
        the names of the files being ingested. these
        should be in sequential order. each name must be of the
        form '<Month>_<day>_<year>' followed by a 5-character
        suffix (presumably '.xlsx' - confirm against load_files).

    date (datetime): datetime object that is used to ensure
        that data quality metrics are being associated
        with the HPO object that is associated with their
        respective date

    Returns
    -------
    hpos (list): list of the HPO objects. now should have the
        attributes for the number of rows filled in (only the
        objects whose date matches the provided date are changed).

    Raises
    ------
    AssertionError: if none of the names in date_names corresponds
        to the provided date.
    """
    sheet_name = 'concept'  # where row count is stored
    dfs = load_files(user_choice=sheet_name, file_names=date_names)

    # drop the trailing 5 characters (the extension) before parsing
    dates_objs = [
        datetime.datetime.strptime(file_name[:-5], '%B_%d_%Y')
        for file_name in date_names]

    matching_idxs = [
        idx for idx, date_object in enumerate(dates_objs)
        if date_object == date]

    if not matching_idxs:
        # raised explicitly rather than via an 'assert' statement:
        # asserts are removed under 'python -O', which would let an
        # index of -1 silently select the last file's data
        raise AssertionError("Invalid Date: {date}".format(date=date))

    # if several file names map to the same date, the last one is used
    # NOTE(review): assumes dfs is ordered the same as date_names
    df_for_date = dfs[matching_idxs[-1]]

    for hpo in hpos:
        if hpo.date != date:
            continue  # row counts only apply to HPO objects for this date

        hpo_row = find_hpo_row(sheet=df_for_date, hpo=hpo.name)

        num_rows_dictionary = get_info(
            sheet=df_for_date,
            row_num=hpo_row,
            percentage=False,
            sheet_name=sheet_name,
            columns_to_collect=row_count_col_names,
            target_low=False)

        for table_name, value in num_rows_dictionary.items():
            hpo.add_row_count_with_string(table=table_name, value=value)

    return hpos