Code example #1
0
def get_column_statistics(df_column):
    """Calculate a set of statistics for a DataFrame column.

    Given a data frame with a single column, return a set of statistics
    depending on its type.

    :param df_column: data frame with a single column

    :return: A dictionary with keys depending on the type of column
      {'min': minimum value (integer, double and datetime),
       'q1': Q1 value (0.25) (integer, double),
       'mean': mean value (integer, double),
       'median': median value (integer, double),
       'q3': Q3 value (0.75) (integer, double),
       'max': maximum value (integer, double and datetime),
       'std': standard deviation (integer, double),
       'counts': value counts (integer, double, string, datetime, Boolean),
       'mode': most frequent value (integer, double, string, datetime,
       Boolean)}
      or None if the column has all its values set to NaN
    """
    if len(df_column.loc[df_column.notnull()]) == 0:
        # The column has no data
        return None

    # Dictionary to return
    to_return = {
        'min': 0,
        'q1': 0,
        'mean': 0,
        'median': 0,
        'q3': 0,
        'max': 0,
        'std': 0,
        'mode': None,
        'counts': {},
    }

    data_type = pandas_datatype_names.get(df_column.dtype.name)

    if data_type == 'integer' or data_type == 'double':
        # Numeric column: five-number summary plus mean and std, rendered
        # with the %g format to drop insignificant trailing zeros.
        quantiles = df_column.quantile([0, .25, .5, .75, 1])
        to_return['min'] = '{0:g}'.format(quantiles[0])
        to_return['q1'] = '{0:g}'.format(quantiles[.25])
        to_return['mean'] = '{0:g}'.format(df_column.mean())
        to_return['median'] = '{0:g}'.format(quantiles[.5])
        to_return['q3'] = '{0:g}'.format(quantiles[.75])
        to_return['max'] = '{0:g}'.format(quantiles[1])
        to_return['std'] = '{0:g}'.format(df_column.std())

    to_return['counts'] = df_column.value_counts().to_dict()

    mode = df_column.mode()
    if len(mode) == 0:
        # No mode available: store the '--' placeholder. The previous code
        # assigned the string '--' to `mode` and then stored `mode[0]`,
        # which yielded the single character '-' instead.
        to_return['mode'] = '--'
    else:
        to_return['mode'] = mode[0]

    return to_return
Code example #2
0
def check_wf_df(workflow):
    """Check consistency between Workflow info and the data frame.

    Check the consistency between the information stored in the workflow
    and the structure of the underlying dataframe

    :param workflow: Workflow object

    :return: Boolean stating the result of the check. True: Correct.
    """
    # Fetch the data frame stored for this workflow
    data_frame = load_table(workflow.get_data_frame_table_name())

    # Shape and column names default to empty when no df is stored
    n_rows, n_cols, frame_col_names = 0, 0, []
    if data_frame is not None:
        n_rows, n_cols = data_frame.shape
        frame_col_names = list(data_frame.columns)

    # Check 1: Number of rows and columns
    assert workflow.nrows == n_rows, 'Inconsistent number of rows'
    assert workflow.ncols == n_cols, 'Inconsistent number of columns'

    # Identical sets of columns
    wf_cols = workflow.columns.all()
    wf_col_names = {column.name for column in wf_cols}
    assert wf_col_names == set(frame_col_names), 'Inconsistent set of columns'

    # Identical data types
    for column in wf_cols:
        frame_dtype = pandas_datatype_names.get(
            data_frame[column.name].dtype.name)
        if column.data_type == 'boolean' and frame_dtype == 'string':
            # This is the case of a column with Boolean and Nulls
            continue

        assert column.data_type == frame_dtype, (
            'Inconsistent data type {0}'.format(column.name))

    # Verify that the columns marked as unique are preserved
    for column in workflow.columns.filter(is_key=True):
        assert is_unique_column(data_frame[column.name]), (
            'Column {0} should be unique.'.format(column.name))

    return True
Code example #3
0
def _verify_dataframe_columns(
    workflow,
    data_frame: pd.DataFrame,
):
    """Verify that the df columns are compatible with those in the wflow.

    This function is crucial to make sure the information stored in the
    workflow and the one in the dataframe is consistent. It is assumed that
    the data frame given as parameter contains a superset of the columns
    already present in the workflow. The function traverses those columns in
    the data frame that are already included in the workflow and checks the
    following conditions:

    1) The value of is_key is preserved. If not, the offending column should
    have reached this stage with is_key equal to False

    2) The data types stored in the column.data_type field is consistent with
    that observed in the data frame.

       2.1) A column of type bool must be of type string in the DF but with
       values None, True, False.

       2.2) A column of type integer or double in the WF must be either integer
       or double in the Dataframe. If it is double, it will be updated at a
       later stage.

       2.3) If a column is not of type string or integer, and has a type change
       it is flagged as an error.

    3) If the WF column has categories, the values in the DF should be
    compatible.

    :raises Exception: with a translated message when any condition fails.
    """
    df_column_names = list(data_frame.columns)
    wf_column_names = [col.name for col in workflow.columns.all()]

    if settings.DEBUG:
        # There should not be any columns in the workflow that are not in the
        # DF
        assert not (set(wf_column_names) - set(df_column_names))

    # Loop over the columns in the Workflow to refresh the is_key value. There
    # may be values that have been added to the column, so this field needs to
    # be reassessed
    for col in workflow.columns.all():
        # Condition 1: If the column is marked as a key column, it should
        # maintain this property
        if col.is_key and not is_unique_column(data_frame[col.name]):
            raise Exception(gettext(
                'Column {0} looses its "key" property through this merge.'
                + ' Either remove this property from the column or '
                + 'remove the rows that cause this problem in the new '
                + 'dataset').format(col.name))

        # Get the pandas data type
        df_col_type = pandas_datatype_names.get(
            data_frame[col.name].dtype.name)

        # Condition 2: Review potential data type changes
        if col.data_type == 'boolean' and df_col_type == 'string':
            # 2.1: A WF boolean with must be DF string with True/False/None
            column_data_types = {
                type(row_value)
                for row_value in data_frame[col.name]
                # Remove the NoneType and Float
                if not isinstance(row_value, float) and row_value is not None
            }
            if len(column_data_types) != 1 or column_data_types.pop() != bool:
                raise Exception(gettext(
                    'New values in column {0} are not of type {1}',
                ).format(col.name, col.data_type))
        elif (
            col.data_type in ('integer', 'double')
            and df_col_type != 'integer'
            and df_col_type != 'double'
        ):
            # 2.2 WF numeric column must be DF integer or double. Note: the
            # previous code only exempted 'integer' WF columns here, so a
            # 'double' WF column backed by an integer DF column fell through
            # to condition 2.3 and was rejected, contradicting docstring
            # case 2.2 which covers both numeric WF types.
            raise Exception(gettext(
                'New values in column {0} are not of type number',
            ).format(col.name))
        elif (
            col.data_type not in ('integer', 'double')
            and df_col_type != col.data_type
        ):
            # 2.3 Any other type change is incorrect
            raise Exception(gettext(
                'New values in column {0} are not of type {1}',
            ).format(col.name, col.data_type))

        # Condition 3: If there are categories, the new values should be
        # compatible with them.
        if col.categories and not all(
            row_val in col.categories for row_val in data_frame[col.name]
            if row_val and not pd.isnull(row_val)
        ):
            raise Exception(gettext(
                'New values in column {0} are not in categories {1}',
            ).format(col.name, ', '.join(col.categories)))