Example #1
def validate_merge_parameters(
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    how_merge: str,
    left_on: str,
    right_on: str,
) -> Optional[str]:
    """Verify that the merge parameters are correct

    :return: Error message, or None if everything is correct
    """
    # Check that the parameters are correct
    if not how_merge or how_merge not in ['left', 'right', 'outer', 'inner']:
        return gettext('Merge method must be one of '
                       'left, right, outer or inner')

    if left_on not in list(dst_df.columns):
        return gettext(
            'Column {0} not found in current data frame').format(left_on)

    if not pandas.is_unique_column(dst_df[left_on]):
        return gettext('Column {0} is not a unique key.').format(left_on)

    if right_on not in list(src_df.columns):
        return gettext(
            'Column {0} not found in new data frame').format(right_on)

    if not pandas.is_unique_column(src_df[right_on]):
        return gettext(
            'Column {0} is not a unique key.').format(right_on)

    return None
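
A hypothetical usage sketch of validate_merge_parameters; the data frames and the 'sid' key column below are made up for illustration only:

import pandas as pd

# Hypothetical usage sketch: both frames and the 'sid' key are made up.
dst = pd.DataFrame({'sid': [1, 2, 3], 'grade': [7.0, 8.0, 9.0]})
src = pd.DataFrame({'sid': [2, 3, 4], 'email': ['a@x.y', 'b@x.y', 'c@x.y']})

msg = validate_merge_parameters(dst, src, 'inner', 'sid', 'sid')
assert msg is None  # all parameters check out, so no error message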
Example #2
    def put(
        self,
        request: HttpRequest,
        wid: int,
        format=None,
        workflow: Optional[Workflow] = None,
    ) -> HttpResponse:
        """Process the put request."""
        # Get the dst_df
        dst_df = load_table(workflow.get_data_frame_table_name())

        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        # Check that the parameters are correct
        how = serializer.validated_data['how']
        if how == '' or how not in ['left', 'right', 'outer', 'inner']:
            raise APIException(
                _('how must be one of left, right, outer or inner'))

        left_on = serializer.validated_data['left_on']
        if left_on not in list(dst_df.columns):
            raise APIException(
                _('column {0} not found in data frame').format(left_on))

        if not is_unique_column(dst_df[left_on]):
            raise APIException(
                _('column {0} does not contain a unique key.').format(left_on))

        # Operation has been accepted by the serializer
        src_df = serializer.validated_data['src_df']

        right_on = serializer.validated_data['right_on']
        if right_on not in list(src_df.columns):
            raise APIException(
                _('column {0} not found in data frame').format(right_on))

        if not is_unique_column(src_df[right_on]):
            raise APIException(
                _('column {0} does not contain a unique key.').format(
                    right_on))

        merge_info = {
            'how_merge': how,
            'dst_selected_key': left_on,
            'src_selected_key': right_on,
            'initial_column_names': list(src_df.columns),
            'rename_column_names': list(src_df.columns),
            'columns_to_upload': [True] * len(list(src_df.columns)),
        }

        # Ready to perform the MERGE
        try:
            perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                           merge_info)
        except Exception as exc:
            raise APIException(
                _('Unable to perform merge operation: {0}').format(str(exc)))

        # Merge went through.
        return Response(serializer.data, status=status.HTTP_201_CREATED)
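
The helper is_unique_column is not shown in these examples. A minimal sketch of what it presumably does, inferred from how it is used here (a column qualifies as a key only when it has no duplicates and no missing values):

import pandas as pd

# A minimal sketch of the assumed is_unique_column helper: a column can
# act as a key only if it contains no duplicates and no missing values.
def is_unique_column(column: pd.Series) -> bool:
    return len(column.dropna().unique()) == len(column)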
Example #3
    def clean(self) -> Dict:
        """Verify that the rename preserve key, or verify the unique prop."""
        form_data = super().clean()

        # Check if there has been a change in the 'is_key' status
        if 'is_key' in self.changed_data:
            # Case 1: True -> False. If this is the only key column, the
            # change cannot be allowed
            column_unique = self.instance.workflow.get_column_unique()
            if (self.instance.is_key
                    and len([col for col in column_unique if col]) == 1):
                self.add_error(
                    'is_key',
                    _('There must be at least one column with unique values'),
                )
                return form_data

            # Case 2: False -> True. The values must be verified as unique
            if (not self.instance.is_key and not pandas.is_unique_column(
                    self.data_frame[self.instance.name])):
                self.add_error(
                    'is_key',
                    _('The column does not have unique values for each row.'),
                )
                return form_data

        # Check and force a correct column index
        ncols = self.workflow.columns.count()
        if form_data['position'] < 1 or ncols < form_data['position']:
            form_data['position'] = ncols

        return form_data
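
The method relies on Workflow.get_column_unique, which is not shown; judging from the call site, a minimal sketch could look like this (one boolean per workflow column, True for key columns):

# A minimal sketch of the assumed Workflow.get_column_unique method:
# one boolean per workflow column, True when the column is a key.
def get_column_unique(self):
    return [col.is_key for col in self.columns.all()]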
Example #4
def check_wf_df(workflow: models.Workflow) -> bool:
    """Check consistency between Workflow info and the data frame.

    Check the consistency between the information stored in the workflow
    and the structure of the underlying dataframe

    :param workflow: Workflow object
    :return: Boolean stating the result of the check. True: Correct.
    """
    # Get the df
    df = pandas.load_table(workflow.get_data_frame_table_name())

    # Set values in case there is no df
    if df is not None:
        dfnrows = df.shape[0]
        dfncols = df.shape[1]
        df_col_names = list(df.columns)
    else:
        dfnrows = 0
        dfncols = 0
        df_col_names = []

    # Check 1: Number of rows and columns
    assert workflow.nrows == dfnrows, 'Inconsistent number of rows'
    assert workflow.ncols == dfncols, 'Inconsistent number of columns'

    # Identical sets of columns
    wf_cols = workflow.columns.all()
    assert set(df_col_names) == {col.name for col in wf_cols}, \
        'Inconsistent set of columns'

    # Identical data types
    for col in wf_cols:
        df_dt = pandas.datatype_names.get(df[col.name].dtype.name)
        if col.data_type == 'boolean' and df_dt == 'string':
            # This is the case of a column with Boolean and Nulls
            continue

        assert col.data_type == df_dt, \
            'Inconsistent data type {0}'.format(col.name)

    # Verify that the columns marked as unique are preserved
    for col in workflow.columns.filter(is_key=True):
        assert pandas.is_unique_column(df[col.name]), \
            'Column {0} should be unique.'.format(col.name)

    # Columns are properly numbered
    cpos = workflow.columns.values_list('position', flat=True)
    rng = range(1, len(cpos) + 1)
    assert sorted(cpos) == list(rng), 'Inconsistent column positions'

    return True
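
The check relies on pandas.datatype_names, a mapping from pandas dtype names to the workflow's internal type names. The exact entries below are an assumption inferred from the type names mentioned in these examples:

# Assumed shape of the pandas.datatype_names mapping used above; the
# concrete entries are inferred from the type names in these examples.
datatype_names = {
    'object': 'string',
    'int64': 'integer',
    'float64': 'double',
    'bool': 'boolean',
    'datetime64[ns]': 'datetime',
}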
Example #5
def _verify_dataframe_columns(
    workflow,
    data_frame: pd.DataFrame,
):
    """Verify that the df columns are compatible with those in the wflow.

    This function is crucial to make sure the information stored in the
    workflow and the one in the dataframe is consistent. It is assumed that
    the data frame given as parameter contains a superset of the columns
    already present in the workflow. The function traverses those columns in
    the data frame that are already included in the workflow and checks the
    following conditions:

    1) The value of is_key is preserved. If it cannot be, the offending column
    should have reached this stage with is_key already set to False.

    2) The data type stored in the column.data_type field is consistent with
    the one observed in the data frame.

       2.1) A column of type bool must be of type string in the DF but with
       values None, True, False.

       2.2) A column of type integer or double in the WF must be either integer
       or double in the data frame. If it is double, it will be updated at a
       later stage.

       2.3) If a column is not of type string or integer and its type changes,
       it is flagged as an error.

    3) If the WF column has categories, the values in the DF should be
    compatible.
    """
    df_column_names = list(data_frame.columns)
    wf_column_names = [col.name for col in workflow.columns.all()]

    if settings.DEBUG:
        # There should not be any columns in the workflow that are not in the
        # DF
        assert not (set(wf_column_names) - set(df_column_names))

    # Loop over the columns in the Workflow to refresh the is_key value. There
    # may be values that have been added to the column, so this field needs to
    # be reassessed
    for col in workflow.columns.all():
        # Condition 1: If the column is marked as a key column, it should
        # maintain this property
        if col.is_key and not pandas.is_unique_column(data_frame[col.name]):
            raise Exception(
                gettext(
                    'Column {0} loses its "key" property through this merge. '
                    'Either remove this property from the column or remove '
                    'the rows that cause this problem in the new '
                    'dataset').format(col.name))

        # Get the pandas data type
        df_col_type = pandas.datatype_names.get(
            data_frame[col.name].dtype.name)

        # Condition 2: Review potential data type changes
        if col.data_type == 'boolean' and df_col_type == 'string':
            # 2.1: A WF boolean must be a DF string with True/False/None
            column_data_types = {
                type(row_value)
                for row_value in data_frame[col.name]
                # Remove the NoneType and Float
                if not isinstance(row_value, float) and row_value is not None
            }
            if len(column_data_types) != 1 or column_data_types.pop() != bool:
                raise Exception(
                    gettext('New values in column {0} are not of type {1}',
                            ).format(col.name, col.data_type))
        elif (col.data_type == 'integer' and df_col_type != 'integer'
              and df_col_type != 'double'):
            # 2.2 WF Numeric column must be DF integer or double
            raise Exception(
                gettext('New values in column {0} are not of type number',
                        ).format(col.name))
        elif col.data_type != 'integer' and df_col_type != col.data_type:
            # 2.3 Any other type change is incorrect
            raise Exception(
                gettext(
                    'New values in column {0} are not of type {1}', ).format(
                        col.name, col.data_type))

        # Condition 3: If there are categories, the new values should be
        # compatible with them.
        if col.categories and not all(row_val in col.get_categories()
                                      for row_val in data_frame[col.name]
                                      if row_val and not pd.isnull(row_val)):
            raise Exception(
                gettext('New values in column {0} are not in categories {1}',
                        ).format(col.name, ', '.join(col.categories)))
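
Condition 2.1 exists because pandas stores a boolean column that contains nulls with dtype object. A standalone snippet illustrating the situation:

import pandas as pd

# A boolean column with nulls is stored with dtype 'object', so the
# workflow type 'boolean' legitimately shows up as 'string' in the DF.
col = pd.Series([True, False, None])
print(col.dtype.name)  # 'object', not 'bool'

# The same type-set test used in condition 2.1 above:
value_types = {
    type(row_value) for row_value in col
    if not isinstance(row_value, float) and row_value is not None}
print(value_types)  # {<class 'bool'>}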
Example #6
def perform_dataframe_upload_merge(
    workflow,
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    merge_info: Dict,
):
    """Merge the existing data frame (dst) with a new one (src).

    It combines the two data frames dst_df and src_df and stores its content.

    The combination of dst_df and src_df assumes:

    - dst_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in src_df (dst_df[NO_OVERLAP_DST])

    - dst_df and src_df have a set of columns (potentially empty) that overlap
      in name (dst_df[OVERLAP] and src_df[OVERLAP] respectively)

    - src_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in dst_df (src_df[NO_OVERLAP_SRC])

    The function combines dst_df and src_df following two main steps (in both
    steps, the number of rows processed are derived from the parameter
    merge_info['how_merge']).

    STEP A: A new data frame dst_df_tmp1 is created using the pandas "merge"
    operation between dst_df and src_df[NO_OVERLAP_SRC]. This increases the
    number of columns in dst_df_tmp1 with respect to dst_df by adding the new
    columns from src_df.

    The pseudocode for this step is:

    dst_df_tmp1 = pd.merge(dst_df,
                           src_df[NO_OVERLAP_SRC],
                           how=merge_info['how_merge'],
                           left_on=merge_info['dst_selected_key'],
                           right_on=merge_info['src_selected_key'])

    STEP B: The data frame dst_df_tmp1 is then updated with the values in
    src_df[OVERLAP].

    :param workflow: Workflow with the data frame
    :param dst_df: Destination dataframe (already stored in DB)
    :param src_df: Source dataframe, stored in temporary table
    :param merge_info: Dictionary with merge options
           - initial_column_names: List of initial column names in src data
             frame.
           - rename_column_names: Columns that need to be renamed in src data
             frame.
           - columns_to_upload: Columns to be considered for the update
           - src_selected_key: Key in the source data frame
           - dst_selected_key: Key in the destination (existing) data frame
           - how_merge: How to merge: inner, outer, left or right
    :return: None or Exception with anomaly in the message
    """
    # STEP 1 Rename the column names.
    src_df = src_df.rename(
        columns=dict(list(zip(
            merge_info['initial_column_names'],
            merge_info['rename_column_names']))))

    # STEP 2 Drop the columns not selected
    columns_to_upload = merge_info['columns_to_upload']
    src_df.drop(
        [
            col for idx, col in enumerate(list(src_df.columns))
            if not columns_to_upload[idx]
        ],
        axis=1,
        inplace=True)

    # If no keep_key_column values are given, derive them from the current
    # uniqueness of each (renamed) source column
    if 'keep_key_column' not in merge_info:
        merge_info['keep_key_column'] = [
            pandas.is_unique_column(src_df[cname])
            for cname in merge_info['rename_column_names']]

    # Get the keys
    src_key = merge_info['src_selected_key']
    dst_key = merge_info['dst_selected_key']

    # STEP 3 Perform the combination
    # Separate the columns in src that overlap from those that do not
    # overlap, but include the key column in both data frames.
    overlap_names = set(dst_df.columns).intersection(src_df.columns)
    src_no_overlap_names = set(src_df.columns).difference(overlap_names)
    src_df_overlap = src_df[list(overlap_names.union({src_key}))]
    src_df_no_overlap = src_df[list(src_no_overlap_names.union({src_key}))]

    # Step A. Perform the merge of non-overlapping columns
    new_df = _perform_non_overlapping_column_merge(
        dst_df,
        src_df_no_overlap,
        merge_info,
        dst_key,
        src_key)

    # Step B. Perform the update with the overlapping columns
    new_df = _perform_overlap_update(
        new_df,
        src_df_overlap,
        dst_key,
        src_key,
        merge_info['how_merge'])

    # If the merge produced a data frame with no rows, flag it as an error to
    # prevent losing data when there is a mistake in the key column
    if new_df.shape[0] == 0:
        raise Exception(gettext(
            'Merge operation produced a result with no rows'))

    # If the merge produced a data frame with no unique columns, flag it as an
    # error to prevent the data frame from propagating without a key column
    if not pandas.has_unique_column(new_df):
        raise Exception(gettext(
            'Merge operation produced a result without any key columns. '
            + 'Review the key columns in the data to upload.',
        ))

    # Store the result back in the DB
    pandas.store_dataframe(new_df, workflow)

    _update_is_key_field(merge_info, workflow)

    # Recompute all the values of the conditions in each of the actions
    for action in workflow.actions.all():
        action.update_n_rows_selected()
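
The two helpers _perform_non_overlapping_column_merge and _perform_overlap_update are not shown. A minimal sketch of STEP A and STEP B in plain pandas, assuming for simplicity that both frames share the key column name 'key', and ignoring the row additions an outer merge would require in STEP B:

import pandas as pd

# A minimal sketch of STEP A and STEP B under the assumptions above;
# the real helpers also handle differing key names and row additions.
def merge_sketch(dst_df, src_no_overlap, src_overlap, how):
    # STEP A: add the columns that exist only in the source frame.
    new_df = pd.merge(dst_df, src_no_overlap, how=how, on='key')

    # STEP B: overwrite overlapping columns with the source values,
    # keeping the destination value where the source has no entry.
    new_df = new_df.set_index('key')
    new_df.update(src_overlap.set_index('key'))
    return new_df.reset_index()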