Example #1
    def put(
        self,
        request: HttpRequest,
        wid: int,
        format=None,
        workflow: Optional[Workflow] = None,
    ) -> HttpResponse:
        """Process the put request."""
        # Get the dst_df
        dst_df = load_table(workflow.get_data_frame_table_name())

        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        # Check that the parameters are correct
        how = serializer.validated_data['how']
        if how not in ['left', 'right', 'outer', 'inner']:
            raise APIException(
                _('how must be one of left, right, outer or inner'))

        left_on = serializer.validated_data['left_on']
        if not is_unique_column(dst_df[left_on]):
            raise APIException(
                _('column {0} does not contain a unique key.').format(left_on))

        # Operation has been accepted by the serializer
        src_df = serializer.validated_data['src_df']

        right_on = serializer.validated_data['right_on']
        if right_on not in list(src_df.columns):
            raise APIException(
                _('column {0} not found in data frame').format(right_on))

        if not is_unique_column(src_df[right_on]):
            raise APIException(
                _('column {0} does not contain a unique key.').format(
                    right_on))

        merge_info = {
            'how_merge': how,
            'dst_selected_key': left_on,
            'src_selected_key': right_on,
            'initial_column_names': list(src_df.columns),
            'rename_column_names': list(src_df.columns),
            'columns_to_upload': [True] * len(list(src_df.columns)),
        }

        # Ready to perform the MERGE
        try:
            perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                           merge_info)
        except Exception as exc:
            raise APIException(
                _('Unable to perform merge operation: {0}').format(str(exc)))

        # Merge went through.
        return Response(serializer.data, status=status.HTTP_201_CREATED)
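The merge_info dictionary built above is the contract that recurs in every
example on this page. As a rough mental model only (not the actual
implementation of perform_dataframe_upload_merge, which also handles renaming,
column selection and storage), the operation reduces to a keyed pandas merge;
a minimal sketch, assuming plain pandas:

import pandas as pd

def naive_upload_merge(dst_df, src_df, merge_info):
    """Approximate the merge semantics used above (illustrative sketch)."""
    # Join the existing table (dst) with the uploaded one (src) on the
    # selected key columns, using the requested merge method.
    return pd.merge(
        dst_df,
        src_df,
        how=merge_info['how_merge'],
        left_on=merge_info['dst_selected_key'],
        right_on=merge_info['src_selected_key'])

dst = pd.DataFrame({'sid': [1, 2, 3], 'grade': [70, 80, 90]})
src = pd.DataFrame({'sid': [2, 3, 4], 'email': ['a@x', 'b@x', 'c@x']})
print(naive_upload_merge(
    dst, src,
    {'how_merge': 'inner', 'dst_selected_key': 'sid',
     'src_selected_key': 'sid'}))
# Only sid 2 and 3 survive: 'inner' keeps keys present in both frames.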
Example #2
def upload_step_four(
    request: http.HttpRequest,
    workflow: models.Workflow,
    upload_data: Dict,
) -> http.HttpResponse:
    """Perform the merge operation.

    :param request: Received request
    :param workflow: Workflow being processed
    :param upload_data: Dictionary with all the information about the merge.
    :return: HttpResponse
    """
    # Get the dataframes to merge
    try:
        dst_df = pandas.load_table(workflow.get_data_frame_table_name())
        src_df = pandas.load_table(workflow.get_upload_table_name())
    except Exception:
        return render(request, 'error.html',
                      {'message': _('Exception while loading data frame')})

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_upload_table_name())
        col_info = workflow.get_column_info()
        workflow.log(request.user,
                     models.Log.WORKFLOW_DATA_FAILEDMERGE,
                     column_names=col_info[0],
                     column_types=col_info[1],
                     column_unique=col_info[2],
                     error_message=str(exc))
        messages.error(request, _('Merge operation failed. ') + str(exc))
        return redirect(reverse('table:display'))

    col_info = workflow.get_column_info()
    workflow.log(request.user,
                 upload_data['log_upload'],
                 column_names=col_info[0],
                 column_types=col_info[1],
                 column_unique=col_info[2])
    store_workflow_in_session(request.session, workflow)
    request.session.pop('upload_data', None)

    return redirect(reverse('table:display'))
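The failure branch above (drop the temporary upload table, write a
WORKFLOW_DATA_FAILEDMERGE log record, show the error) is the same cleanup
pattern that reappears in Example #6. A hypothetical helper capturing just
the cleanup part could look like this (merge_or_cleanup is illustrative, not
part of OnTask):

def merge_or_cleanup(workflow, dst_df, src_df, upload_data):
    """Run the merge; on failure drop the temp table and re-raise (sketch)."""
    try:
        pandas.perform_dataframe_upload_merge(
            workflow, dst_df, src_df, upload_data)
    except Exception:
        # The upload table is only a staging area; remove it so a failed
        # merge leaves no orphaned data behind.
        sql.delete_table(workflow.get_upload_table_name())
        raise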
Example #3
File: api.py Project: ubc/ontask_b
    def put(
        self,
        request: http.HttpRequest,
        wid: int,
        format=None,
        workflow: Optional[models.Workflow] = None,
    ) -> http.HttpResponse:
        """Process the put request."""
        del wid, format
        # Get the dst_df
        dst_df = pandas.load_table(workflow.get_data_frame_table_name())

        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        src_df = serializer.validated_data['src_df']
        how = serializer.validated_data['how']
        left_on = serializer.validated_data['left_on']
        right_on = serializer.validated_data['right_on']
        error = pandas.validate_merge_parameters(dst_df, src_df, how, left_on,
                                                 right_on)

        if error:
            raise APIException(error)

        # Ready to perform the MERGE
        try:
            pandas.perform_dataframe_upload_merge(
                workflow, dst_df, src_df, {
                    'how_merge': how,
                    'dst_selected_key': left_on,
                    'src_selected_key': right_on,
                    'initial_column_names': list(src_df.columns),
                    'rename_column_names': list(src_df.columns),
                    'columns_to_upload': [True] * len(list(src_df.columns))
                })
        except Exception as exc:
            raise APIException(
                _('Unable to perform merge operation: {0}').format(str(exc)))

        # Merge went through.
        return Response(serializer.data, status=status.HTTP_201_CREATED)
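Compared with Example #1, the hand-written checks are folded into
pandas.validate_merge_parameters, which returns an error string or a falsy
value. Based on the checks spelled out in Example #1, its behaviour is
roughly the following (a reconstruction for illustration, not the library
source):

def validate_merge_parameters_sketch(dst_df, src_df, how, left_on, right_on):
    """Return an error message, or None when the parameters look valid."""
    if how not in ['left', 'right', 'outer', 'inner']:
        return 'how must be one of left, right, outer or inner'
    if not is_unique_column(dst_df[left_on]):
        return 'column {0} does not contain a unique key.'.format(left_on)
    if right_on not in list(src_df.columns):
        return 'column {0} not found in data frame'.format(right_on)
    if not is_unique_column(src_df[right_on]):
        return 'column {0} does not contain a unique key.'.format(right_on)
    return None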
Example #4
    def test_merge_inner(self):

        # Get the workflow
        self.workflow = Workflow.objects.all()[0]

        # Parse the source data frame
        df_dst, df_src = self.parse_data_frames()

        self.merge_info['how_merge'] = 'inner'

        result = perform_dataframe_upload_merge(self.workflow, df_dst, df_src,
                                                self.merge_info)

        # Load again the workflow data frame
        df_dst = load_table(self.workflow.get_data_frame_table_name())

        # Result must be correct (None)
        self.assertIsNone(result)
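parse_data_frames is a fixture helper from the surrounding test class. A
self-contained version of the same inner-merge check, assuming two toy frames
and plain unittest plus pandas, would be:

import unittest

import pandas as pd

class InnerMergeSketchTest(unittest.TestCase):
    """Stand-alone illustration of what the 'inner' method does."""

    def test_inner_keeps_shared_keys_only(self):
        dst = pd.DataFrame({'key': [1, 2, 3], 'v1': ['a', 'b', 'c']})
        src = pd.DataFrame({'key': [2, 3, 4], 'v2': ['x', 'y', 'z']})
        result = pd.merge(dst, src, how='inner', on='key')
        # Only keys 2 and 3 appear in both frames.
        self.assertEqual(list(result['key']), [2, 3])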
Example #5
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    table_name: Optional[str] = None
    merge_key: Optional[str] = None
    merge_method: Optional[str] = None

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(aws_access_key_id=conn.aws_access_key,
                     aws_secret_access_key=run_params['aws_secret_access_key'],
                     aws_session_token=run_params['aws_session_token'],
                     s3_staging_dir=staging_dir,
                     region_name=conn.aws_region_name)

    # pd.read_sql (rather than read_sql_table) works with the DB-API
    # connection returned by pyathena, as in the docstring example
    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']), cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame, workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]
    }

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]
    ]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
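Putting the docstring and the code together, a run_params dictionary for this
loader would look roughly like the following (all values are placeholders):

run_params = {
    'aws_secret_access_key': '<secret>',  # completes the stored connection
    'aws_session_token': '<token>',       # optional STS session token
    'table_name': 'student_activity',     # Athena table to read
    'merge_key': 'sid',                   # key column used on both sides
    'merge_method': 'inner',              # left / right / outer / inner
}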
Example #6
def upload_s4(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> HttpResponse:
    """Step 4: Show the user the expected effect of the merge and perform it.

    ASSUMES:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    rename_column_names: Modified column names to remove ambiguity when
                          merging.

    columns_to_upload: Boolean list denoting the columns in SRC that are
                       marked for upload.

    dst_column_names: List of column names in destination frame

    dst_is_unique_column: Boolean list with dst columns that are unique

    dst_unique_col_names: List with the column names that are unique

    dst_selected_key: Key column name selected in DST

    src_selected_key: Key column name selected in SRC

    how_merge: How to merge. One of {left, right, outer, inner}

    :param request: Web request
    :param workflow: Workflow being processed
    :return: HttpResponse
    """
    # Get the dictionary containing the information about the upload
    upload_data = request.session.get('upload_data')
    if not upload_data:
        # If there is no object, someone is trying to jump directly here.
        return redirect('dataops:uploadmerge')

    # Check the type of request that is being processed
    if request.method == 'POST':
        # We are processing a POST request

        # Get the dataframes to merge
        try:
            dst_df = load_table(
                workflow.get_data_frame_table_name(),
            )
            src_df = load_table(
                workflow.get_data_frame_upload_table_name(),
            )
        except Exception:
            return render(
                request,
                'error.html',
                {'message': _('Exception while loading data frame')})

        try:
            perform_dataframe_upload_merge(
                workflow,
                dst_df,
                src_df,
                upload_data)
        except Exception as exc:
            # Nuke the temporary table
            table_queries.delete_table(
                workflow.get_data_frame_upload_table_name(),
            )
            col_info = workflow.get_column_info()
            workflow.log(
                request.user,
                Log.WORKFLOW_DATA_FAILEDMERGE,
                column_names=col_info[0],
                column_types=col_info[1],
                column_unique=col_info[2],
                error_message=str(exc))
            messages.error(request, _('Merge operation failed. ') + str(exc))
            return redirect(reverse('table:display'))

        col_info = workflow.get_column_info()
        workflow.log(
            request.user,
            Log.WORKFLOW_DATA_MERGE,
            column_names=col_info[0],
            column_types=col_info[1],
            column_unique=col_info[2])
        store_workflow_in_session(request, workflow)
        request.session.pop('upload_data', None)
        return redirect(reverse('table:display'))

    # We are processing a GET request
    dst_column_names = upload_data['dst_column_names']
    dst_selected_key = upload_data['dst_selected_key']
    src_selected_key = upload_data['src_selected_key']
    # List of final column names
    final_columns = sorted(set().union(
        dst_column_names,
        upload_data['rename_column_names'],
    ))
    # Dictionary with {new src column name: (old name, is_uploaded?)}
    src_info = {rname: (iname, upload) for (rname, iname, upload) in zip(
        upload_data['rename_column_names'],
        upload_data['initial_column_names'],
        upload_data['columns_to_upload'],
    )}

    # Create the strings to show in the table for each of the rows explaining
    # what is going to be the effect of the update operation over them.
    #
    # There are 8 cases depending on the column name being a key column,
    # in DST, SRC, if SRC is being renamed, and SRC is being loaded.
    #
    # Case 1: The column is the key column used for the merge (skip it)
    #
    # Case 2: in DST, NOT in SRC:
    #         Dst | |
    #
    # Case 3: in DST, in SRC, NOT LOADED
    #         Dst Name | <-- | Src new name (Ignored)
    #
    # Case 4: NOT in DST, in SRC, NOT LOADED
    #         | | Src new name (Ignored)
    #
    # Case 5: in DST, in SRC, Loaded, no rename:
    #         Dst Name (Update) | <-- | Src name
    #
    # Case 6: in DST, in SRC, loaded, rename:
    #         Dst Name (Update) | <-- | Src new name (Renamed)
    #
    # Case 7: NOT in DST, in SRC, loaded, no rename
    #         Dst Name (NEW) | <-- | src name
    #
    # Case 8: NOT in DST, in SRC, loaded, renamed
    #         Dst Name (NEW) | <-- | src name (renamed)
    #
    column_info = []
    for colname in final_columns:

        # Case 1: Skip the keys
        if colname == src_selected_key or colname == dst_selected_key:
            continue

        # Case 2: Column is in DST and left untouched (no counterpart in SRC)
        if colname not in src_info:
            column_info.append((colname, False, ''))
            continue

        # Get old name and if it is going to be loaded
        old_name, to_load = src_info[colname]

        # Column is not going to be loaded anyway
        if not to_load:
            if colname in dst_column_names:
                # Case 3
                column_info.append((colname, False, colname + _(' (Ignored)')))
            else:
                # Case 4
                column_info.append(('', False, colname + _(' (Ignored)')))
            continue

        # Initial name on the dst data frame
        dst_name = colname
        # Column not present in DST, so it is a new column
        if colname not in dst_column_names:
            dst_name += _(' (New)')
        else:
            dst_name += _(' (Update)')

        src_name = colname
        if colname != old_name:
            src_name += _(' (Renamed)')

        # Cases 5 - 8
        column_info.append((dst_name, True, src_name))

    # Store the value in the request object and update
    request.session['upload_data'] = upload_data

    return render(
        request,
        'dataops/upload_s4.html',
        {
            'prev_step': reverse('dataops:upload_s3'),
            'info': column_info,
            'valuerange': range(5),
            'next_name': 'Finish'})
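For reference, an upload_data dictionary that satisfies the ASSUMES list in
the docstring would have this shape (toy values, assuming a single shared key
column 'sid'; the exact type strings depend on what pandas detects):

upload_data = {
    'initial_column_names': ['sid', 'score'],
    'column_types': ['integer', 'double'],
    'src_is_key_column': [True, False],
    'step_1': '<first step URL name>',
    'rename_column_names': ['sid', 'score_new'],
    'columns_to_upload': [True, True],
    'dst_column_names': ['sid', 'score'],
    'dst_is_unique_column': [True, False],
    'dst_unique_col_names': ['sid'],
    'dst_selected_key': 'sid',
    'src_selected_key': 'sid',
    'how_merge': 'inner',
}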
Example #7
def _execute_plugin(
    workflow,
    plugin_info,
    input_column_names,
    output_column_names,
    output_suffix,
    merge_key,
    plugin_params,
):
    """
    Execute the run method in the plugin.

    Execute the run method in a plugin with the dataframe from the given
    workflow

    :param workflow: Workflow object being processed
    :param plugin_info: PluginRegistry object being processed
    :param input_column_names: List of input column names
    :param output_column_names: List of output column names
    :param output_suffix: Suffix that is added to the output column names
    :param merge_key: Key column to use in the merge
    :param plugin_params: Dictionary with the parameters to execute the plugin
    :return: Nothing, the result is stored in the log with log_id
    """
    try:
        plugin_instance, msgs = load_plugin(plugin_info.filename)
    except OnTaskServiceException:
        raise Exception(
            ugettext('Unable to instantiate plugin "{0}"').format(
                plugin_info.name), )

    # Check that the list of given inputs is consistent: if plugin has a list
    # of inputs, it has to have the same length as the given list.
    if (plugin_instance.get_input_column_names()
            and len(plugin_instance.get_input_column_names()) !=
            len(input_column_names)):
        raise Exception(
            ugettext(
                'Inconsistent number of inputs when invoking plugin "{0}"',
            ).format(plugin_info.name), )

    # Check that the list of given outputs has the same length as the list of
    # outputs proposed by the plugin
    if (plugin_instance.get_output_column_names()
            and len(plugin_instance.get_output_column_names()) !=
            len(output_column_names)):
        raise Exception(
            ugettext(
                'Inconsistent number of outputs when invoking plugin "{0}"',
            ).format(plugin_info.name), )

    # Get the data frame from the workflow
    try:
        df = pandas.load_table(workflow.get_data_frame_table_name())
    except Exception as exc:
        raise Exception(
            ugettext(
                'Exception when retrieving the data frame from workflow: {0}',
            ).format(str(exc)), )

    # Set the updated names of the input, output columns, and the suffix
    if not plugin_instance.get_input_column_names():
        plugin_instance.input_column_names = input_column_names
    plugin_instance.output_column_names = output_column_names
    plugin_instance.output_suffix = output_suffix

    # Create a new dataframe with the given input columns, and rename them if
    # needed
    try:
        sub_df = pd.DataFrame(df[input_column_names])
        if plugin_instance.get_input_column_names():
            sub_df.columns = plugin_instance.get_input_column_names()
    except Exception as exc:
        raise Exception(
            ugettext(
                'Error when creating data frame for plugin: {0}', ).format(
                    str(exc)))

    # Try the execution and catch any exception
    try:
        new_df = plugin_instance.run(sub_df, parameters=plugin_params)
    except Exception as exc:
        raise Exception(
            ugettext('Error while executing plugin: {0}').format(str(exc)), )

    # If plugin does not return a data frame, flag as error
    if not isinstance(new_df, pd.DataFrame):
        raise Exception(
            ugettext(
                'Plugin executed but did not return a pandas data frame.'), )

    # Execution is DONE. Now we have to perform various additional checks

    # Result has to have the exact same number of rows
    if new_df.shape[0] != df.shape[0]:
        raise Exception(
            ugettext('Incorrect number of rows ({0}) in result data frame.',
                     ).format(new_df.shape[0]), )

    # Merge key name cannot be part of the output df
    if merge_key in new_df.columns:
        raise Exception(
            ugettext(
                'Column name {0} cannot be in the result data frame.').format(
                    merge_key))

    # Result column names are consistent
    if set(new_df.columns) != set(plugin_instance.get_output_column_names()):
        raise Exception(ugettext('Incorrect columns in result data frame.'))

    # Add the merge column to the result df
    new_df[merge_key] = df[merge_key]

    # Proceed with the merge
    try:
        new_frame = pandas.perform_dataframe_upload_merge(
            workflow,
            df,
            new_df,
            {
                'how_merge': 'inner',
                'dst_selected_key': merge_key,
                'src_selected_key': merge_key,
                'initial_column_names': list(new_df.columns),
                'rename_column_names': list(new_df.columns),
                'columns_to_upload': [True] * len(list(new_df.columns)),
            },
        )
    except Exception as exc:
        raise Exception(
            ugettext('Error while merging result: {0}.').format(str(exc)), )

    if isinstance(new_frame, str):
        raise Exception(
            ugettext('Error while merging result: {0}.').format(new_frame))

    # Update execution time in the plugin
    plugin_info.executed = datetime.now(pytz.timezone(settings.TIME_ZONE))
    plugin_info.save(update_fields=['executed'])

    return True
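The checks above pin down the plugin contract: run() receives a data frame
with the requested input columns, must return a pandas data frame with the
same number of rows, whose columns are exactly the declared output columns,
and must not include the merge key. A minimal class honouring that contract
(the attribute and method names mirror the ones used above; the class itself
is illustrative) might be:

import pandas as pd

class MinimalPlugin:
    """Toy plugin: derives one output column from one input column."""

    input_column_names = ['score']
    output_column_names = ['score_scaled']
    output_suffix = ''

    def get_input_column_names(self):
        return self.input_column_names

    def get_output_column_names(self):
        return self.output_column_names

    def run(self, data_frame, parameters=None):
        # Same number of rows, only the declared output column, and no
        # merge-key column: exactly what _execute_plugin verifies.
        result = pd.DataFrame()
        result['score_scaled'] = data_frame['score'] * 10
        return result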
Example #8
    def execute_operation(
        self,
        user,
        workflow: Optional[models.Workflow] = None,
        action: Optional[models.Action] = None,
        payload: Optional[Dict] = None,
        log_item: Optional[models.Log] = None,
    ):
        """Perform a SQL upload asynchronously.

        :param user: User object
        :param workflow: Optional workflow object
        :param action: Empty
        :param payload: has fields:
          - connection_id: PK of the connection object to use
          - dst_key: Key column in the existing dataframe (if any) for merge
          - src_key: Key column in the external dataframe (for merge)
          - how_merge: Merge method: inner, outer, left, right
          - db_password: Encoded password if not stored in the connection
          - db_table: Table name if not stored in the connection
        :param log_item: Optional logitem object.
        :return: Elements to add to the "extend" field. Empty list in this
            case; the method uploads/merges the data into the given workflow.
        """
        del action

        # Get the connection
        conn = models.SQLConnection.objects.filter(
            pk=payload['connection_id']).first()
        if not conn:
            msg = _('Incorrect connection identifier.')
            log_item.payload['error'] = msg
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        # Get the dataframe from the connection
        src_df = _load_df_from_sqlconnection(conn, payload)

        # Create session
        session = SESSION_STORE()
        session.create()

        workflow = acquire_workflow_access(user, session, workflow.id)
        if not workflow:
            msg = _('Unable to acquire access to workflow.')
            log_item.payload['error'] = msg
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        # IDEA: How to deal with failure to acquire access?
        #       Include a setting with the time to wait and number of retries?

        if not workflow.has_data_frame():
            # Simple upload
            pandas.store_dataframe(src_df, workflow)
            return []

        # At this point the operation is a merge

        dst_df = pandas.load_table(workflow.get_data_frame_table_name())

        dst_key = payload.get('dst_key')
        if not dst_key:
            msg = _('Missing key column name of existing table.')
            log_item.payload['error'] = msg
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        src_key = payload.get('src_key')
        if not src_key:
            msg = _('Missing key column name of new table.')
            log_item.payload['error'] = msg
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        how_merge = payload.get('how_merge')
        if not how_merge:
            msg = _('Missing merge method.')
            log_item.payload['error'] = msg
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        # Check additional correctness properties in the parameters
        error = pandas.validate_merge_parameters(dst_df, src_df, how_merge,
                                                 dst_key, src_key)
        if error:
            log_item.payload['error'] = error
            log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(error)

        merge_info = {
            'how_merge': how_merge,
            'dst_selected_key': dst_key,
            'src_selected_key': src_key,
            'initial_column_names': list(src_df.columns),
            'rename_column_names': list(src_df.columns),
            'columns_to_upload': [True] * len(list(src_df.columns))
        }
        # Perform the merge
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              merge_info)

        log_item.payload = merge_info
        log_item.save(update_fields=['payload'])

        # Nothing to add to the "extend" field (see docstring)
        return []
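A payload dictionary driving this operation, following the fields listed in
the docstring, would look roughly like this (values are placeholders):

payload = {
    'connection_id': 3,          # PK of the models.SQLConnection to use
    'dst_key': 'sid',            # key column in the existing data frame
    'src_key': 'student_id',     # key column in the external data frame
    'how_merge': 'left',         # inner / outer / left / right
    'db_password': '<encoded>',  # only if not stored in the connection
    'db_table': 'grades',        # only if not stored in the connection
}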