Example #1
    def put(self, request, pk, format=None):
        # Try to retrieve the workflow to check for permissions
        workflow = self.get_object(pk)
        # Get the dst_df
        dst_df = pandas_db.load_from_db(pk)

        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        # Check that the parameters are correct
        how = serializer.validated_data['how']
        if how not in ['left', 'right', 'outer', 'inner']:
            raise APIException(_('how must be one of left, right, outer '
                                 'or inner'))

        left_on = serializer.validated_data['left_on']
        if left_on not in list(dst_df.columns):
            raise APIException(_('column {0} not found in data frame').format(
                left_on))
        if not dataops.pandas_db.is_unique_column(dst_df[left_on]):
            raise APIException(_('column {0} does not contain a unique '
                                 'key.').format(left_on))

        # Operation has been accepted by the serializer
        src_df = serializer.validated_data['src_df']

        right_on = serializer.validated_data['right_on']
        if right_on not in list(src_df.columns):
            raise APIException(_('column {0} not found in data frame').format(
                right_on)
            )

        if not dataops.pandas_db.is_unique_column(src_df[right_on]):
            raise APIException(
                _('column {0} does not contain a unique key.').format(right_on)
            )

        merge_info = {
            'how_merge': how,
            'dst_selected_key': left_on,
            'src_selected_key': right_on,
            'initial_column_names': list(src_df.columns),
            'rename_column_names': list(src_df.columns),
            'columns_to_upload': [True] * len(list(src_df.columns)),
        }

        # Ready to perform the MERGE
        try:
            merge_result = ops.perform_dataframe_upload_merge(workflow,
                                                              dst_df,
                                                              src_df,
                                                              merge_info)
        except Exception:
            raise APIException(_('Unable to perform merge operation'))

        if merge_result:
            # A non-empty result is the error message returned by the merge
            raise APIException(merge_result)

        # Merge went through.
        return Response(serializer.data,
                        status=status.HTTP_201_CREATED)
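
For context, here is a minimal client-side sketch of how such an endpoint might be called. The URL, token, and the JSON encoding of src_df are assumptions for illustration; the actual serializer format for the data frame may differ.

import requests

# Hypothetical values: adjust the URL pattern, token, and frame encoding
# to the deployed API.
API_TOKEN = 'replace-with-a-real-token'
URL = 'https://server.example.com/api/merge/1/'  # 1 = workflow pk

payload = {
    'how': 'inner',                     # one of left/right/outer/inner
    'left_on': 'sid',                   # unique key column in dst_df
    'right_on': 'sid',                  # unique key column in src_df
    'src_df': {'sid': [1, 2], 'grade': [10, 9]},  # encoding is an assumption
}
response = requests.put(URL, json=payload,
                        headers={'Authorization': 'Token ' + API_TOKEN})
print(response.status_code)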
Example #2
def upload_s2(request):
    """
    The four-step process populates a dictionary named upload_data in the
    session (entries are grouped below by the step in which they are set).

    ASSUMES:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    CREATES:

    rename_column_names: Modified column names to remove ambiguity when
                          merging.

    columns_to_upload: Boolean list denoting the columns in SRC that are
                       marked for upload.

    keep_key_column: Boolean list with those key columns that need to be kept.

    :param request: Web request
    :return: the dictionary upload_data in the session object
    """
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Get the dictionary with the information about the upload; it is
    # stored in the session.
    upload_data = request.session.get('upload_data', None)
    if not upload_data:
        # If there is no object, or it is an empty dict, this is a direct
        # jump to this step; go back to the dataops page
        return redirect('dataops:uploadmerge')

    # Get the column names, types, and uniqueness flags from the session
    # dictionary (bracket access so a missing key raises KeyError)
    try:
        initial_columns = upload_data['initial_column_names']
        column_types = upload_data['column_types']
        src_is_key_column = upload_data['src_is_key_column']
    except KeyError:
        # The page has been invoked out of order
        return redirect(
            upload_data.get('step_1', reverse('dataops:uploadmerge')))

    # Get or create the list with the renamed column names
    rename_column_names = upload_data.get('rename_column_names', None)
    if rename_column_names is None:
        rename_column_names = initial_columns[:]
        upload_data['rename_column_names'] = rename_column_names

    # Get or create list of booleans identifying columns to be uploaded
    columns_to_upload = upload_data.get('columns_to_upload', None)
    if columns_to_upload is None:
        columns_to_upload = [True] * len(initial_columns)
        upload_data['columns_to_upload'] = columns_to_upload

    # Get or create list of booleans identifying key columns to be kept
    keep_key_column = upload_data.get('keep_key_column', None)
    if keep_key_column is None:
        keep_key_column = src_is_key_column[:]
        upload_data['keep_key_column'] = keep_key_column

    # Bind the form with the received data (remember unique columns)
    form = SelectColumnUploadForm(request.POST or None,
                                  column_names=rename_column_names,
                                  columns_to_upload=columns_to_upload,
                                  is_key=src_is_key_column,
                                  keep_key=keep_key_column)

    # Get hold of the form fields to build the lists processed in the template
    load_fields = [f for f in form if f.name.startswith('upload_')]
    newname_fields = [f for f in form if f.name.startswith('new_name_')]
    src_key_fields = [
        form['make_key_%s' % idx] if src_is_key_column[idx] else None
        for idx in range(len(src_is_key_column))
    ]

    # Create one of the context elements for the form. Pack the lists so that
    # they can be iterated in the template
    df_info = [
        list(i) for i in zip(load_fields, initial_columns, newname_fields,
                             column_types, src_key_fields)
    ]

    # Process the initial loading of the form and return
    if request.method != 'POST' or not form.is_valid():
        # Update the dictionary with the session information
        request.session['upload_data'] = upload_data
        context = {
            'form': form,
            'wid': workflow.id,
            'prev_step': upload_data['step_1'],
            'df_info': df_info
        }

        if not ops.workflow_id_has_table(workflow.id):
            # It is an upload, not a merge, set the next step to finish
            context['next_name'] = _('Finish')
        return render(request, 'dataops/upload_s2.html', context)

    # At this point we are processing a valid POST request

    # Update upload_data with the information received in the POST
    for i in range(len(initial_columns)):
        new_name = form.cleaned_data['new_name_%s' % i]
        upload_data['rename_column_names'][i] = new_name
        upload = form.cleaned_data['upload_%s' % i]
        upload_data['columns_to_upload'][i] = upload

        if src_is_key_column[i]:
            # If the column is key, check if the user wants to keep it
            keep_key_column[i] = form.cleaned_data['make_key_%s' % i]

    # Update the dictionary with the session information
    request.session['upload_data'] = upload_data

    # Load the existing DF or None if it doesn't exist
    existing_df = pandas_db.load_from_db(workflow.id)

    if existing_df is not None:
        # This is a merge operation, so move to Step 3
        return redirect('dataops:upload_s3')

    # This is an upload operation (not a merge): save the uploaded data
    # frame in the DB and finish.

    # Get the uploaded data_frame
    try:
        data_frame = ops.load_upload_from_db(workflow.id)
    except Exception:
        return render(
            request, 'error.html',
            {'message': _('Exception while retrieving the data frame')})

    # Store the data frame (existing_df is None here, so this is a plain
    # upload rather than a merge)
    status = ops.perform_dataframe_upload_merge(workflow, existing_df,
                                                data_frame, upload_data)

    if status:
        # Something went wrong. Render the form again
        context = {
            'form': form,
            'wid': workflow.id,
            'prev_step': upload_data['step_1'],
            'df_info': df_info
        }
        return render(request, 'dataops/upload_s2.html', context)

    # Nuke the temporary table
    pandas_db.delete_upload_table(workflow.id)

    # Log the event
    col_info = workflow.get_column_info()
    Log.objects.register(
        request.user, Log.WORKFLOW_DATA_UPLOAD, workflow, {
            'id': workflow.id,
            'name': workflow.name,
            'num_rows': workflow.nrows,
            'num_cols': workflow.ncols,
            'column_names': col_info[0],
            'column_types': col_info[1],
            'column_unique': col_info[2]
        })

    # Go back to show the workflow detail
    return redirect(reverse('table:display'))
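
The view above relies on src_is_key_column, a per-column uniqueness flag computed earlier in the process. A minimal sketch of how such a list can be derived with pandas (the helper name is illustrative; the project's own is_unique_column may treat NaNs differently):

import pandas as pd

def detect_key_columns(df):
    """Return a boolean per column: True if it can act as a unique key."""
    # A column is a candidate key when it has no NaNs and no duplicates.
    return [df[col].notna().all() and df[col].is_unique
            for col in df.columns]

frame = pd.DataFrame({'sid': [1, 2, 3], 'grade': [10, 10, 9]})
print(detect_key_columns(frame))  # [True, False]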
Example #3
def upload_s4(request):
    """

    Step 4: Show the user the expected effect of the merge and perform it.

    ASSUMES:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    rename_column_names: Modified column names to remove ambiguity when
                          merging.

    columns_to_upload: Boolean list denoting the columns in SRC that are
                       marked for upload.

    dst_column_names: List of column names in destination frame

    dst_is_unique_column: Boolean list with dst columns that are unique

    dst_unique_col_names: List with the column names that are unique

    dst_selected_key: Key column name selected in DST

    src_selected_key: Key column name selected in SRC

    how_merge: How to merge. One of {left, right, outer, inner}

    :param request: Web request
    :return: Rendered merge preview (GET) or redirect after the merge (POST)
    """
    # Get the workflow id we are processing
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Get the dictionary containing the information about the upload
    upload_data = request.session.get('upload_data', None)
    if not upload_data:
        # If there is no such object, someone is trying to jump directly here.
        return redirect('dataops:uploadmerge')

    # Check the type of request that is being processed
    if request.method == 'POST':
        # We are processing a POST request

        # Get the dataframes to merge
        try:
            dst_df = pandas_db.load_from_db(workflow.id)
            src_df = ops.load_upload_from_db(workflow.id)
        except Exception:
            return render(request, 'error.html',
                          {'message': _('Exception while loading data frame')})

        # Perform the merge
        status = ops.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                                    upload_data)

        # Nuke the temporary table
        pandas_db.delete_upload_table(workflow.id)

        col_info = workflow.get_column_info()
        if status:
            Log.objects.register(
                request.user, Log.WORKFLOW_DATA_FAILEDMERGE, workflow, {
                    'id': workflow.id,
                    'name': workflow.name,
                    'num_rows': workflow.nrows,
                    'num_cols': workflow.ncols,
                    'column_names': col_info[0],
                    'column_types': col_info[1],
                    'column_unique': col_info[2],
                    'error_msg': status
                })

            messages.error(
                request,
                _('Merge operation failed.') + ' (' + status + ')')
            return redirect(reverse('table:display'))

        # Log the event
        Log.objects.register(
            request.user, Log.WORKFLOW_DATA_MERGE, workflow, {
                'id': workflow.id,
                'name': workflow.name,
                'num_rows': workflow.nrows,
                'num_cols': workflow.ncols,
                'column_names': col_info[0],
                'column_types': col_info[1],
                'column_unique': col_info[2]
            })

        # Remove the upload_data dictionary from the session object
        request.session.pop('upload_data', None)

        return redirect(reverse('table:display'))

    # We are processing a GET request

    # Create the information to include in the final report table
    dst_column_names = upload_data['dst_column_names']
    dst_selected_key = upload_data['dst_selected_key']
    src_selected_key = upload_data['src_selected_key']
    # List of final column names
    final_columns = sorted(set().union(dst_column_names,
                                       upload_data['rename_column_names']))
    # Dictionary mapping new src column name -> (old name, is_uploaded?)
    src_info = {
        x: (y, z)
        for (x, y, z) in zip(upload_data['rename_column_names'],
                             upload_data['initial_column_names'],
                             upload_data['columns_to_upload'])
    }

    # Create the strings to show in the table for each row, explaining the
    # effect that the update operation will have on it.
    #
    # There are 8 cases depending on the column name being a key column,
    # in DST, SRC, if SRC is being renamed, and SRC is being loaded.
    #
    # Case 1: The column is the key column used for the merge (skip it)
    #
    # Case 2: in DST, NOT in SRC:
    #         Dst | |
    #
    # Case 3: in DST, in SRC, NOT LOADED
    #         Dst Name | <-- | Src new name (Ignored)
    #
    # Case 4: NOT in DST, in SRC, NOT LOADED
    #         | | Src new name (Ignored)
    #
    # Case 5: in DST, in SRC, Loaded, no rename:
    #         Dst Name (Update) | <-- | Src name
    #
    # Case 6: in DST, in SRC, loaded, rename:
    #         Dst Name (Update) | <-- | Src new name (Renamed)
    #
    # Case 7: NOT in DST, in SRC, loaded, no rename
    #         Dst Name (NEW) | <-- | src name
    #
    # Case 8: NOT in DST, in SRC, loaded, renamed
    #         Dst Name (NEW) | <-- | src name (renamed)
    #
    info = []
    for colname in final_columns:

        # Case 1: Skip the keys
        if colname == src_selected_key or colname == dst_selected_key:
            continue

        # Case 2: Column is in DST and left untouched (no counter part in SRC)
        if colname not in list(src_info.keys()):
            info.append((colname, False, ''))
            continue

        # Get the old name and whether the column is going to be loaded
        old_name, to_load = src_info[colname]

        # Column is not going to be loaded anyway
        if not to_load:
            if colname in dst_column_names:
                # Case 3
                info.append((colname, False, colname + _(' (Ignored)')))
            else:
                # Case 4
                info.append(('', False, colname + _(' (Ignored)')))
            continue

        # Initial name on the dst data frame
        dst_name = colname
        # Column not present in DST, so it is a new column
        if colname not in dst_column_names:
            dst_name += _(' (New)')
        else:
            dst_name += _(' (Update)')

        src_name = colname
        if colname != old_name:
            src_name += _(' (Renamed)')

        # Cases 5 - 8
        info.append((dst_name, True, src_name))

    # Store the value in the request object and update
    request.session['upload_data'] = upload_data

    return render(
        request, 'dataops/upload_s4.html', {
            'prev_step': reverse('dataops:upload_s3'),
            'info': info,
            'next_name': _('Finish')
        })
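
The eight cases enumerated in the comments of this view reduce to a small decision function. This sketch (names are illustrative) produces the same (dst label, loaded, src label) triples the view appends to info for cases 3 through 8:

def classify_column(colname, old_name, to_load, in_dst):
    """Return the (dst label, loaded, src label) row for one column."""
    if not to_load:
        # Cases 3 and 4: ignored column; dst label only if it exists there
        return (colname if in_dst else '', False, colname + ' (Ignored)')
    dst_name = colname + (' (Update)' if in_dst else ' (New)')
    src_name = colname + (' (Renamed)' if colname != old_name else '')
    # Cases 5 to 8
    return (dst_name, True, src_name)

# A renamed source column loaded into an existing destination column:
print(classify_column('grade', 'score', True, True))
# ('grade (Update)', True, 'grade (Renamed)')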
Example #4
    def put(self, request, pk, format=None):
        # Try to retrieve the workflow to check for permissions
        self.get_object(pk, user=self.request.user)
        # Get the dst_df
        dst_df = pandas_db.load_from_db(pk)

        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        # Check that the parameters are correct
        how = serializer.validated_data['how']
        if how not in ['left', 'right', 'outer', 'inner']:
            raise APIException('how must be one of left, right, outer '
                               'or inner')

        left_on = serializer.validated_data['left_on']
        if left_on not in list(dst_df.columns):
            raise APIException('column ' + left_on +
                               ' not found in data frame')
        if not ops.is_unique_column(dst_df[left_on]):
            raise APIException('column ' + left_on +
                               ' does not contain a unique key.')

        # Operation has been accepted by the serializer
        src_df = serializer.validated_data['src_df']

        right_on = serializer.validated_data['right_on']
        if right_on not in list(src_df.columns):
            raise APIException('column ' + right_on +
                               ' not found in data frame')

        if not ops.is_unique_column(src_df[right_on]):
            raise APIException('column ' + right_on +
                               ' does not contain a unique key.')

        dup_column = serializer.validated_data['dup_column']
        if dup_column not in ['override', 'rename']:
            raise APIException('dup_column must be override or rename')

        override_columns_names = []
        autorename_column_names = None
        if dup_column == 'override':
            # List of columns to override (those present in both data sets)
            override_columns_names = list((set(dst_df.columns)
                                           & set(src_df.columns)) - {left_on})
        else:
            autorename_column_names = []
            for colname in list(src_df.columns):
                # If the column is the key, insert as is
                if colname == right_on:
                    autorename_column_names.append(colname)
                    continue

                # If the column does not collide, insert as is
                if colname not in dst_df.columns:
                    autorename_column_names.append(colname)
                    continue

                # Column name collides with existing column
                i = 0  # Suffix to rename
                while True:
                    i += 1
                    new_name = colname + '_{0}'.format(i)
                    if new_name not in dst_df.columns:
                        break
                autorename_column_names.append(new_name)

        merge_info = {
            'how_merge': how,
            'dst_selected_key': left_on,
            'src_selected_key': right_on,
            'initial_column_names': list(src_df.columns),
            'autorename_column_names': autorename_column_names,
            'rename_column_names': list(src_df.columns),
            'columns_to_upload': [True] * len(list(src_df.columns)),
            'override_columns_names': override_columns_names
        }

        # Ready to perform the MERGE
        try:
            merge_result = ops.perform_dataframe_upload_merge(
                pk, dst_df, src_df, merge_info)
        except Exception:
            raise APIException('Unable to perform merge operation')

        if merge_result:
            # A non-empty result is the error message returned by the merge
            raise APIException(merge_result)

        # Merge went through.
        return Response(serializer.data, status=status.HTTP_201_CREATED)
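
The collision loop in the rename branch generalizes to a small helper. A minimal sketch (the function name is an invention for illustration) that returns the first name with a numeric suffix not already taken:

def resolve_collision(colname, existing_names):
    """Return colname, or colname_N for the first N that avoids a clash."""
    if colname not in existing_names:
        return colname
    i = 1
    while '{0}_{1}'.format(colname, i) in existing_names:
        i += 1
    return '{0}_{1}'.format(colname, i)

print(resolve_collision('grade', {'grade', 'grade_1'}))  # grade_2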