Example #1
    def clean_data_frame(self):
        """
        Function to check that the integers are positive.
        :return: The cleaned data
        """

        try:
            # Verify the data frame
            pandas_db.verify_data_frame(self.data_frame)
        except OnTaskDataFrameNoKey as e:
            self.add_error('file', e)
            # FIX: remove once django-bootstrap4 fixes the bug preventing
            # file feedback from showing.
            self.add_error(None, e)
            return

        # Store the data frame in the DB.
        try:
            # Get frame info with three lists: names, types and is_key
            self.frame_info = ops.store_upload_dataframe_in_db(
                self.data_frame,
                self.workflow_id)
        except Exception as e:
            self.add_error('file',
                           _('Unable to process file ({0}).').format(e))
            # FIX: remove once django-bootstrap4 fixes the bug preventing
            # file feedback from showing.
            self.add_error(None,
                           _('Unable to process file ({0}).').format(e))

        return
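
The snippet catches OnTaskDataFrameNoKey, raised by pandas_db.verify_data_frame when the frame cannot anchor a workflow. A minimal sketch of that check, assuming (as the later examples suggest) that the requirement is at least one column whose values are unique per row; the real implementation may validate more:

class OnTaskDataFrameNoKey(Exception):
    """Raised when a data frame has no column usable as a unique key."""
    pass


def verify_data_frame(data_frame):
    # Sketch: require at least one column with all-distinct values.
    if not any(data_frame[col].is_unique for col in data_frame.columns):
        raise OnTaskDataFrameNoKey(
            'The data has no column with unique values per row. '
            'At least one column must have unique values.')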
Example #2
def excelupload1(request):
    """
    Step 1 of the four-step process to read data into the platform.

    The process gradually populates a dictionary named upload_data in the
    session (entries grouped by the step in which they are set):

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :return: HTTP response; on success, creates the upload_data dictionary in the session
    """

    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Bind the form with the received data
    form = UploadExcelFileForm(request.POST or None, request.FILES or None)

    # Process the initial loading of the form
    if request.method != 'POST':
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # Process the reception of the file
    if not form.is_multipart():
        msg = _("Excel upload form is not multiform")
        context = {'message': msg}

        meta = request.META.get('HTTP_REFERER', None)
        if meta:
            context['meta'] = meta
        return render(request, 'critical_error.html', context=context)

    # If not valid, this is probably because the file submitted was too big
    if not form.is_valid():
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # Process Excel file using pandas read_excel
    try:
        data_frame = pd.read_excel(
            request.FILES['file'],
            sheet_name=form.cleaned_data['sheet'],
            index_col=False)

        # Strip white space from all string columns and try to convert to
        # datetime just in case
        for x in list(data_frame.columns):
            if data_frame[x].dtype.name == 'object':
                # Column is a string!
                data_frame[x] = data_frame[x].str.strip()

                # Try the datetime conversion
                try:
                    series = pd.to_datetime(data_frame[x],
                                            infer_datetime_format=True)
                    # Datetime conversion worked! Update the data_frame
                    data_frame[x] = series
                except ValueError:
                    pass
    except Exception as e:
        form.add_error(
            'file',
            _('File could not be processed ({0})').format(e))
        return render(request,
                      'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        dup = [x for x, v in Counter(data_frame.columns).items() if v > 1]
        form.add_error(
            'file',
            _('The file has duplicated column names') + ' (' +
            ','.join(dup) + ').')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        form.add_error(
            'file',
            _('The data has no column with unique values per row. '
              'At least one column must have unique values.'))
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame, workflow.id)
    except Exception:
        form.add_error(
            'file',
            _('Sorry. This file cannot be processed.')
        )
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'Excel',
                       'dtype_select': 'Excel file',
                       'prev_step': reverse('dataops:uploadmerge')})

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': reverse('dataops:excelupload1')
    }

    return redirect('dataops:upload_s2')
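
The helper ops.are_unique_columns is used but not listed. A plausible minimal sketch, assuming it returns one boolean per column, True when every value in the column is distinct (and therefore usable as a key); NaN handling in the real code may differ:

import pandas as pd


def are_unique_columns(data_frame):
    # One boolean per column: True when the column can act as a key.
    return [
        data_frame[col].is_unique and not data_frame[col].isnull().any()
        for col in data_frame.columns
    ]


# Example: only 'email' qualifies as a key column here
df = pd.DataFrame({'email': ['a@x.org', 'b@x.org'], 'grade': [10, 10]})
print(are_unique_columns(df))  # [True, False]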
Example #3
def csvupload1(request):
    """
    Step 1 of the four-step process to read a CSV file into the platform.

    The process gradually populates a dictionary named upload_data in the
    session (entries grouped by the step in which they are set):

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :return: HTTP response; on success, creates the upload_data dictionary in the session
    """

    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Bind the form with the received data
    form = UploadCSVFileForm(request.POST or None, request.FILES or None)

    # Process the initial loading of the form
    if request.method != 'POST':
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process the reception of the file
    if not form.is_multipart():
        msg = "CSV upload form is not multiform"
        context = {'message': msg}

        meta = request.META.get('HTTP_REFERER', None)
        if meta:
            context['meta'] = meta
        return render(request, 'critical_error.html', context=context)

    # If not valid, this is probably because the file submitted was too big
    if not form.is_valid():
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process CSV file using pandas read_csv
    try:
        data_frame = pandas_db.load_df_from_csvfile(
            request.FILES['file'],
            form.cleaned_data['skip_lines_at_top'],
            form.cleaned_data['skip_lines_at_bottom'])
    except Exception as e:
        form.add_error('file',
                       'File could not be processed ({0})'.format(e))
        return render(request,
                      'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        dup = [x for x, v in Counter(data_frame.columns).items() if v > 1]
        form.add_error(
            'file',
            'The file has duplicated column names (' +
            ','.join(dup) + ').')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        form.add_error(
            'file',
            'The data has no column with unique values per row. '
            'At least one column must have unique values.')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame, workflow.id)
    except Exception:
        form.add_error(
            'file',
            'Sorry. This file cannot be processed.'
        )
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': reverse('dataops:csvupload1')
    }

    return redirect('dataops:upload_s2')
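
The loader pandas_db.load_df_from_csvfile is referenced but not listed. A hedged sketch, assuming it wraps pandas.read_csv and maps the two skip counts onto skiprows/skipfooter; the real implementation may add encoding or dtype handling:

import pandas as pd


def load_df_from_csvfile(csv_file, skip_lines_at_top, skip_lines_at_bottom):
    # skipfooter is only honoured by the python parsing engine, so switch
    # engines when trailing lines must be dropped.
    return pd.read_csv(
        csv_file,
        index_col=False,
        skiprows=skip_lines_at_top,
        skipfooter=skip_lines_at_bottom,
        engine='python' if skip_lines_at_bottom else 'c')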
Example #4
def sqlupload1(request, pk):
    """
    Step 1 of the four-step process to read data from an SQL connection.

    The process gradually populates a dictionary named upload_data in the
    session (entries grouped by the step in which they are set):

    STEP 1:

    initial_column_names: List of column names in the retrieved table.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :param pk: Primary key of the SQL connection to use
    :return: HTTP response; on success, creates the upload_data dictionary in the session
    """

    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Get the connection
    conn = SQLConnection.objects.filter(pk=pk).first()
    if not conn:
        return redirect('dataops:sqlconns')

    form = None
    if conn.db_password:
        # The connection needs a password to operate
        form = SQLRequestPassword(request.POST or None)

    context = {
        'form': form,
        'wid': workflow.id,
        'dtype': 'SQL',
        'dtype_select': 'SQL connection',
        'prev_step': reverse('dataops:sqlconns'),
        'conn_type': conn.conn_type,
        'conn_driver': conn.conn_driver,
        'db_user': conn.db_user,
        'db_passwd': '<PROTECTED>' if conn.db_password else '',
        'db_host': conn.db_host,
        'db_port': conn.db_port,
        'db_name': conn.db_name,
        'db_table': conn.db_table
    }

    # Process the initial loading of the form
    if request.method != 'POST' or (form and not form.is_valid()):
        return render(request, 'dataops/sqlupload1.html', context)

    read_pwd = None
    if form:
        read_pwd = form.cleaned_data['password']

    # Process SQL connection using pandas
    try:
        data_frame = pandas_db.load_df_from_sqlconnection(conn, read_pwd)
    except Exception as e:
        messages.error(request, 'Unable to obtain data: {0}'.format(e))
        return render(request, 'dataops/sqlupload1.html', context)

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        dup = [x for x, v in Counter(data_frame.columns).items() if v > 1]
        messages.error(
            request, 'The data frame has duplicated column names (' +
            ','.join(dup) + ').')
        return render(request, 'dataops/sqlupload1.html', context)

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        messages.error(
            request, 'The data has no column with unique values per row. '
            'At least one column must have unique values.')
        return render(request, 'dataops/sqlupload1.html', context)

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame, workflow.id)
    except Exception:
        # The form may be None here (connections without a password), so
        # report the error through the messages framework instead.
        messages.error(
            request,
            'Sorry. The data from this connection cannot be processed.')
        return render(request, 'dataops/sqlupload1.html', context)

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': reverse('dataops:sqlupload1', kwargs={'pk': conn.id})
    }

    return redirect('dataops:upload_s2')
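
Finally, pandas_db.load_df_from_sqlconnection is also external to this listing. A sketch of one plausible implementation, assuming the connection fields shown in the context dictionary above (conn_type, conn_driver, db_user, db_host, db_port, db_name, db_table) combine into an SQLAlchemy URL and the table is read with pandas.read_sql_table:

import pandas as pd
from sqlalchemy import create_engine


def load_df_from_sqlconnection(conn, password=None):
    # Build a dialect[+driver]://user:password@host:port/name URL from the
    # stored connection object; fall back to the stored password.
    url = '{0}{1}://{2}:{3}@{4}:{5}/{6}'.format(
        conn.conn_type,
        '+' + conn.conn_driver if conn.conn_driver else '',
        conn.db_user,
        password or conn.db_password,
        conn.db_host,
        conn.db_port,
        conn.db_name)
    engine = create_engine(url)
    return pd.read_sql_table(conn.db_table, engine)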