Ejemplo n.º 1
0
    def parse_data_frames(self):
        """
        Build the destination and source data frames from the CSV strings
        held in self.csv1 and self.csv2, and refresh self.merge_info with
        the column names of the source frame.

        :return: Tuple (destination data frame, source data frame)
        """
        # csv1 produces the destination frame and csv2 the source frame. The
        # two zeros are passed positionally to the loader (the upload form in
        # this code base passes its skip-lines-at-top/bottom values there).
        destination, source = [
            pandas_db.load_df_from_csvfile(StringIO.StringIO(csv_text), 0, 0)
            for csv_text in (self.csv1, self.csv2)
        ]

        # Each merge_info entry receives its own fresh list with the source
        # column names, so later changes to one list do not affect the others.
        for info_key in ('initial_column_names',
                         'rename_column_names',
                         'columns_to_upload'):
            self.merge_info[info_key] = list(source.columns)

        return destination, source
Ejemplo n.º 2
0
    def test_df_equivalent_after_sql(self):
        """
        Check that the data frame parsed from self.csv1 is equal to the one
        obtained after storing it in the DB table self.table_name and loading
        it back.
        """
        # Data frame parsed from the CSV text
        csv_buffer = StringIO.StringIO(self.csv1)
        original = pandas_db.load_df_from_csvfile(csv_buffer, 0, 0)

        # Round trip through the DB: store the frame, then load it back
        pandas_db.store_table(original, self.table_name)
        reloaded = pandas_db.load_table(self.table_name)

        # Data frames must be identical
        assert original.equals(reloaded)
Ejemplo n.º 3
0
    def df_equivalent_after_sql(self):
        """
        Check, column by column, that the data frame parsed from self.csv1
        holds the same values after being stored in the DB table
        self.table_name and loaded back.
        """
        # Data frame parsed from the CSV text
        csv_buffer = StringIO.StringIO(self.csv1)
        original = pandas_db.load_df_from_csvfile(csv_buffer, 0, 0)

        # Round trip through the DB: store the frame, then load it back
        pandas_db.store_table(original, self.table_name)
        reloaded = pandas_db.load_table(self.table_name)

        # Every column is compared after conversion to a unicode array,
        # because the values have to match (None and NaN are different).
        for column_name in original.columns:
            expected = np.array(original[column_name], dtype=unicode)
            obtained = np.array(reloaded[column_name], dtype=unicode)
            np.testing.assert_array_equal(expected, obtained)
Ejemplo n.º 4
0
    def clean(self):
        """
        Validate the form as a whole and load the uploaded CSV file.

        Checks that the number of lines to skip at the top and at the bottom
        are not negative, parses the uploaded file with
        pandas_db.load_df_from_csvfile, stores the result in self.data_frame
        and finally runs self.clean_data_frame(). Every problem detected is
        reported through add_error instead of raising.

        :return: The cleaned data
        """

        data = super(UploadCSVFileForm, self).clean()

        # The form-level clean() is executed even when a field failed its own
        # validation, and such a field is then missing from the cleaned data.
        # A missing value already has its error attached, so it is only
        # flagged here instead of being indexed (which would raise KeyError).
        invalid = False
        for field_name in ('skip_lines_at_top', 'skip_lines_at_bottom'):
            if field_name not in data:
                invalid = True
            elif data[field_name] < 0:
                self.add_error(
                    field_name,
                    _('This number has to be zero or positive')
                )
                invalid = True

        # No point in parsing the file with unusable skip values
        if invalid:
            return data

        # Process CSV file using pandas read_csv
        try:
            # The uploaded file is wrapped to be read as text using the
            # encoding attribute of the submitted data.
            self.data_frame = pandas_db.load_df_from_csvfile(
                TextIOWrapper(self.files['file'].file,
                              encoding=self.data.encoding),
                self.cleaned_data['skip_lines_at_top'],
                self.cleaned_data['skip_lines_at_bottom'])
        except Exception as e:
            # Any failure while reading the file is shown as a form error
            self.add_error('file',
                           _('File could not be processed ({0})').format(e))
            return data

        # Check the conditions in the data frame
        self.clean_data_frame()

        return data
Ejemplo n.º 5
0
def csvupload1(request):
    """
    First step of the CSV upload: receive the file, check that it can be
    used and store it in the DB.

    The four step process will populate the following dictionary with name
    upload_data (divided by the steps in which they are set):

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :return: HTTP response. When the file is accepted, the upload_data
    dictionary is created in the session and the response is a redirect to
    'dataops:upload_s2'. Otherwise the upload page is rendered again with the
    errors attached to the form (or a redirect/critical error page when there
    is no workflow or the form is not multipart).
    """

    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Bind the form with the received data
    form = UploadCSVFileForm(request.POST or None, request.FILES or None)

    def render_upload_page():
        # Single place to render the upload page so that every path (first
        # load and all error cases) receives the same context, including the
        # workflow id. Errors added to the form before the call are shown.
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process the initial loading of the form
    if request.method != 'POST':
        return render_upload_page()

    # Process the reception of the file
    if not form.is_multipart():
        msg = "CSV upload form is not multiform"
        context = {'message': msg}

        meta = request.META.get('HTTP_REFERER', None)
        if meta:
            context['meta'] = meta
        return render(request, 'critical_error.html', context=context)

    # If not valid, this is probably because the file submitted was too big
    if not form.is_valid():
        return render_upload_page()

    # Process CSV file using pandas read_csv
    try:
        data_frame = pandas_db.load_df_from_csvfile(
            request.FILES['file'],
            form.cleaned_data['skip_lines_at_top'],
            form.cleaned_data['skip_lines_at_bottom'])
    except Exception as e:
        # Format the exception itself: the "message" attribute is not
        # available on every exception.
        form.add_error('file',
                       'File could not be processed ({0})'.format(e))
        return render_upload_page()

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        # items() is needed to obtain (name, count) pairs; iterating the
        # Counter directly only produces the names.
        column_counts = Counter(list(data_frame.columns))
        dup = [x for x, v in column_counts.items() if v > 1]
        form.add_error(
            'file',
            'The file has duplicated column names (' +
            ','.join(dup) + ').')
        return render_upload_page()

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        form.add_error(
            'file',
            'The data has no column with unique values per row. '
            'At least one column must have unique values.')
        return render_upload_page()

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame, workflow.id)
    except Exception:
        # Any failure while storing is reported to the user as a generic
        # error on the file field.
        form.add_error(
            'file',
            'Sorry. This file cannot be processed.'
        )
        return render_upload_page()

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': 'dataops:csvupload1'
    }

    return redirect('dataops:upload_s2')