def parse_data_frames(self):
    # Parse the two CSV strings and return as data frames
    df_dst = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv1), 0, 0)
    df_src = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv2), 0, 0)

    # Fix the merge_info fields.
    self.merge_info['initial_column_names'] = list(df_src.columns)
    self.merge_info['rename_column_names'] = list(df_src.columns)
    self.merge_info['columns_to_upload'] = list(df_src.columns)

    return df_dst, df_src
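# The helpers here lean on pandas_db.load_df_from_csvfile(file, top, bottom).
# For reference, a minimal sketch of what such a helper could look like; this
# is an assumption for illustration, not the project's actual implementation:
# a thin wrapper over pandas.read_csv that drops leading and trailing lines.
import pandas as pd


def load_df_from_csvfile(csv_file, skip_lines_at_top, skip_lines_at_bottom):
    # skipfooter is only supported by the (slower) python parsing engine
    return pd.read_csv(csv_file,
                       skiprows=skip_lines_at_top,
                       skipfooter=skip_lines_at_bottom,
                       engine='python')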
def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    pandas_db.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas_db.load_table(self.table_name)

    # Data frames must be identical
    assert df_source.equals(df_dst)
def df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    pandas_db.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas_db.load_table(self.table_name)

    # Columns have to have the same values (None and NaN are different,
    # so compare the string representation of each column instead of
    # relying on DataFrame.equals)
    for x in df_source.columns:
        np.testing.assert_array_equal(
            np.array(df_source[x], dtype=unicode),
            np.array(df_dst[x], dtype=unicode))
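# Why df_equivalent_after_sql compares column by column: DataFrame.equals is
# dtype-sensitive, and a round trip through the database can hand back the
# same values under a different dtype (or NaN where the source had None).
# Casting both columns to their string representation compares the values
# while ignoring the dtype. A standalone illustration with hypothetical data
# (dtype=str here; the test above uses unicode, its Python 2 counterpart):
import numpy as np
import pandas as pd

df_a = pd.DataFrame({'x': pd.Series([1, 2, 3], dtype=object)})
df_b = pd.DataFrame({'x': pd.Series([1, 2, 3], dtype='int64')})

assert not df_a.equals(df_b)  # equals fails: object vs int64 dtype

for col in df_a.columns:
    np.testing.assert_array_equal(
        np.array(df_a[col], dtype=str),
        np.array(df_b[col], dtype=str))  # passes: values stringify equally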
def clean(self):
    """
    Check that the numbers of lines to skip are non-negative and, if so,
    parse the CSV file.

    :return: The cleaned data
    """
    data = super(UploadCSVFileForm, self).clean()

    done = False
    if data['skip_lines_at_top'] < 0:
        self.add_error(
            'skip_lines_at_top',
            _('This number has to be zero or positive')
        )
        done = True

    if data['skip_lines_at_bottom'] < 0:
        self.add_error(
            'skip_lines_at_bottom',
            _('This number has to be zero or positive')
        )
        done = True

    if done:
        return data

    # Process CSV file using pandas read_csv
    try:
        self.data_frame = pandas_db.load_df_from_csvfile(
            TextIOWrapper(self.files['file'].file,
                          encoding=self.data.encoding),
            self.cleaned_data['skip_lines_at_top'],
            self.cleaned_data['skip_lines_at_bottom'])
    except Exception as e:
        self.add_error('file',
                       _('File could not be processed ({0})').format(e))
        return data

    # Check the conditions in the data frame
    self.clean_data_frame()

    return data
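# The clean() above references its fields by name and assumes
# "from io import TextIOWrapper" at module level. A minimal sketch of the
# form declaration it implies; the field names come from clean(), while the
# labels and initial values are assumptions for illustration:
from django import forms
from django.utils.translation import ugettext_lazy as _


class UploadCSVFileForm(forms.Form):
    file = forms.FileField(label=_('CSV file'))
    skip_lines_at_top = forms.IntegerField(initial=0)
    skip_lines_at_bottom = forms.IntegerField(initial=0)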
def csvupload1(request):
    """
    The four step process will populate the following dictionary with name
    upload_data (divided by the steps in which they are set).

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :return: Creates the upload_data dictionary in the session
    """
    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Bind the form with the received data
    form = UploadCSVFileForm(request.POST or None, request.FILES or None)

    # Process the initial loading of the form
    if request.method != 'POST':
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process the reception of the file
    if not form.is_multipart():
        msg = "CSV upload form is not multipart"
        context = {'message': msg}

        meta = request.META.get('HTTP_REFERER', None)
        if meta:
            context['meta'] = meta
        return render(request, 'critical_error.html', context=context)

    # If not valid, this is probably because the file submitted was too big
    if not form.is_valid():
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process CSV file using pandas read_csv
    try:
        data_frame = pandas_db.load_df_from_csvfile(
            request.FILES['file'],
            form.cleaned_data['skip_lines_at_top'],
            form.cleaned_data['skip_lines_at_bottom'])
    except Exception as e:
        form.add_error('file',
                       'File could not be processed ({0})'.format(e))
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        dup = [x for x, v in Counter(data_frame.columns).items() if v > 1]
        form.add_error(
            'file',
            'The file has duplicated column names (' + ','.join(dup) + ').')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        form.add_error(
            'file',
            'The data has no column with unique values per row. '
            'At least one column must have unique values.')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame,
                                                      workflow.id)
    except Exception:
        form.add_error(
            'file',
            'Sorry. This file cannot be processed.'
        )
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': 'dataops:csvupload1'
    }

    return redirect('dataops:upload_s2')
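# A note on the duplicate-column check in csvupload1: iterating a Counter
# yields only its keys, so the comprehension has to go through .items() to
# see the counts (as fixed above). A standalone illustration:
from collections import Counter

columns = ['id', 'name', 'email', 'name']
dup = [x for x, v in Counter(columns).items() if v > 1]
assert dup == ['name']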