def store_temporary_dataframe(
    data_frame: pd.DataFrame,
    workflow,
):
    """Store a temporary dataframe.

    :param data_frame: Data frame to store
    :param workflow: Data frame will belong to this workflow
    :return: List of three lists:
        - Data frame columns
        - Column types (OnTask)
        - List of booleans denoting if the column is unique
    """
    table_name = workflow.get_upload_table_name()

    # Check if the columns have unique values per row
    column_unique = pandas.are_unique_columns(data_frame)

    # Store the table in the DB
    pandas.store_table(data_frame, table_name)

    # Get the column types
    df_column_types = sql.get_df_column_types(table_name)

    # Return a list with three lists with information about the
    # data frame that will be needed in the next steps
    return [list(data_frame.columns), df_column_types, column_unique]
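
# Illustrative usage sketch (not part of the original module). It assumes a
# workflow instance exposing get_upload_table_name(), as used above; the
# frame contents and the name `example_workflow` are hypothetical.
def _example_store_temporary_dataframe(example_workflow):
    df = pd.DataFrame({'sid': [1, 2, 3], 'grade': [7.0, 8.5, 8.5]})
    col_names, col_types, col_unique = store_temporary_dataframe(
        df, example_workflow)
    # col_names  -> ['sid', 'grade']
    # col_types  -> the OnTask types reported by sql.get_df_column_types
    # col_unique -> [True, False]: 'sid' is unique per row, 'grade' is not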
def upload_column_name_too_long(self):
    """Use the table store to detect column names that are too long."""
    data_frame = load_df_from_csvfile(io.StringIO(self.csv), 0, 0)
    self.assertTrue(
        any(len(cname) > COLUMN_NAME_SIZE for cname in data_frame.columns))

    try:
        store_table(data_frame, 'TABLE_NAME')
    except Exception as exc:
        self.assertTrue('Column name is longer than' in str(exc))
    else:
        raise Exception('Column with long name is not detected')
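
# The test above relies on store_table rejecting over-long column names. A
# minimal sketch of that kind of guard is shown below; it is an assumption
# for illustration, not OnTask's actual implementation, and reuses the
# COLUMN_NAME_SIZE limit imported by this module.
def _check_column_name_lengths(data_frame: pd.DataFrame):
    for cname in data_frame.columns:
        if len(cname) > COLUMN_NAME_SIZE:
            raise Exception(
                'Column name is longer than {0} characters: {1}'.format(
                    COLUMN_NAME_SIZE, cname))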
def parse_data_frames(self):
    # Parse the two CSV strings and return as data frames
    if self.workflow:
        # Get the workflow data frame
        df_dst = load_table(self.workflow.get_data_frame_table_name())
    else:
        df_dst = load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

    df_src = load_df_from_csvfile(io.StringIO(self.csv2), 0, 0)

    # Round-trip the source frame through the DB so its column types match
    # those produced by load_table
    store_table(df_src, 'TEMPORARY_TABLE')
    df_src = load_table('TEMPORARY_TABLE')

    # Fix the merge_info fields.
    self.merge_info['initial_column_names'] = list(df_src.columns)
    self.merge_info['rename_column_names'] = list(df_src.columns)
    self.merge_info['columns_to_upload'] = list(df_src.columns)

    return df_dst, df_src
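
# For context, a hedged sketch of the kind of merge_info dictionary the
# fixture above completes. The extra keys (src_selected_key,
# dst_selected_key, how_merge) and their values are assumptions for
# illustration; only the three column-list keys are set by parse_data_frames.
def _example_merge_info(df_src: pd.DataFrame) -> dict:
    return {
        'initial_column_names': list(df_src.columns),
        'rename_column_names': list(df_src.columns),
        'columns_to_upload': list(df_src.columns),
        'src_selected_key': 'sid',   # hypothetical key column in df_src
        'dst_selected_key': 'sid',   # hypothetical key column in df_dst
        'how_merge': 'inner',        # assumed join type
    }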
def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = services.load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    pandas.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas.load_table(self.table_name)

    # NaN in boolean columns are now None
    df_source['bool1'] = df_source['bool1'].where(
        pd.notnull(df_source['bool1']), None)
    df_source['bool2'] = df_source['bool2'].where(
        pd.notnull(df_source['bool2']), None)

    # Data frames must be identical
    assert df_source.equals(df_dst)
def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = load_table(self.table_name)

    # NaN in boolean columns are now None
    df_source['bool1'] = df_source['bool1'].where(
        pd.notnull(df_source['bool1']), None)
    df_source['bool2'] = df_source['bool2'].where(
        pd.notnull(df_source['bool2']), None)

    # Datetimes need to be localized to the local timezone
    df_dst['date1'] = df_dst['date1'].dt.tz_convert(settings.TIME_ZONE)
    df_dst['date2'] = df_dst['date2'].dt.tz_convert(settings.TIME_ZONE)

    # Data frames must be identical
    assert df_source.equals(df_dst)
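
# Why the tz_convert calls above are needed: a minimal sketch of the
# round-trip effect, assuming datetimes come back from the database as
# timezone-aware values in UTC (typical for PostgreSQL timestamptz columns
# read through pandas).
def _example_tz_round_trip():
    ts = pd.Series(pd.to_datetime(['2018-01-01 00:00:00']))
    localized = ts.dt.tz_localize(settings.TIME_ZONE)  # as stored
    from_db = localized.dt.tz_convert('UTC')           # as read back
    # Converting back to the local timezone restores the original values
    assert localized.equals(from_db.dt.tz_convert(settings.TIME_ZONE))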
def test_eval_sql(self):
    # Create the dataframe with the variables
    df = pd.DataFrame(
        [(1, 2.0, True, 'xxx', datetime.datetime(2018, 1, 1, 0, 0, 0)),
         (None, None, None, None, None)],
        columns=self.test_columns)

    # Store the data frame
    store_table(df, 'TEST_TABLE')

    # EQUAL
    self.do_sql_txt_operand('number', '{0}equal', 'integer', '1')
    self.do_sql_txt_operand('number', '{0}equal', 'double', '2.0')
    self.do_sql_txt_operand('select', '{0}equal', 'boolean', 'true')
    self.do_sql_txt_operand('text', '{0}equal', 'string', 'xxx')
    self.do_sql_txt_operand(
        'text', '{0}equal', 'datetime', '2018-01-01T00:00:00')

    # BEGINS WITH
    self.do_sql_txt_operand('text', '{0}begins_with', 'string', 'x')

    # CONTAINS
    self.do_sql_txt_operand('text', '{0}contains', 'string', 'xx')

    # ENDS WITH
    self.do_sql_txt_operand('text', '{0}ends_with', 'string', 'xx')

    # IS EMPTY
    self.do_sql_txt_operand('text', 'is_{0}empty', 'string', None)

    # IS NULL
    self.do_sql_txt_operand('number', 'is_{0}null', 'integer', None)
    self.do_sql_txt_operand('number', 'is_{0}null', 'double', None)
    self.do_sql_txt_operand('select', 'is_{0}null', 'boolean', None)
    self.do_sql_txt_operand('text', 'is_{0}null', 'string', None)
    self.do_sql_txt_operand('text', 'is_{0}null', 'datetime', None)

    # LESS
    self.do_sql_txt_operand('number', 'less', 'integer', '2')
    self.do_sql_txt_operand('number', 'less', 'double', '3.2')
    self.do_sql_txt_operand(
        'text', 'less', 'datetime', '2018-01-02T00:00:00')

    # LESS OR EQUAL
    self.do_sql_txt_operand('number', 'less_or_equal', 'integer', '1')
    self.do_sql_txt_operand('number', 'less_or_equal', 'double', '2.0')
    self.do_sql_txt_operand(
        'text', 'less_or_equal', 'datetime', '2018-01-01T00:00:00')

    # GREATER
    self.do_sql_txt_operand('number', 'greater', 'integer', '0')
    self.do_sql_txt_operand('number', 'greater', 'double', '1.2')
    self.do_sql_txt_operand(
        'text', 'greater', 'datetime', '2017-01-01T00:00:00')

    # GREATER OR EQUAL
    self.do_sql_txt_operand('number', 'greater_or_equal', 'integer', '1')
    self.do_sql_txt_operand('number', 'greater_or_equal', 'double', '2.0')
    self.do_sql_txt_operand(
        'text', 'greater_or_equal', 'datetime', '2018-01-01T00:00:00')

    # BETWEEN
    self.do_sql_txt_operand(
        'number', '{0}between', 'integer', ['0', '2'], 1, 0)
    self.do_sql_txt_operand(
        'number', '{0}between', 'double', ['1.2', '2.2'], 1, 0)
    self.do_sql_txt_operand(
        'text', '{0}between', 'datetime',
        ['2017-01-01T00:00:00', '2018-09-13T00:00:00'], 1, 0)
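
# A hedged sketch of what the '{0}' placeholder in the operand names above
# appears to do: it is filled with '' for the positive form and 'not_' for
# the negated one (compare '{0}equal' with 'is_{0}null'), so each call
# exercises both the predicate and its negation. This fragment is an
# illustration, not the actual implementation of do_sql_txt_operand.
def _example_operand_names(operand_fmt: str):
    positive = operand_fmt.format('')      # e.g. '{0}between' -> 'between'
    negated = operand_fmt.format('not_')   # e.g. '{0}between' -> 'not_between'
    return positive, negated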
def create(self, validated_data, **kwargs):
    """Create the new workflow."""
    wflow_name = self.context.get('name')
    if not wflow_name:
        wflow_name = self.validated_data.get('name')
        if not wflow_name:
            raise Exception(_('Unexpected empty workflow name.'))

        if models.Workflow.objects.filter(
            name=wflow_name, user=self.context['user'],
        ).exists():
            raise Exception(_(
                'There is a workflow with this name. '
                'Please provide a workflow name in the import page.'))

    # Initial values
    workflow_obj = None
    try:
        workflow_obj = models.Workflow(
            user=self.context['user'],
            name=wflow_name,
            description_text=validated_data['description_text'],
            nrows=0,
            ncols=0,
            attributes=validated_data['attributes'],
            query_builder_ops=validated_data.get('query_builder_ops', {}),
        )
        workflow_obj.save()

        # Create the columns
        column_data = ColumnSerializer(
            data=validated_data.get('columns', []),
            many=True,
            context={'workflow': workflow_obj})

        # And save its content
        if column_data.is_valid():
            columns = column_data.save()
        else:
            raise Exception(_('Unable to save column information'))

        # If there is any column with position = 0, recompute (this is to
        # guarantee backward compatibility).
        if any(col.position == 0 for col in columns):
            for idx, col in enumerate(columns):
                col.position = idx + 1
                col.save()

        # Load the data frame
        data_frame = validated_data.get('data_frame')
        if data_frame is not None:
            # Store the table in the DB
            pandas.store_table(
                data_frame,
                workflow_obj.get_data_frame_table_name(),
                dtype={
                    col.name: col.data_type
                    for col in workflow_obj.columns.all()},
            )

            # Reconcile now the information in workflow and columns with
            # the one loaded
            workflow_obj.ncols = validated_data['ncols']
            workflow_obj.nrows = validated_data['nrows']
            workflow_obj.save()

        # Create the actions pointing to the workflow
        action_data = ActionSerializer(
            data=validated_data.get('actions', []),
            many=True,
            context={'workflow': workflow_obj, 'columns': columns})
        if action_data.is_valid():
            action_data.save()
        else:
            raise Exception(_('Unable to save action information'))

        # Create the views pointing to the workflow
        view_data = ViewSerializer(
            data=validated_data.get('views', []),
            many=True,
            context={'workflow': workflow_obj, 'columns': columns})
        if view_data.is_valid():
            view_data.save()
        else:
            raise Exception(_('Unable to save view information'))
    except Exception:
        # Get rid of the objects created
        if workflow_obj:
            if workflow_obj.id:
                workflow_obj.delete()
        raise

    return workflow_obj
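
# A minimal usage sketch for create() above. The enclosing serializer class
# name (WorkflowImportSerializer) and the variables request/parsed_json are
# assumptions for illustration; the context keys match those read by
# create().
def _example_import_workflow(request, parsed_json):
    serializer = WorkflowImportSerializer(
        data=parsed_json,
        context={'user': request.user, 'name': 'Imported workflow'})
    serializer.is_valid(raise_exception=True)
    return serializer.save()  # invokes create() defined above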