def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    pandas_db.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas_db.load_table(self.table_name)

    # Data frames must be identical
    assert df_source.equals(df_dst)
def df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = pandas_db.load_df_from_csvfile(
        StringIO.StringIO(self.csv1), 0, 0)

    # Store the DF in the DB
    pandas_db.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas_db.load_table(self.table_name)

    # Columns have to have the same values (None and NaN are different)
    for x in df_source.columns:
        np.testing.assert_array_equal(
            np.array(df_source[x], dtype=unicode),
            np.array(df_dst[x], dtype=unicode))
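# Illustration (not from the source) of why the loop above casts to unicode
# rather than relying on equals(): pandas' equals() treats missing values in
# the same location as equal, so a None that comes back from the DB as NaN
# would slip through, while the unicode cast distinguishes the two
# representations. Assumes the same Python 2 pandas/numpy environment as the
# tests.
import numpy as np
import pandas as pd

a = pd.Series([None, 'x'], dtype=object)
b = pd.Series([np.nan, 'x'], dtype=object)
print(np.array(a, dtype=unicode))  # [u'None' u'x']
print(np.array(b, dtype=unicode))  # [u'nan' u'x'] -- different strings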
def store_table_in_db(data_frame, pk, table_name, temporary=False):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating if the table is temporary, or it
           belongs to an existing workflow.
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """
    if settings.DEBUG:
        print('Storing table ', table_name)

    # Get the column names
    df_column_names = list(data_frame.columns)

    # If the data frame is temporary, the procedure is much simpler
    if temporary:
        # Check if the columns have unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three lists with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_col_names = Column.objects.filter(workflow__id=pk).values_list(
        "name", flat=True)

    # Loop over the columns in the data frame and reconcile the column info
    # with the column objects attached to the WF
    for cname in df_column_names:
        # If the column already exists in the workflow, nothing to do
        if cname in wf_col_names:
            continue

        # Create the new column
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=Column.objects.filter(workflow=workflow).count() + 1,
        )
        column.save()

    # Get the new set of columns with names
    wf_columns = Column.objects.filter(workflow__id=pk)

    # Reorder the columns in the data frame
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # The column type in the DB differs from the one in the object:
            # update the object
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
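# A hedged usage sketch for store_table_in_db; the workflow 'wf', the CSV
# text and the table names below are illustrative, not from the source. For
# a temporary table the column metadata is returned to the caller; otherwise
# the Column objects and workflow counters are updated in the DB and the
# function returns None.
df = pandas_db.load_df_from_csvfile(StringIO.StringIO(csv_text), 0, 0)

# Temporary table: get the metadata back for the next steps of the upload
names, types, uniques = store_table_in_db(
    df, wf.id, '__ONTASK_TMP_TABLE', temporary=True)

# Workflow table: columns are reconciled and the workflow object is saved
store_table_in_db(df, wf.id, 'workflow_table_1')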
def test_eval_sql(self):
    # Create the dataframe with the variables
    df = pd.DataFrame(
        [(1, 2.0, True, 'xxx', datetime.datetime(2018, 1, 1, 0, 0, 0)),
         (None, None, None, None, None)],
        columns=['v_integer', 'v_double', 'v_boolean', 'v_string',
                 'v_datetime'])

    # Store the data frame
    pandas_db.store_table(df, 'TEST_TABLE')

    #
    # EQUAL
    #
    self.do_sql_operand('number', '{0}equal', 'integer', '1')
    self.do_sql_operand('number', '{0}equal', 'double', '2.0')
    self.do_sql_operand('number', '{0}equal', 'boolean', '1')
    self.do_sql_operand('text', '{0}equal', 'string', 'xxx')
    self.do_sql_operand('text', '{0}equal', 'datetime',
                        '2018-01-01T00:00:00')

    #
    # BEGINS WITH
    #
    self.do_sql_operand('text', '{0}begins_with', 'string', 'x')

    #
    # CONTAINS
    #
    self.do_sql_operand('text', '{0}contains', 'string', 'xx')

    #
    # ENDS WITH
    #
    self.do_sql_operand('text', '{0}ends_with', 'string', 'xx')

    #
    # IS EMPTY
    #
    self.do_sql_operand('number', 'is_{0}empty', 'integer', None)
    self.do_sql_operand('number', 'is_{0}empty', 'double', None)
    self.do_sql_operand('text', 'is_{0}empty', 'string', None)
    self.do_sql_operand('text', 'is_{0}empty', 'boolean', None)
    self.do_sql_operand('text', 'is_{0}empty', 'datetime', None)

    #
    # LESS
    #
    self.do_sql_operand('number', 'less', 'integer', '2')
    self.do_sql_operand('number', 'less', 'double', '3.2')
    self.do_sql_operand('text', 'less', 'datetime', '2018-01-02T00:00:00')

    #
    # LESS OR EQUAL
    #
    self.do_sql_operand('number', 'less_or_equal', 'integer', '1')
    self.do_sql_operand('number', 'less_or_equal', 'double', '2.0')
    self.do_sql_operand('text', 'less_or_equal', 'datetime',
                        '2018-01-01T00:00:00')

    #
    # GREATER
    #
    self.do_sql_operand('number', 'greater', 'integer', '0')
    self.do_sql_operand('number', 'greater', 'double', '1.2')
    self.do_sql_operand('text', 'greater', 'datetime',
                        '2017-01-01T00:00:00')

    #
    # GREATER OR EQUAL
    #
    self.do_sql_operand('number', 'greater_or_equal', 'integer', '1')
    self.do_sql_operand('number', 'greater_or_equal', 'double', '2.0')
    self.do_sql_operand('text', 'greater_or_equal', 'datetime',
                        '2018-01-01T00:00:00')

    #
    # BETWEEN
    #
    self.do_sql_operand('number', '{0}between', 'integer', ['0', '2'])
    self.do_sql_operand('number', '{0}between', 'double', ['1.2', '2.2'])
    self.do_sql_operand('text', '{0}between', 'datetime',
                        ['2017-01-01T00:00:00', '2018-09-13T00:00:00'])
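# Note on the datetime cases above: they go through the 'text' operand
# because zero-padded ISO-8601 timestamps sort lexicographically in
# chronological order, so plain string comparison agrees with datetime
# comparison.
assert '2017-01-01T00:00:00' < '2018-01-01T00:00:00' < '2018-09-13T00:00:00'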
def store_table_in_db(data_frame, pk, table_name, temporary=False,
                      reset_keys=True):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating if the table is temporary, or it
           belongs to an existing workflow.
    :param reset_keys: Reset the value of the field is_key, computing it
           from scratch
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """
    if settings.DEBUG:
        print('Storing table ', table_name)

    # Get the column names
    df_column_names = list(data_frame.columns)

    # If the data frame is temporary, the procedure is much simpler
    if temporary:
        # Check if the columns have unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three lists with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_cols = workflow.columns.all()

    # Loop over the columns in the workflow to refresh the is_key value.
    # Values may have been added to a column, so this field needs to be
    # reassessed
    for col in wf_cols:
        if reset_keys:
            new_val = is_unique_column(data_frame[col.name])
            if col.is_key and not new_val:
                # Only update is_key when the column claims to be a key but
                # the values say otherwise. The other way around (is_key is
                # False in the column) is ignored because it may have been
                # set by the user
                col.is_key = new_val
                col.save()

        # Remove this column name from df_column_names; no further
        # processing is needed
        df_column_names.remove(col.name)

    # Loop over the remaining columns in the data frame and create the new
    # column objects in the workflow
    for cname in df_column_names:
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=workflow.columns.count() + 1
        )
        column.save()

    # Get the new set of columns with names
    wf_columns = workflow.columns.all()

    # Reorder the columns in the data frame
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # The column type in the DB differs from the one in the object:
            # update the object
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
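# A hedged sketch of the effect of reset_keys; 'df', 'wf' and the table name
# are illustrative, not from the source. With reset_keys=True a column
# currently marked is_key is demoted when its new values are no longer
# unique; with reset_keys=False the stored is_key flags are left untouched,
# e.g. when the caller knows the key columns cannot have changed.
store_table_in_db(df, wf.id, 'workflow_table_1')                     # re-check keys
store_table_in_db(df, wf.id, 'workflow_table_1', reset_keys=False)   # keep flags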