def delete_data_frame_table(sender, **kwargs):
    """Delete the data table when deleting the workflow."""
    del sender
    instance = kwargs.get('instance')
    if not instance:
        return

    if instance.has_table():
        sql.delete_table(instance.get_data_frame_table_name())


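# A minimal sketch of how a cleanup handler like the one above is usually
# registered, assuming the Workflow model is deleted through the ORM so that
# Django's post_delete signal fires. The registration point (normally the
# app's AppConfig.ready()) and the sender are assumptions for illustration.
from django.db.models.signals import post_delete

post_delete.connect(delete_data_frame_table, sender=models.Workflow)

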
def upload_step_four(
    request: http.HttpRequest,
    workflow: models.Workflow,
    upload_data: Dict,
) -> http.HttpResponse:
    """Perform the merge operation.

    :param request: Received request
    :param workflow: Workflow being processed
    :param upload_data: Dictionary with all the information about the merge.
    :return: HttpResponse
    """
    # Get the dataframes to merge
    try:
        dst_df = pandas.load_table(workflow.get_data_frame_table_name())
        src_df = pandas.load_table(workflow.get_upload_table_name())
    except Exception:
        return render(
            request,
            'error.html',
            {'message': _('Exception while loading data frame')})

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_upload_table_name())
        col_info = workflow.get_column_info()
        workflow.log(
            request.user,
            models.Log.WORKFLOW_DATA_FAILEDMERGE,
            column_names=col_info[0],
            column_types=col_info[1],
            column_unique=col_info[2],
            error_message=str(exc))
        messages.error(request, _('Merge operation failed. ') + str(exc))
        return redirect(reverse('table:display'))

    col_info = workflow.get_column_info()
    workflow.log(
        request.user,
        upload_data['log_upload'],
        column_names=col_info[0],
        column_types=col_info[1],
        column_unique=col_info[2])
    store_workflow_in_session(request.session, workflow)
    request.session.pop('upload_data', None)

    return redirect(reverse('table:display'))


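# perform_dataframe_upload_merge is defined elsewhere; conceptually, for a
# how_merge value of 'inner', 'outer', 'left' or 'right' the core step boils
# down to a pandas merge on the selected key column. A self-contained sketch
# of that step (the frames and key name below are made-up illustration data):
import pandas as pd

dst = pd.DataFrame({'sid': [1, 2, 3], 'grade': [70, 80, 90]})
src = pd.DataFrame({'sid': [2, 3, 4], 'email': ['b@x', 'c@x', 'd@x']})

merged = dst.merge(src, how='outer', on='sid')
# 'outer' keeps sids 1-4 (NaN where one side has no row);
# 'inner' would keep only the common sids 2 and 3.

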
def flush(self):
    """Flush all the data from the workflow and propagate changes.

    It removes relations with columns, conditions, filters, etc. The steps
    required are:

    1) Delete the data frame from the database

    2) Delete all the actions attached to the workflow (with their
       conditions)

    3) Delete all the views attached to the workflow

    4) Delete all the columns attached to the workflow

    :return: Nothing. Changes are reflected in the DB.
    """
    # Step 1: Delete the data frame from the database
    delete_table(self.get_data_frame_table_name())

    # Reset some of the workflow fields
    self.nrows = 0
    self.ncols = 0
    self.n_filterd_rows = -1
    self.data_frame_table_name = ''

    # Step 2: Delete the conditions attached to all the actions attached
    # to the workflow.
    for act in self.actions.all():
        act.conditions.all().delete()
        act.delete()

    # Step 3: Delete all the views attached to the workflow
    self.views.all().delete()

    # Step 4: Delete the column_names, column_types and column_unique
    self.columns.all().delete()
    self.set_query_builder_ops()

    # Save the workflow with the new fields.
    self.save()


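# flush() issues several dependent deletes (data table, actions, views,
# columns), so callers typically want all-or-nothing behaviour. A minimal
# usage sketch, assuming the workflow object has already been resolved;
# wrapping the call in Django's transaction.atomic is a suggestion for
# illustration, not necessarily what the project does:
from django.db import transaction

def flush_workflow(workflow):
    """Empty the workflow, rolling back the ORM changes on any error."""
    with transaction.atomic():
        workflow.flush()

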
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params contains:

    - aws_secret_access_key: Optional[str] = None
    - aws_session_token: Optional[str] = None
    - table_name: Optional[str] = None
    - merge_key: Optional[str] = None
    - merge_method: Optional[str] = None

    Example of the underlying pyathena access pattern:

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(
        aws_access_key_id=conn.aws_access_key,
        aws_secret_access_key=run_params['aws_secret_access_key'],
        aws_session_token=run_params['aws_session_token'],
        s3_staging_dir=staging_dir,
        region_name=conn.aws_region_name)

    # Use read_sql, as in the docstring example: read_sql_table requires a
    # SQLAlchemy connectable, which the pyathena connection is not.
    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']),
        cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame,
        workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]}

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])


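# pandas.detect_datetime_columns is project code not shown here. A minimal
# self-contained sketch of the behaviour the comment above describes (strip
# whitespace from text columns, then opportunistically parse them as
# datetimes), using only stock pandas; this is an assumption about the
# helper, not its actual implementation:
import pandas as pd

def _detect_datetime_columns_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Strip text columns and parse them as datetimes when possible."""
    for col in df.columns:
        if df[col].dtype != object:
            continue  # Only text-like columns are candidates
        df[col] = df[col].str.strip()
        try:
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError):
            pass  # Not a datetime column; keep the stripped strings
    return df

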
def store_workflow_table(
    workflow,
    update_info: Optional[Dict] = None,
):
    """Make a temporary DB table the workflow table.

    It is assumed that there is a temporary table already in the database.
    The function performs the following steps:

    Step 1: Drop the columns that are not being uploaded

    Step 2: Rename the columns (if needed)

    Step 3: Create the workflow columns

    Step 4: Rename the table (temporary to final)

    Step 5: Update workflow fields and save

    :param workflow: Workflow object being manipulated.

    :param update_info: Dictionary with the following fields:

        - initial_column_names: List of column names detected in read phase.
        - rename_column_names: List of new names for the columns.
        - column_types: List of types detected after storing in DB.
        - keep_key_column: List of booleans to flag if key property is kept.
        - columns_to_upload: List of booleans to flag column upload.

        initial_column_names, column_types and keep_key_column are
        mandatory; the rest have default values if not provided.

    :return: Nothing. Anomalies are raised as Exceptions
    """
    # Check information on update_info and complete if needed
    if not update_info or not update_info.get('initial_column_names'):
        raise Exception(_('Internal error while processing database.'))

    if not update_info.get('rename_column_names'):
        update_info['rename_column_names'] = update_info[
            'initial_column_names']

    if not update_info.get('column_types'):
        raise Exception(_('Internal error while processing database.'))

    if not update_info.get('keep_key_column'):
        raise Exception(_('Internal error while processing database.'))

    if not update_info.get('columns_to_upload'):
        update_info['columns_to_upload'] = [True] * len(
            update_info['initial_column_names'])

    db_table = workflow.get_upload_table_name()

    new_columns = []
    for old_n, new_n, data_type, is_key, upload in zip(
        update_info['initial_column_names'],
        update_info['rename_column_names'],
        update_info['column_types'],
        update_info['keep_key_column'],
        update_info['columns_to_upload'],
    ):
        # Detect if the column is new or already exists
        current_col = workflow.columns.filter(name=old_n).first()

        # Step 1: Check if column needs to be uploaded
        if not upload:
            # Column is dropped
            sql.df_drop_column(db_table, old_n)
            if current_col:
                # Dropping an existing column. Incorrect.
                raise Exception(_('Invalid column drop operation.'))
            continue

        # Step 2: Check if the column must be renamed
        if old_n != new_n:
            # Rename column from old_n to new_n
            sql.db_rename_column(db_table, old_n, new_n)
            if current_col:
                rename_df_column(workflow, old_n, new_n)

        if current_col:
            if current_col.data_type != data_type:
                # The column type in the DB is different from the one in
                # the object: update it.
                current_col.data_type = data_type
                current_col.save()
        else:
            # Step 3: Create the column
            new_columns.append((new_n, data_type, is_key))

    # Create the columns
    workflow.add_columns(new_columns)
    workflow.refresh_from_db()

    # Step 4: Rename the table (drop the original one first)
    if workflow.has_table():
        sql.delete_table(workflow.get_data_frame_table_name())
    sql.rename_table(db_table, workflow.get_data_frame_table_name())

    # Step 5: Update workflow fields and save
    workflow.nrows = sql.get_num_rows(workflow.get_data_frame_table_name())
    workflow.set_query_builder_ops()
    workflow.save(update_fields=['nrows', 'query_builder_ops'])


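# A minimal sketch of the update_info dictionary this function expects, with
# made-up column data (the type names 'integer', 'string' and 'double' are
# illustrative assumptions). Per the checks above, initial_column_names,
# column_types and keep_key_column must be present; the other two lists fall
# back to defaults when omitted:
update_info_example = {
    'initial_column_names': ['sid', 'email', 'grade'],
    'column_types': ['integer', 'string', 'double'],
    'keep_key_column': [True, False, False],
    # Optional: defaults to initial_column_names when omitted
    'rename_column_names': ['student_id', 'email', 'grade'],
    # Optional: defaults to uploading every column when omitted
    'columns_to_upload': [True, True, False],
}
# store_workflow_table(workflow, update_info_example)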