def upload_step_two(
    request: http.HttpRequest,
    workflow: models.Workflow,
    select_column_data: Dict,
    upload_data: Dict,
) -> http.HttpResponse:
    """Process the received dataframe and either store or continue merge.

    :param request: Http request received.
    :param workflow: Workflow being processed.
    :param select_column_data: Dictionary with the upload/merge information
        (new column names, keep the keys)
    :param upload_data: Dictionary with the upload information.
    :return: Http Response
    """
    # We need to modify upload_data with the information received in the post
    initial_columns = upload_data.get('initial_column_names')
    src_is_key_column = upload_data.get('src_is_key_column')
    keep_key_column = upload_data.get('keep_key_column')
    for idx in range(len(initial_columns)):
        new_name = select_column_data['new_name_%s' % idx]
        upload_data['rename_column_names'][idx] = new_name

        upload = select_column_data['upload_%s' % idx]
        upload_data['columns_to_upload'][idx] = upload

        if src_is_key_column[idx]:
            # If the column is key, check if the user wants to keep it
            keep_key_column[idx] = select_column_data['make_key_%s' % idx]

    if workflow.has_data_frame():
        # A merge operation is required. Update the dictionary with the
        # session information so that it is available in the next step
        request.session['upload_data'] = upload_data

        # This is a merge operation, so move to Step 3
        return redirect('dataops:upload_s3')

    # This is the first data to be stored in the workflow. Save the uploaded
    # dataframe in the DB and finish.
    pandas.store_workflow_table(workflow, upload_data)

    # Update the session information
    store_workflow_in_session(request.session, workflow)

    col_info = workflow.get_column_info()
    workflow.log(
        request.user,
        upload_data['log_upload'],
        column_names=col_info[0],
        column_types=col_info[1],
        column_unique=col_info[2])

    return redirect(reverse('table:display'))
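
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: it shows how the
# Step-2 form fields ('new_name_<idx>', 'upload_<idx>', 'make_key_<idx>') are
# expected to map onto the session 'upload_data' dictionary consumed by
# upload_step_two. All column names and values below are hypothetical.
def _example_step_two_mapping() -> dict:
    """Return an upload_data dict updated from sample Step-2 form data."""
    upload_data = {
        'initial_column_names': ['sid', 'score'],
        'src_is_key_column': [True, False],
        'keep_key_column': [True, False],
        'rename_column_names': ['sid', 'score'],
        'columns_to_upload': [True, True]}
    select_column_data = {
        'new_name_0': 'student_id', 'upload_0': True, 'make_key_0': True,
        'new_name_1': 'final_score', 'upload_1': True}

    # Same per-column loop as in upload_step_two
    for idx in range(len(upload_data['initial_column_names'])):
        upload_data['rename_column_names'][idx] = select_column_data[
            'new_name_%s' % idx]
        upload_data['columns_to_upload'][idx] = select_column_data[
            'upload_%s' % idx]
        if upload_data['src_is_key_column'][idx]:
            upload_data['keep_key_column'][idx] = select_column_data[
                'make_key_%s' % idx]
    return upload_data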
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    table_name: Optional[str] = None
    key_column_name: Optional[str] = None
    merge_method: Optional[str] = None

    Example of the underlying pyathena/pandas access pattern:

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor
    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(
        aws_access_key_id=conn.aws_access_key,
        aws_secret_access_key=run_params['aws_secret_access_key'],
        aws_session_token=run_params['aws_session_token'],
        s3_staging_dir=staging_dir,
        region_name=conn.aws_region_name)

    data_frame = pd.read_sql_table(run_params['table_name'], cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame,
        workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]}

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
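
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: it shows the
# run_params dictionary a caller would assemble before invoking
# batch_load_df_from_athenaconnection. Only keys actually read by the function
# are included; the credential strings, table name and key column are
# hypothetical, and workflow/conn/log_item would normally come from the
# scheduler or view layer.
def _example_athena_batch_load(workflow, conn, log_item):
    """Invoke the Athena batch load with a sample run_params dictionary."""
    run_params = {
        'aws_secret_access_key': 'SECRET',   # hypothetical credential
        'aws_session_token': None,
        'table_name': 'student_results',     # Athena table to read
        'merge_key': 'sid',                  # key column used on both sides
        'merge_method': 'outer'}             # passed through as how_merge
    batch_load_df_from_athenaconnection(workflow, conn, run_params, log_item)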