Example #1
0
def upload_step_two(
    request: http.HttpRequest,
    workflow: models.Workflow,
    select_column_data: Dict,
    upload_data: Dict,
) -> http.HttpResponse:
    """Apply the column-selection answers and either store or continue merge.

    :param request: Http request received.
    :param workflow: workflow being processed.
    :param select_column_data: Dictionary with the upload/merge information
    (new column names, keep the keys)
    :param upload_data: Dictionary with the upload information.
    :return: Http Response
    """
    # Fold the answers received in the POST back into upload_data.
    initial_columns = upload_data.get('initial_column_names')
    src_is_key_column = upload_data.get('src_is_key_column')
    keep_key_column = upload_data.get('keep_key_column')
    for idx in range(len(initial_columns)):
        upload_data['rename_column_names'][idx] = select_column_data[
            'new_name_%s' % idx]
        upload_data['columns_to_upload'][idx] = select_column_data[
            'upload_%s' % idx]

        # Key columns may optionally be kept as keys after the upload.
        if src_is_key_column[idx]:
            keep_key_column[idx] = select_column_data['make_key_%s' % idx]

    if not workflow.has_data_frame():
        # First data to be stored in this workflow: persist the uploaded
        # dataframe in the DB, refresh the session, log, and finish.
        pandas.store_workflow_table(workflow, upload_data)

        store_workflow_in_session(request.session, workflow)

        col_info = workflow.get_column_info()
        workflow.log(
            request.user,
            upload_data['log_upload'],
            column_names=col_info[0],
            column_types=col_info[1],
            column_unique=col_info[2])

        return redirect(reverse('table:display'))

    # The workflow already has data, so a merge operation is required.
    # Keep the collected information in the session so it is available in
    # the next step, then move on to Step 3.
    request.session['upload_data'] = upload_data
    return redirect('dataops:upload_s3')
Example #2
0
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params is expected to contain:
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    table_name: Optional[str] = None
    merge_key[str] = None
    merge_method[str] = None

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    # Build the S3 staging location from the bucket and optional path.
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(aws_access_key_id=conn.aws_access_key,
                     aws_secret_access_key=run_params['aws_secret_access_key'],
                     aws_session_token=run_params['aws_session_token'],
                     s3_staging_dir=staging_dir,
                     region_name=conn.aws_region_name)

    data_frame = pd.read_sql_table(run_params['table_name'], cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame, workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]
    }

    if not workflow.has_data_frame():
        # Regular load operation: store the frame and record the outcome.
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation: gather destination-side column information.
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    # BUG FIX: filter by the unique flag, not by the truthiness of the
    # column name (a non-empty name is always truthy, so the old condition
    # marked every column as unique).
    upload_data['dst_unique_col_names'] = [
        cname for cname, col_is_unique in zip(
            upload_data['dst_column_names'],
            upload_data['dst_is_unique_column'])
        if col_is_unique
    ]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table before reporting the failure; chain the
        # original exception so the root cause is preserved.
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(
                str(exc))) from exc

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])