Code Example #1
File: signals.py Project: ubc/ontask_b
def delete_data_frame_table(sender, **kwargs):
    """Delete the data table when deleting the workflow."""
    del sender
    instance = kwargs.get('instance')
    if not instance:
        return

    if instance.has_table():
        sql.delete_table(instance.get_data_frame_table_name())
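
The handler in this example only runs once it is connected to a Django model signal. A minimal registration sketch, assuming a pre_delete hook on the Workflow model (the actual signal choice and module paths in the project may differ):

# apps.py (hypothetical wiring; module names are assumptions)
from django.db.models.signals import pre_delete

from ontask import models                 # assumed location of Workflow
from ontask.workflow import signals       # assumed location of the handler above

# Fire the handler every time a Workflow instance is about to be deleted.
pre_delete.connect(
    signals.delete_data_frame_table,
    sender=models.Workflow,
    dispatch_uid='delete_data_frame_table')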
Code Example #2
File: upload_steps.py Project: ubc/ontask_b
def upload_step_four(
    request: http.HttpRequest,
    workflow: models.Workflow,
    upload_data: Dict,
) -> http.HttpResponse:
    """Perform the merge operation.

    :param request: Received request
    :param workflow: Workflow being processed
    :param upload_data: Dictionary with all the information about the merge.
    :return: HttpResponse
    """
    # Get the dataframes to merge
    try:
        dst_df = pandas.load_table(workflow.get_data_frame_table_name())
        src_df = pandas.load_table(workflow.get_upload_table_name())
    except Exception:
        return render(request, 'error.html',
                      {'message': _('Exception while loading data frame')})

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_upload_table_name())
        col_info = workflow.get_column_info()
        workflow.log(request.user,
                     models.Log.WORKFLOW_DATA_FAILEDMERGE,
                     column_names=col_info[0],
                     column_types=col_info[1],
                     column_unique=col_info[2],
                     error_message=str(exc))
        messages.error(request, _('Merge operation failed. ') + str(exc))
        return redirect(reverse('table:display'))

    col_info = workflow.get_column_info()
    workflow.log(request.user,
                 upload_data['log_upload'],
                 column_names=col_info[0],
                 column_types=col_info[1],
                 column_unique=col_info[2])
    store_workflow_in_session(request.session, workflow)
    request.session.pop('upload_data', None)

    return redirect(reverse('table:display'))
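
For context, upload_step_four assumes the earlier wizard steps left an upload_data dictionary in the session. A rough sketch of its shape, inferred from the keys read in this example and in example #4 below (all concrete values are placeholders, not taken from the project):

# Hypothetical request.session['upload_data'] just before step four runs.
upload_data = {
    'initial_column_names': ['sid', 'email', 'grade'],
    'rename_column_names': ['sid', 'email', 'grade'],
    'columns_to_upload': [True, True, True],
    'src_is_key_column': [True, False, False],
    'keep_key_column': [True, False, False],
    'dst_column_names': ['sid', 'email', 'grade'],
    'dst_is_unique_column': [True, False, False],
    'dst_unique_col_names': ['sid'],
    'src_selected_key': 'sid',      # key column in the uploaded table
    'dst_selected_key': 'sid',      # matching key column in the workflow table
    'how_merge': 'outer',           # placeholder merge method
    'log_upload': '...',            # Log event code chosen by the earlier steps
}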
Code Example #3
File: workflow.py Project: ritotombe/ontask_b
    def flush(self):
        """Flush all the data from the workflow and propagate changes.

        It removes relations with columns, conditions, filters, etc. This
        requires the following steps:

        1) Delete the data frame from the database

        2) Delete all the actions attached to the workflow (with their
        conditions)

        3) Delete all the views attached to the workflow

        4) Delete all the columns attached to the workflow

        :return: Reflected in the DB
        """
        # Step 1: Delete the data frame from the database
        delete_table(self.get_data_frame_table_name())

        # Reset some of the workflow fields
        self.nrows = 0
        self.ncols = 0
        self.n_filterd_rows = -1
        self.data_frame_table_name = ''

        # Step 2: Delete the conditions attached to all the actions attached
        # to the workflow.
        for act in self.actions.all():
            act.conditions.all().delete()
            act.delete()

        # Step 3: Delete all the views attached to the workflow
        self.views.all().delete()

        # Step 4: Delete the column_names, column_types and column_unique
        self.columns.all().delete()
        self.set_query_builder_ops()

        # Save the workflow with the new fields.
        self.save()
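
A short usage sketch, assuming flush() is invoked from surrounding application code (the helper below is illustrative, not part of the project). Wrapping the call in a transaction keeps the model deletions and the table drop consistent on PostgreSQL, where DDL is transactional:

# Hypothetical helper (not part of the project) showing how flush() might be used.
from django.db import transaction

def reset_workflow(workflow):
    """Drop all data attached to the workflow inside one transaction."""
    with transaction.atomic():
        # Table drop and model deletions either all succeed or roll back together.
        workflow.flush()
    return workflow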
Code Example #4
File: dataframeupload.py Project: ubc/ontask_b
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    table_name: Optional[str] = None
    key_column_name: Optional[str] = None
    merge_method: Optional[str] = None

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(aws_access_key_id=conn.aws_access_key,
                     aws_secret_access_key=run_params['aws_secret_access_key'],
                     aws_session_token=run_params['aws_session_token'],
                     s3_staging_dir=staging_dir,
                     region_name=conn.aws_region_name)

    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']), cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame, workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]
    }

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]
    ]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
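
For reference, a sketch of the run_params dictionary this function reads, based solely on the keys accessed in the body above (all values are placeholders; credentials would normally come from the request, not literals):

# Hypothetical invocation with placeholder values.
run_params = {
    'aws_secret_access_key': '<secret>',   # per-run credential
    'aws_session_token': '<token>',        # optional STS session token
    'table_name': 'student_records',       # Athena table to load
    'merge_key': 'sid',                    # key column used for the merge branch
    'merge_method': 'outer',               # placeholder merge method
}
batch_load_df_from_athenaconnection(workflow, conn, run_params, log_item)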
Code Example #5
File: dataframe.py Project: ubc/ontask_b
def store_workflow_table(
    workflow,
    update_info: Optional[Dict] = None,
):
    """Make a temporary DB table the workflow table.

    It is assumed that there is a temporary table already in the database. The
    function performs the following steps:

    Step 1: Drop the columns that are not being uploaded

    Step 2: Rename the columns (if needed)

    Step 3: Create the workflow columns

    Step 4: Rename the table (temporary to final)

    Step 5: Update workflow fields and update

    :param workflow: Workflow object being manipulated.
    :param update_info: Dictionary with the following fields:
        - initial_column_names: list of column names detected in read phase.
        - rename_column_names: List of new names for the columns
        - column_types: List of types detected after storing in DB
        - keep_key_column: List of booleans to flag if key property is kept
        - columns_to_upload: List of booleans to flag column upload
        The first field is mandatory. The rest have default values if not provided.
    :return: Nothing. Anomalies are raised as Exceptions
    """
    # Check information on update_info and complete if needed
    if not update_info.get('initial_column_names'):
        raise Exception(_('Internal error while processing database.'))
    if not update_info.get('rename_column_names'):
        update_info['rename_column_names'] = update_info[
            'initial_column_names']
    if not update_info.get('column_types'):
        raise Exception(_('Internal error while processing database.'))
    if not update_info.get('keep_key_column'):
        raise Exception(_('Internal error while processing database.'))
    if not update_info.get('columns_to_upload'):
        update_info['columns_to_upload'] = [True] * len(
            update_info['initial_column_names'])

    db_table = workflow.get_upload_table_name()
    new_columns = []
    for old_n, new_n, data_type, is_key, upload in zip(
            update_info['initial_column_names'],
            update_info['rename_column_names'],
            update_info['column_types'],
            update_info['keep_key_column'],
            update_info['columns_to_upload'],
    ):
        # Detect if the column is new or already exists
        current_col = workflow.columns.filter(name=old_n).first()

        # Step 1: Check if column needs to be uploaded
        if not upload:
            # Column is dropped
            sql.df_drop_column(db_table, old_n)

            if current_col:
                # Dropping an existing column. Incorrect.
                raise Exception(_('Invalid column drop operation.'))
            continue

        # Step 2: Check if the column must be renamed
        if old_n != new_n:
            # Rename column from old_n to new_n
            sql.db_rename_column(db_table, old_n, new_n)

            if current_col:
                rename_df_column(workflow, old_n, new_n)

        if current_col:
            if current_col.data_type != data_type:
                # If the column type in the DB is different from the one in the
                # object, update
                current_col.data_type = data_type
                current_col.save()
        else:
            # Step 3: Create the column
            new_columns.append((new_n, data_type, is_key))

    # Create the columns
    workflow.add_columns(new_columns)
    workflow.refresh_from_db()

    # Step 4: Rename the table (drop the original one first)
    if workflow.has_table():
        sql.delete_table(workflow.get_data_frame_table_name())
    sql.rename_table(db_table, workflow.get_data_frame_table_name())

    # Step 5: Update workflow fields and save
    workflow.nrows = sql.get_num_rows(workflow.get_data_frame_table_name())
    workflow.set_query_builder_ops()
    workflow.save(update_fields=['nrows', 'query_builder_ops'])
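
For reference, a minimal update_info dictionary accepted by store_workflow_table, using the field names documented above (column names and type strings are placeholders):

# Hypothetical update_info for a three-column temporary table.
update_info = {
    'initial_column_names': ['sid', 'email', 'grade'],
    'rename_column_names': ['student_id', 'email', 'grade'],   # rename 'sid'
    'column_types': ['integer', 'string', 'double'],           # placeholder types
    'keep_key_column': [True, False, False],
    'columns_to_upload': [True, True, False],                  # drop 'grade'
}
store_workflow_table(workflow, update_info)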
Code Example #6
File: workflow.py Project: ritotombe/ontask_b
def delete_data_frame_table(sender, instance, **kwargs):
    """Delete the data frame table when the workflow is deleted."""
    if instance.has_table():
        delete_table(instance.get_data_frame_table_name())