Example #1
def sql_upload_step_one(
    request: http.HttpRequest,
    workflow: models.Workflow,
    conn: models.SQLConnection,
    run_params: Dict,
):
    """Perform the first step to load a data frame from a SQL connection.

    :param request: Request received.
    :param workflow: Workflow being processed.
    :param conn: Database connection object.
    :param run_params: Dictionary with the additional run parameters.
    :return: Nothing; stores a temporary data frame and creates the
        upload_data dictionary in the session
    """
    # Process SQL connection using pandas
    data_frame = _load_df_from_sqlconnection(conn, run_params)
    # Verify the data frame
    pandas.verify_data_frame(data_frame)

    # Store the data frame in the DB.
    # Get frame info with three lists: names, types and is_key
    frame_info = pandas.store_temporary_dataframe(data_frame, workflow)

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': reverse('dataops:sqlupload_start', kwargs={'pk': conn.id}),
        'log_upload': models.Log.WORKFLOW_DATA_SQL_UPLOAD
    }
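
The dictionary written to request.session['upload_data'] is what the later
upload steps read back. Below is a minimal sketch of a hypothetical second
step that retrieves it; the view name, template and fallback redirect are
assumptions, not part of the snippet above.

from django import http
from django.shortcuts import redirect, render


def sql_upload_step_two(request: http.HttpRequest) -> http.HttpResponse:
    """Hypothetical step two: read back what step one left in the session."""
    upload_data = request.session.get('upload_data')
    if not upload_data:
        # Step one never ran (or the session expired): go back to the index.
        return redirect('dataops:sqlconns_instructor_index')

    # Column metadata produced by store_temporary_dataframe in step one
    context = {
        'column_names': upload_data['initial_column_names'],
        'column_types': upload_data['column_types'],
        'is_key': upload_data['src_is_key_column'],
        'prev_step': upload_data['step_1'],
    }
    return render(request, 'dataops/upload_s2.html', context)  # assumed template
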
Example #2
    def validate_data_frame(self):
        """Check that the dataframe can be properly stored.

        :return: Nothing; form errors are added when the data frame cannot
            be verified or stored
        """
        try:
            # Verify the data frame
            pandas.verify_data_frame(self.data_frame)
        except OnTaskDataFrameNoKey as exc:
            self.add_error(None, str(exc))
            return

        # Store the data frame in the DB.
        try:
            # Get frame info with three lists: names, types and is_key
            self.frame_info = pandas.store_temporary_dataframe(
                self.data_frame, self.workflow)
        except Exception as exc:
            self.add_error(None,
                           _('Unable to process file ({0}).').format(str(exc)))
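
Example #2 reports problems through form errors rather than raising, so the
natural place to call it is from the form's clean() once the data frame has
been built. The sketch below shows that wiring under the assumption of a form
class and field layout that are not shown above; only validate_data_frame
itself comes from the example.

from django import forms


class UploadDataFrameForm(forms.Form):
    """Hypothetical form that builds self.data_frame and then validates it."""

    data_file = forms.FileField()

    def __init__(self, *args, workflow=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.workflow = workflow
        self.data_frame = None
        self.frame_info = None

    # validate_data_frame (as in Example #2) would be defined here or mixed in.

    def clean(self):
        cleaned_data = super().clean()
        # Building self.data_frame from cleaned_data['data_file'] is omitted;
        # once it exists, the shared validation step is invoked.
        if self.data_frame is not None:
            self.validate_data_frame()
        return cleaned_data
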
Example #3
    def override(
        self,
        request: http.HttpRequest,
        wid: int,
        format=None,
        workflow: Optional[models.Workflow] = None,
    ) -> http.HttpResponse:
        """Override the content in the workflow.

        :param request: Received request object
        :param wid: Workflow ID
        :param format: format for the response
        :param workflow: Workflow being manipulated (set by decorator)
        """
        del wid, format
        # Validate the data received with the request
        serializer = self.serializer_class(data=request.data)
        if not serializer.is_valid():
            # Flag the error
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        # Data received is a correct data frame.
        df = serializer.validated_data['data_frame']

        try:
            # Verify the data frame
            pandas.verify_data_frame(df)
            # Store the content in the db and...
            pandas.store_dataframe(df, workflow)
        except OnTaskDataFrameNoKey as exc:
            return Response(str(exc), status=status.HTTP_400_BAD_REQUEST)

        # Update all the counters in the conditions
        for action in workflow.actions.all():
            action.update_n_rows_selected()

        return Response(None, status=status.HTTP_201_CREATED)
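
The override handler expects the serialized data frame in the request body and
answers 201 on success or 400 when the frame has no key column. Below is a
sketch of a client call with the requests library, assuming the handler is
reachable via PUT; the host, path, token scheme and payload shape are
assumptions based only on the serializer field name above.

import requests

API_URL = 'https://ontask.example.com/workflow/12/data/'  # assumed endpoint
TOKEN = 'replace-with-a-real-token'                        # assumed auth scheme

payload = {
    'data_frame': {
        'email': ['student1@example.com', 'student2@example.com'],
        'score': [3.5, 4.0],
    },
}

response = requests.put(
    API_URL,
    json=payload,
    headers={'Authorization': 'Token {0}'.format(TOKEN)})

# 201 on success; 400 with an explanation if the data frame has no key column
print(response.status_code, response.text)
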
Example #4
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    table_name: Optional[str] = None
    merge_key: Optional[str] = None
    merge_method: Optional[str] = None

    Reference access pattern using pyathena:

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(aws_access_key_id=conn.aws_access_key,
                     aws_secret_access_key=run_params['aws_secret_access_key'],
                     aws_session_token=run_params['aws_session_token'],
                     s3_staging_dir=staging_dir,
                     region_name=conn.aws_region_name)

    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']), cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame, workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]
    }

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]
    ]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
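
Calling Example #4 requires a run_params dictionary with the keys the function
actually reads. The sketch below shows a hypothetical caller; the table name,
key column and merge method are placeholders, and how workflow, conn and
log_item are obtained is not covered above.

def load_from_athena(workflow, conn, log_item, secret_key, session_token):
    """Hypothetical wrapper assembling run_params for the batch load."""
    run_params = {
        'aws_secret_access_key': secret_key,
        'aws_session_token': session_token,
        'table_name': 'student_results',  # placeholder table name
        'merge_key': 'email',             # placeholder key column
        'merge_method': 'outer',          # placeholder merge method
    }
    batch_load_df_from_athenaconnection(workflow, conn, run_params, log_item)
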
Example #5
def sqlupload_start(
    request: HttpRequest,
    pk: int,
    workflow: Optional[Workflow] = None,
) -> HttpResponse:
    """Load a data frame using a SQL connection.

    The four-step process populates a dictionary named upload_data (split by
    the step in which each field is set):

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :param pk: Primary key of the SQL connection used
    :param workflow: Workflow being processed (set by the decorator)
    :return: Creates the upload_data dictionary in the session and renders
        the first step page, or redirects to the second step
    """
    conn = SQLConnection.objects.filter(
        pk=pk
    ).filter(enabled=True).first()
    if not conn:
        return redirect('dataops:sqlconns_instructor_index')

    form = None
    missing_field = conn.has_missing_fields()
    if missing_field:
        # The connection needs a password to operate
        form = SQLRequestConnectionParam(request.POST or None, instance=conn)

    context = {
        'form': form,
        'wid': workflow.id,
        'dtype': 'SQL',
        'dtype_select': _('SQL connection'),
        'valuerange': range(5) if workflow.has_table() else range(3),
        'prev_step': reverse('dataops:sqlconns_instructor_index'),
        'conn_type': conn.conn_type,
        'conn_driver': conn.conn_driver,
        'db_user': conn.db_user,
        'db_passwd': _('<PROTECTED>') if conn.db_password else '',
        'db_host': conn.db_host,
        'db_port': conn.db_port,
        'db_name': conn.db_name,
        'db_table': conn.db_table}

    if request.method == 'POST' and (not missing_field or form.is_valid()):
        run_params = conn.get_missing_fields(form.cleaned_data)

        # Process SQL connection using pandas
        try:
            data_frame = load_df_from_sqlconnection(conn, run_params)
            # Verify the data frame
            verify_data_frame(data_frame)
        except OnTaskDataFrameNoKey as exc:
            messages.error(request, str(exc))
            return render(request, 'dataops/sqlupload_start.html', context)
        except Exception as exc:
            messages.error(
                request,
                _('Unable to obtain data: {0}').format(str(exc)))
            return render(request, 'dataops/sqlupload_start.html', context)

        # Store the data frame in the DB.
        try:
            # Get frame info with three lists: names, types and is_key
            frame_info = store_temporary_dataframe(
                data_frame,
                workflow)
        except Exception:
            # The form may be None at this point, so report through messages
            messages.error(
                request,
                _('The data from this connection cannot be processed.'))
            return render(request, 'dataops/sqlupload_start.html', context)

        # Dictionary to populate gradually throughout the sequence of steps. It
        # is stored in the session.
        request.session['upload_data'] = {
            'initial_column_names': frame_info[0],
            'column_types': frame_info[1],
            'src_is_key_column': frame_info[2],
            'step_1': reverse(
                'dataops:sqlupload_start',
                kwargs={'pk': conn.id}),
        }

        return redirect('dataops:upload_s2')

    return render(request, 'dataops/sqlupload_start.html', context)
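
Since sqlupload_start only writes the step-1 entries into the session and then
redirects to dataops:upload_s2, a quick way to exercise it is Django's test
client. The sketch below assumes a fixture that creates an instructor account,
a workflow and an enabled SQLConnection; the fixture name and credentials are
placeholders.

from django.test import TestCase
from django.urls import reverse


class SQLUploadStartTest(TestCase):
    fixtures = ['simple_workflow']  # placeholder fixture

    def test_step_one_populates_session(self):
        # Placeholder credentials for a user created by the fixture
        self.client.login(username='instructor01', password='boguspwd')

        url = reverse('dataops:sqlupload_start', kwargs={'pk': 1})
        response = self.client.post(url, data={})

        # On success the view redirects to the second upload step
        self.assertEqual(response.status_code, 302)

        upload_data = self.client.session.get('upload_data')
        self.assertIsNotNone(upload_data)
        self.assertIn('initial_column_names', upload_data)
        self.assertIn('src_is_key_column', upload_data)
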