Example #1
def load_df_from_csvfile(
    file_obj,
    skiprows: Optional[int] = 0,
    skipfooter: Optional[int] = 0,
) -> pd.DataFrame:
    """Load a data frame from a CSV file.

    Given a file object, try to read the content as a CSV file and transform
    into a data frame. The skiprows and skipfooter are number of lines to skip
    from the top and bottom of the file (see read_csv in pandas).

    It also tries to convert as many columns as possible to date/time format
    (testing the conversion on every string column).

    :param file_obj: File object to read the CSV content
    :param skiprows: Number of lines to skip at the top of the document
    :param skipfooter: Number of lines to skip at the bottom of the document
    :return: Resulting data frame, or an Exception.
    """
    data_frame = pd.read_csv(file_obj,
                             index_col=False,
                             infer_datetime_format=True,
                             quotechar='"',
                             skiprows=skiprows,
                             skipfooter=skipfooter,
                             encoding='utf-8')

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)
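
The closing comment describes what pandas.detect_datetime_columns (an OnTask helper module, not the pandas library itself) is expected to do: strip white space from string columns and attempt a date/time conversion on each of them. A minimal sketch of that pattern in plain pandas, offered as an illustration rather than the OnTask implementation, could look like this:

import pandas as pd


def detect_datetime_columns_sketch(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch: strip strings and try a datetime conversion per column."""
    for column in data_frame.columns:
        if data_frame[column].dtype != object:
            continue
        # Strip surrounding white space from string values, leave other values alone
        data_frame[column] = data_frame[column].map(
            lambda value: value.strip() if isinstance(value, str) else value)
        try:
            # The column is converted only if every value parses as a date/time;
            # otherwise the exception leaves it untouched
            data_frame[column] = pd.to_datetime(data_frame[column])
        except (ValueError, TypeError):
            pass
    return data_frame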
Example #2
    def test_table_pandas_create(self):
        # Create a second workflow
        response = self.client.post(
            reverse('workflow:api_workflows'),
            {'name': tests.wflow_name + '2', 'attributes': {'one': 'two'}},
            format='json')

        # Get the only workflow in the fixture
        workflow = models.Workflow.objects.get(id=response.data['id'])

        # Transform new table into a data frame
        r_df = pd.DataFrame(self.new_table)
        r_df = pandas.detect_datetime_columns(r_df)

        # Upload the table
        self.client.post(
            reverse('table:api_pops', kwargs={'wid': workflow.id}),
            {'data_frame': serializers.df_to_string(r_df)},
            format='json')

        # Refresh wflow (has been updated)
        workflow = models.Workflow.objects.get(id=workflow.id)

        # Load the df from the db
        dframe = pandas.load_table(workflow.get_data_frame_table_name())

        # Compare both elements
        self.compare_tables(r_df, dframe)
Example #3
def _load_df_from_sqlconnection(
    conn_item: models.SQLConnection,
    run_params: Dict,
) -> pd.DataFrame:
    """Load a DF from a SQL connection.

    :param conn_item: SQLConnection object with the connection parameters.
    :param run_params: Dictionary with the execution parameters.
    :return: Data frame or raise an exception.
    """
    if conn_item.db_password:
        password = conn_item.db_password
    else:
        password = run_params['db_password']

    if conn_item.db_table:
        table_name = conn_item.db_table
    else:
        table_name = run_params['db_table']

    db_engine = pandas.create_db_engine(conn_item.conn_type,
                                        conn_item.conn_driver,
                                        conn_item.db_user, password,
                                        conn_item.db_host, conn_item.db_name)

    # Try to fetch the data
    data_frame = pd.read_sql_table(table_name, db_engine)

    # Remove the engine
    db_engine.dispose()

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)
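
pandas.create_db_engine is another OnTask helper whose body is not shown in this excerpt. Assuming it wraps SQLAlchemy, an equivalent engine could be built directly from the same parameters; the dialect string below is an illustrative guess, not the helper's actual logic:

from sqlalchemy import create_engine


def create_db_engine_sketch(conn_type, conn_driver, user, password, host, name):
    """Illustrative only: build a SQLAlchemy engine from connection parameters."""
    # e.g. conn_type='postgresql', conn_driver='psycopg2' -> 'postgresql+psycopg2'
    dialect = conn_type + ('+' + conn_driver if conn_driver else '')
    url = '{0}://{1}:{2}@{3}/{4}'.format(dialect, user, password, host, name)
    return create_engine(url, echo=False)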
Example #4
    def test_table_pandas_update(self):
        # Get the only workflow in the fixture
        workflow = Workflow.objects.all()[0]

        # Transform the new table into a data frame
        r_df = pd.DataFrame(self.new_table)
        r_df = detect_datetime_columns(r_df)

        # Upload a new table
        response = self.client.put(
            reverse(
                'table:api_pops',
                kwargs={'wid': workflow.id}),
            {'data_frame': df_to_string(r_df)},
            format='json')

        # Refresh wflow (has been updated)
        workflow = Workflow.objects.get(id=workflow.id)

        # Load the df from the db
        df = load_table(workflow.get_data_frame_table_name())

        # Compare both elements
        self.compare_tables(r_df, df)

        # Check that the rest of the information is correct
        workflow = Workflow.objects.get(id=workflow.id)
        self.assertTrue(check_wf_df(workflow))
Example #5
    def test_table_json_create(self):
        # Create a second workflow
        response = self.client.post(
            reverse('workflow:api_workflows'),
            {'name': test.wflow_name + '2', 'attributes': {'one': 'two'}},
            format='json')

        # Get the only workflow in the fixture
        workflow = Workflow.objects.get(id=response.data['id'])

        # Upload the table
        response = self.client.post(
            reverse('table:api_ops', kwargs={'wid': workflow.id}),
            {'data_frame': self.new_table},
            format='json')

        # Refresh wflow (has been updated)
        workflow = Workflow.objects.get(id=workflow.id)

        # Load the df from the db
        df = load_table(workflow.get_data_frame_table_name())
        # Transform new table into data frame
        r_df = pd.DataFrame(self.new_table)
        r_df = detect_datetime_columns(r_df)

        # Compare both elements
        self.compare_tables(r_df, df)

        # Check that the rest of the information is correct
        self.assertTrue(check_wf_df(workflow))
Example #6
    def to_internal_value(self, data):
        """Create the data frame from the given data detecting date/time."""
        try:
            df = pd.DataFrame(data)
            # Detect date/time columns
            df = detect_datetime_columns(df)
        except Exception as exc:
            raise serializers.ValidationError(exc)

        return df
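
This to_internal_value override belongs to a Django REST framework serializer field. A brief usage sketch, with a hypothetical field class name, shows the intended round trip from a JSON payload to a typed data frame:

# Hypothetical usage; DataFramePandasField is an assumed name for the field
# class that defines the to_internal_value method above.
rows = [
    {'name': 'ann', 'registered': '2020-01-01 10:00:00'},
    {'name': 'bob', 'registered': '2020-01-02 11:30:00'}]
field = DataFramePandasField()
df = field.to_internal_value(rows)
print(df.dtypes)  # 'registered' should be detected as datetime64[ns]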
Example #7
def load_df_from_s3(
    aws_key: str,
    aws_secret: str,
    bucket_name: str,
    file_path: str,
    skiprows: Optional[int] = 0,
    skipfooter: Optional[int] = 0,
) -> pd.DataFrame:
    """Load data from a S3 bucket.

    Given a file object, try to read the content and transform it into a data
    frame.

    It also tries to convert as many columns as possible to date/time format
    (testing the conversion on every string column).

    :param aws_key: Key to access the S3 bucket
    :param aws_secret: Secret to access the S3 bucket
    :param bucket_name: Bucket name
    :param file_path: Path to access the file within the bucket
    :param skiprows: Number of lines to skip at the top of the document
    :param skipfooter: Number of lines to skip at the bottom of the document
    :return: Resulting data frame, or an Exception.
    """
    path_prefix = ''
    if aws_key and aws_secret:
        # If key/secret are given, create prefix
        path_prefix = '{0}:{1}@'.format(aws_key, aws_secret)

    if settings.ONTASK_TESTING:
        uri = 'file:///{0}/{1}'.format(bucket_name, file_path)
    else:
        uri = 's3://{0}{1}/{2}'.format(path_prefix, bucket_name, file_path)
    data_frame = pd.read_csv(
        smart_open.open(uri),
        index_col=False,
        infer_datetime_format=True,
        quotechar='"',
        skiprows=skiprows,
        skipfooter=skipfooter,
        encoding='utf-8',
    )

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)
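
A usage sketch for the S3 loader; every argument value below is a placeholder, and when key and secret are empty the testing branch reads from the local file system instead:

# Hypothetical call; credentials, bucket and path are placeholders.
df = load_df_from_s3(
    aws_key='AKIA_PLACEHOLDER',
    aws_secret='SECRET_PLACEHOLDER',
    bucket_name='ontask-uploads',
    file_path='exports/grades.csv',
    skiprows=1)
print(df.shape)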
Example #8
    def test_table_JSON_get(self):
        # Get the only workflow in the fixture
        workflow = models.Workflow.objects.all()[0]

        # Get the data through the API
        response = self.client.get(
            reverse('table:api_ops', kwargs={'wid': workflow.id}))

        # Transform the response into a data frame
        r_df = pd.DataFrame(response.data['data_frame'])
        r_df = pandas.detect_datetime_columns(r_df)

        # Load the df from the db
        dframe = pandas.load_table(workflow.get_data_frame_table_name())

        # Compare both elements
        self.compare_tables(r_df, dframe)
Example #9
    def test_table_pandas_JSON_get(self):
        # Get the only workflow in the fixture
        workflow = models.Workflow.objects.all()[0]

        # Get the data through the API
        response = self.client.get(
            reverse('table:api_merge', kwargs={'wid': workflow.id}))

        workflow = models.Workflow.objects.all()[0]

        # Transform the returned source table into a data frame
        r_df = pd.DataFrame(response.data['src_df'])
        r_df = pandas.detect_datetime_columns(r_df)

        # Load the df from the db
        dframe = pandas.load_table(workflow.get_data_frame_table_name())

        # Compare both elements and check wf df consistency
        self.compare_tables(r_df, dframe)
Example #10
    def test_table_pandas_update(self):
        # Get the only workflow in the fixture
        workflow = models.Workflow.objects.all()[0]

        # Transform the new table into a data frame
        r_df = pd.DataFrame(self.new_table)
        r_df = pandas.detect_datetime_columns(r_df)

        # Upload a new table
        self.client.put(reverse('table:api_pops', kwargs={'wid': workflow.id}),
                        {'data_frame': serializers.df_to_string(r_df)},
                        format='json')

        # Refresh wflow (has been updated)
        workflow = models.Workflow.objects.get(id=workflow.id)

        # Load the df from the db
        dframe = pandas.load_table(workflow.get_data_frame_table_name())

        # Compare both elements
        self.compare_tables(r_df, dframe)
Example #11
def load_df_from_excelfile(file_obj, sheet_name: str) -> pd.DataFrame:
    """Load a data frame from a sheet in an excel file.

    Given a file object, try to read the content as a Excel file and transform
    into a data frame. The sheet_name is the name of the sheet to read.

    It also tries to convert as many columns as possible to date/time format
    (testing the conversion on every string column).

    :param file_obj: File object to read the CSV content
    :param sheet_name: Sheet in the file to read
    :return: Resulting data frame, or an Exception.
    """
    data_frame = pd.read_excel(file_obj,
                               sheet_name=sheet_name,
                               index_col=False,
                               infer_datetime_format=True,
                               quotechar='"')

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)
Example #12
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    table_name: Optional[str] = None
    key_column_name: Optional[str] = None
    merge_method: Optional[str] = None

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)

    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(aws_access_key_id=conn.aws_access_key,
                     aws_secret_access_key=run_params['aws_secret_access_key'],
                     aws_session_token=run_params['aws_session_token'],
                     s3_staging_dir=staging_dir,
                     region_name=conn.aws_region_name)

    # Fetch the data with a plain SELECT; read_sql_table requires a SQLAlchemy
    # connectable, which the pyathena connection is not
    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']), cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame, workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]
    }

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]
    ]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
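
pandas.perform_dataframe_upload_merge is not shown in this excerpt. Conceptually, the fields gathered in upload_data drive a keyed merge between the existing table and the uploaded one; a rough sketch of that core step, under that assumption, is:

from typing import Dict

import pandas as pd


def merge_sketch(
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    upload_data: Dict,
) -> pd.DataFrame:
    """Illustrative sketch of the keyed merge driven by upload_data."""
    return pd.merge(
        dst_df,
        src_df,
        how=upload_data['how_merge'],            # e.g. 'inner', 'outer', 'left'
        left_on=upload_data['dst_selected_key'],
        right_on=upload_data['src_selected_key'])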