Example #1
    def test_create_partitions(self, mock_session_helper, mock_execute):

        mock_execute.return_value = MockScopeObj()
        mock_session_helper.db_session_scope.return_value.__enter__ = scope_execute_mock

        table_name = "my_table"
        schema_name = "my_schema"
        bucket = "test"
        partitions = ["version", "time"]
        filepath = "something_overgeneric/dataset/version=v2_final_new/time=01-01-69  23:59:07/keysmash.parquet"
        sql_partitions = "(version='v2_final_new', time='01-01-69  23:59:07')"
        path_for_sql = "'s3://test/something_overgeneric/dataset/version=v2_final_new'"
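        # The fixture filepath encodes two partitions (version and time); sql_partitions
        # and path_for_sql are the partition spec and dataset-root location expected in the SQL.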

        expected_sql = f"ALTER TABLE {schema_name}.{table_name} \
            ADD IF NOT EXISTS PARTITION {sql_partitions} \
            LOCATION {path_for_sql};"
        with mock_session_helper.db_session_scope() as mock_scope:
            publish_redshift.create_partitions(
                bucket, schema_name, table_name, filepath, mock_session_helper)
            mock_scope.execute.assert_called_once_with(expected_sql)
Example #2
    def test_create_partitions(self, mock_session_helper, mock_execute):
        bucket, schema, table, filepath = 'MyBucket', 'MySchema', 'MyTable', 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
        mock_execute.return_value = MockScopeObj()
        mock_session_helper.db_session_scope.return_value.__enter__ = scope_execute_mock

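        # Derive the expected partition spec and dataset location using the
        # module's own partition helpers.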
        partitions = parq._get_partitions_for_spectrum(filepath)
        formatted_partitions = parq._format_partition_strings_for_sql(
            partitions)
        path_to_data = parq._get_partition_location(filepath)

        with mock_session_helper.db_session_scope() as mock_scope:
            generated_sql = parq.create_partitions(bucket, schema, table,
                                                   filepath,
                                                   mock_session_helper)
            expected_sql = f"ALTER TABLE {schema}.{table} \
            ADD PARTITION ({' ,'.join(formatted_partitions)}) \
            LOCATION 's3://{bucket}/{path_to_data}';"

            mock_scope.execute.assert_called_once_with(expected_sql)
Example #3
def _assign_partition_meta(bucket: str,
                           key: str,
                           dataframe: pd.DataFrame,
                           partitions: List['str'],
                           session_helper: SessionHelper,
                           redshift_params=None) -> List[str]:
    """ assigns the dataset partition meta to all keys in the dataset"""
    s3_client = boto3.client('s3')
    all_files_without_meta = []
    paginator = s3_client.get_paginator('list_objects')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=key)
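    # Walk every object under the prefix; only .parquet files that do not yet
    # have partition_data_types metadata need updating (and, when Spectrum is
    # used, registering as partitions).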
    for page in page_iterator:
        for obj in page['Contents']:
            if obj['Key'].endswith(".parquet"):
                head_obj = s3_client.head_object(Bucket=bucket, Key=obj['Key'])
                if 'partition_data_types' not in head_obj['Metadata']:
                    all_files_without_meta.append(obj['Key'])
                    if redshift_params and partitions:
                        sql_command = publish_redshift.create_partitions(
                            bucket, redshift_params['schema_name'],
                            redshift_params['table_name'], obj['Key'],
                            session_helper)

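    # S3 object metadata cannot be edited in place, so copy each object onto itself
    # with MetadataDirective='REPLACE' to attach the partition_data_types metadata.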
    for obj in all_files_without_meta:
        logger.debug(f"Appending metadata to file {obj}..")
        s3_client.copy_object(Bucket=bucket,
                              CopySource={
                                  'Bucket': bucket,
                                  'Key': obj
                              },
                              Key=obj,
                              Metadata={
                                  'partition_data_types':
                                  str(
                                      _parse_dataframe_col_types(
                                          dataframe=dataframe,
                                          partitions=partitions))
                              },
                              MetadataDirective='REPLACE')
        logger.debug("Done appending metadata.")
    return all_files_without_meta
Example #4
def _assign_partition_meta(bucket: str,
                           key: str,
                           dataframe: pd.DataFrame,
                           partitions: List['str'],
                           session_helper: SessionHelper,
                           redshift_params=None,
                           custom_redshift_columns: dict = None) -> List[str]:
    """ Assigns the dataset partition meta to all object keys in the dataset.
    Keys are found by listing all files under the given key and then filtering
    to only those that end in '.parquet' and then further filtering to those
    that do not already have partition_data_types metadata.

    Args:
        bucket (str): S3 bucket to publish to
        key (str): S3 key to the root of where the dataset is published
        dataframe (pd.DataFrame): Dataframe that has been published
        partitions (list): List of partition columns
        session_helper (SessionHelper): Current session, if not using Spectrum this should be None
        redshift_params (dict, Optional):
            Dictionary of Spectrum parameters, formatted as follows:

                - schema_name (str): Name of the Spectrum schema to publish to
                - table_name (str): Name of the table to write the dataset as
                - iam_role (str): Role to take while writing data to Spectrum
                - region (str): AWS region for Spectrum
                - cluster_id (str): Spectrum cluster id
                - host (str): Redshift Spectrum host name
                - port (str): Redshift Spectrum port to use
                - db_name (str): Redshift Spectrum database name to use
                - ec2_user (str): If on ec2, the user that should be used
        custom_redshift_columns (dict, Optional):
            Dictionary of custom column data type definitions for Redshift,
            formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        A list of object keys (str) for the objects that had metadata added
    """
    s3_client = boto3.client('s3')
    all_files_without_meta = []
    paginator = s3_client.get_paginator('list_objects')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=key)
    for page in page_iterator:
        for obj in page['Contents']:
            if obj['Key'].endswith(".parquet"):
                head_obj = s3_client.head_object(Bucket=bucket, Key=obj['Key'])
                if 'partition_data_types' not in head_obj['Metadata']:
                    all_files_without_meta.append(obj['Key'])
                    if redshift_params and partitions:
                        sql_command = publish_redshift.create_partitions(
                            bucket, redshift_params['schema_name'],
                            redshift_params['table_name'], obj['Key'],
                            session_helper)

    for obj in all_files_without_meta:
        logger.debug(f"Appending metadata to file {obj}..")
        s3_client.copy_object(
            Bucket=bucket,
            CopySource={
                'Bucket': bucket,
                'Key': obj
            },
            Key=obj,
            Metadata={
                'partition_data_types':
                str(
                    _parse_dataframe_col_types(
                        dataframe=dataframe,
                        partitions=partitions,
                        custom_redshift_columns=custom_redshift_columns))
            },
            MetadataDirective='REPLACE')
        logger.debug("Done appending metadata.")
    return all_files_without_meta
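
A rough usage sketch for this version of _assign_partition_meta (every literal value below is invented for illustration, and session_helper is assumed to be an already-constructed SessionHelper):

# Hypothetical call after publishing a dataset partitioned by apple and banana.
# All names and values here are illustrative, not taken from the examples above.
import pandas as pd

df = pd.DataFrame({'apple': ['abcd'], 'banana': [1234], 'value': [1.0]})

redshift_params = {
    'schema_name': 'my_spectrum_schema',
    'table_name': 'my_table',
    'iam_role': 'arn:aws:iam::123456789012:role/spectrum-role',
    'region': 'us-east-1',
    'cluster_id': 'my-cluster',
    'host': 'my-cluster.example.redshift.amazonaws.com',
    'port': '5439',
    'db_name': 'analytics',
    'ec2_user': 'ec2-user',
}

updated_keys = _assign_partition_meta(
    bucket='my-bucket',
    key='path/to/dataset',
    dataframe=df,
    partitions=['apple', 'banana'],
    session_helper=session_helper,  # assumed to exist; per the docstring, None if Spectrum is not used
    redshift_params=redshift_params)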