Code example #1
    def test_create_custom_table_without_partitions(self, mock_session_helper,
                                                    mock_execute):

        custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()[1]

        mock_execute.return_value = MockScopeObj()
        mock_session_helper.db_session_scope.return_value.__enter__ = scope_execute_mock

        table_name = "my_string"
        schema_name = "my_schema"
        path = "s3://lol"
        columns = {
            'colA': 'VARCHAR(1000)',
            'colB': 'BIGINT',
            'colC': 'REAL',
            'colD': 'DECIMAL(5,4)',
            'colE': 'VARCHAR',
            'colF': 'BOOLEAN'
        }
        partitions = {}

        expected_sql = f'CREATE EXTERNAL TABLE IF NOT EXISTS {schema_name}.{table_name} {columns} \
            STORED AS PARQUET \
            LOCATION "{path}";'

        with mock_session_helper.db_session_scope() as mock_scope:
            publish_redshift.create_custom_table(table_name, schema_name,
                                                 partitions, path,
                                                 custom_redshift_columns,
                                                 mock_session_helper)
            mock_scope.execute.assert_called_once_with(expected_sql)
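
The test above leans on two mock helpers that this excerpt does not define. A minimal sketch of what they could look like (the names come from the test itself; the bodies are assumed):

    class MockScopeObj():

        def execute(self, schema_string: str):
            # No-op: the mock plumbing only needs the attribute to exist so
            # calls can be recorded against it.
            pass

    def scope_execute_mock(mock_session_helper):
        # Assumed no-op stand-in for the mocked db_session_scope __enter__.
        pass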
Code example #2
    def test_custom_publish_table_publish(self, mock_session_helper,
                                          mock_create_custom_table):
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = ["colA", "colB", "colC"]
            redshift_params = self.setup_redshift_params()
            msh = mock_session_helper(region=redshift_params['region'],
                                      cluster_id=redshift_params['cluster_id'],
                                      host=redshift_params['host'],
                                      port=redshift_params['port'],
                                      db_name=redshift_params['db_name'])

            msh.configure_session_helper()
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                redshift_params=redshift_params,
                custom_redshift_columns=custom_redshift_columns)

            mock_create_custom_table.assert_called_once_with(
                redshift_params['table_name'],
                redshift_params['schema_name'], partitions,
                parq.s3_url(bucket, key), custom_redshift_columns, msh)
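
get_s3_client, setup_s3, and setup_redshift_params are fixtures from the surrounding test class that these excerpts omit. A rough sketch, assuming S3 is mocked with moto (bucket names and parameter values are illustrative, not the project's actual fixtures):

    import boto3
    import moto
    from contextlib import contextmanager

    @contextmanager
    def get_s3_client():
        # Assumption: moto intercepts every boto3 S3 call inside this block.
        with moto.mock_s3():
            yield boto3.client('s3')

    class SetupHelpers:
        # Method-style fixtures, since the tests call them through self.

        def setup_s3(self, s3_client=None):
            # Some examples pass the moto-backed client in; others do not.
            bucket, key = 'test-bucket', 'test-key'
            client = s3_client if s3_client is not None else boto3.client('s3')
            client.create_bucket(Bucket=bucket)
            return bucket, key

        def setup_redshift_params(self):
            # Keys match what the tests read; the values are placeholders.
            return {
                'schema_name': 'test_schema',
                'table_name': 'test_table',
                'iam_role': 'test_iam_role',
                'region': 'us-east-1',
                'cluster_id': 'test_cluster',
                'host': 'test_host',
                'port': '5439',
                'db_name': 'test_db'
            }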
Code example #3
    def test_custom_table_publish_mixed_type_column(self, mock_session_helper,
                                                    mock_create_custom_table):
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()

        dataframe.iat[1, dataframe.columns.get_loc("colA")] = 45

        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            redshift_params=redshift_params,
                            custom_redshift_columns=custom_redshift_columns)

        mock_create_custom_table.assert_called_once_with(
            redshift_params['table_name'], redshift_params['schema_name'],
            partitions, parq.s3_url(bucket, key), custom_redshift_columns, msh)
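
Every example starts from setup_custom_redshift_columns_and_dataframe(), which returns a (dataframe, custom_redshift_columns) pair. A plausible reconstruction from the column names and types the tests assert on (the real fixture data may differ):

    from decimal import Decimal

    import pandas as pd

    def setup_custom_redshift_columns_and_dataframe():
        # Column dtypes line up with the Redshift types below and with the
        # partition_data_types metadata checked in example #4.
        dataframe = pd.DataFrame({
            'colA': ['foo', 'bar'],
            'colB': [1, 2],
            'colC': [1.5, 2.5],
            'colD': [Decimal('1.2345'), Decimal('2.3456')],
            'colE': ['x', 'y'],
            'colF': [True, False]
        })
        custom_redshift_columns = {
            'colA': 'VARCHAR(1000)',
            'colB': 'BIGINT',
            'colC': 'REAL',
            'colD': 'DECIMAL(5,4)',
            'colE': 'VARCHAR',
            'colF': 'BOOLEAN'
        }
        return dataframe, custom_redshift_columns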
Code example #4
    def test_custom_publish_set_metadata_correctly(self):
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = ['colA', 'colB', 'colC', 'colD', 'colF']
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
            for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
                if obj['Key'].endswith(".parquet"):
                    meta = s3_client.get_object(
                        Bucket=bucket, Key=obj['Key'])['Metadata']
                    assert meta['partition_data_types'] == str({
                        "colA": "string",
                        "colB": "integer",
                        "colC": "float",
                        "colD": "decimal",
                        "colF": "boolean"
                    })
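
Note that the assertion compares raw strings: the partition map is stored in each .parquet object's S3 metadata as the str() of a plain Python dict, so the expected value is literally:

    expected = str({"colA": "string", "colB": "integer", "colC": "float",
                    "colD": "decimal", "colF": "boolean"})
    # -> "{'colA': 'string', 'colB': 'integer', 'colC': 'float', ...}"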
Code example #5
    def test_custom_publish_no_redshift_publish(self):
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            custom_redshift_columns=custom_redshift_columns)
Code example #6
    def test_custom_publish_works_without_partitions(self):
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = []
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
Code example #7
    def test_custom_publish_reject_timedelta_dataframes(self):
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3()
        partitions = ['colA']
        dataframe['time_col'] = pd.Timedelta('1 days')
        with pytest.raises(NotImplementedError):
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
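
The NotImplementedError implies a dtype guard inside the publish path. A hedged sketch of such a check (the actual implementation and error message are not shown in these excerpts):

    import pandas as pd

    def reject_timedeltas(dataframe: pd.DataFrame) -> None:
        # Interval types do not round-trip cleanly to parquet/Redshift, so
        # timedelta columns are refused outright.
        for col, dtype in dataframe.dtypes.items():
            if pd.api.types.is_timedelta64_dtype(dtype):
                raise NotImplementedError(
                    f"timedelta column {col!r} is not supported")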
Code example #8
    def test_custom_publish_reject_empty_dataframe(self):
        dataframe = pd.DataFrame()
        custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()[1]
        bucket, key = self.setup_s3()

        with pytest.raises(ValueError):
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=[],
                custom_redshift_columns=custom_redshift_columns)
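
The matching empty-input guard could be as small as this sketch (assumed, not taken from the library source):

    import pandas as pd

    def reject_empty_dataframe(dataframe: pd.DataFrame) -> None:
        if dataframe.empty:
            raise ValueError("cannot publish an empty dataframe")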
Code example #9
    def test_custom_publish_input_equals_output(self):
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"
        partitions = [dataframe.columns[0]]
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            custom_redshift_columns=custom_redshift_columns)

        from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
        s3pd = from_s3.read().to_pandas()
        # Cast the partition column back to its original dtype: reading a
        # partitioned dataset returns partition columns as categories.
        s3pd[partitions[0]] = s3pd[partitions[0]].astype(
            dataframe[partitions[0]].dtype)

        sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
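
sorted_dfs_equal_by_pandas_testing is another omitted helper. Since publishing does not guarantee row order, a sketch along these lines would satisfy the test (an assumption, not the project's actual helper):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    def sorted_dfs_equal_by_pandas_testing(df1: pd.DataFrame,
                                           df2: pd.DataFrame) -> None:
        # Sort both frames on every column so row order cannot fail the check.
        cols = list(df1.columns)
        left = df1.sort_values(cols).reset_index(drop=True)
        right = df2[cols].sort_values(cols).reset_index(drop=True)
        assert_frame_equal(left, right)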
Code example #10
    def test_custom_publish_schema_publish(self, mock_session_helper,
                                           mock_create_schema):
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3()
        partitions = [dataframe.columns[0]]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            redshift_params=redshift_params,
                            custom_redshift_columns=custom_redshift_columns)

        mock_create_schema.assert_called_once_with(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], msh)
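
Finally, the mock_* parameters throughout these tests imply stacked patch decorators that the excerpts strip off. Roughly like the following (the patch targets are assumed; with unittest.mock, the bottom decorator binds to the first mock argument):

    from unittest.mock import patch

    @patch('s3parq.publish_redshift.create_schema')   # -> mock_create_schema
    @patch('s3parq.session_helper.SessionHelper')     # -> mock_session_helper
    def test_custom_publish_schema_publish(self, mock_session_helper,
                                           mock_create_schema):
        ...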