    def test_custom_table_publish_null_in_int_column(self, mock_session_helper,
                                                     mock_create_custom_table):
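        """Publishing a dataframe whose int column contains a null should still
        call create_custom_table with the custom Redshift columns."""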
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe_with_null(
            )
            bucket, key = self.setup_s3(s3_client)
            partitions = []
            redshift_params = self.setup_redshift_params()
            msh = mock_session_helper(region=redshift_params['region'],
                                      cluster_id=redshift_params['cluster_id'],
                                      host=redshift_params['host'],
                                      port=redshift_params['port'],
                                      db_name=redshift_params['db_name'])

            msh.configure_session_helper()

            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                redshift_params=redshift_params,
                custom_redshift_columns=custom_redshift_columns)

            mock_create_custom_table.assert_called_once_with(
                redshift_params['table_name'],
                redshift_params['schema_name'], partitions,
                parq.s3_url(bucket, key), custom_redshift_columns, msh)
    def test_custom_table_publish_mixed_type_column(self, mock_session_helper,
                                                    mock_create_custom_table):
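        """Overwriting one value in colA with an int makes the column
        mixed-type; publishing should still call create_custom_table."""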
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )
        bucket, key = self.setup_s3()
        partitions = []
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()

        dataframe.iat[1, dataframe.columns.get_loc("colA")] = 45

        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            redshift_params=redshift_params,
                            custom_redshift_columns=custom_redshift_columns)

        mock_create_custom_table.assert_called_once_with(
            redshift_params['table_name'], redshift_params['schema_name'],
            partitions, parq.s3_url(bucket, key), custom_redshift_columns, msh)
    def test_custom_publish_set_metadata_correctly(self):
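        """Each parquet object written to S3 should carry the partition data
        types in its S3 metadata."""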
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
            )
            bucket, key = self.setup_s3(s3_client)
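            # Re-create a plain boto3 client, used below to inspect the
            # objects written by custom_publish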
            s3_client = boto3.client('s3')
            partitions = ['colA', 'colB', 'colC', 'colD', 'colF']
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
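            # Every published parquet object should record the partition data
            # types in its object metadata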
            for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
                if obj['Key'].endswith(".parquet"):
                    meta = s3_client.get_object(Bucket=bucket,
                                                Key=obj['Key'])['Metadata']
                    assert meta['partition_data_types'] == str({
                        "colA": "string",
                        "colB": "integer",
                        "colC": "float",
                        "colD": "decimal",
                        "colF": "boolean"
                    })
    def test_custom_publish_no_redshift_publish(self):
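        """custom_publish should succeed when called without redshift_params
        (no Redshift publish)."""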
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )
        bucket, key = self.setup_s3()
        partitions = []
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            custom_redshift_columns=custom_redshift_columns)
    def test_custom_publish_works_without_partitions(self):
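        """Publishing with an empty partitions list should succeed."""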
        with get_s3_client() as s3_client:
            dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
            )
            bucket, key = self.setup_s3(s3_client)
            partitions = []
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
    def test_custom_publish_reject_timedelta_dataframes(self):
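        """A timedelta column is not supported; custom_publish should raise
        NotImplementedError."""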
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )
        bucket, key = self.setup_s3()
        partitions = ['colA']
        dataframe['time_col'] = pd.Timedelta('1 days')
        with pytest.raises(NotImplementedError):
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=partitions,
                custom_redshift_columns=custom_redshift_columns)
    def test_custom_publish_reject_empty_dataframe(self):
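        """An empty dataframe cannot be published and should raise ValueError."""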
        dataframe = pd.DataFrame()
        custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )[1]
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"

        with pytest.raises(ValueError):
            parq.custom_publish(
                bucket=bucket,
                key=key,
                dataframe=dataframe,
                partitions=[],
                custom_redshift_columns=custom_redshift_columns)
    def test_custom_publish_input_equals_output(self):
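        """Data read back from S3 should match the dataframe that was published."""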
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"
        partitions = [dataframe.columns[0]]
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            custom_redshift_columns=custom_redshift_columns)

        from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
        s3pd = from_s3.read().to_pandas()
        # Switch partition type back -> by default it gets set to a category
        s3pd[partitions[0]] = s3pd[partitions[0]].astype(
            dataframe[partitions[0]].dtype)

        sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
    def test_custom_publish_schema_publish(self, mock_session_helper,
                                           mock_create_schema):
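        """Publishing with redshift_params should create the target schema via
        create_schema with the configured schema name, database, and IAM role."""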
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe(
        )
        bucket, key = self.setup_s3()
        partitions = [dataframe.columns[0]]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()
        parq.custom_publish(bucket=bucket,
                            key=key,
                            dataframe=dataframe,
                            partitions=partitions,
                            redshift_params=redshift_params,
                            custom_redshift_columns=custom_redshift_columns)

        mock_create_schema.assert_called_once_with(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], msh)