Example #1
    def test_table_publish(self, mock_session_helper, mock_create_table):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = ["text_col", "int_col", "float_col"]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions,
                     redshift_params=redshift_params)

        df_types = parq._get_dataframe_datatypes(dataframe, partitions)
        partition_types = parq._get_dataframe_datatypes(
            dataframe, partitions, True)

        mock_create_table.assert_called_once_with(
            redshift_params['table_name'], redshift_params['schema_name'],
            df_types, partition_types, parq.s3_url(bucket, key), msh)
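
For context, setup_redshift_params is a test fixture that is not shown in these snippets. Below is a minimal sketch of what it could return, based only on the keys the tests here read (region, cluster_id, host, port, db_name, schema_name, table_name, iam_role); every value is a placeholder, not the project's real configuration.

# Hypothetical stand-in for the setup_redshift_params fixture used above.
# Only the key names are grounded in these tests; all values are placeholders.
def setup_redshift_params():
    return {
        'region': 'us-east-1',
        'cluster_id': 'test-cluster-id',
        'host': 'test-host',
        'port': '5439',
        'db_name': 'testdb',
        'schema_name': 'test_schema',
        'table_name': 'test_table',
        'iam_role': 'arn:aws:iam::123456789012:role/test-role',
    }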
Example #2
    def test_reject_timedelta_dataframes(self):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        partitions = ['text_col']
        dataframe['time_col'] = pd.Timedelta('1 days')
        with pytest.raises(NotImplementedError):
            parq.publish(bucket=bucket, key=key,
                         dataframe=dataframe, partitions=partitions)
Example #3
    def test_works_without_partitions(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
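
The setup_s3 helper used throughout these tests is likewise external to the snippets. A minimal sketch, assuming it simply creates a mocked test bucket (the suite appears to run against moto) and returns a bucket/key pair; the names reuse the placeholder strings that appear in the mock_publish example further down.

# Hypothetical sketch of a setup_s3 helper; assumes the suite runs under moto,
# so create_bucket hits an in-memory S3 rather than AWS.
import boto3

def setup_s3():
    bucket = 'safebucketname'
    key = 'safekeyprefixname/safedatasetname'
    boto3.client('s3').create_bucket(Bucket=bucket)
    return bucket, key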
Example #4
    def test_reject_empty_dataframe(self):
        dataframe = pd.DataFrame()
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"

        with pytest.raises(ValueError):
            parq.publish(bucket=bucket, key=key,
                         dataframe=dataframe, partitions=[])
Example #5
    def test_no_redshift_publish(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
Example #6
    def test_works_without_partitions(self):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
Example #7
    def test_no_redshift_publish(self):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
Example #8
    def test_no_redshift_publish(self):
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = []
            parq.publish(bucket=bucket,
                         key=key,
                         dataframe=dataframe,
                         partitions=partitions)
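
get_s3_client is used here as a context manager. A plausible sketch, assuming it wraps moto's S3 mock so the yielded client only talks to an in-memory bucket; this is an assumption about the helper, not its actual implementation.

# Hypothetical get_s3_client: a moto-backed context manager.
from contextlib import contextmanager

import boto3
from moto import mock_s3  # renamed to mock_aws in moto >= 5

@contextmanager
def get_s3_client():
    with mock_s3():
        yield boto3.client('s3')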
Example #9
    def test_reject_timedelta_dataframes(self):
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = ['text_col']
            dataframe['time_col'] = pd.Timedelta('1 days')
            with pytest.raises(NotImplementedError):
                parq.publish(bucket=bucket,
                             key=key,
                             dataframe=dataframe,
                             partitions=partitions)
Example #10
    def test_set_metadata_correctly(self):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        s3_client = boto3.client('s3')
        partitions = ['grouped_col']
        parq.publish(bucket=bucket, key=key,
                     dataframe=dataframe, partitions=partitions)
        for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
            if obj['Key'].endswith(".parquet"):
                meta = s3_client.get_object(
                    Bucket=bucket, Key=obj['Key'])['Metadata']
                assert meta['partition_data_types'] == str(
                    {"grouped_col": "string"})
Example #11
    def test_fetches_diff(self):
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partitions = ["exp-date"]

        part_types = {"count": "int", "price": "float", "exp-date": "string"}

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
            "exp-date": ["x", "z", "a", "zz"]
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        published_files = publish(bucket=input_bucket,
                                  key=input_key,
                                  dataframe=input_df,
                                  partitions=partitions)

        published_files = publish(bucket=comparison_bucket,
                                  key=comparison_key,
                                  dataframe=comparison_df,
                                  partitions=partitions)

        test_df = pd.DataFrame({
            "count": [7],
            "price": [5.76],
            "exp-date": ["a"]
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        # The test data is constructed so the diff is a single-row dataframe,
        #   so comparing the first rows with pandas' Series.equals is sufficient
        assert fetched_diff.iloc[0].equals(test_df.iloc[0])
Example #12
def test_via_public_interface():
    s3_client = boto3.client('s3')
    bucket_name = 'another-bucket'
    key = 'testing/is/fun/dataset-name'
    s3_client.create_bucket(Bucket=bucket_name)

    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['datetime_options'])

    # moto breaks when we use parallel fetches, so that path needs to be tested with a real boto call
    result = fetch(bucket=bucket_name, key=key, parallel=False)
    assert result.shape == df.dataframe.shape
    assert df_equal_by_set(result, df.dataframe, df.dataframe.columns.tolist())
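
df_equal_by_set comes from the test utilities and is not shown in these snippets. A minimal sketch of the assumed behaviour, an order-insensitive comparison over the given columns, mirroring the set-difference check used in the ParquetDataset test further down:

# Hypothetical df_equal_by_set: treats each dataframe as a set of row tuples
# over the given columns, so row order does not matter. Assumed behaviour only.
import pandas as pd

def df_equal_by_set(left: pd.DataFrame, right: pd.DataFrame, columns) -> bool:
    left_rows = set(left[list(columns)].itertuples(index=False, name=None))
    right_rows = set(right[list(columns)].itertuples(index=False, name=None))
    return left_rows == right_rows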
Example #13
    def test_input_equals_output(self):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"
        partitions = [columns[0]]
        parq.publish(bucket=bucket, key=key,
                     dataframe=dataframe, partitions=partitions)

        from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
        s3pd = from_s3.read().to_pandas()
        pre_df = dataframe

        assert set(zip(s3pd.int_col, s3pd.float_col, s3pd.text_col, s3pd.grouped_col)) - \
            set(zip(dataframe.int_col, dataframe.float_col,
                    dataframe.text_col, dataframe.grouped_col)) == set()
Example #14
def test_parquet_sizes():
    bucket = "testbucket"
    key = "testdataset"
    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=bucket)
    df = DFMock(count=1000)
    df.columns = {"int_col": "int", "str_col": "string",
                  "grouped_col": {"option_count": 4, "option_type": "string"}}
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)
    pub_parq.publish(
        dataframe=df.dataframe, key=key, bucket=bucket, partitions=['grouped_col'])

    for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
        if obj['Key'].endswith(".parquet"):
            assert float(obj['Size']) <= 61 * float(1 << 20)
Example #15
    def test_input_equals_output(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"
        partitions = [dataframe.columns[0]]
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)

        from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
        s3pd = from_s3.read().to_pandas()
        # Switch partition type back -> by default it gets set to a category
        s3pd[partitions[0]] = s3pd[partitions[0]].astype(
            dataframe[partitions[0]].dtype)

        sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
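
sorted_dfs_equal_by_pandas_testing is another external helper. A minimal sketch, assuming it sorts both frames into the same order and then defers to pandas' own frame assertion:

# Hypothetical sketch: sort both frames identically, then let pandas assert equality.
import pandas as pd
from pandas.testing import assert_frame_equal

def sorted_dfs_equal_by_pandas_testing(left: pd.DataFrame, right: pd.DataFrame) -> None:
    cols = sorted(left.columns)
    left_sorted = left[cols].sort_values(by=cols).reset_index(drop=True)
    right_sorted = right[cols].sort_values(by=cols).reset_index(drop=True)
    assert_frame_equal(left_sorted, right_sorted)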
Example #16
    def test_schema_publish(self, mock_session_helper, mock_create_schema):
        columns, dataframe = self.setup_df()
        bucket, key = self.setup_s3()
        partitions = [columns[0]]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()
        parq.publish(bucket=bucket, key=key,
                     dataframe=dataframe, partitions=partitions,
                     redshift_params=redshift_params)

        mock_create_schema.assert_called_once_with(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], msh)
Example #17
def test_end_to_end():
    df = dfmock.DFMock(count=100000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }

    df.generate_dataframe()
    # This is unfortunately big, but getting it to force a partition doesn't work otherwise
    df.grow_dataframe_to_size(500)

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)
    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
Example #18
def test_end_to_end():
    df = dfmock.DFMock(count=1000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)

    s3_client = boto3.client('s3')

    bucket_name = 'thistestbucket'
    key = 'thisdataset'

    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == df.dataframe.shape
    pd.DataFrame.eq(fetched_df, df.dataframe)
    fetched_df.head()
Example #19
def test_end_to_end():
    # make a sample DF for all the tests
    df = dfmock.DFMock(count=10000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }
    df.generate_dataframe()

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
Example #20
def test_end_to_end():
    s3_client = boto3.client('s3')

    bucket_name = 'thistestbucket'
    key = 'thisdataset'

    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    dataframe = fetch(bucket=bucket_name, key=key, parallel=False)

    assert dataframe.shape == df.dataframe.shape
    pd.DataFrame.eq(dataframe, df.dataframe)
    dataframe.head()
Example #21
    def publish(self,
                bucket: str,
                key: str,
                dataframe: pd.DataFrame,
                partitions: Iterable[str]) -> List:

        return publish(
            dataframe=dataframe,
            bucket=bucket,
            key=key,
            partitions=partitions
        )
Example #22
    def publish(self,
                bucket: str,
                key: str,
                dataframe: pd.DataFrame,
                partitions: Iterable[str]) -> None:

        pub = publish(
            dataframe=dataframe,
            bucket=bucket,
            key=key,
            partitions=partitions
        )
        pub.publish()
Example #23
    def test_set_metadata_correctly(self):
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            s3_client = boto3.client('s3')
            partitions = ['string_col', 'int_col', 'bool_col']
            parq.publish(bucket=bucket,
                         key=key,
                         dataframe=dataframe,
                         partitions=partitions)
            for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
                if obj['Key'].endswith(".parquet"):
                    meta = s3_client.get_object(Bucket=bucket,
                                                Key=obj['Key'])['Metadata']
                    assert meta['partition_data_types'] == str({
                        "string_col": "string",
                        "int_col": "integer",
                        "bool_col": "boolean"
                    })
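
setup_grouped_dataframe is also defined elsewhere. A rough sketch covering only the columns these snippets reference (text, string, int, float, bool and grouped columns); the real fixture's size and values are not known from these excerpts.

# Hypothetical setup_grouped_dataframe; the column names follow the ones the
# tests above partition and assert on, but sizes and values are placeholders.
import pandas as pd

def setup_grouped_dataframe(count=100):
    return pd.DataFrame({
        'text_col': ['sample-text'] * count,
        'string_col': ['sample-string'] * count,
        'int_col': list(range(count)),
        'float_col': [1.5] * count,
        'bool_col': [True] * count,
        'grouped_col': ['group_a' if i % 2 else 'group_b' for i in range(count)],
    })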
Example #24
    def test_fetches_diff_none(self):
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partitions = ["price"]

        part_types = {"count": "int", "price": "float"}

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        published_files = publish(bucket=input_bucket,
                                  key=input_key,
                                  dataframe=input_df,
                                  partitions=partitions)

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        assert fetched_diff.empty

        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        fetched_diff_reverse.sort_index(inplace=True)
        input_df.sort_index(inplace=True)

        assert fetched_diff_reverse['count'].equals(input_df['count'])
        assert fetched_diff_reverse['price'].equals(input_df['price'])
Example #25
    def mock_publish(self,
                     partition_types: Dict[str, str],
                     bucket="safebucketname",
                     key='safekeyprefixname/safedatasetname'):
        mocker = MockHelper(count=100, s3=True)
        df = mocker.dataframe
        partitions = list(partition_types.keys())
        dfmock = DFMock()
        dfmock.count = 10

        # add partition columns
        columns = {
            key: {
                "option_count": 3,
                "option_type": value
            }
            for key, value in partition_types.items()
        }

        # add one actual data column, called metrics
        columns["metrics"] = "int"

        dfmock.columns = columns
        dfmock.generate_dataframe()

        # generate dataframe we will write
        df = dfmock.dataframe
        bucket = mocker.s3_bucket

        defaults = {
            'bucket': bucket,
            'key': key,
            'dataframe': df,
            'partitions': partitions
        }
        published_files = publish(bucket=bucket,
                                  key=key,
                                  dataframe=df,
                                  partitions=partitions)

        return bucket, df, partitions, published_files
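
As a usage note, callers presumably unpack the four return values and pass a mapping of partition names to dfmock option types; a hedged example call (the partition names and types below are illustrative only):

        # Illustrative call only; the partition names/types are placeholders.
        bucket, df, partitions, published_files = self.mock_publish(
            partition_types={'string_col': 'string', 'int_col': 'int'})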
Example #26
    def test_fetch_when_none(self):
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        partitions = ["exp-date"]

        part_types = {"count": "int", "price": "float", "exp-date": "str"}

        fetched_dtypes = pd.Series(["int64", "float64", "object"],
                                   index=["count", "price", "exp-date"])

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
            "exp-date": ["x", "z", "a", "zz"]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)

        published_files = publish(bucket=input_bucket,
                                  key=input_key,
                                  dataframe=input_df,
                                  partitions=partitions)

        filters = [{
            "partition": "exp-date",
            "comparison": "==",
            "values": ["not-there"]
        }]

        fetched = fetch_parq.fetch(bucket=input_bucket,
                                   key=input_key,
                                   filters=filters,
                                   parallel=False)

        # Testing that DF is empty and has the expected columns+dtypes
        assert fetched.empty
        assert fetched.dtypes.equals(fetched_dtypes)