Exemple #1
0
        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            """Return a canned two-file listing for the one expected table.

            The requested ``table`` is asserted to be the expected one;
            ``predicateHints`` and ``limitHint`` are accepted but not used.
            File URLs are paths under ``tmp_path``, which is not defined in
            this method and is resolved from the enclosing scope.
            """
            expected_table = Table("table_name", "share_name", "schema_name")
            assert table == expected_table

            # Two nullable columns: "a" (long) and "b" (string).
            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"string"}'
                '],"type":"struct"}'
            )
            table_metadata = Metadata(schema_string=schema_json)

            # (file name under tmp_path, file id, partition value for "b")
            file_specs = (
                ("pdf1.parquet", "pdf1", "x"),
                ("pdf2.parquet", "pdf2", "y"),
            )
            listed_files = [
                AddFile(
                    url=str(tmp_path / file_name),
                    id=file_id,
                    partition_values={"b": b_value},
                    size=0,
                    stats="",
                )
                for file_name, file_id, b_value in file_specs
            ]

            return ListFilesInTableResponse(
                protocol=None,
                metadata=table_metadata,
                add_files=listed_files,
            )
Exemple #2
0
def test_list_files_in_table_partitioned_different_schemas(
        rest_client: DataSharingRestClient):
    """List the files of share1/default/table3 and check the whole response.

    The first two expected files carry stats for ``eventTime`` and ``type``;
    the third carries stats for ``eventTime`` only.
    """
    table = Table(name="table3", share="share1", schema="default")
    response = rest_client.list_files_in_table(table)

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}},'
        '{"name":"type","type":"string","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    # URLs are copied from the response itself, so the comparison only pins
    # the remaining fields of each file.
    actual_files = response.add_files
    first_file = AddFile(
        url=actual_files[0].url,
        id="db213271abffec6fd6c7fc2aad9d4b3f",
        partition_values={"date": "2021-04-28"},
        size=778,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
            r'"maxValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
            r'"nullCount":{"eventTime":0,"type":0}}'
        ),
    )
    second_file = AddFile(
        url=actual_files[1].url,
        id="f1f8be229d8b18eb6d6a34255f2d7089",
        partition_values={"date": "2021-04-28"},
        size=778,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
            r'"maxValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
            r'"nullCount":{"eventTime":0,"type":0}}'
        ),
    )
    third_file = AddFile(
        url=actual_files[2].url,
        id="a892a55d770ee70b34ffb2ebf7dc2fd0",
        partition_values={"date": "2021-04-28"},
        size=573,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
            r'"maxValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
            r'"nullCount":{"eventTime":0}}'
        ),
    )
    assert actual_files == [first_file, second_file, third_file]
Exemple #3
0
    def list_files_in_table(
        self,
        table: Table,
        *,
        predicateHints: Optional[Sequence[str]] = None,
        limitHint: Optional[int] = None,
    ) -> ListFilesInTableResponse:
        """POST a query for ``table`` and parse the line-oriented reply.

        ``predicateHints`` and ``limitHint`` are put into the request body
        only when they are not ``None``. The first reply line is read as the
        protocol, the second as the metadata, and each remaining line as one
        file entry.
        """
        optional_fields = (
            ("predicateHints", predicateHints),
            ("limitHint", limitHint),
        )
        payload: Dict = {
            key: value for key, value in optional_fields if value is not None
        }
        query_path = (
            f"/shares/{table.share}/schemas/{table.schema}/tables/{table.name}/query"
        )

        with self._post_internal(query_path, data=payload) as reply_lines:
            # Consume both header lines before interpreting either of them.
            protocol_obj = json.loads(next(reply_lines))
            metadata_obj = json.loads(next(reply_lines))

            protocol = Protocol.from_json(protocol_obj["protocol"])
            metadata = Metadata.from_json(metadata_obj["metaData"])

            files = []
            for raw_line in reply_lines:
                file_obj = json.loads(raw_line)
                files.append(AddFile.from_json(file_obj["file"]))

            return ListFilesInTableResponse(
                protocol=protocol,
                metadata=metadata,
                add_files=files,
            )
Exemple #4
0
def test_list_files_in_table_non_partitioned(
        rest_client: DataSharingRestClient):
    """List the files of share1/default/table1 with a predicate hint.

    The expected metadata has no partition columns and both expected files
    have empty partition values.
    """
    table = Table(name="table1", share="share1", schema="default")
    response = rest_client.list_files_in_table(
        table,
        predicateHints=["date = '2021-01-31'"],
    )

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=[],
    )
    assert response.metadata == expected_metadata

    # URLs are copied from the response itself, so the comparison only pins
    # the remaining fields of each file.
    actual_files = response.add_files
    first_file = AddFile(
        url=actual_files[0].url,
        id="061cb3683a467066995f8cdaabd8667d",
        partition_values={},
        size=781,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
            r'"maxValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
            r'"nullCount":{"eventTime":0,"date":0}}'
        ),
    )
    second_file = AddFile(
        url=actual_files[1].url,
        id="e268cbf70dbaa6143e7e9fa3e2d3b00e",
        partition_values={},
        size=781,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
            r'"maxValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
            r'"nullCount":{"eventTime":0,"date":0}}'
        ),
    )
    assert actual_files == [first_file, second_file]
Exemple #5
0
def test_list_files_in_table_partitioned(rest_client: DataSharingRestClient):
    """List the files of share2/default/table2 with both hints supplied.

    The expected metadata is partitioned by ``date`` and both expected files
    carry the partition value ``2021-04-28``.
    """
    table = Table(name="table2", share="share2", schema="default")
    response = rest_client.list_files_in_table(
        table,
        predicateHints=["date = '2021-01-31'"],
        limitHint=123,
    )

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    # URLs are copied from the response itself, so the comparison only pins
    # the remaining fields of each file.
    actual_files = response.add_files
    first_file = AddFile(
        url=actual_files[0].url,
        id="9f1a49539c5cffe1ea7f9e055d5c003c",
        partition_values={"date": "2021-04-28"},
        size=573,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
            r'"maxValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
            r'"nullCount":{"eventTime":0}}'
        ),
    )
    second_file = AddFile(
        url=actual_files[1].url,
        id="cd2209b32f5ed5305922dd50f5908a75",
        partition_values={"date": "2021-04-28"},
        size=573,
        stats=(
            r'{"numRecords":1,'
            r'"minValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
            r'"maxValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
            r'"nullCount":{"eventTime":0}}'
        ),
    )
    assert actual_files == [first_file, second_file]
Exemple #6
0
def test_add_file(json: str, expected: AddFile):
    """Check that ``AddFile.from_json`` turns ``json`` into ``expected``."""
    parsed = AddFile.from_json(json)
    assert parsed == expected
Exemple #7
0
 "json,expected",
 [
     pytest.param(
         """
         {
             "url" : "https://localhost/path/to/file.parquet",
             "id" : "id",
             "partitionValues" : {},
             "size" : 120,
             "stats" : "{\\"numRecords\\":2}"
         }
         """,
         AddFile(
             url="https://localhost/path/to/file.parquet",
             id="id",
             partition_values={},
             size=120,
             stats=r'{"numRecords":2}',
         ),
         id="non partitioned",
     ),
     pytest.param(
         """
         {
             "url" : "https://localhost/path/to/file.parquet",
             "id" : "id",
             "partitionValues" : {"b": "x"},
             "size" : 120,
             "stats" : "{\\"numRecords\\":2}"
         }
         """,