def list_files_in_table(
    self,
    table: Table,
    *,
    predicateHints: Optional[Sequence[str]] = None,
    limitHint: Optional[int] = None,
) -> ListFilesInTableResponse:
    """Return a canned two-file listing for the single table this stub accepts.

    The only accepted table is ``Table("table_name", "share_name",
    "schema_name")``; any other value fails the assertion below.
    ``predicateHints`` and ``limitHint`` are accepted but never consulted.

    Returns:
        A ``ListFilesInTableResponse`` with ``protocol=None``, a metadata
        object carrying only a schema string, and two ``AddFile`` entries
        (``pdf1`` and ``pdf2``) whose URLs point at parquet paths under
        ``tmp_path``.
    """
    expected_table = Table("table_name", "share_name", "schema_name")
    assert table == expected_table

    # Struct schema with two nullable fields: "a" (long) and "b" (string).
    schema_string = (
        '{"fields":['
        '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
        '{"metadata":{},"name":"b","nullable":true,"type":"string"}'
        '],"type":"struct"}'
    )
    stub_metadata = Metadata(schema_string=schema_string)

    # ``tmp_path`` is not a parameter here; it is resolved from an enclosing
    # scope (presumably a pytest fixture captured by the surrounding test).
    first_file = AddFile(
        url=str(tmp_path / "pdf1.parquet"),
        id="pdf1",
        partition_values={"b": "x"},
        size=0,
        stats="",
    )
    second_file = AddFile(
        url=str(tmp_path / "pdf2.parquet"),
        id="pdf2",
        partition_values={"b": "y"},
        size=0,
        stats="",
    )

    return ListFilesInTableResponse(
        protocol=None,
        metadata=stub_metadata,
        add_files=[first_file, second_file],
    )
def test_list_files_in_table_partitioned_different_schemas(
    rest_client: DataSharingRestClient,
):
    """List ``table3`` in ``share1``/``default`` and verify the full response.

    Checks the protocol, the table metadata (three-column schema partitioned
    by ``date``) and the three returned files. The first two expected files
    carry ``eventTime`` and ``type`` statistics; the third carries only
    ``eventTime`` statistics.
    """
    table = Table(name="table3", share="share1", schema="default")
    response = rest_client.list_files_in_table(table)

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}},'
        '{"name":"type","type":"string","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    # Each expected URL is copied from the response itself, so the comparison
    # effectively covers id, partition values, size and stats.
    returned_files = response.add_files
    bar_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
        r'"nullCount":{"eventTime":0,"type":0}}'
    )
    foo_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
        r'"nullCount":{"eventTime":0,"type":0}}'
    )
    event_time_only_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
        r'"nullCount":{"eventTime":0}}'
    )
    expected_files = [
        AddFile(
            url=returned_files[0].url,
            id="db213271abffec6fd6c7fc2aad9d4b3f",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=bar_stats,
        ),
        AddFile(
            url=returned_files[1].url,
            id="f1f8be229d8b18eb6d6a34255f2d7089",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=foo_stats,
        ),
        AddFile(
            url=returned_files[2].url,
            id="a892a55d770ee70b34ffb2ebf7dc2fd0",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=event_time_only_stats,
        ),
    ]
    assert returned_files == expected_files
def list_files_in_table(
    self,
    table: Table,
    *,
    predicateHints: Optional[Sequence[str]] = None,
    limitHint: Optional[int] = None,
) -> ListFilesInTableResponse:
    """Query the files of ``table`` and parse the line-oriented response.

    A POST is issued to the table's ``/query`` endpoint. The first response
    line is decoded as the protocol, the second as the table metadata, and
    every remaining line as one file entry.

    Args:
        table: Table whose ``share``, ``schema`` and ``name`` form the
            request path.
        predicateHints: Sent as ``predicateHints`` in the request body when
            not ``None``.
        limitHint: Sent as ``limitHint`` in the request body when not
            ``None``.

    Returns:
        A ``ListFilesInTableResponse`` holding the parsed protocol, metadata
        and list of files.
    """
    # Only hints that were actually supplied are included in the body.
    optional_hints = (
        ("predicateHints", predicateHints),
        ("limitHint", limitHint),
    )
    payload: Dict = {
        key: value for key, value in optional_hints if value is not None
    }

    endpoint = (
        f"/shares/{table.share}/schemas/{table.schema}/tables/{table.name}/query"
    )
    with self._post_internal(endpoint, data=payload) as lines:
        # The two header lines come first and are consumed in order.
        protocol_json = json.loads(next(lines))
        metadata_json = json.loads(next(lines))

        protocol = Protocol.from_json(protocol_json["protocol"])
        metadata = Metadata.from_json(metadata_json["metaData"])

        # Everything left in the stream is one JSON-wrapped file per line.
        add_files = []
        for line in lines:
            file_json = json.loads(line)["file"]
            add_files.append(AddFile.from_json(file_json))

        return ListFilesInTableResponse(
            protocol=protocol,
            metadata=metadata,
            add_files=add_files,
        )
def test_list_files_in_table_non_partitioned(
    rest_client: DataSharingRestClient,
):
    """List ``table1`` in ``share1``/``default`` with a predicate hint.

    Verifies the protocol, the metadata (two-column schema with no partition
    columns) and the two returned files, both of which have empty partition
    values and statistics for ``eventTime`` and ``date``.
    """
    table = Table(name="table1", share="share1", schema="default")
    response = rest_client.list_files_in_table(
        table,
        predicateHints=["date = '2021-01-31'"],
    )

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=[],
    )
    assert response.metadata == expected_metadata

    # Each expected URL is copied from the response itself, so the comparison
    # effectively covers id, partition values, size and stats.
    returned_files = response.add_files
    first_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
        r'"maxValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
        r'"nullCount":{"eventTime":0,"date":0}}'
    )
    second_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
        r'"maxValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
        r'"nullCount":{"eventTime":0,"date":0}}'
    )
    expected_files = [
        AddFile(
            url=returned_files[0].url,
            id="061cb3683a467066995f8cdaabd8667d",
            partition_values={},
            size=781,
            stats=first_stats,
        ),
        AddFile(
            url=returned_files[1].url,
            id="e268cbf70dbaa6143e7e9fa3e2d3b00e",
            partition_values={},
            size=781,
            stats=second_stats,
        ),
    ]
    assert returned_files == expected_files
def test_list_files_in_table_partitioned(rest_client: DataSharingRestClient):
    """List ``table2`` in ``share2``/``default`` with both hints supplied.

    The request passes a predicate hint and ``limitHint=123``. Verifies the
    protocol, the metadata (two-column schema partitioned by ``date``) and
    the two returned files, each with a ``date`` partition value and
    ``eventTime``-only statistics.
    """
    table = Table(name="table2", share="share2", schema="default")
    response = rest_client.list_files_in_table(
        table,
        predicateHints=["date = '2021-01-31'"],
        limitHint=123,
    )

    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    # Each expected URL is copied from the response itself, so the comparison
    # effectively covers id, partition values, size and stats.
    returned_files = response.add_files
    first_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
        r'"nullCount":{"eventTime":0}}'
    )
    second_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
        r'"nullCount":{"eventTime":0}}'
    )
    expected_files = [
        AddFile(
            url=returned_files[0].url,
            id="9f1a49539c5cffe1ea7f9e055d5c003c",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=first_stats,
        ),
        AddFile(
            url=returned_files[1].url,
            id="cd2209b32f5ed5305922dd50f5908a75",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=second_stats,
        ),
    ]
    assert returned_files == expected_files
def test_add_file(json: str, expected: AddFile):
    """Check that ``AddFile.from_json`` applied to ``json`` equals ``expected``.

    NOTE(review): the parameter names are presumably bound by a pytest
    parametrization keyed on ``"json,expected"``, so they must not be
    renamed. Inside this function ``json`` is the input value, not the
    standard-library module.
    """
    parsed = AddFile.from_json(json)
    assert parsed == expected
"json,expected", [ pytest.param( """ { "url" : "https://localhost/path/to/file.parquet", "id" : "id", "partitionValues" : {}, "size" : 120, "stats" : "{\\"numRecords\\":2}" } """, AddFile( url="https://localhost/path/to/file.parquet", id="id", partition_values={}, size=120, stats=r'{"numRecords":2}', ), id="non partitioned", ), pytest.param( """ { "url" : "https://localhost/path/to/file.parquet", "id" : "id", "partitionValues" : {"b": "x"}, "size" : 120, "stats" : "{\\"numRecords\\":2}" } """,