Beispiel #1
0
def test_table():
    """Check that a ``Table`` built from a JSON document equals the
    directly constructed ``Table`` with the same three values."""
    # Named ``payload`` rather than ``json`` so the stdlib module name is not
    # shadowed inside the test.
    payload = """
        {
            "name" : "table_name",
            "share" : "share_name",
            "schema" : "schema_name"
        }
        """
    parsed = Table.from_json(payload)
    expected = Table("table_name", "share_name", "schema_name")
    assert parsed == expected
Beispiel #2
0
def test_list_tables(sharing_client: SharingClient):
    """List the tables of the ``default`` schema in ``share1`` and ``share2``
    through the sharing client and compare them with the expected tables."""
    share1_tables = sharing_client.list_tables(
        Schema(name="default", share="share1"))
    expected_share1 = [
        Table(name=table_name, share="share1", schema="default")
        for table_name in ("table1", "table3", "table7")
    ]
    assert share1_tables == expected_share1

    share2_tables = sharing_client.list_tables(
        Schema(name="default", share="share2"))
    expected_share2 = [Table(name="table2", share="share2", schema="default")]
    assert share2_tables == expected_share2
Beispiel #3
0
def test_list_tables_with_pagination(rest_client: DataSharingRestClient):
    """List the tables of ``share1``/``default`` in two requests.

    The first request is capped with ``max_results=1``; the second one passes
    the ``next_page_token`` returned by the first.
    """
    first_page = rest_client.list_tables(
        Schema(name="default", share="share1"), max_results=1)
    expected_first = [Table(name="table1", share="share1", schema="default")]
    assert first_page.tables == expected_first

    # Continue from where the first response stopped.
    second_page = rest_client.list_tables(
        Schema(name="default", share="share1"),
        page_token=first_page.next_page_token)
    expected_second = [
        Table(name=table_name, share="share1", schema="default")
        for table_name in ("table3", "table7")
    ]
    assert second_page.tables == expected_second
Beispiel #4
0
def test_list_tables(rest_client: DataSharingRestClient):
    """List the tables of the ``default`` schema in ``share1`` and ``share2``
    through the REST client and compare ``response.tables`` with the expected
    tables."""
    share1_response = rest_client.list_tables(
        Schema(name="default", share="share1"))
    expected_share1 = [
        Table(name=table_name, share="share1", schema="default")
        for table_name in ("table1", "table3", "table7")
    ]
    assert share1_response.tables == expected_share1

    share2_response = rest_client.list_tables(
        Schema(name="default", share="share2"))
    expected_share2 = [Table(name="table2", share="share2", schema="default")]
    assert share2_response.tables == expected_share2
Beispiel #5
0
def test_to_pandas_empty(rest_client: DataSharingRestClient):
    """Compare the DataFrame read from a mocked table that lists no files with
    the zero-row slice of the DataFrame read from ``share1``/``default``/
    ``table7``."""

    class RestClientMock:
        """Stand-in REST client returning a fixed schema and no files."""

        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            assert table == Table("table_name", "share_name", "schema_name")

            # Fields a-l use scalar types; m is an array, n a struct, o a map.
            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"boolean"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"byte"},'
                '{"metadata":{},"name":"c","nullable":true,"type":"short"},'
                '{"metadata":{},"name":"d","nullable":true,"type":"integer"},'
                '{"metadata":{},"name":"e","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"f","nullable":true,"type":"float"},'
                '{"metadata":{},"name":"g","nullable":true,"type":"double"},'
                '{"metadata":{},"name":"h","nullable":true,"type":"decimal(5,2)"},'
                '{"metadata":{},"name":"i","nullable":true,"type":"string"},'
                '{"metadata":{},"name":"j","nullable":true,"type":"binary"},'
                '{"metadata":{},"name":"k","nullable":true,"type":"timestamp"},'
                '{"metadata":{},"name":"l","nullable":true,"type":"date"},'
                '{"metadata":{},"name":"m","nullable":true,"type":{"type":"array",'
                '"elementType":"string","containsNull":true}},'
                '{"metadata":{},"name":"n","nullable":true,"type":{"type":"struct","fields":'
                '[{"name":"foo","type":"string","nullable":true,"metadata":{}},'
                '{"name":"bar","type":"integer","nullable":true,"metadata":{}}]}},'
                '{"metadata":{},"name":"o","nullable":true,"type":{"type":"map",'
                '"keyType":"string","valueType":"integer","valueContainsNull":true}}'
                '],"type":"struct"}'
            )
            return ListFilesInTableResponse(
                protocol=None,
                metadata=Metadata(schema_string=schema_json),
                add_files=[],
            )

    mocked_reader = DeltaSharingReader(
        Table("table_name", "share_name", "schema_name"),
        RestClientMock(),  # type: ignore
    )
    actual = mocked_reader.to_pandas()

    table7_reader = DeltaSharingReader(
        Table(name="table7", share="share1", schema="default"), rest_client)
    # ``iloc[0:0]`` selects zero rows, leaving an empty frame to compare with.
    expected = table7_reader.to_pandas().iloc[0:0]

    pd.testing.assert_frame_equal(actual, expected)
Beispiel #6
0
        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            """Return a fixed schema (long ``a``, string ``b``) and two parquet
            files under ``tmp_path`` carrying partition values ``b=x`` and
            ``b=y``."""
            assert table == Table("table_name", "share_name", "schema_name")

            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"string"}'
                '],"type":"struct"}'
            )
            metadata = Metadata(schema_string=schema_json)

            # (file name, file id, value of partition column "b")
            file_specs = (
                ("pdf1.parquet", "pdf1", "x"),
                ("pdf2.parquet", "pdf2", "y"),
            )
            add_files = [
                AddFile(
                    url=str(tmp_path / file_name),
                    id=file_id,
                    partition_values={"b": partition_value},
                    size=0,
                    stats="",
                )
                for file_name, file_id, partition_value in file_specs
            ]
            return ListFilesInTableResponse(
                protocol=None,
                metadata=metadata,
                add_files=add_files,
            )
Beispiel #7
0
        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            """Return a fixed fifteen-field schema and an empty file list for
            the expected table."""
            assert table == Table("table_name", "share_name", "schema_name")

            # Fields a-l use scalar types; m is an array, n a struct, o a map.
            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"boolean"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"byte"},'
                '{"metadata":{},"name":"c","nullable":true,"type":"short"},'
                '{"metadata":{},"name":"d","nullable":true,"type":"integer"},'
                '{"metadata":{},"name":"e","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"f","nullable":true,"type":"float"},'
                '{"metadata":{},"name":"g","nullable":true,"type":"double"},'
                '{"metadata":{},"name":"h","nullable":true,"type":"decimal(5,2)"},'
                '{"metadata":{},"name":"i","nullable":true,"type":"string"},'
                '{"metadata":{},"name":"j","nullable":true,"type":"binary"},'
                '{"metadata":{},"name":"k","nullable":true,"type":"timestamp"},'
                '{"metadata":{},"name":"l","nullable":true,"type":"date"},'
                '{"metadata":{},"name":"m","nullable":true,"type":{"type":"array",'
                '"elementType":"string","containsNull":true}},'
                '{"metadata":{},"name":"n","nullable":true,"type":{"type":"struct","fields":'
                '[{"name":"foo","type":"string","nullable":true,"metadata":{}},'
                '{"name":"bar","type":"integer","nullable":true,"metadata":{}}]}},'
                '{"metadata":{},"name":"o","nullable":true,"type":{"type":"map",'
                '"keyType":"string","valueType":"integer","valueContainsNull":true}}'
                '],"type":"struct"}'
            )
            return ListFilesInTableResponse(
                protocol=None,
                metadata=Metadata(schema_string=schema_json),
                add_files=[],
            )
Beispiel #8
0
def test_to_pandas_partitioned_different_schemas(tmp_path):
    """Read two parquet files with different column sets through a mocked
    client and compare the result with the concatenated source frames, each
    extended with its partition value in column ``c``."""
    first_frame = pd.DataFrame({"a": [1, 2, 3]})
    second_frame = pd.DataFrame({"a": [4.0, 5.0, 6.0], "b": ["d", "e", "f"]})

    first_frame.to_parquet(tmp_path / "pdf1.parquet")
    second_frame.to_parquet(tmp_path / "pdf2.parquet")

    class RestClientMock:
        """Stand-in REST client pointing at the two files written above."""

        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            assert table == Table("table_name", "share_name", "schema_name")

            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"string"},'
                '{"metadata":{},"name":"c","nullable":true,"type":"date"}'
                '],"type":"struct"}'
            )
            metadata = Metadata(schema_string=schema_json)

            # (file name, file id, value of partition column "c")
            file_specs = (
                ("pdf1.parquet", "pdf1", "2021-01-01"),
                ("pdf2.parquet", "pdf2", "2021-01-02"),
            )
            add_files = [
                AddFile(
                    url=str(tmp_path / file_name),
                    id=file_id,
                    partition_values={"c": partition_value},
                    size=0,
                    stats="",
                )
                for file_name, file_id, partition_value in file_specs
            ]
            return ListFilesInTableResponse(
                protocol=None,
                metadata=metadata,
                add_files=add_files,
            )

    reader = DeltaSharingReader(
        Table("table_name", "share_name", "schema_name"), RestClientMock())
    actual = reader.to_pandas()

    first_expected = first_frame.copy()
    first_expected["c"] = date(2021, 1, 1)
    second_expected = second_frame.copy()
    second_expected["c"] = date(2021, 1, 2)
    stacked = pd.concat([first_expected, second_expected])
    # Fix the column order and renumber the rows from zero.
    expected = stacked[["a", "b", "c"]].reset_index(drop=True)

    pd.testing.assert_frame_equal(actual, expected)
Beispiel #9
0
def test_query_table_metadata_partitioned(rest_client: DataSharingRestClient):
    """Query the metadata of ``share2``/``default``/``table2`` and compare the
    returned protocol and metadata with the expected values."""
    target = Table(name="table2", share="share2", schema="default")
    response = rest_client.query_table_metadata(target)
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata
Beispiel #10
0
def test_list_files_in_table_partitioned_different_schemas(
        rest_client: DataSharingRestClient):
    """List the files of ``share1``/``default``/``table3`` and compare the
    protocol, metadata and three returned files with the expected values.

    File urls are copied from the response itself, so only the remaining
    ``AddFile`` fields are effectively compared.
    """
    target = Table(name="table3", share="share1", schema="default")
    response = rest_client.list_files_in_table(target)
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}},'
        '{"name":"type","type":"string","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    bar_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
        r'"nullCount":{"eventTime":0,"type":0}}'
    )
    foo_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
        r'"nullCount":{"eventTime":0,"type":0}}'
    )
    untyped_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
        r'"nullCount":{"eventTime":0}}'
    )

    listed = response.add_files
    expected_files = [
        AddFile(
            url=listed[0].url,
            id="db213271abffec6fd6c7fc2aad9d4b3f",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=bar_stats,
        ),
        AddFile(
            url=listed[1].url,
            id="f1f8be229d8b18eb6d6a34255f2d7089",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=foo_stats,
        ),
        AddFile(
            url=listed[2].url,
            id="a892a55d770ee70b34ffb2ebf7dc2fd0",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=untyped_stats,
        ),
    ]
    assert listed == expected_files
Beispiel #11
0
def test_query_table_metadata_non_partitioned(
        rest_client: DataSharingRestClient):
    """Query the metadata of ``share1``/``default``/``table1`` and compare the
    returned protocol and metadata (no partition columns) with the expected
    values."""
    target = Table(name="table1", share="share1", schema="default")
    response = rest_client.query_table_metadata(target)
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=[],
    )
    assert response.metadata == expected_metadata
Beispiel #12
0
def test_query_table_metadata_partitioned_different_schemas(
        rest_client: DataSharingRestClient):
    """Query the metadata of ``share1``/``default``/``table3`` and compare the
    returned protocol and metadata with the expected values."""
    target = Table(name="table3", share="share1", schema="default")
    response = rest_client.query_table_metadata(target)
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}},'
        '{"name":"type","type":"string","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata
Beispiel #13
0
def load_as_pandas(url: str, limit: Optional[int] = None) -> pd.DataFrame:
    """
    Load the shared table at the given url as a pandas DataFrame.

    :param url: a url under the format "<profile>#<share>.<schema>.<table>"
    :param limit: a non-negative int. When specified, only ``limit`` rows are loaded.
      Use this optional parameter to explore the shared table without loading the
      entire table into memory.
    :return: A pandas DataFrame representing the shared table.
    """
    profile_path, share_name, schema_name, table_name = _parse_url(url)
    profile = DeltaSharingProfile.read_from_file(profile_path)
    shared_table = Table(name=table_name, share=share_name, schema=schema_name)
    client = DataSharingRestClient(profile)
    reader = DeltaSharingReader(
        table=shared_table,
        rest_client=client,
        limit=limit,
    )
    return reader.to_pandas()
Beispiel #14
0
def test_list_files_in_table_non_partitioned(
        rest_client: DataSharingRestClient):
    """List the files of ``share1``/``default``/``table1`` with a predicate
    hint and compare the protocol, metadata and two returned files with the
    expected values.

    File urls are copied from the response itself, so only the remaining
    ``AddFile`` fields are effectively compared.
    """
    target = Table(name="table1", share="share1", schema="default")
    response = rest_client.list_files_in_table(
        target,
        predicateHints=["date = '2021-01-31'"],
    )
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=[],
    )
    assert response.metadata == expected_metadata

    first_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
        r'"maxValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
        r'"nullCount":{"eventTime":0,"date":0}}'
    )
    second_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
        r'"maxValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
        r'"nullCount":{"eventTime":0,"date":0}}'
    )

    listed = response.add_files
    expected_files = [
        AddFile(
            url=listed[0].url,
            id="061cb3683a467066995f8cdaabd8667d",
            partition_values={},
            size=781,
            stats=first_stats,
        ),
        AddFile(
            url=listed[1].url,
            id="e268cbf70dbaa6143e7e9fa3e2d3b00e",
            partition_values={},
            size=781,
            stats=second_stats,
        ),
    ]
    assert listed == expected_files
Beispiel #15
0
    def list_all_tables(
            self,
            share: Share,
            *,
            max_results: Optional[int] = None,
            page_token: Optional[str] = None) -> ListAllTablesResponse:
        """
        List the tables of ``share`` via the ``/shares/<name>/all-tables`` path.

        :param share: the share whose ``name`` is placed in the request path.
        :param max_results: sent as ``maxResults`` when not None.
        :param page_token: sent as ``pageToken`` when not None.
        :return: the tables parsed from the ``items`` entry of the response
          (empty when absent) and the ``nextPageToken`` entry (None when absent).
        """
        # Only parameters the caller actually supplied are forwarded.
        optional_params = (
            ("maxResults", max_results),
            ("pageToken", page_token),
        )
        query: Dict = {
            key: value for key, value in optional_params if value is not None
        }
        endpoint = f"/shares/{share.name}/all-tables"

        with self._get_internal(endpoint, query) as lines:
            # The whole listing is read from the first line of the response.
            payload = json.loads(next(lines))
            tables = [
                Table.from_json(item) for item in payload.get("items", [])
            ]
            return ListAllTablesResponse(
                tables=tables,
                next_page_token=payload.get("nextPageToken", None),
            )
Beispiel #16
0
def test_list_files_in_table_partitioned(rest_client: DataSharingRestClient):
    """List the files of ``share2``/``default``/``table2`` with a predicate
    hint and a limit hint, then compare the protocol, metadata and two
    returned files with the expected values.

    File urls are copied from the response itself, so only the remaining
    ``AddFile`` fields are effectively compared.
    """
    target = Table(name="table2", share="share2", schema="default")
    response = rest_client.list_files_in_table(
        target,
        predicateHints=["date = '2021-01-31'"],
        limitHint=123,
    )
    assert response.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}"
    )
    expected_metadata = Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
    assert response.metadata == expected_metadata

    first_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
        r'"nullCount":{"eventTime":0}}'
    )
    second_stats = (
        r'{"numRecords":1,'
        r'"minValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
        r'"maxValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
        r'"nullCount":{"eventTime":0}}'
    )

    listed = response.add_files
    expected_files = [
        AddFile(
            url=listed[0].url,
            id="9f1a49539c5cffe1ea7f9e055d5c003c",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=first_stats,
        ),
        AddFile(
            url=listed[1].url,
            id="cd2209b32f5ed5305922dd50f5908a75",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=second_stats,
        ),
    ]
    assert listed == expected_files
Beispiel #17
0
def _verify_all_tables_result(tables: Sequence[Table]):
    """Assert that ``tables`` equals the expected twelve tables, in order."""
    # (table name, share, schema)
    expected_rows = (
        ("table1", "share1", "default"),
        ("table3", "share1", "default"),
        ("table7", "share1", "default"),
        ("table2", "share2", "default"),
        ("table4", "share3", "default"),
        ("table5", "share3", "default"),
        ("test_gzip", "share4", "default"),
        ("table8", "share7", "schema1"),
        ("table9", "share7", "schema2"),
        ("table_wasb", "share_azure", "default"),
        ("table_abfs", "share_azure", "default"),
        ("table_gcs", "share_gcp", "default"),
    )
    expected = [
        Table(name=table_name, share=share_name, schema=schema_name)
        for table_name, share_name, schema_name in expected_rows
    ]
    assert tables == expected