Beispiel #1
0
def _cleanup_minio():
    """Delete every object in the test bucket so tests start from a clean slate."""
    # Nothing to do if the bucket has not been created yet.
    if not MINIO.bucket_exists(S3_BUCKET):
        return
    object_names = [
        obj.object_name for obj in MINIO.list_objects(bucket_name=S3_BUCKET)
    ]
    # remove_objects returns a lazy iterator of deletion errors; drain it so
    # the deletions are actually performed.
    list(MINIO.remove_objects(bucket_name=S3_BUCKET, objects_iter=object_names))
Beispiel #2
0
def clean_minio():
    """Yield the Minio client with the test bucket present and emptied.

    The bucket is wiped both before and after the test body runs.
    """
    bucket_missing = not MINIO.bucket_exists(S3_BUCKET)
    if bucket_missing:
        MINIO.make_bucket(S3_BUCKET)

    # Start the test with an empty remote bucket.
    _cleanup_minio()
    yield MINIO
    # Comment this out if tests fail and you want to inspect the bucket contents.
    _cleanup_minio()
Beispiel #3
0
def clean_minio():
    """Yield the Minio client with the test bucket created and emptied.

    Creates the bucket if it does not exist yet, tolerating the case where
    it already exists (possibly created by a concurrent run), then wipes its
    contents before and after the test body runs.
    """
    try:
        MINIO.make_bucket(S3_BUCKET)
    except (BucketAlreadyExists, BucketAlreadyOwnedByYou):
        # Bucket creation is idempotent for our purposes: an existing bucket
        # is exactly what we want.
        pass

    # Make sure to delete extra objects in the remote Minio bucket
    _cleanup_minio()
    yield MINIO
    # Comment this out if tests fail and you want to see what the hell went on in the bucket.
    _cleanup_minio()
Beispiel #4
0
def test_csv_introspection_http():
    """Introspect a single CSV file served over a plain presigned HTTP URL."""
    # Pre-sign the S3 URL so we exercise the HTTP (non-S3) code path.
    presigned_url = MINIO.presigned_get_object("test_csv",
                                               "some_prefix/fruits.csv")
    schema = CSVForeignDataWrapper.import_schema(
        schema=None,
        srv_options={"url": presigned_url},
        options={},
        restriction_type=None,
        restricts=[],
    )
    assert len(schema) == 1

    expected_columns = [
        {"column_name": "fruit_id", "type_name": "integer"},
        {"column_name": "timestamp", "type_name": "timestamp"},
        {"column_name": "name", "type_name": "character varying"},
        {"column_name": "number", "type_name": "integer"},
        {"column_name": "bignumber", "type_name": "bigint"},
        {"column_name": "vbignumber", "type_name": "numeric"},
    ]
    assert schema[0] == {
        "table_name": "data",
        "schema": None,
        "columns": expected_columns,
        "options": {
            "autodetect_dialect": "false",
            "autodetect_encoding": "false",
            "autodetect_header": "false",
            "delimiter": ",",
            "encoding": "utf-8",
            "header": "true",
            "quotechar": '"',
        },
    }
Beispiel #5
0
def test_csv_data_source_http(local_engine_empty):
    """Introspect and preview a CSVDataSource pointed at a presigned HTTP URL."""
    weather_url = MINIO.presigned_get_object(
        "test_csv", "some_prefix/rdu-weather-history.csv")
    source = CSVDataSource(
        local_engine_empty,
        credentials={},
        params={"url": weather_url},
    )

    # One table ("data") with 28 introspected columns is expected.
    schema = source.introspect()
    assert len(schema.keys()) == 1
    assert len(schema["data"][0]) == 28

    # Preview returns a 10-row sample for the same single table.
    preview = source.preview(schema)
    assert len(preview.keys()) == 1
    assert len(preview["data"]) == 10
Beispiel #6
0
def test_csv_data_source_raw_url(local_engine_empty):
    """Check get_raw_url returns downloadable URLs for introspectable tables."""
    # Use the data from the previous test to test out the raw URL functionality
    url = MINIO.presigned_get_object("test_csv",
                                     "some_prefix/rdu-weather-history.csv")

    credentials = {
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
    }

    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Put this delimiter in as a canary to make sure table params override server params.
        "delimiter": ",",
    }

    # An empty column list means "introspect this table for us".
    tables = {
        "from_url": ([], {"url": url}),
        "from_s3_rdu": ([], {"s3_object": "some_prefix/rdu-weather-history.csv"}),
        "from_s3_encoding": ([], {"s3_object": "some_prefix/encoding-win-1252.csv"}),
        "from_url_broken": ([], {"url": "invalid_url"}),
        "from_s3_broken": ([], {"s3_object": "invalid_object"}),
    }

    source = CSVDataSource(local_engine_empty, credentials, params, tables)

    # Drop the MountErrors for the broken tables, keep the good schemas.
    schema = unwrap(source.introspect())[0]

    raw_urls = source.get_raw_url(tables=schema)
    assert raw_urls == {
        "from_s3_encoding": [("text/csv", mock.ANY)],
        "from_s3_rdu": [("text/csv", mock.ANY)],
        "from_url": [("text/csv", url)],
    }

    # S3-backed tables must be presigned against our Minio endpoint.
    assert "objectstorage:9000" in raw_urls["from_s3_encoding"][0][1]
    assert "objectstorage:9000" in raw_urls["from_s3_rdu"][0][1]
Beispiel #7
0
def test_csv_data_source_multiple(local_engine_empty):
    """End-to-end introspect/mount/re-introspect of a multi-table CSV source.

    Covers presigned-URL and S3-object tables, broken tables surfacing as
    MountError values, mounting with the introspected schema, and overriding
    a table's delimiter followed by re-introspection.
    """
    # End-to-end version for test_csv_introspection_multiple to check things like table params
    # getting serialized and deserialized properly.

    url = MINIO.presigned_get_object("test_csv",
                                     "some_prefix/rdu-weather-history.csv")

    credentials = {
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
    }

    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Put this delimiter in as a canary to make sure table params override server params.
        "delimiter": ",",
    }

    tables = {
        # Pass an empty table schema to denote we want to introspect it
        "from_url": ([], {
            "url": url
        }),
        "from_s3_rdu": ([], {
            "s3_object": "some_prefix/rdu-weather-history.csv"
        }),
        "from_s3_encoding": ([], {
            "s3_object": "some_prefix/encoding-win-1252.csv"
        }),
        "from_url_broken": ([], {
            "url": "invalid_url"
        }),
        "from_s3_broken": ([], {
            "s3_object": "invalid_object"
        }),
    }

    source = CSVDataSource(
        local_engine_empty,
        credentials,
        params,
        tables,
    )

    schema = source.introspect()

    # Good tables come back as (columns, table_options) pairs with the CSV
    # dialect autodetected (note delimiter ";" beats the server-level ","
    # canary); broken tables come back as MountError values instead.
    assert schema == {
        "from_url": (
            mock.ANY,
            {
                "autodetect_dialect": False,
                "url": url,
                "quotechar": '"',
                "header": True,
                "encoding": "utf-8",
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_s3_rdu": (
            mock.ANY,
            {
                "encoding": "utf-8",
                "autodetect_dialect": False,
                "autodetect_encoding": False,
                "autodetect_header": False,
                "delimiter": ";",
                "header": True,
                "quotechar": '"',
                "s3_object": "some_prefix/rdu-weather-history.csv",
            },
        ),
        "from_s3_encoding": (
            mock.ANY,
            {
                "s3_object": "some_prefix/encoding-win-1252.csv",
                "quotechar": '"',
                "header": True,
                "encoding": "Windows-1252",
                "autodetect_dialect": False,
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_url_broken":
        MountError(
            table_name="from_url_broken",
            error="requests.exceptions.MissingSchema",
            error_text=
            "Invalid URL 'invalid_url': No schema supplied. Perhaps you meant http://invalid_url?",
        ),
        "from_s3_broken":
        MountError(
            table_name="from_s3_broken",
            error="minio.error.S3Error",
            error_text=mock.ANY,
        ),
    }

    # Mount the datasets with this introspected schema.
    schema = unwrap(schema)[0]
    try:
        source.mount("temp_data", tables=schema)
        rows = local_engine_empty.run_sql(
            "SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        assert len(rows[0]) == 3
    finally:
        # Always drop the mounted schema, even if the assertions fail.
        local_engine_empty.delete_schema("temp_data")

    # Override the delimiter and blank out the schema for a single table
    schema["from_s3_encoding"] = (
        [],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "quotechar": '"',
            "header": True,
            "encoding": "Windows-1252",
            "autodetect_dialect": False,
            # We force a delimiter "," here which will make the CSV a single-column one
            # (to test we can actually override these)
            "delimiter": ",",
            "autodetect_header": False,
            "autodetect_encoding": False,
        },
    )

    # Reintrospect the source with the new table parameters
    source = CSVDataSource(local_engine_empty, credentials, params, schema)
    new_schema = source.introspect()
    assert len(new_schema) == 3
    # Check other tables are unchanged
    assert new_schema["from_url"] == schema["from_url"]
    assert new_schema["from_s3_rdu"] == schema["from_s3_rdu"]

    # Table with a changed separator only has one column (since we have , for delimiter
    # instead of ;)
    assert new_schema["from_s3_encoding"][0] == [
        TableColumn(ordinal=1,
                    name=";DATE;TEXT",
                    pg_type="character varying",
                    is_pk=False,
                    comment=None)
    ]

    try:
        source.mount("temp_data", tables=new_schema)
        rows = local_engine_empty.run_sql(
            "SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        # Check we get one column now
        assert rows[0] == ("1;01/07/2021;Pañamao", )
    finally:
        local_engine_empty.delete_schema("temp_data")
Beispiel #8
0
def test_csv_introspection_multiple():
    """Introspect several tables at once via CREATE FOREIGN SCHEMA params.

    We only supply the table names plus an S3 key or URL for each and expect
    the FDW to autodetect the remaining CSV options.
    """
    fdw_options = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": "false",
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
        "s3_bucket": "test_csv",
        "s3_object_prefix": "some_prefix/",
    }

    url = MINIO.presigned_get_object("test_csv",
                                     "some_prefix/rdu-weather-history.csv")
    table_options = json.dumps({
        "from_url": {"url": url},
        "from_s3_rdu": {"s3_object": "some_prefix/rdu-weather-history.csv"},
        "from_s3_encoding": {"s3_object": "some_prefix/encoding-win-1252.csv"},
    })
    schema = CSVForeignDataWrapper.import_schema(
        schema=None,
        srv_options=fdw_options,
        options={"table_options": table_options},
        restriction_type=None,
        restricts=[],
    )

    assert len(schema) == 3
    # Sort by table name so the assertions below are deterministic.
    schema = sorted(schema, key=lambda entry: entry["table_name"])

    # Options shared by the two UTF-8 tables (dialect fully autodetected).
    base_opts = {
        "autodetect_dialect": "false",
        "autodetect_encoding": "false",
        "autodetect_header": "false",
        "delimiter": ";",
        "encoding": "utf-8",
        "header": "true",
        "quotechar": '"',
    }

    assert schema[0] == {
        "table_name": "from_s3_encoding",
        "schema": None,
        "columns": mock.ANY,
        "options": _s3_win_1252_opts,
    }
    assert schema[1] == {
        "table_name": "from_s3_rdu",
        "schema": None,
        "columns": mock.ANY,
        "options": dict(base_opts,
                        s3_object="some_prefix/rdu-weather-history.csv"),
    }
    assert schema[2] == {
        "table_name": "from_url",
        "schema": None,
        "columns": mock.ANY,
        "options": dict(base_opts, url=url),
    }