Ejemplo n.º 1
0
def test_type_promoted_schema_read_with_fastparquet():
    """Read a boolean column containing ``None`` through the fastparquet engine.

    A schema with an Integer column and a Boolean column holding ``None``
    entries is written, then fetched with an empty schema type while
    ``PARQUET_ENGINE`` is set to ``fastparquet``. The boolean column is
    expected to come back with an object dtype and its ``None`` values intact.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([(
                'a', _primitives.Integer), ('b', _primitives.Boolean)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [None, True, None, False]
                }))

        original_engine = _os.getenv('PARQUET_ENGINE')
        _os.environ['PARQUET_ENGINE'] = 'fastparquet'
        try:
            b = _schema_impl.Schema.fetch(
                a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

            with b as reader:
                df = reader.read()
                assert df['a'].tolist() == [1, 2, 3, 4]
                assert _pd.api.types.is_object_dtype(df.dtypes['b'])
                assert df['b'].tolist() == [None, True, None, False]
        finally:
            # Restore the engine selection even when an assertion above fails,
            # so the override cannot leak into tests that run afterwards.
            if original_engine is None:
                _os.environ.pop('PARQUET_ENGINE', None)
            else:
                _os.environ['PARQUET_ENGINE'] = original_engine
Ejemplo n.º 2
0
def test_multipart_blob_fetch_managed():
    """Fetch a multipart blob into sandbox-managed storage and re-fetch it.

    Checks the attributes of the fetched blob, that a second fetch onto the
    same local path raises FlyteAssertion, that ``overwrite=True`` picks up a
    rewritten part, and that ``Blob.fetch`` raises FlyteAssertion once the
    LocalTestFileSystem context has been left.
    """
    with AutoDeletingTempDir('test') as work_dir:
        with test_utils.LocalTestFileSystem() as sandbox:
            _generate_multipart_blob_data(work_dir)

            fetched = blobs.MultiPartBlob.fetch(work_dir.name)
            assert fetched.local_path.startswith(sandbox.name)
            assert fetched.remote_location == work_dir.name + "/"
            assert fetched.mode == 'rb'
            blob_type = fetched.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.MULTIPART
            with fetched as parts:
                for index, text in enumerate(["part0", "part1", "part2"]):
                    assert parts[index].read() == text.encode('utf-8')

            # Fetching onto an already-populated local path must be rejected.
            with pytest.raises(_user_exceptions.FlyteAssertion):
                blobs.MultiPartBlob.fetch(work_dir.name, local_path=fetched.local_path)

            # Change the first part at the source, then force a re-fetch.
            first_part_path = os.path.join(work_dir.name, "0")
            with open(first_part_path, 'wb') as sink:
                sink.write("bye".encode('utf-8'))

            refetched = blobs.MultiPartBlob.fetch(
                work_dir.name,
                local_path=fetched.local_path,
                overwrite=True,
            )
            with refetched as parts:
                for index, text in enumerate(["bye", "part1", "part2"]):
                    assert parts[index].read() == text.encode('utf-8')

        with pytest.raises(_user_exceptions.FlyteAssertion):
            blobs.Blob.fetch(work_dir.name)
Ejemplo n.º 3
0
def test_blob_fetch_managed():
    """Fetch a single blob into sandbox-managed storage, then overwrite it.

    Checks the attributes of the fetched blob, that a second fetch onto the
    same local path raises FlyteAssertion, that ``overwrite=True`` picks up
    rewritten source content, and that a fetch after leaving the
    LocalTestFileSystem context raises FlyteAssertion.
    """
    with AutoDeletingTempDir('test') as work_dir:
        with test_utils.LocalTestFileSystem() as sandbox:
            source_path = work_dir.get_named_tempfile('tmp')
            with open(source_path, 'wb') as sink:
                sink.write("hello".encode('utf-8'))

            first = blobs.Blob.fetch(source_path)
            assert first.local_path.startswith(sandbox.name)
            assert first.remote_location == source_path
            assert first.mode == 'rb'
            blob_type = first.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE
            with first as stream:
                assert stream.read() == "hello".encode('utf-8')

            # The local copy already exists, so a plain re-fetch must fail.
            with pytest.raises(_user_exceptions.FlyteAssertion):
                blobs.Blob.fetch(source_path, local_path=first.local_path)

            with open(source_path, 'wb') as sink:
                sink.write("bye".encode('utf-8'))

            second = blobs.Blob.fetch(
                source_path,
                local_path=first.local_path,
                overwrite=True,
            )
            with second as stream:
                assert stream.read() == "bye".encode('utf-8')

        with pytest.raises(_user_exceptions.FlyteAssertion):
            blobs.Blob.fetch(source_path)
Ejemplo n.º 4
0
def test_schema_read_consistency_between_two_engines():
    """Check that fastparquet and pyarrow reads of one schema agree.

    The same written schema is fetched twice, once with ``PARQUET_ENGINE`` set
    to ``fastparquet`` and once with it set to ``pyarrow``; the resulting
    DataFrames must be equal according to ``DataFrame.equals``.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([(
                'a', _primitives.Integer), ('b', _primitives.Boolean)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [None, True, None, False]
                }))

        original_engine = _os.getenv('PARQUET_ENGINE')
        _os.environ['PARQUET_ENGINE'] = 'fastparquet'
        try:
            b = _schema_impl.Schema.fetch(
                a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

            with b as b_reader:
                b_df = b_reader.read()
                # Switch engines only after the fastparquet read has happened.
                _os.environ['PARQUET_ENGINE'] = 'pyarrow'

                c = _schema_impl.Schema.fetch(
                    a.remote_prefix, schema_type=_schema_impl.SchemaType([]))
                with c as c_reader:
                    c_df = c_reader.read()
                    assert b_df.equals(c_df)
        finally:
            # Restore the engine selection even when a read or assertion above
            # fails, so the override cannot leak into later tests.
            if original_engine is None:
                _os.environ.pop('PARQUET_ENGINE', None)
            else:
                _os.environ['PARQUET_ENGINE'] = original_engine
Ejemplo n.º 5
0
def test_datetime_coercion():
    """Write microsecond-precision datetimes with and without ms coercion.

    Five one-column records are written five times with
    ``coerce_timestamps="ms"`` and truncation allowed, then five times to a
    second writer at the same location with default write arguments. The test
    has no explicit assertions; it passes when no write raises.
    """
    anchor = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
    values = [(anchor - _datetime.timedelta(days=offset),) for offset in _six_moves.range(5)]
    datetime_schema_type = _schema_impl.SchemaType(columns=[("testname", _primitives.Datetime)])

    def build_frame():
        # A fresh DataFrame is built for every write, as in each loop pass.
        return _pd.DataFrame.from_records(values, columns=["testname"])

    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as target_dir:
            coercing = _schema_impl.Schema.create_at_known_location(
                target_dir.name, mode="wb", schema_type=datetime_schema_type
            )
            with coercing as writer:
                for _ in _six_moves.range(5):
                    # us to ms coercion segfaults unless truncation is explicitly allowed.
                    writer.write(
                        build_frame(),
                        coerce_timestamps="ms",
                        allow_truncated_timestamps=True,
                    )

                    # TODO: Uncomment when segfault bug is resolved
                    # with _pytest.raises(Exception):
                    #    writer.write(
                    #        _pd.DataFrame.from_records(values, columns=['testname']),
                    #        coerce_timestamps='ms')

            plain = _schema_impl.Schema.create_at_known_location(
                target_dir.name, mode="wb", schema_type=datetime_schema_type
            )
            with plain as writer:
                for _ in _six_moves.range(5):
                    writer.write(build_frame())
Ejemplo n.º 6
0
def test_simple_read_and_write_with_different_types(value_type_pair):
    """Round-trip one typed column through a schema writer and reader.

    ``value_type_pair`` unpacks to ``(column_name, flyte_type, values)``. The
    values are written five times, then read back chunk by chunk and again as
    one concatenated frame. ``local_path`` is expected to be ``None`` outside
    the ``with`` block and to sit under the sandbox inside it.
    """
    column_name, flyte_type, raw_values = value_type_pair
    rows = [(item,) for item in raw_values]
    column_schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _test_utils.LocalTestFileSystem() as sandbox:
        with _utils.AutoDeletingTempDir("test") as target_dir:
            sink = _schema_impl.Schema.create_at_known_location(
                target_dir.name, mode='wb', schema_type=column_schema_type)
            assert sink.local_path is None
            with sink as writer:
                for _ in _six_moves.range(5):
                    frame = _pd.DataFrame.from_records(rows, columns=[column_name])
                    writer.write(frame)
                assert sink.local_path.startswith(sandbox.name)
            assert sink.local_path is None

            source = _schema_impl.Schema.create_at_known_location(
                target_dir.name, mode='rb', schema_type=column_schema_type)
            assert source.local_path is None
            with source as reader:
                # Every chunk must reproduce the written values in order.
                for chunk in reader.iter_chunks():
                    read_back = chunk[column_name].tolist()
                    for expected_row, actual in _six_moves.zip(rows, read_back):
                        assert expected_row[0] == actual
                # After the chunks are consumed a plain read yields nothing.
                assert reader.read() is None
                reader.seek(0)
                combined = reader.read(concat=True)
                row_count = len(rows)
                for position, actual in enumerate(combined[column_name].tolist()):
                    assert rows[position % row_count][0] == actual
                assert source.local_path.startswith(sandbox.name)
            assert source.local_path is None
Ejemplo n.º 7
0
def test_normal_schema_read_with_fastparquet():
    """Read a boolean column without nulls through the fastparquet engine.

    A schema with an Integer column and a fully populated Boolean column is
    written, then fetched with an empty schema type while ``PARQUET_ENGINE``
    is set to ``fastparquet``. The boolean column is expected to keep a bool
    dtype.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Boolean)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [False, True, True, False]}))

        original_engine = _os.getenv("PARQUET_ENGINE")
        _os.environ["PARQUET_ENGINE"] = "fastparquet"
        try:
            b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

            with b as reader:
                df = reader.read()
                assert df["a"].tolist() == [1, 2, 3, 4]
                assert _pd.api.types.is_bool_dtype(df.dtypes["b"])
                assert df["b"].tolist() == [False, True, True, False]
        finally:
            # Restore the engine selection even when an assertion above fails,
            # so the override cannot leak into tests that run afterwards.
            if original_engine is None:
                _os.environ.pop("PARQUET_ENGINE", None)
            else:
                _os.environ["PARQUET_ENGINE"] = original_engine
Ejemplo n.º 8
0
def test_extra_schema_read():
    """Read a two-column dataset through a schema that declares only one.

    With ``truncate_extra_columns=False`` both stored columns are returned;
    with the default read arguments only the declared column ``a`` is.
    """
    with _test_utils.LocalTestFileSystem():
        stored_type = _schema_impl.SchemaType(
            [('a', _primitives.Integer), ('b', _primitives.Integer)])
        stored = _schema_impl.Schema.create_at_any_location(schema_type=stored_type)
        with stored as writer:
            frame = _pd.DataFrame.from_dict({
                'a': [1, 2, 3, 4],
                'b': [5, 6, 7, 8]
            })
            writer.write(frame)

        narrow_type = _schema_impl.SchemaType([('a', _primitives.Integer)])
        narrow = _schema_impl.Schema.fetch(stored.remote_prefix, schema_type=narrow_type)

        # Keep the undeclared column when truncation is switched off.
        with narrow as reader:
            everything = reader.read(concat=True, truncate_extra_columns=False)
            assert everything.columns.values.tolist() == ['a', 'b']
            assert everything['a'].tolist() == [1, 2, 3, 4]
            assert everything['b'].tolist() == [5, 6, 7, 8]

        # Default behaviour: only the declared column survives.
        with narrow as reader:
            declared_only = reader.read(concat=True)
            assert declared_only.columns.values.tolist() == ['a']
            assert declared_only['a'].tolist() == [1, 2, 3, 4]
Ejemplo n.º 9
0
def test_blob_download_managed():
    """Download a blob into sandbox-managed storage, then overwrite it.

    Checks the attributes after ``download()``, that downloading a second blob
    onto the same local path raises FlyteAssertion, that ``overwrite=True``
    picks up rewritten source content, and that ``download()`` after leaving
    the LocalTestFileSystem context raises FlyteAssertion.
    """
    with AutoDeletingTempDir("test") as work_dir:
        with test_utils.LocalTestFileSystem() as sandbox:
            source_path = work_dir.get_named_tempfile("tmp")
            with open(source_path, "wb") as sink:
                sink.write("hello".encode("utf-8"))

            first = blobs.Blob(source_path)
            first.download()
            assert first.local_path.startswith(sandbox.name)
            assert first.remote_location == source_path
            assert first.mode == "rb"
            blob_type = first.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE
            with first as stream:
                assert stream.read() == "hello".encode("utf-8")

            # A second blob may not download onto the occupied local path.
            blocked = blobs.Blob(source_path)
            with pytest.raises(_user_exceptions.FlyteAssertion):
                blocked.download(first.local_path)

            with open(source_path, "wb") as sink:
                sink.write("bye".encode("utf-8"))

            replacement = blobs.Blob(source_path)
            replacement.download(local_path=first.local_path, overwrite=True)
            with replacement as stream:
                assert stream.read() == "bye".encode("utf-8")

        unmanaged = blobs.Blob(source_path)
        with pytest.raises(_user_exceptions.FlyteAssertion):
            unmanaged.download()
Ejemplo n.º 10
0
def execution_data_locations():
    """Yield UrlBlobs for serialized input and output maps.

    ``_INPUT_MAP`` and ``_OUTPUT_MAP`` are written as protobuf files named
    ``inputs.pb`` and ``outputs.pb`` inside a LocalTestFileSystem, which stays
    open while the consumer uses the yielded ``(inputs, outputs)`` pair.
    """
    with test_utils.LocalTestFileSystem() as sandbox:
        inputs_path = sandbox.get_named_tempfile("inputs.pb")
        outputs_path = sandbox.get_named_tempfile("outputs.pb")
        for literal_map, destination in ((_INPUT_MAP, inputs_path), (_OUTPUT_MAP, outputs_path)):
            utils.write_proto_to_file(literal_map.to_flyte_idl(), destination)
        inputs_blob = _common_models.UrlBlob(inputs_path, 100)
        outputs_blob = _common_models.UrlBlob(outputs_path, 100)
        yield (inputs_blob, outputs_blob)
Ejemplo n.º 11
0
def test_generic_schema():
    """Instantiate a schema with no declared columns and check its defaults."""
    with test_utils.LocalTestFileSystem() as sandbox:
        make_schema = schema.schema_instantiator()
        instance = make_schema()
        assert isinstance(instance, schema_impl.Schema)
        assert instance.mode == "wb"
        # No columns were passed to the instantiator, so none are expected.
        declared_columns = instance.type.columns
        assert len(declared_columns) == 0
        assert instance.remote_location.startswith(sandbox.name)
Ejemplo n.º 12
0
def test_blob_double_enter():
    """Entering a blob that is already entered must raise FlyteAssertion."""
    with test_utils.LocalTestFileSystem():
        with AutoDeletingTempDir('test') as work_dir:
            sink_path = work_dir.get_named_tempfile("sink")
            blob = blobs.Blob(sink_path, mode='wb')
            with blob:
                # The nested entry is the operation under test.
                with pytest.raises(_user_exceptions.FlyteAssertion):
                    with blob:
                        pass
Ejemplo n.º 13
0
def test_typed_schema():
    """Instantiate a schema declared with ``_ALL_COLUMN_TYPES`` and check it."""
    with test_utils.LocalTestFileSystem() as sandbox:
        make_schema = schema.schema_instantiator(_ALL_COLUMN_TYPES)
        instance = make_schema()
        assert isinstance(instance, schema_impl.Schema)
        assert instance.mode == "wb"
        schema_type = instance.type
        assert len(schema_type.columns) == len(_ALL_COLUMN_TYPES)
        # The SDK column mapping must list the declared pairs in order.
        sdk_pairs = list(schema_type.sdk_columns.items())
        assert sdk_pairs == _ALL_COLUMN_TYPES
        assert instance.remote_location.startswith(sandbox.name)
Ejemplo n.º 14
0
def test_generic_schema_read():
    """Read a two-column dataset through a schema with no declared columns."""
    with _test_utils.LocalTestFileSystem():
        stored_type = _schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])
        stored = _schema_impl.Schema.create_at_any_location(schema_type=stored_type)
        with stored as writer:
            frame = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
            writer.write(frame)

        generic = _schema_impl.Schema.fetch(
            stored.remote_prefix,
            schema_type=_schema_impl.SchemaType([]),
        )
        with generic as reader:
            result = reader.read()
            assert result.columns.values.tolist() == ["a", "b"]
            expected_columns = (("a", [1, 2, 3, 4]), ("b", [5, 6, 7, 8]))
            for name, expected in expected_columns:
                assert result[name].tolist() == expected
Ejemplo n.º 15
0
def test_create_at_known_location():
    """Create a schema at a fixed directory and read the written parquet file.

    After one write, the file ``000000`` inside the target directory is read
    directly with pandas and compared against the written data.
    """
    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as target_dir:
            created = _schema_impl.Schema.create_at_known_location(
                target_dir.name,
                schema_type=_schema_impl.SchemaType(),
            )
            assert created.local_path is None
            assert created.remote_location == target_dir.name + "/"
            assert created.mode == "wb"

            with created as writer:
                frame = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
                writer.write(frame)

            first_chunk_path = _os.path.join(target_dir.name, "000000")
            stored = _pd.read_parquet(first_chunk_path)
            assert list(stored["a"]) == [1, 2, 3, 4]
            assert list(stored["b"]) == [5, 6, 7, 8]
Ejemplo n.º 16
0
def test_casting():
    """Cast a schema with a Datetime-subclass column to plain Datetime.

    The test has no explicit assertions; it passes when ``cast_to`` does not
    raise.
    """
    class MyDateTime(primitives.Datetime):
        pass

    with test_utils.LocalTestFileSystem():
        subclass_columns = [('altered', MyDateTime)]
        base_columns = [('altered', primitives.Datetime)]

        make_subclass_schema = schema.schema_instantiator(subclass_columns)
        source = make_subclass_schema()

        make_base_schema = schema.schema_instantiator(base_columns)

        source.cast_to(make_base_schema._schema_type)
Ejemplo n.º 17
0
def test_blob_create_at():
    """Create a blob at a known location, write to it, and read the file back."""
    with test_utils.LocalTestFileSystem() as sandbox:
        with AutoDeletingTempDir('test') as work_dir:
            target_path = work_dir.get_named_tempfile('tmp')
            created = blobs.Blob.create_at_known_location(target_path)
            assert created.local_path is None
            assert created.remote_location == target_path
            assert created.mode == 'wb'
            blob_type = created.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE
            with created as sink:
                sink.write("hello hello".encode('utf-8'))

            # After the write, local_path is populated and sits in the sandbox.
            assert created.local_path.startswith(sandbox.name)
            with open(target_path, 'rb') as stream:
                stored = stream.read()
            assert stored == "hello hello".encode('utf-8')
Ejemplo n.º 18
0
def test_download(value_type_pair):
    """Download a schema from a directory of parquet files and read it back.

    ``value_type_pair`` unpacks to ``(column_name, flyte_type, values)``. Three
    parquet files are written directly with pandas. The schema is downloaded
    to an explicit local path, then without a path outside a
    LocalTestFileSystem (expected to raise), then without a path inside one.
    """
    column_name, flyte_type, raw_values = value_type_pair
    rows = [(item,) for item in raw_values]
    column_schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    def check_reader(reader):
        # Chunked read first: every chunk must match the written values.
        for chunk in reader.iter_chunks():
            read_back = chunk[column_name].tolist()
            for expected_row, actual in _six_moves.zip(rows, read_back):
                assert expected_row[0] == actual
        # After the chunks are consumed a plain read yields nothing.
        assert reader.read() is None
        reader.seek(0)
        combined = reader.read(concat=True)
        for position, actual in enumerate(combined[column_name].tolist()):
            assert rows[position % len(rows)][0] == actual

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        for file_index in _six_moves.range(3):
            chunk_path = tmpdir.get_named_tempfile(str(file_index).zfill(6))
            frame = _pd.DataFrame.from_records(rows, columns=[column_name])
            frame.to_parquet(chunk_path, coerce_timestamps='us')

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            explicit = _schema_impl.Schema(tmpdir.name, schema_type=column_schema_type)
            explicit.download(local_dir.get_named_tempfile(_uuid.uuid4().hex))
            with explicit as reader:
                check_reader(reader)

        # No explicit path and no active sandbox: the download must fail.
        with _pytest.raises(Exception):
            unmanaged = _schema_impl.Schema(tmpdir.name, schema_type=column_schema_type)
            unmanaged.download()

        with _test_utils.LocalTestFileSystem():
            managed = _schema_impl.Schema(tmpdir.name, schema_type=column_schema_type)
            managed.download()
            with managed as reader:
                check_reader(reader)
Ejemplo n.º 19
0
def test_multipartblob():
    """Exercise the three ways of constructing the MultiPartBlob type.

    Covers a no-argument call, wrapping an existing blob, and ``from_string``.
    """
    with test_utils.LocalTestFileSystem() as sandbox:
        fresh = blobs.MultiPartBlob()
        assert isinstance(fresh, blob_impl.MultiPartBlob)
        assert fresh.remote_location.startswith(sandbox.name)
        assert fresh.mode == "wb"
        assert fresh.metadata.type.format == ""

        wrapped = blobs.MultiPartBlob(fresh)
        assert isinstance(wrapped, blobs.MultiPartBlob)
        wrapped_blob = wrapped.scalar.blob
        assert wrapped_blob.uri == fresh.remote_location
        assert wrapped_blob.metadata == fresh.metadata

        # The expected URI carries a trailing slash the input did not have.
        parsed = blobs.MultiPartBlob.from_string("/a/b/c")
        assert isinstance(parsed, blobs.MultiPartBlob)
        parsed_blob = parsed.scalar.blob
        assert parsed_blob.uri == "/a/b/c/"
        assert parsed_blob.metadata.type.format == ""
Ejemplo n.º 20
0
def test_csv():
    """Exercise the three ways of constructing the CSV blob type.

    Covers a no-argument call, wrapping an existing blob, and ``from_string``;
    each result is expected to carry the ``csv`` format.
    """
    with test_utils.LocalTestFileSystem() as sandbox:
        fresh = blobs.CSV()
        assert isinstance(fresh, blob_impl.Blob)
        assert fresh.remote_location.startswith(sandbox.name)
        assert fresh.mode == "w"
        assert fresh.metadata.type.format == "csv"

        wrapped = blobs.CSV(fresh)
        assert isinstance(wrapped, blobs.Blob)
        wrapped_blob = wrapped.scalar.blob
        assert wrapped_blob.uri == fresh.remote_location
        assert wrapped_blob.metadata == fresh.metadata

        parsed = blobs.CSV.from_string("/a/b/c")
        assert isinstance(parsed, blobs.Blob)
        parsed_blob = parsed.scalar.blob
        assert parsed_blob.uri == "/a/b/c"
        assert parsed_blob.metadata.type.format == "csv"
Ejemplo n.º 21
0
def test_blob_from_python_std():
    """Build a Blob from a local path, from another Blob, and from bad input.

    A path yields a blob whose remote location is in the sandbox and holds the
    file's content; an existing Blob yields an equal blob; an integer raises
    FlyteTypeException.
    """
    with test_utils.LocalTestFileSystem() as sandbox:
        with AutoDeletingTempDir('test') as work_dir:
            source_path = work_dir.get_named_tempfile("from_python_std")
            with open(source_path, 'wb') as sink:
                sink.write("hello hello".encode('utf-8'))
            uploaded = blobs.Blob.from_python_std(source_path)
            assert uploaded.mode == "wb"
            blob_type = uploaded.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE
            assert uploaded.remote_location.startswith(sandbox.name)
            assert uploaded.local_path == source_path
            with open(uploaded.remote_location, 'rb') as stream:
                stored = stream.read()
            assert stored == "hello hello".encode('utf-8')

    existing = blobs.Blob("/tmp/fake")
    passthrough = blobs.Blob.from_python_std(existing)
    assert existing == passthrough

    with pytest.raises(_user_exceptions.FlyteTypeException):
        blobs.Blob.from_python_std(3)
Ejemplo n.º 22
0
def test_partial_column_read():
    """Read only a requested subset of the declared columns.

    A two-column schema is written and fetched with both columns declared,
    then read with ``columns=['b']``; only column ``b`` is expected back.
    """
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([(
                'a', _primitives.Integer), ('b', _primitives.Integer)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [5, 6, 7, 8]
                }))

        b = _schema_impl.Schema.fetch(a.uri,
                                      schema_type=_schema_impl.SchemaType([
                                          ('a', _primitives.Integer),
                                          ('b', _primitives.Integer)
                                      ]))
        with b as reader:
            df = reader.read(columns=['b'])
            # Compare plain lists: asserting on a numpy element-wise comparison
            # raises an ambiguous-truth ValueError instead of a clean assertion
            # failure when more than one column comes back.
            assert df.columns.values.tolist() == ['b']
            assert df['b'].tolist() == [5, 6, 7, 8]
Ejemplo n.º 23
0
def test_multipart_blob_create_at():
    """Create a multipart blob at a known directory and write three parts.

    Each part is then read straight from the target directory and compared
    with what was written.
    """
    part_contents = [('0', "part0"), ('1', "part1"), ('2', "part2")]

    with test_utils.LocalTestFileSystem():
        with AutoDeletingTempDir('test') as work_dir:
            created = blobs.MultiPartBlob.create_at_known_location(work_dir.name)
            assert created.local_path is None
            assert created.remote_location == work_dir.name + "/"
            assert created.mode == 'wb'
            blob_type = created.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.MULTIPART

            # Write every part before verifying any of them.
            for part_name, text in part_contents:
                with created.create_part(part_name) as sink:
                    sink.write(text.encode('utf-8'))

            for part_name, text in part_contents:
                with open(os.path.join(work_dir.name, part_name), 'rb') as stream:
                    assert stream.read() == text.encode('utf-8')
Ejemplo n.º 24
0
def test_multipart_blob_from_python_std():
    """Build a MultiPartBlob from a directory, another blob, and bad input.

    A directory yields a blob whose remote location is in the sandbox and
    holds all three parts; an existing MultiPartBlob yields an equal blob; an
    integer raises FlyteTypeException.
    """
    with test_utils.LocalTestFileSystem() as sandbox:
        with AutoDeletingTempDir('test') as work_dir:
            _generate_multipart_blob_data(work_dir)
            uploaded = blobs.MultiPartBlob.from_python_std(work_dir.name)
            assert uploaded.mode == "wb"
            blob_type = uploaded.metadata.type
            assert blob_type.format == ""
            assert blob_type.dimensionality == _core_types.BlobType.BlobDimensionality.MULTIPART
            assert uploaded.remote_location.startswith(sandbox.name)
            assert uploaded.local_path == work_dir.name
            for part_name, text in (('0', "part0"), ('1', "part1"), ('2', "part2")):
                part_path = os.path.join(uploaded.remote_location, part_name)
                with open(part_path, 'rb') as stream:
                    assert stream.read() == text.encode('utf-8')

    existing = blobs.MultiPartBlob("/tmp/fake/")
    passthrough = blobs.MultiPartBlob.from_python_std(existing)
    assert existing == passthrough

    with pytest.raises(_user_exceptions.FlyteTypeException):
        blobs.MultiPartBlob.from_python_std(3)
Ejemplo n.º 25
0
def test_from_python_std():
    """Exercise ``Schema.from_python_std`` with several input shapes.

    Covers a single DataFrame, a list of DataFrames, a list mixing a DataFrame
    with a plain list (expected to raise FlyteTypeException), and an empty
    list (expected to read back as ``None``).
    """
    with _test_utils.LocalTestFileSystem():

        def single_dataframe():
            # One DataFrame round-trips with the same columns and values.
            df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
            s = _schema_impl.Schema.from_python_std(
                t_value=df1,
                schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            assert s is not None
            n = _schema_impl.Schema.fetch(
                s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            with n as reader:
                df2 = reader.read()
                # Compare the column names themselves. Comparing the result of
                # ``.all()`` on each side only compares two reductions and
                # does not check the columns element by element.
                assert df2.columns.values.tolist() == df1.columns.values.tolist()
                assert df2["b"].tolist() == df1["b"].tolist()

        def list_of_dataframes():
            # Each DataFrame in the list is expected back as a chunk, in order.
            df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
            df2 = _pd.DataFrame.from_dict({"a": [9, 10, 11, 12], "b": [13, 14, 15, 16]})
            s = _schema_impl.Schema.from_python_std(
                t_value=[df1, df2],
                schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            assert s is not None
            n = _schema_impl.Schema.fetch(
                s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            with n as reader:
                actual = []
                for df in reader.iter_chunks():
                    assert df.columns.values.tolist() == df1.columns.values.tolist()
                    actual.extend(df["b"].tolist())
                b_val = df1["b"].tolist()
                b_val.extend(df2["b"].tolist())
                assert actual == b_val

        def mixed_list():
            # A list containing a non-DataFrame element must be rejected.
            df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
            df2 = [1, 2, 3]
            with _pytest.raises(_user_exceptions.FlyteTypeException):
                _schema_impl.Schema.from_python_std(
                    t_value=[df1, df2],
                    schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
                )

        def empty_list():
            # An empty list still produces a schema, but reading it gives None.
            s = _schema_impl.Schema.from_python_std(
                t_value=[],
                schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            assert s is not None
            n = _schema_impl.Schema.fetch(
                s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
            )
            with n as reader:
                df = reader.read()
                assert df is None

        single_dataframe()
        mixed_list()
        empty_list()
        list_of_dataframes()
Ejemplo n.º 26
0
def test_multipart_blob_no_enter_on_write():
    """Entering a MultiPartBlob from ``create_at_any_location`` must raise FlyteAssertion."""
    with test_utils.LocalTestFileSystem():
        writable = blobs.MultiPartBlob.create_at_any_location()
        # The context-manager entry is the operation under test.
        with pytest.raises(_user_exceptions.FlyteAssertion):
            with writable:
                pass
Ejemplo n.º 27
0
def test_hive_queries(monkeypatch):
    """Compare generated Hive queries against expected text.

    The ``_uuid`` attribute of ``_schema_impl`` is replaced with a stub whose
    ``uuid4().hex`` is always ``test_uuid``, so generated table names are
    deterministic. Queries are compared after collapsing whitespace runs.
    """
    class FakeUUID4(object):
        hex = 'test_uuid'

    class Uuid(object):
        def uuid4(self):
            return FakeUUID4()

    monkeypatch.setattr(_schema_impl, '_uuid', Uuid())

    def collapse(text):
        # Normalize whitespace so layout differences do not affect equality.
        return " ".join(text.split())

    column_types = _schema_impl.SchemaType([
        ('a', _primitives.Integer),
        ('b', _primitives.String),
        ('c', _primitives.Float),
        ('d', _primitives.Boolean),
        ('e', _primitives.Datetime),
    ])

    with _test_utils.LocalTestFileSystem():
        hive_schema, generated = _schema_impl.Schema.create_from_hive_query(
            "SELECT a, b, c, d, e FROM some_place WHERE i = 0",
            stage_query="CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original",
            known_location="s3://my_fixed_path/",
            schema_type=column_types,
        )

        expected = """
        CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original;
        CREATE TEMPORARY TABLE test_uuid_tmp AS SELECT a, b, c, d, e FROM some_place WHERE i = 0;
        CREATE EXTERNAL TABLE test_uuid LIKE test_uuid_tmp STORED AS PARQUET;
        ALTER TABLE test_uuid SET LOCATION 's3://my_fixed_path/';
        INSERT OVERWRITE TABLE test_uuid
            SELECT
                a as a,
            b as b,
            CAST(c as double) c,
            d as d,
            e as e
            FROM test_uuid_tmp;
        DROP TABLE test_uuid;
        """
        assert collapse(generated) == collapse(expected)

        # Test adding partition
        expected = """
        ALTER TABLE some_table ADD IF NOT EXISTS PARTITION (
            region = 'SEA',
            ds = '2017-01-01'
        ) LOCATION 's3://my_fixed_path/';
        ALTER TABLE some_table PARTITION (
            region = 'SEA',
            ds = '2017-01-01'
        ) SET LOCATION 's3://my_fixed_path/';
        """
        partition_spec = _collections.OrderedDict([('region', 'SEA'), ('ds', '2017-01-01')])
        generated = hive_schema.get_write_partition_to_hive_table_query(
            'some_table',
            partitions=partition_spec,
        )
        assert collapse(generated) == collapse(expected)