Example #1
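Writes integer and boolean columns, then reads them back with the fastparquet engine and verifies the boolean column keeps a bool dtype.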
def test_normal_schema_read_with_fastparquet():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Boolean)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [False, True, True, False]}))

        import os as _os

        # Select the fastparquet engine for the read path, saving the prior setting.
        original_engine = _os.getenv("PARQUET_ENGINE")
        _os.environ["PARQUET_ENGINE"] = "fastparquet"

        b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

        with b as reader:
            df = reader.read()
            assert df["a"].tolist() == [1, 2, 3, 4]
            assert _pd.api.types.is_bool_dtype(df.dtypes["b"])
            assert df["b"].tolist() == [False, True, True, False]

        # Restore whatever engine was configured before the test.
        if original_engine is None:
            del _os.environ["PARQUET_ENGINE"]
        else:
            _os.environ["PARQUET_ENGINE"] = original_engine
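The save/set/restore dance around PARQUET_ENGINE recurs in several of these examples. A minimal context-manager sketch of the pattern (a hypothetical helper, not part of flytekit's API) could factor it out:

import contextlib
import os


@contextlib.contextmanager
def parquet_engine(engine):
    # Temporarily select a parquet engine; restore the previous setting on exit.
    original = os.environ.get("PARQUET_ENGINE")
    os.environ["PARQUET_ENGINE"] = engine
    try:
        yield
    finally:
        if original is None:
            # The variable was unset before entering; remove it again.
            os.environ.pop("PARQUET_ENGINE", None)
        else:
            os.environ["PARQUET_ENGINE"] = original

With such a helper, the body of this test would reduce to a single with parquet_engine("fastparquet"): block around the fetch and read.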
Example #2
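Reads a two-column schema through a narrower declared type: the extra column survives with truncate_extra_columns=False and is dropped on the default read.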
def test_extra_schema_read():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType(
                [('a', _primitives.Integer), ('b', _primitives.Integer)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [5, 6, 7, 8]
                }))

        b = _schema_impl.Schema.fetch(a.remote_prefix,
                                      schema_type=_schema_impl.SchemaType([
                                          ('a', _primitives.Integer)
                                      ]))
        with b as reader:
            df = reader.read(concat=True, truncate_extra_columns=False)
            assert df.columns.values.tolist() == ['a', 'b']
            assert df['a'].tolist() == [1, 2, 3, 4]
            assert df['b'].tolist() == [5, 6, 7, 8]

        with b as reader:
            df = reader.read(concat=True)
            assert df.columns.values.tolist() == ['a']
            assert df['a'].tolist() == [1, 2, 3, 4]
Example #3
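Reads the same data once with fastparquet and once with pyarrow and asserts the two engines produce identical dataframes.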
def test_schema_read_consistency_between_two_engines():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType(
                [('a', _primitives.Integer), ('b', _primitives.Boolean)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [None, True, None, False]
                }))

        import os as _os
        original_engine = _os.getenv('PARQUET_ENGINE')
        _os.environ['PARQUET_ENGINE'] = 'fastparquet'

        b = _schema_impl.Schema.fetch(a.remote_prefix,
                                      schema_type=_schema_impl.SchemaType([]))

        with b as b_reader:
            b_df = b_reader.read()
            _os.environ['PARQUET_ENGINE'] = 'pyarrow'

            c = _schema_impl.Schema.fetch(
                a.remote_prefix, schema_type=_schema_impl.SchemaType([]))
            with c as c_reader:
                c_df = c_reader.read()
                assert b_df.equals(c_df)

        if original_engine is None:
            del _os.environ['PARQUET_ENGINE']
        else:
            _os.environ['PARQUET_ENGINE'] = original_engine
Example #4
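With nullable booleans in the data, the fastparquet read promotes column b to object dtype while preserving the values.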
def test_type_promoted_schema_read_with_fastparquet():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType(
                [('a', _primitives.Integer), ('b', _primitives.Boolean)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [None, True, None, False]
                }))

        import os as _os
        original_engine = _os.getenv('PARQUET_ENGINE')
        _os.environ['PARQUET_ENGINE'] = 'fastparquet'

        b = _schema_impl.Schema.fetch(a.remote_prefix,
                                      schema_type=_schema_impl.SchemaType([]))

        with b as reader:
            df = reader.read()
            assert df['a'].tolist() == [1, 2, 3, 4]
            assert _pd.api.types.is_object_dtype(df.dtypes['b'])
            assert df['b'].tolist() == [None, True, None, False]

        if original_engine is None:
            del _os.environ['PARQUET_ENGINE']
        else:
            _os.environ['PARQUET_ENGINE'] = original_engine
Example #5
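Exercises the SchemaType constructor: empty and typed column lists succeed, while a dict, a three-element tuple, a non-string column name, and a non-primitive column type all raise.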
def test_schema_type():
    _schema_impl.SchemaType()
    _schema_impl.SchemaType([])
    _schema_impl.SchemaType(
        [
            ("a", _primitives.Integer),
            ("b", _primitives.String),
            ("c", _primitives.Float),
            ("d", _primitives.Boolean),
            ("e", _primitives.Datetime),
        ]
    )

    with _pytest.raises(ValueError):
        _schema_impl.SchemaType({"a": _primitives.Integer})

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([("a", _blobs.Blob)])

    with _pytest.raises(ValueError):
        _schema_impl.SchemaType([("a", _primitives.Integer, 1)])

    # A numeric string is still a valid column name.
    _schema_impl.SchemaType([("1", _primitives.Integer)])

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([(1, _primitives.Integer)])

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([("1", [_primitives.Integer])])
Example #6
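Serializing an empty list produces a valid schema whose reader returns None.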
def empty_list():
    s = _schema_impl.Schema.from_python_std(
        t_value=[],
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    assert s is not None
    n = _schema_impl.Schema.fetch(
        s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    with n as reader:
        df = reader.read()
        assert df is None
Example #7
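Fetching with an empty SchemaType reads back every column generically.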
def test_generic_schema_read():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}))

        b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))
        with b as reader:
            df = reader.read()
            assert df.columns.values.tolist() == ["a", "b"]
            assert df["a"].tolist() == [1, 2, 3, 4]
            assert df["b"].tolist() == [5, 6, 7, 8]
Example #8
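Round-trips a single dataframe through from_python_std and fetch.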
def single_dataframe():
    df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    s = _schema_impl.Schema.from_python_std(
        t_value=df1,
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    assert s is not None
    n = _schema_impl.Schema.fetch(
        s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    with n as reader:
        df2 = reader.read()
        # Compare column names element-wise; .all() on both sides would only
        # compare truthiness.
        assert df2.columns.values.tolist() == df1.columns.values.tolist()
        assert df2["b"].tolist() == df1["b"].tolist()
Example #9
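Writes three parquet files into one directory, fetches them as a single schema, and reads them both chunk by chunk and concatenated.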
def test_fetch(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        for i in _six_moves.range(3):
            _pd.DataFrame.from_records(values, columns=[column_name]).to_parquet(
                tmpdir.get_named_tempfile(str(i).zfill(6)), coerce_timestamps='us')

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema.fetch(
                tmpdir.name,
                local_path=local_dir.get_named_tempfile('schema_test'),
                schema_type=schema_type)
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(
                            values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                # Chunk iteration exhausted the reader; rewind before re-reading.
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual
Example #10
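A write/read round-trip parameterized over column types; local_path is only populated while the schema's context manager is open.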
def test_simple_read_and_write_with_different_types(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _test_utils.LocalTestFileSystem() as sandbox:
        with _utils.AutoDeletingTempDir("test") as t:
            a = _schema_impl.Schema.create_at_known_location(
                t.name, mode='wb', schema_type=schema_type)
            assert a.local_path is None
            with a as writer:
                for _ in _six_moves.range(5):
                    writer.write(
                        _pd.DataFrame.from_records(values,
                                                   columns=[column_name]))
                assert a.local_path.startswith(sandbox.name)
            assert a.local_path is None

            b = _schema_impl.Schema.create_at_known_location(
                t.name, mode='rb', schema_type=schema_type)
            assert b.local_path is None
            with b as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(
                            values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual
                assert b.local_path.startswith(sandbox.name)
            assert b.local_path is None
Example #11
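Microsecond-to-millisecond timestamp coercion requires allow_truncated_timestamps=True; a plain write with default settings also succeeds.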
def test_datetime_coercion():
    values = [
        tuple(
            [
                _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
                - _datetime.timedelta(days=x)
            ]
        )
        for x in _six_moves.range(5)
    ]
    schema_type = _schema_impl.SchemaType(columns=[("testname", _primitives.Datetime)])

    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as t:
            a = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with a as writer:
                for _ in _six_moves.range(5):
                    # us to ms coercion segfaults unless we explicitly allow truncation.
                    writer.write(
                        _pd.DataFrame.from_records(values, columns=["testname"]),
                        coerce_timestamps="ms",
                        allow_truncated_timestamps=True,
                    )

                    # TODO: Uncomment when segfault bug is resolved
                    # with _pytest.raises(Exception):
                    #    writer.write(
                    #        _pd.DataFrame.from_records(values, columns=['testname']),
                    #        coerce_timestamps='ms')

            b = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with b as writer:
                for _ in _six_moves.range(5):
                    writer.write(_pd.DataFrame.from_records(values, columns=["testname"]))
Example #12
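Mixing a dataframe with a plain list in one schema value raises FlyteTypeException.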
def mixed_list():
    df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    df2 = [1, 2, 3]
    with _pytest.raises(_user_exceptions.FlyteTypeException):
        _schema_impl.Schema.from_python_std(
            t_value=[df1, df2],
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
        )
Example #13
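A list of dataframes is written as multiple chunks and read back in write order.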
def list_of_dataframes():
    df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    df2 = _pd.DataFrame.from_dict({"a": [9, 10, 11, 12], "b": [13, 14, 15, 16]})
    s = _schema_impl.Schema.from_python_std(
        t_value=[df1, df2],
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    assert s is not None
    n = _schema_impl.Schema.fetch(
        s.uri, schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    with n as reader:
        actual = []
        for df in reader.iter_chunks():
            # Compare column names element-wise rather than reducing with .all().
            assert df.columns.values.tolist() == df1.columns.values.tolist()
            actual.extend(df["b"].tolist())
        b_val = df1["b"].tolist()
        b_val.extend(df2["b"].tolist())
        assert actual == b_val
Example #14
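read(columns=['b']) projects a single column out of a two-column schema.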
def test_partial_column_read():
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType(
                [('a', _primitives.Integer), ('b', _primitives.Integer)]))
        with a as writer:
            writer.write(
                _pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4],
                    'b': [5, 6, 7, 8]
                }))

        b = _schema_impl.Schema.fetch(a.uri,
                                      schema_type=_schema_impl.SchemaType([
                                          ('a', _primitives.Integer),
                                          ('b', _primitives.Integer)
                                      ]))
        with b as reader:
            df = reader.read(columns=['b'])
            assert df.columns.values.tolist() == ['b']
            assert df['b'].tolist() == [5, 6, 7, 8]
Example #15
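create_at_known_location stays remote-only outside the context manager and produces a parquet file readable with plain pandas.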
def test_create_at_known_location():
    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as wd:
            b = _schema_impl.Schema.create_at_known_location(wd.name, schema_type=_schema_impl.SchemaType())
            assert b.local_path is None
            assert b.remote_location == wd.name + "/"
            assert b.mode == "wb"

            with b as w:
                w.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}))

            df = _pd.read_parquet(_os.path.join(wd.name, "000000"))
            assert list(df["a"]) == [1, 2, 3, 4]
            assert list(df["b"]) == [5, 6, 7, 8]
Example #16
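The same SchemaType constructor checks as Example #5, written with single-quoted strings.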
def test_schema_type():
    _schema_impl.SchemaType()
    _schema_impl.SchemaType([])
    _schema_impl.SchemaType([('a', _primitives.Integer),
                             ('b', _primitives.String),
                             ('c', _primitives.Float),
                             ('d', _primitives.Boolean),
                             ('e', _primitives.Datetime)])

    with _pytest.raises(ValueError):
        _schema_impl.SchemaType({'a': _primitives.Integer})

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([('a', _blobs.Blob)])

    with _pytest.raises(ValueError):
        _schema_impl.SchemaType([('a', _primitives.Integer, 1)])

    # A numeric string is still a valid column name.
    _schema_impl.SchemaType([('1', _primitives.Integer)])

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([(1, _primitives.Integer)])

    with _pytest.raises(TypeError):
        _schema_impl.SchemaType([('1', [_primitives.Integer])])
Example #17
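Exercises download() three ways: to an explicit path, with no path outside a sandbox (which raises), and with no path inside a LocalTestFileSystem.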
def test_download(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        for i in _six_moves.range(3):
            _pd.DataFrame.from_records(values, columns=[column_name]).to_parquet(
                tmpdir.get_named_tempfile(str(i).zfill(6)), coerce_timestamps="us"
            )

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download(local_dir.get_named_tempfile(_uuid.uuid4().hex))
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual

        with _pytest.raises(Exception):
            # With no destination and no sandbox filesystem in scope,
            # download() has nowhere to write.
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()

        with _test_utils.LocalTestFileSystem():
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual
Example #18
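A Schema subclass produced by a metaclass factory, pinning the column set at class-creation time; columns comes from the enclosing scope.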
class _Schema(Schema, metaclass=SchemaInstantiator):
    _schema_type = _schema_impl.SchemaType(columns=columns)
Example #19
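The same pattern written with six.with_metaclass for Python 2/3 compatibility.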
class _Schema(_six.with_metaclass(SchemaInstantiator, Schema)):
    _schema_type = _schema_impl.SchemaType(columns=columns)
Example #20
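Patches uuid4 to return a deterministic value, then checks the exact text of the generated Hive staging query and the add-partition query.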
def test_hive_queries(monkeypatch):
    def return_deterministic_uuid():
        class FakeUUID4(object):
            def __init__(self):
                self.hex = 'test_uuid'

        class Uuid(object):
            def uuid4(self):
                return FakeUUID4()

        return Uuid()

    monkeypatch.setattr(_schema_impl, '_uuid', return_deterministic_uuid())

    all_types = _schema_impl.SchemaType([('a', _primitives.Integer),
                                         ('b', _primitives.String),
                                         ('c', _primitives.Float),
                                         ('d', _primitives.Boolean),
                                         ('e', _primitives.Datetime)])

    with _test_utils.LocalTestFileSystem():
        df, query = _schema_impl.Schema.create_from_hive_query(
            "SELECT a, b, c, d, e FROM some_place WHERE i = 0",
            stage_query="CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original",
            known_location="s3://my_fixed_path/",
            schema_type=all_types)

        full_query = """
        CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original;
        CREATE TEMPORARY TABLE test_uuid_tmp AS SELECT a, b, c, d, e FROM some_place WHERE i = 0;
        CREATE EXTERNAL TABLE test_uuid LIKE test_uuid_tmp STORED AS PARQUET;
        ALTER TABLE test_uuid SET LOCATION 's3://my_fixed_path/';
        INSERT OVERWRITE TABLE test_uuid
            SELECT
                a as a,
                b as b,
                CAST(c as double) c,
                d as d,
                e as e
            FROM test_uuid_tmp;
        DROP TABLE test_uuid;
        """
        full_query = " ".join(full_query.split())
        query = " ".join(query.split())
        assert query == full_query

        # Test adding partition
        full_query = """
        ALTER TABLE some_table ADD IF NOT EXISTS PARTITION (
            region = 'SEA',
            ds = '2017-01-01'
        ) LOCATION 's3://my_fixed_path/';
        ALTER TABLE some_table PARTITION (
            region = 'SEA',
            ds = '2017-01-01'
        ) SET LOCATION 's3://my_fixed_path/';
        """
        query = df.get_write_partition_to_hive_table_query(
            'some_table',
            partitions=_collections.OrderedDict([('region', 'SEA'),
                                                 ('ds', '2017-01-01')]))
        full_query = " ".join(full_query.split())
        query = " ".join(query.split())
        assert query == full_query