Example #1
0
def test_formats_serdes():
    """Round-trip a few plain objects and a DataFrame through FormatRegistry."""
    samples = (
        {'blah': 'foo'},
        b'blather',
        'blip',
    )

    # Each sample starts from its own empty metadata dict. The format metadata
    # returned by serialize() is merged in before the data is read back.
    for sample in samples:
        sample_meta = {}
        payload, fmt_meta = FormatRegistry.serialize(sample, sample_meta)
        sample_meta.update(fmt_meta)
        restored = FormatRegistry.deserialize(payload, sample_meta)
        assert restored == sample

    frame_meta = {}
    source_frame = pd.DataFrame([[1, 2], [3, 4]])
    payload, fmt_meta = FormatRegistry.serialize(source_frame, frame_meta)
    frame_meta.update(fmt_meta)
    restored_frame = FormatRegistry.deserialize(payload, frame_meta)

    # There is no clean way around this: when a header is written and the
    # header names are numeric, loading from CSV yields string names instead.
    # That would break the comparison, so the labels are cast back to int.
    restored_frame.columns = restored_frame.columns.astype(int, copy=False)

    assert source_frame.equals(restored_frame)
Example #2
0
def test_formats_for_format():
    """Fetch handlers by format name and run both against one byte payload."""
    bytes_handler = FormatRegistry.for_format('bytes')[0]
    json_handler = FormatRegistry.for_format('json')[0]

    payload = b'["phlipper", "piglet"]'

    # The 'bytes' handler is expected to pass the payload through unchanged.
    serialized = bytes_handler.serialize(payload)[0]
    assert serialized == payload

    # The 'json' handler is expected to parse the same bytes into a list.
    decoded = json_handler.deserialize(payload)
    assert decoded == ['phlipper', 'piglet']
Example #3
0
def test_formats_csv_read():
    """Load the CSV fixture next to this file and compare it with known bytes."""
    csv_meta = {'format': {'name': 'csv'}}
    expected_bytes = b'a,b,c,d\n1,2,3,4\n5,6,7,8\n'
    fixture_path = pathlib.Path(__file__).parent / 'data' / 'csv.csv'

    # Build the reference frame from the literal bytes, then the frame under
    # test from the fixture file's contents, using the same metadata.
    reference_frame = FormatRegistry.deserialize(expected_bytes, csv_meta)
    fixture_bytes = fixture_path.read_bytes()
    loaded_frame = FormatRegistry.deserialize(fixture_bytes, csv_meta)

    assert loaded_frame.equals(reference_frame)

    # Serializing the loaded frame should reproduce the literal bytes.
    reserialized = FormatRegistry.serialize(loaded_frame, csv_meta)[0]
    assert expected_bytes == reserialized
Example #4
0
def test_formats_search_order():
    """Check the ordering of FormatRegistry.search() results for str objects."""

    def first_handler_named(format_name):
        # Top search hit for a str object whose metadata names the given format.
        named_meta = {'format': {'name': format_name}}
        return FormatRegistry.search(obj_type=str, meta=named_meta)[0]

    # str is nice and easy to work with here, and this test looks up three
    # handlers for it.
    utf8_handler = first_handler_named('utf-8')
    unicode_handler = first_handler_named('unicode')  # compat with older 'unicode'
    json_handler = first_handler_named('json')

    # All other things being equal, search() should return the
    # most-recently-registered format first. Register each handler in turn
    # (utf-8 twice) and check that it becomes the top result.
    registration_sequence = (
        utf8_handler,
        unicode_handler,
        json_handler,
        utf8_handler,
    )
    for handler in registration_sequence:
        handler.register()
        assert FormatRegistry.search(obj_type=str)[0] is handler

    # search() should not *exclude* by extension, because users could name
    # files "data.monday" or anything else. It should *prioritize* by
    # extension, though.
    #
    # utf8_handler was registered last above, so it would otherwise be first.
    ranked = FormatRegistry.search(obj_type=str, ext='.json')
    assert utf8_handler in ranked
    assert unicode_handler in ranked
    assert ranked[0] is json_handler
Example #5
0
def test_formats_search_fail_notfound():
    """A search that matches nothing must raise QuiltException."""

    class Unregistered:
        """Throwaway type defined only inside this test."""

    # Three queries, each expected to match no handler: an unknown object
    # type, empty metadata, and an unknown extension.
    failing_queries = (
        {'obj_type': Unregistered, 'meta': None, 'ext': None},
        {'obj_type': None, 'meta': {}, 'ext': None},
        {'obj_type': None, 'meta': None, 'ext': '.fizz'},
    )

    for query in failing_queries:
        with pytest.raises(QuiltException):
            FormatRegistry.search(**query)
Example #6
0
def test_formats_for_obj():
    """Look up handlers by object and check the numpy handler round-trips.

    Covers three things: the first handler for an ndarray advertises the
    'npz' extension and is also the first handler for the 'npy' extension;
    the handlers found for a str come back in the expected name order; and
    serializing then deserializing the array reproduces it.
    """
    # Use fixed values rather than np.ndarray(3): the bare constructor returns
    # uninitialised memory, so the test data would differ from run to run.
    # Shape (3,) and dtype float64 match what np.ndarray(3) produced.
    arr = np.array([0.0, 1.0, 2.0])

    fmt = FormatRegistry.for_obj(arr)[0]

    assert 'npz' in fmt.handled_extensions
    assert FormatRegistry.for_ext('npy')[0] is fmt

    expected_string_fmt_names = ['utf-8', 'unicode', 'json']
    found_string_fmt_names = [f.name for f in FormatRegistry.for_obj('blah')]
    assert found_string_fmt_names == expected_string_fmt_names

    bytes_obj = fmt.serialize(arr)[0]
    np.testing.assert_array_equal(fmt.deserialize(bytes_obj), arr)
Example #7
0
def test_buggy_parquet():
    """
    Check that Quilt does not crash on bad Pandas metadata written by
    old pyarrow libraries.
    """
    fixture = pathlib.Path(__file__).parent / 'data' / 'buggy_parquet.parquet'
    for handler in FormatRegistry.for_format('parquet'):
        # The fixture is re-read for every handler. The result is not
        # inspected; the call only has to return without raising.
        handler.deserialize(fixture.read_bytes())
Example #8
0
def test_formats_for_ext():
    """The first handler for the 'json' extension converts dicts to and from JSON bytes."""
    json_handler = FormatRegistry.for_ext('json')[0]

    encoded = json_handler.serialize({'blah': 'blah'})[0]
    assert encoded == b'{"blah": "blah"}'

    decoded = json_handler.deserialize(b'{"meow": "mix"}')
    assert decoded == {'meow': 'mix'}
Example #9
0
def test_formats_csv_roundtrip():
    """Round-trip the same CSV bytes under each supported option combination.

    For every option set the bytes are deserialized to a frame, serialized
    back, and deserialized again. The re-serialized bytes must equal the
    input and both frames must be equal.
    """
    test_data = b'9,2,5\n7,2,6\n1,0,1\n'

    option_sets = [
        None,                                      # roundtrip defaults (no 'opts' key)
        {'use_header': True},                      # first row as header
        {'use_index': True},                       # first column as index
        {'use_index': True, 'use_header': True},   # header and index together
    ]

    for opts in option_sets:
        # Build fresh metadata per case so nothing leaks between option sets.
        meta = {'format': {'name': 'csv'}}
        if opts is not None:
            meta['format']['opts'] = opts

        df1 = FormatRegistry.deserialize(test_data, meta)
        serialized, format_meta = FormatRegistry.serialize(df1, meta)
        meta.update(format_meta)
        df2 = FormatRegistry.deserialize(serialized, meta)

        # The messages name the option set, because a bare assert inside a
        # loop would not say which case failed.
        assert test_data == serialized, 'bytes changed for opts=%r' % (opts,)
        assert df1.equals(df2), 'frames differ for opts=%r' % (opts,)