Example #1
0
def test_merge(tempdir, dirs, row_groups):
    """Merge two parquet files, passed first as paths and then as ParquetFile objects.

    Each input file is written below ``tempdir`` inside the matching entry of
    ``dirs``, using ``row_groups`` as the row-group offsets.  Both merge calls
    must report twice as many row groups as offsets were given, return the
    rows of the first file followed by those of the second, and list ``cat``
    in ``pf.cats`` whenever ``"cat=1"`` is one of ``dirs``.
    """
    base = str(tempdir)
    inputs = (
        (dirs[0], 'out0.parq', [1, 2, 3, 4]),
        (dirs[1], 'out1.parq', [5, 6, 7, 8]),
    )

    paths = []
    for subdir, filename, values in inputs:
        default_mkdirs(os.path.join(base, subdir))
        target = os.sep.join([base, subdir, filename])
        write(target, pd.DataFrame({'a': values}), row_group_offsets=row_groups)
        paths.append(target)

    def check_merged(merged):
        # Shared expectations for both ways of calling merge.
        assert len(merged.row_groups) == 2 * len(row_groups)
        rows = merged.to_pandas().a.tolist()
        assert rows == [1, 2, 3, 4, 5, 6, 7, 8]
        if "cat=1" in dirs:
            assert 'cat' in merged.cats

    # with file-names
    check_merged(writer.merge(paths))

    # with instances (opened only after the file-name merge has run)
    check_merged(writer.merge([ParquetFile(path) for path in paths]))
def test_merge(tempdir, dirs, row_groups):
    """Check ``writer.merge`` on two files given by name and as ``ParquetFile``.

    The two inputs live under ``tempdir`` in the sub-directories named by
    ``dirs`` and are written with ``row_groups`` as row-group offsets.  For
    either input form the merged dataset must have ``2 * len(row_groups)``
    row groups, contain the values 1..8 in order, and include ``cat`` in
    ``pf.cats`` when ``"cat=1"`` appears in ``dirs``.
    """
    root = str(tempdir)

    def write_part(subdir, name, values):
        # Ensure the target directory exists, then write a single-column file.
        os.makedirs(os.path.join(root, subdir), exist_ok=True)
        path = os.sep.join([root, subdir, name])
        write(path, pd.DataFrame({"a": values}), row_group_offsets=row_groups)
        return path

    first = write_part(dirs[0], "out0.parq", [1, 2, 3, 4])
    second = write_part(dirs[1], "out1.parq", [5, 6, 7, 8])

    expected_group_count = 2 * len(row_groups)
    expected_rows = [1, 2, 3, 4, 5, 6, 7, 8]

    # Inputs are built lazily so the ParquetFile objects are only opened
    # after the file-name based merge has completed.
    input_builders = (
        lambda: [first, second],  # with file-names
        lambda: [ParquetFile(first), ParquetFile(second)],  # with instances
    )
    for build_inputs in input_builders:
        merged = writer.merge(build_inputs())
        assert len(merged.row_groups) == expected_group_count
        assert merged.to_pandas().a.tolist() == expected_rows
        if "cat=1" in dirs:
            assert "cat" in merged.cats
Example #3
0
def test_merge_fail(tempdir):
    """Merging an integer-column file with a string-column file must fail.

    Two files sharing the column name ``a`` but holding different value types
    are written into ``tempdir``; ``writer.merge`` is expected to raise a
    ``ValueError`` whose message mentions ``schemas``.
    """
    base = str(tempdir)
    columns_by_file = (
        ('out0.parq', [1, 2, 3, 4]),
        ('out1.parq', ['a', 'b', 'c']),
    )

    paths = []
    for filename, column in columns_by_file:
        path = os.sep.join([base, filename])
        write(path, pd.DataFrame({'a': column}))
        paths.append(path)

    with pytest.raises(ValueError) as excinfo:
        writer.merge(paths)
    message = str(excinfo.value)
    assert 'schemas' in message
def test_merge_fail(tempdir):
    """Check the two error paths of ``writer.merge`` exercised here.

    1. Two files with column ``a`` holding integers in one and strings in the
       other must raise ``ValueError`` mentioning ``schemas``.
    2. After the second path is rewritten with ``file_scheme="hive"``, merging
       must raise ``ValueError`` mentioning ``multi-file``.
    """
    fn = str(tempdir)

    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, "out0.parq"])
    write(fn0, df0)

    df1 = pd.DataFrame({"a": ["a", "b", "c"]})
    fn1 = os.sep.join([fn, "out1.parq"])
    write(fn1, df1)

    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    # Inspect the raised exception itself: ``e`` is pytest's ExceptionInfo
    # wrapper, and str() of the wrapper is not the exception message.
    assert "schemas" in str(e.value)

    # Replace the second input with a hive-scheme dataset at the same path.
    os.remove(fn1)
    write(fn1, df0, file_scheme="hive")
    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert "multi-file" in str(e.value)
Example #5
0
def test_merge_s3(tempdir, s3):
    """Merge two parquet files that are written and read through ``s3.open``.

    Both inputs are written below ``TEST_DATA`` using the ``s3`` fixture's
    ``open`` as ``open_with``.  The merged result must have two row groups and
    return the rows of the first file followed by those of the second.

    ``tempdir`` is requested as a fixture but is not used by the test body.
    """
    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = TEST_DATA + '/out0.parq'
    write(fn0, df0, open_with=s3.open)

    df1 = pd.DataFrame({'a': [5, 6, 7, 8]})
    fn1 = TEST_DATA + '/out1.parq'
    write(fn1, df1, open_with=s3.open)

    # with file-names
    pf = writer.merge([fn0, fn1], open_with=s3.open)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
def test_merge_s3(tempdir, s3):
    """Merge two parquet files that are written and read through ``s3.open``.

    Both inputs are written below ``TEST_DATA`` using the ``s3`` fixture's
    ``open`` as ``open_with``.  The merged result must have two row groups and
    return the rows of the first file followed by those of the second.

    ``tempdir`` is requested as a fixture but is not used by the test body.
    """
    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = TEST_DATA + "/out0.parq"
    write(fn0, df0, open_with=s3.open)

    df1 = pd.DataFrame({"a": [5, 6, 7, 8]})
    fn1 = TEST_DATA + "/out1.parq"
    write(fn1, df1, open_with=s3.open)

    # with file-names
    pf = writer.merge([fn0, fn1], open_with=s3.open)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]