def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    # Build a frame whose "col" column uses the nullable Int64 extension dtype.
    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    def _assert_roundtrip(path):
        # Read back and verify the extension column survived the roundtrip.
        roundtripped = pq.read_table(
            str(path), use_legacy_dataset=use_legacy_dataset
        ).to_pandas()
        tm.assert_frame_equal(roundtripped[["col"]], df[["col"]])

    # Case 1: partitioned dataset write.
    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    _assert_roundtrip(tempdir / "case1")

    # Case 2: unpartitioned dataset write.
    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    _assert_roundtrip(tempdir / "case2")

    # Case 3: plain single-file write.
    pq.write_table(table, str(tempdir / "data.parquet"))
    _assert_roundtrip(tempdir / "data.parquet")
# Example #2
def test_schema_from_pandas():
    """Schema.from_pandas must match the schema Table.from_pandas infers."""
    import pandas as pd
    candidates = [
        list(range(10)),
        pd.Categorical(list(range(10))),
        ['foo', 'bar', None, 'baz', 'qux'],
        np.array([
            '2007-07-13T01:23:34.123456789', '2006-01-13T12:34:56.432539784',
            '2010-08-13T05:46:57.437699912'
        ],
                 dtype='datetime64[ns]'),
    ]
    # Nullable integer extension arrays only exist from pandas 1.0 onwards.
    if Version(pd.__version__) >= Version('1.0.0'):
        candidates.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))
    for column in candidates:
        frame = pd.DataFrame({'a': column})
        inferred = pa.Schema.from_pandas(frame)
        assert inferred == pa.Table.from_pandas(frame).schema
# Example #3
def py_fsspec_s3fs(request, s3_connection, s3_server):
    """Yield a test-harness dict for an fsspec-backed S3 filesystem.

    Creates a bucket before yielding and deletes it afterwards.
    """
    s3fs = pytest.importorskip("s3fs")
    # s3fs 0.5+ is asyncio-based and therefore needs Python 3.7 or later.
    if (sys.version_info < (3, 7)
            and Version(s3fs.__version__) >= Version("0.5")):
        pytest.skip("s3fs>=0.5 version is async and requires Python >= 3.7")

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'

    endpoint = 'http://{}:{}'.format(host, port)
    raw_fs = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs=dict(endpoint_url=endpoint))
    # Wrap the fsspec filesystem in the pyarrow handler interface.
    fs = PyFileSystem(FSSpecHandler(raw_fs))
    fs.create_dir(bucket)

    yield dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_move_dir=False,
        allow_append_to_file=True,
    )
    # Teardown: remove the bucket created above.
    fs.delete_dir(bucket)
# Example #4
def _check_pandas_version():
    """Raise ImportError unless the installed pandas is at least 0.17.0."""
    minimum = Version('0.17.0')
    if _pandas_api.loose_version < minimum:
        raise ImportError("feather requires pandas >= 0.17.0")