def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(
        table, str(tempdir / "case2"),
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])
def test_schema_from_pandas():
    import pandas as pd
    inputs = [
        list(range(10)),
        pd.Categorical(list(range(10))),
        ['foo', 'bar', None, 'baz', 'qux'],
        np.array([
            '2007-07-13T01:23:34.123456789',
            '2006-01-13T12:34:56.432539784',
            '2010-08-13T05:46:57.437699912'
        ], dtype='datetime64[ns]'),
    ]
    if Version(pd.__version__) >= Version('1.0.0'):
        inputs.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))

    for data in inputs:
        df = pd.DataFrame({'a': data})
        schema = pa.Schema.from_pandas(df)
        expected = pa.Table.from_pandas(df).schema
        assert schema == expected
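For reference, a minimal standalone sketch of the behavior this test exercises: `pa.Schema.from_pandas` infers an Arrow schema directly from a DataFrame, and the result should match the schema of a fully converted table. The example data here is illustrative only.

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Infer an Arrow schema straight from the DataFrame ...
schema = pa.Schema.from_pandas(df)

# ... and compare it with the schema of the converted table.
assert schema == pa.Table.from_pandas(df).schema
print(schema)
```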
def py_fsspec_s3fs(request, s3_connection, s3_server):
    s3fs = pytest.importorskip("s3fs")
    if (sys.version_info < (3, 7) and
            Version(s3fs.__version__) >= Version("0.5")):
        pytest.skip("s3fs>=0.5 version is async and requires Python >= 3.7")

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'

    fs = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs=dict(endpoint_url='http://{}:{}'.format(host, port)))
    fs = PyFileSystem(FSSpecHandler(fs))
    fs.create_dir(bucket)

    yield dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_move_dir=False,
        allow_append_to_file=True,
    )
    fs.delete_dir(bucket)
def _check_pandas_version():
    if _pandas_api.loose_version < Version('0.17.0'):
        raise ImportError("feather requires pandas >= 0.17.0")
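For context, a hedged sketch of how a caller might apply this kind of version guard before converting a DataFrame for Feather. The helper name `write_frame_checked` is hypothetical; only `pyarrow.feather.write_feather` is the real API, and the guard here re-implements the check with `packaging.version` rather than the internal `_pandas_api` helper.

```python
from packaging.version import Version

import pandas as pd
import pyarrow.feather as feather


def write_frame_checked(df, dest):
    # Hypothetical caller: fail fast on an unsupported pandas version
    # before attempting any conversion, mirroring _check_pandas_version().
    if Version(pd.__version__) < Version('0.17.0'):
        raise ImportError("feather requires pandas >= 0.17.0")
    feather.write_feather(df, dest)


write_frame_checked(pd.DataFrame({'a': [1, 2, 3]}), 'example.feather')
```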