# NOTE: these snippets were collected from several different modules; they
# assume the usual fastparquet test-suite namespace, roughly:
#   import io, os, pytest
#   import numpy as np, pandas as pd, pandas.util.testing as tm
#   from fastparquet import ParquetFile, write, writer, parquet_thrift
# plus helpers such as join_path, measure, tmpdir and TEST_DATA from the
# surrounding projects.


def test_bad_file_paths(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.=.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y/z')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out

    path1 = os.path.join(tempdir, 'data')
    fn1 = os.path.join(path1, 'out.parq')
    os.makedirs(path1)
    write(fn1, df)
    path2 = os.path.join(tempdir, 'data2')
    fn2 = os.path.join(path2, 'out.parq')
    os.makedirs(path2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2

def test_text_convert(tempdir):
    df = pd.DataFrame({"a": ["a"] * 100, "b": [b"a"] * 100})
    fn = os.path.join(tempdir, "tmp.parq")

    write(fn, df, fixed_text={"a": 1, "b": 2})
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[1].type_length == 1
    assert pf.schema[2].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[2].type_length == 2
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df)
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, fixed_text={"a": 1})
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = 'bytes'
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up
                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result

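# A minimal sketch of running the benchmark above (assumes the `tmpdir` and
# `measure` context managers from the surrounding benchmark module, and that
# `measure` records elapsed seconds into the passed-in dict under each label):
if __name__ == '__main__':
    for name, seconds in time_text().items():
        print('%s -> %.3fs' % (name, seconds))
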
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame({
        "a": np.random.choice(["a", "b", None], size=1000),
        "b": np.random.randint(0, 64000, size=1000),
        "c": np.random.choice([True, False], size=1000),
    })
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50],
                 partition_on=["a", "c"], file_scheme="hive")
    r = ParquetFile(tempdir)
    assert r.count == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({
        "a": [1, 2, 3, 0],
        "b": [1.0, 2.0, 3.0, np.nan],
        "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
        "d": ["a", "b", "c", None],
    })
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(TypeError):
        # TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf.schema[2:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    assert pf.schema[1].repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf.schema[1:3]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    assert pf.schema[4].repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]

def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])

def test_in_filter_numbers(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['values'])
    pf = ParquetFile(tempdir)
    # partition values are encoded in directory names as strings, but both
    # string and int filter values should match
    out = pf.to_pandas(filters=[('values', 'in', ['1', '4'])])
    assert set(out.symbols) == {'a', 'c'}
    out = pf.to_pandas(filters=[('values', 'in', [1, 4])])
    assert set(out.symbols) == {'a', 'c'}

def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame({
        'a': np.random.randint(10, size=N),
        'b': np.random.choice(['a', 'b', 'c'], size=N),
        'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)
    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()

def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)
    pf = ParquetFile(fn)
    # filtering acts on row-group statistics: with a single row group, a
    # partially-matching filter keeps the whole frame
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0

def test_single_upper_directory(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*/*'))))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2

    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3

def test_filelike(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])

    with open(fn, 'rb') as f:
        pf = ParquetFile(f, open_with=open)
        d2 = pf.to_pandas()
        pd.util.testing.assert_frame_equal(d2, df)

    b = io.BytesIO(open(fn, 'rb').read())
    pf = ParquetFile(b, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)

def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile

    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)

    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df

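# A minimal, self-contained sketch of driving _read_pf_simple above; the local
# fsspec filesystem, the temporary path and the column names are illustrative
# assumptions, not part of the original snippet:
def _demo_read_pf_simple(tmp_path):
    import fsspec
    fs = fsspec.filesystem('file')  # local files through the fsspec API
    path = os.path.join(str(tmp_path), 'part.0.parquet')
    write(path, pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']}))
    # no index columns, identity name mapping, plain (non-hive) scheme
    return _read_pf_simple(fs, path, str(tmp_path),
                           index_names=None, all_columns=['a', 'b'],
                           is_series=False, categories=None, cats={},
                           scheme='simple', storage_name_mapping={})
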
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]

def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]

def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame({
        "i32": np.arange(1000, dtype=np.int32),
        "i64": np.arange(1000, dtype=np.int64),
        "f": np.arange(1000, dtype=np.float64),
        "bhello": np.random.choice([b"hello", b"you", b"people"],
                                   size=1000).astype("O"),
    })
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)
    r = ParquetFile(fname)
    df = r.to_pandas()

    assert data.cat.dtype == "category"
    for col in r.columns:
        assert (df[col] == data[col]).all()

def test_roundtrip_complex(tempdir, scheme):
    import datetime
    data = pd.DataFrame({
        "ui32": np.arange(1000, dtype=np.uint32),
        "i16": np.arange(1000, dtype=np.int16),
        "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
        "f16": np.arange(1000, dtype=np.float16),
        "dicts": [{"oi": "you"}] * 1000,
        "t": [datetime.datetime.now()] * 1000,
        "td": [datetime.timedelta(seconds=1)] * 1000,
        "bool": np.random.choice([True, False], size=1000),
    })
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)
    r = ParquetFile(fname)
    df = r.to_pandas()

    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()

def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']

def test_to_pandas():
    fname = TEST_DATA + '/airlines_parquet/4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    pf = ParquetFile(fname)
    out = pf.to_pandas()
    assert len(out.columns) == 29
    # test for bad integer conversion
    assert (out.dep_time < 0).sum() == 0
    assert out.dep_time.dtype == 'float64'

def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

def test_filter_stats(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6, 7]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]

def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)
    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected,
                                       check_categorical=False)

def read_partition(cls, fs, piece, columns, index, categories=(), pf=None,
                   **kwargs):
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = paths_to_cats(fns, scheme)
        pf.fn = base
        if null_index_name and "__index_level_0__" in pf.columns:
            # See "Handling a None-labeled index" comment above
            index = ["__index_level_0__"]
            columns += index
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            if isinstance(pf[0], list):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            else:
                pf = ParquetFile(
                    pf[0], open_with=fs.open, sep=fs.sep,
                    **kwargs.get("file", {})
                )
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        if null_index_name:
            if "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
                pf.fmd.key_value_metadata = None
        else:
            pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index,
            **kwargs.get("read", {})
        )

def test_spark_date_empty_rg():
    # https://github.com/dask/fastparquet/issues/634
    # first file has header size much smaller than others as it contains
    # no row groups
    fn = os.path.join(TEST_DATA, 'spark-date-empty-rg.parq')
    pf = ParquetFile(fn)
    out = pf.to_pandas(columns=['Date'])
    assert out.Date.tolist() == [pd.Timestamp("2020-1-1"),
                                 pd.Timestamp("2020-1-2")]

def test_null_time(tempdir):
    """Test reading a file that contains null records."""
    tmp = str(tempdir)
    expected = pd.DataFrame({"t": [np.timedelta64(), np.timedelta64("NaT")]})
    fn = os.path.join(tmp, "test-time-null.parquet")

    # with NaT
    write(fn, expected, has_nulls=False)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data["t"] == expected["t"])[~expected["t"].isnull()].all()
    assert sum(data["t"].isnull()) == sum(expected["t"].isnull())

    # with NULL
    write(fn, expected, has_nulls=True)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data["t"] == expected["t"])[~expected["t"].isnull()].all()
    assert sum(data["t"].isnull()) == sum(expected["t"].isnull())

def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']

def test_filter_dates(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'date': [
            '2015-05-09', '2017-05-15', '2017-05-14',
            '2017-05-13', '2015-05-10', '2015-05-11', '2017-05-12'
        ]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)

    out_1 = pf.to_pandas(filters=[('date', '>', '2017-01-01')])
    assert set(out_1.x.tolist()) == {2, 3, 4, 7}
    expected_dates = set(pd.to_datetime(
        ['2017-05-15', '2017-05-14', '2017-05-13', '2017-05-12']))
    assert set(out_1.date.tolist()) == expected_dates

    out_2 = pf.to_pandas(filters=[('date', '==', pd.to_datetime('may 9 2015'))])
    assert out_2.x.tolist() == [1]
    assert out_2.date.tolist() == pd.to_datetime(['2015-05-09']).tolist()

def test_multi_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3]])
    idx.names = ['index0', 'index1']
    df = pd.DataFrame(list(range(9)), idx, ['col'])
    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'col', 'index0', 'index1'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == idx.names

    out = pf.to_pandas()
    assert out.index.names == idx.names
    pd.util.testing.assert_frame_equal(df, out)

    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(9)).all()
    assert len(out.columns) == 3

def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)

def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    import glob
    flist = glob.glob(os.path.join(tempdir, '*'))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)

def parquet_heatmap():
    pf = ParquetFile(os.path.join(project_dir, 'data', 'interim', 'data.parq'))
    df = pf.to_pandas(
        filters=[('user', '==', 194), ('modality', '==', 'cpm')]
    ).set_index('date')  # .drop(['modality', 'user'], axis=1)
    print(df.shape)
    data = DataLoader.convert_to_npy(df, save=False)
    p = sns.heatmap(np.nan_to_num(data[:, :, 0]))
    plt.show(p)

def read_local_parquet(file_name):
    """
    Read a local parquet file and print its columns and contents.

    :param file_name: path to the parquet file
    :return:
    """
    pf = ParquetFile(file_name)
    print(pf.columns)
    print(len(pf.columns))
    print(pf.to_pandas())

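# Example invocation (the path is a placeholder, not from the original):
# read_local_parquet('data/example.parquet')
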
def test_datetime_partition_names(tempdir):
    dates = pd.to_datetime(
        ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09'])
    df = pd.DataFrame({'date': dates, 'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(dates.tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]

def test_empty_dataframe(tempdir):
    df = pd.DataFrame({'a': [], 'b': []}, dtype=int)
    fn = os.path.join(str(tempdir), 'test.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    assert pf.count() == 0
    assert len(out) == 0
    assert (out.columns == df.columns).all()
    assert pf.statistics

def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world", None],
                                               size=1000)})
    data["cat"] = data["o"].astype("category")
    writer.write(fname, data, has_nulls=["o", "cat"])
    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()

def test_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]},
                      index=pd.Index([10, 20, 30], name='z'))
    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'x', 'y', 'z'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == ['z']

    out = pf.to_pandas()
    assert out.index.name == 'z'
    pd.testing.assert_frame_equal(df, out, check_dtype=False)

    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(3)).all()
    assert (out.z == df.index).all()

def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({"x": pd.Categorical.from_codes(codes, cats), "y": 1})
    fn = os.path.join(tmp, "test.parq")

    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={"x": n})
    tm.assert_frame_equal(df, out, check_categorical=False)

    df.set_index("x", inplace=True)
    write(fn, df, has_nulls=False, write_index=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={"x": n}, index="x")
    assert (out.index == df.index).all()
    assert (out.y == df.y).all()

def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({'x': pd.Categorical.from_codes(codes, cats), 'y': 1})
    fn = os.path.join(tmp, "test.parq")

    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={'x': n})
    tm.assert_frame_equal(df, out, check_categorical=False, check_dtype=False)

    df.set_index('x', inplace=True)
    write(fn, df, has_nulls=False, write_index=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={'x': n}, index='x')
    assert (out.index == df.index).all()
    assert (out.y == df.y).all()

def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0], 'b': ['a', 'a', 'b', 'b']})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)
    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), expected, check_categorical=False, check_dtype=False)

def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)

def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({'x': pd.Categorical.from_codes(codes, cats)})
    fn = os.path.join(tmp, "test.parq")
    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories=['x'])
    tm.assert_frame_equal(df, out)

def test_groups_iterable(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['aaa', 'bbb', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')
    r = ParquetFile(tempdir)
    assert r.columns == ['b', 'c']
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_directory_local(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'
    write(os.path.join(tempdir, 'foo1.parquet'), df)
    write(os.path.join(tempdir, 'foo2.parquet'), df)
    pf = ParquetFile(tempdir)
    assert pf.info['rows'] == 8
    assert pf.to_pandas()['z'].tolist() == ['a', 'b', 'c', 'd'] * 2

def test_no_index_name(tempdir):
    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]}).set_index('__index_level_0__')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(index='__index_level_0__', columns=['b'])
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == [0, 1, 2]

def get_epa_by_date(start_date, end_date, hourly=True):
    """
    Gets EPA data one whole day at a time.

    Args:
        start_date, end_date (str): format "YYYY/MM/DD", although pandas is
            pretty smart about picking that stuff up
        hourly (boolean, default True): whether only values on the hour
            (or interpolated values in between hours) are returned
    """
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    df_list = []

    # get files from s3
    try:
        for one_day in date_range:
            filename = 'epa_' + one_day.strftime("%Y%m%d")
            folder = 'EpaDaily'
            s3 = s3fs.S3FileSystem()
            myopen = s3.open
            s3_resource = boto3.resource('s3')
            # raises if the object does not exist
            s3_resource.Object('midscapstone-whos-polluting-my-air',
                               '{}/{}.parquet'.format(folder, filename)).load()
            pf = ParquetFile(
                'midscapstone-whos-polluting-my-air/{}/{}.parquet'.format(
                    folder, filename), open_with=myopen)
            df = pf.to_pandas()
            df.reset_index(inplace=True, drop=True)
            if hourly:
                hourly_filter = np.where(df.created % 100 == 0, True, False)
                df_list.append(df[hourly_filter])
            else:
                df_list.append(df)
    except Exception as e:
        print(f"Processing {folder}/{filename} failed")
        print(e)

    all_df = pd.concat(df_list, ignore_index=True) \
        .assign(
            ts_=lambda da: da['created'].map(parse_date),
            site_id=lambda da: da.apply(
                lambda l: str(l['ts_']) + "_" + format_name(l['site_name']),
                axis=1)
        ) \
        .set_index("site_id", drop=True)['epa_pm25_value']

    # create lookup dictionary based on site id and value
    lookup = {}
    for site_id, val in all_df.iteritems():
        lookup[site_id] = val
    return lookup

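# Illustrative call of the helper above (the dates are placeholders; running
# it requires AWS credentials with read access to the bucket referenced in
# the function body):
#
#   lookup = get_epa_by_date('2019/01/01', '2019/01/03', hourly=True)
#   some_value = next(iter(lookup.values()), None)
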
def __init__(self, pq_file_dir, output, mimic_notes):
    self.pq_file_dir = pq_file_dir
    self.output = output
    self.mimic_notes_file = mimic_notes
    pf = ParquetFile(self.mimic_notes_file)
    self.notes = pf.to_pandas()
    self.preds = self.get_df_from_pq(self.pq_file_dir, 'predicates')
    self.mentions = self.get_df_from_pq(self.pq_file_dir, 'mentions')
    self.umls = self.get_df_from_pq(self.pq_file_dir, 'umls_concepts')
    self.sents = self.get_df_from_pq(self.pq_file_dir, 'sentences')
    print("Finished loading data...")

def test_append_empty(tempdir, scheme):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0], 'b': ['a', 'a', 'b', 'b']})
    write(fn, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    assert pf.count() == 0
    assert pf.file_scheme == 'empty'

    write(fn, df, append=True, write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)

def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({
        'a': [1, 2, 3, 0],
        'aa': [1, 2, 3, None],
        'b': [1., 2., 3., np.nan],
        'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
        'd': ['a', 'b', 'c', None],
        'f': [True, False, True, True],
        'ff': [True, False, None, True]
    })
    df['e'] = df['d'].astype('category')
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises((TypeError, AttributeError)):
        # TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    cols = list(set(df) - {'ff'})
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in ['d', 'ff']:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])

def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame({
        'a': np.random.randint(10, size=N),
        'b': np.random.choice(['a', 'b', 'c'], size=N),
        'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])

def test_only_partition_columns(tempdir):
    df = pd.DataFrame({'a': np.random.rand(20),
                       'b': np.random.choice(['hi', 'ho'], size=20),
                       'c': np.random.choice(['a', 'b'], size=20)})
    write(tempdir, df, file_scheme='hive', partition_on=['b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    write(tempdir, df, file_scheme='hive', partition_on=['a', 'b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['a', 'b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b', 'c'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    with pytest.raises(ValueError):
        # because this leaves no data to write
        write(tempdir, df[['b']], file_scheme='hive', partition_on=['b'])

def test_pickle(tempdir):
    import pickle
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    pf2 = pickle.loads(pickle.dumps(pf))
    assert pf.to_pandas().equals(pf2.to_pandas())

def test_empty_groupby(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])
    r = ParquetFile(tempdir)

    assert r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 6
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)