def test_read_multiple_parquet_files(self):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)
        paths.append(path)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N // K,
                   compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)
    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
def test_pandas_parquet_1_0_roundtrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
def _write_table(table, path, **kwargs):
    import pyarrow.parquet as pq

    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table
def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                       schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_2_0_roundtrip(tmpdir):
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
def read_parquet(fn):
    """Read a Parquet file with pyarrow, keep the first three columns,
    and write them back out as 'example.parquet'."""
    print("Loading parquet file: %s..." % fn)
    tbl = pq.read_table(fn)
    df = tbl.to_pandas()
    d = df.iloc[:, 0:3]
    table = pa.Table.from_pandas(d)
    pq.write_table(table, 'example.parquet')
def test_read_no_metadata(tmpdir, engine):
    # use pyarrow.parquet to create a parquet file without
    # pandas metadata
    pa = pytest.importorskip("pyarrow")
    import pyarrow.parquet as pq

    tmp = str(tmpdir) + "table.parq"

    table = pa.Table.from_arrays([pa.array([1, 2, 3]),
                                  pa.array([3, 4, 5])],
                                 names=['A', 'B'])
    pq.write_table(table, tmp)

    result = dd.read_parquet(tmp, engine=engine)
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    assert_eq(result, expected)
def parquet(tmpdir, data):
    pa = pytest.importorskip('pyarrow')
    import pyarrow.parquet as pq  # noqa: E402
    from ibis.file.parquet import ParquetClient

    # create single files
    d = tmpdir.mkdir('pq')

    for k, v in data.items():
        f = d / '{}.parquet'.format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    return ParquetClient(tmpdir).database()
def test_client(tmpdir, data):
    # construct with a path to a file
    d = tmpdir / 'pq'
    d.mkdir()

    for k, v in data.items():
        f = d / "{}.parquet".format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    c = ParquetClient(tmpdir)
    assert c.list_databases() == ['pq']
    assert c.database().pq.list_tables() == ['close', 'open']
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        data = [pa.from_pylist(list(map(dtype, range(5))))]
        table = pa.Table.from_arrays(data, names=('a', 'b'),
                                     name='table_name')
        pq.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)
        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1
            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.ParquetFile(buf).metadata

    buf.seek(0)
    fileh = pq.ParquetFile(buf, metadata=metadata)

    pdt.assert_frame_equal(df, fileh.read().to_pandas())
def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        arrow_table = pa.Table.from_pandas(df)
        target_path = data_directory / '{}.parquet'.format(table)
        pq.write_table(arrow_table, str(target_path))
def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = test_parquet._test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, path)

    result = pq.read_table(path).to_pandas()

    pdt.assert_frame_equal(result, df)
def _write_partition_pyarrow(df, open_with, path, fs, filename, write_index,
                             partition_on, metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(t, path, partition_cols=partition_on,
                                 filesystem=fs)
    else:
        with open_with(filename, 'wb') as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {k: v for k, v in kwargs.items()
                           if k in _pyarrow_write_metadata_kwargs}
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
def json_to_parquet(data, output, schema):
    column_data = {}
    array_data = []

    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col

    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col),
                                                   type=pa.timestamp('ms')))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int64().id:
            array_data.append(pa.array([int(ele) for ele in _col], type=pa.int64()))
        else:
            array_data.append(pa.array(_col, type=column.type))

    data = pa.RecordBatch.from_arrays(array_data, schema.names)
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])

    pq.write_table(table, output, compression='SNAPPY', coerce_timestamps='ms')
def test_direct_read_dictionary(use_legacy_dataset):
    # ARROW-3325
    repeats = 10
    nunique = 5

    data = [
        [util.rands(10) for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0'],
                           use_legacy_dataset=use_legacy_dataset)

    # Compute dictionary-encoded subfield
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):
    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)

    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)

    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with open(filename, 'wb') as f:
        pq.write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(open(filename, 'rb').read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def _write_data(self,
                directory=None,
                schema=None,
                prefix=tempfile.template,
                row_group_size=1000,
                codec='none',
                count=None):
    if directory is None:
        directory = self.temp_dir

    with tempfile.NamedTemporaryFile(
            delete=False, dir=directory, prefix=prefix) as f:
        table = self._records_as_arrow(schema, count)
        pq.write_table(table, f,
                       row_group_size=row_group_size,
                       compression=codec,
                       use_deprecated_int96_timestamps=True)

        return f.name
def getdata(year: int):
    """
    A helper function to retrieve data and save it locally as parquet files.

    Args:
        year: Year which should be retrieved.

    Returns:
        None. Saves one parquet file per month on the local drive.
    """
    for month in range(1, 13):
        if month < 10:
            linkurl = f"http://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-0{month}.csv"
        else:
            linkurl = f"http://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-{month}.csv"

        df = pd.read_csv(linkurl, sep=',')
        table = pa.Table.from_pandas(df)
        pq.write_table(table, f"../avgdrive/nyc_yellow{year}-{month}.parquet")
def test_compat_old_rw_path(df_all_types, store):
    # strip down DF before some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
def test_multi_dataset_metadata(tempdir):
    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
    metapath = str(tempdir / "_metadata")

    # create a test dataset
    df = pd.DataFrame({
        'one': [1, 2, 3],
        'two': [-1, -2, -3],
        'three': [[1, 2], [2, 3], [3, 4]],
    })
    table = pa.Table.from_pandas(df)

    # write dataset twice and collect/merge metadata
    _meta = None
    for filename in filenames:
        meta = []
        pq.write_table(table, str(tempdir / filename),
                       metadata_collector=meta)
        meta[0].set_file_path(filename)
        if _meta is None:
            _meta = meta[0]
        else:
            _meta.append_row_groups(meta[0])

    # Write merged metadata-only file
    with open(metapath, "wb") as f:
        _meta.write_metadata_file(f)

    # Read back the metadata
    meta = pq.read_metadata(metapath)
    md = meta.to_dict()
    _md = _meta.to_dict()
    for key in _md:
        if key != 'serialized_size':
            assert _md[key] == md[key]

    assert _md['num_columns'] == 3
    assert _md['num_rows'] == 6
    assert _md['num_row_groups'] == 2
    assert _md['serialized_size'] == 0
    assert md['serialized_size'] > 0
def save_dataframe(self, dataframe):
    """
    Save a DataFrame to the store.
    """
    storepath = self.temporary_object_path(str(uuid.uuid4()))

    # switch parquet lib
    parqlib = self.get_parquet_lib()
    if isinstance(dataframe, pd.DataFrame):
        # parqlib is ParquetLib.ARROW:
        # other parquet libs are deprecated, remove?
        import pyarrow as pa
        from pyarrow import parquet
        table = pa.Table.from_pandas(dataframe)
        parquet.write_table(table, storepath)
    elif parqlib is ParquetLib.SPARK:
        from pyspark import sql as sparksql
        assert isinstance(dataframe, sparksql.DataFrame)
        dataframe.write.parquet(storepath)
    else:
        assert False, "Unimplemented ParquetLib %s" % parqlib

    # Move serialized DataFrame to object store
    if os.path.isdir(storepath):  # Pyspark
        hashes = []
        files = [ofile for ofile in os.listdir(storepath)
                 if ofile.endswith(".parquet")]
        for obj in files:
            path = os.path.join(storepath, obj)
            objhash = digest_file(path)
            move(path, self.object_path(objhash))
            hashes.append(objhash)
        rmtree(storepath)
    else:
        filehash = digest_file(storepath)
        move(storepath, self.object_path(filehash))
        hashes = [filehash]

    return hashes
def prepare():
    print("Download titles....")
    imdb_title = pd.read_csv(IMDB_TITLE_GZIP, sep='\t', dtype='str',
                             index_col='tconst', engine='c')
    imdb_title = imdb_title[imdb_title['titleType'] == 'movie']
    imdb_title = imdb_title.dropna(subset=['startYear', 'originalTitle'])

    print("Download ratings....")
    table = pa.Table.from_pandas(pd.merge(
        imdb_title,
        pd.read_csv(IMDB_RATING_GZIP, sep='\t', dtype='str',
                    index_col='tconst', engine='c'),
        how='left', left_index=True, right_index=True, sort=False),
        preserve_index=True)
    pq.write_table(table, IMDB_MOVIES_PARQUET, compression='gzip')

    print("Download actors....")
    imdb_actors = pd.read_csv(IMDB_ACTORS_GZIP, sep='\t', dtype='str',
                              index_col='tconst', engine='c')
    imdb_actors = imdb_actors[(imdb_actors["ordering"] == '1') &
                              ((imdb_actors["category"] == 'actor') |
                               (imdb_actors["category"] == 'actress'))]
    imdb_actors_names = pd.read_csv(IMDB_ACTORS_NAMES_GZIP, sep='\t', dtype='str',
                                    index_col='nconst', engine='c')
    imdb_actors_with_names = imdb_actors.merge(imdb_actors_names,
                                               right_index=True,
                                               left_on="nconst")
    imdb_actors_with_names = imdb_actors_with_names[["primaryName", "characters"]]
    pa_actors = pa.Table.from_pandas(imdb_actors_with_names)
    pq.write_table(pa_actors, IMDB_ACTORS_PARQUET, compression='gzip')

    print("Download covers....")
    table = pa.Table.from_pandas(pd.read_csv(IMDB_COVERS_CSV),
                                 preserve_index=False)
    pq.write_table(table, IMDB_COVERS_PARQUET, compression='gzip')
def main():
    f1 = '../data/e024b429-3fb1-4a6d-b4e6-23fe5eaadfc5'
    f2 = '../data/468cd686-0b96-4296-92ff-45f46c73b90e'
    fp1 = 'dataset1.parquet'
    fp2 = 'dataset2.parquet'
    fp3 = 'dataset3.parquet'

    ds1 = xr.open_dataset(f1)
    pq.write_table(pa.Table.from_pandas(ds1.to_dataframe()), fp1)

    ds2 = xr.open_dataset(f2)
    pq.write_table(pa.Table.from_pandas(ds2.to_dataframe()), fp2)

    ds = ds1.merge(ds2)
    ds.to_netcdf("dataset3.nc")

    # dask required TODO...
    # with xr.open_mfdataset('../data/*') as ds:
    #     print(ds.keys())

    df = ds.to_dataframe()
    table = pa.Table.from_pandas(df)
    print(table.to_pandas())
    pq.write_table(table, fp3)

    for f in [f1, fp1, f2, fp2, fp3]:
        print("{} {} MB".format(f, size_mb(os.path.getsize(f))))
def merge_non_audio_summaries(self):
    """
    combines and replaces all summaries per type except for audio summaries
    """
    smrs_dict = {}
    for smry in self.summaries:
        if smry.stype != SensorType.AUDIO:
            if smry.stype in smrs_dict.keys():
                smrs_dict[smry.stype].append(smry)
            else:
                smrs_dict[smry.stype] = [smry]
    self.summaries = self.get_audio()
    for styp, smrys in smrs_dict.items():
        first_summary = smrys.pop(0)
        tbl = first_summary.data()
        combined_mint = np.mean([smrs.smint_s for smrs in smrys])
        combined_std = np.mean([smrs.sstd_s for smrs in smrys])
        if not first_summary.check_data():
            os.makedirs(first_summary.fdir, exist_ok=True)
        for smrs in smrys:
            tbl = pa.concat_tables([tbl, smrs.data()])
            if not first_summary.check_data():
                os.remove(smrs.file_name())
        if first_summary.check_data():
            first_summary._data = tbl
        else:
            pq.write_table(tbl, first_summary.file_name())
        mnint = dtu.microseconds_to_seconds(
            float(np.mean(np.diff(tbl["timestamps"].to_numpy()))))
        stdint = dtu.microseconds_to_seconds(
            float(np.std(np.diff(tbl["timestamps"].to_numpy()))))
        if not combined_mint + combined_std > mnint > combined_mint - combined_std:
            self.errors.append(
                f"Mean interval s of combined {styp.name} sensor does not match the "
                f"compilation of individual mean interval s per packet. Will use compilation of "
                f"individual values.")
            mnint = combined_mint
            stdint = combined_std
        single_smry = PyarrowSummary(
            first_summary.name, styp, first_summary.start, 1 / mnint,
            first_summary.fdir, tbl.num_rows, mnint, stdint,
            first_summary.data() if first_summary.check_data() else None
        )
        self.summaries.append(single_smry)
def save_data(self, data, format='parquet', resolution='time', errors=False):
    """fn: to save data to directory

    # Args
        data : pd.DataFrame
        format : str, ('parquet', 'h5', 'csv', 'feather')
        resolution : str, date or time
            if date uses default str format,
            if time will use YYYY-MM-DD_HH.MM.SS
        errors : bool, if True change filepath name
            if False use options data filepath name
    """
    _dir = self._create_dir()

    if resolution == 'time':
        _timestamp = self.__create_timestamp_str()
    elif resolution == 'date':
        _timestamp = self.__create_date_str()

    if errors:
        _fp = _dir + f'yahoo_options_scraper_errors_{_timestamp}.{format}'
    else:
        _fp = _dir + f'yahoo_options_data_{_timestamp}.{format}'

    if format == 'parquet':
        _table = pa.Table.from_pandas(data)
        pq.write_table(_table, _fp)
    elif format == 'h5':
        data.to_hdf(_fp, key='data')
    elif format == 'csv':
        data.to_csv(_fp, index=False)
    elif format == 'feather':
        data.to_feather(_fp)
    return
def test_use_nullable_dtypes(self, engine):
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        # We are manually disabling fastparquet's
        # nullable dtype support pending discussion
        pytest.skip("Fastparquet nullable dtype support is disabled")

    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
            # Test that nullable dtypes used even in absence of nulls
            "e": pyarrow.array([1, 2, 3, 4], "int64"),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame(
        {
            "a": pd.array([1, 2, 3, None], dtype="Int64"),
            "b": pd.array([1, 2, 3, None], dtype="UInt8"),
            "c": pd.array(["a", "b", "c", None], dtype="string"),
            "d": pd.array([True, False, True, None], dtype="boolean"),
            "e": pd.array([1, 2, 3, 4], dtype="Int64"),
        }
    )
    if engine == "fastparquet":
        # Fastparquet doesn't support string columns yet
        # Only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
def combine_features(metadata_filename: str) -> None:
    """
    Combine feature files for multiple datasets into a single feature file.

    If the combined feature file already exists it will _not_ be recreated.

    Parameters
    ----------
    metadata_filename : str
        Features for all datasets included in the metadata will be combined.
        Should be a Parquet file.
    """
    feat_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'feature')
    feat_filename = os.path.join(
        feat_dir,
        os.path.splitext(os.path.basename(metadata_filename))[0].replace(
            'metadata', 'feature'))
    if (os.path.isfile(f'{feat_filename}.npz') and
            os.path.isfile(f'{feat_filename}.parquet')):
        return
    datasets = pd.read_parquet(metadata_filename,
                               columns=['dataset'])['dataset'].unique()
    logger.info('Combine features for metadata file %s containing %d datasets',
                metadata_filename, len(datasets))
    encodings, indexes = [], []
    for i, dataset in enumerate(datasets, 1):
        logger.debug('Append dataset %s [%3d/%3d]', dataset, i, len(datasets))
        dataset_encodings_filename = os.path.join(feat_dir, 'dataset',
                                                  f'{dataset}.npz')
        dataset_index_filename = os.path.join(feat_dir, 'dataset',
                                              f'{dataset}.parquet')
        if (not os.path.isfile(dataset_encodings_filename) or
                not os.path.isfile(dataset_index_filename)):
            logger.warning('Missing features for dataset %s, skipping...',
                           dataset)
        else:
            encodings.append(ss.load_npz(dataset_encodings_filename))
            indexes.append(pq.read_table(dataset_index_filename))
    ss.save_npz(f'{feat_filename}.npz', ss.vstack(encodings, 'csr'))
    pq.write_table(pa.concat_tables(indexes), f'{feat_filename}.parquet')
def stream_csv(self, in_io):
    parsed_rows = 0
    out = io.StringIO()
    out_parquet = io.BytesIO()
    header_rows = self.header_fields.keys()
    df_data = list()

    writer = csv.writer(out, delimiter=',')
    writer.writerow(header_rows)

    lines = in_io.decode('utf-8').split('\n')
    logging.info("got {} lines to parse".format(len(lines)))
    for line_num, line in enumerate(lines):
        if not self.is_valid_format(line, line_num):
            continue
        result, uuid = self.json_to_csv(self.extract_json(line, line_num))
        if uuid in self.uuids:
            continue
        self.uuids.add(uuid)
        writer.writerow(result)
        df_data.append(result)
        parsed_rows += 1

    df = pd.DataFrame(df_data, columns=header_rows)
    # Pyarrow tries to infer types by default.
    # Explicitly set the types to prevent mis-typing.
    df = self.apply_df_types(df)

    # Convert pandas.DataFrame -> pyarrow.Table (Parquet)
    table = pa.Table.from_pandas(df)
    # Write parquet table.
    pq.write_table(table, out_parquet, compression='snappy')

    # Reset all FP's
    out_parquet.seek(0)
    out.seek(0)
    total_rows = len(lines)
    return parsed_rows, total_rows, out, out_parquet
def test_write_metadata(tempdir):
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    parquet_meta = pq.read_metadata(path)
    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
    assert schema_as_arrow.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if schema_as_arrow.metadata:
        assert b'ARROW:schema' not in schema_as_arrow.metadata

    # pass through writer keyword arguments
    for version in ["1.0", "2.0", "2.4", "2.6"]:
        pq.write_metadata(schema, path, version=version)
        parquet_meta = pq.read_metadata(path)
        # The version is stored as a single integer in the Parquet metadata,
        # so it cannot correctly express dotted format versions
        expected_version = "1.0" if version == "1.0" else "2.6"
        assert parquet_meta.format_version == expected_version

    # metadata_collector: list of FileMetaData objects
    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
    pq.write_table(table, tempdir / "data.parquet")
    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
    pq.write_metadata(
        schema, path, metadata_collector=[parquet_meta, parquet_meta]
    )
    parquet_meta_mult = pq.read_metadata(path)
    assert parquet_meta_mult.num_row_groups == 2

    # append metadata with different schema raises an error
    with pytest.raises(RuntimeError, match="requires equal schemas"):
        pq.write_metadata(
            pa.schema([("a", "int32"), ("b", "null")]),
            path, metadata_collector=[parquet_meta, parquet_meta]
        )
def apply(df, path, parameters=None):
    """
    Exports a dataframe to a Parquet file

    Parameters
    ------------
    df
        Dataframe
    path
        Path
    parameters
        Possible parameters of the algorithm
    """
    if parameters is None:
        parameters = {}

    compression = parameters["compression"] if "compression" in parameters else "snappy"

    df.columns = [x.replace(":", "AAA") for x in df.columns]
    df = pa.Table.from_pandas(df)
    pq.write_table(df, path, compression=compression)
def write_pandas(
    self,
    df: pd.DataFrame,
    compression="snappy",
    num_chunks: int = None,
    chunk_size: int = None,
    schema: pyarrow.Schema = None,
):
    """Write DataFrame as Parquet Dataset"""
    # Check arguments
    if not isinstance(self.path_or_paths, str):
        msg = f"Cannot write table to {self.path_or_paths} (expected string)"
        raise TypeError(msg)
    if num_chunks is not None and chunk_size is not None:
        msg = "Both num_chunks and chunk_size are given, not allowed"
        raise ValueError(msg)
    if chunk_size is not None:
        num_chunks = max(len(df) // chunk_size, 1)

    # Write DataFrame to parquet
    if num_chunks is None:
        table = pyarrow.Table.from_pandas(df, schema=schema, preserve_index=False)
        self.write(table, compression=compression)
    else:
        Path(self.path_or_paths).mkdir(parents=True, exist_ok=True,
                                       filesystem=self.filesystem)
        chunks = np.array_split(df, num_chunks)
        for idx, chunk in enumerate(chunks):
            filename = f"part-{idx:05d}.parquet.{compression}"
            chunk_path = Path(self.path_or_paths, filename)
            LOGGER.info(f"Writing chunk:{idx} to {chunk_path}")
            with chunk_path.open("wb", filesystem=self.filesystem) as file:
                table = pyarrow.Table.from_pandas(chunk, schema=schema,
                                                  preserve_index=False)
                pq.write_table(table, file, compression=compression)
def local_to_parquet():
    files = [f for f in listdir(path) if isfile(join(path, f))]
    for file in files:
        df = pd.read_csv(path + '\\' + file)
        # only keep rows with correct formatting
        df = df[df['text_type'].isin(['post', 'comment'])]
        if len(df.columns) > 7:
            print("wait")
            # drop columns created due to wrong formatting; TBD with parquet
            df.drop(df.iloc[:, 7:], inplace=True, axis=1)
        df = df.fillna('')  # change NaNs to empty string
        df['tickers'] = df['tickers'].apply(tickers_to_list)
        df['post_id'] = df['post_id'].astype(str)
        df['comment_id'] = df['comment_id'].astype(str)
        df['subreddit'] = df['subreddit'].astype(str)
        df['text_type'] = df['text_type'].astype(str)
        filename = file.replace(".csv", '')
        path_tsv = path_parquet + filename + ".parquet"
        print(df.head())
        table_pa = pa.Table.from_pandas(df)
        pq.write_table(table_pa, path_tsv)
def load_raw():
    # note manually removed some bad row
    kwargs = get_pandas_read_csv_defaults()
    kwargs['thousands'] = ','  # always do this
    kwargs['parse_dates'] = ['Date']
    kwargs['na_values'] = ['-']
    kwargs['dtype'] = 'str'
    dtype = {
        'Close': 'float',
        'High': 'float',
        'Low': 'float',
        'Market Cap': 'float',
        'Open': 'float',
        'Volume': 'float'
    }
    meta = pd.read_csv(os.path.join(_mydir, 'Top100Cryptos/data/100 List.csv'))
    names = meta.Name.tolist()
    files = [os.path.join(_mydir, 'Top100Cryptos/data/{}.csv'.format(x))
             for x in names]
    # files = glob.glob(os.path.join(_mydir, 'Top100Cryptos/data/*.csv'))
    dfs = list()
    datadir = os.path.join(_mydir, 'parsed')
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    for i, (name, f) in enumerate(zip(names, files)):
        mtime = os.path.getmtime(f)
        dirname = os.path.join(datadir, 'name={}/mtime={}'.format(name, mtime))
        filename = os.path.join(dirname, 'data.parquet')
        if not os.path.exists(filename):
            df = pd.read_csv(f, **kwargs)
            df = pa.Table.from_pandas(df)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            print('writing {}'.format(filename))
            pq.write_table(df, filename)
            pq.read_table('./parsed')  # test
        else:
            print('{} exists'.format(filename))
    return pq.read_table('./parsed')  # test
def write_parquet(self):
    # TODO: Test and create docs for
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Generate the schema
    field_list = []
    # TODO: Catch and exit gracefully
    # Will fail if the extractor does not have the class var schema_fields
    for k, v in self.schema_fields.items():
        field_list.append(pa.field(k, v))
    schema = pa.schema(field_list)

    # Create pyarrow table
    column_names = []
    columns = []
    for column in schema:
        column_values = [dic.get(column.name) for dic in self.data]
        try:
            columns.append(pa.array(column_values, type=column.type))
        except Exception:
            logger.exception(("Could not create array"
                              f" for column: {column.name}"))
            raise
        column_names.append(column.name)

    record_batch = pa.RecordBatch.from_arrays(columns, column_names)
    table = pa.Table.from_batches([record_batch])

    output_io = io.BytesIO()
    pq.write_table(table, output_io)
    output_io.seek(0)

    # TODO: Is this the correct content type for a parquet file?
    return SaveTo(self.scraper, output_io,
                  content_type='application/octet-stream',
                  encoding=self.encoding)
def df_to_parquet(df, filename, compression='SNAPPY'):
    """write_to_parquet: Converts a Pandas DataFrame into a Parquet file

    Args:
        df (pandas dataframe): The Pandas Dataframe to be saved as parquet file
        filename (string): The full path to the filename for the Parquet file
    """
    # Right now there are two open Parquet issues
    # Timestamps in Spark: https://issues.apache.org/jira/browse/ARROW-1499
    # TimeDelta Support: https://issues.apache.org/jira/browse/ARROW-835
    for column in df.columns:
        if df[column].dtype == 'timedelta64[ns]':
            print('Converting timedelta column {:s}...'.format(column))
            df[column] = df[column].astype(str)

    arrow_table = pa.Table.from_pandas(df)
    if compression == 'UNCOMPRESSED':
        compression = None
    pq.write_table(arrow_table, filename, compression=compression,
                   use_deprecated_int96_timestamps=True)
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {
                k: v for k, v in kwargs.items()
                if k in _pyarrow_write_metadata_kwargs
            }
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
def test_metadata_exceeds_message_size():
    # ARROW-13655: Thrift may enable a default message size that limits
    # the size of Parquet metadata that can be written.
    NCOLS = 1000
    NREPEATS = 4000

    table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})

    with pa.BufferOutputStream() as out:
        pq.write_table(table, out)
        buf = out.getvalue()

    original_metadata = pq.read_metadata(pa.BufferReader(buf))
    metadata = pq.read_metadata(pa.BufferReader(buf))
    for i in range(NREPEATS):
        metadata.append_row_groups(original_metadata)

    with pa.BufferOutputStream() as out:
        metadata.write_metadata_file(out)
        buf = out.getvalue()

    metadata = pq.read_metadata(pa.BufferReader(buf))
def fixColumnNames(pdf):
    """
    Copy files from cc3 folder to temporary location for fixing column names
    :return:
    """
    try:
        hdfs = pa.hdfs.connect("dantooine10dot", 8020)
        old_folder_name = "/cc3/"
        new_folder_name = "/fixed_column_names/"
        user_folder = pdf["user_folder"].iloc[0]
        new_location = user_folder.replace(old_folder_name, new_folder_name)
        hdfs.mkdir(new_location)
        for index, row in pdf.iterrows():
            file_name = "hdfs://dantooine10dot:8020" + row["file_name"]
            if hdfs.exists(row["file_name"].replace(old_folder_name,
                                                    new_folder_name)):
                # print("ALREADY PROCESSED - ",
                #       row["file_name"].replace(old_folder_name, new_folder_name))
                pass
            else:
                data = pq.read_table(file_name)
                try:
                    data2 = data.drop(["__index_level_0__"])
                except:
                    data2 = data
                if isinstance(row["corrected_schema"], str):
                    new_column_names = eval(row["corrected_schema"])
                else:
                    new_column_names = row["corrected_schema"]
                data3 = data2.rename_columns(new_column_names)
                pq.write_table(data3,
                               file_name.replace(old_folder_name, new_folder_name))
        return pd.DataFrame([[user_folder, 1]],
                            columns=['user_folder', 'success'])
    except Exception as e:
        print("*" * 10, user_folder, str(e))
        return pd.DataFrame([[user_folder, 0]],
                            columns=['user_folder', 'success'])
def test_parquet_nested_storage(tmpdir):
    # Parquet support for extension types with nested storage type
    import pyarrow.parquet as pq

    struct_array = pa.StructArray.from_arrays(
        [pa.array([0, 1], type="int64"), pa.array([4, 5], type="int64")],
        names=["left", "right"])
    list_array = pa.array([[1, 2, 3], [4, 5]], type=pa.list_(pa.int32()))

    mystruct_array = pa.ExtensionArray.from_storage(MyStructType(),
                                                    struct_array)
    mylist_array = pa.ExtensionArray.from_storage(MyListType(), list_array)

    orig_table = pa.table({'structs': mystruct_array,
                           'lists': mylist_array})
    filename = tmpdir / 'nested_extension_storage.parquet'
    pq.write_table(orig_table, filename)

    table = pq.read_table(filename)
    assert table.column(0).type == mystruct_array.type
    assert table.column(1).type == mylist_array.type
    assert table == orig_table
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    # ARROW-1957: the Parquet version 2.0 writer preserves Arrow
    # nanosecond timestamps by default
    n = 9
    df = pd.DataFrame({'x': range(n)},
                      index=pd.date_range('2017-01-01', freq='1n', periods=n))
    tb = pa.Table.from_pandas(df)

    filename = tempdir / 'written.parquet'
    try:
        pq.write_table(tb, filename, version='2.6')
    except Exception:
        pass
    assert filename.exists()

    recovered_table = pq.read_table(filename)
    assert tb.equals(recovered_table)

    # Loss of data through coercion (without explicit override) still an error
    filename = tempdir / 'not_written.parquet'
    with pytest.raises(ValueError):
        pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
def test_fastparquet_read_with_hdfs(client):
    try:
        import snappy  # noqa
    except ImportError:
        pytest.skip('fastparquet test requires snappy')

    import pyarrow.parquet as pq
    fastparquet = pytest.importorskip('fastparquet')

    fs = hdfs_test_client(client)

    df = pdt.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    pdt.assert_frame_equal(result, df)
def bench_write(self, niter=2):
    print("Reading text file: {}".format(self.csv_path))
    df = pd.read_csv(self.csv_path, sep=self.sep,
                     header=self.header, low_memory=False)
    if self.header is None:
        df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

    def _get_table(df):
        return (pa.Table.from_pandas(df, preserve_index=False)
                .replace_schema_metadata(None))

    t = _get_table(df)

    cases = [
        ('parquet (UNC)', 'arrow Table',
         lambda: pq.write_table(t, self.parquet_unc_path,
                                compression='NONE')),
        ('parquet (UNC)', 'pandas',
         lambda: pq.write_table(_get_table(df), self.parquet_unc_path,
                                compression='NONE')),
        ('parquet (SNAPPY)', 'arrow Table',
         lambda: pq.write_table(t, self.parquet_snappy_path)),
        ('parquet (SNAPPY)', 'pandas',
         lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
        ('feather V2 (UNC)', 'pandas',
         lambda: feather.write_feather(df, self.feather_unc_path,
                                       compression='uncompressed')),
        ('feather V2 (UNC)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_unc_path,
                                       compression='uncompressed')),
        ('feather V2 (LZ4)', 'pandas',
         lambda: feather.write_feather(df, self.feather_lz4_path,
                                       compression='lz4')),
        ('feather V2 (LZ4)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_lz4_path,
                                       compression='lz4')),
        ('feather V2 (ZSTD)', 'pandas',
         lambda: feather.write_feather(df, self.feather_zstd_path,
                                       compression='zstd')),
        ('feather V2 (ZSTD)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_zstd_path,
                                       compression='zstd'))
    ]
    return self._bench_cases(cases, niter)
def multisourcefs(request):
    request.config.pyarrow.requires('pandas')
    request.config.pyarrow.requires('parquet')
    import pyarrow.parquet as pq

    df = _generate_data(1000)
    mockfs = fs._MockFileSystem()

    # simply split the dataframe into four chunks to construct a data source
    # from each chunk into its own directory
    df_a, df_b, df_c, df_d = np.array_split(df, 4)

    # create a directory containing a flat sequence of parquet files without
    # any partitioning involved
    mockfs.create_dir('plain')
    for i, chunk in enumerate(np.array_split(df_a, 10)):
        path = 'plain/chunk-{}.parquet'.format(i)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with schema partitioning by week and color
    mockfs.create_dir('schema')
    for part, chunk in df_b.groupby([df_b.date.dt.week, df_b.color]):
        folder = 'schema/{}/{}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by year and month
    mockfs.create_dir('hive')
    for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]):
        folder = 'hive/year={}/month={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by color
    mockfs.create_dir('hive_color')
    for part, chunk in df_d.groupby(["color"]):
        folder = 'hive_color/color={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    return mockfs
def execute_perm_imp(respseptup):
    """
    Standalone function writes the importance of this combination of
    responseagg and separation to a subdirectory
    no returns
    """
    responseagg, separation = respseptup
    retpath = OUTDIR / str(responseagg) / str(separation)
    if not retpath.exists():
        X, y = read_data(responseagg=responseagg, separation=separation,
                         quantile=0.666)
        # def wrapper(self, *args, **kwargs):
        #     return self.predict_proba(*args, **kwargs)[:, -1]  # Last class is True
        # RandomForestClassifier.predict = wrapper  # To avoid things inside permutation
        # importance package where it is only possible to invoke probabilistic
        # prediction with twoclass y.
        # m = RandomForestClassifier(max_depth=7, n_estimators=1500,
        #                            min_samples_split=40, max_features=35,
        #                            n_jobs=njobs_per_imp)
        model = HybridExceedenceModel(fit_base_to_all_cv=True, max_depth=5,
                                      n_estimators=2500, min_samples_split=30,
                                      max_features=35, n_jobs=njobs_per_imp)
        ret = permute_importance(model, X_in=X, y_in=y, on_validation=False,
                                 evaluation_fn=brier_score_loss, n_folds=5,
                                 perm_imp_kwargs=dict(nimportant_vars=30,
                                                      njobs=njobs_per_imp,
                                                      nbootstrap=1500))
        retpath.mkdir(parents=True)
        pq.write_table(pa.Table.from_pandas(ret),
                       retpath / 'responsagg_separation.parquet')
        logging.debug(
            f'subprocess has written out importance frame at {retpath}')
    else:
        logging.debug(f'importance frame at {retpath} already exists')
def _write_multiple_hdfs_pq_files(self, tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    test_data = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)

    expected = pa.concat_tables(test_data)
    return expected
def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays([np.arange(10), np.arange(10) + 1],
                                      names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

        d = dd.read_parquet(fn, columns='a', engine=engine)
        assert_eq(d, df['a'])

        d = dd.read_parquet(fn, index=['a', 'b'], columns=['x0', 'x1'],
                            engine=engine)
        assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

        # Just index
        d = dd.read_parquet(fn, index=False, engine=engine)
        assert_eq(d, df2)

        d = dd.read_parquet(fn, index=['a'], engine=engine)
        assert_eq(d, df2.set_index('a')[['b']])

        d = dd.read_parquet(fn, index=['x0'], engine=engine)
        assert_eq(d, df2.set_index('x0')[['a', 'b']])

        # Just columns
        d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
        assert_eq(d, df2.set_index('x1')[['x0', 'a']])

        # Both index and columns
        d = dd.read_parquet(fn, index=False, columns=['x0', 'b'],
                            engine=engine)
        assert_eq(d, df2[['x0', 'b']])

        for index in ['x1', 'b']:
            d = dd.read_parquet(fn, index=index, columns=['x0', 'a'],
                                engine=engine)
            assert_eq(d, df2.set_index(index)[['x0', 'a']])

        # Columns and index intersect
        for index in ['a', 'x0']:
            with pytest.raises(ValueError):
                d = dd.read_parquet(fn, index=index, columns=['x0', 'a'],
                                    engine=engine)

        # Series output
        for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                                 (False, 'b', df2),
                                 (False, 'x0', df2),
                                 ('a', 'x0', df2.set_index('a')),
                                 ('a', 'b', df2.set_index('a'))]:
            d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
            assert_eq(d, sol_df[col])
def _write_table(*args, **kwargs):
    import pyarrow.parquet as pq

    return pq.write_table(*args, **kwargs)
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)