def test_native_file_TextIOWrapper(tmpdir):
    data = (u'foooo\n'
            u'barrr\n'
            u'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iteration works
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data

def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)

def test_dataset_read_pandas(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 5
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = np.arange(i * size, (i + 1) * size)
        df.index.name = 'index'

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])

    tm.assert_frame_equal(result, expected)

def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'

def test_read_multiple_parquet_files(self):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

    self.hdfs.mkdir(tmpdir)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)
        paths.append(path)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())

def test_read_multiple_parquet_files(self):
    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    result = self.hdfs.read_parquet(tmpdir)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())

def sample_disk_data(request):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)
    request.addfinalizer(teardown)
    return path, data

def test_read_multiple_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    path = _get_hdfs_uri(tmpdir)
    result = pq.read_table(path)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())

def s3_example():
    access_key = os.environ['PYARROW_TEST_S3_ACCESS_KEY']
    secret_key = os.environ['PYARROW_TEST_S3_SECRET_KEY']
    bucket_name = os.environ['PYARROW_TEST_S3_BUCKET']

    import s3fs
    fs = s3fs.S3FileSystem(key=access_key, secret=secret_key)

    test_dir = guid()

    bucket_uri = 's3://{0}/{1}'.format(bucket_name, test_dir)
    fs.mkdir(bucket_uri)
    yield fs, bucket_uri
    fs.rm(bucket_uri, recursive=True)

def sample_disk_data(request, tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)
    request.addfinalizer(teardown)
    return path, data

def test_read_multiple_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    path = _get_hdfs_uri(tmpdir)
    result = pq.read_table(path)

    _pandas_api.assert_frame_equal(result.to_pandas()
                                   .sort_values(by='index')
                                   .reset_index(drop=True),
                                   expected.to_pandas())

def _write_partitioned(
    table, root_path, partition_cols, fs, preserve_index=True, **kwargs
):
    """Write table to a partitioned dataset with pyarrow.

    Logic copied from pyarrow.parquet.
    (arrow/python/pyarrow/parquet.py::write_to_dataset)

    TODO: Remove this in favor of pyarrow's `write_to_dataset`
          once ARROW-8244 is addressed.
    """
    fs.mkdirs(root_path, exist_ok=True)

    df = table.to_pandas(ignore_metadata=False)

    partition_keys = [df[col] for col in partition_cols]
    data_df = df.drop(partition_cols, axis="columns")
    data_cols = df.columns.drop(partition_cols)
    if len(data_cols) == 0 and not preserve_index:
        raise ValueError("No data left to save outside partition columns")

    subschema = table.schema
    for col in table.schema.names:
        if col in partition_cols:
            subschema = subschema.remove(subschema.get_field_index(col))

    md_list = []
    for keys, subgroup in data_df.groupby(partition_keys):
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = fs.sep.join(
            [
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ]
        )
        subtable = pa.Table.from_pandas(
            subgroup, preserve_index=False, schema=subschema, safe=False
        )
        prefix = fs.sep.join([root_path, subdir])
        fs.mkdir(prefix, exists_ok=True)
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([prefix, outfile])
        with fs.open(full_path, "wb") as f:
            pq.write_table(subtable, f, metadata_collector=md_list, **kwargs)

        md_list[-1].set_file_path(fs.sep.join([subdir, outfile]))

    return md_list

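A minimal usage sketch for the helper above, assuming a local fsspec filesystem; the paths and column names are illustrative, not part of the original code. The returned md_list holds one FileMetaData object per written file, with its file path already set relative to the dataset root.

# Hypothetical usage of _write_partitioned; "year" is the partition column.
import fsspec
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df, preserve_index=False)

fs = fsspec.filesystem("file")
md_list = _write_partitioned(table, "/tmp/example_dataset", ["year"], fs,
                             preserve_index=False)
# md_list can later be combined into a _metadata sidecar file if desired.
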
def test_dataset_read_pandas_common_metadata(tmpdir):
    # ARROW-1103
    import pyarrow.parquet as pq

    nfiles = 5
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size))
        df.index.name = 'index'

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        df_ex_index = df.reset_index(drop=True)
        df_ex_index['index'] = df.index
        table = pa.Table.from_pandas(df_ex_index, preserve_index=False)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(df)
    pq.write_metadata(table_for_metadata.schema, pjoin(dirpath, '_metadata'))

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])

    tm.assert_frame_equal(result, expected)

def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()

def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = test_parquet._test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, path, filesystem=self.hdfs)
    result = pq.read_table(path, filesystem=self.hdfs).to_pandas()

    _pandas_api.assert_frame_equal(result, df)

def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = test_parquet._test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, path)
    result = pq.read_table(path).to_pandas()

    pdt.assert_frame_equal(result, df)

def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write('foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)

def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)

def test_memory_map_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        f = io.MemoryMappedFile(path, mode='r+w')

        f.seek(10)
        f.write('peekaboo')
        assert f.tell() == 18

        f.seek(10)
        assert f.read(8) == b'peekaboo'

        f2 = io.MemoryMappedFile(path, mode='r+w')

        f2.seek(10)
        f2.write(b'booapeak')
        f2.seek(10)

        f.seek(10)
        assert f.read(8) == b'booapeak'

        # Does not truncate file
        f3 = io.MemoryMappedFile(path, mode='w')
        f3.write('foo')

        with io.MemoryMappedFile(path) as f4:
            assert f4.size() == SIZE

        with pytest.raises(IOError):
            f3.read(5)

        f.seek(0)
        assert f.read(3) == b'foo'
    finally:
        _try_delete(path)

def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.compat import guid

    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs,
                                               _spark_stop)
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config, log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
                  'treename': 'Events'},
        'Data': {'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')],
                 'treename': 'Events'}
    }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist, processor_instance=proc,
                          executor=spark_executor, spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    _spark_stop(spark)

    assert(sum(spark_executor.counts.values()) == 20)
    assert(hists['cutflow']['ZJets_pt'] == 4)
    assert(hists['cutflow']['ZJets_mass'] == 1)
    assert(hists['cutflow']['Data_pt'] == 15)
    assert(hists['cutflow']['Data_mass'] == 5)

def test_os_file_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        # Truncates file
        f2 = io.OSFile(path, mode='w')
        f2.write('foo')

        with io.OSFile(path) as f3:
            assert f3.size() == 3

        with pytest.raises(IOError):
            f2.read(5)
    finally:
        _try_delete(path)

def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write('peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write('foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'

def _visit_level(base_dir, level, part_keys):
    name, values = partition_spec[level]
    for value in values:
        this_part_keys = part_keys + [(name, value)]

        level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
        fs.mkdir(level_dir)

        if level == DEPTH - 1:
            # Generate example data
            file_path = pjoin(level_dir, guid())
            filtered_df = _filter_partition(df, this_part_keys)
            part_table = pa.Table.from_pandas(filtered_df)
            with fs.open(file_path, 'wb') as f:
                _write_table(part_table, f)
            assert fs.exists(file_path)

            _touch(pjoin(level_dir, '_SUCCESS'))
        else:
            _visit_level(level_dir, level + 1, this_part_keys)
            _touch(pjoin(level_dir, '_SUCCESS'))

def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()),
               ('seek', (0, )),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)

def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'

def test_ignore_private_directories(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        path = pjoin(dirpath, '{0}.parquet'.format(i))

        test_data.append(_write_table(df, path))
        paths.append(path)

    # private directory
    os.mkdir(pjoin(dirpath, '_impala_staging'))

    dataset = pq.ParquetDataset(dirpath)
    assert set(paths) == set(x.path for x in dataset.pieces)

def upsert_to_dataset(table, root_path, partition_cols=None,
                      filesystem=None, preserve_index=True,
                      temp_folder=None, categories=None, **kwargs):
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)
    if temp_folder:
        if not os.path.exists(temp_folder):
            temp_folder = None

    if partition_cols is not None and len(partition_cols) > 0:
        # df is the data in the new table
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])

            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)

            existing_files = [f for f in os.listdir(prefix)
                              if f.endswith('.parquet')]
            if len(existing_files) > 1:
                raise ValueError('Unsupported scenario, multiple files '
                                 'found in path %s' % prefix)
            if len(existing_files) == 1:
                outfile = existing_files[0]
                full_path = "/".join([prefix, outfile])
                old_table = read_table(full_path)

                # get categories before merging
                category_cols = _to_category_cols(subgroup, categories)

                old_subgroup = old_table.to_pandas()
                # TODO: compare old schema with new
                subgroup = _upsert_dataframes(subgroup, old_subgroup)
                # subgroup = pd.concat([subgroup,
                #     old_subgroup[~old_subgroup.index.isin(subgroup.index.values)]])
                for c, v in category_cols.items():
                    subgroup.loc[:, c] = subgroup.loc[:, c].astype(
                        'category', categories=v)
            else:
                outfile = compat.guid() + ".parquet"
                full_path = "/".join([prefix, outfile])

            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index,
                                         schema=subschema)
            write_file = (os.path.join(temp_folder, outfile)
                          if temp_folder else full_path)
            with fs.open(write_file, 'wb') as f:
                write_table(subtable, f, **kwargs)
            if temp_folder:
                shutil.move(write_file, full_path)
    else:
        existing_files = [f for f in os.listdir(root_path)
                          if f.endswith('.parquet')]
        if len(existing_files) > 1:
            raise ValueError('Unsupported scenario, multiple files '
                             'found in path %s' % root_path)
        if len(existing_files) == 1:
            # append use case
            outfile = existing_files[0]
            full_path = "/".join([root_path, outfile])
            old_table = read_table(full_path)

            subgroup = table.to_pandas()
            category_cols = _to_category_cols(subgroup, categories)

            old_subgroup = old_table.to_pandas()
            # TODO: compare old schema with new
            subgroup = _upsert_dataframes(subgroup, old_subgroup)
            # subgroup = pd.concat([
            #     old_subgroup[~old_subgroup.index.isin(subgroup.index)], subgroup])
            for c, v in category_cols.items():
                subgroup.loc[:, c] = subgroup.loc[:, c].astype(
                    'category', categories=v)

            schema = table.schema
            table = Table.from_pandas(
                subgroup,
                preserve_index=preserve_index,
                schema=schema
            )
        else:
            # write use case
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([root_path, outfile])

        write_file = (os.path.join(temp_folder, outfile)
                      if temp_folder else full_path)
        with fs.open(write_file, 'wb') as f:
            write_table(table, f, **kwargs)
        if temp_folder:
            shutil.move(write_file, full_path)

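A hedged usage sketch for the upsert helper above; the data, index name, and root path are illustrative. The helper matches rows through its _upsert_dataframes hook (by DataFrame index in the intended use), so rows sharing an index value within a partition are replaced and new index values are appended.

# Hypothetical usage of upsert_to_dataset; "day" is the partition column.
import pandas as pd
import pyarrow as pa

# Initial write: creates day=.../<uuid>.parquet files under the root.
df1 = pd.DataFrame({"day": ["2021-01-01", "2021-01-02"], "value": [1.0, 2.0]},
                   index=pd.Index([0, 1], name="id"))
upsert_to_dataset(pa.Table.from_pandas(df1), "example_root",
                  partition_cols=["day"])

# Upsert: the row with id=1 is replaced, the row with id=2 is appended.
df2 = pd.DataFrame({"day": ["2021-01-02", "2021-01-02"], "value": [20.0, 3.0]},
                   index=pd.Index([1, 2], name="id"))
upsert_to_dataset(pa.Table.from_pandas(df2), "example_root",
                  partition_cols=["day"])
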
def test_write_to_dataset_no_partitions(self):
    tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
    self.hdfs.mkdir(tmpdir)
    test_parquet._test_write_to_dataset_no_partitions(tmpdir,
                                                      filesystem=self.hdfs)

def write_to_dataset(table, root_path, partition_cols=None, filesystem=None,
                     preserve_index=None, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    **kwargs : dict, kwargs for write_table function.
    """
    if preserve_index is not None:
        warnings.warn('preserve_index argument is deprecated as of 0.13.0 '
                      'and has no effect', DeprecationWarning)

    fs, root_path = _get_filesystem_and_path(filesystem, root_path)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas(ignore_metadata=True)
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')

        subschema = table.schema

        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for col in table.schema.names:
            if col in partition_cols:
                subschema = subschema.remove(subschema.get_field_index(col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = '/'.join(
                ['{colname}={value}'.format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = pa.Table.from_pandas(subgroup, preserve_index=False,
                                            schema=subschema, safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)

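A short usage example for the write_to_dataset wrapper above, as exposed by pyarrow.parquet; the column names and root path are illustrative. The resulting directory layout matches the tree shown in the docstring.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"group1": ["a", "a", "b"],
                   "group2": [1, 2, 1],
                   "value": [0.1, 0.2, 0.3]})
table = pa.Table.from_pandas(df, preserve_index=False)

# Produces example_root/group1=a/group2=1/<uuid>.parquet and so on.
pq.write_to_dataset(table, "example_root",
                    partition_cols=["group1", "group2"])

# Reading the dataset back restores the partition columns from the paths.
restored = pq.ParquetDataset("example_root").read().to_pandas()
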
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)

def test_read_common_metadata_files(self):
    tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid())
    self.hdfs.mkdir(tmpdir)
    test_parquet._test_read_common_metadata_files(self.hdfs, tmpdir)

def test_write_to_dataset_no_partitions(self):
    tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
    self.hdfs.mkdir(tmpdir)
    test_parquet._test_write_to_dataset_no_partitions(
        tmpdir, filesystem=self.hdfs)

from tqdm import tqdm

import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.compat import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from ..executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = pyspark.sql.SparkSession.builder \
    .appName('coffea-analysis-%s' % guid()) \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.enabled', 'true') \
    .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if 'spark_progress' in kwargs.keys():
        spark_progress = kwargs['spark_progress']

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config('spark.driver.extraJavaOptions',
                                   '-Dlog4jspark.root.logger=ERROR,console')
    if not spark_progress:

def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6],
               result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)

def test_memory_map_deref_remove(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    os.remove(path)  # Shouldn't fail

def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    _touch(pjoin(dirpath, '_SUCCESS.crc'))

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.read_metadata(paths[0])

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6],
               result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(dirpath,
                                     columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.read_metadata(bad_apple_path)

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)

def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0

def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    fs, root_path = _get_filesystem_and_path(filesystem, root_path)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')

        subschema = table.schema

        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for col in table.schema.names:
            if (col.startswith('__index_level_') or col in partition_cols):
                subschema = subschema.remove(subschema.get_field_index(col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join([
                '{colname}={value}'.format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup,
                                            preserve_index=preserve_index,
                                            schema=subschema,
                                            safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)

def write_to_dataset(
    df,
    root_path,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    root_dir/
      group=value1
        <uuid>.parquet
      ...
      group=valueN
        <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict, kwargs for to_parquet function.
    """
    fs = _ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # Loop through the partition groups
        for i, sub_df in enumerate(
            _get_partition_groups(
                df, partition_cols, preserve_index=preserve_index
            )
        ):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = fs.sep.join(
                [
                    "{colname}={value}".format(colname=name, value=val)
                    for name, val in zip(partition_cols, keys)
                ]
            )
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            outfile = guid() + ".parquet"
            full_path = fs.sep.join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            if return_metadata:
                metadata.append(
                    write_df.to_parquet(
                        full_path,
                        index=preserve_index,
                        metadata_file_path=fs.sep.join([subdir, outfile]),
                        **kwargs,
                    )
                )
            else:
                write_df.to_parquet(full_path, index=preserve_index, **kwargs)
    else:
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([root_path, outfile])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=outfile,
                    **kwargs,
                )
            )
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (
            merge_parquet_filemetadata(metadata)
            if len(metadata) > 1
            else metadata[0]
        )

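A hedged usage sketch for the cudf-style variant above; the DataFrame, root path, and partition column are illustrative. With return_metadata=True the per-file Parquet metadata (with file paths relative to the root) is merged and returned, which a caller could persist as a _metadata sidecar.

import cudf

df = cudf.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

# Writes example_root/group=a/<uuid>.parquet and example_root/group=b/<uuid>.parquet.
meta = write_to_dataset(df, "example_root", partition_cols=["group"],
                        return_metadata=True)
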
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(dirpath,
                                     columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)

def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (Table, compat)

    if filesystem is None:
        fs = LocalFileSystem.get_instance()
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)

def write_to_dataset(df, root_path, partition_cols=None,
                     fs=None, preserve_index=False, **kwargs):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    root_dir/
      group=value1
        <uuid>.parquet
      ...
      group=valueN
        <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    **kwargs : dict, kwargs for to_parquet function.
    """
    fs, root_path = pq._get_filesystem_and_path(fs, root_path)
    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # Loop through the partition groups
        for i, sub_df in enumerate(
                _get_partition_groups(df, partition_cols,
                                      preserve_index=preserve_index)):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            write_df.to_parquet(full_path, index=preserve_index, **kwargs)
    else:
        outfile = guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        df.to_parquet(full_path, index=preserve_index, **kwargs)

def random_path():
    return 'feather_{}'.format(guid())

def pandas2Parquet(self, pandasDF, bucket: str, folder: str, file: str,
                   overwrite: bool = False, engine: str = 'auto',
                   compression: str = 'snappy',
                   use_dictionary: bool = False,
                   coerce_timestamps: str = 'ms',
                   partition_cols: list = None,
                   row_group_size: int = None, **kwargs):
    s3Path = ("s3://%s/%s/%s" % (bucket, folder, file) if folder is not None
              else "s3://%s/%s" % (bucket, file))
    self.log.info("Writing the Pandas DF to S3 path %s" % (s3Path))
    if overwrite and self.isFolderPresent(s3Path):
        self.deleteObject(s3Path)

    if partition_cols is not None and len(partition_cols) > 0:
        part_keys = [pandasDF[col] for col in partition_cols]
        data_cols = pandasDF.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError(
                'No data left to save outside partition columns')

        table = pa.Table.from_pandas(pandasDF)
        subschema = table.schema
        for col in table.schema.names:
            if (col.startswith('__index_level_') or col in partition_cols):
                subschema = subschema.remove(
                    subschema.get_field_index(col))

        for keys, subgroup in pandasDF.groupby(part_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join([
                '{colname}={value}'.format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(df=subgroup,
                                            schema=subschema,
                                            preserve_index=False,
                                            safe=False,
                                            nthreads=5)
            prefix = '/'.join([s3Path, subdir])
            if (not overwrite) and self.isFolderPresent(prefix):
                self.deleteObject(prefix)
            outfile = "pyarow-%s.%s.parquet" % (guid(), compression)
            full_path = '/'.join([prefix, outfile])
            self.log.debug("Creating the file: %s" % (full_path))
            self.mkdir(prefix)
            with self._s3fs.open(full_path, 'wb') as f:
                pq.write_table(
                    table=subtable,
                    where=f,
                    compression=compression,
                    flavor='spark',  # Enable Spark compatibility
                    coerce_timestamps=coerce_timestamps,  # Limit the timestamp to milliseconds
                    allow_truncated_timestamps=True,  # Don't raise exception during truncation
                    use_dictionary=use_dictionary,
                    row_group_size=row_group_size,
                    version='2.0',
                    **kwargs)
    else:
        outfile = "pyarow-single-%s.%s.parquet" % (guid(), compression)
        full_path = '/'.join([s3Path, outfile])
        self.log.debug("Creating the file: %s" % (full_path))
        with self._s3fs.open(full_path, 'wb') as f:
            pq.write_table(
                table=pa.Table.from_pandas(df=pandasDF,
                                           preserve_index=False,
                                           nthreads=5),
                where=f,
                compression=compression,
                flavor='spark',  # Enable Spark compatibility
                coerce_timestamps=coerce_timestamps,  # Limit the timestamp to milliseconds
                allow_truncated_timestamps=True,  # Don't raise exception during truncation
                use_dictionary=use_dictionary,
                row_group_size=row_group_size,
                version='2.0',
                **kwargs)

def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)
