def test(tmp_path): filename = os.path.join(tmp_path, "test.parquet") ak.to_parquet(ak.repartition(range(8), 2), filename) assert ak.from_parquet(filename, row_groups=[1, 3]).tolist() == [2, 3, 6, 7] assert ak.from_parquet(filename, row_groups=[1, 3], lazy=True).tolist() == [ 2, 3, 6, 7, ] assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7] assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [ 2, 3, 6, 7, ] ak.to_parquet.dataset(tmp_path) assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7] assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [ 2, 3, 6, 7, ]
def test_8(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test8.parquet")
    data = [
        {"x": []},
        {"x": [{"y": one, "z": 1.1}]},
        {"x": [{"y": one, "z": 1.1}, {"y": two, "z": 2.2}, {"y": three, "z": 3.3}]},
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.y[0]",
    }
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.y[0]",
        "tmp:col:x.list.item.z[0]",
    }
    assert array.tolist() == data

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y:x[0]"}
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.z[0]",
    }
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.z[0]",
        "tmp:col:x.list.item.y[0]",
    }
    assert array.tolist() == data
def test(tmp_path): filename = os.path.join(tmp_path, "what-ever.parquet") fish = ak.Array([True, True])[np.newaxis] clob = ak.Array([2, 3, 7])[np.newaxis] frog = ak.zip({"c": clob, "f": fish}, depth_limit=1) ak.to_parquet(frog, filename) assert ak.from_parquet(filename).tolist() == frog.tolist()
def test_12(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test12.parquet")
    data = [
        {"x": {"y": []}},
        {"x": {"y": [[one]]}},
        {"x": {"y": [[one, two], [], [three]]}},
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x.y[0]"}
    assert array.tolist() == data
def test_to_parquet_2(tmp_path):
    array = ak.Array(
        [
            [{"x": 0.0, "y": []}, {"x": 1.1, "y": [1]}, {"x": 2.2, "y": None}],
            [],
            [{"x": 3.3, "y": [1, 2, 3]}, None, {"x": 4.4, "y": [1, 2, 3, 4]}],
        ]
    )
    assert str(array.type) == '3 * var * ?{"x": float64, "y": option[var * int64]}'
    ak.to_parquet(array, os.path.join(tmp_path, "complicated-example.parquet"))
    array2 = ak.from_parquet(os.path.join(tmp_path, "complicated-example.parquet"))
    assert str(array2.type) == str(array.type)
    assert array2.tolist() == array.tolist()
def test_no_fields(tmp_path):
    one = ak.Array([[1, 2, 3], [], [4, 5]])
    two = ak.Array([[6], [7, 8, 9, 10]])
    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")

    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]
    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]
    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]
def save(self):
    """
    Save the best model, the event arrays (as parquet), and the training
    summary/metadata to `self.output_dir`.
    """
    logger.info(
        "[DNNHelper : save] Saving events and metadata to directory '%s'."
        % (self.output_dir)
    )

    self.best_model = self.output_dir + "/models/epoch_%d" % self.best_epoch
    os.system("mkdir -p %s" % (self.output_dir + "/model_best"))
    os.system("cp -r %s %s" % (self.best_model, self.output_dir + "/model_best"))
    logger.info(
        "[DNNHelper : save] Best model saved to path '%s'."
        % (self.output_dir + "/model_best")
    )

    # Save parquet files with added fields
    awkward.to_parquet(self.events_cl, self.output_dir + "/events_cl.parquet")
    os.system(
        "cp %s %s"
        % (self.input_dir_cl + "/summary.json", self.output_dir + "/hdna_summary_cl.json")
    )
    if self.has_da_events:
        awkward.to_parquet(self.events_da, self.output_dir + "/events_da.parquet")
        os.system(
            "cp %s %s"
            % (self.input_dir_da + "/summary.json", self.output_dir + "/hdna_summary_da.json")
        )

    # Save config and metadata
    self.summary = {"metadata": self.metadata, "config": self.config}
    with open(self.output_dir + "/training_summary.json", "w") as f_out:
        json.dump(self.summary, f_out, indent=4, sort_keys=True)
def test_with_fields(tmp_path):
    one_list = [[{"x": 1}, {"x": 2}, {"x": 3}], [], [{"x": 4}, {"x": 5}]]
    two_list = [[{"x": 6}], [{"x": 7}, {"x": 8}, {"x": 9}, {"x": 10}]]
    one = ak.Array(one_list)
    two = ak.Array(two_list)
    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")

    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == one_list + two_list
    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == one_list + two_list

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == one_list + two_list
    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == one_list + two_list
def test_3(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test3.parquet")
    data = [
        {"x": {"y": one, "z": 1.1}},
        {"x": {"y": two, "z": 2.2}},
        {"x": {"y": three, "z": 3.3}},
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("z").array
    assert set(array.caches[0].keys()) == {"tmp:col:x.z[0]"}
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:col:x.z[0]", "tmp:col:x.y[0]"}
    assert array.tolist() == data
def test_parquet2b(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array(
        [
            {"x": [{"y": 0.0, "z": 0}]},
            {"x": [{"y": 1.1, "z": 1}]},
            {"x": [{"y": 2.2, "z": 2}]},
        ]
    )
    ak.to_parquet(array, filename)
    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.y
                i += 1
        return out

    @numba.njit
    def f2(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.z
                i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
    assert f2(lazy).tolist() == [0, 1, 2]
def test():
    array = ak.Array([1, 2, 3])
    file_ = io.BytesIO()
    ak.to_parquet(array, file_)
    file_.seek(0)
    array_from_file = ak.from_parquet(file_)
    assert ak.to_list(array) == ak.to_list(array_from_file)
def test_1(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test1.parquet")
    data = [{"x": one}, {"x": two}, {"x": three}]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:col:x[0]"}
    assert array.tolist() == data
def test_4(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test4.parquet")
    data = [{"x": []}, {"x": [one]}, {"x": [one, two, three]}]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x[0]"}
    assert array.tolist() == data
def test_15(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test15.parquet")
    data = [one, two, three]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.array
    assert set(array.caches[0].keys()) == {"tmp:col:[0]"}
    assert array.tolist() == data
def test_16(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test16.parquet")
    data = [[one, two], [], [three]]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == {"tmp:lst:[0]"}
    assert array.tolist() == data
def test(tmp_path): filename = os.path.join(tmp_path, "test.parquet") dog = ak.from_iter([1, 2, 5]) cat = ak.from_iter([4]) pets = ak.zip({ "dog": dog[np.newaxis], "cat": cat[np.newaxis] }, depth_limit=1) ak.to_parquet(pets, filename) assert ak.from_parquet(filename).tolist() == pets.tolist()
def test_17(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test17.parquet")
    data = [[{"x": one}, {"x": two}], [], [{"x": three}]]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == {"tmp:off:.list.item.x:[0]"}
    array.layout.array.content.field("x").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:.list.item.x:[0]",
        "tmp:col:.list.item.x[0]",
    }
    assert array.tolist() == data
def test_parquet1(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array([{"x": {"y": 0.0}}, {"x": {"y": 1.1}}, {"x": {"y": 2.2}}])
    ak.to_parquet(array, filename)
    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            out[i] = obj.x.y
            i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
def dump(cls, path, obj, *args, **kwargs):
    path = get_path(path)

    if path.endswith(".parquet"):
        import awkward as ak
        return ak.to_parquet(obj, path, *args, **kwargs)

    # .pickle, .pkl
    return PickleFormatter.dump(path, obj, *args, **kwargs)
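# For context, a hypothetical usage of the dispatcher above; "Formatter" stands
# in for whatever class defines `dump` (its real name is not shown in this
# snippet), so this is a sketch of the routing behavior, not the project's API.
import awkward as ak

Formatter.dump("events.parquet", ak.Array([[1, 2], [], [3]]))  # -> ak.to_parquet
Formatter.dump("state.pkl", {"epoch": 3})                      # -> PickleFormatter.dump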
def test_14(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test14.parquet")
    data = [
        {"x": [{"y": [], "z": 1.1}]},
        {"x": []},
        {"x": [{"y": [one, two, three], "z": 3.3}]},
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.list.item:x[0]"}
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:col:x.list.item.z[0]",
    }
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:col:x.list.item.z[0]",
        "tmp:lst:x.list.item.y[0]",
    }
    assert array.tolist() == data

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.list.item:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:lst:x.list.item.y[0]",
    }
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:lst:x.list.item.y[0]",
        "tmp:col:x.list.item.z[0]",
    }
    assert array.tolist() == data
def test_11(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test11.parquet")
    data = [
        {"x": []},
        {"x": [{"z": 1.1, "y": {"q": one}}]},
        {
            "x": [
                {"z": 1.1, "y": {"q": one}},
                {"z": 2.2, "y": {"q": two}},
                {"z": 3.3, "y": {"q": three}},
            ]
        },
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data
def test_9(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test9.parquet")
    data = [
        {"x": []},
        {"x": [{"y": {"q": one}}]},
        {"x": [{"y": {"q": one}}, {"y": {"q": two}}, {"y": {"q": three}}]},
    ]
    ak.to_parquet(ak.Array(data), filename)

    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.q:x[0]",
        "tmp:col:x.list.item.y.q[0]",
    }
    assert array.tolist() == data
def test(tmp_path):
    one = ak.Array([[], [{"x": [{"y": 1}]}]])
    two = ak.Array([[{"x": []}, {"x": [{"y": 1}]}]])
    three = ak.Array([[{"x": [{"y": 1}]}], [], [{"x": [{"y": 2}]}]])

    ak.to_parquet(one, tmp_path / "one.parquet")
    ak.to_parquet(two, tmp_path / "two.parquet")
    ak.to_parquet(three, tmp_path / "three.parquet")

    lazy_one = ak.from_parquet(tmp_path / "one.parquet", lazy=True)
    lazy_two = ak.from_parquet(tmp_path / "two.parquet", lazy=True)
    lazy_three = ak.from_parquet(tmp_path / "three.parquet", lazy=True)

    assert lazy_one.tolist() == [[], [{"x": [{"y": 1}]}]]
    assert lazy_two.tolist() == [[{"x": []}, {"x": [{"y": 1}]}]]
    assert lazy_three.tolist() == [[{"x": [{"y": 1}]}], [], [{"x": [{"y": 2}]}]]
import os

import awkward as ak
import toml
import uproot

from coffea.nanoevents import schemas

if __name__ == "__main__":
    config_dict = {
        "skyhook": {
            "ceph_config_path": "/tmp/testskyhookjob/ceph.conf",
            "ceph_data_pool": "cephfs_data",
        }
    }
    with open("/root/.coffea.toml", "w") as f:
        toml.dump(config_dict, f)

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dy.root:Events"),
        "nano_dy.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )
    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dimuon.root:Events"),
        "nano_dimuon.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )

    os.makedirs("/mnt/cephfs/nanoevents/ZJets")
    os.makedirs("/mnt/cephfs/nanoevents/Data")
def test_to_parquet(tmp_path):
    original = ak.Array(
        [
            [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
            [],
            [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
            [],
            [],
            [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
        ]
    )
    ak.to_parquet(original, os.path.join(tmp_path, "data.parquet"))
    reconstituted = ak.from_parquet(os.path.join(tmp_path, "data.parquet"))
    assert reconstituted.tolist() == [
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ]
    assert str(reconstituted.type) == '6 * var * {"x": int64, "y": float64}'
stop0 = int(min(stop0 + events_per_basket, len(content)))
c = content[start0:stop0]
partitions.append(ak.Array(ak.layout.NumpyArray(c), check_valid=True))
start0 = stop0

for level in [None]:  # [9, 1]:
    print("level", level)
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour"
        + str(level)
        + "-jagged0.parquet",
        list_to32=True,
        compression="LZ4",
        compression_level=level,
        use_dictionary=False,
        write_statistics=False,
        data_page_size=100 * 1024**2,
    )

    print("level", level, "split")
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour"
        + str(level)
        + "-split-jagged0.parquet",
        list_to32=True,
        compression="LZ4",
        compression_level=level,
        use_dictionary=False,
        write_statistics=False,
def parse_to_parquet(
    base_output_filename: Union[Path, str],
    store_only_necessary_columns: bool,
    input_filename: Union[Path, str],
    events_per_chunk: int,
    parser: str = "pandas",
    max_chunks: int = -1,
    compression: str = "zstd",
    compression_level: Optional[int] = None,
) -> None:
    """Parse the JETSCAPE ASCII and convert it to parquet, (potentially) storing only the minimum necessary columns.

    Args:
        base_output_filename: Base output filename. Should include the entire path.
        store_only_necessary_columns: If True, store only the necessary columns, rather than all of them.
        input_filename: Filename of the input JETSCAPE ASCII file.
        events_per_chunk: Number of events to be read per chunk.
        parser: Name of the parser. Default: "pandas".
        max_chunks: Maximum number of chunks to read. Default: -1.
        compression: Compression algorithm for parquet. Default: "zstd". Options include:
            ["snappy", "gzip", "zstd"]. "gzip" is slightly better for storage, but slower.
            See the compression tests and parquet docs for more.
        compression_level: Compression level for parquet. Default: `None`, which lets parquet
            choose the best value.

    Returns:
        None. The parsed events are stored in parquet files.
    """
    # Validation
    base_output_filename = Path(base_output_filename)
    # Setup the base output directory
    base_output_filename.parent.mkdir(parents=True, exist_ok=True)

    # We will check which fields actually exist when writing.
    possible_fields_containing_floats = [
        "event_plane_angle",
        "event_weight",
        "cross_section",
        "cross_section_error",
        "px",
        "py",
        "pz",
        "E",
    ]

    for i, arrays in enumerate(
        read(filename=input_filename, events_per_chunk=events_per_chunk, parser=parser)
    ):
        # Reduce to the minimum required data.
        if store_only_necessary_columns:
            arrays = full_events_to_only_necessary_columns_E_px_py_pz(arrays)
        else:
            # To match the steps taken when reducing the columns, re-zip with the depth
            # limited to 1. As of April 2021, it is not certain this is still required,
            # but it may be needed for parquet writing to succeed (apparently parquet
            # couldn't handle lists of structs sometime in 2020; the status in April 2021
            # is unclear, but not worth digging into now).
            arrays = ak.zip(dict(zip(ak.fields(arrays), ak.unzip(arrays))), depth_limit=1)

        # If converting in chunks, add an index to the output file so the chunks
        # don't overwrite each other.
        if events_per_chunk > 0:
            suffix = base_output_filename.suffix
            output_filename = (
                base_output_filename.parent / f"{base_output_filename.stem}_{i:02}"
            ).with_suffix(suffix)
        else:
            output_filename = base_output_filename

        # Optimize the output.
        # Additional parquet options are based on https://stackoverflow.com/a/66854439/12907985
        # byte_stream_fields apparently only works for float fields; other fields should be
        # handled by use_dictionary. Since parquet can't determine this split automatically,
        # we have to define it ourselves. This is a bit brittle if fields change, but they
        # change rarely, and it's simpler than parsing field types, so it should be fine for now.
        byte_stream_fields = [
            field for field in ak.fields(arrays) if field in possible_fields_containing_floats
        ]
        dict_fields = [
            field for field in ak.fields(arrays) if field not in possible_fields_containing_floats
        ]
        # logger.debug(f"dict_fields: {dict_fields}")
        # logger.debug(f"byte_stream_fields: {byte_stream_fields}")

        # Parquet with zlib seems to do about the same as ascii tar.gz when we drop
        # unneeded columns. And it should load much faster!
        ak.to_parquet(
            arrays,
            output_filename,
            compression=compression,
            compression_level=compression_level,
            explode_records=False,
            # Additional parquet options are based on https://stackoverflow.com/a/66854439/12907985
            # use_dictionary=True,
            # use_byte_stream_split=True,
            use_dictionary=dict_fields,
            use_byte_stream_split=byte_stream_fields,
        )

        # Break now so we don't have to read the next chunk.
        if (i + 1) == max_chunks:
            break
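# A sketch of how parse_to_parquet might be invoked; the filenames and chunk
# size below are placeholders for illustration, not values from the original
# project.
parse_to_parquet(
    base_output_filename="output/jetscape.parquet",
    store_only_necessary_columns=True,
    input_filename="input/jetscape.out",
    events_per_chunk=50000,
)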
ak.to_parquet(
    ak.Array(
        [
            {"x": [{"y": {"q": 1}, "z": 1.1}]},
            {"x": [{"y": {"q": 1}, "z": 1.1}, {"y": {"q": 2}, "z": 2.2}]},
            {
                "x": [
                    {"y": {"q": 1}, "z": 1.1},
                    {"y": {"q": 2}, "z": 2.2},
                    {"y": {"q": 3}, "z": 3.3},
                ]
            },
        ]
    ),
    "tmp.parquet",
)
def test_explode(tmp_path):
    array3 = ak.Array(
        [
            [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
            [],
            [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
            [],
            [],
            [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
        ]
    )
    array4 = ak.repartition(array3, 2)

    ak.to_parquet(array3, os.path.join(tmp_path, "array3.parquet"), explode_records=True)
    ak.to_parquet(array4, os.path.join(tmp_path, "array4.parquet"), explode_records=True)

    expected = [
        {"x": [1, 2, 3], "y": [1.1, 2.2, 3.3]},
        {"x": [], "y": []},
        {"x": [4, 5], "y": [4.4, 5.5]},
        {"x": [], "y": []},
        {"x": [], "y": []},
        {"x": [6, 7, 8, 9], "y": [6.6, 7.7, 8.8, 9.9]},
    ]
    assert ak.from_parquet(os.path.join(tmp_path, "array3.parquet")).tolist() == expected
    assert ak.from_parquet(os.path.join(tmp_path, "array4.parquet")).tolist() == expected
ew_shapes = Components.EventWise.from_file("../megaIgnore/IRC_shapes2.parquet")
for n in range(1, 5):
    spectral_shapes = [
        [[[] for _ in spectral_jets] for _ in ew_shapes.shape_names]
        for _ in ew_shapes.orders
    ]
    for order in ["nlo", "lo"]:
        o_idx = list(ew_shapes.orders).index(order)
        name = file_name.format(n, order)
        kinematics = ak.from_parquet(name)
        print(name)
        print("Getting jet shapes")
        for j_idx, jname in enumerate(spectral_jets):
            print('.', end='', flush=True)
            ew.selected_event = None
            for event_n in range(len(kinematics[o_idx, 0, 0])):
                ew.selected_event = event_n
                event_kinematics = ak.to_numpy(kinematics[o_idx, 1:, j_idx, event_n, :])
                if np.any(np.isnan(event_kinematics)) or len(event_kinematics) != 4:
                    shapes = [np.nan for _ in ew_shapes.shape_names]
                else:
                    shape_dict = ShapeVariables.shape(*event_kinematics)[1]
                    shapes = [shape_dict[name] for name in ew_shapes.shape_names]
                for i, val in enumerate(shapes):
                    spectral_shapes[o_idx][i][j_idx].append(val)
    # Save once per n, after both orders have been filled in; converting to an
    # Awkward Array earlier would break the in-place appends above.
    print("Saving")
    save_name = name.replace("kinematics", "shapes")
    spectral_shapes = ak.from_iter(spectral_shapes)
    ak.to_parquet(spectral_shapes, save_name)