Code example #1
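The snippets below are excerpted from test suites and scripts built on the Awkward Array 1.x API (ak.to_parquet, ak.from_parquet, ak.repartition, ak.to_parquet.dataset). Each omits its file-level preamble; a typical one, reconstructed here as an assumption rather than copied from any single source file, is:

import io
import os

import numba
import numpy as np
import awkward as ak

The pytest examples additionally use the built-in tmp_path fixture, and several rely on parametrized one, two, three fixtures that supply test values.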
def test(tmp_path):
    filename = os.path.join(tmp_path, "test.parquet")
    ak.to_parquet(ak.repartition(range(8), 2), filename)

    assert ak.from_parquet(filename, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(filename, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]

    assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]

    ak.to_parquet.dataset(tmp_path)

    assert ak.from_parquet(tmp_path, row_groups=[1, 3]).tolist() == [2, 3, 6, 7]
    assert ak.from_parquet(tmp_path, row_groups=[1, 3], lazy=True).tolist() == [2, 3, 6, 7]
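Why row_groups=[1, 3] comes back as [2, 3, 6, 7]: ak.repartition(range(8), 2) splits the eight values into four partitions of two, and ak.to_parquet writes one row group per partition. A sketch of the layout, inferred from the assertions above:

# Row groups written by ak.to_parquet(ak.repartition(range(8), 2), filename):
#   row group 0 -> [0, 1]
#   row group 1 -> [2, 3]
#   row group 2 -> [4, 5]
#   row group 3 -> [6, 7]
# so row_groups=[1, 3] reads [2, 3] + [6, 7] == [2, 3, 6, 7]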
Code example #2
def test_8(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test8.parquet")
    data = [
        {"x": []},
        {"x": [{"y": one, "z": 1.1}]},
        {"x": [{"y": one, "z": 1.1}, {"y": two, "z": 2.2}, {"y": three, "z": 3.3}]},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set(["tmp:off:x.list.item.y:x[0]"])
    assert np.asarray(
        array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == set(["tmp:off:x.list.item.y:x[0]"])
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == set(
        ["tmp:off:x.list.item.y:x[0]", "tmp:col:x.list.item.y[0]"])
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == set([
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.y[0]",
        "tmp:col:x.list.item.z[0]",
    ])
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set(["tmp:off:x.list.item.y:x[0]"])
    assert np.asarray(
        array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == set(["tmp:off:x.list.item.y:x[0]"])
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == set(
        ["tmp:off:x.list.item.y:x[0]", "tmp:col:x.list.item.z[0]"])
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == set([
        "tmp:off:x.list.item.y:x[0]",
        "tmp:col:x.list.item.z[0]",
        "tmp:col:x.list.item.y[0]",
    ])
    assert array.tolist() == data
Code example #3
def test(tmp_path):
    filename = os.path.join(tmp_path, "what-ever.parquet")
    fish = ak.Array([True, True])[np.newaxis]
    clob = ak.Array([2, 3, 7])[np.newaxis]
    frog = ak.zip({"c": clob, "f": fish}, depth_limit=1)
    ak.to_parquet(frog, filename)
    assert ak.from_parquet(filename).tolist() == frog.tolist()
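The depth_limit=1 is what makes this zip work: fish (two entries) and clob (three entries) can only be paired at the outer, length-1 dimension created by np.newaxis. A minimal check of the resulting structure, restating the arrays above rather than introducing new ones:

# Without depth_limit=1, ak.zip would descend into the unequal-length inner
# lists and fail to broadcast them; depth_limit=1 pairs the whole lists instead.
assert frog.tolist() == [{"c": [2, 3, 7], "f": [True, True]}]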
Code example #4
def test_12(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test12.parquet")
    data = [
        {"x": {"y": []}},
        {"x": {"y": [[one]]}},
        {"x": {"y": [[one, two], [], [three]]}},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == set(["tmp:lst:x.y[0]"])
    assert array.tolist() == data
Code example #5
def test_to_parquet_2(tmp_path):
    array = ak.Array([
        [{"x": 0.0, "y": []}, {"x": 1.1, "y": [1]}, {"x": 2.2, "y": None}],
        [],
        [{"x": 3.3, "y": [1, 2, 3]}, None, {"x": 4.4, "y": [1, 2, 3, 4]}],
    ])
    assert str(array.type) == '3 * var * ?{"x": float64, "y": option[var * int64]}'
    ak.to_parquet(array, os.path.join(tmp_path, "complicated-example.parquet"))
    array2 = ak.from_parquet(
        os.path.join(tmp_path, "complicated-example.parquet"))
    assert str(array2.type) == str(array.type)
    assert array2.tolist() == array.tolist()
Code example #6
def test_no_fields(tmp_path):
    one = ak.Array([[1, 2, 3], [], [4, 5]])
    two = ak.Array([[6], [7, 8, 9, 10]])

    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")
    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6], [7, 8, 9, 10]]

    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                         [7, 8, 9, 10]]

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                      [7, 8, 9, 10]]

    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == [[1, 2, 3], [], [4, 5], [6],
                                           [7, 8, 9, 10]]
Code example #7
    def save(self):
        """

        """
        logger.info(
            "[DNNHelper : save] Saving events and metadata to directory '%s'."
            % (self.output_dir))

        self.best_model = self.output_dir + "/models/epoch_%d" % self.best_epoch
        os.system("mkdir -p %s" % (self.output_dir + "/model_best"))
        os.system("cp -r %s %s" %
                  (self.best_model, self.output_dir + "/model_best"))
        logger.info("[DNNHelper : save] Best model saved to path '%s'." %
                    (self.output_dir + "/model_best"))

        # Save parquet files with added fields
        awkward.to_parquet(self.events_cl,
                           self.output_dir + "/events_cl.parquet")
        os.system("cp %s %s" % (self.input_dir_cl + "/summary.json",
                                self.output_dir + "/hdna_summary_cl.json"))
        if self.has_da_events:
            awkward.to_parquet(self.events_da,
                               self.output_dir + "/events_da.parquet")
            os.system("cp %s %s" % (self.input_dir_da + "/summary.json",
                                    self.output_dir + "/hdna_summary_da.json"))

        # Save config and metadata
        self.summary = {"metadata": self.metadata, "config": self.config}
        with open(self.output_dir + "/training_summary.json", "w") as f_out:
            json.dump(self.summary, f_out, indent=4, sort_keys=True)
Code example #8
def test_with_fields(tmp_path):
    one_list = [[{"x": 1}, {"x": 2}, {"x": 3}], [], [{"x": 4}, {"x": 5}]]
    two_list = [[{"x": 6}], [{"x": 7}, {"x": 8}, {"x": 9}, {"x": 10}]]
    one = ak.Array(one_list)
    two = ak.Array(two_list)

    ak.to_parquet(one, tmp_path / "file1.parquet")
    ak.to_parquet(two, tmp_path / "file2.parquet")
    assert not os.path.exists(tmp_path / "_common_metadata")
    assert not os.path.exists(tmp_path / "_metadata")

    no_metadata = ak.from_parquet(tmp_path)
    assert no_metadata.tolist() == one_list + two_list

    no_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert no_metadata_lazy.tolist() == one_list + two_list

    ak.to_parquet.dataset(tmp_path)
    assert os.path.exists(tmp_path / "_common_metadata")
    assert os.path.exists(tmp_path / "_metadata")

    with_metadata = ak.from_parquet(tmp_path)
    assert with_metadata.tolist() == one_list + two_list

    with_metadata_lazy = ak.from_parquet(tmp_path, lazy=True)
    assert with_metadata_lazy.tolist() == one_list + two_list
Code example #9
def test_3(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test3.parquet")
    data = [
        {"x": {"y": one, "z": 1.1}},
        {"x": {"y": two, "z": 2.2}},
        {"x": {"y": three, "z": 3.3}},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array.field("z").array
    assert set(array.caches[0].keys()) == set(["tmp:col:x.z[0]"])
    array.layout.field("x").array.field("y").array
    assert set(array.caches[0].keys()) == set(
        ["tmp:col:x.z[0]", "tmp:col:x.y[0]"])
    assert array.tolist() == data
Code example #10
def test_parquet2b(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array(
        [
            {"x": [{"y": 0.0, "z": 0}]},
            {"x": [{"y": 1.1, "z": 1}]},
            {"x": [{"y": 2.2, "z": 2}]},
        ]
    )
    ak.to_parquet(array, filename)

    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.y
                i += 1
        return out

    @numba.njit
    def f2(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            for subobj in obj.x:
                out[i] = subobj.z
                i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
    assert f2(lazy).tolist() == [0, 1, 2]
Code example #11
def test():
    array = ak.Array([1, 2, 3])
    file_ = io.BytesIO()
    ak.to_parquet(array, file_)
    file_.seek(0)

    array_from_file = ak.from_parquet(file_)
    assert ak.to_list(array) == ak.to_list(array_from_file)
Code example #12
def test_1(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test1.parquet")
    data = [{"x": one}, {"x": two}, {"x": three}]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == set(["tmp:col:x[0]"])
    assert array.tolist() == data
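Across these lazy-loading tests, the cache keys appear to follow the pattern <lazy_cache_key>:<kind>:<column path>[<partition>]. This anatomy is inferred from the asserted keys themselves (e.g. "tmp:col:x[0]", "tmp:off:x.list.item.y:x[0]", "tmp:lst:x[0]"), not from a documented API:

# Inferred cache-key anatomy (an observation, not a stable contract):
#   tmp   -> the lazy_cache_key passed to ak.from_parquet
#   col   -> a materialized column buffer; off -> offsets read on behalf of a
#            sibling column; lst -> the offsets of a list-typed column
#   x     -> dotted path of the column within the record
#   [0]   -> partition (row-group) index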
Code example #13
def test_4(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test4.parquet")
    data = [{"x": []}, {"x": [one]}, {"x": [one, two, three]}]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:lst:x[0]"}
    assert array.tolist() == data
Code example #14
def test_15(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test15.parquet")
    data = [one, two, three]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.array
    assert set(array.caches[0].keys()) == set(["tmp:col:[0]"])
    assert array.tolist() == data
Code example #15
def test_16(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test15.parquet")
    data = [[one, two], [], [three]]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == set(["tmp:lst:[0]"])
    assert array.tolist() == data
Code example #16
def test(tmp_path):
    filename = os.path.join(tmp_path, "test.parquet")
    dog = ak.from_iter([1, 2, 5])
    cat = ak.from_iter([4])
    pets = ak.zip({"dog": dog[np.newaxis], "cat": cat[np.newaxis]}, depth_limit=1)
    ak.to_parquet(pets, filename)
    assert ak.from_parquet(filename).tolist() == pets.tolist()
Code example #17
def test_17(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test15.parquet")
    data = [[{"x": one}, {"x": two}], [], [{"x": three}]]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    assert np.asarray(array.layout.array.offsets).tolist() == [0, 2, 2, 3]
    assert set(array.caches[0].keys()) == set(["tmp:off:.list.item.x:[0]"])
    array.layout.array.content.field("x").array
    assert set(array.caches[0].keys()) == set(
        ["tmp:off:.list.item.x:[0]", "tmp:col:.list.item.x[0]"])
    assert array.tolist() == data
Code example #18
def test_parquet1(tmp_path):
    filename = os.path.join(tmp_path, "whatever.parquet")
    array = ak.Array([{"x": {"y": 0.0}}, {"x": {"y": 1.1}}, {"x": {"y": 2.2}}])
    ak.to_parquet(array, filename)

    lazy = ak.from_parquet(filename, lazy=True, lazy_cache=None)

    @numba.njit
    def f1(lazy):
        out = np.ones(3, np.float64)
        i = 0
        for obj in lazy:
            out[i] = obj.x.y
            i += 1
        return out

    assert f1(lazy).tolist() == [0.0, 1.1, 2.2]
Code example #19
    def dump(cls, path, obj, *args, **kwargs):
        path = get_path(path)

        if path.endswith(".parquet"):
            import awkward as ak
            return ak.to_parquet(obj, path, *args, **kwargs)

        # .pickle, .pkl
        return PickleFormatter.dump(path, obj, *args, **kwargs)
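A hedged usage sketch for this dispatch hook; the enclosing formatter class, get_path, and PickleFormatter are project-specific names assumed from context rather than verified:

# Hypothetical calls (class name assumed):
# SomeFormatter.dump("out/events.parquet", ak_array)  # routes to ak.to_parquet
# SomeFormatter.dump("out/state.pkl", any_object)     # falls back to PickleFormatter.dump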
Code example #20
def test_14(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test6.parquet")
    data = [
        {"x": [{"y": [], "z": 1.1}]},
        {"x": []},
        {"x": [{"y": [one, two, three], "z": 3.3}]},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.list.item:x[0]"}
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:col:x.list.item.z[0]",
    }
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:col:x.list.item.z[0]",
        "tmp:lst:x.list.item.y[0]",
    }
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.list.item:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:lst:x.list.item.y[0]",
    }
    array.layout.field("x").array.content.field("z").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.list.item:x[0]",
        "tmp:lst:x.list.item.y[0]",
        "tmp:col:x.list.item.z[0]",
    }
    assert array.tolist() == data
Code example #21
def test_11(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test11.parquet")
    data = [
        {"x": []},
        {"x": [{"z": 1.1, "y": {"q": one}}]},
        {
            "x": [
                {"z": 1.1, "y": {"q": one}},
                {"z": 2.2, "y": {"q": two}},
                {"z": 3.3, "y": {"q": three}},
            ]
        },
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert len(set(array.caches[0].keys())) == 1
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("y").array
    assert len(set(array.caches[0].keys())) == 1
    array.layout.field("x").array.content.field("z").array
    assert len(set(array.caches[0].keys())) == 2
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert len(set(array.caches[0].keys())) == 3
    assert array.tolist() == data
Code example #22
def test_9(one, two, three, tmp_path):
    filename = os.path.join(str(tmp_path), "test9.parquet")
    data = [
        {"x": []},
        {"x": [{"y": {"q": one}}]},
        {"x": [{"y": {"q": one}}, {"y": {"q": two}}, {"y": {"q": three}}]},
    ]
    ak.to_parquet(ak.Array(data), filename)
    array = ak.from_parquet(filename, lazy=True, lazy_cache_key="tmp")
    assert set(array.caches[0].keys()) == set()
    array.layout.field("x").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    assert np.asarray(array.layout.field("x").array.offsets).tolist() == [0, 0, 1, 4]
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array
    assert set(array.caches[0].keys()) == {"tmp:off:x.list.item.y.q:x[0]"}
    array.layout.field("x").array.content.field("y").array.field("q").array
    assert set(array.caches[0].keys()) == {
        "tmp:off:x.list.item.y.q:x[0]",
        "tmp:col:x.list.item.y.q[0]",
    }
    assert array.tolist() == data
Code example #23
def test(tmp_path):
    one = ak.Array([[], [{"x": [{"y": 1}]}]])
    two = ak.Array([[{"x": []}, {"x": [{"y": 1}]}]])
    three = ak.Array([[{"x": [{"y": 1}]}], [], [{"x": [{"y": 2}]}]])

    ak.to_parquet(one, tmp_path / "one.parquet")
    ak.to_parquet(two, tmp_path / "two.parquet")
    ak.to_parquet(three, tmp_path / "three.parquet")

    lazy_one = ak.from_parquet(tmp_path / "one.parquet", lazy=True)
    lazy_two = ak.from_parquet(tmp_path / "two.parquet", lazy=True)
    lazy_three = ak.from_parquet(tmp_path / "three.parquet", lazy=True)

    assert lazy_one.tolist() == [[], [{"x": [{"y": 1}]}]]
    assert lazy_two.tolist() == [[{"x": []}, {"x": [{"y": 1}]}]]
    assert lazy_three.tolist() == [[{"x": [{"y": 1}]}], [], [{"x": [{"y": 2}]}]]
Code example #24
import os

import toml
import uproot
import awkward as ak
from coffea.nanoevents import schemas

if __name__ == "__main__":
    config_dict = {
        "skyhook": {
            "ceph_config_path": "/tmp/testskyhookjob/ceph.conf",
            "ceph_data_pool": "cephfs_data",
        }
    }
    with open("/root/.coffea.toml", "w") as f:
        toml.dump(config_dict, f)

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dy.root:Events"),
        "nano_dy.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dimuon.root:Events"),
        "nano_dimuon.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )

    os.makedirs("/mnt/cephfs/nanoevents/ZJets")
    os.makedirs("/mnt/cephfs/nanoevents/Data")
Code example #25
def test_to_parquet(tmp_path):
    original = ak.Array([
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ])

    ak.to_parquet(original, os.path.join(tmp_path, "data.parquet"))
    reconstituted = ak.from_parquet(os.path.join(tmp_path, "data.parquet"))
    assert reconstituted.tolist() == [
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ]
    assert str(reconstituted.type) == '6 * var * {"x": int64, "y": float64}'
Code example #26
    # (excerpt: tail of a loop that slices `content` into per-partition chunks)
    stop0 = int(min(stop0 + events_per_basket, len(content)))

    c = content[start0:stop0]

    partitions.append(ak.Array(ak.layout.NumpyArray(c), check_valid=True))

    start0 = stop0

for level in [None]:  # [9, 1]:
    print("level", level)
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour" +
        str(level) + "-jagged0.parquet",
        list_to32=True,
        compression="LZ4",
        compression_level=level,
        use_dictionary=False,
        write_statistics=False,
        data_page_size=100 * 1024**2,
    )
    print("level", level, "split")
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour" +
        str(level) + "-split-jagged0.parquet",
        list_to32=True,
        compression="LZ4",
        compression_level=level,
        use_dictionary=False,
        write_statistics=False,
        data_page_size=100 * 1024**2,
    )
Code example #27
def parse_to_parquet(base_output_filename: Union[Path, str], store_only_necessary_columns: bool,
                     input_filename: Union[Path, str], events_per_chunk: int, parser: str = "pandas",
                     max_chunks: int = -1, compression: str = "zstd", compression_level: Optional[int] = None) -> None:
    """ Parse the JETSCAPE ASCII and convert it to parquet, (potentially) storing only the minimum necessary columns.

    Args:
        base_output_filename: Basic output filename. Should include the entire path.
        store_only_necessary_columns: If True, store only the necessary columns, rather than all of them.
        input_filename: Filename of the input JETSCAPE ASCII file.
        events_per_chunk: Number of events to be read per chunk.
        parser: Name of the parser. Default: "pandas".
        max_chunks: Maximum number of chunks to read. Default: -1.
        compression: Compression algorithm for parquet. Default: "zstd". Options include: ["snappy", "gzip", "zstd"].
            "gzip" is slightly better for storage, but slower. See the compression tests and parquet docs for more.
        compression_level: Compression level for parquet. Default: `None`, which lets parquet choose the best value.
    Returns:
        None. The parsed events are stored in parquet files.
    """
    # Validation
    base_output_filename = Path(base_output_filename)
    # Setup the base output directory
    base_output_filename.parent.mkdir(parents=True, exist_ok=True)
    # We will check which fields actually exist when writing.
    possible_fields_containing_floats = ["event_plane_angle", "event_weight", "cross_section", "cross_section_error", "px", "py", "pz", "E"]

    for i, arrays in enumerate(read(filename=input_filename, events_per_chunk=events_per_chunk, parser=parser)):
        # Reduce to the minimum required data.
        if store_only_necessary_columns:
            arrays = full_events_to_only_necessary_columns_E_px_py_pz(arrays)
        else:
            # To match the steps taken when reducing the columns, we'll re-zip with the depth limited to 1.
            # As of April 2021, I'm not certain this is still required, but it may be needed for
            # parquet writing to succeed (apparently parquet couldn't handle lists of structs at
            # some point in 2020; the status in April 2021 is unclear, but not worth digging into now).
            arrays = ak.zip(
                dict(zip(ak.fields(arrays), ak.unzip(arrays))),
                depth_limit = 1
            )

        # If converting in chunks, add an index to the output file so the chunks don't overwrite each other.
        if events_per_chunk > 0:
            suffix = base_output_filename.suffix
            output_filename = (base_output_filename.parent / f"{base_output_filename.stem}_{i:02}").with_suffix(suffix)
        else:
            output_filename = base_output_filename

        # Optimize the output
        # Additional parquet options are based on https://stackoverflow.com/a/66854439/12907985
        # Byte-stream-split encoding apparently only works for float fields; other fields should be
        # handled by use_dictionary. Apparently this can't be configured automatically, so we have to
        # define the field lists ourselves. This is a bit brittle if fields change, but they don't
        # change often, and it's simpler than parsing field types, so it should be fine for now.
        byte_stream_fields = [field for field in ak.fields(arrays) if field in possible_fields_containing_floats]
        dict_fields = [field for field in ak.fields(arrays) if field not in possible_fields_containing_floats]
        # logger.debug(f"dict_fields: {dict_fields}")
        # logger.debug(f"byte_stream_fields: {byte_stream_fields}")

        # Parquet with zlib seems to do about the same as ascii tar.gz when we drop unneeded columns.
        # And it should load much faster!
        ak.to_parquet(
            arrays, output_filename,
            compression=compression, compression_level=compression_level,
            explode_records=False,
            # Additional parquet options are based on https://stackoverflow.com/a/66854439/12907985
            #use_dictionary=True,
            #use_byte_stream_split=True,
            use_dictionary=dict_fields,
            use_byte_stream_split=byte_stream_fields,
        )

        # Break now so we don't have to read the next chunk.
        if (i + 1) == max_chunks:
            break
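A minimal invocation sketch for the converter above; the paths and chunk size are placeholders, and read plus full_events_to_only_necessary_columns_E_px_py_pz are assumed to be defined elsewhere in the same module:

# Hypothetical call (all paths are placeholders):
# parse_to_parquet(
#     base_output_filename="converted/jetscape.parquet",
#     store_only_necessary_columns=True,
#     input_filename="raw/jetscape.out",
#     events_per_chunk=5000,
# )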
Code example #28
ak.to_parquet(
    ak.Array([
        {"x": [{"y": {"q": 1}, "z": 1.1}]},
        {"x": [{"y": {"q": 1}, "z": 1.1}, {"y": {"q": 2}, "z": 2.2}]},
        {"x": [
            {"y": {"q": 1}, "z": 1.1},
            {"y": {"q": 2}, "z": 2.2},
            {"y": {"q": 3}, "z": 3.3},
        ]},
    ]),
    "tmp.parquet",
)
Code example #29
def test_explode(tmp_path):
    array3 = ak.Array([
        [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
        [],
        [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}],
        [],
        [],
        [{"x": 6, "y": 6.6}, {"x": 7, "y": 7.7}, {"x": 8, "y": 8.8}, {"x": 9, "y": 9.9}],
    ])
    array4 = ak.repartition(array3, 2)

    ak.to_parquet(array3,
                  os.path.join(tmp_path, "array3.parquet"),
                  explode_records=True)
    ak.to_parquet(array4,
                  os.path.join(tmp_path, "array4.parquet"),
                  explode_records=True)

    exploded = [
        {"x": [1, 2, 3], "y": [1.1, 2.2, 3.3]},
        {"x": [], "y": []},
        {"x": [4, 5], "y": [4.4, 5.5]},
        {"x": [], "y": []},
        {"x": [], "y": []},
        {"x": [6, 7, 8, 9], "y": [6.6, 7.7, 8.8, 9.9]},
    ]
    assert ak.from_parquet(os.path.join(tmp_path, "array3.parquet")).tolist() == exploded
    assert ak.from_parquet(os.path.join(tmp_path, "array4.parquet")).tolist() == exploded
Code example #30
File: jet_shapes.py (project: HenryDayHall/jetTools)
# (excerpt: `file_name`, `spectral_jets`, `ew`, and the Components and
#  ShapeVariables modules are defined earlier in jet_shapes.py)
ew_shapes = Components.EventWise.from_file("../megaIgnore/IRC_shapes2.parquet")


for n in range(1, 5):
    spectral_shapes = [[[[] for _ in spectral_jets] for _ in ew_shapes.shape_names] for _ in ew_shapes.orders]

    for order in ["nlo", "lo"]:
        o_idx = list(ew_shapes.orders).index(order)
        name = file_name.format(n, order)
        kinematics = ak.from_parquet(name)
        print(name)
        
        print("Getting jet shapes")
        for j_idx, jname in enumerate(spectral_jets):
            print('.', end='', flush=True)
            ew.selected_event = None
            for event_n in range(len(kinematics[o_idx, 0, 0])):
                ew.selected_event = event_n
                event_kinematics = ak.to_numpy(kinematics[o_idx, 1:, j_idx, event_n, :])
                if np.any(np.isnan(event_kinematics)) or len(event_kinematics) != 4:
                    shapes = [np.nan for _ in ew_shapes.shape_names]
                else:
                    shape_dict = ShapeVariables.shape(*event_kinematics)[1]
                    shapes = [shape_dict[name] for name in ew_shapes.shape_names]
                for i, val in enumerate(shapes):
                    spectral_shapes[o_idx][i][j_idx].append(val)
        print("Saving")
        save_name = name.replace("kinematics", "shapes")
        # Convert without rebinding: rebinding `spectral_shapes` to an ak.Array
        # here would break the list `.append(...)` calls on the next `order` pass.
        ak.to_parquet(ak.from_iter(spectral_shapes), save_name)