Example #1
def timing():
    # Pick the mirror to benchmark against (S3 kept as an alternative):
    # baseurl = CML_BASEURL_S3
    baseurl = CML_BASEURL_CDS
    index = PerUrlIndex(
        f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb", )

    # Non-blocked range methods, kept for reference but not benchmarked here:
    # sizes = ["sharp(1,1)", "auto", "cluster"]
    sizes = []
    for r in range(11, 24):  # block sizes from 2 KiB (2**11) to 8 MiB (2**23)
        sizes.append(f"blocked({2 ** r})")

    report = {}
    for request in [
            dict(param="r"),
            dict(param="r", time="1000"),
            dict(date="19970101"),
            dict(param="r", time="1000", date="19970101"),
    ]:
        times = []
        for n in sizes:
            try:
                elapsed = retrieve_and_check(index,
                                             request,
                                             range_method=n,
                                             force=True)
            except Exception as e:
                print(e)
                # Record failures as a tuple too: mixing plain ints with the
                # (elapsed, n) tuples below would make sorted(times) raise a
                # TypeError in Python 3.
                times.append((-1.0, n))
                continue
            # Map non-blocked method labels to numeric sentinels so the
            # (elapsed, n) tuples stay comparable when sorted.
            if n is None:
                n = 0
            if n == "auto":
                n = -1
            if n == "cluster":
                n = 1
            if n == "sharp":
                n = -2
            times.append((round(elapsed * 10) / 10.0, n))

        report[tuple(request.items())] = request, sorted(times)

    for k, v in report.items():
        print(k)
        print(v)
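
Note: retrieve_and_check is a helper defined elsewhere in the test suite and not shown in these examples. The sketch below is a hypothetical stand-in, assuming load_source forwards a range_method option to the indexed-urls source (as the calls above suggest); it is an illustration, not the real helper.

import time

from climetlab import load_source
from climetlab.indexing import PerUrlIndex


def retrieve_and_time(index, request, range_method=None):
    # Hypothetical helper: time one retrieval through the indexed-urls
    # source. Assumes load_source accepts range_method for this source.
    start = time.time()
    ds = load_source("indexed-urls", index, request, range_method=range_method)
    assert len(ds) > 0  # basic sanity check: at least one field retrieved
    return time.time() - start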
Example #2
def test_grib_index_eumetnet():
    from climetlab import load_source
    from climetlab.indexing import PerUrlIndex

    request = {
        "param": "2ti",
        "date": "20171228",
        "step":
        ["0-24", "24-48", "48-72", "72-96", "96-120", "120-144", "144-168"],
        # Parameters passed to the filename mangling
        "url":
        "https://storage.ecmwf.europeanweather.cloud/eumetnet-postprocessing-benchmark-training-dataset/",
        "month": "12",
        "year": "2017",
    }
    PATTERN = "{url}data/fcs/efi/" "EU_forecast_efi_params_{year}-{month}_0.grb"
    ds = load_source("indexed-urls", PerUrlIndex(PATTERN), request)
    xds = ds.to_xarray()
    print(xds)
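
Note: PerUrlIndex presumably expands the pattern with the matching request keys ("url", "year", "month"), while the remaining keys ("param", "date", "step") select fields inside the indexed file. A plain str.format illustration of the single URL this request would target:

PATTERN = "{url}data/fcs/efi/EU_forecast_efi_params_{year}-{month}_0.grb"
BASE = (
    "https://storage.ecmwf.europeanweather.cloud/"
    "eumetnet-postprocessing-benchmark-training-dataset/"
)
# Prints the URL the index would build for year=2017, month=12:
print(PATTERN.format(url=BASE, year="2017", month="12"))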
Example #3
def test_per_url_index(baseurl):
    index = PerUrlIndex(
        f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb", )
    request = dict(param="r", time="1000", date="19970101")
    retrieve_and_check(index, request)
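
Note: test_per_url_index takes baseurl as an argument, so in the real suite it is presumably parametrized over the available mirrors. A hypothetical pytest parametrization along those lines (the actual fixture or decorator may differ):

import pytest

# CML_BASEURL_S3 and CML_BASEURL_CDS are the module constants used above.
@pytest.mark.parametrize("baseurl", [CML_BASEURL_S3, CML_BASEURL_CDS])
def test_per_url_index(baseurl):
    ...  # body as in Example #3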
Example #4
def test_per_url_index_2():
    baseurl = CML_BASEURL_S3
    index = PerUrlIndex(f"{baseurl}/test-data/big.grib", )
    request = dict(param="cin", date="20211125", step="6", number=["1", "3"])
    retrieve_and_check(index, request)
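
Note: list-valued request keys such as number=["1", "3"] presumably expand to one GRIB field per value. A hypothetical follow-up check inside such a test, assuming the source supports len() and to_xarray() as in Example #2:

from climetlab import load_source

ds = load_source("indexed-urls", index, request)  # index/request as above
assert len(ds) == 2  # assumption: one field per requested ensemble number
print(ds.to_xarray())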
Example #5
def benchmark():
    collect_statistics(True)

    baseurls = [
        CML_BASEURL_S3,
        CML_BASEURL_CDS,
        # CML_BASEURL_GET,
    ]

    requests = [
        {"param": "r", "time": "1000", "step": "0"},
        {"param": "r", "time": "1000"},
        {"param": "r", "time": ["1100", "1200", "1300", "1400"]},
        {
            "param": ["r", "z"],
            "time": ["0200", "1000", "1800", "2300"],
            "levelist": ["500", "850"],
        },
        {"param": ["r", "z"], "levelist": ["500", "850"]},
        {"param": "r"},
        # {"param": ["r", "z"]},
        {"param": ["r", "z", "t"]},
        # {},
    ]

    methods = get_methods_list()

    # Uncomment to run a reduced benchmark:
    # requests = [requests[2]]
    # methods = [methods[0]]
    # requests = requests[::2]
    # methods = methods[::2]
    # baseurls = [baseurls[0]]
    failed = []
    successful = 0
    import tqdm

    for request in tqdm.tqdm(requests):
        for range_method in tqdm.tqdm(methods):
            for baseurl in baseurls:
                index = PerUrlIndex(
                    f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb",
                )
                try:
                    retrieve_and_check(
                        index,
                        request,
                        range_method,
                        force=True,
                    )
                    successful += 1
                except Exception as e:
                    failed.append((index, request, range_method))
                    print("FAILED for ", index, request, range_method)
                    print(e)

    stats = retrieve_statistics()

    run_id = get_run_id()

    logfiles = []

    path = f"climetlab_benchmark{run_id}.json"
    logfiles.append(path)
    stats.write_to_json(path)
    print(f"BENCHMARK FINISHED. Raw logs saved in {path}")

    df = stats.to_pandas()

    df["server"] = df["url"].apply(url_to_server)
    df["speed"] = df["total"] / df["elapsed"] / (1024 * 1024)  # MB/s
    df["method"] = df["full_method"].apply(radix)

    df = df.rename(
        columns=dict(
            size_parts="size_requested",
            size_blocks="size_downloaded",
        )
    )
    df["size_ratio"] = df["size_downloaded"] / df["size_requested"]

    path = f"climetlab_benchmark{run_id}.csv"
    df.to_csv(path)
    # df.to_csv("climetlab_benchmark.csv")
    logfiles.append(path)

    print(f"Benchmark finished ({successfull} successfull, {len(failed)} failed).")
    print(
        "All data in the log files are anonymous."
        "Only the log file names contain personal data (machine name, IP, etc.)."
    )
    for f in logfiles:
        print(f"Log file: {f}")