def timing():
    # Dev toggle: switch mirrors by commenting one of these out.
    # baseurl = CML_BASEURL_S3
    baseurl = CML_BASEURL_CDS
    index = PerUrlIndex(
        f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb",
    )

    # Dev toggle: either the named range methods or a sweep of block sizes.
    # sizes = ["sharp(1,1)", "auto", "cluster"]
    sizes = []
    for r in range(11, 24):  # from 2 KiB to 8 MiB
        sizes.append(f"blocked({2 ** r})")

    report = {}
    for request in [
        dict(param="r"),
        dict(param="r", time="1000"),
        dict(date="19970101"),
        dict(param="r", time="1000", date="19970101"),
    ]:
        times = []
        for n in sizes:
            try:
                elapsed = retrieve_and_check(index, request, range_method=n, force=True)
            except Exception as e:
                print(e)
                # Keep the tuple shape so the sorted() call below stays valid.
                times.append((-1, n))
                continue
            # Map the named methods to numeric codes so they sort predictably.
            if n is None:
                n = 0
            if n == "auto":
                n = -1
            if n == "cluster":
                n = 1
            if n == "sharp(1,1)":
                n = -2
            times.append((round(elapsed * 10) / 10.0, n))
        report[tuple(request.items())] = request, sorted(times)

    for k, v in report.items():
        print(k)
        print(v)
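
# Hedged helper sketch (not used by timing() above, names are hypothetical):
# given the report structure timing() builds, where each value is
# (request, sorted list of (elapsed_seconds, method)) and an elapsed of -1
# marks a failed retrieval, pick the fastest range_method per request.
def _best_method_sketch(report):
    best = {}
    for key, (request, times) in report.items():
        ok = [t for t in times if t[0] >= 0]  # drop failed retrievals
        if ok:
            best[key] = ok[0]  # list is sorted, so the first entry is fastest
    return best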

def test_grib_index_eumetnet():
    from climetlab import load_source
    from climetlab.indexing import PerUrlIndex

    request = {
        "param": "2ti",
        "date": "20171228",
        "step": ["0-24", "24-48", "48-72", "72-96", "96-120", "120-144", "144-168"],
        # Parameters passed to the filename mangling
        "url": "https://storage.ecmwf.europeanweather.cloud/eumetnet-postprocessing-benchmark-training-dataset/",
        "month": "12",
        "year": "2017",
    }
    PATTERN = (
        "{url}data/fcs/efi/"
        "EU_forecast_efi_params_{year}-{month}_0.grb"
    )
    ds = load_source("indexed-urls", PerUrlIndex(PATTERN), request)
    xds = ds.to_xarray()
    print(xds)
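
# Illustration only: the {url}/{year}/{month} placeholders in PATTERN above are
# filled from the request before downloading. A plain str.format() call shows
# the resulting URL; climetlab's actual pattern engine may treat lists and
# unused request keys differently.
def _pattern_expansion_example():
    pattern = (
        "{url}data/fcs/efi/"
        "EU_forecast_efi_params_{year}-{month}_0.grb"
    )
    return pattern.format(
        url="https://storage.ecmwf.europeanweather.cloud/eumetnet-postprocessing-benchmark-training-dataset/",
        year="2017",
        month="12",
    )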

def test_per_url_index(baseurl):
    index = PerUrlIndex(
        f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb",
    )
    request = dict(param="r", time="1000", date="19970101")
    retrieve_and_check(index, request)
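
# Minimal sketch of what the retrieve_and_check() helper used throughout these
# tests is assumed to do (the real one is defined elsewhere in this file):
# load the indexed URLs with the given range_method, check that data came back,
# and return the elapsed time that timing() consumes. The range_method and
# force keywords of load_source are assumptions inferred from the calls above.
def _retrieve_and_check_sketch(index, request, range_method=None, force=False):
    import time

    from climetlab import load_source

    start = time.time()
    ds = load_source(
        "indexed-urls",
        index,
        request,
        range_method=range_method,
        force=force,
    )
    assert len(ds) > 0, (index, request)
    return time.time() - start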

def test_per_url_index_2():
    baseurl = CML_BASEURL_S3
    index = PerUrlIndex(
        f"{baseurl}/test-data/big.grib",
    )
    request = dict(param="cin", date="20211125", step="6", number=["1", "3"])
    retrieve_and_check(index, request)

def benchmark():
    collect_statistics(True)

    baseurls = [
        CML_BASEURL_S3,
        CML_BASEURL_CDS,
        # CML_BASEURL_GET,
    ]
    requests = [
        {"param": "r", "time": "1000", "step": "0"},
        {"param": "r", "time": "1000"},
        {"param": "r", "time": ["1100", "1200", "1300", "1400"]},
        {
            "param": ["r", "z"],
            "time": ["0200", "1000", "1800", "2300"],
            "levelist": ["500", "850"],
        },
        {"param": ["r", "z"], "levelist": ["500", "850"]},
        {"param": "r"},
        # {"param": ["r", "z"]},
        {"param": ["r", "z", "t"]},
        # {},
    ]
    methods = get_methods_list()

    # Dev toggles to run a subset of the matrix:
    # requests = [requests[2]]
    # methods = [methods[0]]
    # baseurls = [baseurls[0]]
    # requests = requests[::2]
    # methods = methods[::2]

    failed = []
    successful = 0

    import tqdm

    for request in tqdm.tqdm(requests):
        for range_method in tqdm.tqdm(methods):
            for baseurl in baseurls:
                index = PerUrlIndex(
                    f"{baseurl}/test-data/input/indexed-urls/large_grib_1.grb",
                )
                try:
                    retrieve_and_check(
                        index,
                        request,
                        range_method,
                        force=True,
                    )
                    successful += 1
                except Exception as e:
                    failed.append((index, request, range_method))
                    print("FAILED for ", index, request, range_method)
                    print(e)

    stats = retrieve_statistics()

    run_id = get_run_id()
    logfiles = []

    path = f"climetlab_benchmark{run_id}.json"
    logfiles.append(path)
    stats.write_to_json(path)
    print(f"BENCHMARK FINISHED. Raw logs saved in {path}")

    df = stats.to_pandas()
    df["server"] = df["url"].apply(url_to_server)
    df["speed"] = df["total"] / df["elapsed"] / (1024 * 1024)  # MB/s
    df["method"] = df["full_method"].apply(radix)
    # rename() must target columns explicitly, otherwise it renames the index.
    df = df.rename(
        columns=dict(
            size_parts="size_requested",
            size_blocks="size_downloaded",
        )
    )
    df["size_ratio"] = df["size_downloaded"] / df["size_requested"]

    path = f"climetlab_benchmark{run_id}.csv"
    df.to_csv(path)
    logfiles.append(path)

    print(f"Benchmark finished ({successful} successful, {len(failed)} failed).")
    print(
        "All data in the log files are anonymous. "
        "Only the log file names contain personal data (machine name, IP, etc.)."
    )
    for f in logfiles:
        print(f"Log file: {f}")
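
# Hedged usage sketch: timing() and benchmark() have no test_ prefix, so the
# test runner does not collect them; they are meant to be run by hand. The
# command-line handling below is an assumption, not part of the original file.
if __name__ == "__main__":
    import sys

    if "--benchmark" in sys.argv:
        benchmark()
    else:
        timing()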