Code Example #1
File: count_records.py  Project: rmax/dask-avro
import sys

import dask.bag
from dask.diagnostics import progress
# read_avro is provided by the dask-avro package this example belongs to.
from dask_avro import read_avro


def main():
    if not sys.argv[1:]:
        sys.stderr.write("Usage: python %s <avro file>\n" % sys.argv[0])
        sys.exit(1)

    data = dask.bag.from_delayed(read_avro(sys.argv[1], blocksize=1024 * 1024))
    print("Partitions: %s" % data.npartitions)

    task = data.count()
    with progress.ProgressBar():
        count = task.compute()

    print("Result: %s" % count)
Code Example #2
import sys

import dask.bag
from dask.diagnostics import progress
# read_elasticsearch is assumed to come from the companion dask-elasticsearch
# package, by analogy with the dask-avro example above.
from dask_elasticsearch import read_elasticsearch


def main():
    if not sys.argv[1:]:
        sys.stderr.write("Usage: python %s <query>\n" % sys.argv[0])
        sys.exit(1)

    # Note: the "_all" meta-field is deprecated since Elasticsearch 6.0, so
    # this match query only works against older clusters or indices.
    search_body = {"query": {"match": {"_all": sys.argv[1]}}}
    data = dask.bag.from_delayed(read_elasticsearch(search_body,
                                                    npartitions=8))
    print("Partitions: %s" % data.npartitions)

    task = data.count()
    with progress.ProgressBar():
        count = task.compute()

    print("Result: %s" % count)
Code Example #3
# This is a very basic example of using chunks to make computation easier.
# Because of the dataset's large dimensions, you would exceed 16 GB of RAM if
# you didn't chunk it; with chunks, this runs in 23.1 s while barely using
# any memory.
# More info here: example of using chunking for low-memory situations

import xarray as xr
import dask
from dask.diagnostics import progress
fpath = '/home/jqiu21/Documents/Internship/'

ds = xr.open_dataset(fpath + 'Tair-1979-JFM00.nc', chunks={'time': 1})

print(ds)

delayed_obj = ds.rename({
    't': 'tair'
}).to_netcdf(fpath + 'Tair-1979-JFM00_redim.nc',
             mode='w',
             compute=False,
             engine='netcdf4',
             format='NETCDF4')
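# With compute=False, to_netcdf() returns a dask delayed object; nothing is
# written to disk until .compute() is called below.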
with progress.ProgressBar():
    delayed_obj.compute()
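To confirm the chunking actually took effect before kicking off the write, you can inspect the dask-backed dataset; a small sketch against the ds opened above (the variable name 't' is the one renamed to 'tair' in the code):

# ds.chunks maps each dimension to its per-chunk sizes; with chunks={'time': 1}
# the 'time' entry is a tuple of ones.
print(ds.chunks)

# The variable's data is a lazy dask array, not an in-memory numpy array.
print(type(ds['t'].data))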
Code Example #4
from collections import defaultdict

import dask
import intake
import seaborn as sns
import xarray as xr
from dask.diagnostics import progress


# Note: open_delayed() and global_mean() are helper functions defined
# elsewhere in the original project; they are not library functions.
def plot_verlauf_von(var, expts, start, end, table_id='Amon'):
    query = dict(
        experiment_id=expts,
        table_id=table_id,
        variable_id=[var],
        member_id='r1i1p1f1',
    )
    col = intake.open_esm_datastore(
        "https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
    col_subset = col.search(require_all_on=["source_id"], **query)

    dsets = defaultdict(dict)

    for group, df in col_subset.df.groupby(by=['source_id', 'experiment_id']):
        dsets[group[0]][group[1]] = open_delayed(df)

    dsets_ = dask.compute(dict(dsets))[0]
    expt_da = xr.DataArray(expts,
                           dims='experiment_id',
                           name='experiment_id',
                           coords={'experiment_id': expts})

    dsets_aligned = {}

    print('Searching and sorting the requested data.')

    for k, v in dsets_.items():
        expt_dsets = v.values()
        if any([d is None for d in expt_dsets]):
            print(f"Missing experiment for {k}")
            continue

        for ds in expt_dsets:
            ds.coords['year'] = ds.time.dt.year

        # workaround for
        # https://github.com/pydata/xarray/issues/2237#issuecomment-620961663
        dsets_ann_mean = [
            v[expt].pipe(global_mean).swap_dims({
                'time': 'year'
            }).drop('time').coarsen(year=12).mean() for expt in expts
        ]

        # align everything with the 4xCO2 experiment
        dsets_aligned[k] = xr.concat(dsets_ann_mean, join='outer', dim=expt_da)

    print('Computing the global mean of each individual model.')
    print('This may take a few minutes.')
    with progress.ProgressBar():
        dsets_aligned_ = dask.compute(dsets_aligned)[0]

    source_ids = list(dsets_aligned_.keys())
    source_da = xr.DataArray(source_ids,
                             dims='source_id',
                             name='source_id',
                             coords={'source_id': source_ids})

    big_ds = xr.concat(
        [ds.reset_coords(drop=True) for ds in dsets_aligned_.values()],
        dim=source_da)

    df_all = big_ds.sel(year=slice(start, end)).to_dataframe().reset_index()
    print('Creating the plot.')
    sns.relplot(data=df_all,
                x="year",
                y=var,
                hue='experiment_id',
                kind="line",
                ci="sd",
                aspect=2)
    return ()
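A hypothetical invocation of the function above: plotting the global mean of near-surface air temperature ('tas') for two common CMIP6 experiments. Which models show up depends on what the Pangeo catalog provides at call time.

plot_verlauf_von('tas', ['historical', 'ssp585'], start=1850, end=2100)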