Example No. 1
def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert b.min(split_every=2).compute(get=dask.get) == 1
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), get=dask.get)
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(get=dask.get)
Example No. 2
def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(b.min(split_every=2), 1)
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), scheduler="sync")
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(scheduler="sync")
Example No. 3
def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(b.min(split_every=2), 1)
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), scheduler='sync')
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(scheduler='sync')
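The snippets above are the same dask test at different points in its history: the get=dask.get keyword used in the older variant was later replaced by scheduler="sync". A self-contained sketch of the behaviour being tested, assuming a recent dask release, might look like this:

import dask.bag as db

# filter(None) keeps only truthy items, so just the single 1 survives
b = db.from_sequence([0, 0, 0, 1], npartitions=2).filter(None)
print(b.min(split_every=2).compute(scheduler="sync"))   # -> 1

# a bag that filters down to nothing has no minimum, so dask raises ValueError
empty = db.from_sequence([0, 0, 0, 0], npartitions=2).filter(None)
try:
    empty.min(split_every=2).compute(scheduler="sync")
except ValueError as err:
    print("empty reduction:", err)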
Example No. 4
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({'operations': [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert db.compute(
        bag.count(),
        bag.pluck('operations').count(),
        bag.pluck('operations').flatten().count(),
        get=dask.get,
    ) == (25, 25, 50)
Example No. 5
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({"operations": [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert (db.compute(
        bag.count(),
        bag.pluck("operations").count(),
        bag.pluck("operations").flatten().count(),
        scheduler="sync",
    ) == (25, 25, 50))
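A self-contained version of the same pattern, with the imports the snippets assume, hedged on a recent dask release. Each delayed call returns a generator that becomes one bag partition; pure=False presumably keeps the five identical calls as five distinct tasks:

import dask.bag as db
from dask.delayed import delayed

def lazy_records(n):
    # a generator of n small dict records
    return ({"operations": [1, 2]} for _ in range(n))

# pure=False: identical calls keep distinct task keys, one partition per call
delayed_records = delayed(lazy_records, pure=False)
bag = db.from_delayed([delayed_records(5) for _ in range(5)])

totals = db.compute(
    bag.count(),                                # 25 records in total
    bag.pluck("operations").count(),            # 25 "operations" lists
    bag.pluck("operations").flatten().count(),  # 50 flattened elements
    scheduler="sync",
)
print(totals)   # (25, 25, 50)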
Example No. 6
def get_nc_variable(expt, ncfile,
                    variable, chunks={}, n=None,
                    op=None, 
                    time_units="days since 1900-01-01",
                    use_bag = False):
    """
    For a given experiment, concatenate together
    variable over all time given a basename ncfile.

    Since some NetCDF4 files have trailing integers (e.g. ocean_123_456.nc)
    ncfile is actually a regular expression.

    By default, xarray is set to use the same chunking pattern that is
    stored in the ncfile. This can be overwritten by passing in a dictionary
    chunks or setting chunks=None for no chunking (load directly into memory).

    n > 0 means use only the last n ncfiles. Useful for testing.

    op() is a function to apply to each variable before concatenating.

    time_units (e.g. "days since 1600-01-01") can be used to override
    the original time.units.  If time_units=None, no overriding is performed.

    If variable is a list, then a dataset is returned for all given variables.
    """

    if '/' in expt:
        configuration, experiment = expt.split('/')
    else:
        experiment = expt

    if not isinstance(variable, list):
        variables = [variable]
        return_dataarray = True
    else:
        variables = variable
        return_dataarray = False

    db = dataset.connect(database_url)

    var_list = ",".join(['"{}"'.format(v) for v in variables])

    sql = " ".join(['SELECT DISTINCT ncfile, dimensions, chunking ',
                    'FROM ncfiles',
                    'WHERE experiment = "{}"'.format(experiment),
                    'AND basename_pattern = "{}"'.format(ncfile),
                    'AND variable in ({})'.format(var_list),
                    'ORDER BY ncfile'])

    logging.debug(sql)

    res = db.query(sql)
    rows = list(res)

    ncfiles = [row['ncfile'] for row in rows]
    
    #res.close()
    
    if len(ncfiles) == 0:
        raise ValueError("No variable {} found for {} in {}".format(variable, expt, ncfile))

    #print('Found {} ncfiles'.format(len(ncfiles)))

    dimensions = eval(rows[0]['dimensions'])
    chunking = eval(rows[0]['chunking'])

    #print ('chunking info', dimensions, chunking)
    if chunking is not None:
        default_chunks = dict(zip(dimensions, chunking))
    else:
        default_chunks = {}

    if chunks is not None:
        default_chunks.update(chunks)
        chunks = default_chunks

    if n is not None:
        #print('using last {} ncfiles only'.format(n))
        ncfiles = ncfiles[-n:]

    if op is None:
        op = lambda x: x


    #print ('Opening {} ncfiles...'.format(len(ncfiles)))
    logging.debug(f'Opening {len(ncfiles)} ncfiles...')

    if use_bag:
        bag = dask.bag.from_sequence(ncfiles)
        
        load_variable = lambda ncfile: xr.open_dataset(ncfile, 
                           chunks=chunks, 
                           decode_times=False)[variables]
        #bag = bag.map(load_variable, chunks, time_units, variables)
        bag = bag.map(load_variable)
        
        dataarrays = bag.compute()
    else:
        dataarrays = []
        for ncfile in tqdm.tqdm_notebook(ncfiles,
            desc='get_nc_variable:', leave=False):
            dataarray = xr.open_dataset(ncfile, chunks=chunks, decode_times=False)[variables]

            #dataarray = op(dataarray)

            dataarrays.append(dataarray)

    #print ('Building dataarray.')

    dataarray = xr.concat(dataarrays,
                          dim='time', coords='all', )

    
    if 'time' in dataarray.coords:
        if time_units is None:
            time_units = dataarray.time.units

        decoded_time = xr.conventions.decode_cf_datetime(dataarray.time, time_units)
        dataarray.coords['time'] = ('time', decoded_time,
                                    {'long_name' : 'time', 'decoded_using' : time_units }
                                   )

    #print ('Dataarray constructed.')

    if return_dataarray:
        return dataarray[variable]
    else:
        return dataarray
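A hypothetical call against this version (the experiment, file basename and variable names below are invented for illustration, and a matching ncfiles index database must already exist):

# a single variable comes back as a DataArray, using only the last 12 matching files
temp = get_nc_variable('configuration/my_experiment', 'ocean_month.nc',
                       'temp', n=12)

# a list of variables returns a Dataset; chunks here override the stored chunking
ds = get_nc_variable('configuration/my_experiment', 'ocean_month.nc',
                     ['temp', 'salt'], chunks={'time': 1})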
Example No. 7
def get_nc_variable(expt,
                    ncfile,
                    variable,
                    chunks={},
                    n=None,
                    op=None,
                    time_units="days since 1900-01-01",
                    offset=None,
                    use_bag=False,
                    use_cache=False):
    """
    For a given experiment, concatenate together
    variable over all time given a basename ncfile.
    If variable is a list, then return a dataset for all given variables.

    If expt is an experiment name, a central database is used.
    If expt is the absolute path of the experiment directory, a separate
    database in that directory will be used instead. This must be
    created beforehand (and updated) via build_index(expt_dir_list=expt).

    Since some NetCDF4 files have trailing integers (e.g. ocean_123_456.nc)
    ncfile can use glob syntax http://www.sqlitetutorial.net/sqlite-glob/
    and regular expressions also work in some limited cases.

    By default, xarray is set to use the same chunking pattern that is
    stored in the ncfile. This can be overwritten by passing in a dictionary
    chunks or setting chunks=None for no chunking (load directly into memory).

    n < 0 means use only the last |n| ncfiles.
    n > 0 means use only the first n ncfiles.

    op() is a function to apply to each variable before concatenating.
    TODO: implement this - currently does nothing.

    time_units (e.g. "days since 1600-01-01") can be used to override
    the original time.units.  If time_units=None, no overriding is performed.

    offset shifts the data by the specified number of days, to allow different
    experiments to be aligned in time. Use with care ...

    use_cache determines whether to return a cached result, which is faster,
    but is not kept up to date with the .nc files. The cache file is persistent
    across kernel restarts. It can be deleted to save space or force an update.
    Switching to use_cache=False will also delete the cache file if it exists.
    The default is use_cache=False.

    """

    if expt.endswith('/'):
        expt = expt[:-1]  # assumes only one trailing slash...
    experiment = os.path.basename(expt)

    if not isinstance(variable, list):
        variables = [variable]
        return_dataarray = True
    else:
        variables = variable
        return_dataarray = False

    if time_units is None:
        tunits = str(time_units)
    else:
        tunits = time_units.replace(' ', '-')


# BUG: cachefname doesn't include chunks or op
# TODO: use all args in filename, perhaps via args = locals()  ...?
    cachefname = 'cache_get_nc_variable_' + '_'.join([
        experiment, ncfile, '_'.join(variables),
        str(n), tunits,
        str(offset),
        str(use_bag)
    ]) + '.pkl'

    if use_cache and os.path.isfile(cachefname):
        print('Reading from cache file {}'.format(cachefname))
        with open(cachefname, 'rb') as cachefile:
            return pickle.load(cachefile)
    else:
        if os.path.isabs(expt):
            db_url = database_url_from_path(expt)
        else:
            db_url = database_url
        print('Using database {}'.format(db_url))
        db = dataset.connect(db_url)

        var_list = ",".join(['"{}"'.format(v) for v in variables])

        sql = " ".join([
            'SELECT DISTINCT ncfile, dimensions, chunking ', 'FROM ncfiles',
            f'WHERE experiment = "{experiment}"', 'AND (',
            f'basename_pattern = "{ncfile}"', f'OR basename GLOB "{ncfile}"',
            ')', f'AND variable in ({var_list})', 'ORDER BY ncfile'
        ])

        logging.debug(sql)

        res = db.query(sql)
        rows = list(res)

        ncfiles = [row['ncfile'] for row in rows]

        #res.close()

        if len(ncfiles) == 0:
            raise ValueError("No variable {} found for {} in {}".format(
                variable, expt, ncfile))

        #print('Found {} ncfiles'.format(len(ncfiles)))

        dimensions = eval(rows[0]['dimensions'])
        chunking = eval(rows[0]['chunking'])

        #print ('chunking info', dimensions, chunking)
        if chunking is not None:
            default_chunks = dict(zip(dimensions, chunking))
        else:
            default_chunks = {}

        if chunks is not None:
            default_chunks.update(chunks)
            chunks = default_chunks

        if n is not None:
            #print('using last {} ncfiles only'.format(n))
            if n < 0:
                ncfiles = ncfiles[n:]
            else:
                ncfiles = ncfiles[:n]

        if op is None:
            op = lambda x: x

        #print ('Opening {} ncfiles...'.format(len(ncfiles)))
        logging.debug(f'Opening {len(ncfiles)} ncfiles...')

        if use_bag:
            bag = dask.bag.from_sequence(ncfiles)

            load_variable = lambda ncfile: xr.open_dataset(
                ncfile, chunks=chunks, decode_times=False)[variables]
            #bag = bag.map(load_variable, chunks, time_units, variables)
            bag = bag.map(load_variable)

            dataarrays = bag.compute()
        else:
            dataarrays = []
            for ncfile in tqdm.tqdm_notebook(ncfiles,
                                             desc='get_nc_variable:',
                                             leave=False):
                dataarray = xr.open_dataset(ncfile,
                                            chunks=chunks,
                                            decode_times=False,
                                            autoclose=True)[variables]

                #dataarray = op(dataarray)

                dataarrays.append(dataarray)

        # print ('Building dataarray.')

        dataarray = xr.concat(
            dataarrays,
            dim='time',
            coords='all',
        )

        if 'time' in dataarray.coords:
            if time_units is None:
                time_units = dataarray.time.units
            if offset is not None:
                dataarray = rebase_dataset(dataarray,
                                           time_units,
                                           offset=offset)
            try:
                decoded_time = xr.conventions.times.decode_cf_datetime(
                    dataarray.time, time_units)
            except AttributeError:  # for compatibility with older xarray (pre-0.10.2 ?)
                decoded_time = xr.conventions.decode_cf_datetime(
                    dataarray.time, time_units)
            dataarray.coords['time'] = ('time', decoded_time, {
                'long_name': 'time',
                'decoded_using': time_units
            })

        if return_dataarray:
            out = dataarray[variable]
        else:
            out = dataarray
        if use_cache:
            print('Saving cache file {}'.format(cachefname))
            with open(cachefname, 'wb') as cachefile:
                pkl = pickle.dump(out, cachefile, protocol=-1)
        else:
            if os.path.exists(cachefname):
                print('Deleting cache file {}'.format(cachefname))
                os.remove(cachefname)
        return out
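A hypothetical use of the caching added in this version (directory and variable names invented; the per-experiment database must have been built beforehand with build_index):

# first call queries the database, opens the last 12 files and writes a
# cache_get_nc_variable_*.pkl file in the working directory
eta = get_nc_variable('/scratch/my_expt_dir', 'ocean_month.nc', 'eta_t',
                      n=-12, use_cache=True)

# an identical later call is answered from that pickle instead of the .nc files
eta = get_nc_variable('/scratch/my_expt_dir', 'ocean_month.nc', 'eta_t',
                      n=-12, use_cache=True)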
Example No. 8
def get_nc_variable(
    expt,
    ncfile,
    variable,
    chunks={},
    n=None,
    op=None,
    time_units="days since 1900-01-01",
    offset=None,
    use_bag=False,
    use_cache=False,
    **kwargs,
):
    """
    For a given experiment, concatenate together
    variable over all time given a basename ncfile.
    If variable is a list, then return a dataset for all given variables.

    If expt is an experiment name, a central database is used.
    If expt is the absolute path of the experiment directory, a separate
    database in that directory will be used instead. This must be
    created beforehand (and updated) via build_index(expt_dir_list=expt).

    Since some NetCDF4 files have trailing integers (e.g. ocean_123_456.nc)
    ncfile can use glob syntax http://www.sqlitetutorial.net/sqlite-glob/
    and regular expressions also work in some limited cases.

    By default, xarray is set to use the same chunking pattern that is
    stored in the ncfile. This can be overwritten by passing in a dictionary
    chunks or setting chunks=None for no chunking (load directly into memory).

    n < 0 means use only the last |n| ncfiles.
    n > 0 means use only the first n ncfiles.

    op() is a function to apply to each variable before concatenating.
    TODO: implement this - currently does nothing.

    time_units (e.g. "days since 1600-01-01") can be used to override
    the time units specified in the .nc files.
    Default is "days since 1900-01-01".
    If time_units=None, no overriding is performed.
    NB: The effect of time_units depends on whether offset=None.
    If offset=None, time_units alters the interpretation of numerical time data
    in terms of dates, i.e. dates are changed if time_units differs from that
    in the .nc files.
    If offset!=None, the time data is altered to use time_units, so time_units
    no longer alters the dates if time_units differs from that
    in the .nc files. In particular, offset=None (the default) is not
    equivalent to offset=0 unless time_units matches what is in the .nc files.

    offset shifts the data by the specified number of days, to allow different
    experiments to be aligned in time and/or to work within the 2^64 nanosecond
    pandas time range. Valid values are None, a number, or 'auto'.
    Use with care ...
    NB: offset=None (the default) is not equivalent to offset=0 since it alters
    the interpretation of time_units (see above).

    use_cache determines whether to return a cached result, which is faster,
    but is not kept up to date with the .nc files. The cache file is persistent
    across kernel restarts. It can be deleted to save space or force an update.
    Switching to use_cache=False will also delete the cache file if it exists.
    The default is use_cache=False.

    """

    if expt.endswith("/"):
        expt = expt[:-1]  # assumes only one trailing slash...
    experiment = os.path.basename(expt)

    if not isinstance(variable, list):
        variables = [variable]
        return_dataarray = True
    else:
        variables = variable
        return_dataarray = False

    if time_units is None:
        tunits = str(time_units)
    else:
        tunits = time_units.replace(" ", "-")
    # BUG: cachefname doesn't include chunks or op
    # TODO: use all args in filename, perhaps via args = locals()  ...?
    cachefname = ("cache_get_nc_variable_" + "_".join([
        experiment,
        ncfile,
        "_".join(variables),
        str(n),
        tunits,
        str(offset),
        str(use_bag),
    ]) + ".pkl")

    if use_cache and os.path.isfile(cachefname):
        print("Reading from cache file {}".format(cachefname))
        with open(cachefname, "rb") as cachefile:
            return pickle.load(cachefile)
    else:
        if os.path.isabs(expt):
            db_url = database_url_from_path(expt)
        else:
            db_url = database_url
        print("Using database {}".format(db_url))
        db = dataset.connect(db_url)

        var_list = ",".join(['"{}"'.format(v) for v in variables])

        sql = " ".join([
            "SELECT DISTINCT ncfile, dimensions, chunking ",
            "FROM ncfiles",
            f'WHERE experiment = "{experiment}"',
            "AND (",
            f'basename_pattern = "{ncfile}"',
            f'OR basename GLOB "{ncfile}"',
            ")",
            f"AND variable in ({var_list})",
            "ORDER BY ncfile",
        ])

        logging.debug(sql)

        res = db.query(sql)
        rows = list(res)

        ncfiles = [row["ncfile"] for row in rows]

        # res.close()

        if len(ncfiles) == 0:
            raise ValueError("No variable {} found for {} in {}".format(
                variable, expt, ncfile))

        # print('Found {} ncfiles'.format(len(ncfiles)))

        dimensions = eval(rows[0]["dimensions"])
        chunking = eval(rows[0]["chunking"])

        # print ('chunking info', dimensions, chunking)
        if chunking is not None:
            default_chunks = dict(zip(dimensions, chunking))
        else:
            default_chunks = {}

        if chunks is not None:
            default_chunks.update(chunks)
            chunks = default_chunks

        if n is not None:
            # print('using last {} ncfiles only'.format(n))
            if n < 0:
                ncfiles = ncfiles[n:]
            else:
                ncfiles = ncfiles[:n]

        if op is None:
            op = lambda x: x

        # print ('Opening {} ncfiles...'.format(len(ncfiles)))
        logging.debug(f"Opening {len(ncfiles)} ncfiles...")

        if use_bag:
            bag = dask.bag.from_sequence(ncfiles)

            load_variable = lambda ncfile: xr.open_dataset(
                ncfile, chunks=chunks, decode_times=False)[variables]
            # bag = bag.map(load_variable, chunks, time_units, variables)
            bag = bag.map(load_variable)

            dataarrays = bag.compute()
            dataarray = xr.concat(
                dataarrays,
                dim="time",
                coords="all",
            )
            dataarray = decode_time(dataarray, time_units, offset)
        else:
            dataarray = xr.open_mfdataset(
                ncfiles,
                parallel=True,
                chunks=chunks,
                autoclose=True,
                decode_times=False,
                preprocess=lambda d: decode_time(d[variables], time_units,
                                                 offset),
                **kwargs,
            )

        if return_dataarray:
            out = dataarray[variable]
        else:
            out = dataarray
        if use_cache:
            print("Saving cache file {}".format(cachefname))
            with open(cachefname, "wb") as cachefile:
                pkl = pickle.dump(out, cachefile, protocol=-1)
        else:
            if os.path.exists(cachefname):
                print("Deleting cache file {}".format(cachefname))
                os.remove(cachefname)
        return out
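A hypothetical call to this final version (names invented): offset shifts the data by a number of days so that runs can be aligned, and any extra keyword arguments are passed straight through to xr.open_mfdataset:

u = get_nc_variable(
    "my_experiment",
    "ocean_month.nc",
    "u",
    offset=365,          # shift the data forward by 365 days
    use_cache=False,
    engine="netcdf4",    # an example **kwargs passthrough to xr.open_mfdataset
)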