def test_no_worker_to_memory_restrictions(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)

    yy, zz = c.persist([y, z], workers={(x, y, z): 'alice'})

    while not s.tasks:
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1, name='alice')
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    while len(s.workers) < 3:
        yield gen.sleep(0.01)
    yield gen.sleep(0.3)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
Example #2
def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y

    x2, y2, z2, constant = optimize(x, y, z, 1)
    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.config.set(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)
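# The optimization pass used above (`inc_to_dec`) is not shown in this listing.
# A minimal sketch of such a graph-rewriting pass, assuming `inc` and `dec`
# helpers like those in dask's test utilities, could look like this:
def inc_to_dec(dsk, keys, **kwargs):
    """Rewrite every `inc` task in the graph into the equivalent `dec` task."""
    dsk = dict(dsk)
    for key, task in dsk.items():
        if type(task) is tuple and task and task[0] is inc:
            dsk[key] = (dec,) + task[1:]
    return dsk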
def test_no_workers_to_memory(c, s):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)

    yy, zz = c.persist([y, z])

    while not s.tasks:
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1)
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    start = time()

    while not s.workers:
        yield gen.sleep(0.01)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
def test_worker_arrives_with_processing_data(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)

    yy, zz = c.persist([y, z])

    while not any(w.processing for w in s.workers.values()):
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1)
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    start = time()

    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
Example #5
def counts_by_origin():
    frames = []
    # For each file
    for f in sorted(glob.glob('data/*.csv')):
        # Load the dataframe
        df = delayed(pd.read_csv)(f,
                                  parse_dates={'Date': [0, 1, 2]},
                                  infer_datetime_format=True)

        # Store in list of frames
        frames.append(df)

    # concatenate all the frames together
    df = delayed(pd.concat)(frames)

    # Resample by month
    by_month = (df.resample('MS', on='Date')
                  .Origin.value_counts()
                  .unstack())

    # Resample by year
    by_year = (df.resample('AS', on='Date')
                 .Origin.value_counts()
                 .unstack())

    return by_month, by_year
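# Hedged usage sketch for the lazy pipeline above: both return values are Delayed
# objects, so pd.concat and the resamples only run inside dask.compute.
import dask

by_month, by_year = counts_by_origin()
monthly_counts, yearly_counts = dask.compute(by_month, by_year)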
Example #6
def test_publish_multiple_datasets(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(2)

    yield c.publish_dataset(x=x, y=y)
    datasets = yield c.scheduler.publish_list()
    assert set(datasets) == {'x', 'y'}
def normalize(block):
    # Wrap the block's extrema as delayed values; decr(a, b) is assumed to return a - b
    old_min = delayed(block.min())
    old_max = delayed(block.max())
    r = delayed(decr)(old_max, old_min)   # the value range, max - min
    minimum = old_min.compute()
    # Shift by the minimum, then rescale by the range
    t0 = decr(block, minimum)
    return t0 / r.compute(), -minimum / r.compute()
Example #8
def test_stress_scatter_death(c, s, *workers):
    import random
    s.allowed_failures = 1000
    np = pytest.importorskip('numpy')
    L = yield c.scatter([np.random.random(10000) for i in range(len(workers))])
    yield c._replicate(L, n=2)

    adds = [delayed(slowadd, pure=True)(random.choice(L),
                                        random.choice(L),
                                        delay=0.05,
                                        dask_key_name='slowadd-1-%d' % i)
            for i in range(50)]

    adds = [delayed(slowadd, pure=True)(a, b, delay=0.02,
                                        dask_key_name='slowadd-2-%d' % i)
            for i, (a, b) in enumerate(sliding_window(2, adds))]

    futures = c.compute(adds)
    L = adds = None

    alive = list(workers)

    from distributed.scheduler import logger

    for i in range(7):
        yield gen.sleep(0.1)
        try:
            s.validate_state()
        except Exception as exc:   # don't shadow the client fixture `c`
            logger.exception(exc)
            if config.get('log-on-err'):
                import pdb
                pdb.set_trace()
            else:
                raise
        w = random.choice(alive)
        yield w._close()
        alive.remove(w)

    try:
        yield gen.with_timeout(timedelta(seconds=25), c._gather(futures))
    except gen.TimeoutError:
        ws = {w.address: w for w in workers if w.status != 'closed'}
        print(s.processing)
        print(ws)
        print(futures)
        try:
            worker = [w for w in ws.values() if w.waiting_for_data][0]
        except Exception:
            pass
        if config.get('log-on-err'):
            import pdb
            pdb.set_trace()
        else:
            raise
    except CancelledError:
        pass
    finally:
        futures = None
def test_expand_persist(c, s, a, b):
    low = delayed(inc)(1, dask_key_name='low')
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]
    high = delayed(inc)(2, dask_key_name='high')

    low, high, x, y, z, w = persist(low, high, *many, priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
def test_expand_compute(c, s, a, b):
    low = delayed(inc)(1)
    many = [delayed(slowinc)(i, delay=0.1) for i in range(10)]
    high = delayed(inc)(2)

    low, many, high = c.compute([low, many, high], priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
Example #11
def test_compute(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)

    yy = c.compute(y, resources={x: {'A': 1}, y: {'B': 1}})
    yield _wait(yy)

    assert b.data
    def block_candset_excluding_rule(self, c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable, rule_to_exclude,
                                     show_progress, n_chunks):


        # list to keep track of valid ids
        valid = []

        apply_rules_excluding_rule_pkl = cp.dumps(self.apply_rules_excluding_rule)

        if n_chunks == 1:
            # single process
            valid = _block_candset_excluding_rule_split(c_df, l_df, r_df,
                                                        l_key, r_key,
                                                        fk_ltable, fk_rtable,
                                                        rule_to_exclude,
                                                        apply_rules_excluding_rule_pkl,
                                                        show_progress)
        else:
            # multiprocessing
            c_splits = pd.np.array_split(c_df, n_chunks)

            valid_splits = []

            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_excluding_rule_split)(c_splits[i],
                                                             l_df, r_df,
                                                             l_key, r_key,
                                                             fk_ltable,
                                                             fk_rtable,
                                                             rule_to_exclude,
                                                             apply_rules_excluding_rule_pkl, False)
                                                             # ProgressBar from dask.diagnostics is used
                                                             # instead, so show_progress is set to False

                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output candset
        if len(c_df) > 0:
            candset = c_df[valid]
        else:
            candset = pd.DataFrame(columns=c_df.columns)

        # return candidate set
        return candset
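# Standalone sketch of the chunk-and-delay pattern used in the multiprocessing
# branch above: split the candidate set into row chunks, wrap the per-chunk work
# in delayed(), and evaluate everything with the "processes" scheduler.
# `process_chunk` and the four-way split are illustrative assumptions, not part
# of the original code.
import dask
import numpy as np
import pandas as pd
from dask import delayed


def process_chunk(chunk):
    # stand-in for _block_candset_excluding_rule_split: returns one flag per row
    return [True] * len(chunk)


c_df = pd.DataFrame({'id': range(100)})
chunks = np.array_split(np.arange(len(c_df)), 4)
parts = [delayed(process_chunk)(c_df.iloc[idx]) for idx in chunks]
valid = sum(dask.compute(*parts, scheduler="processes"), [])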
Example #13
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.set_options(get=dask.get):
        x = delayed(inc)(1).persist()
    yield gen.sleep(0.01)
    assert not s.task_state # scheduler hasn't done anything

    y = delayed(inc)(2).persist(get=dask.get)
    yield gen.sleep(0.01)
    assert not s.task_state # scheduler hasn't done anything
Example #14
def normaltest(a, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")

    s, _ = skewtest(a, axis)
    k, _ = kurtosistest(a, axis)
    k2 = s * s + k * k
    return delayed(NormaltestResult, nout=2)(k2, delayed(distributions.chi2.sf)(k2, 2))
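# Hedged usage sketch: assumes `normaltest` above is the dask.array.stats variant,
# so `a` is a dask array and `skewtest`/`kurtosistest` come from that same module.
import dask
import dask.array as da

x = da.random.random(1000, chunks=100)
res = normaltest(x)                      # lazy; nothing is computed yet
statistic, pvalue = dask.compute(res)[0]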
Example #15
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.config.set(scheduler='sync'):
        x = delayed(inc)(1).persist()
    yield gen.sleep(0.01)
    assert not s.tasks # scheduler hasn't done anything

    y = delayed(inc)(2).persist(scheduler='sync')
    yield gen.sleep(0.01)
    assert not s.tasks # scheduler hasn't done anything
Example #16
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)

    xx, yy = c.persist([x, y], resources={x: {'A': 1}, y: {'B': 1}})

    yield _wait([xx, yy])

    assert x.key in a.data
    assert y.key in b.data
Example #17
def test_object_in_graph(c, s, a, b):
    o = MyObj(123)
    v = delayed(o)
    v2 = delayed(identity)(v)

    future = c.compute(v2)
    result = yield future._result()

    assert isinstance(result, MyObj)
    assert result.data == 123
Example #18
def featurize_ts_files(ts_paths, features_to_use, output_path=None,
                       custom_script_path=None, custom_functions=None,
                       scheduler=dask.multiprocessing.get):
    """Feature generation function for on-disk time series (NetCDF) files.

    By default, computes features concurrently using the
    `dask.multiprocessing.get` scheduler. Other possible options include
    `dask.async.get_sync` for synchronous computation (e.g., when debugging),
    or `dask.distributed.Executor.get` for distributed computation.

    In the case of multichannel measurements, each channel will be
    featurized separately, and the data variables of the output
    `xarray.Dataset` will be indexed by a `channel` coordinate.

    Parameters
    ----------
    ts_paths : list of str
        List of paths to time series data, stored in NetCDF format. See
        `time_series.from_netcdf` for details.
    features_to_use : list of str, optional
        List of feature names to be generated. Defaults to an empty list, which
        will result in only meta_features features being stored.
    custom_script_path : str, optional
        Path to Python script containing function definitions for the
        generation of any custom features. Defaults to None.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations.  Dictionaries of functions should have keys `feature_name`
        and values functions that take arguments (t, m, e); in the case of a
        dask graph, these arrays should be referenced as 't', 'm', 'e',
        respectively, and any values with keys present in `features_to_use`
        will be computed.
    scheduler : function, optional
        `dask` scheduler function used to perform feature extraction
        computation. Defaults to `dask.multiprocessing.get`.

    Returns
    -------
    xarray.Dataset
        Featureset with `data_vars` containing feature values and `coords`
        containing labels (`name`) and targets (`target`), if applicable.
    """
    all_time_series = [delayed(time_series.from_netcdf, pure=True)(ts_path)
                       for ts_path in ts_paths]
    all_features = [delayed(featurize_single_ts, pure=True)(ts, features_to_use,
                                                            custom_script_path,
                                                            custom_functions)
                    for ts in all_time_series]
    result = delayed(assemble_featureset, pure=True)(all_features, all_time_series)
    fset = result.compute(get=scheduler)
    if output_path:
        fset.to_netcdf(output_path)

    return fset
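# Hedged usage sketch for the API documented above; the file names and feature
# names are placeholders, not values taken from the original text.
fset = featurize_ts_files(['ts1.nc', 'ts2.nc'],
                          features_to_use=['amplitude', 'maximum', 'minimum'],
                          output_path='features.nc')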
Example #19
def test_persist_tuple(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)

    xx, yy = c.persist([x, y], resources={(x, y): {'A': 1}})

    yield _wait([xx, yy])

    assert x.key in a.data
    assert y.key in a.data
    assert not b.data
Example #20
def test_persist_delayed():
    x1 = delayed(1)
    x2 = delayed(inc)(x1)
    x3 = delayed(inc)(x2)

    xx, = persist(x3)
    assert isinstance(xx, Delayed)
    assert xx.key == x3.key
    assert len(xx.dask) == 1

    assert x3.compute() == xx.compute()
Example #21
def test_clean_nbytes(c, s, a, b):
    L = [delayed(inc)(i) for i in range(10)]
    for i in range(5):
        L = [delayed(add)(x, y) for x, y in sliding_window(2, L)]
    total = delayed(sum)(L)

    future = c.compute(total)
    yield wait(future)

    yield gen.sleep(1)
    assert len(a.nbytes) + len(b.nbytes) == 1
Example #22
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield _wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield _wait(y2)
    assert y2.key in a.data or y2.key in b.data
def test_cancel_fire_and_forget(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.05)
    y = delayed(slowinc)(x, delay=0.05)
    z = delayed(slowinc)(y, delay=0.05)
    w = delayed(slowinc)(z, delay=0.05)
    future = c.compute(w)
    fire_and_forget(future)

    yield gen.sleep(0.05)
    yield future.cancel(force=True)
    assert future.status == 'cancelled'
    assert not s.tasks
Example #24
def test_set_index_sorted_min_max_same():
    a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})
    b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})

    aa = delayed(a)
    bb = delayed(b)

    df = dd.from_delayed([aa, bb], meta=a)
    assert not df.known_divisions

    df2 = df.set_index('y', sorted=True)
    assert df2.divisions == (0, 1, 1)
def test_respect_data_in_memory(c, s, a):
    x = delayed(inc)(1)
    y = delayed(inc)(x)
    f = c.persist(y)
    yield wait([f])

    assert s.tasks[y.key].who_has == {s.workers[a.address]}

    z = delayed(add)(x, y)
    f2 = c.persist(z)
    while f2.key not in s.tasks or not s.tasks[f2.key]:
        assert s.tasks[y.key].who_has
        yield gen.sleep(0.0001)
Example #26
    def test_categorical_empty(self):
        # GH 1705

        def make_empty():
            return pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})

        def make_full():
            return pd.DataFrame({"A": pd.Categorical(['a', 'a'])})

        a = dd.from_delayed([dask.delayed(make_empty)(),
                             dask.delayed(make_full)()])
        # Used to raise an IndexError
        a.A.cat.categories
def test_repeated_persists_same_priority(c, s, w):
    xs = [delayed(slowinc)(i, delay=0.05, dask_key_name='x-%d' % i) for i in range(10)]
    ys = [delayed(slowinc)(x, delay=0.05, dask_key_name='y-%d' % i) for i, x in enumerate(xs)]
    zs = [delayed(slowdec)(x, delay=0.05, dask_key_name='z-%d' % i) for i, x in enumerate(xs)]

    ys = dask.persist(*ys)
    zs = dask.persist(*zs)

    while sum(t.state == 'memory' for t in s.tasks.values()) < 5:  # TODO: reduce this number
        yield gen.sleep(0.01)

    assert any(s.tasks[y.key].state == 'memory' for y in ys)
    assert any(s.tasks[z.key].state == 'memory' for z in zs)
def test_restart_during_computation(c, s, a, b):
    xs = [delayed(slowinc)(i, delay=0.01) for i in range(50)]
    ys = [delayed(slowinc)(i, delay=0.01) for i in xs]
    zs = [delayed(slowadd)(x, y, delay=0.01) for x, y in zip(xs, ys)]
    total = delayed(sum)(zs)
    result = c.compute(total)

    yield gen.sleep(0.5)
    assert s.rprocessing
    yield c._restart()
    assert not s.rprocessing

    assert len(s.ncores) == 2
    assert not s.task_state
Example #29
def test_delayed_with_dataclass():
    dataclasses = pytest.importorskip("dataclasses")

    # Avoid @dataclass decorator as Python < 3.7 fail to interpret the type hints
    ADataClass = dataclasses.make_dataclass('ADataClass', [('a', int)])

    literal = dask.delayed(3)
    with_class = dask.delayed({"a": ADataClass(a=literal)})

    def return_nested(obj):
        return obj["a"].a
    final = delayed(return_nested)(with_class)

    assert final.compute() == 3
Example #30
def test_transition_story(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)
    f = c.persist(y)
    yield _wait([f])

    assert s.transition_log

    story = s.transition_story(x.key)
    assert all(line in s.transition_log for line in story)
    assert len(story) < len(s.transition_log)
    assert all(x.key == line[0] or x.key in line[-2] for line in story)

    assert len(s.transition_story(x.key, y.key)) > len(story)
Example #31
# The original snippet starts mid-file; the imports and the `inc` definition are
# reconstructed here (the exact sleep duration in `inc` is an assumption).
import time

from dask import delayed
from dask.distributed import Client


def inc(x):
    time.sleep(1)
    return x + 1


def add(x, y):
    time.sleep(2)
    return x + y


client = Client('10.255.23.115:8786')

data = [1, 2, 3, 4, 5, 6, 7, 8]

tic = time.perf_counter()
results = []

for x in data:
    y = delayed(inc)(x)
    results.append(y)

total = delayed(sum)(results)
print("Before computing:", total)  # Let's see what type of thing total is
result = total.compute(scheduler='distributed')
print("After computing :", result)  # After it's computed

toc = time.perf_counter()

print(f"Downloaded the tutorial in {toc - tic:0.4f} seconds")

total.visualize()
Example #32
def open_mfdataset(paths,
                   chunks=None,
                   concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts',
                   preprocess=None,
                   engine=None,
                   lock=None,
                   data_vars='all',
                   coords='different',
                   autoclose=False,
                   parallel=False,
                   **kwargs):
    """Open multiple files as a single dataset.

    Requires dask to be installed. See documentation for details on dask [1].
    Attributes from the first dataset file are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an explicit
        list of files to open.  Paths can be given as strings or as pathlib
        Paths.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by chunk
        sizes. In general, these should divide the dimensions of each dataset.
        If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see the
        full documentation for more details [2].
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You only
        need to provide this argument if the dimension along which you want to
        concatenate is not a dimension in the original datasets, e.g., if you
        want to stack a collection of 2D arrays along a third dimension.
        By default, xarray attempts to infer this argument by examining
        component files. Set ``concat_dim=None`` explicitly to disable
        concatenation.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default engine
        is chosen based on available dependencies, with a preference for
        'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid an OS error from having too
        many files open.  However, this option doesn't work with streams, e.g.,
        BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    data_vars : {'minimal', 'different', 'all' or list of str}, optional
        These data variables will be concatenated together:
          * 'minimal': Only data variables in which the dimension already
            appears are included.
          * 'different': Data variables which are not equal (ignoring
            attributes) across all datasets are also concatenated (as well as
            all for which dimension already appears). Beware: this option may
            load the data payload of data variables into memory if they are not
            already loaded.
          * 'all': All data variables will be concatenated.
          * list of str: The listed data variables will be concatenated, in
            addition to the 'minimal' data variables.
    coords : {'minimal', 'different', 'all' or list of str}, optional
        These coordinate variables will be concatenated together:
          * 'minimal': Only coordinates in which the dimension already appears
            are included.
          * 'different': Coordinates which are not equal (ignoring attributes)
            across all datasets are also concatenated (as well as all for which
            dimension already appears). Beware: this option may load the data
            payload of coordinate variables into memory if they are not already
            loaded.
          * 'all': All coordinate variables will be concatenated, except
            those corresponding to other dimensions.
          * list of str: The listed coordinate variables will be concatenated,
            in addition to the 'minimal' coordinates.
    parallel : bool, optional
        If True, the open and preprocess steps of this function will be
        performed in parallel using ``dask.delayed``. Default is False.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset

    References
    ----------
    .. [1] http://xarray.pydata.org/en/stable/dask.html
    .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
    """
    if isinstance(paths, basestring):
        if is_remote_uri(paths):
            raise ValueError(
                'cannot do wild-card matching for paths that are remote URLs: '
                '{!r}. Instead, supply paths as an explicit list of strings.'.
                format(paths))
        paths = sorted(glob(paths))
    else:
        paths = [str(p) if isinstance(p, path_type) else p for p in paths]

    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)

    open_kwargs = dict(engine=engine,
                       chunks=chunks or {},
                       lock=lock,
                       autoclose=autoclose,
                       **kwargs)

    if parallel:
        import dask
        # wrap the open_dataset, getattr, and preprocess with delayed
        open_ = dask.delayed(open_dataset)
        getattr_ = dask.delayed(getattr)
        if preprocess is not None:
            preprocess = dask.delayed(preprocess)
    else:
        open_ = open_dataset
        getattr_ = getattr

    datasets = [open_(p, **open_kwargs) for p in paths]
    file_objs = [getattr_(ds, '_file_obj') for ds in datasets]
    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if parallel:
        # calling compute here will return the datasets/file_objs lists,
        # the underlying datasets will still be stored as dask arrays
        datasets, file_objs = dask.compute(datasets, file_objs)

    # close datasets in case of a ValueError
    try:
        if concat_dim is _CONCAT_DIM_DEFAULT:
            combined = auto_combine(datasets,
                                    compat=compat,
                                    data_vars=data_vars,
                                    coords=coords)
        else:
            combined = auto_combine(datasets,
                                    concat_dim=concat_dim,
                                    compat=compat,
                                    data_vars=data_vars,
                                    coords=coords)
    except ValueError:
        for ds in datasets:
            ds.close()
        raise

    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs
    return combined
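# Hedged usage sketch for the docstring above; the glob pattern and chunk sizes
# are placeholders. With parallel=True the per-file open/preprocess steps are
# wrapped in dask.delayed and collected in a single dask.compute call.
ds = open_mfdataset('data/*.nc', chunks={'time': 100},
                    concat_dim='time', parallel=True)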
Example #33
        def create_graph(client=None):
            # NOTE ON CHUNKING SIGNATURES:
            # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
            # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
            # The original motivation behind the decision to implement this was the (since refuted) assumption that
            # fast-executing tasks would be greatly impacted by scheduler overhead. The performance gain introduced by
            # chunking of signatures seemed to corroborate this assumption. However, the benefit actually came from less
            # pickling and unpickling of the motif annotations dataframe, as it was not wrapped in a delayed() construct.
            # When using a distributed scheduler, chunking even has a negative impact and is therefore overruled. The
            # negative impact is due to these large chunks having to be shipped to different workers across cluster nodes.

            # NOTE ON BROADCASTING DATASET:
            # There are three large pieces of data that need to be orchestrated between client/scheduler and workers:
            # 1. In a cluster the motif annotations need to be broadcast to all nodes. Otherwise
            # the motif annotations need to be wrapped in a delayed() construct to avoid needless pickling and
            # unpickling between processes.
            def wrap(data):
                if client:
                    return client.scatter(data, broadcast=True)
                return delayed(data, pure=True)

            delayed_or_future_annotations = wrap(motif_annotations)
            # 2. The databases: these database objects are typically proxies to the data on disk. They only have
            # the name and location on shared storage as fields. For consistency reason we do broadcast these database
            # objects to the workers. If we decide to have all information of a database loaded into memory we can still
            # safely use clusters.
            #def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]:
            #    return MemoryDecorator(db)
            #delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs)))
            # Check also latest Stackoverflow message: https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list
            delayed_or_future_dbs = list(map(wrap, rnkdbs))
            # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is overruled
            # when using dask.distributed.
            # See earlier.

            # NOTE ON SHARING RANKING DATABASES ACROSS NODES:
            # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all nodes
            # in the cluster and can all use the same path in the configuration file. The RankingDatabase objects shared
            # from scheduler to workers can therefore just contain information on the database file location.
            # There might be a need to be able to run on clusters that do not share a network drive. This can be
            # achieved by loading all data from the scheduler and using the broadcasting system to share data
            # across nodes. The only element that needs to be adapted to cater for this need is loading the databases
            # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure to
            # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.".

            # NOTE ON REMOVING I/O CONTENTION:
            # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
            # database) would be to load the database in memory (using the available decorator) for each task.
            # The penalty of loading the database in memory should be shared across multiple gene signatures, so
            # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
            # However, because the memory needs of a node running pyscenic are already high (i.e. pre-allocation
            # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
            # this might not be a sound idea.
            # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster
            # to a different database which is achievable in the dask framework. This approach has of course many
            # limitations: for 6 databases you need at least 6 cores and you cannot take advantage of more
            # (http://distributed.readthedocs.io/en/latest/locality.html)

            # NOTE ON REMAINING WARNINGS:
            # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.
            # >> Perhaps some other process is leaking memory?  Process memory: 1.51 GB -- Worker memory limit: 2.15 GB
            # My current idea is that this cannot be avoided: processing a single module can sometimes require a
            # substantial amount of memory because of the pre-allocation of recovery curves (see code notes on how to
            # mitigate this problem). Setting module_chunksize=1 also limits this problem.
            #
            # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
            # The current implementation of module2df releases substantial amounts of memory (i.e. the RCCs), so this might
            # again be unavoidable. TBI + see the following Stack Overflow question:
            # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker

            return aggregate_func(
                (delayed(transform_func)(db, gs_chunk,
                                         delayed_or_future_annotations)
                 for db in delayed_or_future_dbs
                 for gs_chunk in chunked_iter(modules, module_chunksize)))
Example #34
def get_synth_preds(
    store,
    shape,
    all_cat_inds,
    categories,
    batch_size,
    only_cat,
    synth_strat,
    use_dask=True,
    con_limit=None,
    limit=None,
    pbar=None,
):
    with h5py.File(store, "a") as f_synth, h5py.File(store, "r") as f_preds:
        if "synthesis" in f_synth.keys():
            del f_synth['synthesis']
        f_synth.create_dataset("synthesis", shape)
        synth_preds = f_synth["synthesis"]
        if (limit is not None):
            kwd_preds = f_preds["predictions"][:, 0:limit, :]
        else:
            kwd_preds = f_preds["predictions"]
        n_batches = np.ceil(kwd_preds.shape[1] / batch_size)
        LOG.debug(f"{n_batches} batches")
        no_cat_ind = categories.index("")
        for n in range(int(n_batches)):
            start_batch = n * batch_size
            end_batch = (n + 1) * batch_size
            if con_limit is not None:
                kwd_preds_tmp = kwd_preds[0:con_limit,
                                          start_batch:end_batch, :]
            else:
                kwd_preds_tmp = kwd_preds[:, start_batch:end_batch, :]
            n_docs = kwd_preds_tmp.shape[1]
            if True:  # the dask path is always taken here; the use_dask flag is not consulted
                kwd_preds_tmp = dask.delayed(kwd_preds_tmp)
                all_cat_inds = dask.delayed(all_cat_inds)
                jobs = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    job = dask.delayed(get_means_for_one_doc)(
                        doc_index,
                        all_cat_inds,
                        kwd_preds_tmp,
                        categories,
                        no_cat_ind,
                        synth_strat,
                        pbar=pbar,
                    )
                    jobs.append(job)
                hybrid_preds = dask.compute(jobs)[0]
            else:
                hybrid_preds = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    v = get_means_for_one_doc(
                        doc_index,
                        all_cat_inds,
                        kwd_preds_tmp,
                        categories,
                        no_cat_ind,
                        only_cat,
                        synth_strat,
                        pbar=pbar,
                    )
                    hybrid_preds.append(v)
            hybrid_pred_array = np.stack(hybrid_preds)
            if limit is not None:
                if limit <= end_batch:
                    synth_preds[start_batch:limit, :] = hybrid_pred_array
                else:
                    synth_preds[start_batch:end_batch, :] = hybrid_pred_array
            else:
                synth_preds[start_batch:end_batch, :] = hybrid_pred_array
Example #35
def main(client):
    import cudf
    import dask_cudf

    item_ddf, customer_ddf, customer_dem_ddf = read_tables()

    # We want to find clicks in the parameterized category
    # It would be more efficient to translate to a category id, but
    # all of the SQL samples refer to string categories directly. We'll
    # call this clicks_in_category to match the names used in SQL
    # examples, though clicks_in_target would be a much better name
    item_ddf["clicks_in_category"] = ((
        item_ddf["i_category"] == Q05_I_CATEGORY).astype(
            np.int8).reset_index(drop=True))
    keep_cols = ["i_item_sk", "i_category_id", "clicks_in_category"]
    item_ddf = item_ddf[keep_cols]

    web_clickstream_flist = glob.glob(cli_args["data_dir"] +
                                      "web_clickstreams/*.parquet")
    n_workers = len(client.scheduler_info()["workers"])
    batchsize = len(web_clickstream_flist) // n_workers
    if batchsize < 1:
        batchsize = 1

    chunks = [
        web_clickstream_flist[x:x + batchsize]
        for x in range(0, len(web_clickstream_flist), batchsize)
    ]
    task_ls = [
        delayed(get_groupby_results)(c, item_ddf.to_delayed()[0])
        for c in chunks
    ]

    meta_d = {
        "wcs_user_sk": {},
        "clicks_in_category": {},
        "clicks_in_1": {},
        "clicks_in_2": {},
        "clicks_in_3": {},
        "clicks_in_4": {},
        "clicks_in_5": {},
        "clicks_in_6": {},
        "clicks_in_7": {},
    }
    df = cudf.from_pandas(pd.DataFrame.from_dict(meta_d, dtype="int64"))

    sum_by_cat_ddf = dask_cudf.from_delayed(task_ls, meta=df)
    sum_by_cat_ddf = sum_by_cat_ddf.groupby(["wcs_user_sk"], sort=True).sum()
    sum_by_cat_ddf = sum_by_cat_ddf.reset_index(drop=False)
    #
    # Combine user-level click summaries with customer demographics
    #
    customer_merged_ddf = customer_ddf.merge(customer_dem_ddf,
                                             left_on="c_current_cdemo_sk",
                                             right_on="cd_demo_sk")
    customer_merged_ddf = customer_merged_ddf[[
        "c_customer_sk", "cd_gender", "cd_education_status"
    ]]

    customer_merged_ddf["college_education"] = (
        customer_merged_ddf.cd_education_status.isin(COLLEGE_ED_STRS).astype(
            np.int64).fillna(0)).reset_index(drop=True)

    customer_merged_ddf["male"] = ((
        customer_merged_ddf["cd_gender"] == "M").astype(
            np.int64).fillna(0)).reset_index(drop=True)

    cust_and_clicks_ddf = customer_merged_ddf[[
        "c_customer_sk", "college_education", "male"
    ]].merge(sum_by_cat_ddf, left_on="c_customer_sk", right_on="wcs_user_sk")

    keep_cols = ["clicks_in_category", "college_education", "male"
                 ] + [f"clicks_in_{i}" for i in range(1, 8)]
    cust_and_clicks_ddf = cust_and_clicks_ddf[keep_cols]

    # The ETL step in spark covers everything above this point

    # Convert clicks_in_category to a binary label
    cust_and_clicks_ddf["clicks_in_category"] = (
        (cust_and_clicks_ddf["clicks_in_category"] >
         cust_and_clicks_ddf["clicks_in_category"].mean()).reset_index(
             drop=True).astype(np.int64))

    # Converting the dataframe to float64 as cuml logistic reg requires this
    ml_input_df = cust_and_clicks_ddf.astype("float64")

    ml_input_df = ml_input_df.persist()

    ml_tasks = [
        delayed(build_and_predict_model)(df)
        for df in ml_input_df.to_delayed()
    ]
    results_dict = client.compute(*ml_tasks, sync=True)

    return results_dict
Example #36
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties are always given for the output dataset
    no matter how the input data distribution looks like

    Properties to assert:
    * All partitions have a unique value for its correspondent primary key
    * number of partitions is at least one per unique partition value, at
      most ``num_buckets`` per primary partition value.
    * If we demand a column to be sorted it is per partition monotonic
    """

    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries,
                          np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary,
                          np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame({
        "primary": primaries,
        "secondary": secondary,
        "sorted_column": unsorted_column
    })
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # One primary

    # used for assertions later on in the test
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries))
    assert (list(
        map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
            <= [num_buckets] * unique_primaries)

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries))

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])

        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
            dataset_uuid=dataset.uuid, store=store_factory):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use different partition_on
    with pytest.raises(
            ValueError,
            match="Incompatible set of partition keys encountered."):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    tasks.compute()
Example #37
def equilibrium(dbf,
                comps,
                phases,
                conditions,
                output=None,
                model=None,
                verbose=False,
                broadcast=True,
                calc_opts=None,
                scheduler='sync',
                parameters=None,
                solver=None,
                callables=None,
                **kwargs):
    """
    Calculate the equilibrium state of a system containing the specified
    components and phases, under the specified conditions.

    Parameters
    ----------
    dbf : Database
        Thermodynamic database containing the relevant parameters.
    comps : list
        Names of components to consider in the calculation.
    phases : list or dict
        Names of phases to consider in the calculation.
    conditions : dict or (list of dict)
        StateVariables and their corresponding value.
    output : str or list of str, optional
        Additional equilibrium model properties (e.g., CPM, HM, etc.) to compute.
        These must be defined as attributes in the Model class of each phase.
    model : Model, a dict of phase names to Model, or a seq of both, optional
        Model class to use for each phase.
    verbose : bool, optional
        Print details of calculations. Useful for debugging.
    broadcast : bool
        If True, broadcast conditions against each other. This will compute all combinations.
        If False, each condition should be an equal-length list (or single-valued).
        Disabling broadcasting is useful for calculating equilibrium at selected conditions,
        when those conditions don't comprise a grid.
    calc_opts : dict, optional
        Keyword arguments to pass to `calculate`, the energy/property calculation routine.
    scheduler : Dask scheduler, optional
        Job scheduler for performing the computation.
        If None, return a Dask graph of the computation instead of actually doing it.
    parameters : dict, optional
        Maps SymPy Symbol to numbers, for overriding the values of parameters in the Database.
    solver : pycalphad.core.solver.SolverBase
        Instance of a solver that is used to calculate local equilibria.
        Defaults to a pycalphad.core.solver.InteriorPointSolver.
    callables : dict, optional
        Pre-computed callable functions for equilibrium calculation.

    Returns
    -------
    Structured equilibrium calculation, or Dask graph if scheduler=None.

    Examples
    --------
    None yet.
    """
    if not broadcast:
        raise NotImplementedError('Broadcasting cannot yet be disabled')
    comps = sorted(unpack_components(dbf, comps))
    phases = unpack_phases(phases) or sorted(dbf.phases.keys())
    # remove phases that cannot be active
    list_of_possible_phases = filter_phases(dbf, comps)
    active_phases = sorted(
        set(list_of_possible_phases).intersection(set(phases)))
    if len(list_of_possible_phases) == 0:
        raise ConditionError(
            'There are no phases in the Database that can be active with components {0}'
            .format(comps))
    if len(active_phases) == 0:
        raise ConditionError(
            'None of the passed phases ({0}) are active. List of possible phases: {1}.'
            .format(phases, list_of_possible_phases))
    if isinstance(comps, (str, v.Species)):
        comps = [comps]
    if len(set(comps) - set(dbf.species)) > 0:
        raise EquilibriumError('Components not found in database: {}'.format(
            ','.join([c.name for c in (set(comps) - set(dbf.species))])))
    calc_opts = calc_opts if calc_opts is not None else dict()
    solver = solver if solver is not None else InteriorPointSolver(
        verbose=verbose)
    parameters = parameters if parameters is not None else dict()
    if isinstance(parameters, dict):
        parameters = OrderedDict(sorted(parameters.items(), key=str))
    models = instantiate_models(dbf,
                                comps,
                                active_phases,
                                model=model,
                                parameters=parameters)
    # Temporary solution until constraint system improves
    if conditions.get(v.N) is None:
        conditions[v.N] = 1
    if np.any(np.array(conditions[v.N]) != 1):
        raise ConditionError('N!=1 is not yet supported, got N={}'.format(
            conditions[v.N]))
    # Modify conditions values to be within numerical limits, e.g., X(AL)=0
    # Also wrap single-valued conditions with lists
    conds = _adjust_conditions(conditions)

    for cond in conds.keys():
        if isinstance(cond,
                      (v.Composition,
                       v.ChemicalPotential)) and cond.species not in comps:
            raise ConditionError(
                '{} refers to non-existent component'.format(cond))
    state_variables = sorted(get_state_variables(models=models, conds=conds),
                             key=str)
    str_conds = OrderedDict((str(key), value) for key, value in conds.items())
    num_calcs = np.prod([len(i) for i in str_conds.values()])
    components = [x for x in sorted(comps)]
    desired_active_pure_elements = [
        list(x.constituents.keys()) for x in components
    ]
    desired_active_pure_elements = [
        el.upper() for constituents in desired_active_pure_elements
        for el in constituents
    ]
    pure_elements = sorted(
        set([x for x in desired_active_pure_elements if x != 'VA']))
    if verbose:
        print('Components:', ' '.join([str(x) for x in comps]))
        print('Phases:', end=' ')
    output = output if output is not None else 'GM'
    output = output if isinstance(output, (list, tuple, set)) else [output]
    output = set(output)
    output |= {'GM'}
    output = sorted(output)
    need_hessians = any(
        type(c) in v.CONDITIONS_REQUIRING_HESSIANS for c in conds.keys())
    phase_records = build_phase_records(dbf,
                                        comps,
                                        active_phases,
                                        conds,
                                        models,
                                        output='GM',
                                        callables=callables,
                                        parameters=parameters,
                                        verbose=verbose,
                                        build_gradients=True,
                                        build_hessians=need_hessians)
    if verbose:
        print('[done]', end='\n')

    # 'calculate' accepts conditions through its keyword arguments
    grid_opts = calc_opts.copy()
    statevar_strings = [str(x) for x in state_variables]
    grid_opts.update({
        key: value
        for key, value in str_conds.items() if key in statevar_strings
    })
    if 'pdens' not in grid_opts:
        grid_opts['pdens'] = 500
    grid = delayed(calculate, pure=False)(dbf,
                                          comps,
                                          active_phases,
                                          model=models,
                                          fake_points=True,
                                          callables=callables,
                                          output='GM',
                                          parameters=parameters,
                                          **grid_opts)
    coord_dict = str_conds.copy()
    coord_dict['vertex'] = np.arange(
        len(pure_elements) + 1
    )  # +1 is to accommodate the degenerate degree of freedom at the invariant reactions
    coord_dict['component'] = pure_elements
    grid_shape = tuple(len(x)
                       for x in conds.values()) + (len(pure_elements) + 1, )
    properties = delayed(starting_point, pure=False)(conds, state_variables,
                                                     phase_records, grid)
    conditions_per_chunk_per_axis = 2
    if num_calcs > 1:
        # Generate slices of 'properties'
        slices = []
        for val in grid_shape[:-1]:
            idx_arr = list(range(val))
            num_chunks = int(np.floor(val / conditions_per_chunk_per_axis))
            if num_chunks > 0:
                cond_slices = [
                    x for x in np.array_split(np.asarray(idx_arr), num_chunks)
                    if len(x) > 0
                ]
            else:
                cond_slices = [idx_arr]
            slices.append(cond_slices)
        chunk_dims = [len(slc) for slc in slices]
        chunk_grid = np.array(
            np.unravel_index(np.arange(np.prod(chunk_dims)), chunk_dims)).T
        res = []
        for chunk in chunk_grid:
            prop_slice = properties[OrderedDict(
                list(
                    zip(str_conds.keys(), [
                        np.atleast_1d(sl)[ch] for ch, sl in zip(chunk, slices)
                    ])))]
            job = delayed(_solve_eq_at_conditions,
                          pure=False)(comps,
                                      prop_slice,
                                      phase_records,
                                      grid,
                                      list(str_conds.keys()),
                                      state_variables,
                                      verbose,
                                      solver=solver)
            res.append(job)
        properties = delayed(_merge_property_slices,
                             pure=False)(properties, chunk_grid, slices,
                                         list(str_conds.keys()), res)
    else:
        # Single-process job; don't create child processes
        properties = delayed(_solve_eq_at_conditions,
                             pure=False)(comps,
                                         properties,
                                         phase_records,
                                         grid,
                                         list(str_conds.keys()),
                                         state_variables,
                                         verbose,
                                         solver=solver)

    # Compute equilibrium values of any additional user-specified properties
    # We already computed these properties so don't recompute them
    output = sorted(set(output) - {'GM', 'MU'})
    for out in output:
        if (out is None) or (len(out) == 0):
            continue
        # TODO: How do we know if a specified property should be per_phase or not?
        # For now, we make a best guess
        if (out == 'degree_of_ordering') or (out == 'DOO'):
            per_phase = True
        else:
            per_phase = False
        eqcal = delayed(_eqcalculate, pure=False)(dbf,
                                                  comps,
                                                  active_phases,
                                                  conditions,
                                                  out,
                                                  data=properties,
                                                  per_phase=per_phase,
                                                  callables=callables,
                                                  parameters=parameters,
                                                  model=models,
                                                  **calc_opts)
        properties = delayed(properties.merge, pure=False)(eqcal,
                                                           compat='equals')
    if scheduler is not None:
        properties = dask.compute(properties, scheduler=scheduler)[0]
    properties.attrs['created'] = datetime.utcnow().isoformat()
    if len(kwargs) > 0:
        warnings.warn(
            'The following equilibrium keyword arguments were passed, but unused:\n{}'
            .format(kwargs))
    return properties
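# A minimal, self-contained sketch of the chunking scheme above (toy numbers, not
# the pycalphad API): each condition axis is split into slices of roughly
# `conditions_per_chunk_per_axis` points, and np.unravel_index enumerates every
# combination of per-axis slices so each combination becomes one delayed solve.
import numpy as np

grid_shape = (5, 3)  # e.g. 5 temperature points x 3 composition points
conditions_per_chunk_per_axis = 2

slices = []
for val in grid_shape:
    num_chunks = max(int(np.floor(val / conditions_per_chunk_per_axis)), 1)
    slices.append([s for s in np.array_split(np.arange(val), num_chunks) if len(s) > 0])

chunk_dims = [len(slc) for slc in slices]
chunk_grid = np.array(np.unravel_index(np.arange(np.prod(chunk_dims)), chunk_dims)).T
for chunk in chunk_grid:
    # each row of chunk_grid picks one slice per axis, i.e. one block of conditions
    print([slices[axis][idx] for axis, idx in enumerate(chunk)])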
Exemple #38
0
def perceptual_path_length_score(model,
                                 data_generator,
                                 tolerance_threshold=1e-6,
                                 max_iteration=200,
                                 batch_size=10):
    # prepare the VGG16 model used as the perceptual feature extractor
    VGG16_model = VGG16(include_top=False,
                        pooling='avg',
                        input_shape=model.get_inputs_shape())
    VGG16_features_fn = lambda x: VGG16_model(x)
    epsilon = 1e-2

    def learned_perceptual_image_patch_similarity(images_a, images_b):
        """LPIPS metric using VGG-16 and Zhang weighting. (https://arxiv.org/abs/1801.03924)

        Takes reference images and corrupted images as an input and outputs the perceptual
        distance between the image pairs.
        """

        # Concatenate images.
        images = tf.concat([images_a, images_b], axis=0)

        # Extract features.
        vgg_features = VGG16_features_fn(images)

        # Normalize each feature vector to unit length over channel dimension.
        normalized_features = []
        for x in vgg_features:
            x = tf.reshape(x, (len(x), 1))
            n = tf.reduce_sum(x**2, axis=1, keepdims=True)**0.5
            normalized_features.append(x / (n + 1e-10))

        # Split and compute distances.
        diff = [
            tf.subtract(*tf.split(x, 2, axis=0))**2
            for x in normalized_features
        ]

        return np.array(diff)

    def filter_distances_fn(distances):
        # Reject outliers.
        lo = np.percentile(distances, 1, interpolation='lower')
        hi = np.percentile(distances, 99, interpolation='higher')
        filtered_distances = np.extract(
            np.logical_and(lo <= distances, distances <= hi), distances)
        return filtered_distances

    def calculate_distances(images):
        images01, images02 = images[0::2], images[1::2]
        return learned_perceptual_image_patch_similarity(
            images01, images02) * (1 / epsilon**2)

    # prepare the random image generator for the autoencoder model
    def model_random_images_generator():
        while True:
            # Generate latents from the data
            latents_real = model.encode(next(data_generator)['images'])

            # Generate random latents and interpolation t-values.
            latents_t = np.random.normal(size=latents_real.shape)
            lerp_t = np.random.uniform(size=1)[0]

            latents_e = slerp(lerp_t, latents_real, latents_t)
            images = model.decode(latents_e)  #.numpy()
            # images = (images*255).astype(np.uint8)

            yield images[:batch_size]
            # calculate_distances(images[0::2], images[1::2])

    def stopping_fn(distances):
        # Reject outliers, then average the filtered distances.
        filtered_distances = filter_distances_fn(distances)
        return np.mean(filtered_distances)

    ppl_mean = bootstrapping_additive(
        data_generator=model_random_images_generator(), func=delayed(calculate_distances), \
        stopping_func=stopping_fn, tolerance_threshold=tolerance_threshold, max_iteration=max_iteration
    )

    return ppl_mean
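# `slerp` is used above but not defined in this snippet. A common definition
# (spherical linear interpolation between batches of latent vectors, as used by the
# original PPL metric) is sketched below; the actual helper may differ.
import numpy as np

def slerp(t, a, b, eps=1e-7):
    """Spherical interpolation between latent batches a and b at fraction t."""
    a_norm = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=-1, keepdims=True)
    dot = np.clip(np.sum(a_norm * b_norm, axis=-1, keepdims=True), -1.0, 1.0)
    omega = np.arccos(dot)
    so = np.sin(omega)
    # fall back to linear interpolation when the two vectors are nearly parallel
    lerp = (1.0 - t) * a + t * b
    slerped = (np.sin((1.0 - t) * omega) / np.maximum(so, eps)) * a \
              + (np.sin(t * omega) / np.maximum(so, eps)) * b
    return np.where(so < eps, lerp, slerped)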
Exemple #39
0
def get_data(baseline=False,
             start_year=DEFAULT_START_YEAR,
             reform={},
             data=None,
             client=None,
             num_workers=1):
    '''
    This function creates dataframes of micro data with marginal tax
    rates and information to compute effective tax rates from the
    Tax-Calculator output.  The resulting dictionary of dataframes is
    returned and saved to disk in a pickle file.

    Args:
        baseline (boolean): True if baseline tax policy
        calculator_start_year (int): first year of budget window
        reform (dictionary): IIT policy reform parameters, None if
            baseline
        data (DataFrame or str): DataFrame or path to datafile for
            Records object
        client (Dask Client object): client for Dask multiprocessing
        num_workers (int): number of workers to use for Dask
            multiprocessing

    Returns:
        micro_data_dict (dict): dict of Pandas Dataframe, one for each
            year from start_year to the maximum year Tax-Calculator can
            analyze
        taxcalc_version (str): version of Tax-Calculator used

    '''
    # Compute MTRs and taxes for each year, but not beyond TC_LAST_YEAR
    lazy_values = []
    for year in range(start_year, TC_LAST_YEAR + 1):
        lazy_values.append(
            delayed(taxcalc_advance)(baseline, start_year, reform, data, year))
    if client:  # pragma: no cover
        futures = client.compute(lazy_values, num_workers=num_workers)
        results = client.gather(futures)
    else:
        results = compute(*lazy_values,
                          scheduler=dask.multiprocessing.get,
                          num_workers=num_workers)

    # dictionary of data frames to return
    micro_data_dict = {}
    for i, result in enumerate(results):
        year = start_year + i
        micro_data_dict[str(year)] = DataFrame(result)

    if baseline:
        pkl_path = "micro_data_baseline.pkl"
    else:
        pkl_path = "micro_data_policy.pkl"

    with open(pkl_path, "wb") as f:
        pickle.dump(micro_data_dict, f)

    # Do some garbage collection
    del results

    # Pull Tax-Calc version for reference
    taxcalc_version = pkg_resources.get_distribution("taxcalc").version

    return micro_data_dict, taxcalc_version
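# A toy sketch of the scheduling pattern used in get_data (stand-in function, not
# taxcalc): build a list of delayed calls, then either submit them to a
# dask.distributed Client (compute + gather) or fall back to a local scheduler.
from dask import compute, delayed

def toy_advance(year):
    # stand-in for taxcalc_advance(baseline, start_year, reform, data, year)
    return {"year": year, "mtr": 0.25}

lazy_values = [delayed(toy_advance)(year) for year in range(2021, 2024)]

client = None  # e.g. dask.distributed.Client() in a multi-machine run
if client:
    futures = client.compute(lazy_values, num_workers=2)
    results = client.gather(futures)
else:
    # get_data above uses the multiprocessing scheduler; threads keep this
    # sketch runnable without pickling concerns
    results = compute(*lazy_values, scheduler="threads", num_workers=2)
print(results)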
Exemple #40
0
    def load_from_unf(cls, filename, lazy=False):
        r"""Load a `.unf`-file into a :class:`~.SemperFormat` object.

        Parameters
        ----------
        filename : string
            The name of the unf-file from which to load the data. Standard
            format is '\*.unf'.
        lazy : bool, optional
            If True, load the data lazily as a dask array (default is False).

        Returns
        -------
        semper : :class:`~.SemperFormat` (N=1)
            SEMPER file format object containing the loaded information.

        """
        metadata = OrderedDict()
        with open(filename, 'rb') as f:
            # Read header:
            rec_length = np.fromfile(f, dtype='<i4',
                                     count=1)[0]  # length of header
            header = np.fromfile(f,
                                 dtype=cls.HEADER_DTYPES[:rec_length // 2],
                                 count=1)
            metadata.update(sarray2dict(header))
            assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
                'Error while reading the header (length is not correct)!'
            data_format = cls.IFORM_DICT[metadata['IFORM']]
            iversn, remain = divmod(metadata['IFLAG'], 10000)
            ilabel, ntitle = divmod(remain, 1000)
            metadata.update({
                'IVERSN': iversn,
                'ILABEL': ilabel,
                'NTITLE': ntitle
            })
            # Read title:
            title = ''
            if ntitle > 0:
                assert np.fromfile(f, dtype='<i4',
                                   count=1)[0] == ntitle  # length of title
                title = b''.join(np.fromfile(f, dtype='c', count=ntitle))
                title = title.decode()
                metadata['TITLE'] = title
                assert np.fromfile(f, dtype='<i4', count=1)[0] == ntitle
            if ilabel:
                try:
                    metadata.update(cls._read_label(f))
                except Exception as e:
                    warning = ('Could not read label, trying to proceed '
                               'without it!')
                    warning += ' (Error message: {})'.format(str(e))
                    warnings.warn(warning)
            # Read picture data:
            pos = f.tell()
            shape = metadata['NLAY'], metadata['NROW'], metadata['NCOL']
            if lazy:
                from dask.array import from_delayed
                from dask import delayed
                task = delayed(_read_data)(f, filename, pos, data_format,
                                           shape)
                data = from_delayed(task, shape=shape, dtype=data_format)
            else:
                data = _read_data(f, filename, pos, data_format, shape)
        offsets = (metadata.get('X0V0', 0.), metadata.get('Y0V2', 0.),
                   metadata.get('Z0V4', 0.))
        scales = (metadata.get('DXV1',
                               1.), metadata.get('DYV3',
                                                 1.), metadata.get('DZV5', 1.))
        units = (metadata.get('XUNIT',
                              Undefined), metadata.get('YUNIT', Undefined),
                 metadata.get('ZUNIT', Undefined))
        return cls(data, title, offsets, scales, units, metadata)
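# A small sketch of the lazy branch above (toy reader, not the SEMPER format): the
# raw reader is wrapped in dask.delayed and rebuilt as a dask array using the shape
# and dtype known from the header, so nothing is read until .compute() is called.
import numpy as np
from dask import delayed
from dask.array import from_delayed

def read_picture_data(shape, dtype):
    # stand-in for _read_data(f, filename, pos, data_format, shape)
    return np.zeros(shape, dtype=dtype)

shape, data_format = (2, 64, 64), np.float32
task = delayed(read_picture_data)(shape, data_format)
data = from_delayed(task, shape=shape, dtype=data_format)
print(data)                   # lazy dask array, no data read yet
print(data.sum().compute())   # the reader runs only here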
Exemple #41
0
    output.write()

    print("{:<15}".format(algo.getName()) \
        + " executed input: " + "{:<15}".format(file) \
        + " in {:.2f}s".format(end_time - start_time) \
        + " | Score: " + "{:<10}".format(score.score) \
        + " | Params: " + str(parameter))

    return score.score


if __name__ == "__main__":

    print("-------------------------")
    print("      HashCode 2021      ")
    print("-------------------------")
    start_time_program = time.time()

    scores = []
    for f in FILES:
        res = delayed(run_algorithm)(GreedyAlgorithm(), f)
        scores.append(res)

    total_scores = delayed(sum)(scores)
    total_scores = total_scores.compute(
        num_workers=1)  # Set num_workers=# to limit worker processes
    print("Total scores: ", total_scores)

    end_time_program = time.time()
    print("Whole execution took: {:.2f}s".format(end_time_program -
                                                 start_time_program))
Exemple #42
0
def fft_shading_test(obj,
                     variable='diffuse_hemisp_narrowband_filter4',
                     fft_window=30,
                     shad_freq_lower=[0.008, 0.017],
                     shad_freq_upper=[0.0105, 0.0195],
                     ratio_thresh=[3.15, 1.2],
                     time_interval=None):
    """
    Function to test shadowband radiometer (MFRSR, RSS, etc) instruments
    for shading related problems.  Program was adapted by Adam Theisen
    from the method defined in Alexandrov et al 2007 to process on a
    point by point basis using a window of data around that point for
    the FFT analysis.

    For ARM data, testing has found that this works the best on narrowband
    filter4 for MFRSR data.

    Function has been tested and is in use by the ARM DQ Office for
    problem detection.  It is known to have some false positives at times.

    obj.clean.cleanup() needs to be run ahead of time to ensure the test results
    are properly added to the QC variable.

    Parameters
    ----------
    obj : xarray Dataset
        Data object

    Returns
    -------
    obj : xarray Dataset
        Data object

    References
    ----------
    Alexandrov, Mikhail & Kiedron, Peter & Michalsky, Joseph & Hodges, Gary
    & Flynn, Connor & Lacis, Andrew. (2007). Optical depth measurements by
    shadow-band radiometers and their uncertainties. Applied optics. 46.
    8027-38. 10.1364/AO.46.008027.

    """

    # Get time and data from variable
    time = obj['time'].values
    data = obj[variable].values
    if 'missing_value' in obj[variable].attrs:
        missing = obj[variable].attrs['missing_value']
    else:
        missing = -9999.

    # Get time interval between measurements
    dt = time_interval
    if time_interval is None:
        dt = determine_time_delta(time)

    # Compute the FFT for each point +- window samples
    task = []
    for t in range(len(time)):
        sind = t - fft_window
        eind = t + fft_window
        if sind < 0:
            sind = 0
        if eind > len(time):
            eind = len(time)

        # Get data and remove all nan/missing values
        d = data[sind:eind]
        idx = (d != missing) & ~np.isnan(d)
        index = np.where(idx)
        d = d[index]

        # Add to task for dask processing
        lat = [
            obj['lat'].values
        ] if not isinstance(obj['lat'].values, list) else obj['lat'].values
        lon = [
            obj['lon'].values
        ] if not isinstance(obj['lon'].values, list) else obj['lon'].values
        task.append(
            dask.delayed(fft_shading_test_process)(
                time[t],
                lat[0],
                lon[0],
                d,
                shad_freq_lower=shad_freq_lower,
                shad_freq_upper=shad_freq_upper,
                ratio_thresh=ratio_thresh,
                time_interval=dt))

    # Process using dask
    result = dask.compute(*task)

    # Run data through a rolling median to filter out singular
    # false positives
    result = pd.Series(result).rolling(window=5, min_periods=1).median()

    # Find indices where shading is indicated
    idx = (np.asarray(result) > 0.4)
    index = np.where(idx)

    # Add test to QC Variable
    desc = 'FFT Shading Test'
    result = obj.qcfilter.add_test(variable, index=index, test_meaning=desc)

    return obj
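# A toy illustration of the false-positive filter near the end of the test: the
# per-point shading flags are smoothed with a rolling median so isolated detections
# are dropped before indices are added to the QC variable.
import numpy as np
import pandas as pd

flags = np.array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0])  # toy per-point test results
smoothed = pd.Series(flags).rolling(window=5, min_periods=1).median()
shading_index = np.where(np.asarray(smoothed) > 0.4)
print(shading_index)  # the isolated flag at position 2 has been filtered out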
Exemple #43
0
def compute_tile_foreground_fraction(slide_path,
                                     im_fgnd_mask_lres,
                                     fgnd_seg_scale,
                                     it_kwargs,
                                     tile_position=None):
    """
    Computes the fraction of foreground of a single tile or
    all tiles in a whole slide image given the binary foreground
    mask computed from a low resolution version of the slide.

    Parameters
    ----------
    slide_path : str
        path to an image or slide
    im_fgnd_mask_lres : array_like
        A binary foreground mask computed at a low-resolution
    fgnd_seg_scale : double
        The scale/magnification at which the foreground mask `im_fgnd_mask_lres`
        was computed
    it_kwargs : dict
        A dictionary of any key:value parameters (e.g. defining the scale,
         tile_size, region etc) in addition to tile_position that need to be
         passed to `large_image.TileSource.getSingleTile` to get the tile.
    tile_position : int or None
        A linear 0-based index of a tile for which the foreground needs to be
        computed. If set to None, the foreground fraction of all tiles will be
        computed.

    Returns
    -------
    tile_fgnd_frac : double or array_like
        A value between 0 and 1 indicating the fraction of foreground pixels
        present in the tile indicated by `tile_position`. If `tile_position`
        is set to None, then a 1D array containing the foreground fraction of
        all tiles will be returned.

    """

    if tile_position is None:

        # get slide tile source
        ts = large_image.getTileSource(slide_path)

        num_tiles = ts.getSingleTile(**it_kwargs)['iterator_range']['position']

        # broadcasting fgnd mask to all dask workers
        try:
            c = dask.distributed.get_client()

            [im_fgnd_mask_lres] = c.scatter([im_fgnd_mask_lres],
                                            broadcast=True)
        except ValueError:
            pass

        # compute tile foreground fraction in parallel
        tile_fgnd_frac = []

        for tile_position in range(num_tiles):

            tile_fgnd_frac.append(
                dask.delayed(_compute_tile_foreground_fraction_single)(
                    slide_path, im_fgnd_mask_lres, fgnd_seg_scale, it_kwargs,
                    tile_position))

        tile_fgnd_frac = dask.delayed(tile_fgnd_frac).compute()

        tile_fgnd_frac = np.array(tile_fgnd_frac)

    elif np.isscalar(tile_position):

        tile_fgnd_frac = _compute_tile_foreground_fraction_single(
            slide_path, im_fgnd_mask_lres, fgnd_seg_scale, it_kwargs,
            tile_position)

    else:

        raise ValueError(
            'Invalid value for tile_position. Must be None or int')

    return tile_fgnd_frac
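# A minimal sketch of the scatter/broadcast pattern above (toy mask and function,
# not large_image): when a dask.distributed Client exists, the low-resolution mask
# is shipped to every worker once; otherwise the array is passed directly.
import numpy as np
import dask
import dask.distributed

def frac_above(mask, thresh):
    # toy stand-in for the per-tile foreground computation
    return float((mask > thresh).mean())

mask = np.random.rand(256, 256)
try:
    client = dask.distributed.get_client()
    [mask_ref] = client.scatter([mask], broadcast=True)  # broadcast once to all workers
except ValueError:
    mask_ref = mask  # no client running: fall back to the in-process array

tasks = [dask.delayed(frac_above)(mask_ref, t) for t in (0.25, 0.5, 0.75)]
print(dask.compute(*tasks))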
Exemple #44
0
def ser_reader(filename, objects=None, *args, **kwds):
    """Reads the information from the file and returns it in the HyperSpy
    required format.

    """
    header, data = load_ser_file(filename)
    record_by = guess_record_by(header['DataTypeID'])
    ndim = int(header['NumberDimensions'])
    date, time = None, None
    if objects is not None:
        objects_dict = convert_xml_to_dict(objects[0])
        date, time = _get_date_time(objects_dict.ObjectInfo.AcquireDate)
    if "PositionY" in data.dtype.names and len(data['PositionY']) > 1 and \
            (data['PositionY'][0] == data['PositionY'][1]):
        # The spatial dimensions are stored in F order i.e. X, Y, ...
        order = "F"
    else:
        # The spatial dimensions are stored in C order i.e. ..., Y, X
        order = "C"
    if ndim == 0 and header["ValidNumberElements"] != 0:
        # The calibration of the axes are not stored in the header.
        # We try to guess from the position coordinates.
        array_shape, axes = get_axes_from_position(header=header, data=data)
    else:
        axes = []
        array_shape = [
            None,
        ] * int(ndim)
        spatial_axes = ["x", "y"][:ndim]
        for i in range(ndim):
            idim = 1 + i if order == "C" else ndim - i
            if (record_by == "spectrum"
                    or header['Dim-%i_DimensionSize' % (i + 1)][0] != 1):
                units = (header['Dim-%i_Units' % (idim)][0].decode('utf-8')
                         if header['Dim-%i_UnitsLength' %
                                   (idim)] > 0 else t.Undefined)
                if units == "meters":
                    name = (spatial_axes.pop()
                            if order == "F" else spatial_axes.pop(-1))
                else:
                    name = t.Undefined
                axes.append({
                    'offset':
                    header['Dim-%i_CalibrationOffset' % idim][0],
                    'scale':
                    header['Dim-%i_CalibrationDelta' % idim][0],
                    'units':
                    units,
                    'size':
                    header['Dim-%i_DimensionSize' % idim][0],
                    'name':
                    name,
                })
                array_shape[i] = \
                    header['Dim-%i_DimensionSize' % idim][0]
    # Spectral dimension
    if record_by == "spectrum":
        axes.append({
            'offset': data['CalibrationOffset'][0],
            'scale': data['CalibrationDelta'][0],
            'size': data['ArrayLength'][0],
            'index_in_array': header['NumberDimensions'][0]
        })

        # FEI seems to use the international system of units (SI) for the
        # energy scale (eV).
        axes[-1]['units'] = 'eV'
        axes[-1]['name'] = 'Energy'

        array_shape.append(data['ArrayLength'][0])

    elif record_by == 'image':
        if objects is not None:
            units = _guess_units_from_mode(objects_dict, header)
        else:
            units = "meters"
        # Y axis
        axes.append({
            'name':
            'y',
            'offset':
            data['CalibrationOffsetY'][0] -
            data['CalibrationElementY'][0] * data['CalibrationDeltaY'][0],
            'scale':
            data['CalibrationDeltaY'][0],
            'units':
            units,
            'size':
            data['ArraySizeY'][0],
        })
        array_shape.append(data['ArraySizeY'][0])
        # X axis
        axes.append({
            'name':
            'x',
            'offset':
            data['CalibrationOffsetX'][0] -
            data['CalibrationElementX'][0] * data['CalibrationDeltaX'][0],
            'scale':
            data['CalibrationDeltaX'][0],
            'size':
            data['ArraySizeX'][0],
            'units':
            units,
        })
        array_shape.append(data['ArraySizeX'][0])

    # FEI seems to use the international system of units (SI) for the
    # spatial scale. However, we prefer to work in nm
    for axis in axes:
        if axis['units'] == 'meters':
            axis['units'] = 'nm'
            axis['scale'] *= 10**9
        elif axis['units'] == '1/meters':
            axis['units'] = '1/nm'
            axis['scale'] /= 10**9
    # Remove Nones from array_shape caused by squeezing size 1 dimensions
    array_shape = [dim for dim in array_shape if dim is not None]
    lazy = kwds.pop('lazy', False)
    if lazy:
        from dask import delayed
        from dask.array import from_delayed
        val = delayed(load_only_data, pure=True)(filename, array_shape,
                                                 record_by, len(axes))
        dc = from_delayed(val, shape=array_shape, dtype=data['Array'].dtype)
    else:
        dc = load_only_data(filename,
                            array_shape,
                            record_by,
                            len(axes),
                            data=data)

    if ordict:
        original_metadata = OrderedDict()
    else:
        original_metadata = {}
    header_parameters = sarray2dict(header)
    sarray2dict(data, header_parameters)
    # We remove the Array key to save memory avoiding duplication
    del header_parameters['Array']
    original_metadata['ser_header_parameters'] = header_parameters
    metadata = {
        'General': {
            'original_filename': os.path.split(filename)[1],
        },
        "Signal": {
            'signal_type': "",
            'record_by': record_by,
        },
    }
    if date is not None and time is not None:
        metadata['General']['date'] = date
        metadata['General']['time'] = time
    dictionary = {
        'data': dc,
        'metadata': metadata,
        'axes': axes,
        'original_metadata': original_metadata,
        'mapping': mapping
    }
    return dictionary
 def dask_win_func(n):
     return dsar.from_delayed(
         delayed(numpy_win_func, pure=True)(n), (n, ), float)
Exemple #46
0
def svd_flip(u, v):
    u2, v2 = delayed(skm.svd_flip, nout=2)(u, v)
    u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype)
    v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype)
    return u, v
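# A hedged usage sketch: `skm` above is assumed to be sklearn.utils.extmath. The
# nout=2 delayed call wraps a function that returns a tuple, and each output is
# re-wrapped as a dask array with a known shape/dtype via from_delayed.
import dask.array as da
import sklearn.utils.extmath as skm
from dask import delayed

x = da.random.random((100, 10), chunks=(50, 10))
u, s, v = da.linalg.svd(x)
u2, v2 = delayed(skm.svd_flip, nout=2)(u, v)  # deterministic signs for u and v
u_fixed = da.from_delayed(u2, shape=u.shape, dtype=u.dtype)
v_fixed = da.from_delayed(v2, shape=v.shape, dtype=v.dtype)
print(u_fixed.shape, v_fixed.shape)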
Exemple #47
0
def to_cloudvolume(arr,
                   cloudpath,
                   resolution=(1, 1, 1),
                   voxel_offset=(0, 0, 0),
                   layer_type=None,
                   encoding='raw',
                   max_mip=0,
                   compute=True,
                   return_stored=False,
                   **kwargs):
    """Save 3d or 4d dask array to the precomputed CloudVolume storage format.

  NOTE: DO NOT USE thread-based dask scheduler. See comment at top of module.

  See https://docs.dask.org/en/latest/array.html for details about the format.

  Parameters
  ----------
  arr: dask.array
    Data to store
  cloudpath: str
    Path to the dataset layer. This should match storage's supported
    providers.
    e.g. Google: gs://$BUCKET/$DATASET/$LAYER/
         S3    : s3://$BUCKET/$DATASET/$LAYER/
         Lcl FS: file:///tmp/$DATASET/$LAYER/
         Boss  : boss://$COLLECTION/$EXPERIMENT/$CHANNEL
         HTTP/S: http(s)://.../$CHANNEL
         matrix: matrix://$BUCKET/$DATASET/$LAYER/
  resolution: Iterable of ints of length 3
    The x, y, z voxel dimensions in nanometers
  voxel_offset: Iterable of ints of length 3
    The x, y, z beginning of dataset in positive cartesian space.
  layer_type: str
    "image" or "segmentation"
  max_mip: int
    Maximum mip level id.
  compute: boolean, optional
    If true compute immediately, return ``dask.delayed.Delayed`` otherwise.
  return_stored: boolean, optional
    Optionally return stored results.
  kwargs: passed to the ``cloudvolume.CloudVolume()`` function, e.g., compression options

  Raises
  ------
  ValueError
    If ``arr`` has ndim different that 3 or 4, or ``layer_type`` is unsupported.

  Returns
  -------
  See notes on `compute` and `return_stored` parameters.
  """
    import dask
    import dask.array as da
    if not da.core._check_regular_chunks(arr.chunks):
        raise ValueError('Attempt to save array to cloudvolume with irregular '
                         'chunking, please call `arr.rechunk(...)` first.')

    if not layer_type:
        if arr.dtype in (bool, np.uint32, np.uint64, np.uint16):
            layer_type = 'segmentation'
        elif np.issubdtype(arr.dtype, np.integer) or np.issubdtype(
                arr.dtype, np.floating):
            layer_type = 'image'
        else:
            raise ValueError('Could not infer CloudVolume layer_type from dtype: %s' %
                             arr.dtype)

    if arr.ndim == 3:
        num_channels = 1
        chunk_size = arr.chunksize
    elif arr.ndim == 4:
        num_channels = arr.shape[-1]
        chunk_size = arr.chunksize[:3]
    else:
        raise ValueError(
            'CloudVolume only supports 3 or 4 dimensions.  Array has %d.' %
            arr.ndim)

    info = CloudVolume.create_new_info(num_channels,
                                       layer_type,
                                       arr.dtype.name,
                                       encoding,
                                       resolution,
                                       voxel_offset,
                                       arr.shape[:3],
                                       chunk_size=chunk_size,
                                       max_mip=max_mip)

    # Delay writing any metadata until computation time.
    #   - the caller may never do the full computation
    #   - the filesystem may be slow, and there is a desire to open files
    #     in parallel on worker machines.
    vol = dask.delayed(_create_cloudvolume)(cloudpath, info, **kwargs)
    return arr.store(vol,
                     lock=False,
                     compute=compute,
                     return_stored=return_stored)
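# A toy sketch of the delayed-target pattern above (plain numpy target instead of
# CloudVolume): the target object is itself a dask.delayed call, so its setup only
# happens when the store graph actually runs, mirroring _create_cloudvolume.
import numpy as np
import dask
import dask.array as da

class ArrayTarget:
    # minimal stand-in for a CloudVolume-like object that accepts slice assignment
    def __init__(self, shape, dtype):
        self.backing = np.zeros(shape, dtype=dtype)

    def __setitem__(self, key, value):
        self.backing[key] = value

def make_target(shape, dtype):
    # in to_cloudvolume this is where the dataset "info" metadata would be written
    return ArrayTarget(shape, dtype)

arr = da.random.random((64, 64, 8), chunks=(32, 32, 8))
target = dask.delayed(make_target)(arr.shape, arr.dtype)  # created only at compute time
arr.store(target, lock=False, compute=True)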
Exemple #48
0
    def add_data(
        self,
        dates,
        param=None,
        daily=False,
        network=None,
        download=False,
        local=False,
        n_procs=1,
        meta=False,
    ):
        """Short summary.

        Parameters
        ----------
        dates : list of datetime objects
            Description of parameter `dates`.
        param : list of strings
            Description of parameter `param` (the default is None).
        daily : boolean
            Description of parameter `daily` (the default is False).
        network : type
            Description of parameter `network` (the default is None).
        download : type
            Description of parameter `download` (the default is False).

        Returns
        -------
        pandas DataFrame
            Description of returned object.

        """
        import dask
        import dask.dataframe as dd

        if param is None:
            params = [
                "SPEC",
                "PM10",
                "PM2.5",
                "PM2.5_FRM",
                "CO",
                "OZONE",
                "SO2",
                "VOC",
                "NONOXNOY",
                "WIND",
                "TEMP",
                "RHDP",
            ]
        else:
            params = param
        urls, fnames = self.build_urls(params, dates, daily=daily)
        if download:
            for url, fname in zip(urls, fnames):
                self.retrieve(url, fname)
            dfs = [
                dask.delayed(self.load_aqs_file)(i, network) for i in fnames
            ]
        elif local:
            dfs = [
                dask.delayed(self.load_aqs_file)(i, network) for i in fnames
            ]
        else:
            dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in urls]
        dff = dd.from_delayed(dfs)
        dfff = dff.compute(num_workers=n_procs)
        if meta:
            return self.add_data2(dfff, daily, network)
        else:
            return dfff
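# A toy sketch of the from_delayed pattern above (stand-in loader, not the AQS
# files): each delayed call returns one pandas DataFrame, dd.from_delayed stitches
# them into a lazy dask DataFrame, and compute() materializes it with n workers.
import pandas as pd
import dask
import dask.dataframe as dd

def load_one(i):
    # stand-in for self.load_aqs_file(fname, network)
    return pd.DataFrame({"site": [i] * 3, "value": range(3)})

dfs = [dask.delayed(load_one)(i) for i in range(4)]
ddf = dd.from_delayed(dfs)        # lazy concatenation of the delayed parts
df = ddf.compute(num_workers=2)   # same pattern as dff.compute(num_workers=n_procs)
print(len(df))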
Exemple #49
0
def file_reader(filename,
                record_by='image',
                force_read_resolution=False,
                **kwds):
    """
    Read data from tif files using Christoph Gohlke's tifffile library.
    The units and the scale of images saved with ImageJ or Digital
    Micrograph is read. There is limited support for reading the scale of
    files created with Zeiss and FEI SEMs.

    Parameters
    ----------
    filename: str
    record_by: {'image'}
        Has no effect because this format only supports recording by
        image.
    force_read_resolution: Bool
        Default: False.
        Force reading the x_resolution, y_resolution and the resolution_unit
        of the tiff tags.
        See http://www.awaresystems.be/imaging/tiff/tifftags/resolutionunit.html
    **kwds, optional
    """

    _logger.debug('************* Loading *************')
    # For testing the use of local and skimage tifffile library

    lazy = kwds.pop('lazy', False)
    memmap = kwds.pop('memmap', False)
    with TiffFile(filename, **kwds) as tiff:

        # change in the Tifffiles API
        if hasattr(tiff.series[0], 'axes'):
            # in newer version the axes is an attribute
            axes = tiff.series[0].axes
        else:
            # old version
            axes = tiff.series[0]['axes']
        is_rgb = tiff.is_rgb
        _logger.debug("Is RGB: %s" % is_rgb)
        series = tiff.series[0]
        if hasattr(series, 'shape'):
            shape = series.shape
            dtype = series.dtype
        else:
            shape = series['shape']
            dtype = series['dtype']
        if is_rgb:
            axes = axes[:-1]
            names = ['R', 'G', 'B', 'A']
            lastshape = shape[-1]
            dtype = np.dtype({
                'names': names[:lastshape],
                'formats': [dtype] * lastshape
            })
            shape = shape[:-1]
        op = {}
        for key, tag in tiff[0].tags.items():
            op[key] = tag.value
        names = [axes_label_codes[axis] for axis in axes]

        _logger.debug('Tiff tags list: %s' % op)
        _logger.debug("Photometric: %s" % op['photometric'])
        _logger.debug('is_imagej: {}'.format(tiff[0].is_imagej))

        # workaround for 'palette' photometric, keep only 'X' and 'Y' axes
        sl = None
        if op['photometric'] == 3:
            sl = [0] * len(shape)
            names = []
            for i, axis in enumerate(axes):
                if axis == 'X' or axis == 'Y':
                    sl[i] = slice(None)
                    names.append(axes_label_codes[axis])
                else:
                    axes = axes.replace(axis, '')
            shape = tuple(_sh for _s, _sh in zip(sl, shape)
                          if isinstance(_s, slice))
        _logger.debug("names: {0}".format(names))

        scales = [1.0] * len(names)
        offsets = [0.0] * len(names)
        units = [t.Undefined] * len(names)
        intensity_axis = {}
        try:
            scales_d, units_d, offsets_d, intensity_axis, op = \
                _parse_scale_unit(tiff, op, shape,
                                  force_read_resolution)
            for i, name in enumerate(names):
                if name == 'height':
                    scales[i], units[i] = scales_d['x'], units_d['x']
                    offsets[i] = offsets_d['x']
                elif name == 'width':
                    scales[i], units[i] = scales_d['y'], units_d['y']
                    offsets[i] = offsets_d['y']
                elif name in ['depth', 'image series', 'time']:
                    scales[i], units[i] = scales_d['z'], units_d['z']
                    offsets[i] = offsets_d['z']
        except Exception:
            _logger.info("Scale and units could not be imported")

        axes = [{
            'size': size,
            'name': str(name),
            'scale': scale,
            'offset': offset,
            'units': unit,
        }
                for size, name, scale, offset, unit in zip(
                    shape, names, scales, offsets, units)]

        md = {
            'General': {
                'original_filename': os.path.split(filename)[1]
            },
            'Signal': {
                'signal_type': "",
                'record_by': "image",
            },
        }

        if 'datetime' in op:
            dt = datetime.strptime(_decode_string(op['datetime']),
                                   "%Y:%m:%d %H:%M:%S")
            md['General']['date'] = dt.date().isoformat()
            md['General']['time'] = dt.time().isoformat()
        if 'units' in intensity_axis:
            md['Signal']['quantity'] = intensity_axis['units']
        if 'scale' in intensity_axis and 'offset' in intensity_axis:
            dic = {
                'gain_factor': intensity_axis['scale'],
                'gain_offset': intensity_axis['offset']
            }
            md['Signal']['Noise_properties'] = {'Variance_linear_model': dic}

    data_args = TiffFile, filename, is_rgb, sl
    if lazy:
        from dask import delayed
        from dask.array import from_delayed
        memmap = True
        val = delayed(_load_data, pure=True)(*data_args, memmap=memmap, **kwds)
        dc = from_delayed(val, dtype=dtype, shape=shape)
        # TODO: maybe just pass the memmap from tiffile?
    else:
        dc = _load_data(*data_args, memmap=memmap, **kwds)

    metadata = Metadata(op)
    md.update(metadata.get_additional_metadata())

    return [{
        'data': dc,
        'original_metadata': op,
        'axes': axes,
        'metadata': md,
        'mapping': metadata.mapping,
    }]
Exemple #50
0
 def wrap(data):
     return client.scatter(
         data, broadcast=True) if client else delayed(data,
                                                      pure=True)
import dask

parameter_scores = []

for i in range(4):
    X_train, X_test, y_train, y_test = dask.delayed(train_test_split, nout=4, pure=False)(data.data, data.target)

    for max_df in [0.5, 0.75, 1.0]:
        for ngram_range in [(1, 1), (1, 2)]:
            vect = dask.delayed(CountVectorizer)(max_df=max_df, ngram_range=ngram_range)
            vect = vect.fit(X_train)
            X2_train = vect.transform(X_train)
            X2_test = vect.transform(X_test)
            for norm in ['l1', 'l2']:
                tfidf = dask.delayed(TfidfTransformer)(norm=norm)
                tfidf = tfidf.fit(X2_train)
                X3_train = tfidf.transform(X2_train)
                X3_test = tfidf.transform(X2_test)

                for max_iter in [5]:
                    for alpha in [0.00001, 0.000001]:
                        for penalty in ['l2', 'elasticnet']:
                            clf = dask.delayed(SGDClassifier)(max_iter=max_iter, alpha=alpha, penalty=penalty)
                            clf = clf.fit(X3_train, y_train)

                            score = clf.score(X3_test, y_test)
                            params = {
                                'max_df': max_df,
                                'ngram_range': ngram_range,
                                'norm': norm,
                                'max_iter': max_iter,
Exemple #52
0
def test_worker_task_data(c, s, w):
    x = delayed(2)
    xx = c.persist(x)
    yield _wait(xx)
    assert w.data[x.key] == 2
@delayed
def extract_feature_image(img, feature_type, feature_coord=None):
    """Extract the haar feature for the current image"""
    ii = integral_image(img)
    return haar_like_feature(ii, 0, 0, ii.shape[0], ii.shape[1],
                             feature_type=feature_type,
                             feature_coord=feature_coord)


# To speed up the example, extract only a few of the available feature types
feature_types = ['type-4', 'type-2-x', 'type-2-y']

# Build a computation graph using Dask. This allows the use of multiple
# CPU cores later during the actual computation
X = delayed(extract_feature_image(img, feature_types) for img in images)

# Compute the result
t_start = time()
X = np.array(X.compute(scheduler='threads'))
time_full_feature_comp = time() - t_start

Exemple #54
0
def to_sql(
    df,
    name: str,
    uri: str,
    schema=None,
    if_exists: str = "fail",
    index: bool = True,
    index_label=None,
    chunksize=None,
    dtype=None,
    method=None,
    compute=True,
    parallel=False,
):
    """ Store Dask Dataframe to a SQL table

    An empty table is created based on the "meta" DataFrame (and conforming to the caller's "if_exists" preference), and
    then each block calls pd.DataFrame.to_sql (with `if_exists="append"`).

    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.

    Parameters
    ----------
    name : str
        Name of SQL table.
    uri : string
        Full sqlalchemy URI for the database connection
    schema : str, optional
        Specify the schema (if database flavor supports this). If None, use
        default schema.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a ValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table.

    index : bool, default True
        Write DataFrame index as a column. Uses `index_label` as the column
        name in the table.
    index_label : str or sequence, default None
        Column label for index column(s). If None is given (default) and
        `index` is True, then the index names are used.
        A sequence should be given if the DataFrame uses MultiIndex.
    chunksize : int, optional
        Specify the number of rows in each batch to be written at a time.
        By default, all rows will be written at once.
    dtype : dict or scalar, optional
        Specifying the datatype for columns. If a dictionary is used, the
        keys should be the column names and the values should be the
        SQLAlchemy types or strings for the sqlite3 legacy mode. If a
        scalar is provided, it will be applied to all columns.
    method : {None, 'multi', callable}, optional
        Controls the SQL insertion clause used:

        * None : Uses standard SQL ``INSERT`` clause (one per row).
        * 'multi': Pass multiple values in a single ``INSERT`` clause.
        * callable with signature ``(pd_table, conn, keys, data_iter)``.

        Details and a sample callable implementation can be found in the
        section :ref:`insert method <io.sql.method>`.
    compute : bool, default True
        When true, call dask.compute and perform the load into SQL; otherwise, return a Dask object (or array of
        per-block objects when parallel=True)
    parallel : bool, default False
        When true, have each block append itself to the DB table concurrently. This can result in DB rows being in a
        different order than the source DataFrame's corresponding rows. When false, load each block into the SQL DB in
        sequence.

    Raises
    ------
    ValueError
        When the table already exists and `if_exists` is 'fail' (the
        default).

    See Also
    --------
    read_sql : Read a DataFrame from a table.

    Notes
    -----
    Timezone aware datetime columns will be written as
    ``Timestamp with timezone`` type with SQLAlchemy if supported by the
    database. Otherwise, the datetimes will be stored as timezone unaware
    timestamps local to the original timezone.

    .. versionadded:: 0.24.0

    References
    ----------
    .. [1] https://docs.sqlalchemy.org
    .. [2] https://www.python.org/dev/peps/pep-0249/

    Examples
    --------
    Create a table from scratch with 4 rows.

    >>> import pandas as pd
    >>> df = pd.DataFrame([ {'i':i, 's':str(i)*2 } for i in range(4) ])
    >>> from dask.dataframe import from_pandas
    >>> ddf = from_pandas(df, npartitions=2)
    >>> ddf  # doctest: +SKIP
    Dask DataFrame Structure:
                       i       s
    npartitions=2
    0              int64  object
    2                ...     ...
    3                ...     ...
    Dask Name: from_pandas, 2 tasks

    >>> from dask.utils import tmpfile
    >>> from sqlalchemy import create_engine
    >>> with tmpfile() as f:
    ...     db = 'sqlite:///%s' % f
    ...     ddf.to_sql('test', db)
    ...     engine = create_engine(db, echo=False)
    ...     result = engine.execute("SELECT * FROM test").fetchall()
    >>> result
    [(0, 0, '00'), (1, 1, '11'), (2, 2, '22'), (3, 3, '33')]
    """

    # This is the only argument we add on top of what Pandas supports
    kwargs = dict(
        name=name,
        con=uri,
        schema=schema,
        if_exists=if_exists,
        index=index,
        index_label=index_label,
        chunksize=chunksize,
        dtype=dtype,
    )

    if method:
        if not PANDAS_GT_0240:
            raise NotImplementedError(
                "'method' requires pandas>=0.24.0. You have version %s" % PANDAS_VERSION
            )
        else:
            kwargs["method"] = method

    def make_meta(meta):
        return meta.to_sql(**kwargs)

    make_meta = delayed(make_meta)
    meta_task = make_meta(df._meta)

    # Partitions should always append to the empty table created from `meta` above
    worker_kwargs = dict(kwargs, if_exists="append")

    if parallel:
        # Perform the meta insert, then one task that inserts all blocks concurrently:
        result = [
            _extra_deps(
                d.to_sql,
                extras=meta_task,
                **worker_kwargs,
                dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs)
            )
            for d in df.to_delayed()
        ]
    else:
        # Chain the "meta" insert and each block's insert
        result = []
        last = meta_task
        for d in df.to_delayed():
            result.append(
                _extra_deps(
                    d.to_sql,
                    extras=last,
                    **worker_kwargs,
                    dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs)
                )
            )
            last = result[-1]
    result = dask.delayed(result)

    if compute:
        dask.compute(result)
    else:
        return result
Exemple #55
0
def forecast(R,
             V,
             n_timesteps,
             n_ens_members,
             n_cascade_levels,
             R_thr=None,
             kmperpixel=None,
             timestep=None,
             extrap_method="semilagrangian",
             decomp_method="fft",
             bandpass_filter_method="gaussian",
             noise_method="nonparametric",
             noise_stddev_adj=True,
             ar_order=2,
             vel_pert_method=None,
             conditional=False,
             use_precip_mask=True,
             use_probmatching=True,
             mask_method="incremental",
             callback=None,
             return_output=True,
             seed=None,
             num_workers=None,
             extrap_kwargs={},
             filter_kwargs={},
             noise_kwargs={},
             vel_pert_kwargs={}):
    """Generate a nowcast ensemble by using the Short-Term Ensemble Prediction 
    System (STEPS) method.
    
    Parameters
    ----------
    R : array-like
      Array of shape (ar_order+1,m,n) containing the input precipitation fields 
      ordered by timestamp from oldest to newest. The time steps between the inputs 
      are assumed to be regular, and the inputs are required to have finite values.
    V : array-like
      Array of shape (2,m,n) containing the x- and y-components of the advection 
      field. The velocities are assumed to represent one time step between the 
      inputs. All values are required to be finite.
    n_timesteps : int
      Number of time steps to forecast.
    n_ens_members : int
      The number of ensemble members to generate.
    n_cascade_levels : int
      The number of cascade levels to use.
    
    Other Parameters
    ----------------
    R_thr : float
      Specifies the threshold value for minimum observable precipitation 
      intensity. Must be set if use_probmatching is True or conditional is True.
    kmperpixel : float
      Spatial resolution of the input data (kilometers/pixel). Required if 
      vel_pert_method is not None or mask_method is 'incremental'.
    timestep : float
      Time step of the motion vectors (minutes). Required if vel_pert_method is 
      not None or mask_method is 'incremental'.
    extrap_method : {'semilagrangian'}
      Name of the extrapolation method to use. See the documentation of 
      pysteps.advection.interface.
    decomp_method : {'fft'}
      Name of the cascade decomposition method to use. See the documentation 
      of pysteps.cascade.interface.
    bandpass_filter_method : {'gaussian', 'uniform'}
      Name of the bandpass filter method to use with the cascade decomposition. 
      See the documentation of pysteps.cascade.interface.
    noise_method : {'parametric','nonparametric','ssft','nested'}
      Name of the noise generator to use for perturbating the precipitation 
      field. See the documentation of pysteps.noise.interface.
    noise_stddev_adj : bool
      Optional adjustment for the standard deviations of the noise fields added 
      to each cascade level. See pysteps.noise.utils.compute_noise_stddev_adjs.
    ar_order : int
      The order of the autoregressive model to use. Must be >= 1.
    vel_pert_method : {'bps'}
      Name of the noise generator to use for perturbing the velocity field. See 
      the documentation of pysteps.noise.interface.
    conditional : bool
      If set to True, compute the statistics of the precipitation field 
      conditionally by excluding the areas where the values are below the 
      threshold R_thr.
    use_precip_mask : bool
      If True, set pixels outside precipitation areas to the minimum value of 
      the observed field.
    mask_method : {'obs', 'sprog', 'incremental'}
      The precipitation/no precipitation method to use with mask: 'obs' = apply R_thr
      to the most recently observed precipitation intensity field, 'sprog' = use the
      smoothed forecast field from S-PROG, where the AR(p) model has been applied,
      'incremental' = iteratively buffer the mask with a certain rate (currently
      it is 1 km/min)
    use_probmatching : bool
      If True, apply probability matching to the forecast field in order to 
      preserve the distribution of the most recently observed precipitation 
      field.
    callback : function
      Optional function that is called after computation of each time step of 
      the nowcast. The function takes one argument: a three-dimensional array 
      of shape (n_ens_members,h,w), where h and w are the height and width 
      of the input field R, respectively. This can be used, for instance, 
      writing the outputs into files.
    return_output : bool
      Set to False to disable returning the outputs as numpy arrays. This can 
      save memory if the intermediate results are written to output files using 
      the callback function.
    seed : int
      Optional seed number for the random generators.
    num_workers : int
      The number of workers to use for parallel computation. Set to None to use 
      all available CPUs. Applicable if dask is enabled.
    extrap_kwargs : dict
      Optional dictionary that is supplied as keyword arguments to the 
      extrapolation method.
    filter_kwargs : dict
      Optional dictionary that is supplied as keyword arguments to the 
      filter method.
    noise_kwargs : dict
      Optional dictionary that is supplied as keyword arguments to the 
      initializer of the noise generator.
    vel_pert_kwargs : dict
      Optional dictionary that is supplied as keyword arguments to the 
      initializer of the velocity perturbator.
    
    Returns
    -------
    out : ndarray
      If return_output is True, a four-dimensional array of shape 
      (n_ens_members,n_timesteps,m,n) containing a time series of forecast 
      precipitation fields for each ensemble member. Otherwise, a None value 
      is returned.
    
    See also
    --------
    pysteps.advection.interface, pysteps.cascade.interface, 
    pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs
    
    References
    ----------
    :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013`
    
    """
    _check_inputs(R, V, ar_order)

    if np.any(~np.isfinite(R)):
        raise ValueError("R contains non-finite values")

    if np.any(~np.isfinite(V)):
        raise ValueError("V contains non-finite values")

    if mask_method not in ["obs", "sprog", "incremental"]:
        raise ValueError(
            "unknown mask method %s: must be 'obs', 'sprog' or 'incremental'" %
            mask_method)

    if conditional and R_thr is None:
        raise Exception("conditional=True but R_thr is not set")

    if use_probmatching and R_thr is None:
        raise Exception("use_probmatching=True but R_thr is not set")

    if kmperpixel is None:
        if vel_pert_method is not None:
            raise Exception("vel_pert_method is set but kmperpixel=None")
        if mask_method == "incremental":
            raise Exception("mask_method='incremental' but kmperpixel=None")

    if timestep is None:
        if vel_pert_method is not None:
            raise Exception("vel_pert_method is set but timestep=None")
        if mask_method == "incremental":
            raise Exception("mask_method='incremental' but timestep=None")

    print("Computing STEPS nowcast:")
    print("------------------------")
    print("")

    print("Inputs:")
    print("-------")
    print("input dimensions: %dx%d" % (R.shape[1], R.shape[2]))
    if kmperpixel is not None:
        print("km/pixel:         %g" % kmperpixel)
    if timestep is not None:
        print("time step:        %d minutes" % timestep)
    print("")

    print("Methods:")
    print("--------")
    print("extrapolation:          %s" % extrap_method)
    print("bandpass filter:        %s" % bandpass_filter_method)
    print("decomposition:          %s" % decomp_method)
    print("noise generator:        %s" % noise_method)
    print("noise adjustment:       %s" % ("yes" if noise_stddev_adj else "no"))
    print("velocity perturbator:   %s" % vel_pert_method)
    print("conditional statistics: %s" % ("yes" if conditional else "no"))
    print("precipitation mask:     %s" % ("yes" if use_precip_mask else "no"))
    print("mask method:            %s" % mask_method)
    print("probability matching:   %s" % ("yes" if use_probmatching else "no"))
    print("")

    print("Parameters:")
    print("-----------")
    print("number of time steps:     %d" % n_timesteps)
    print("ensemble size:            %d" % n_ens_members)
    print("number of cascade levels: %d" % n_cascade_levels)
    print("order of the AR(p) model: %d" % ar_order)
    if vel_pert_method is not None:
        vp_par = vel_pert_kwargs["p_pert_par"]
        vp_perp = vel_pert_kwargs["p_pert_perp"]
        print("velocity perturbations, parallel:      %g,%g,%g" % \
            (vp_par[0],  vp_par[1],  vp_par[2]))
        print("velocity perturbations, perpendicular: %g,%g,%g" % \
            (vp_perp[0], vp_perp[1], vp_perp[2]))

    if conditional or use_probmatching:
        print("conditional precip. intensity threshold: %g" % R_thr)

    M, N = R.shape[1:]
    extrap_method = advection.get_method(extrap_method)
    R = R[-(ar_order + 1):, :, :].copy()

    if conditional or use_probmatching:
        MASK_thr = np.logical_and.reduce(
            [R[i, :, :] >= R_thr for i in range(R.shape[0])])
    else:
        MASK_thr = None

    # advect the previous precipitation fields to the same position with the
    # most recent one (i.e. transform them into the Lagrangian coordinates)
    extrap_kwargs = extrap_kwargs.copy()
    res = []
    f = lambda R, i: extrap_method(R[i, :, :], V, ar_order - i, "min", **
                                   extrap_kwargs)[-1]
    for i in range(ar_order):
        if not dask_imported:
            R[i, :, :] = f(R, i)
        else:
            res.append(dask.delayed(f)(R, i))

    if dask_imported:
        R = np.stack(
            list(dask.compute(*res, num_workers=num_workers)) + [R[-1, :, :]])

    # initialize the band-pass filter
    filter_method = cascade.get_method(bandpass_filter_method)
    filter = filter_method((M, N), n_cascade_levels, **filter_kwargs)

    # compute the cascade decompositions of the input precipitation fields
    decomp_method = cascade.get_method(decomp_method)
    R_d = []
    for i in range(ar_order + 1):
        R_ = decomp_method(R[i, :, :], filter, MASK=MASK_thr)
        R_d.append(R_)

    # normalize the cascades and rearrange them into a four-dimensional array
    # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model
    R_c, mu, sigma = _stack_cascades(R_d, n_cascade_levels)
    R_d = None

    # compute lag-l temporal autocorrelation coefficients for each cascade level
    GAMMA = np.empty((n_cascade_levels, ar_order))
    for i in range(n_cascade_levels):
        R_c_ = np.stack([R_c[i, j, :, :] for j in range(ar_order + 1)])
        GAMMA[i, :] = correlation.temporal_autocorrelation(R_c_, MASK=MASK_thr)
    R_c_ = None

    _print_corrcoefs(GAMMA)

    if ar_order == 2:
        # adjust the lag-2 correlation coefficient to ensure that the AR(p)
        # process is stationary
        for i in range(n_cascade_levels):
            GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef(
                GAMMA[i, 0], GAMMA[i, 1])

    # estimate the parameters of the AR(p) model from the autocorrelation
    # coefficients
    PHI = np.empty((n_cascade_levels, ar_order + 1))
    for i in range(n_cascade_levels):
        PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :])

    _print_ar_params(PHI, False)

    # discard all except the p-1 last cascades because they are not needed for
    # the AR(p) model
    R_c = R_c[:, -ar_order:, :, :]

    # stack the cascades into a five-dimensional array containing all ensemble
    # members
    R_c = np.stack([R_c.copy() for i in range(n_ens_members)])

    # initialize the random generators
    if noise_method is not None:
        randgen_prec = []
        randgen_motion = []
        np.random.seed(seed)
        for j in range(n_ens_members):
            rs = np.random.RandomState(seed)
            randgen_prec.append(rs)
            seed = rs.randint(0, high=1e9)
            rs = np.random.RandomState(seed)
            randgen_motion.append(rs)
            seed = rs.randint(0, high=1e9)

    R_min = np.min(R)

    if noise_method is not None:
        # get methods for perturbations
        init_noise, generate_noise = noise.get_method(noise_method)

        # initialize the perturbation generator for the precipitation field
        pp = init_noise(R[-1, :, :], **noise_kwargs)

        if noise_stddev_adj:
            print("Computing noise adjustment factors... ", end="")
            sys.stdout.flush()
            starttime = time.time()

            noise_std_coeffs = noise.utils.compute_noise_stddev_adjs(
                R[-1, :, :],
                R_thr,
                R_min,
                filter,
                decomp_method,
                10,
                conditional=True,
                num_workers=num_workers)

            print("%.2f seconds." % (time.time() - starttime))
        else:
            noise_std_coeffs = np.ones(n_cascade_levels)

    if vel_pert_method is not None:
        init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method)

        # initialize the perturbation generators for the motion field
        vps = []
        for j in range(n_ens_members):
            kwargs = {
                "randstate": randgen_motion[j],
                "p_pert_par": vp_par,
                "p_pert_perp": vp_perp
            }
            vp_ = init_vel_noise(V, 1. / kmperpixel, timestep, **kwargs)
            vps.append(vp_)

    D = [None for j in range(n_ens_members)]
    R_f = [[] for j in range(n_ens_members)]

    if use_precip_mask:
        if mask_method == "obs":
            MASK_prec = R[-1, :, :] >= R_thr
            # add a slight buffer to the mask
            # n=5
            # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (n,n))
            # MASK_prec = MASK_prec.astype('uint8')
            # MASK_prec = cv2.dilate(MASK_prec,kernel).astype(bool)

        elif mask_method == "sprog":
            # compute the wet area ratio and the precipitation mask
            MASK_prec = R[-1, :, :] >= R_thr
            war = 1.0 * np.sum(MASK_prec) / (R.shape[1] * R.shape[2])
            R_m = R_c.copy()
        elif mask_method == "incremental":
            # initialize precip mask for each member
            MASK_prec_ = R[-1, :, :] >= R_thr
            MASK_prec = [MASK_prec_.copy() for j in range(n_ens_members)]
            # initialize the structuring element
            struct = scipy.ndimage.generate_binary_structure(2, 1)
            # iterate it to expand it nxn
            n = timestep / kmperpixel
            struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.))

    R = R[-1, :, :]

    print("Starting nowcast computation.")

    # iterate each time step
    for t in range(n_timesteps):
        print("Computing nowcast for time step %d... " % (t + 1), end="")
        sys.stdout.flush()
        starttime = time.time()

        # iterate each ensemble member
        def worker(j):
            if noise_method is not None:
                # generate noise field
                EPS = generate_noise(pp, randstate=randgen_prec[j])
                # decompose the noise field into a cascade
                EPS = decomp_method(EPS, filter)
            else:
                EPS = None

            # iterate the AR(p) model for each cascade level
            for i in range(n_cascade_levels):
                # normalize the noise cascade
                if EPS is not None:
                    EPS_ = (EPS["cascade_levels"][i, :, :] -
                            EPS["means"][i]) / EPS["stds"][i]
                    EPS_ *= noise_std_coeffs[i]
                else:
                    EPS_ = None
                # apply AR(p) process to cascade level
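                # (reference note, an assumption about iterate_ar_model: with
                # ar_order=2 the update has the form
                #   R_t = PHI[i, 0]*R_{t-1} + PHI[i, 1]*R_{t-2} + PHI[i, 2]*EPS_,
                # i.e. the last PHI entry scales the noise innovation.)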
                R_c[j, i, :, :, :] = \
                    autoregression.iterate_ar_model(R_c[j, i, :, :, :],
                                                    PHI[i, :], EPS=EPS_)
                if use_precip_mask and mask_method == "sprog":
                    # use a separate AR(p) model for the non-perturbed forecast,
                    # from which the mask is obtained
                    R_m[j, i, :, :, :] = \
                        autoregression.iterate_ar_model(R_m[j, i, :, :, :], PHI[i, :])

            EPS = None
            EPS_ = None

            # compute the recomposed precipitation field(s) from the cascades
            # obtained from the AR(p) model(s)
            R_c_ = _recompose_cascade(R_c[j, :, :, :], mu, sigma)

            if use_precip_mask:
                # apply the precipitation mask to prevent generation of new
                # precipitation into areas where it was not originally
                # observed
                if mask_method == "obs":
                    R_c_[~MASK_prec] = R_c_.min()
                elif mask_method == "incremental":
                    R_c_[~MASK_prec[j]] = R_c_.min()
                elif mask_method == "sprog":
                    # obtain the CDF from the non-perturbed forecast that is
                    # scale-filtered by the AR(p) model
                    R_m_ = _recompose_cascade(R_m[j, :, :, :], mu, sigma)
                    R_s = R_m_.flatten()

                    # compute the threshold value R_pct_thr corresponding to the
                    # same fraction of precipitation pixels (forecast values above
                    # R_min) as in the most recently observed precipitation field
                    R_s.sort(kind="quicksort")
                    x = 1.0 * np.arange(1, len(R_s) + 1)[::-1] / len(R_s)
                    i = np.argmin(abs(x - war))
                    # handle ties
                    if R_s[i] == R_s[i + 1]:
                        i = np.where(R_s == R_s[i])[0][-1] + 1
                    R_pct_thr = R_s[i]

                    # apply a mask obtained from the above to preserve the
                    # wet-area ratio
                    MASK_prec_ = R_m_ < R_pct_thr
                    R_c_[MASK_prec_] = R_c_.min()

            if use_probmatching:
                # adjust the conditional CDF of the forecast (precipitation
                # intensity above the threshold R_thr) to match the most
                # recently observed precipitation field
                R_c_ = probmatching.nonparam_match_empirical_cdf(R_c_, R)

            if use_precip_mask and mask_method == "incremental":
                MASK_prec_ = R_c_ >= R_thr
                MASK_prec_ = scipy.ndimage.morphology.binary_dilation(
                    MASK_prec_, struct)
                MASK_prec[j] = MASK_prec_

            # compute the perturbed motion field
            if vel_pert_method is not None:
                V_ = V + generate_vel_noise(vps[j], t * timestep)
            else:
                V_ = V

            # advect the recomposed precipitation field to obtain the forecast
            # for time step t
            extrap_kwargs.update({"D_prev": D[j], "return_displacement": True})
            R_f_, D_ = extrap_method(R_c_, V_, 1, **extrap_kwargs)
            D[j] = D_
            R_f_ = R_f_[0]

            return R_f_

        res = []
        for j in range(n_ens_members):
            if not dask_imported or n_ens_members == 1:
                res.append(worker(j))
            else:
                res.append(dask.delayed(worker)(j))

        if dask_imported and n_ens_members > 1:
            R_f_ = dask.compute(*res, num_workers=num_workers)
        else:
            R_f_ = res
        res = None

        print("%.2f seconds." % (time.time() - starttime))

        if callback is not None:
            callback(np.stack(R_f_))

        if return_output:
            for j in range(n_ens_members):
                R_f[j].append(R_f_[j])

        R_f_ = None

    if return_output:
        if n_ens_members == 1:
            return np.stack(R_f[0])
        else:
            return np.stack([np.stack(R_f[j]) for j in range(n_ens_members)])
    else:
        return None
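
The per-member parallelism above is the core dask.delayed pattern of this example: one task per ensemble member, gathered with a single dask.compute call. A minimal, self-contained sketch of just that pattern, where simulate_member is a hypothetical stand-in for the worker closure:

import numpy as np
import dask


def simulate_member(j, shape=(64, 64), seed=42):
    # hypothetical stand-in for the per-member worker closure above
    rs = np.random.RandomState(seed + j)
    return rs.standard_normal(shape)


n_ens_members = 4
tasks = [dask.delayed(simulate_member)(j) for j in range(n_ens_members)]
fields = dask.compute(*tasks, num_workers=2)  # tuple of n_ens_members arrays
forecast = np.stack(fields)                   # shape (n_ens_members, 64, 64)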
Exemple #56
0
    def _predict_kwds_for_cat(
        self,
        feature_matrix: np.ndarray,
        cat_index: int,
        predictions: np.ndarray,
        cat_indices: Dict[str, List[int]] = None,
        use_dask: bool = True,
        pbar: tqdm = None,
    ):
        """
        Make predictions for all documents for all concepts
            in the given category

        Args:
            feature_matrix: array of features for each document
            cat_index: index in categories attribute of the given category
            predictions: the h5 dataset where predictions are stored
            cat_indices: Predicted indices where categories occur
                for each category
            use_dask: Use dask for multiprocessing
            pbar: tqdm progress bar
        """
        cat = self.categories[cat_index]
        pbar.set_postfix(category=cat, refresh=False)
        if (cat_indices is not None) and (cat != ""):
            feature_matrix_test = feature_matrix[cat_indices[cat], :]
            # this could be a problem if I want everything to perfectly align.
        else:
            feature_matrix_test = feature_matrix
        if feature_matrix_test.shape[0] == 0:
            pbar.update(len(self.cat_concept_indices[cat_index]))
            return 0
        # TODO: for good bar, should walk tasks to compute total
        cat_concept_cols = self.cat_concept_indices[cat_index]
        # use the np.where here, bool index for initial setting?
        if False:  # dask branch disabled; restore `use_dask` here to re-enable it
            feature_matrix_test = dask.delayed(feature_matrix_test)
            jobs = []
            ProgressBar().register()
            for concept_index in cat_concept_cols:
                j = dask.delayed(self._predict_one_clf)(feature_matrix_test,
                                                        concept_index, cat,
                                                        pbar)
                jobs.append(j)
            vals = dask.compute(jobs)[0]
        else:
            vals = []
            for concept_index in cat_concept_cols:
                val = self._predict_one_clf(feature_matrix_test, concept_index,
                                            cat, pbar)
                vals.append(val)
        if (cat_indices is not None) and (cat != ""):
            # need to correct indices, zeros in places with no predictions
            # TODO: determine if this patching activity
            #  takes longer than just predicting on more
            new_vals = []
            for v in vals:
                new_v = np.zeros(feature_matrix.shape[0])
                new_v[cat_indices[cat]] = v
                new_vals.append(new_v)
            vals = new_vals
        # TODO: below will not work with cat_inds
        if len(vals) > 0:
            topic_preds_sub = np.stack(vals, axis=1)
            predictions[cat_index, :, cat_concept_cols] = topic_preds_sub
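
The disabled dask branch above follows a common pattern: wrap the shared feature matrix with dask.delayed once, so it enters the task graph a single time, then create one delayed prediction task per concept. A rough sketch under that assumption, with predict_one standing in for self._predict_one_clf:

import numpy as np
import dask


def predict_one(features, concept_index):
    # hypothetical stand-in for a single classifier's predict call
    return features[:, concept_index % features.shape[1]] > 0.5


features = np.random.rand(1000, 20)
shared = dask.delayed(features)               # embed the array in the graph once
jobs = [dask.delayed(predict_one)(shared, i) for i in range(50)]
vals = dask.compute(jobs)[0]                  # list of 50 prediction vectors
preds = np.stack(vals, axis=1)                # shape (1000, 50)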
Exemple #57
0
def return_gsea_capsules(ma=None,
                         tissue='',
                         context_on=False,
                         use_set=False,
                         gsea_superset='H',
                         n_top_sets=25,
                         min_capsule_len=2000,
                         all_genes=False,
                         union_cpgs=True,
                         limited_capsule_names_file=''):
    global gene2cpg, gsea_collections, gene_set_weights
    if limited_capsule_names_file:
        with open(limited_capsule_names_file) as f:
            limited_capsule_names = f.read().replace('\n', ' ').split()
    else:
        limited_capsule_names = []
    allcpgs = ma.beta.columns.values
    entire_sets = use_set
    collection = gsea_superset
    gene2cpg = pickle.load(open(gene2cpg, 'rb'))
    if all_genes:
        gene_sets = list(gene2cpg.keys())
    else:
        gsea = pickle.load(open(gsea_collections, 'rb'))
        if tissue:
            gene_sets = pd.read_csv(gene_set_weights[collection],
                                    sep='\t',
                                    index_col=0)
            if tissue != 'ubiquitous':
                gene_sets = (gene_sets.quantile(1., axis=1) -
                             gene_sets.quantile(
                                 0., axis=1)).sort_values().index.tolist()
            else:
                gene_sets = gene_sets[tissue].sort_values(
                    ascending=False).index.tolist()
        else:
            gene_sets = list(gsea[collection].keys())
    intersect_context = False
    if limited_capsule_names_file:
        gene_sets_tmp = np.intersect1d(gene_sets,
                                       limited_capsule_names).tolist()
        print('LIMITED GENE CAPS', gene_sets_tmp)
        if gene_sets_tmp:
            gene_sets = gene_sets_tmp
            intersect_context = True
    if not tissue:
        n_top_sets = 0
    if n_top_sets and not all_genes:
        gene_sets = gene_sets[:n_top_sets]

    capsules = dict()
    if all_genes:
        entire_sets = False
    if entire_sets:
        context_on = False

    def process_gene_set(gene_set):
        capsules = []
        gene_set_cpgs = []
        for genename in (gsea[collection][gene_set]
                         if not all_genes else [gene_set]):
            gene = gene2cpg.get(genename, {'Gene': [], 'Upstream': []})
            if context_on:
                for k in ['Gene', 'Upstream']:
                    context = gene.get(k, [])
                    if len(context):
                        capsules.append(('{}_{}'.format(genename,
                                                        k), list(context)))
                        #capsules['{}_{}'.format(genename,k)]=context.tolist()
            else:
                if not entire_sets:
                    capsules.append((genename,
                                     np.union1d(gene.get('Gene', []),
                                                gene.get('Upstream',
                                                         [])).tolist()))
                    #capsules[genename]=np.union1d(gene.get('Gene',[]),gene.get('Upstream',[])).tolist()
                else:
                    upstream = gene.get('Upstream', [])
                    gene = gene.get('Gene', [])
                    cpg_set = np.union1d(gene, upstream)
                    if cpg_set.tolist():
                        gene_set_cpgs.append(cpg_set)
        if entire_sets and not all_genes:
            capsules.append((gene_set, reduce(np.union1d,
                                              gene_set_cpgs).tolist()))
            #capsules[gene_set]=reduce(np.union1d,gene_set_cpgs).tolist()
        return capsules

    def process_chunk(chunk):
        with ProgressBar():
            chunk = dask.compute(*chunk, scheduler='threading')
        return chunk

    with ProgressBar():
        capsules = dict(
            list(
                reduce(
                    lambda x, y: x + y,
                    dask.compute(*[
                        dask.delayed(process_gene_set)(gene_set)
                        for gene_set in gene_sets
                    ],
                                 scheduler='threading'))))

    capsule_names = list(capsules.keys())

    if intersect_context:
        capsules_tmp_names = np.intersect1d(capsule_names,
                                            limited_capsule_names).tolist()
        if capsules_tmp_names:
            capsules = {k: capsules[k] for k in capsules_tmp_names}
            capsule_names = capsules_tmp_names

    capsules = reduce_caps(capsules, allcpgs, min_capsule_len)

    modules = list(capsules.values())
    modulecpgs = reduce((np.union1d if union_cpgs else (lambda x, y: x + y)),
                        modules).tolist()
    module_names = list(capsules.keys())

    return modules, modulecpgs, module_names
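
The capsule-building step above boils down to one delayed task per gene set, computed on the threading scheduler, with the returned (name, cpg_list) pairs flattened into a dict. A reduced sketch of that pattern, using a hypothetical lookup_cpgs in place of process_gene_set:

from functools import reduce

import dask
from dask.diagnostics import ProgressBar


def lookup_cpgs(gene_set):
    # hypothetical stand-in: each task returns (capsule_name, cpg_list) pairs
    return [(gene_set + '_Gene', ['cg0001', 'cg0002']),
            (gene_set + '_Upstream', ['cg0003'])]


gene_sets = ['SET_A', 'SET_B']
tasks = [dask.delayed(lookup_cpgs)(gs) for gs in gene_sets]
with ProgressBar():
    pairs = reduce(lambda x, y: x + y,
                   dask.compute(*tasks, scheduler='threading'))
capsules = dict(pairs)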
Exemple #58
0
def plot_paths_to_files_delayed(G, paths, orig_points, dest_points, boundaries,
                                dirpath):
    assert os.path.exists(dirpath)

    figures = []
    for n in tqdm(range(len(paths))):

        f, ax = plt.subplots(figsize=(30, 30))

        delayed(boundaries.plot)(edgecolor="red", ax=ax)

        delayed(plot_graph)(G, ax)

        x, y = zip(*paths[n][1])
        delayed(ax.plot)(x, y, c="k", lw=20, alpha=0.5)
        delayed(ax.scatter)(orig_points.iloc[n].x,
                            orig_points.iloc[n].y,
                            color="green",
                            s=500)
        delayed(ax.scatter)(x[0], y[0], color="red", s=500)
        delayed(ax.scatter)(x[-1], y[-1], color="green", s=500)
        delayed(ax.scatter)(dest_points.x, dest_points.y, color="k", s=250)

        figures.append(delayed(f.savefig)(f"{dirpath}/{n}.png"))
        # wrap the callables rather than calling them eagerly, otherwise the
        # figure is cleared and closed before the delayed savefig ever runs
        delayed(f.clf)()
        delayed(plt.close)(f)

    with ProgressBar():
        compute(*figures)
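
Note that only the savefig tasks above are part of the computed graph; the other delayed drawing calls are discarded. One way to keep every drawing step lazy is to build and save a whole figure inside a single delayed function, as in this sketch (render_path and the toy path data are assumptions, not part of the original code):

from dask import compute, delayed
from matplotlib.figure import Figure


def render_path(n, path_xy, dirpath):
    # draw and save one complete figure per task, using the object-oriented
    # Matplotlib API so no global pyplot state is shared between tasks
    f = Figure(figsize=(10, 10))
    ax = f.subplots()
    x, y = zip(*path_xy)
    ax.plot(x, y, c='k', lw=2, alpha=0.5)
    ax.scatter(x[0], y[0], color='red', s=50)
    ax.scatter(x[-1], y[-1], color='green', s=50)
    out = f'{dirpath}/{n}.png'
    f.savefig(out)
    return out


toy_paths = [[(0, 0), (1, 1), (2, 1)], [(0, 1), (1, 0)]]
tasks = [delayed(render_path)(n, p, '.') for n, p in enumerate(toy_paths)]
compute(*tasks)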
Exemple #59
0
    def add_data(self,
                 dates,
                 box=None,
                 country=None,
                 state=None,
                 site=None,
                 resample=True,
                 window='H'):
        """
        dates : list of datetime objects
            Dates for which to retrieve ISH observations.
        box : list of floats
            Bounding box given as [latmin, lonmin, latmax, lonmax].
        country : str
            Country code used to subset the station history.
        state : str
            State used to subset the station history.
        site : str
            Single station_id to retrieve.
        resample : boolean
            If True, resample the observations to the given window.
        window : str
            Pandas resampling frequency, e.g. 'H' for hourly.
        """
        from numpy import nan
        self.dates = pd.to_datetime(dates)
        idate = dates[0]
        year = idate.strftime('%Y')
        url = 'https://www1.ncdc.noaa.gov/pub/data/noaa/' + year + '/'
        if self.history is None:
            self.read_ish_history()
        self.history['fname'] = url + self.history.usaf + \
            '-' + self.history.wban + '-' + year + '.gz'
        dfloc = self.history.copy()
        if box is not None:
            print('Retrieving Sites in: ' + ' '.join(map(str, box)))
            dfloc = self.subset_sites(latmin=box[0],
                                      lonmin=box[1],
                                      latmax=box[2],
                                      lonmax=box[3])
        elif country is not None:
            print('Retrieving Country: ' + country)
            dfloc = self.history.loc[self.history.ctry == country, :]
        elif state is not None:
            print('Retrieving State: ' + state)
            dfloc = self.history.loc[self.history.STATE == state, :]
        elif site is not None:
            print('Retrieving Site: ' + site)
            dfloc = self.history.loc[self.history.station_id == site, :]
        print(dfloc.fname.unique())
        objs = self.get_url_file_objs(dfloc.fname.unique())

        print('  Reading ISH into pandas DataFrame...')
        dfs = [dask.delayed(self.read_data_frame)(f) for f in objs]
        dff = dd.from_delayed(dfs)
        self.df = dff.compute()
        self.df.loc[self.df.vsb == 99999, 'vsb'] = nan
        if resample:
            print('  Resampling to every ' + window)
            self.df.index = self.df.time
            self.df = self.df.groupby('station_id').resample(
                window).mean().reset_index()
        # station_id may be a bytes literal here while dfloc stores it as str,
        # which would break the merge on station_id below, so decode it first.
        try:
            self.df['station_id'] = self.df['station_id'].str.decode("utf-8")
        except RuntimeError:
            pass
        self.df = self.df.merge(
            dfloc[['station_id', 'latitude', 'longitude', 'station name']],
            on=['station_id'],
            how='left')

        return self.df.copy()
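
The reader above illustrates the delayed-then-dataframe pattern: build one delayed pandas DataFrame per remote file and hand the list to dask.dataframe.from_delayed before computing. A minimal sketch of the same idea with hypothetical local CSV paths in place of the NOAA file objects:

import dask
import dask.dataframe as dd
import pandas as pd


def read_one(path):
    # hypothetical reader standing in for self.read_data_frame
    return pd.read_csv(path, parse_dates=['time'])


paths = ['ish_part1.csv', 'ish_part2.csv']  # hypothetical input files
parts = [dask.delayed(read_one)(p) for p in paths]
ddf = dd.from_delayed(parts)                # lazy frame over all parts
df = ddf.compute()                          # single concatenated pandas DataFrame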
Exemple #60
0
        image = images[0]
        image = draw_haar_like_feature(image, 0, 0, images.shape[2],
                                       images.shape[1],
                                       [feature_coord[idx_sorted[idx]]])
        ax.imshow(image)
        ax.set_xticks([])
        ax.set_yticks([])

    _ = fig.suptitle('The most important features')


images = lfw_subset()

feature_types = None

X = delayed(extract_feature_image(img, feature_types) for img in images)

X = np.array(X.compute(scheduler='threads'))

y = np.array([1] * 100 + [0] * 100)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=150,
                                                    random_state=0,
                                                    stratify=y)


feature_coord, feature_type = \
    haar_like_feature_coord(width=images.shape[2], height=images.shape[1],
                            feature_type=feature_types)
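
For comparison, the lazy feature-extraction step earlier in this example can also be written as an explicit list of delayed calls; fake_extract below is a hypothetical stand-in for extract_feature_image so the sketch runs on its own:

import numpy as np
from dask import compute, delayed


def fake_extract(img, feature_type=None):
    # hypothetical stand-in: flatten the image instead of computing Haar features
    return img.ravel()


toy_images = np.random.rand(20, 19, 19)
tasks = [delayed(fake_extract)(img, None) for img in toy_images]
X_toy = np.array(compute(*tasks, scheduler='threads'))  # shape (20, 361)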