def test_no_worker_to_memory_restrictions(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)
    yy, zz = c.persist([y, z], workers={(x, y, z): 'alice'})

    while not s.tasks:
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1, name='alice')
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    while len(s.workers) < 3:
        yield gen.sleep(0.01)
    yield gen.sleep(0.3)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y

    x2, y2, z2, constant = optimize(x, y, z, 1)
    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.config.set(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)
def test_no_workers_to_memory(c, s):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)
    yy, zz = c.persist([y, z])

    while not s.tasks:
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1)
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    start = time()
    while not s.workers:
        yield gen.sleep(0.01)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
def test_worker_arrives_with_processing_data(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.4)
    y = delayed(slowinc)(x, delay=0.4)
    z = delayed(slowinc)(y, delay=0.4)
    yy, zz = c.persist([y, z])

    while not any(w.processing for w in s.workers.values()):
        yield gen.sleep(0.01)

    w = Worker(s.ip, s.port, ncores=1)
    w.put_key_in_memory(y.key, 3)

    yield w._start()

    start = time()
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    assert s.get_task_status(keys={x.key, y.key, z.key}) == {
        x.key: 'released',
        y.key: 'memory',
        z.key: 'processing',
    }

    yield w._close()
def counts_by_origin():
    frames = []
    # For each file
    for f in sorted(glob.glob('data/*.csv')):
        # Load the dataframe
        df = delayed(pd.read_csv)(f, parse_dates={'Date': [0, 1, 2]},
                                  infer_datetime_format=True)
        # Store in list of frames
        frames.append(df)

    # Concatenate all the frames together
    df = delayed(pd.concat)(frames)

    # Resample by month
    by_month = (df.resample('MS', on='Date')
                  .Origin.value_counts()
                  .unstack())

    # Resample by year
    by_year = (df.resample('AS', on='Date')
                 .Origin.value_counts()
                 .unstack())

    return by_month, by_year
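# Illustrative usage sketch, not part of the original source. It assumes the
# 'data/*.csv' files read by counts_by_origin() exist and that glob, pandas
# (pd), and delayed are imported as above. Computing both delayed results in
# one dask.compute call lets dask share the common tasks, so each CSV file is
# only read once.
import dask

by_month, by_year = counts_by_origin()
monthly_counts, yearly_counts = dask.compute(by_month, by_year)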
def test_publish_multiple_datasets(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(2)

    yield c.publish_dataset(x=x, y=y)
    datasets = yield c.scheduler.publish_list()
    assert set(datasets) == {'x', 'y'}
def normalize(block):
    old_min = delayed(block.min())
    old_max = delayed(block.max())
    r = delayed(decr)(old_max, old_min)
    minimum = old_min.compute()
    t0 = decr(block, minimum)
    return t0 / r.compute(), -minimum / r.compute()
def test_stress_scatter_death(c, s, *workers):
    import random
    s.allowed_failures = 1000
    np = pytest.importorskip('numpy')
    L = yield c.scatter([np.random.random(10000) for i in range(len(workers))])
    yield c._replicate(L, n=2)

    adds = [delayed(slowadd, pure=True)(random.choice(L),
                                        random.choice(L),
                                        delay=0.05,
                                        dask_key_name='slowadd-1-%d' % i)
            for i in range(50)]

    adds = [delayed(slowadd, pure=True)(a, b, delay=0.02,
                                        dask_key_name='slowadd-2-%d' % i)
            for i, (a, b) in enumerate(sliding_window(2, adds))]

    futures = c.compute(adds)
    L = adds = None

    alive = list(workers)

    from distributed.scheduler import logger

    for i in range(7):
        yield gen.sleep(0.1)
        try:
            s.validate_state()
        except Exception as exc:  # do not shadow the client fixture `c`
            logger.exception(exc)
            if config.get('log-on-err'):
                import pdb
                pdb.set_trace()
            else:
                raise
        w = random.choice(alive)
        yield w._close()
        alive.remove(w)

    try:
        yield gen.with_timeout(timedelta(seconds=25), c._gather(futures))
    except gen.TimeoutError:
        ws = {w.address: w for w in workers if w.status != 'closed'}
        print(s.processing)
        print(ws)
        print(futures)
        try:
            worker = [w for w in ws.values() if w.waiting_for_data][0]
        except Exception:
            pass
        if config.get('log-on-err'):
            import pdb
            pdb.set_trace()
        else:
            raise
    except CancelledError:
        pass
    finally:
        futures = None
def test_expand_persist(c, s, a, b):
    low = delayed(inc)(1, dask_key_name='low')
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]
    high = delayed(inc)(2, dask_key_name='high')

    low, high, x, y, z, w = persist(low, high, *many,
                                    priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
def test_expand_compute(c, s, a, b):
    low = delayed(inc)(1)
    many = [delayed(slowinc)(i, delay=0.1) for i in range(10)]
    high = delayed(inc)(2)

    low, many, high = c.compute([low, many, high],
                                priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
def test_compute(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)
    yy = c.compute(y, resources={x: {'A': 1}, y: {'B': 1}})
    yield _wait(yy)

    assert b.data
def block_candset_excluding_rule(self, c_df, l_df, r_df, l_key, r_key,
                                 fk_ltable, fk_rtable, rule_to_exclude,
                                 show_progress, n_chunks):
    # list to keep track of valid ids
    valid = []

    apply_rules_excluding_rule_pkl = cp.dumps(self.apply_rules_excluding_rule)

    if n_chunks == 1:
        # single process
        valid = _block_candset_excluding_rule_split(
            c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable,
            rule_to_exclude, apply_rules_excluding_rule_pkl, show_progress)
    else:
        # multiprocessing
        c_splits = pd.np.array_split(c_df, n_chunks)

        valid_splits = []
        for i in range(len(c_splits)):
            # use ProgressBar from dask.diagnostics, so set show_progress
            # to False for the per-split calls
            partial_result = delayed(_block_candset_excluding_rule_split)(
                c_splits[i], l_df, r_df, l_key, r_key, fk_ltable, fk_rtable,
                rule_to_exclude, apply_rules_excluding_rule_pkl, False)
            valid_splits.append(partial_result)

        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(
                    scheduler="processes", num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(
                scheduler="processes", num_workers=get_num_cores())

        valid = sum(valid_splits, [])

    # construct output candset
    if len(c_df) > 0:
        candset = c_df[valid]
    else:
        candset = pd.DataFrame(columns=c_df.columns)

    # return candidate set
    return candset
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.set_options(get=dask.get):
        x = delayed(inc)(1).persist()

    yield gen.sleep(0.01)
    assert not s.task_state  # scheduler hasn't done anything

    y = delayed(inc)(2).persist(get=dask.get)

    yield gen.sleep(0.01)
    assert not s.task_state  # scheduler hasn't done anything
def normaltest(a, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")

    s, _ = skewtest(a, axis)
    k, _ = kurtosistest(a, axis)
    k2 = s * s + k * k
    return delayed(NormaltestResult, nout=2)(
        k2, delayed(distributions.chi2.sf)(k2, 2))
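# Illustrative sketch, not part of the original source: `nout=2` tells dask
# that the wrapped callable returns a 2-tuple, so the resulting Delayed can be
# unpacked into two Delayed objects -- the same mechanism normaltest() above
# relies on. The helper below is hypothetical.
from dask import delayed

def _min_max(seq):
    return min(seq), max(seq)

lo, hi = delayed(_min_max, nout=2)([3, 1, 2])
assert lo.compute() == 1 and hi.compute() == 3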
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.config.set(scheduler='sync'):
        x = delayed(inc)(1).persist()

    yield gen.sleep(0.01)
    assert not s.tasks  # scheduler hasn't done anything

    y = delayed(inc)(2).persist(scheduler='sync')

    yield gen.sleep(0.01)
    assert not s.tasks  # scheduler hasn't done anything
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)

    xx, yy = c.persist([x, y], resources={x: {'A': 1}, y: {'B': 1}})

    yield _wait([xx, yy])

    assert x.key in a.data
    assert y.key in b.data
def test_object_in_graph(c, s, a, b):
    o = MyObj(123)
    v = delayed(o)
    v2 = delayed(identity)(v)

    future = c.compute(v2)
    result = yield future._result()

    assert isinstance(result, MyObj)
    assert result.data == 123
def featurize_ts_files(ts_paths, features_to_use, output_path=None,
                       custom_script_path=None, custom_functions=None,
                       scheduler=dask.multiprocessing.get):
    """Feature generation function for on-disk time series (NetCDF) files.

    By default, computes features concurrently using the
    `dask.multiprocessing.get` scheduler. Other possible options include
    `dask.async.get_sync` for synchronous computation (e.g., when debugging),
    or `dask.distributed.Executor.get` for distributed computation.

    In the case of multichannel measurements, each channel will be featurized
    separately, and the data variables of the output `xarray.Dataset` will be
    indexed by a `channel` coordinate.

    Parameters
    ----------
    ts_paths : list of str
        List of paths to time series data, stored in NetCDF format. See
        `time_series.from_netcdf` for details.
    features_to_use : list of str, optional
        List of feature names to be generated. Defaults to an empty list,
        which will result in only meta_features features being stored.
    custom_script_path : str, optional
        Path to Python script containing function definitions for the
        generation of any custom features. Defaults to None.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations. Dictionaries of functions should have keys `feature_name`
        and values functions that take arguments (t, m, e); in the case of a
        dask graph, these arrays should be referenced as 't', 'm', 'e',
        respectively, and any values with keys present in `features_to_use`
        will be computed.
    scheduler : function, optional
        `dask` scheduler function used to perform feature extraction
        computation. Defaults to `dask.multiprocessing.get`.

    Returns
    -------
    xarray.Dataset
        Featureset with `data_vars` containing feature values and `coords`
        containing labels (`name`) and targets (`target`), if applicable.
    """
    all_time_series = [delayed(time_series.from_netcdf, pure=True)(ts_path)
                       for ts_path in ts_paths]
    all_features = [delayed(featurize_single_ts, pure=True)(
                        ts, features_to_use, custom_script_path,
                        custom_functions)
                    for ts in all_time_series]
    result = delayed(assemble_featureset, pure=True)(all_features,
                                                     all_time_series)
    fset = result.compute(get=scheduler)
    if output_path:
        fset.to_netcdf(output_path)
    return fset
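# Illustrative usage sketch, not part of the original source. The file paths
# and feature names are hypothetical; featurize_ts_files() only requires that
# the paths point to NetCDF time series readable by time_series.from_netcdf.
# Any dask get-style scheduler can be passed, e.g. the threaded one.
from dask.threaded import get as threaded_get

fset = featurize_ts_files(
    ts_paths=['ts_0001.nc', 'ts_0002.nc'],      # hypothetical files
    features_to_use=['amplitude', 'maximum'],   # hypothetical feature names
    scheduler=threaded_get,
)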
def test_persist_tuple(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)

    xx, yy = c.persist([x, y], resources={(x, y): {'A': 1}})

    yield _wait([xx, yy])

    assert x.key in a.data
    assert y.key in a.data
    assert not b.data
def test_persist_delayed():
    x1 = delayed(1)
    x2 = delayed(inc)(x1)
    x3 = delayed(inc)(x2)

    xx, = persist(x3)
    assert isinstance(xx, Delayed)
    assert xx.key == x3.key
    assert len(xx.dask) == 1

    assert x3.compute() == xx.compute()
def test_clean_nbytes(c, s, a, b):
    L = [delayed(inc)(i) for i in range(10)]
    for i in range(5):
        L = [delayed(add)(x, y) for x, y in sliding_window(2, L)]
    total = delayed(sum)(L)
    future = c.compute(total)
    yield wait(future)

    yield gen.sleep(1)
    assert len(a.nbytes) + len(b.nbytes) == 1
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield _wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield _wait(y2)
    assert y2.key in a.data or y2.key in b.data
def test_cancel_fire_and_forget(c, s, a, b):
    x = delayed(slowinc)(1, delay=0.05)
    y = delayed(slowinc)(x, delay=0.05)
    z = delayed(slowinc)(y, delay=0.05)
    w = delayed(slowinc)(z, delay=0.05)
    future = c.compute(w)
    fire_and_forget(future)

    yield gen.sleep(0.05)
    yield future.cancel(force=True)
    assert future.status == 'cancelled'
    assert not s.tasks
def test_set_index_sorted_min_max_same():
    a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})
    b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})

    aa = delayed(a)
    bb = delayed(b)

    df = dd.from_delayed([aa, bb], meta=a)
    assert not df.known_divisions

    df2 = df.set_index('y', sorted=True)
    assert df2.divisions == (0, 1, 1)
def test_respect_data_in_memory(c, s, a):
    x = delayed(inc)(1)
    y = delayed(inc)(x)
    f = c.persist(y)
    yield wait([f])

    assert s.tasks[y.key].who_has == {s.workers[a.address]}

    z = delayed(add)(x, y)
    f2 = c.persist(z)
    while f2.key not in s.tasks or not s.tasks[f2.key]:
        assert s.tasks[y.key].who_has
        yield gen.sleep(0.0001)
def test_categorical_empty(self):
    # GH 1705
    def make_empty():
        return pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})

    def make_full():
        return pd.DataFrame({"A": pd.Categorical(['a', 'a'])})

    a = dd.from_delayed([dask.delayed(make_empty)(),
                         dask.delayed(make_full)()])
    # Used to raise an IndexError
    a.A.cat.categories
def test_repeated_persists_same_priority(c, s, w):
    xs = [delayed(slowinc)(i, delay=0.05, dask_key_name='x-%d' % i)
          for i in range(10)]
    ys = [delayed(slowinc)(x, delay=0.05, dask_key_name='y-%d' % i)
          for i, x in enumerate(xs)]
    zs = [delayed(slowdec)(x, delay=0.05, dask_key_name='z-%d' % i)
          for i, x in enumerate(xs)]

    ys = dask.persist(*ys)
    zs = dask.persist(*zs)

    while sum(t.state == 'memory' for t in s.tasks.values()) < 5:  # TODO: reduce this number
        yield gen.sleep(0.01)

    assert any(s.tasks[y.key].state == 'memory' for y in ys)
    assert any(s.tasks[z.key].state == 'memory' for z in zs)
def test_restart_during_computation(c, s, a, b):
    xs = [delayed(slowinc)(i, delay=0.01) for i in range(50)]
    ys = [delayed(slowinc)(i, delay=0.01) for i in xs]
    zs = [delayed(slowadd)(x, y, delay=0.01) for x, y in zip(xs, ys)]
    total = delayed(sum)(zs)
    result = c.compute(total)

    yield gen.sleep(0.5)
    assert s.rprocessing

    yield c._restart()

    assert not s.rprocessing

    assert len(s.ncores) == 2
    assert not s.task_state
def test_delayed_with_dataclass():
    dataclasses = pytest.importorskip("dataclasses")

    # Avoid @dataclass decorator as Python < 3.7 fails to interpret the type hints
    ADataClass = dataclasses.make_dataclass('ADataClass', [('a', int)])

    literal = dask.delayed(3)
    with_class = dask.delayed({"a": ADataClass(a=literal)})

    def return_nested(obj):
        return obj["a"].a

    final = delayed(return_nested)(with_class)

    assert final.compute() == 3
def test_transition_story(c, s, a, b):
    x = delayed(inc)(1)
    y = delayed(inc)(x)
    f = c.persist(y)
    yield _wait([f])

    assert s.transition_log

    story = s.transition_story(x.key)
    assert all(line in s.transition_log for line in story)
    assert len(story) < len(s.transition_log)
    assert all(x.key == line[0] or x.key in line[-2] for line in story)

    assert len(s.transition_story(x.key, y.key)) > len(story)
def inc(x):
    return x + 1


def add(x, y):
    time.sleep(2)
    return x + y


client = Client('10.255.23.115:8786')

data = [1, 2, 3, 4, 5, 6, 7, 8]

results = []

tic = time.perf_counter()

results = []
for x in data:
    y = delayed(inc)(x)
    results.append(y)

total = delayed(sum)(results)
print("Before computing:", total)  # Let's see what type of thing total is
result = total.compute(scheduler='distributed')
print("After computing :", result)  # After it's computed

toc = time.perf_counter()
print(f"Computed the result in {toc - tic:0.4f} seconds")

total.visualize()
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts', preprocess=None, engine=None,
                   lock=None, data_vars='all', coords='different',
                   autoclose=False, parallel=False, **kwargs):
    """Open multiple files as a single dataset.

    Requires dask to be installed. See documentation for details on dask [1].
    Attributes from the first dataset file are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open. Paths can be given as strings or as
        pathlib Paths.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at once.
        This has a major impact on performance: please see the full
        documentation for more details [2].
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You
        only need to provide this argument if the dimension along which you
        want to concatenate is not a dimension in the original datasets,
        e.g., if you want to stack a collection of 2D arrays along a third
        dimension. By default, xarray attempts to infer this argument by
        examining component files. Set ``concat_dim=None`` explicitly to
        disable concatenation.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default engine
        is chosen based on available dependencies, with a preference for
        'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid OS Error of too many
        files being open. However, this option doesn't work with streams,
        e.g., BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    data_vars : {'minimal', 'different', 'all' or list of str}, optional
        These data variables will be concatenated together:

        * 'minimal': Only data variables in which the dimension already
          appears are included.
        * 'different': Data variables which are not equal (ignoring
          attributes) across all datasets are also concatenated (as well as
          all for which dimension already appears). Beware: this option may
          load the data payload of data variables into memory if they are not
          already loaded.
        * 'all': All data variables will be concatenated.
        * list of str: The listed data variables will be concatenated, in
          addition to the 'minimal' data variables.
    coords : {'minimal', 'different', 'all' or list of str}, optional
        These coordinate variables will be concatenated together:

        * 'minimal': Only coordinates in which the dimension already appears
          are included.
        * 'different': Coordinates which are not equal (ignoring attributes)
          across all datasets are also concatenated (as well as all for which
          dimension already appears). Beware: this option may load the data
          payload of coordinate variables into memory if they are not already
          loaded.
        * 'all': All coordinate variables will be concatenated, except those
          corresponding to other dimensions.
        * list of str: The listed coordinate variables will be concatenated,
          in addition to the 'minimal' coordinates.
    parallel : bool, optional
        If True, the open and preprocess steps of this function will be
        performed in parallel using ``dask.delayed``. Default is False.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset

    References
    ----------
    .. [1] http://xarray.pydata.org/en/stable/dask.html
    .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
    """
    if isinstance(paths, basestring):
        if is_remote_uri(paths):
            raise ValueError(
                'cannot do wild-card matching for paths that are remote URLs: '
                '{!r}. Instead, supply paths as an explicit list of strings.'
                .format(paths))
        paths = sorted(glob(paths))
    else:
        paths = [str(p) if isinstance(p, path_type) else p for p in paths]

    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)

    open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
                       autoclose=autoclose, **kwargs)

    if parallel:
        import dask
        # wrap the open_dataset, getattr, and preprocess with delayed
        open_ = dask.delayed(open_dataset)
        getattr_ = dask.delayed(getattr)
        if preprocess is not None:
            preprocess = dask.delayed(preprocess)
    else:
        open_ = open_dataset
        getattr_ = getattr

    datasets = [open_(p, **open_kwargs) for p in paths]
    file_objs = [getattr_(ds, '_file_obj') for ds in datasets]
    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if parallel:
        # calling compute here will return the datasets/file_objs lists,
        # the underlying datasets will still be stored as dask arrays
        datasets, file_objs = dask.compute(datasets, file_objs)

    # close datasets in case of a ValueError
    try:
        if concat_dim is _CONCAT_DIM_DEFAULT:
            combined = auto_combine(datasets, compat=compat,
                                    data_vars=data_vars, coords=coords)
        else:
            combined = auto_combine(datasets, concat_dim=concat_dim,
                                    compat=compat, data_vars=data_vars,
                                    coords=coords)
    except ValueError:
        for ds in datasets:
            ds.close()
        raise

    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs
    return combined
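# Illustrative usage sketch, not part of the original source. The glob pattern
# is hypothetical; with parallel=True the per-file open/preprocess calls above
# are wrapped in dask.delayed and evaluated in a single dask.compute call.
# (Depending on the xarray version, additional arguments such as ``combine``
# may also be required.)
import xarray as xr

ds = xr.open_mfdataset('path/to/my/files/*.nc', concat_dim='time',
                       parallel=True, chunks={'time': 10})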
def create_graph(client=None):
    # NOTE ON CHUNKING SIGNATURES:
    # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
    # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
    # The original reasoning behind the decision to implement this was the refuted assumption that fast
    # executing tasks would greatly be impacted by scheduler overhead. The performance gain introduced by
    # chunking of signatures seemed to corroborate this assumption. However, the benefit came from less
    # pickling and unpickling of the motif annotations dataframe, as this was not wrapped in a delayed()
    # construct. When using a distributed scheduler, chunking even has a negative impact and is therefore
    # overruled. The negative impact is due to having these large chunks shipped to different workers
    # across cluster nodes.

    # NOTE ON BROADCASTING DATASET:
    # There are three large pieces of data that need to be orchestrated between client/scheduler and workers:
    # 1. In a cluster the motif annotations need to be broadcast to all nodes. Otherwise the motif
    #    annotations need to be wrapped in a delayed() construct to avoid needless pickling and
    #    unpickling between processes.
    def wrap(data):
        return client.scatter(data, broadcast=True) if client else delayed(data, pure=True)

    delayed_or_future_annotations = wrap(motif_annotations)

    # 2. The databases: these database objects are typically proxies to the data on disk. They only have
    #    the name and location on shared storage as fields. For consistency reasons we do broadcast these
    #    database objects to the workers. If we decide to have all information of a database loaded into
    #    memory we can still safely use clusters.
    # def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]:
    #     return MemoryDecorator(db)
    # delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs)))
    # Check also latest Stackoverflow message:
    # https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list
    delayed_or_future_dbs = list(map(wrap, rnkdbs))

    # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is
    #    overruled when using dask.distributed. See earlier.

    # NOTE ON SHARING RANKING DATABASES ACROSS NODES:
    # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all nodes
    # in the cluster and can all use the same path in the configuration file. The RankingDatabase objects shared
    # from scheduler to workers can therefore just contain information on database file location.
    # There might be a need to be able to run on clusters that do not share a network drive. This can be
    # achieved by loading all data in from the scheduler and using the broadcasting system to share data
    # across nodes. The only element that needs to be adapted to cater for this need is loading the databases
    # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure to
    # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.".

    # NOTE ON REMOVING I/O CONTENTION:
    # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
    # database) would be to load the database in memory (using the available decorator) for each task.
    # The penalty of loading the database in memory should be shared across multiple gene signatures, so
    # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
    # However, because the memory need of a node running pyscenic is already high (i.e. pre-allocation
    # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
    # this might not be a sound idea to do.
    # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster
    # to a different database, which is achievable in the dask framework. This approach has of course many
    # limitations: for 6 databases you need at least 6 cores and you cannot take advantage of more
    # (http://distributed.readthedocs.io/en/latest/locality.html).

    # NOTE ON REMAINING WARNINGS:
    # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.
    # >> Perhaps some other process is leaking memory? Process memory: 1.51 GB -- Worker memory limit: 2.15 GB
    # My current idea is that this cannot be avoided: processing a single module can sometimes require a
    # substantial amount of memory because of pre-allocation of recovery curves (see code notes on how to
    # mitigate this problem). Setting module_chunksize=1 also limits this problem.
    #
    # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
    # The current implementation of module2df removes substantial amounts of memory (i.e. the RCCs) so this might
    # again be unavoidable. TBI + See following stackoverflow question:
    # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker
    return aggregate_func(
        (delayed(transform_func)(db, gs_chunk, delayed_or_future_annotations)
         for db in delayed_or_future_dbs
         for gs_chunk in chunked_iter(modules, module_chunksize)))
def get_synth_preds(
    store,
    shape,
    all_cat_inds,
    categories,
    batch_size,
    only_cat,
    synth_strat,
    use_dask=True,
    con_limit=None,
    limit=None,
    pbar=None,
):
    with h5py.File(store, "a") as f_synth, h5py.File(store, "r") as f_preds:
        if "synthesis" in f_synth.keys():
            del f_synth['synthesis']
        f_synth.create_dataset("synthesis", shape)
        synth_preds = f_synth["synthesis"]
        if limit is not None:
            kwd_preds = f_preds["predictions"][:, 0:limit, :]
        else:
            kwd_preds = f_preds["predictions"]
        n_batches = np.ceil(kwd_preds.shape[1] / batch_size)
        LOG.debug(f"{n_batches} batches")
        no_cat_ind = categories.index("")
        for n in range(int(n_batches)):
            start_batch = n * batch_size
            end_batch = (n + 1) * batch_size
            if con_limit is not None:
                kwd_preds_tmp = kwd_preds[0:con_limit, start_batch:end_batch, :]
            else:
                kwd_preds_tmp = kwd_preds[:, start_batch:end_batch, :]
            n_docs = kwd_preds_tmp.shape[1]
            if use_dask:
                kwd_preds_tmp = dask.delayed(kwd_preds_tmp)
                all_cat_inds = dask.delayed(all_cat_inds)
                jobs = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    job = dask.delayed(get_means_for_one_doc)(
                        doc_index,
                        all_cat_inds,
                        kwd_preds_tmp,
                        categories,
                        no_cat_ind,
                        synth_strat,
                        pbar=pbar,
                    )
                    jobs.append(job)
                hybrid_preds = dask.compute(jobs)[0]
            else:
                hybrid_preds = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    v = get_means_for_one_doc(
                        doc_index,
                        all_cat_inds,
                        kwd_preds_tmp,
                        categories,
                        no_cat_ind,
                        only_cat,
                        synth_strat,
                        pbar=pbar,
                    )
                    hybrid_preds.append(v)
            hybrid_pred_array = np.stack(hybrid_preds)
            if limit is not None:
                if limit <= end_batch:
                    synth_preds[start_batch:limit, :] = hybrid_pred_array
                else:
                    synth_preds[start_batch:end_batch, :] = hybrid_pred_array
            else:
                synth_preds[start_batch:end_batch, :] = hybrid_pred_array
def main(client):
    import cudf
    import dask_cudf

    item_ddf, customer_ddf, customer_dem_ddf = read_tables()

    # We want to find clicks in the parameterized category.
    # It would be more efficient to translate to a category id, but
    # all of the SQL samples refer to string categories directly. We'll
    # call this clicks_in_category to match the names used in SQL
    # examples, though clicks_in_target would be a much better name.
    item_ddf["clicks_in_category"] = (
        (item_ddf["i_category"] == Q05_I_CATEGORY)
        .astype(np.int8)
        .reset_index(drop=True))
    keep_cols = ["i_item_sk", "i_category_id", "clicks_in_category"]
    item_ddf = item_ddf[keep_cols]

    web_clickstream_flist = glob.glob(cli_args["data_dir"] +
                                      "web_clickstreams/*.parquet")
    n_workers = len(client.scheduler_info()["workers"])
    batchsize = len(web_clickstream_flist) // n_workers
    if batchsize < 1:
        batchsize = 1

    chunks = [
        web_clickstream_flist[x:x + batchsize]
        for x in range(0, len(web_clickstream_flist), batchsize)
    ]
    task_ls = [
        delayed(get_groupby_results)(c, item_ddf.to_delayed()[0])
        for c in chunks
    ]

    meta_d = {
        "wcs_user_sk": {},
        "clicks_in_category": {},
        "clicks_in_1": {},
        "clicks_in_2": {},
        "clicks_in_3": {},
        "clicks_in_4": {},
        "clicks_in_5": {},
        "clicks_in_6": {},
        "clicks_in_7": {},
    }
    df = cudf.from_pandas(pd.DataFrame.from_dict(meta_d, dtype="int64"))

    sum_by_cat_ddf = dask_cudf.from_delayed(task_ls, meta=df)
    sum_by_cat_ddf = sum_by_cat_ddf.groupby(["wcs_user_sk"], sort=True).sum()
    sum_by_cat_ddf = sum_by_cat_ddf.reset_index(drop=False)

    #
    # Combine user-level click summaries with customer demographics
    #
    customer_merged_ddf = customer_ddf.merge(customer_dem_ddf,
                                             left_on="c_current_cdemo_sk",
                                             right_on="cd_demo_sk")
    customer_merged_ddf = customer_merged_ddf[[
        "c_customer_sk", "cd_gender", "cd_education_status"
    ]]

    customer_merged_ddf["college_education"] = (
        customer_merged_ddf.cd_education_status.isin(COLLEGE_ED_STRS)
        .astype(np.int64)
        .fillna(0)).reset_index(drop=True)

    customer_merged_ddf["male"] = (
        (customer_merged_ddf["cd_gender"] == "M")
        .astype(np.int64)
        .fillna(0)).reset_index(drop=True)

    cust_and_clicks_ddf = customer_merged_ddf[[
        "c_customer_sk", "college_education", "male"
    ]].merge(sum_by_cat_ddf, left_on="c_customer_sk", right_on="wcs_user_sk")

    keep_cols = (["clicks_in_category", "college_education", "male"]
                 + [f"clicks_in_{i}" for i in range(1, 8)])
    cust_and_clicks_ddf = cust_and_clicks_ddf[keep_cols]

    # The ETL step in spark covers everything above this point

    # Convert clicks_in_category to a binary label
    cust_and_clicks_ddf["clicks_in_category"] = (
        (cust_and_clicks_ddf["clicks_in_category"] >
         cust_and_clicks_ddf["clicks_in_category"].mean())
        .reset_index(drop=True)
        .astype(np.int64))

    # Converting the dataframe to float64 as cuml logistic reg requires this
    ml_input_df = cust_and_clicks_ddf.astype("float64")
    ml_input_df = ml_input_df.persist()

    ml_tasks = [
        delayed(build_and_predict_model)(df)
        for df in ml_input_df.to_delayed()
    ]
    results_dict = client.compute(*ml_tasks, sync=True)

    return results_dict
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties always hold for the output dataset,
    no matter how the input data is distributed.

    Properties to assert:

    * All partitions have a unique value for their corresponding primary key
    * The number of partitions is at least one per unique primary value and
      at most ``num_buckets`` per unique primary value
    * If we demand a column to be sorted, it is monotonic per partition
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries,
                          np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary,
                          np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated to the primary and
    # secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame({
        "primary": primaries,
        "secondary": secondary,
        "sorted_column": unsorted_column,
    })
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # One primary

    # used for tests later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries))
    assert (list(
        map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
        <= [num_buckets] * unique_primaries)

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries))

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])
        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
            dataset_uuid=dataset.uuid, store=store_factory):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use different partition_on
    with pytest.raises(
            ValueError, match="Incompatible set of partition keys encountered."):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    tasks.compute()
def equilibrium(dbf, comps, phases, conditions, output=None, model=None,
                verbose=False, broadcast=True, calc_opts=None,
                scheduler='sync', parameters=None, solver=None,
                callables=None, **kwargs):
    """
    Calculate the equilibrium state of a system containing the specified
    components and phases, under the specified conditions.

    Parameters
    ----------
    dbf : Database
        Thermodynamic database containing the relevant parameters.
    comps : list
        Names of components to consider in the calculation.
    phases : list or dict
        Names of phases to consider in the calculation.
    conditions : dict or (list of dict)
        StateVariables and their corresponding value.
    output : str or list of str, optional
        Additional equilibrium model properties (e.g., CPM, HM, etc.) to
        compute. These must be defined as attributes in the Model class of
        each phase.
    model : Model, a dict of phase names to Model, or a seq of both, optional
        Model class to use for each phase.
    verbose : bool, optional
        Print details of calculations. Useful for debugging.
    broadcast : bool
        If True, broadcast conditions against each other. This will compute
        all combinations. If False, each condition should be an equal-length
        list (or single-valued). Disabling broadcasting is useful for
        calculating equilibrium at selected conditions, when those conditions
        don't comprise a grid.
    calc_opts : dict, optional
        Keyword arguments to pass to `calculate`, the energy/property
        calculation routine.
    scheduler : Dask scheduler, optional
        Job scheduler for performing the computation. If None, return a Dask
        graph of the computation instead of actually doing it.
    parameters : dict, optional
        Maps SymPy Symbol to numbers, for overriding the values of parameters
        in the Database.
    solver : pycalphad.core.solver.SolverBase
        Instance of a solver that is used to calculate local equilibria.
        Defaults to a pycalphad.core.solver.InteriorPointSolver.
    callables : dict, optional
        Pre-computed callable functions for equilibrium calculation.

    Returns
    -------
    Structured equilibrium calculation, or Dask graph if scheduler=None.

    Examples
    --------
    None yet.
    """
    if not broadcast:
        raise NotImplementedError('Broadcasting cannot yet be disabled')
    comps = sorted(unpack_components(dbf, comps))
    phases = unpack_phases(phases) or sorted(dbf.phases.keys())
    # remove phases that cannot be active
    list_of_possible_phases = filter_phases(dbf, comps)
    active_phases = sorted(set(list_of_possible_phases).intersection(set(phases)))
    if len(list_of_possible_phases) == 0:
        raise ConditionError(
            'There are no phases in the Database that can be active with components {0}'
            .format(comps))
    if len(active_phases) == 0:
        raise ConditionError(
            'None of the passed phases ({0}) are active. List of possible phases: {1}.'
            .format(phases, list_of_possible_phases))
    if isinstance(comps, (str, v.Species)):
        comps = [comps]
    if len(set(comps) - set(dbf.species)) > 0:
        raise EquilibriumError('Components not found in database: {}'.format(
            ','.join([c.name for c in (set(comps) - set(dbf.species))])))
    calc_opts = calc_opts if calc_opts is not None else dict()
    solver = solver if solver is not None else InteriorPointSolver(verbose=verbose)
    parameters = parameters if parameters is not None else dict()
    if isinstance(parameters, dict):
        parameters = OrderedDict(sorted(parameters.items(), key=str))
    models = instantiate_models(dbf, comps, active_phases, model=model,
                                parameters=parameters)
    # Temporary solution until constraint system improves
    if conditions.get(v.N) is None:
        conditions[v.N] = 1
    if np.any(np.array(conditions[v.N]) != 1):
        raise ConditionError('N!=1 is not yet supported, got N={}'.format(conditions[v.N]))
    # Modify conditions values to be within numerical limits, e.g., X(AL)=0
    # Also wrap single-valued conditions with lists
    conds = _adjust_conditions(conditions)

    for cond in conds.keys():
        if isinstance(cond, (v.Composition, v.ChemicalPotential)) and cond.species not in comps:
            raise ConditionError('{} refers to non-existent component'.format(cond))
    state_variables = sorted(get_state_variables(models=models, conds=conds), key=str)
    str_conds = OrderedDict((str(key), value) for key, value in conds.items())
    num_calcs = np.prod([len(i) for i in str_conds.values()])
    components = [x for x in sorted(comps)]
    desired_active_pure_elements = [list(x.constituents.keys()) for x in components]
    desired_active_pure_elements = [el.upper()
                                    for constituents in desired_active_pure_elements
                                    for el in constituents]
    pure_elements = sorted(set([x for x in desired_active_pure_elements if x != 'VA']))
    if verbose:
        print('Components:', ' '.join([str(x) for x in comps]))
        print('Phases:', end=' ')
    output = output if output is not None else 'GM'
    output = output if isinstance(output, (list, tuple, set)) else [output]
    output = set(output)
    output |= {'GM'}
    output = sorted(output)
    need_hessians = any(type(c) in v.CONDITIONS_REQUIRING_HESSIANS for c in conds.keys())
    phase_records = build_phase_records(dbf, comps, active_phases, conds, models,
                                        output='GM', callables=callables,
                                        parameters=parameters, verbose=verbose,
                                        build_gradients=True,
                                        build_hessians=need_hessians)
    if verbose:
        print('[done]', end='\n')

    # 'calculate' accepts conditions through its keyword arguments
    grid_opts = calc_opts.copy()
    statevar_strings = [str(x) for x in state_variables]
    grid_opts.update({key: value for key, value in str_conds.items()
                      if key in statevar_strings})
    if 'pdens' not in grid_opts:
        grid_opts['pdens'] = 500

    grid = delayed(calculate, pure=False)(dbf, comps, active_phases,
                                          model=models, fake_points=True,
                                          callables=callables, output='GM',
                                          parameters=parameters, **grid_opts)

    coord_dict = str_conds.copy()
    # +1 is to accommodate the degenerate degree of freedom at the invariant reactions
    coord_dict['vertex'] = np.arange(len(pure_elements) + 1)
    coord_dict['component'] = pure_elements
    grid_shape = tuple(len(x) for x in conds.values()) + (len(pure_elements) + 1,)

    properties = delayed(starting_point, pure=False)(conds, state_variables,
                                                     phase_records, grid)
    conditions_per_chunk_per_axis = 2
    if num_calcs > 1:
        # Generate slices of 'properties'
        slices = []
        for val in grid_shape[:-1]:
            idx_arr = list(range(val))
            num_chunks = int(np.floor(val / conditions_per_chunk_per_axis))
            if num_chunks > 0:
                cond_slices = [x for x in np.array_split(np.asarray(idx_arr), num_chunks)
                               if len(x) > 0]
            else:
                cond_slices = [idx_arr]
            slices.append(cond_slices)
        chunk_dims = [len(slc) for slc in slices]
        chunk_grid = np.array(np.unravel_index(np.arange(np.prod(chunk_dims)),
                                               chunk_dims)).T
        res = []
        for chunk in chunk_grid:
            prop_slice = properties[OrderedDict(
                list(zip(str_conds.keys(),
                         [np.atleast_1d(sl)[ch] for ch, sl in zip(chunk, slices)])))]
            job = delayed(_solve_eq_at_conditions, pure=False)(
                comps, prop_slice, phase_records, grid,
                list(str_conds.keys()), state_variables, verbose, solver=solver)
            res.append(job)
        properties = delayed(_merge_property_slices, pure=False)(
            properties, chunk_grid, slices, list(str_conds.keys()), res)
    else:
        # Single-process job; don't create child processes
        properties = delayed(_solve_eq_at_conditions, pure=False)(
            comps, properties, phase_records, grid,
            list(str_conds.keys()), state_variables, verbose, solver=solver)

    # Compute equilibrium values of any additional user-specified properties
    # We already computed these properties so don't recompute them
    output = sorted(set(output) - {'GM', 'MU'})
    for out in output:
        if (out is None) or (len(out) == 0):
            continue
        # TODO: How do we know if a specified property should be per_phase or not?
        # For now, we make a best guess
        if (out == 'degree_of_ordering') or (out == 'DOO'):
            per_phase = True
        else:
            per_phase = False
        eqcal = delayed(_eqcalculate, pure=False)(
            dbf, comps, active_phases, conditions, out, data=properties,
            per_phase=per_phase, callables=callables, parameters=parameters,
            model=models, **calc_opts)
        properties = delayed(properties.merge, pure=False)(eqcal, compat='equals')
    if scheduler is not None:
        properties = dask.compute(properties, scheduler=scheduler)[0]
    properties.attrs['created'] = datetime.utcnow().isoformat()
    if len(kwargs) > 0:
        warnings.warn(
            'The following equilibrium keyword arguments were passed, but unused:\n{}'
            .format(kwargs))
    return properties
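# Illustrative usage sketch, not part of the original source. The TDB file,
# phase names, and conditions are hypothetical; they only show the call shape
# equilibrium() expects, with `v` referring to pycalphad.variables as in the
# function body above.
from pycalphad import Database, variables as v

dbf = Database('alni.tdb')   # hypothetical thermodynamic database file
eq = equilibrium(dbf, ['AL', 'NI', 'VA'], ['FCC_A1', 'LIQUID'],
                 {v.X('AL'): 0.10, v.T: 1200.0, v.P: 101325, v.N: 1})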
def perceptual_path_length_score(model, data_generator, tolerance_threshold=1e-6,
                                 max_iteration=200, batch_size=10):
    # prepare the VGG16 feature extractor
    VGG16_model = VGG16(include_top=False, pooling='avg',
                        input_shape=model.get_inputs_shape())
    VGG16_featues_fn = lambda x: VGG16_model(x)
    epsilon = 1e-2

    def learned_perceptual_image_patch_similarity(images_a, images_b):
        """LPIPS metric using VGG-16 and Zhang weighting.
        (https://arxiv.org/abs/1801.03924)

        Takes reference images and corrupted images as an input and outputs
        the perceptual distance between the image pairs.
        """
        # Concatenate images.
        images = tf.concat([images_a, images_b], axis=0)
        # Extract features.
        vgg_features = VGG16_featues_fn(images)
        # Normalize each feature vector to unit length over channel dimension.
        normalized_features = []
        for x in vgg_features:
            x = tf.reshape(x, (len(x), 1))
            n = tf.reduce_sum(x ** 2, axis=1, keepdims=True) ** 0.5
            normalized_features.append(x / (n + 1e-10))
        # Split and compute distances.
        diff = [tf.subtract(*tf.split(x, 2, axis=0)) ** 2
                for x in normalized_features]
        return np.array(diff)

    def filter_distances_fn(distances):
        # Reject outliers.
        lo = np.percentile(distances, 1, interpolation='lower')
        hi = np.percentile(distances, 99, interpolation='higher')
        filtered_distances = np.extract(
            np.logical_and(lo <= distances, distances <= hi), distances)
        return filtered_distances

    def calculate_distances(images):
        images01, images02 = images[0::2], images[1::2]
        return learned_perceptual_image_patch_similarity(
            images01, images02) * (1 / epsilon ** 2)

    # prepare the autoencoder model's random-images generator
    def model_random_images_generator():
        while True:
            # Generate latents from the data
            latents_real = model.encode(next(data_generator)['images'])

            # Generate random latents and interpolation t-values.
            latents_t = np.random.normal(size=latents_real.shape)
            lerp_t = np.random.uniform(size=1)[0]

            latents_e = slerp(lerp_t, latents_real, latents_t)
            images = model.decode(latents_e)  # .numpy()
            # images = (images * 255).astype(np.uint8)
            yield images[:batch_size]  # calculate_distances(images[0::2], images[1::2])

    def stopping_fn(distances):
        # Reject outliers.
        filter_distances = filter_distances_fn(distances)
        return np.mean(distances)

    ppl_mean = bootstrapping_additive(
        data_generator=model_random_images_generator(),
        func=delayed(calculate_distances),
        stopping_func=stopping_fn,
        tolerance_threshold=tolerance_threshold,
        max_iteration=max_iteration)

    return ppl_mean
def get_data(baseline=False, start_year=DEFAULT_START_YEAR, reform={},
             data=None, client=None, num_workers=1):
    '''
    This function creates dataframes of micro data with marginal tax rates
    and information to compute effective tax rates from the Tax-Calculator
    output. The resulting dictionary of dataframes is returned and saved to
    disk in a pickle file.

    Args:
        baseline (boolean): True if baseline tax policy
        calculator_start_year (int): first year of budget window
        reform (dictionary): IIT policy reform parameters, None if baseline
        data (DataFrame or str): DataFrame or path to datafile for Records object
        client (Dask Client object): client for Dask multiprocessing
        num_workers (int): number of workers to use for Dask multiprocessing

    Returns:
        micro_data_dict (dict): dict of Pandas Dataframe, one for each year
            from start_year to the maximum year Tax-Calculator can analyze
        taxcalc_version (str): version of Tax-Calculator used
    '''
    # Compute MTRs and taxes for each year, but not beyond TC_LAST_YEAR
    lazy_values = []
    for year in range(start_year, TC_LAST_YEAR + 1):
        lazy_values.append(
            delayed(taxcalc_advance)(baseline, start_year, reform, data, year))
    if client:  # pragma: no cover
        futures = client.compute(lazy_values, num_workers=num_workers)
        results = client.gather(futures)
    else:
        results = compute(*lazy_values,
                          scheduler=dask.multiprocessing.get,
                          num_workers=num_workers)

    # dictionary of data frames to return
    micro_data_dict = {}
    for i, result in enumerate(results):
        year = start_year + i
        micro_data_dict[str(year)] = DataFrame(result)

    if baseline:
        pkl_path = "micro_data_baseline.pkl"
    else:
        pkl_path = "micro_data_policy.pkl"

    with open(pkl_path, "wb") as f:
        pickle.dump(micro_data_dict, f)

    # Do some garbage collection
    del results

    # Pull Tax-Calc version for reference
    taxcalc_version = pkg_resources.get_distribution("taxcalc").version

    return micro_data_dict, taxcalc_version
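# Illustrative usage sketch, not part of the original source. The data path is
# hypothetical; a Dask distributed Client is optional -- when client is None
# the delayed tasks run on the local multiprocessing scheduler, as in
# get_data() above.
from distributed import Client

client = Client(n_workers=2)  # or client=None for local multiprocessing
micro_data, taxcalc_version = get_data(
    baseline=True,
    start_year=2021,
    reform={},
    data='puf.csv',   # hypothetical path to a Records input file
    client=client,
    num_workers=2,
)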
def load_from_unf(cls, filename, lazy=False):
    r"""Load a `.unf`-file into a :class:`~.SemperFormat` object.

    Parameters
    ----------
    filename : string
        The name of the unf-file from which to load the data. Standard
        format is '\*.unf'.

    Returns
    -------
    semper : :class:`~.SemperFormat` (N=1)
        SEMPER file format object containing the loaded information.
    """
    metadata = OrderedDict()
    with open(filename, 'rb') as f:
        # Read header:
        rec_length = np.fromfile(f, dtype='<i4', count=1)[0]  # length of header
        header = np.fromfile(f, dtype=cls.HEADER_DTYPES[:rec_length // 2],
                             count=1)
        metadata.update(sarray2dict(header))
        assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
            'Error while reading the header (length is not correct)!'
        data_format = cls.IFORM_DICT[metadata['IFORM']]
        iversn, remain = divmod(metadata['IFLAG'], 10000)
        ilabel, ntitle = divmod(remain, 1000)
        metadata.update({
            'IVERSN': iversn,
            'ILABEL': ilabel,
            'NTITLE': ntitle
        })
        # Read title:
        title = ''
        if ntitle > 0:
            assert np.fromfile(f, dtype='<i4', count=1)[0] == ntitle  # length of title
            title = b''.join(np.fromfile(f, dtype='c', count=ntitle))
            title = title.decode()
            metadata['TITLE'] = title
            assert np.fromfile(f, dtype='<i4', count=1)[0] == ntitle
        if ilabel:
            try:
                metadata.update(cls._read_label(f))
            except Exception as e:
                warning = ('Could not read label, trying to proceed '
                           'without it!')
                warning += ' (Error message: {})'.format(str(e))
                warnings.warn(warning)
        # Read picture data:
        pos = f.tell()
        shape = metadata['NLAY'], metadata['NROW'], metadata['NCOL']
        if lazy:
            from dask.array import from_delayed
            from dask import delayed
            task = delayed(_read_data)(f, filename, pos, data_format, shape)
            data = from_delayed(task, shape=shape, dtype=data_format)
        else:
            data = _read_data(f, filename, pos, data_format, shape)
    offsets = (metadata.get('X0V0', 0.),
               metadata.get('Y0V2', 0.),
               metadata.get('Z0V4', 0.))
    scales = (metadata.get('DXV1', 1.),
              metadata.get('DYV3', 1.),
              metadata.get('DZV5', 1.))
    units = (metadata.get('XUNIT', Undefined),
             metadata.get('YUNIT', Undefined),
             metadata.get('ZUNIT', Undefined))
    return cls(data, title, offsets, scales, units, metadata)
    output.write()

    print("{:<15}".format(algo.getName())
          + " executed input: " + "{:<15}".format(file)
          + " in {:.2f}s".format(end_time - start_time)
          + " | Score: " + "{:<10}".format(score.score)
          + " | Params: " + str(parameter))

    return score.score


if __name__ == "__main__":
    print("-------------------------")
    print("      HashCode 2021      ")
    print("-------------------------")

    start_time_program = time.time()

    scores = []
    for f in FILES:
        res = delayed(run_algorithm)(GreedyAlgorithm(), f)
        scores.append(res)

    total_scores = delayed(sum)(scores)
    total_scores = total_scores.compute(num_workers=1)  # Set num_workers=# to limit worker processes

    print("Total scores: ", total_scores)

    end_time_program = time.time()
    print("Whole execution took: {:.2f}s".format(end_time_program - start_time_program))
def fft_shading_test(obj, variable='diffuse_hemisp_narrowband_filter4',
                     fft_window=30,
                     shad_freq_lower=[0.008, 0.017],
                     shad_freq_upper=[0.0105, 0.0195],
                     ratio_thresh=[3.15, 1.2],
                     time_interval=None):
    """
    Function to test shadowband radiometer (MFRSR, RSS, etc) instruments
    for shading related problems. Program was adapted by Adam Theisen
    from the method defined in Alexandrov et al 2007 to process on a
    point by point basis using a window of data around that point for
    the FFT analysis.

    For ARM data, testing has found that this works best on narrowband
    filter4 for MFRSR data.

    Function has been tested and is in use by the ARM DQ Office for
    problem detection. It is known to have some false positives at times.

    Need to run obj.clean.cleanup() ahead of time to ensure proper addition
    to the QC variable.

    Parameters
    ----------
    obj : xarray Dataset
        Data object

    Returns
    -------
    obj : xarray Dataset
        Data object

    References
    ----------
    Alexandrov, Mikhail & Kiedron, Peter & Michalsky, Joseph & Hodges, Gary
    & Flynn, Connor & Lacis, Andrew. (2007). Optical depth measurements by
    shadow-band radiometers and their uncertainties. Applied optics. 46.
    8027-38. 10.1364/AO.46.008027.

    """
    # Get time and data from variable
    time = obj['time'].values
    data = obj[variable].values
    if 'missing_value' in obj[variable].attrs:
        missing = obj[variable].attrs['missing_value']
    else:
        missing = -9999.

    # Get time interval between measurements
    dt = time_interval
    if time_interval is None:
        dt = determine_time_delta(time)

    # Compute the FFT for each point +- window samples
    task = []
    for t in range(len(time)):
        sind = t - fft_window
        eind = t + fft_window
        if sind < 0:
            sind = 0
        if eind > len(time):
            eind = len(time)

        # Get data and remove all nan/missing values
        d = data[sind:eind]
        idx = ((d != missing) & (np.isnan(d) is not True))
        index = np.where(idx)
        d = d[index]

        # Add to task for dask processing
        lat = ([obj['lat'].values] if not isinstance(obj['lat'].values, list)
               else obj['lat'].values)
        lon = ([obj['lon'].values] if not isinstance(obj['lon'].values, list)
               else obj['lon'].values)
        task.append(dask.delayed(fft_shading_test_process)(
            time[t], lat[0], lon[0], d,
            shad_freq_lower=shad_freq_lower,
            shad_freq_upper=shad_freq_upper,
            ratio_thresh=ratio_thresh,
            time_interval=dt))

    # Process using dask
    result = dask.compute(*task)

    # Run data through a rolling median to filter out singular
    # false positives
    result = pd.Series(result).rolling(window=5, min_periods=1).median()

    # Find indices where shading is indicated
    idx = (np.asarray(result) > 0.4)
    index = np.where(idx)

    # Add test to QC Variable
    desc = 'FFT Shading Test'
    result = obj.qcfilter.add_test(variable, index=index, test_meaning=desc)

    return obj
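# Illustrative usage sketch, not part of the original source. It assumes `ds`
# is an ARM-style xarray Dataset that already contains the
# 'diffuse_hemisp_narrowband_filter4' variable (e.g. one loaded with the ACT
# toolkit); per the docstring above, clean.cleanup() must be run first so the
# QC companion variable exists before the test result is added to it.
ds.clean.cleanup()
ds = fft_shading_test(ds, variable='diffuse_hemisp_narrowband_filter4')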
def compute_tile_foreground_fraction(slide_path, im_fgnd_mask_lres,
                                     fgnd_seg_scale, it_kwargs,
                                     tile_position=None):
    """
    Computes the fraction of foreground of a single tile or all tiles in a
    whole slide image given the binary foreground mask computed from a low
    resolution version of the slide.

    Parameters
    ----------
    slide_path : str
        path to an image or slide
    im_fgnd_mask_lres : array_like
        A binary foreground mask computed at a low-resolution
    fgnd_seg_scale : double
        The scale/magnification at which the foreground mask
        `im_fgnd_mask_lres` was computed
    it_kwargs : dict
        A dictionary of any key:value parameters (e.g. defining the scale,
        tile_size, region etc) in addition to tile_position that need to be
        passed to `large_image.TileSource.getSingleTile` to get the tile.
    tile_position : int or None
        A linear 0-based index of a tile for which the foreground needs to be
        computed. If set to None, the foreground fraction of all tiles will
        be computed.

    Returns
    -------
    tile_fgnd_frac : double or array_like
        A value between 0 and 1 indicating the fraction of foreground pixels
        present in the tile indicated by `tile_position`. If `tile_position`
        is set to None, then a 1D array containing the foreground fraction of
        all tiles will be returned.

    """
    if tile_position is None:
        # get slide tile source
        ts = large_image.getTileSource(slide_path)

        num_tiles = ts.getSingleTile(**it_kwargs)['iterator_range']['position']

        # broadcasting fgnd mask to all dask workers
        try:
            c = dask.distributed.get_client()
            [im_fgnd_mask_lres] = c.scatter([im_fgnd_mask_lres],
                                            broadcast=True)
        except ValueError:
            pass

        # compute tile foreground fraction in parallel
        tile_fgnd_frac = []
        for tile_position in range(num_tiles):
            tile_fgnd_frac.append(
                dask.delayed(_compute_tile_foreground_fraction_single)(
                    slide_path, im_fgnd_mask_lres, fgnd_seg_scale, it_kwargs,
                    tile_position))

        tile_fgnd_frac = dask.delayed(tile_fgnd_frac).compute()
        tile_fgnd_frac = np.array(tile_fgnd_frac)

    elif np.isscalar(tile_position):
        tile_fgnd_frac = _compute_tile_foreground_fraction_single(
            slide_path, im_fgnd_mask_lres, fgnd_seg_scale, it_kwargs,
            tile_position)
    else:
        raise ValueError(
            'Invalid value for tile_position. Must be None or int')

    return tile_fgnd_frac
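# Illustrative usage sketch, not part of the original source. The slide path
# and mask are hypothetical, and `it_kwargs` merely mirrors the kind of
# keyword arguments that large_image's getSingleTile/tileIterator accept, as
# described in the docstring above.
import numpy as np

im_fgnd_mask_lres = np.ones((128, 128), dtype=bool)   # dummy low-res mask
it_kwargs = {
    'tile_size': {'width': 256, 'height': 256},        # assumed tile size
    'scale': {'magnification': 20},                    # assumed magnification
}
fractions = compute_tile_foreground_fraction(
    'slide.svs', im_fgnd_mask_lres, fgnd_seg_scale=1.25, it_kwargs=it_kwargs)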
def ser_reader(filename, objects=None, *args, **kwds): """Reads the information from the file and returns it in the HyperSpy required format. """ header, data = load_ser_file(filename) record_by = guess_record_by(header['DataTypeID']) ndim = int(header['NumberDimensions']) date, time = None, None if objects is not None: objects_dict = convert_xml_to_dict(objects[0]) date, time = _get_date_time(objects_dict.ObjectInfo.AcquireDate) if "PositionY" in data.dtype.names and len(data['PositionY']) > 1 and \ (data['PositionY'][0] == data['PositionY'][1]): # The spatial dimensions are stored in F order i.e. X, Y, ... order = "F" else: # The spatial dimensions are stored in C order i.e. ..., Y, X order = "C" if ndim == 0 and header["ValidNumberElements"] != 0: # The calibration of the axes are not stored in the header. # We try to guess from the position coordinates. array_shape, axes = get_axes_from_position(header=header, data=data) else: axes = [] array_shape = [ None, ] * int(ndim) spatial_axes = ["x", "y"][:ndim] for i in range(ndim): idim = 1 + i if order == "C" else ndim - i if (record_by == "spectrum" or header['Dim-%i_DimensionSize' % (i + 1)][0] != 1): units = (header['Dim-%i_Units' % (idim)][0].decode('utf-8') if header['Dim-%i_UnitsLength' % (idim)] > 0 else t.Undefined) if units == "meters": name = (spatial_axes.pop() if order == "F" else spatial_axes.pop(-1)) else: name = t.Undefined axes.append({ 'offset': header['Dim-%i_CalibrationOffset' % idim][0], 'scale': header['Dim-%i_CalibrationDelta' % idim][0], 'units': units, 'size': header['Dim-%i_DimensionSize' % idim][0], 'name': name, }) array_shape[i] = \ header['Dim-%i_DimensionSize' % idim][0] # Spectral dimension if record_by == "spectrum": axes.append({ 'offset': data['CalibrationOffset'][0], 'scale': data['CalibrationDelta'][0], 'size': data['ArrayLength'][0], 'index_in_array': header['NumberDimensions'][0] }) # FEI seems to use the international system of units (SI) for the # energy scale (eV). axes[-1]['units'] = 'eV' axes[-1]['name'] = 'Energy' array_shape.append(data['ArrayLength'][0]) elif record_by == 'image': if objects is not None: units = _guess_units_from_mode(objects_dict, header) else: units = "meters" # Y axis axes.append({ 'name': 'y', 'offset': data['CalibrationOffsetY'][0] - data['CalibrationElementY'][0] * data['CalibrationDeltaY'][0], 'scale': data['CalibrationDeltaY'][0], 'units': units, 'size': data['ArraySizeY'][0], }) array_shape.append(data['ArraySizeY'][0]) # X axis axes.append({ 'name': 'x', 'offset': data['CalibrationOffsetX'][0] - data['CalibrationElementX'][0] * data['CalibrationDeltaX'][0], 'scale': data['CalibrationDeltaX'][0], 'size': data['ArraySizeX'][0], 'units': units, }) array_shape.append(data['ArraySizeX'][0]) # FEI seems to use the international system of units (SI) for the # spatial scale. 
# However, we prefer to work in nm for axis in axes: if axis['units'] == 'meters': axis['units'] = 'nm' axis['scale'] *= 10**9 elif axis['units'] == '1/meters': axis['units'] = '1/nm' axis['scale'] /= 10**9 # Remove Nones from array_shape caused by squeezing size 1 dimensions array_shape = [dim for dim in array_shape if dim is not None] lazy = kwds.pop('lazy', False) if lazy: from dask import delayed from dask.array import from_delayed val = delayed(load_only_data, pure=True)(filename, array_shape, record_by, len(axes)) dc = from_delayed(val, shape=array_shape, dtype=data['Array'].dtype) else: dc = load_only_data(filename, array_shape, record_by, len(axes), data=data) if ordict: original_metadata = OrderedDict() else: original_metadata = {} header_parameters = sarray2dict(header) sarray2dict(data, header_parameters) # We remove the Array key to save memory avoiding duplication del header_parameters['Array'] original_metadata['ser_header_parameters'] = header_parameters metadata = { 'General': { 'original_filename': os.path.split(filename)[1], }, "Signal": { 'signal_type': "", 'record_by': record_by, }, } if date is not None and time is not None: metadata['General']['date'] = date metadata['General']['time'] = time dictionary = { 'data': dc, 'metadata': metadata, 'axes': axes, 'original_metadata': original_metadata, 'mapping': mapping } return dictionary
def dask_win_func(n): return dsar.from_delayed( delayed(numpy_win_func, pure=True)(n), (n, ), float)
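dask_win_func above presumably wraps a NumPy window function as a single-task lazy array; a self-contained sketch of the same from_delayed wrapper, with np.hanning standing in for numpy_win_func:

# Sketch of the same pattern with np.hanning standing in for numpy_win_func.
import numpy as np
import dask.array as dsar
from dask import delayed


def dask_win_func(n):
    return dsar.from_delayed(
        delayed(np.hanning, pure=True)(n), (n,), float)


w = dask_win_func(1024)      # lazy (1024,) dask array backed by one delayed task
w_sum = w.sum().compute()    # forces the single delayed call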
def svd_flip(u, v): u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) return u, v
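The nout=2 call above splits one delayed function into two delayed outputs, and from_delayed reattaches shape/dtype metadata so the results behave as dask arrays again. A hedged, self-contained sketch of the same round trip, with a toy two-output function in place of skm.svd_flip:

# Same nout=2 / from_delayed round trip with a toy two-output function.
import dask.array as da
from dask import delayed


def negate_both(u, v):
    # stand-in for skm.svd_flip: returns two arrays
    return -u, -v


u = da.random.random((10, 3), chunks=(5, 3))
v = da.random.random((3, 10), chunks=(3, 5))

u2, v2 = delayed(negate_both, nout=2)(u, v)      # two delayed outputs
u_new = da.from_delayed(u2, shape=u.shape, dtype=u.dtype)
v_new = da.from_delayed(v2, shape=v.shape, dtype=v.dtype)
print(u_new.sum().compute())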
def to_cloudvolume(arr, cloudpath, resolution=(1, 1, 1), voxel_offset=(0, 0, 0), layer_type=None, encoding='raw', max_mip=0, compute=True, return_stored=False, **kwargs): """Save 3d or 4d dask array to the precomputed CloudVolume storage format. NOTE: DO NOT USE thread-based dask scheduler. See comment at top of module. See https://docs.dask.org/en/latest/array.html for details about the format. Parameters ---------- arr: dask.array Data to store cloudpath: str Path to the dataset layer. This should match storage's supported providers. e.g. Google: gs://$BUCKET/$DATASET/$LAYER/ S3 : s3://$BUCKET/$DATASET/$LAYER/ Lcl FS: file:///tmp/$DATASET/$LAYER/ Boss : boss://$COLLECTION/$EXPERIMENT/$CHANNEL HTTP/S: http(s)://.../$CHANNEL matrix: matrix://$BUCKET/$DATASET/$LAYER/ resolution: Iterable of ints of length 3 The x, y, z voxel dimensions in nanometers voxel_offset: Iterable of ints of length 3 The x, y, z beginning of dataset in positive cartesian space. layer_type: str "image" or "segmentation" max_mip: int Maximum mip level id. compute: boolean, optional If true compute immediately, return ``dask.delayed.Delayed`` otherwise. return_stored: boolean, optional Optionally return stored results. kwargs: passed to the ``cloudvolume.CloudVolume()`` function, e.g., compression options Raises ------ ValueError If ``arr`` has ndim different that 3 or 4, or ``layer_type`` is unsupported. Returns ------- See notes on `compute` and `return_stored` parameters. """ import dask import dask.array as da if not da.core._check_regular_chunks(arr.chunks): raise ValueError('Attempt to save array to cloudvolume with irregular ' 'chunking, please call `arr.rechunk(...)` first.') if not layer_type: if arr.dtype in (bool, np.uint32, np.uint64, np.uint16): layer_type = 'segmentation' elif np.issubdtype(arr.dtype, np.integer) or np.issubdtype( arr.dtype, np.floating): layer_type = 'image' else: raise ValueError('Unsupported layer_type for CloudVolume: %s' % layer_type) if arr.ndim == 3: num_channels = 1 chunk_size = arr.chunksize elif arr.ndim == 4: num_channels = arr.shape[-1] chunk_size = arr.chunksize[:3] else: raise ValueError( 'CloudVolume only supports 3 or 4 dimensions. Array has %d.' % arr.ndim) info = CloudVolume.create_new_info(num_channels, layer_type, arr.dtype.name, encoding, resolution, voxel_offset, arr.shape[:3], chunk_size=chunk_size, max_mip=max_mip) # Delay writing any metadata until computation time. # - the caller may never do the full computation # - the filesystem may be slow, and there is a desire to open files # in parallel on worker machines. vol = dask.delayed(_create_cloudvolume)(cloudpath, info, **kwargs) return arr.store(vol, lock=False, compute=compute, return_stored=return_stored)
def add_data( self, dates, param=None, daily=False, network=None, download=False, local=False, n_procs=1, meta=False, ): """Short summary. Parameters ---------- dates : list of datetime objects Description of parameter `dates`. param : list of strings Description of parameter `param` (the default is None). daily : boolean Description of parameter `daily` (the default is False). network : type Description of parameter `network` (the default is None). download : type Description of parameter `download` (the default is False). Returns ------- pandas DataFrame Description of returned object. """ import dask import dask.dataframe as dd if param is None: params = [ "SPEC", "PM10", "PM2.5", "PM2.5_FRM", "CO", "OZONE", "SO2", "VOC", "NONOXNOY", "WIND", "TEMP", "RHDP", ] else: params = param urls, fnames = self.build_urls(params, dates, daily=daily) if download: for url, fname in zip(urls, fnames): self.retrieve(url, fname) dfs = [ dask.delayed(self.load_aqs_file)(i, network) for i in fnames ] elif local: dfs = [ dask.delayed(self.load_aqs_file)(i, network) for i in fnames ] else: dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in urls] dff = dd.from_delayed(dfs) dfff = dff.compute(num_workers=n_procs) if meta: return self.add_data2(dfff, daily, network) else: return dfff
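The delayed-reader plus dd.from_delayed idiom used in add_data works with any function that returns one pandas DataFrame per input. A minimal sketch with an invented loader in place of self.load_aqs_file:

# Minimal sketch of the delayed-reader / from_delayed pattern used above;
# load_one is a hypothetical stand-in for self.load_aqs_file.
import pandas as pd
import dask
import dask.dataframe as dd


def load_one(i):
    # return one pandas DataFrame per input file/URL
    return pd.DataFrame({"site": [i] * 3, "value": [0.1, 0.2, 0.3]})


parts = [dask.delayed(load_one)(i) for i in range(4)]
ddf = dd.from_delayed(parts)           # lazy dask DataFrame, one partition per reader
df = ddf.compute(num_workers=2)        # materialize with two worker threads
print(len(df))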
def file_reader(filename, record_by='image', force_read_resolution=False, **kwds): """ Read data from tif files using Christoph Gohlke's tifffile library. The units and the scale of images saved with ImageJ or Digital Micrograph is read. There is limited support for reading the scale of files created with Zeiss and FEI SEMs. Parameters ---------- filename: str record_by: {'image'} Has no effect because this format only supports recording by image. force_read_resolution: Bool Default: False. Force reading the x_resolution, y_resolution and the resolution_unit of the tiff tags. See http://www.awaresystems.be/imaging/tiff/tifftags/resolutionunit.html **kwds, optional """ _logger.debug('************* Loading *************') # For testing the use of local and skimage tifffile library lazy = kwds.pop('lazy', False) memmap = kwds.pop('memmap', False) with TiffFile(filename, **kwds) as tiff: # change in the Tifffiles API if hasattr(tiff.series[0], 'axes'): # in newer version the axes is an attribute axes = tiff.series[0].axes else: # old version axes = tiff.series[0]['axes'] is_rgb = tiff.is_rgb _logger.debug("Is RGB: %s" % is_rgb) series = tiff.series[0] if hasattr(series, 'shape'): shape = series.shape dtype = series.dtype else: shape = series['shape'] dtype = series['dtype'] if is_rgb: axes = axes[:-1] names = ['R', 'G', 'B', 'A'] lastshape = shape[-1] dtype = np.dtype({ 'names': names[:lastshape], 'formats': [dtype] * lastshape }) shape = shape[:-1] op = {} for key, tag in tiff[0].tags.items(): op[key] = tag.value names = [axes_label_codes[axis] for axis in axes] _logger.debug('Tiff tags list: %s' % op) _logger.debug("Photometric: %s" % op['photometric']) _logger.debug('is_imagej: {}'.format(tiff[0].is_imagej)) # workaround for 'palette' photometric, keep only 'X' and 'Y' axes sl = None if op['photometric'] == 3: sl = [0] * len(shape) names = [] for i, axis in enumerate(axes): if axis == 'X' or axis == 'Y': sl[i] = slice(None) names.append(axes_label_codes[axis]) else: axes.replace(axis, '') shape = tuple(_sh for _s, _sh in zip(sl, shape) if isinstance(_s, slice)) _logger.debug("names: {0}".format(names)) scales = [1.0] * len(names) offsets = [0.0] * len(names) units = [t.Undefined] * len(names) intensity_axis = {} try: scales_d, units_d, offsets_d, intensity_axis, op = \ _parse_scale_unit(tiff, op, shape, force_read_resolution) for i, name in enumerate(names): if name == 'height': scales[i], units[i] = scales_d['x'], units_d['x'] offsets[i] = offsets_d['x'] elif name == 'width': scales[i], units[i] = scales_d['y'], units_d['y'] offsets[i] = offsets_d['y'] elif name in ['depth', 'image series', 'time']: scales[i], units[i] = scales_d['z'], units_d['z'] offsets[i] = offsets_d['z'] except: _logger.info("Scale and units could not be imported") axes = [{ 'size': size, 'name': str(name), 'scale': scale, 'offset': offset, 'units': unit, } for size, name, scale, offset, unit in zip( shape, names, scales, offsets, units)] md = { 'General': { 'original_filename': os.path.split(filename)[1] }, 'Signal': { 'signal_type': "", 'record_by': "image", }, } if 'datetime' in op: dt = datetime.strptime(_decode_string(op['datetime']), "%Y:%m:%d %H:%M:%S") md['General']['date'] = dt.date().isoformat() md['General']['time'] = dt.time().isoformat() if 'units' in intensity_axis: md['Signal']['quantity'] = intensity_axis['units'] if 'scale' in intensity_axis and 'offset' in intensity_axis: dic = { 'gain_factor': intensity_axis['scale'], 'gain_offset': intensity_axis['offset'] } md['Signal']['Noise_properties'] = 
{'Variance_linear_model': dic} data_args = TiffFile, filename, is_rgb, sl if lazy: from dask import delayed from dask.array import from_delayed memmap = True val = delayed(_load_data, pure=True)(*data_args, memmap=memmap, **kwds) dc = from_delayed(val, dtype=dtype, shape=shape) # TODO: maybe just pass the memmap from tiffile? else: dc = _load_data(*data_args, memmap=memmap, **kwds) metadata = Metadata(op) md.update(metadata.get_additional_metadata()) return [{ 'data': dc, 'original_metadata': op, 'axes': axes, 'metadata': md, 'mapping': metadata.mapping, }]
def wrap(data): return client.scatter( data, broadcast=True) if client else delayed(data, pure=True)
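A quick usage sketch of wrap's fallback branch (no distributed client, so the object is embedded in the graph as a pure delayed value); the data and the summing task are illustrative only:

# Hypothetical usage of wrap() without a distributed client.
import numpy as np
from dask import delayed, compute

client = None                     # no dask.distributed client in this sketch
data = np.arange(10)

handle = client.scatter(data, broadcast=True) if client else delayed(data, pure=True)
total = delayed(np.sum)(handle)   # downstream tasks consume the wrapped data
print(compute(total)[0])          # 45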
import dask parameter_scores = [] for i in range(4): X_train, X_test, y_train, y_test = dask.delayed(train_test_split, nout=4)(data.data, data.target, pure=False) for max_df in [0.5, 0.75, 1.0]: for ngram_range in [(1, 1), (1, 2)]: vect = dask.delayed(CountVectorizer)(max_df=max_df, ngram_range=ngram_range) vect = vect.fit(X_train) X2_train = vect.transform(X_train) X2_test = vect.transform(X_test) for norm in ['l1', 'l2']: tfidf = dask.delayed(TfidfTransformer)(norm=norm) tfidf = tfidf.fit(X2_train) X3_train = tfidf.transform(X2_train) X3_test = tfidf.transform(X2_test) for max_iter in [5]: for alpha in [0.00001, 0.000001]: for penalty in ['l2', 'elasticnet']: clf = dask.delayed(SGDClassifier)(max_iter=max_iter, alpha=alpha, penalty=penalty) clf = clf.fit(X3_train, y_train) score = clf.score(X3_test, y_test) params = { 'max_df': max_df, 'ngram_range': ngram_range, 'norm': norm, 'max_iter': max_iter, 'alpha': alpha, 'penalty': penalty} # collect (hyperparameters, lazy score) pairs parameter_scores.append((params, score))
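A hedged way to finish the search above, assuming parameter_scores holds (params, lazy_score) pairs as collected in the loop: compute every score in a single graph, then pick the best parameter set.

# Continuation of the block above; assumes dask and parameter_scores are in scope.
scores = dask.compute(*(score for _, score in parameter_scores))
best_params, best_score = max(zip((params for params, _ in parameter_scores), scores),
                              key=lambda item: item[1])
print(best_score, best_params)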
def test_worker_task_data(c, s, w): x = delayed(2) xx = c.persist(x) yield _wait(xx) assert w.data[x.key] == 2
@delayed def extract_feature_image(img, feature_type, feature_coord=None): """Extract the haar feature for the current image""" ii = integral_image(img) return haar_like_feature(ii, 0, 0, ii.shape[0], ii.shape[1], feature_type=feature_type, feature_coord=feature_coord) # To speed up the example, extract only a few of the available feature types feature_types = ['type-4', 'type-2-x', 'type-2-y'] # Build a computation graph using Dask. This allows the use of multiple # CPU cores later during the actual computation X = delayed(extract_feature_image(img, feature_types) for img in images) # Compute the result t_start = time() X = np.array(X.compute(scheduler='threads')) time_full_feature_comp = time() - t_start
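The delayed(... for img in images) line wraps a generator of per-image delayed calls into one lazy list. A tiny self-contained sketch of that pattern, with a toy feature function in place of the Haar extractor:

# Toy version of the delayed-generator pattern: extract_feature is decorated
# with @delayed, and the outer delayed(...) wraps the generator of lazy calls.
import numpy as np
from dask import delayed


@delayed
def extract_feature(img):
    return img.mean(axis=(0, 1))          # placeholder "feature"


images = [np.random.random((8, 8, 3)) for _ in range(10)]
X = delayed(extract_feature(img) for img in images)   # one lazy list of features
X = np.array(X.compute(scheduler='threads'))
print(X.shape)                                         # (10, 3)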
def to_sql( df, name: str, uri: str, schema=None, if_exists: str = "fail", index: bool = True, index_label=None, chunksize=None, dtype=None, method=None, compute=True, parallel=False, ): """ Store Dask Dataframe to a SQL table An empty table is created based on the "meta" DataFrame (and conforming to the caller's "if_exists" preference), and then each block calls pd.DataFrame.to_sql (with `if_exists="append"`). Databases supported by SQLAlchemy [1]_ are supported. Tables can be newly created, appended to, or overwritten. Parameters ---------- name : str Name of SQL table. uri : string Full sqlalchemy URI for the database connection schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. if_exists : {'fail', 'replace', 'append'}, default 'fail' How to behave if the table already exists. * fail: Raise a ValueError. * replace: Drop the table before inserting new values. * append: Insert new values to the existing table. index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column name in the table. index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional Specify the number of rows in each batch to be written at a time. By default, all rows will be written at once. dtype : dict or scalar, optional Specifying the datatype for columns. If a dictionary is used, the keys should be the column names and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode. If a scalar is provided, it will be applied to all columns. method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: * None : Uses standard SQL ``INSERT`` clause (one per row). * 'multi': Pass multiple values in a single ``INSERT`` clause. * callable with signature ``(pd_table, conn, keys, data_iter)``. Details and a sample callable implementation can be found in the section :ref:`insert method <io.sql.method>`. compute : bool, default True When true, call dask.compute and perform the load into SQL; otherwise, return a Dask object (or array of per-block objects when parallel=True) parallel : bool, default False When true, have each block append itself to the DB table concurrently. This can result in DB rows being in a different order than the source DataFrame's corresponding rows. When false, load each block into the SQL DB in sequence. Raises ------ ValueError When the table already exists and `if_exists` is 'fail' (the default). See Also -------- read_sql : Read a DataFrame from a table. Notes ----- Timezone aware datetime columns will be written as ``Timestamp with timezone`` type with SQLAlchemy if supported by the database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. .. versionadded:: 0.24.0 References ---------- .. [1] https://docs.sqlalchemy.org .. [2] https://www.python.org/dev/peps/pep-0249/ Examples -------- Create a table from scratch with 4 rows. >>> import pandas as pd >>> df = pd.DataFrame([ {'i':i, 's':str(i)*2 } for i in range(4) ]) >>> from dask.dataframe import from_pandas >>> ddf = from_pandas(df, npartitions=2) >>> ddf # doctest: +SKIP Dask DataFrame Structure: i s npartitions=2 0 int64 object 2 ... ... 3 ... ... Dask Name: from_pandas, 2 tasks >>> from dask.utils import tmpfile >>> from sqlalchemy import create_engine >>> with tmpfile() as f: ... 
db = 'sqlite:///%s' % f ... ddf.to_sql('test', db) ... engine = create_engine(db, echo=False) ... result = engine.execute("SELECT * FROM test").fetchall() >>> result [(0, 0, '00'), (1, 1, '11'), (2, 2, '22'), (3, 3, '33')] """ # This is the only argument we add on top of what Pandas supports kwargs = dict( name=name, con=uri, schema=schema, if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype, ) if method: if not PANDAS_GT_0240: raise NotImplementedError( "'method' requires pandas>=0.24.0. You have version %s" % PANDAS_VERSION ) else: kwargs["method"] = method def make_meta(meta): return meta.to_sql(**kwargs) make_meta = delayed(make_meta) meta_task = make_meta(df._meta) # Partitions should always append to the empty table created from `meta` above worker_kwargs = dict(kwargs, if_exists="append") if parallel: # Perform the meta insert, then one task that inserts all blocks concurrently: result = [ _extra_deps( d.to_sql, extras=meta_task, **worker_kwargs, dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs) ) for d in df.to_delayed() ] else: # Chain the "meta" insert and each block's insert result = [] last = meta_task for d in df.to_delayed(): result.append( _extra_deps( d.to_sql, extras=last, **worker_kwargs, dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs) ) ) last = result[-1] result = dask.delayed(result) if compute: dask.compute(result) else: return result
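A hedged usage sketch of the lazy path described in the docstring: compute=False returns a Delayed that can be combined with other work before a single dask.compute call, and parallel=True lets the blocks append concurrently (assumes sqlalchemy is installed; table and file names are illustrative, and exact behavior depends on the installed dask/pandas versions).

import dask
import pandas as pd
from dask.dataframe import from_pandas
from dask.utils import tmpfile

df = pd.DataFrame({'i': range(4), 's': [str(i) * 2 for i in range(4)]})
ddf = from_pandas(df, npartitions=2)

with tmpfile() as f:
    uri = 'sqlite:///%s' % f
    lazy_load = ddf.to_sql('test', uri, parallel=True, compute=False)
    dask.compute(lazy_load)   # partitions append concurrently after the meta insert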
def forecast(R, V, n_timesteps, n_ens_members, n_cascade_levels, R_thr=None, kmperpixel=None, timestep=None, extrap_method="semilagrangian", decomp_method="fft", bandpass_filter_method="gaussian", noise_method="nonparametric", noise_stddev_adj=True, ar_order=2, vel_pert_method=None, conditional=False, use_precip_mask=True, use_probmatching=True, mask_method="incremental", callback=None, return_output=True, seed=None, num_workers=None, extrap_kwargs={}, filter_kwargs={}, noise_kwargs={}, vel_pert_kwargs={}): """Generate a nowcast ensemble by using the Short-Term Ensemble Prediction System (STEPS) method. Parameters ---------- R : array-like Array of shape (ar_order+1,m,n) containing the input precipitation fields ordered by timestamp from oldest to newest. The time steps between the inputs are assumed to be regular, and the inputs are required to have finite values. V : array-like Array of shape (2,m,n) containing the x- and y-components of the advection field. The velocities are assumed to represent one time step between the inputs. All values are required to be finite. n_timesteps : int Number of time steps to forecast. n_ens_members : int The number of ensemble members to generate. n_cascade_levels : int The number of cascade levels to use. Other Parameters ---------------- R_thr : float Specifies the threshold value for minimum observable precipitation intensity. Must be set if use_probmatching is True or conditional is True. kmperpixel : float Spatial resolution of the input data (kilometers/pixel). Required if vel_pert_method is not None or mask_method is 'incremental'. timestep : float Time step of the motion vectors (minutes). Required if vel_pert_method is not None or mask_method is 'incremental'. extrap_method : {'semilagrangian'} Name of the extrapolation method to use. See the documentation of pysteps.advection.interface. decomp_method : {'fft'} Name of the cascade decomposition method to use. See the documentation of pysteps.cascade.interface. bandpass_filter_method : {'gaussian', 'uniform'} Name of the bandpass filter method to use with the cascade decomposition. See the documentation of pysteps.cascade.interface. noise_method : {'parametric','nonparametric','ssft','nested'} Name of the noise generator to use for perturbating the precipitation field. See the documentation of pysteps.noise.interface. noise_stddev_adj : bool Optional adjustment for the standard deviations of the noise fields added to each cascade level. See pysteps.noise.utils.compute_noise_stddev_adjs. ar_order : int The order of the autoregressive model to use. Must be >= 1. vel_pert_method : {'bps'} Name of the noise generator to use for perturbing the velocity field. See the documentation of pysteps.noise.interface. conditional : bool If set to True, compute the statistics of the precipitation field conditionally by excluding the areas where the values are below the threshold R_thr. use_precip_mask : bool If True, set pixels outside precipitation areas to the minimum value of the observed field. 
mask_method : {'obs', 'sprog', 'incremental'} The precipitation/no precipitation method to use with mask: 'obs' = apply R_thr to the most recently observed precipitation intensity field, 'sprog' = use the smoothed forecast field from S-PROG, where the AR(p) model has been applied, 'incremental' = iteratively buffer the mask with a certain rate (currently it is 1 km/min) use_probmatching : bool If True, apply probability matching to the forecast field in order to preserve the distribution of the most recently observed precipitation field. callback : function Optional function that is called after computation of each time step of the nowcast. The function takes one argument: a three-dimensional array of shape (n_ens_members,h,w), where h and w are the height and width of the input field R, respectively. This can be used, for instance, writing the outputs into files. return_output : bool Set to False to disable returning the outputs as numpy arrays. This can save memory if the intermediate results are written to output files using the callback function. seed : int Optional seed number for the random generators. num_workers : int The number of workers to use for parallel computation. Set to None to use all available CPUs. Applicable if dask is enabled. extrap_kwargs : dict Optional dictionary that is supplied as keyword arguments to the extrapolation method. filter_kwargs : dict Optional dictionary that is supplied as keyword arguments to the filter method. noise_kwargs : dict Optional dictionary that is supplied as keyword arguments to the initializer of the noise generator. vel_pert_kwargs : dict Optional dictionary that is supplied as keyword arguments to the initializer of the velocity perturbator. Returns ------- out : ndarray If return_output is True, a four-dimensional array of shape (n_ens_members,n_timesteps,m,n) containing a time series of forecast precipitation fields for each ensemble member. Otherwise, a None value is returned. 
See also -------- pysteps.advection.interface, pysteps.cascade.interface, pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs References ---------- :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013` """ _check_inputs(R, V, ar_order) if np.any(~np.isfinite(R)): raise ValueError("R contains non-finite values") if np.any(~np.isfinite(V)): raise ValueError("V contains non-finite values") if mask_method not in ["obs", "sprog", "incremental"]: raise ValueError( "unknown mask method %s: must be 'obs', 'sprog' or 'incremental'" % mask_method) if conditional and R_thr is None: raise Exception("conditional=True but R_thr is not set") if use_probmatching and R_thr is None: raise Exception("use_probmatching=True but R_thr is not set") if kmperpixel is None: if vel_pert_method is not None: raise Exception("vel_pert_method is set but kmperpixel=None") if mask_method == "incremental": raise Exception("mask_method='incremental' but kmperpixel=None") if timestep is None: if vel_pert_method is not None: raise Exception("vel_pert_method is set but timestep=None") if mask_method == "incremental": raise Exception("mask_method='incremental' but timestep=None") print("Computing STEPS nowcast:") print("------------------------") print("") print("Inputs:") print("-------") print("input dimensions: %dx%d" % (R.shape[1], R.shape[2])) if kmperpixel is not None: print("km/pixel: %g" % kmperpixel) if timestep is not None: print("time step: %d minutes" % timestep) print("") print("Methods:") print("--------") print("extrapolation: %s" % extrap_method) print("bandpass filter: %s" % bandpass_filter_method) print("decomposition: %s" % decomp_method) print("noise generator: %s" % noise_method) print("noise adjustment: %s" % ("yes" if noise_stddev_adj else "no")) print("velocity perturbator: %s" % vel_pert_method) print("conditional statistics: %s" % ("yes" if conditional else "no")) print("precipitation mask: %s" % ("yes" if use_precip_mask else "no")) print("mask method: %s" % mask_method) print("probability matching: %s" % ("yes" if use_probmatching else "no")) print("") print("Parameters:") print("-----------") print("number of time steps: %d" % n_timesteps) print("ensemble size: %d" % n_ens_members) print("number of cascade levels: %d" % n_cascade_levels) print("order of the AR(p) model: %d" % ar_order) if vel_pert_method is not None: vp_par = vel_pert_kwargs["p_pert_par"] vp_perp = vel_pert_kwargs["p_pert_perp"] print("velocity perturbations, parallel: %g,%g,%g" % \ (vp_par[0], vp_par[1], vp_par[2])) print("velocity perturbations, perpendicular: %g,%g,%g" % \ (vp_perp[0], vp_perp[1], vp_perp[2])) if conditional or use_probmatching: print("conditional precip. intensity threshold: %g" % R_thr) M, N = R.shape[1:] extrap_method = advection.get_method(extrap_method) R = R[-(ar_order + 1):, :, :].copy() if conditional or use_probmatching: MASK_thr = np.logical_and.reduce( [R[i, :, :] >= R_thr for i in range(R.shape[0])]) else: MASK_thr = None # advect the previous precipitation fields to the same position with the # most recent one (i.e.
transform them into the Lagrangian coordinates) extrap_kwargs = extrap_kwargs.copy() res = [] f = lambda R, i: extrap_method(R[i, :, :], V, ar_order - i, "min", ** extrap_kwargs)[-1] for i in range(ar_order): if not dask_imported: R[i, :, :] = f(R, i) else: res.append(dask.delayed(f)(R, i)) if dask_imported: R = np.stack( list(dask.compute(*res, num_workers=num_workers)) + [R[-1, :, :]]) # initialize the band-pass filter filter_method = cascade.get_method(bandpass_filter_method) filter = filter_method((M, N), n_cascade_levels, **filter_kwargs) # compute the cascade decompositions of the input precipitation fields decomp_method = cascade.get_method(decomp_method) R_d = [] for i in range(ar_order + 1): R_ = decomp_method(R[i, :, :], filter, MASK=MASK_thr) R_d.append(R_) # normalize the cascades and rearrange them into a four-dimensional array # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model R_c, mu, sigma = _stack_cascades(R_d, n_cascade_levels) R_d = None # compute lag-l temporal autocorrelation coefficients for each cascade level GAMMA = np.empty((n_cascade_levels, ar_order)) for i in range(n_cascade_levels): R_c_ = np.stack([R_c[i, j, :, :] for j in range(ar_order + 1)]) GAMMA[i, :] = correlation.temporal_autocorrelation(R_c_, MASK=MASK_thr) R_c_ = None _print_corrcoefs(GAMMA) if ar_order == 2: # adjust the lag-2 correlation coefficient to ensure that the AR(p) # process is stationary for i in range(n_cascade_levels): GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef( GAMMA[i, 0], GAMMA[i, 1]) # estimate the parameters of the AR(p) model from the autocorrelation # coefficients PHI = np.empty((n_cascade_levels, ar_order + 1)) for i in range(n_cascade_levels): PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :]) _print_ar_params(PHI, False) # discard all except the p-1 last cascades because they are not needed for # the AR(p) model R_c = R_c[:, -ar_order:, :, :] # stack the cascades into a five-dimensional array containing all ensemble # members R_c = np.stack([R_c.copy() for i in range(n_ens_members)]) # initialize the random generators if noise_method is not None: randgen_prec = [] randgen_motion = [] np.random.seed(seed) for j in range(n_ens_members): rs = np.random.RandomState(seed) randgen_prec.append(rs) seed = rs.randint(0, high=1e9) rs = np.random.RandomState(seed) randgen_motion.append(rs) seed = rs.randint(0, high=1e9) R_min = np.min(R) if noise_method is not None: # get methods for perturbations init_noise, generate_noise = noise.get_method(noise_method) # initialize the perturbation generator for the precipitation field pp = init_noise(R[-1, :, :], **noise_kwargs) if noise_stddev_adj: print("Computing noise adjustment factors... ", end="") sys.stdout.flush() starttime = time.time() noise_std_coeffs = noise.utils.compute_noise_stddev_adjs( R[-1, :, :], R_thr, R_min, filter, decomp_method, 10, conditional=True, num_workers=num_workers) print("%.2f seconds." % (time.time() - starttime)) else: noise_std_coeffs = np.ones(n_cascade_levels) if vel_pert_method is not None: init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method) # initialize the perturbation generators for the motion field vps = [] for j in range(n_ens_members): kwargs = { "randstate": randgen_motion[j], "p_pert_par": vp_par, "p_pert_perp": vp_perp } vp_ = init_vel_noise(V, 1. 
/ kmperpixel, timestep, **kwargs) vps.append(vp_) D = [None for j in range(n_ens_members)] R_f = [[] for j in range(n_ens_members)] if use_precip_mask: if mask_method == "obs": MASK_prec = R[-1, :, :] >= R_thr # add a slight buffer to the mask # n=5 # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (n,n)) # MASK_prec = MASK_prec.astype('uint8') # MASK_prec = cv2.dilate(MASK_prec,kernel).astype(bool) elif mask_method == "sprog": # compute the wet area ratio and the precipitation mask MASK_prec = R[-1, :, :] >= R_thr war = 1.0 * np.sum(MASK_prec) / (R.shape[1] * R.shape[2]) R_m = R_c.copy() elif mask_method == "incremental": # initialize precip mask for each member MASK_prec_ = R[-1, :, :] >= R_thr MASK_prec = [MASK_prec_.copy() for j in range(n_ens_members)] # initialize the structuring element struct = scipy.ndimage.generate_binary_structure(2, 1) # iterate it to expand it nxn n = timestep / kmperpixel struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.)) R = R[-1, :, :] print("Starting nowcast computation.") # iterate each time step for t in range(n_timesteps): print("Computing nowcast for time step %d... " % (t + 1), end="") sys.stdout.flush() starttime = time.time() # iterate each ensemble member def worker(j): if noise_method is not None: # generate noise field EPS = generate_noise(pp, randstate=randgen_prec[j]) # decompose the noise field into a cascade EPS = decomp_method(EPS, filter) else: EPS = None # iterate the AR(p) model for each cascade level for i in range(n_cascade_levels): # normalize the noise cascade if EPS is not None: EPS_ = (EPS["cascade_levels"][i, :, :] - EPS["means"][i]) / EPS["stds"][i] EPS_ *= noise_std_coeffs[i] else: EPS_ = None # apply AR(p) process to cascade level R_c[j, i, :, :, :] = \ autoregression.iterate_ar_model(R_c[j, i, :, :, :], PHI[i, :], EPS=EPS_) if use_precip_mask and mask_method == "sprog": # use a separate AR(p) model for the non-perturbed forecast, # from which the mask is obtained R_m[j, i, :, :, :] = \ autoregression.iterate_ar_model(R_m[j, i, :, :, :], PHI[i, :]) EPS = None EPS_ = None # compute the recomposed precipitation field(s) from the cascades # obtained from the AR(p) model(s) R_c_ = _recompose_cascade(R_c[j, :, :, :], mu, sigma) if use_precip_mask: # apply the precipitation mask to prevent generation of new # precipitation into areas where it was not originally # observed if mask_method == "obs": R_c_[~MASK_prec] = R_c_.min() elif mask_method == "incremental": R_c_[~MASK_prec[j]] = R_c_.min() elif mask_method == "sprog": # obtain the CDF from the non-perturbed forecast that is # scale-filtered by the AR(p) model R_m_ = _recompose_cascade(R_m[j, :, :, :], mu, sigma) R_s = R_m_.flatten() # compute the threshold value R_pct_thr corresponding to the # same fraction of precipitation pixels (forecast values above # R_min) as in the most recently observed precipitation field R_s.sort(kind="quicksort") x = 1.0 * np.arange(1, len(R_s) + 1)[::-1] / len(R_s) i = np.argmin(abs(x - war)) # handle ties if R_s[i] == R_s[i + 1]: i = np.where(R_s == R_s[i])[0][-1] + 1 R_pct_thr = R_s[i] # apply a mask obtained from the above to preserve the # wet-area ratio MASK_prec_ = R_m_ < R_pct_thr R_c_[MASK_prec_] = R_c_.min() if use_probmatching: ## adjust the conditional CDF of the forecast (precipitation ## intensity above the threshold R_thr) to match the most ## recently observed precipitation field R_c_ = probmatching.nonparam_match_empirical_cdf(R_c_, R) if use_precip_mask and mask_method == "incremental": MASK_prec_ = R_c_ >= R_thr 
MASK_prec_ = scipy.ndimage.morphology.binary_dilation( MASK_prec_, struct) MASK_prec[j] = MASK_prec_ # compute the perturbed motion field if vel_pert_method is not None: V_ = V + generate_vel_noise(vps[j], t * timestep) else: V_ = V # advect the recomposed precipitation field to obtain the forecast # for time step t extrap_kwargs.update({"D_prev": D[j], "return_displacement": True}) R_f_, D_ = extrap_method(R_c_, V_, 1, **extrap_kwargs) D[j] = D_ R_f_ = R_f_[0] return R_f_ res = [] for j in range(n_ens_members): if not dask_imported or n_ens_members == 1: res.append(worker(j)) else: res.append(dask.delayed(worker)(j)) R_f_ = dask.compute(*res, num_workers=num_workers) \ if dask_imported and n_ens_members > 1 else res res = None print("%.2f seconds." % (time.time() - starttime)) if callback is not None: callback(np.stack(R_f_)) R_f_ = None if return_output: for j in range(n_ens_members): R_f[j].append(R_f_[j]) if return_output: if n_ens_members == 1: return np.stack(R_f[0]) else: return np.stack([np.stack(R_f[j]) for j in range(n_ens_members)]) else: return None
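The per-member worker(j) fan-out in forecast is the standard delayed pattern for embarrassingly parallel ensembles. A stripped-down, self-contained sketch with a toy worker and no pysteps dependencies:

# Stripped-down sketch of the ensemble fan-out used in forecast(): one delayed
# task per member, computed together, with a plain loop fallback when dask is
# unavailable or there is only one member.
import numpy as np

try:
    import dask
    dask_imported = True
except ImportError:
    dask_imported = False

n_ens_members = 4


def worker(j):
    rng = np.random.RandomState(j)        # toy per-member computation
    return rng.random_sample((16, 16))


res = []
for j in range(n_ens_members):
    if not dask_imported or n_ens_members == 1:
        res.append(worker(j))
    else:
        res.append(dask.delayed(worker)(j))

fields = (dask.compute(*res, num_workers=2)
          if dask_imported and n_ens_members > 1 else res)
print(np.stack(fields).shape)             # (4, 16, 16)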
def _predict_kwds_for_cat( self, feature_matrix: np.array, cat_index: int, predictions: np.array, cat_indices: Dict[str, List[int]] = None, use_dask: bool = True, pbar: tqdm = None, ): """ Make predictions for all documents for all concepts in the given category Args: feature_matrix: array of features for each document cat_index: index in categories attribute of the given category predictions: the h5 dataset where predictions are stored cat_indices: Predicted indices where categories occur for each category use_dask: Use dask for multiprocessing pbar: tqdm progress bar """ cat = self.categories[cat_index] pbar.set_postfix(category=cat, refresh=False) if (cat_indices is not None) and (cat != ""): feature_matrix_test = feature_matrix[cat_indices[cat], :] # this could be a problem if I want everything to perfectly align. else: feature_matrix_test = feature_matrix if feature_matrix_test.shape[0] == 0: pbar.update(len(self.cat_concept_indices[cat_index])) return 0 # TODO: for good bar, should walk tasks to compute total cat_concept_cols = self.cat_concept_indices[cat_index] # use the np.where here, bool index for initial setting? if False: # use_dask is True: feature_matrix_test = dask.delayed(feature_matrix_test) jobs = [] ProgressBar().register() for concept_index in cat_concept_cols: j = dask.delayed(self._predict_one_clf)(feature_matrix_test, concept_index, cat, pbar) jobs.append(j) vals = dask.compute(jobs)[0] else: vals = [] for concept_index in cat_concept_cols: val = self._predict_one_clf(feature_matrix_test, concept_index, cat, pbar) vals.append(val) if (cat_indices is not None) and (cat != ""): # need to correct indices, zeros in places with no predictions # TODO: determine if this patching activity # takes longer than just predicting on more new_vals = [] for v in vals: new_v = np.zeros(feature_matrix.shape[0]) new_v[cat_indices[cat]] = v new_vals.append(new_v) vals = new_vals # TODO: below will not work with cat_inds if len(vals) > 0: topic_preds_sub = np.stack(vals, axis=1) predictions[cat_index, :, cat_concept_cols] = topic_preds_sub
def return_gsea_capsules(ma=None, tissue='', context_on=False, use_set=False, gsea_superset='H', n_top_sets=25, min_capsule_len=2000, all_genes=False, union_cpgs=True, limited_capsule_names_file=''): global gene2cpg, gsea_collections, gene_set_weights if limited_capsule_names_file: with open(limited_capsule_names_file) as f: limited_capsule_names = f.read().replace('\n', ' ').split() else: limited_capsule_names = [] allcpgs = ma.beta.columns.values entire_sets = use_set collection = gsea_superset gene2cpg = pickle.load(open(gene2cpg, 'rb')) if all_genes: gene_sets = list(gene2cpg.keys()) else: gsea = pickle.load(open(gsea_collections, 'rb')) if tissue: gene_sets = pd.read_csv(gene_set_weights[collection], sep='\t', index_col=0) if tissue != 'ubiquitous': gene_sets = (gene_sets.quantile(1., axis=1) - gene_sets.quantile( 0., axis=1)).sort_values().index.tolist() else: gene_sets = gene_sets[tissue].sort_values( ascending=False).index.tolist() else: gene_sets = list(gsea[collection].keys()) intersect_context = False if limited_capsule_names_file: gene_sets_tmp = np.intersect1d(gene_sets, limited_capsule_names).tolist() print('LIMITED GENE CAPS', gene_sets_tmp) if gene_sets_tmp: gene_sets = gene_sets_tmp intersect_context = True if not tissue: n_top_sets = 0 if n_top_sets and not all_genes: gene_sets = gene_sets[:n_top_sets] capsules = dict() if all_genes: entire_sets = False if entire_sets: context_on = False def process_gene_set(gene_set): capsules = [] gene_set_cpgs = [] for genename in (gsea[collection][gene_set] if not all_genes else [gene_set]): gene = gene2cpg.get(genename, {'Gene': [], 'Upstream': []}) if context_on: for k in ['Gene', 'Upstream']: context = gene.get(k, []) if len(context): capsules.append(('{}_{}'.format(genename, k), list(context))) #capsules['{}_{}'.format(genename,k)]=context.tolist() else: if not entire_sets: capsules.append((genename, np.union1d(gene.get('Gene', []), gene.get('Upstream', [])).tolist())) #capsules[genename]=np.union1d(gene.get('Gene',[]),gene.get('Upstream',[])).tolist() else: upstream = gene.get('Upstream', []) gene = gene.get('Gene', []) cpg_set = np.union1d(gene, upstream) if cpg_set.tolist(): gene_set_cpgs.append(cpg_set) if entire_sets and not all_genes: capsules.append((gene_set, reduce(np.union1d, gene_set_cpgs).tolist())) #capsules[gene_set]=reduce(np.union1d,gene_set_cpgs).tolist() return capsules def process_chunk(chunk): with ProgressBar(): chunk = dask.compute(*chunk, scheduler='threading') return chunk with ProgressBar(): capsules = dict( list( reduce( lambda x, y: x + y, dask.compute(*[ dask.delayed(process_gene_set)(gene_set) for gene_set in gene_sets ], scheduler='threading')))) capsules2 = [] #caps_lens=np.array([len(capsules[capsule]) for capsule in capsules]) # cluster = LocalCluster(n_workers=multiprocessing.cpu_count()*2, threads_per_worker=20) # client = Client(cluster) capsule_names = list(capsules.keys()) if intersect_context: capsules_tmp_names = np.intersect1d(capsule_names, limited_capsule_names).tolist() if capsules_tmp_names: capsules = {k: capsules[k] for k in capsules_tmp_names} capsule_names = capsules_tmp_names capsules = reduce_caps(capsules, allcpgs, min_capsule_len) # print(capsule_names) # capsules_bag=db.from_sequence(list(capsules.values())) # capsules_intersect=capsules_bag.map(lambda x: np.intersect1d(x,allcpgs)) # capsules_len=capsules_intersect.map(lambda x: x if len(x) >= min_capsule_len else []) # # with get_task_stream(plot='save', filename="task-stream.html") as ts: # capsules=capsules_len.compute() # 
#print(capsules) # capsules=dict([(capsule_names[i],capsules[i].tolist()) for i in range(len(capsule_names)) if len(capsules[i])]) # for capsule in capsules: # capsules2.append([capsule,dask.delayed(return_caps)(capsules[capsule],allcpgs,min_capsule_len)]) # cpus=multiprocessing.cpu_count() # caps_chunks=list(divide_chunks(capsules2,cpus)) # p=Pool(cpus) # capsules=dict(list(reduce(lambda x,y: x+y,p.map(process_chunk,caps_chunks)))) # with ProgressBar(): # capsules=dask.compute(capsules2,scheduler='threading')[0] #print(capsules) modules = list(capsules.values( )) #[capsules[capsule] for capsule in capsules if capsules[capsule]] modulecpgs = reduce((np.union1d if union_cpgs else (lambda x, y: x + y)), modules).tolist() module_names = list(capsules.keys()) return modules, modulecpgs, module_names
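A minimal sketch of the ProgressBar-wrapped, threaded dask.compute fan-out used in return_gsea_capsules, with toy gene sets standing in for the GSEA pickles:

# One delayed task per gene set, computed together on the threaded scheduler,
# then flattened into a single capsule dict. Toy data only.
import dask
from dask.diagnostics import ProgressBar
from functools import reduce

gene_sets = {'set_a': ['cpg1', 'cpg2'], 'set_b': ['cpg2', 'cpg3']}


def process_gene_set(name):
    # placeholder: return a list of (capsule_name, cpg_list) pairs
    return [(name, gene_sets[name])]


with ProgressBar():
    capsules = dict(
        reduce(lambda x, y: x + y,
               dask.compute(*[dask.delayed(process_gene_set)(n)
                              for n in gene_sets],
                            scheduler='threading')))
print(capsules)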
def plot_paths_to_files_delayed(G, paths, orig_points, dest_points, boundaries, dirpath): assert os.path.exists(dirpath) figures = [] for n in tqdm(range(len(paths))): f, ax = plt.subplots(figsize=(30, 30)) delayed(boundaries.plot)(edgecolor="red", ax=ax) delayed(plot_graph)(G, ax) x, y = zip(*paths[n][1]) delayed(ax.plot)(x, y, c="k", lw=20, alpha=0.5) delayed(ax.scatter)(orig_points.iloc[n].x, orig_points.iloc[n].y, color="green", s=500) delayed(ax.scatter)(x[0], y[0], color="red", s=500) delayed(ax.scatter)(x[-1], y[-1], color="green", s=500) delayed(ax.scatter)(dest_points.x, dest_points.y, color="k", s=250) figures.append(delayed(f.savefig)(f"{dirpath}/{n}.png")) delayed(f.clf)() delayed(plt.close)(f) with ProgressBar(): compute(*figures)
def add_data(self, dates, box=None, country=None, state=None, site=None, resample=True, window='H'): """ dates : list of datetime objects description box : list of floats [latmin, lonmin, latmax, lonmax] country : state : site : resample : boolean window : """ from numpy import NaN self.dates = pd.to_datetime(dates) idate = dates[0] year = idate.strftime('%Y') url = 'https://www1.ncdc.noaa.gov/pub/data/noaa/' + year + '/' if self.history is None: self.read_ish_history() self.history['fname'] = url + self.history.usaf + \ '-' + self.history.wban + '-' + year + '.gz' dfloc = self.history.copy() # if isinstance(box, None): # type(box) is not type(None): if box is not None: # type(box) is not type(None): print('Retrieving Sites in: ' + ' '.join(map(str, box))) dfloc = self.subset_sites(latmin=box[0], lonmin=box[1], latmax=box[2], lonmax=box[3]) elif country is not None: print('Retrieving Country: ' + country) dfloc = self.history.loc[self.history.ctry == country, :] elif state is not None: print('Retrieving State: ' + state) dfloc = self.history.loc[self.history.STATE == state, :] elif site is not None: print('Retrieving Site: ' + site) dfloc = self.history.loc[self.history.station_id == site, :] print(dfloc.fname.unique()) objs = self.get_url_file_objs(dfloc.fname.unique()) # return objs,size,self.history.fname # dfs = [] # for f in objs: # try: # dfs.append(self.read_data_frame(f)) # except: # pass print(' Reading ISH into pandas DataFrame...') dfs = [dask.delayed(self.read_data_frame)(f) for f in objs] dff = dd.from_delayed(dfs) self.df = dff.compute() self.df.loc[self.df.vsb == 99999, 'vsb'] = NaN if resample: print(' Resampling to every ' + window) self.df.index = self.df.time self.df = self.df.groupby('station_id').resample( 'H').mean().reset_index() # this was encoded as byte literal but in dfloc it is a string so could # not merge on station_id correctly. try: self.df['station_id'] = self.df['station_id'].str.decode("utf-8") except RuntimeError: pass self.df = self.df.merge( dfloc[['station_id', 'latitude', 'longitude', 'station name']], on=['station_id'], how='left') return self.df.copy()
image = images[0] image = draw_haar_like_feature(image, 0, 0, images.shape[2], images.shape[1], [feature_coord[idx_sorted[idx]]]) ax.imshow(image) ax.set_xticks([]) ax.set_yticks([]) _ = fig.suptitle('The most important features') images = lfw_subset() feature_types = None X = delayed(extract_feature_image(img, feature_types) for img in images) X = np.array(X.compute(scheduler='threads')) y = np.array([1] * 100 + [0] * 100) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=150, random_state=0, stratify=y) feature_coord, feature_type = \ haar_like_feature_coord(width=images.shape[2], height=images.shape[1], feature_type=feature_types)