def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]

    yield c._shutdown()
    yield f._shutdown()
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)

    filter_dirty = any(args.packages) or not args._all
    outputs = get_dask_outputs(args.path, packages=args.packages,
                               filter_dirty=filter_dirty, git_rev=args.git_rev,
                               stop_rev=args.stop_rev, steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize, test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs, filename=args.visualize)  # create neat looking graph.
    else:
        # many threads, because this is just the dispatch.  Takes very little compute.
        # Only waiting for build complete.
        cluster = LocalCluster(n_workers=1, threads_per_worker=args.threads, nanny=False)
        client = Client(cluster)
        futures = client.persist(outputs)
        progress(futures)
class SchedulerComputeDepsInMemory(object):

    def setup(self):
        self.client = Client()

        # Generate 10 independent tasks
        x = [delayed(random.random)() for _ in range(10)]
        # Generate lots of interrelated dependent tasks
        n = 200
        for _ in range(10, n):
            random_subset = [random.choice(x) for _ in range(5)]
            random_max = delayed(max)(random_subset)
            x.append(random_max)

        # Persist tasks into distributed memory and wait to finish
        y = self.client.persist(x)
        wait(y)
        self.x = x

    def teardown(self):
        self.client.close()

    def time_compute_deps_already_in_memory(self):
        """
        Measure compute time when dependent tasks are already in memory.

        xref https://github.com/dask/distributed/pull/3293
        """
        compute(*self.x, scheduler=self.client)
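# A minimal sketch of driving the benchmark above by hand; the asv-style harness
# implied by the setup/teardown/time_* naming is an assumption, not shown in the
# original module.
bench = SchedulerComputeDepsInMemory()
bench.setup()
try:
    bench.time_compute_deps_already_in_memory()
finally:
    bench.teardown()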
def test_persist(dsf):
    correct = dsf.compute().todense()
    client = Client()
    persisted = client.persist(dsf)
    res = persisted.compute().todense()
    pdt.assert_frame_equal(res, correct)
def dask_evaluate(outputs):
    utils.port_increment += 2
    scheduler_port = 8786 + utils.port_increment
    diagnostics_port = 8787 + utils.port_increment
    cluster = LocalCluster(n_workers=1, threads_per_worker=10, nanny=False,
                           scheduler_port=scheduler_port,
                           diagnostics_port=diagnostics_port)
    client = Client(cluster)
    futures = client.persist(outputs)
    return client.gather(futures)
def distribute_dask_df(dask_df):
    """
    Distribute a dask dataframe over a client that's accessible via the
    global DASK_CLIENT
    """
    from distributed import Client
    global DASK_CLIENT
    DASK_CLIENT = Client()
    dask_df = DASK_CLIENT.persist(dask_df)
    return dask_df
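# A hypothetical usage sketch for distribute_dask_df; the input file pattern and
# the row count below are illustrative, not taken from the original module.
import dask.dataframe as dd

df = dd.read_csv('data/*.csv')   # hypothetical input files
df = distribute_dask_df(df)      # now backed by futures held by DASK_CLIENT
print(len(df))                   # later computations reuse the persisted partitions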
        # Tail of the Cassandra helper class: end of its __init__ plus the
        # save_prediction method used by persist_partition below.
        self.cluster = Cluster([self.MORPHL_SERVER_IP_ADDRESS],
                               auth_provider=self.auth_provider)
        self.session = self.cluster.connect(self.MORPHL_CASSANDRA_KEYSPACE)
        self.prep_stmt = self.session.prepare(self.CQL_STMT)

    def save_prediction(self, client_id, prediction):
        bind_list = [client_id, prediction]
        self.session.execute(self.prep_stmt, bind_list, timeout=self.CASS_REQ_TIMEOUT)


def batch_inference_on_partition(partition_df):
    churn_model_file = f'/opt/models/{DAY_AS_STR}_{UNIQUE_HASH}_churn_model.h5'
    churn_model = load_model(churn_model_file)
    prediction = churn_model.predict(partition_df.drop(['client_id'], axis=1))[0][0]
    return prediction


def persist_partition(partition_df):
    def persist_one_prediction(series_obj):
        cassandra.save_prediction(series_obj.client_id, series_obj.prediction)

    cassandra = Cassandra()
    partition_df.apply(persist_one_prediction, axis=1)
    return 0


if __name__ == '__main__':
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    dask_df.client_id.count().compute()
    dask_df['prediction'] = dask_df.map_partitions(batch_inference_on_partition,
                                                   meta=('prediction', float))
    dask_df['token'] = dask_df.map_partitions(persist_partition,
                                              meta=('token', int))
    dask_df.token.compute()
# Encoding: UTF-8 -*-
import numpy as np
from distributed import Client, progress
import dask.array as da

dask_client = Client(scheduler_file="/global/cscratch1/sd/rkube/scheduler.json")

with np.load("../dask_fft_data_s0000.npz") as df:
    num_channels, num_fft = df["fft_data"].shape
    print(num_channels, num_fft)
    fft_data = da.from_array(df["fft_data"], chunks=(1, num_fft))

# re-assign: persist returns a new, futures-backed collection
fft_data = dask_client.persist(fft_data)

# Calculate the crosspower using the array interface
res1 = (fft_data[:2, :] * fft_data[-2:, :].conj()).mean(axis=1)
print("type res1 = ", type(res1))
res2 = da.arctan2(res1.real, res1.imag).real
print("type res2 = ", type(res2))
print("result res2 = ", res2.compute())


# Calculate the crosspower using the distributed interface
def cross_phase(ft_data, ch1, ch2):
    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    print("** crosspower: type(tmp1) =", type(_tmp1))
    _tmp2 = np.arctan2(_tmp1.real, _tmp1.imag).real
    #_tmp2 = _tmp1.real + _tmp1.imag
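# A hedged sketch of finishing and calling the truncated helper above; returning
# the phase and the channel pair (0, -2) are assumptions, not from the original
# script.
def cross_phase_sketch(ft_data, ch1, ch2):
    # mean cross-power between the two channels, computed on the cluster
    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    # phase angle of the averaged cross-power, mirroring the array-interface version
    return np.arctan2(_tmp1.real, _tmp1.imag).real

print("cross phase = ", cross_phase_sketch(fft_data, 0, -2))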
#
# - Display head of the dataframe
# - Display number of rows of this dataframe.
# - Compute the total number of passengers.
# - Count occurrences in the payment_type column both for the full dataset, and filtered by zero tip (tip_amount == 0).
# - Create a new column, tip_fraction
# - Plot the average of the new column tip_fraction grouped by day of week.
# - Plot the average of the new column tip_fraction grouped by hour of day.
#
# [Dask dataframe documentation](http://docs.dask.org/en/latest/dataframe.html)
#

# +
import dask.dataframe as dd
from distributed import Client, progress

c = Client('127.0.0.1:8786')

nyc2014 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2014/yellow*.csv',
                      parse_dates=['pickup_datetime', 'dropoff_datetime'],
                      skipinitialspace=True)
nyc2015 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2015/yellow*.csv',
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

nyc2014, nyc2015 = c.persist([nyc2014, nyc2015])

progress(nyc2014, nyc2015)
# -
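# A minimal sketch of the steps listed above, applied to the persisted `nyc2015`
# dataframe. The `passenger_count` and `fare_amount` columns are assumptions based
# on the NYC TLC schema; `payment_type`, `tip_amount` and `tpep_pickup_datetime`
# come from the cell above.

# +
nyc2015.head()
n_rows = len(nyc2015)
total_passengers = nyc2015.passenger_count.sum().compute()

# payment_type counts, full dataset and zero-tip subset
payments_all = nyc2015.payment_type.value_counts().compute()
payments_no_tip = nyc2015[nyc2015.tip_amount == 0].payment_type.value_counts().compute()

# new column tip_fraction, guarding against zero fares
nyc2015 = nyc2015[nyc2015.fare_amount > 0]
nyc2015['tip_fraction'] = nyc2015.tip_amount / nyc2015.fare_amount

# average tip_fraction by day of week and by hour of day
by_dow = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.dayofweek).tip_fraction.mean().compute()
by_hour = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.hour).tip_fraction.mean().compute()

by_dow.plot(kind='bar')
by_hour.plot()
# -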
client = Client(LocalCluster())
print(client._repr_html_())

network_graph, meta_graph = create_graph(expression_matrix.as_matrix(),
                                         gene_names,
                                         tf_names,
                                         "GBM",
                                         SGBM_KWARGS,
                                         client=client,  # broadcast!
                                         early_stop_window_length=25,
                                         include_meta=True)

# Good!
a, b = client.persist([network_graph, meta_graph])
network_df = a.compute(sync=True)
meta_df = b.compute(sync=True)

# Bad!
# network_df, meta_df = client.compute([network_graph, meta_graph], sync=True)

if client:
    client.close()

network_df.to_csv(net_out_path, sep='\t', index=False)
meta_df.to_csv(meta_out_path, sep='\t', index=False)

end_time = time.time()

print('wall time: {} seconds'.format(end_time - start_time))
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
        raise ValueError("n_files = {0} > n_workers {1}".format(
            no_of_files, n_workers))

    # start script
    h5filelist = sorted(utils.getFileList(data_dir, 'h5'))
    # print(h5filelist[:no_of_files])
    file_size = utils.getFileSizeInGB(h5filelist[:no_of_files])
    print("File size to read in is {0:.0f} GB".format(file_size))

    cluster = LocalCluster(n_workers=n_workers, threads_per_worker=n_threads)
    client = Client(cluster)
    print("Using {0} workers and {1} thread(s) per worker".format(
        n_workers, n_threads))

    df = ddf.read_hdf(h5filelist[:no_of_files], 'csv', columns=columns)

    start_time = time.time()
    df = client.persist(df)
    print(df.index.compute())
    end_time = time.time()
    read_time = end_time - start_time
    # print("lines read in =", df.shape)
    print("Reading in {0} files in {1:.0f}s ".format(no_of_files, read_time))
    df_size = np.sum(df.memory_usage().compute()) / 1e9

    if args.output_json:
        from collections import OrderedDict
        import json

        timing_info = OrderedDict({})
        timing_info['git_commit'] = utils.capture_multiline_output(
            'git show | head -1')[0]
# In[2]:

cluster = LocalCluster(n_workers=10)
client = Client(cluster)


# In[3]:

fs = gcsfs.GCSFileSystem(token='anon')
f = fs.open("anaconda-public-data/nyc-taxi/nyc.parquet/part.0.parquet")
pf = ParquetFile(f)
df1 = pf.to_pandas()
df2 = dd.from_pandas(data=df1, npartitions=3)
df_clust = client.persist(df2)


# In[41]:

df_clust.head(5)


# In[40]:

df3 = df_clust.groupby(df_clust.passenger_count).trip_distance.mean().compute()


# In[39]:
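# In[ ]:

# A hedged alternative to the cells above: read the parquet part directly into a
# dask dataframe with read_parquet instead of going through pandas. This assumes
# gcsfs and a parquet engine (e.g. fastparquet) are installed; the variable name
# df_direct is illustrative.
df_direct = dd.read_parquet("gcs://anaconda-public-data/nyc-taxi/nyc.parquet/part.0.parquet",
                            storage_options={"token": "anon"})
df_direct = client.persist(df_direct)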