def main():
    """Submit one ``train_rf`` task per tree count (1..29) to a local Dask cluster.

    Builds a synthetic dataset (1M rows, 5 features; target = row mean),
    broadcasts it to every worker, fans out the training tasks, prints each
    result as it completes, and always tears the cluster down.

    Requires ``np`` (numpy), ``LocalCluster``, ``Client``, ``as_completed``
    and ``train_rf`` to be in scope at module level.
    """
    # Synthetic regression data; presumably train_rf(e, x, y) trains a
    # random forest with e trees — TODO confirm against train_rf's definition.
    x = np.random.normal(size=(1000000, 5))
    y = x.mean(axis=1)

    cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='1G')
    client = Client(cluster)
    try:
        print(client)
        print("scattering")
        # broadcast=True puts a copy of the arrays on every worker so each
        # task reads them locally instead of pulling them per-task.
        [x_ref, y_ref] = client.scatter([x, y], broadcast=True)

        jobs = []
        for e in range(1, 30):
            print(e)
            jobs.append(client.submit(train_rf, e, x_ref, y_ref))

        for job in as_completed(jobs):
            print(job.result())
            # FIX: `del job` alone never freed the future — the `jobs` list
            # still held a reference, so the scheduler kept every result in
            # memory. Drop the list's reference too, then release the local.
            jobs.remove(job)
            del job
            client.rebalance()
    finally:
        # FIX: close unconditionally so a failed task cannot leak the
        # cluster's worker processes.
        client.close()
        cluster.close()
# Load a local CSV and spread it across a remote Dask cluster as a
# 20-partition dask.dataframe.
import pandas as pd  # FIX: was `import pandas`, but the code below uses the `pd` alias
import dask.dataframe as dd
from dask.distributed import Client

# NOTE(review): hard-coded scheduler address — consider making this configurable.
client = Client("10.110.122.238:8888")

df = pd.read_csv('trainingData.csv')
future = client.scatter(df)                      # send dataframe to one worker
ddf = dd.from_delayed([future], meta=df)         # build dask.dataframe on remote data
ddf = ddf.repartition(npartitions=20).persist()  # split into 20 partitions
client.rebalance(ddf)                            # spread around all of your workers