def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)
    assert type(b) == type(c)
    assert b.dask == c.dask

    yield e._shutdown()
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json' % ss.port)
    out = json.loads(response.body.decode())

    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch(
        'http://localhost:%s/memory-load-by-key.json' % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())
    assert sum(v for d in out.values() for v in d.values()) == \
        sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))
    df = yield _futures_to_dask_dataframe(dfs)

    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)
def test_framework_runs(self):
    with MesosCluster() as cluster:
        time.sleep(2)
        driver = DistributedDriver().create_driver(DistributedScheduler)
        driver.start()
        time.sleep(5)
        expect(cluster).to(have_activated_slaves(1))
        expect(cluster).to(have_framework_name('distributed-framework'))

        # distributed test - this probably doesn't belong here
        executor = Executor('127.0.0.1:8787')
        A = executor.map(lambda x: x**2, range(10))
        B = executor.map(lambda x: -x, A)
        total = executor.submit(sum, B)
        expect(total.result()).to(equal(-285))
        driver.stop()
def f(c, a, b):
    e = Executor((c.ip, c.port), start=False)
    IOLoop.current().spawn_callback(e._go)

    remote_dfs = e.map(lambda x: x, dfs)
    ddf = yield _futures_to_dask_dataframe(e, remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()
def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()
def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_results = yield e._get(y.dask, y._keys())
    yy = da.Array._finalize(y, y_results)
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result, = e.compute(y)
    yy = yield y_result._result()
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100 * 10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5), remote._result())
        assert_equal(local, remote)

    yield e._shutdown()
import argparse


def inject(url):
    import config
    from pymongo import MongoClient
    import os

    # connstring = config.db['host'] + ":" + config.db['port']

    # run sqlmap against the URL and capture its console output
    run = "sqlmap -u " + url + ' --batch '
    command = os.popen(run).read()

    # store the result for this URL in MongoDB
    data = {'url': url, 'output': command}
    client = MongoClient('192.168.1.14:27017')
    dbn = config.db['dbname']
    db = client[dbn]
    results = db.results
    results.insert(data)
    return command


executor = Executor()

parser = argparse.ArgumentParser()
parser.add_argument("-f")
args = parser.parse_args()

# read the target URLs, one per line, from the file given with -f
links = []
s = open(args.f, 'r')
for x in s:
    links.append(x.rstrip().replace("\n", ""))

job = executor.map(inject, links)
print(executor.gather(job))
if (not args.append) and (os.path.exists(out_path)):
    shutil.rmtree(out_path)

if not os.path.exists(out_path):
    os.makedirs(os.path.join(out_path, 'feat/df'))
    os.makedirs(os.path.join(out_path, 'feat/desc'))

odDicts = [{'flight_hdf': args.uvan,
            'img_num': ii,
            'kp_det_func': kp_type_dict[kp_type],
            'kp_desc_func': kp_type_dict[kp_type],
            'p_meta': tf_meta,
            'o_path': out_path}
           for ii in range(num_imgs)]

r = executor.map(extract_kp_from_frame, odDicts, pure=False)
kp_list = executor.gather(r)

kp_meta = pd.DataFrame(kp_list, columns=['num_feat', 'center_lon',
                                         'center_lat', 'df_path', 'desc_path',
                                         'flight', 'img_num'])
kp_meta.to_hdf(os.path.join(out_path, 'feat_meta.hdf'), key='feat_meta')
print('what')
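# A brief aside on the pure=False flag used above (not part of the pipeline
# itself): distributed derives task keys by hashing the function and its
# arguments, so identical calls are normally deduplicated into one task;
# pure=False forces a fresh key per call, which is what you want for
# non-deterministic work such as feature extraction with I/O. A minimal
# sketch of the difference, assuming a scheduler is already listening on
# 127.0.0.1:8786; the lambda and inputs below are illustrative only.
from distributed import Executor

executor = Executor('127.0.0.1:8786')

inputs = [{'img_num': 0}, {'img_num': 0}]                       # two identical work items
pure_futures = executor.map(lambda d: d, inputs)                # default pure=True
impure_futures = executor.map(lambda d: d, inputs, pure=False)  # unique key per call

print(len({f.key for f in pure_futures}))    # expected: 1 (identical calls collapsed)
print(len({f.key for f in impure_futures}))  # expected: 2 (one task per call)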
class DistributedContext(object):
    io_loop = None
    io_thread = None

    def __init__(self,
                 ip="127.0.0.1",
                 port=8787,
                 spawn_workers=0,
                 write_partial_results=None,
                 track_progress=False,
                 time_limit=None,
                 job_observer=None):
        """
        :type ip: string
        :type port: int
        :type spawn_workers: int
        :type write_partial_results: int
        :type track_progress: bool
        :type time_limit: int
        :type job_observer: JobObserver
        """
        self.worker_count = spawn_workers
        self.ip = ip
        self.port = port
        self.active = False
        self.write_partial_results = write_partial_results
        self.track_progress = track_progress
        self.execution_count = 0
        self.timeout = TimeoutManager(time_limit) if time_limit else None
        self.job_observer = job_observer

        if not DistributedContext.io_loop:
            DistributedContext.io_loop = IOLoop()
            DistributedContext.io_thread = Thread(
                target=DistributedContext.io_loop.start)
            DistributedContext.io_thread.daemon = True
            DistributedContext.io_thread.start()

        if spawn_workers > 0:
            self.scheduler = self._create_scheduler()
            self.workers = [self._create_worker() for i in xrange(spawn_workers)]
            time.sleep(0.5)  # wait for workers to spawn

        self.executor = Executor((ip, port))

    def run(self, domain,
            worker_reduce_fn, worker_reduce_init,
            global_reduce_fn, global_reduce_init):
        size = domain.steps
        assert size is not None  # TODO: Iterators without size

        workers = 0
        for name, value in self.executor.ncores().items():
            workers += value

        if workers == 0:
            raise Exception("There are no workers")

        batch_count = workers * 4
        batch_size = max(int(round(size / float(batch_count))), 1)
        batches = self._create_batches(batch_size, size, domain,
                                       worker_reduce_fn, worker_reduce_init)

        logging.info("Qit: starting {} batches with size {}".format(
            batch_count, batch_size))

        if self.job_observer:
            self.job_observer.on_computation_start(batch_count, batch_size)

        futures = self.executor.map(process_batch, batches)

        if self.track_progress:
            distributed.diagnostics.progress(futures)

        if self.write_partial_results is not None:
            result_saver = ResultSaver(self.execution_count,
                                       self.write_partial_results)
        else:
            result_saver = None

        timeouted = False
        results = []

        for future in as_completed(futures):
            job = future.result()
            if result_saver:
                result_saver.handle_result(job.result)
            if self.job_observer:
                self.job_observer.on_job_completed(job)

            results.append(job.result)

            if self.timeout and self.timeout.is_finished():
                logging.info("Qit: timeouted after {} seconds".format(
                    self.timeout.timeout))
                timeouted = True
                break

        # order results
        if not timeouted:
            results = [j.result for j in self.executor.gather(futures)]

        self.execution_count += 1

        if worker_reduce_fn is None:
            results = list(itertools.chain.from_iterable(results))

        logging.info("Qit: finished run with size {} (taking {})".format(
            len(results), domain.size))

        results = results[:domain.size]  # trim results to required size

        if global_reduce_fn is None:
            return results
        else:
            if global_reduce_init is None:
                return reduce(global_reduce_fn, results)
            else:
                return reduce(global_reduce_fn, results, global_reduce_init)

    def _create_scheduler(self):
        scheduler = Scheduler(ip=self.ip)
        scheduler.start(self.port)
        return scheduler

    def _create_worker(self):
        worker = Worker(scheduler_ip=self.ip, scheduler_port=self.port,
                        ncores=1)
        worker.start(0)
        return worker

    def _create_batches(self, batch_size, size, domain,
                        worker_reduce_fn, worker_reduce_init):
        batches = []
        i = 0
        while True:
            new = i + batch_size
            if i + batch_size <= size:
                batches.append((domain, i, batch_size,
                                worker_reduce_fn, worker_reduce_init))
                i = new
                if new == size:
                    break
            else:
                batches.append((domain, i, size - i,
                                worker_reduce_fn, worker_reduce_init))
                break
        return batches
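# A small, self-contained sketch of how run() above sizes its batches, assuming
# a hypothetical domain of 100 steps and 3 single-core workers; the numbers only
# restate the arithmetic already in the method.
size = 100      # what run() reads from domain.steps
workers = 3     # what run() sums from executor.ncores()

batch_count = workers * 4                                   # 12 batches requested
batch_size = max(int(round(size / float(batch_count))), 1)  # -> 8 items per batch

print(batch_count, batch_size)
# _create_batches then emits twelve 8-item batches plus a shorter final 4-item
# batch, so all 100 steps are covered even though batch_count was only 12.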
SCHEDULER_PORT = 5678
SCHEDULER_HTTP_PORT = 9786
SCHEDULER_BOKEH_PORT = 12345
SCHEDULER_IP = '127.0.0.1'
HOME_PAGE = 'http://localhost:5050'


def test(one, two):
    return 4


executor = Executor('{}:{}'.format(SCHEDULER_IP, SCHEDULER_PORT))

taskclient.add_user(HOME_PAGE)
for _ in range(5):
    taskclient.add_job(HOME_PAGE)

result_list = []
num_iters = 50

# result = executor.map(taskclient.worker, [HOME_PAGE, HOME_PAGE], [0, 1])
result = executor.map(taskclient.worker,
                      itertools.repeat(HOME_PAGE, num_iters),
                      range(num_iters))
result_list = result

distributed.diagnostics.progress(result_list)
print()
print(executor.who_has(result_list))

sim_results = executor.gather(result_list)
print('---------')