Code example #1
def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

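    # Turn the local pandas DataFrames into futures on the cluster, then build dask collections from them.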
    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == c.dask

    yield e._shutdown()
Code example #2
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' + b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

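        # Read the file as blocks of bytes split on the line delimiter; each block comes back as a future.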
        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Code example #3
def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

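    # Put some computed results and some scattered data on the workers so both show up in the memory report.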
    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

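    # Ask the scheduler's HTTP service how much memory each worker is using.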
    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json' %
                                  ss.port)
    out = json.loads(response.body.decode())

    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch('http://localhost:%s/memory-load-by-key.json'
                                  % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())

    assert sum(v for d in out.values() for v in d.values()) == \
            sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()
Code example #4
def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json' %
                                  ss.port)
    out = json.loads(response.body.decode())

    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(
        map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch(
        'http://localhost:%s/memory-load-by-key.json' % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())

    assert sum(v for d in out.values() for v in d.values()) == \
            sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()
Code example #5
File: test_hdfs.py  Project: kevineriklee/distributed
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Code example #6
def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)
Code example #7
    def test_framework_runs(self):
        with MesosCluster() as cluster:
            time.sleep(2)
            driver = DistributedDriver().create_driver(DistributedScheduler)
            driver.start()
            time.sleep(5)

            expect(cluster).to(have_activated_slaves(1))
            expect(cluster).to(have_framework_name('distributed-framework'))

            # distributed test - this probably doesn't belong here
            executor = Executor('127.0.0.1:8787')
            A = executor.map(lambda x: x**2, range(10))
            B = executor.map(lambda x: -x, A)
            total = executor.submit(sum, B)
            expect(total.result()).to(equal(-285))
            driver.stop()
Code example #8
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False)
        IOLoop.current().spawn_callback(e._go)

        remote_dfs = e.map(lambda x: x, dfs)
        ddf = yield _futures_to_dask_dataframe(e, remote_dfs, divisions=True)

        assert isinstance(ddf, dd.DataFrame)
        assert ddf.divisions == (0, 30, 60, 80)
        expr = ddf.x.sum()
        result = yield e._get(expr.dask, expr._keys())
        assert result == [sum([df.x.sum() for df in dfs])]

        yield e._shutdown()
Code example #9
def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
            executor=e)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()
Code example #10
def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs,
                                           divisions=True,
                                           executor=e)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()
Code example #11
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False, loop=loop)
        yield e._start()

        arrays = e.map(np.ones, [(5, 5)] * 6)
        y = yield _stack(arrays, axis=0)
        assert y.shape == (6, 5, 5)
        assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

        y_results = yield e._get(y.dask, y._keys())
        yy = da.Array._finalize(y, y_results)

        assert isinstance(yy, np.ndarray)
        assert yy.shape == y.shape
        assert (yy == 1).all()

        yield e._shutdown()
Code example #12
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

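    # Create six 5x5 arrays of ones on the workers and stack them along a new leading axis.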
    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5, ), (5, ))

    y_result, = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
Code example #13
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
Code example #14
def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [
        pd.DataFrame({
            'x': np.random.random(100),
            'y': np.random.random(100)
        },
                     index=list(range(i, i + 100)))
        for i in range(0, 100 * 10, 100)
    ]

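    # Build the same dask DataFrame both remotely (from futures) and locally, then compare the two.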
    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df
                        for i, df in enumerate(dfs)}, name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

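    # A handful of typical operations should give the same answer locally and through the executor.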
    exprs = [
        lambda df: df.x.mean(), lambda df: df.y.std(),
        lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
        lambda df: df.index, lambda df: df.x, lambda df: df.x.cumsum(),
        lambda df: df.loc[50:75]
    ]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5), remote._result())
        assert_equal(local, remote)

    yield e._shutdown()
Code example #15
def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100*10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result,
                          ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5), remote._result())
        assert_equal(local, remote)

    yield e._shutdown()
Code example #16
import argparse

from distributed import Executor


def inject(url):
    import config
    from pymongo import MongoClient
    import os
    #connstring = config.db['host'] + ":" + config.db['port']
    run = "sqlmap -u " + url + ' --batch '
    command = os.popen(run).read()
    data = {'url': url, 'output': command}
    client = MongoClient('192.168.1.14:27017')
    dbn = config.db['dbname']
    db = client[dbn]
    results = db.results
    results.insert(data)
    return command


executor = Executor()
parser = argparse.ArgumentParser()
parser.add_argument("-f")
args = parser.parse_args()
links = []
s = open(args.f, 'r')
for x in s:
    links.append(x.rstrip().replace("\n", ""))

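# Scan each URL with sqlmap in parallel and print the gathered output.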
job = executor.map(inject, links)
print(executor.gather(job))
Code example #17
        if (not args.append) and (os.path.exists(out_path)):
            shutil.rmtree(out_path)

        if not os.path.exists(out_path):
            os.makedirs(os.path.join(out_path, 'feat/df'))
            os.makedirs(os.path.join(out_path, 'feat/desc'))

        odDicts = [{
            'flight_hdf': args.uvan,
            'img_num': ii,
            'kp_det_func': kp_type_dict[kp_type],
            'kp_desc_func': kp_type_dict[kp_type],
            'p_meta': tf_meta,
            'o_path': out_path
        } for ii in range(num_imgs)]

        r = executor.map(extract_kp_from_frame, odDicts, pure=False)

        kp_list = executor.gather(r)
        kp_meta = pd.DataFrame(kp_list,
                               columns=[
                                   'num_feat', 'center_lon', 'center_lat',
                                   'df_path', 'desc_path', 'flight', 'img_num'
                               ])

        kp_meta.to_hdf(os.path.join(out_path, 'feat_meta.hdf'),
                       key='feat_meta')

    print('what')
Code example #18
File: distributedcontext.py  Project: Kobzol/pyqit
class DistributedContext(object):
    io_loop = None
    io_thread = None

    def __init__(self,
                 ip="127.0.0.1",
                 port=8787,
                 spawn_workers=0,
                 write_partial_results=None,
                 track_progress=False,
                 time_limit=None,
                 job_observer=None):
        """
        :type ip: string
        :type port: int
        :type spawn_workers: int
        :type write_partial_results: int
        :type track_progress: bool
        :type time_limit: int
        :type job_observer: JobObserver
        """

        self.worker_count = spawn_workers
        self.ip = ip
        self.port = port
        self.active = False
        self.write_partial_results = write_partial_results
        self.track_progress = track_progress
        self.execution_count = 0
        self.timeout = TimeoutManager(time_limit) if time_limit else None
        self.job_observer = job_observer

        if not DistributedContext.io_loop:
            DistributedContext.io_loop = IOLoop()
            DistributedContext.io_thread = Thread(
                target=DistributedContext.io_loop.start)
            DistributedContext.io_thread.daemon = True
            DistributedContext.io_thread.start()

        if spawn_workers > 0:
            self.scheduler = self._create_scheduler()
            self.workers = [self._create_worker()
                            for i in xrange(spawn_workers)]
            time.sleep(0.5)  # wait for workers to spawn

        self.executor = Executor((ip, port))

    def run(self, domain,
            worker_reduce_fn, worker_reduce_init,
            global_reduce_fn, global_reduce_init):
        size = domain.steps
        assert size is not None  # TODO: Iterators without size

        workers = 0
        for name, value in self.executor.ncores().items():
            workers += value

        if workers == 0:
            raise Exception("There are no workers")

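        # Aim for roughly four batches per core so work spreads evenly across the workers.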
        batch_count = workers * 4
        batch_size = max(int(round(size / float(batch_count))), 1)
        batches = self._create_batches(batch_size, size, domain,
                                       worker_reduce_fn, worker_reduce_init)

        logging.info("Qit: starting {} batches with size {}".format(
            batch_count, batch_size))

        if self.job_observer:
            self.job_observer.on_computation_start(batch_count, batch_size)

        futures = self.executor.map(process_batch, batches)

        if self.track_progress:
            distributed.diagnostics.progress(futures)

        if self.write_partial_results is not None:
            result_saver = ResultSaver(self.execution_count,
                                       self.write_partial_results)
        else:
            result_saver = None

        timeouted = False
        results = []

        for future in as_completed(futures):
            job = future.result()
            if result_saver:
                result_saver.handle_result(job.result)
            if self.job_observer:
                self.job_observer.on_job_completed(job)

            results.append(job.result)

            if self.timeout and self.timeout.is_finished():
                logging.info("Qit: timeouted after {} seconds".format(
                    self.timeout.timeout))
                timeouted = True
                break

        # order results
        if not timeouted:
            results = [j.result for j in self.executor.gather(futures)]

        self.execution_count += 1

        if worker_reduce_fn is None:
            results = list(itertools.chain.from_iterable(results))

        logging.info("Qit: finished run with size {} (taking {})".format(
            len(results), domain.size))

        results = results[:domain.size]  # trim results to required size

        if global_reduce_fn is None:
            return results
        else:
            if global_reduce_init is None:
                return reduce(global_reduce_fn, results)
            else:
                return reduce(global_reduce_fn, results, global_reduce_init)

    def _create_scheduler(self):
        scheduler = Scheduler(ip=self.ip)
        scheduler.start(self.port)
        return scheduler

    def _create_worker(self):
        worker = Worker(scheduler_ip=self.ip,
                        scheduler_port=self.port,
                        ncores=1)
        worker.start(0)
        return worker

    def _create_batches(self, batch_size, size,
                        domain,
                        worker_reduce_fn,
                        worker_reduce_init):
        batches = []
        i = 0

        while True:
            new = i + batch_size
            if i + batch_size <= size:
                batches.append((domain, i, batch_size,
                                worker_reduce_fn, worker_reduce_init))
                i = new
                if new == size:
                    break
            else:
                batches.append((domain, i, size - i,
                                worker_reduce_fn, worker_reduce_init))
                break

        return batches
Code example #19
import itertools

import distributed.diagnostics
from distributed import Executor

import taskclient  # project-local module providing add_user, add_job and worker

SCHEDULER_PORT = 5678
SCHEDULER_HTTP_PORT = 9786
SCHEDULER_BOKEH_PORT = 12345
SCHEDULER_IP = '127.0.0.1'

HOME_PAGE = 'http://localhost:5050'


def test(one, two):
    return 4


executor = Executor('{}:{}'.format(SCHEDULER_IP, SCHEDULER_PORT))

taskclient.add_user(HOME_PAGE)
for _ in range(5):
    taskclient.add_job(HOME_PAGE)

result_list = []
num_iters = 50
#result = executor.map(taskclient.worker, [HOME_PAGE, HOME_PAGE], [0, 1])
result = executor.map(taskclient.worker,
                      itertools.repeat(HOME_PAGE, num_iters), range(num_iters))
result_list = result

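# Show progress while the mapped tasks run, then gather their results from the workers.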
distributed.diagnostics.progress(result_list)
print()
print(executor.who_has(result_list))
sim_results = executor.gather(result_list)
print('---------')