Example #1
    def testOperandsWithoutPrepareInputs(self):
        self.start_processes(etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
        resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        t1 = mt.random.rand(10)
        t1.op._expect_worker = worker_endpoints[0]
        t2 = mt.random.rand(10)
        t2.op._expect_worker = worker_endpoints[1]

        t = NoPrepareOperand().new_tileable([t1, t2])
        t.op._prepare_inputs = [False, False]

        graph = t.build_graph()
        targets = [t.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
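Every example in this listing calls a `wait_for_termination` helper from the test base class, but the helper itself is never shown (Example #13 uses an older two-argument variant). A minimal sketch, assuming it simply polls the graph state until a terminal value is reached, following the same pattern as the inline loop in Example #8 below; the timeout value and method placement are assumptions:

    def wait_for_termination(self, actor_client, session_ref, graph_key, timeout=120):
        # Poll the graph state until it becomes terminal, failing fast when a
        # scheduler or worker process has died or the timeout expires.
        # GraphState is the same enum used in the assertions in these examples.
        import time

        check_time = time.time()
        while True:
            actor_client.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > timeout:
                raise SystemError('Check graph status timeout')
            state = session_ref.graph_state(graph_key)
            if state in (GraphState.SUCCEEDED, GraphState.FAILED):
                return state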
Example #2
    def testMainWithEtcd(self):
        self.start_processes(etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())
Example #3
    def testShuffleFailoverBeforeSuccStart(self):
        pred_finish_file = self.add_state_file('SHUFFLE_ALL_PRED_FINISHED_FILE')
        succ_start_file = self.add_state_file('SHUFFLE_START_SUCC_FILE')

        self.start_processes(modules=['mars.scheduler.tests.integrated.op_delayer'], log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)
        actor_client.sleep(1)

        while not os.path.exists(pred_finish_file):
            actor_client.sleep(0.01)

        self.kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        open(succ_start_file, 'w').close()

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, b.key)
        assert_allclose(loads(result), np.ones((27, 31)))
Example #4
    def testWorkerFailOver(self):
        def kill_process_tree(proc):
            import psutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()

        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                             log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
Example #5
    def testRemoteWithoutEtcd(self):
        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        rs = np.random.RandomState(0)
        raw1 = rs.rand(10, 10)
        raw2 = rs.rand(10, 10)

        def f_none(_x):
            return None

        r_none = spawn(f_none, raw1)

        graph = r_none.build_graph()
        targets = [r_none.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, r_none.key)
        self.assertIsNone(loads(result))

        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        r1 = spawn(f1, raw1)
        r2 = spawn(f1, raw2)
        r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

        graph = r3.build_graph()
        targets = [r3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, r3.key)
        expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
        assert_allclose(loads(result), expected)
Example #6
    def testWorkerFailOver(self):
        def kill_process_tree(p):
            import psutil
            proc = psutil.Process(p.pid)
            for p in proc.children(recursive=True):
                p.kill()
            proc.kill()

        import tempfile
        delay_file = os.environ['DELAY_STATE_FILE'] = os.path.join(
            tempfile.gettempdir(),
            'test-main-delay-%d-%d' % (os.getpid(), id(self)))
        open(delay_file, 'w').close()

        terminate_file = os.environ['TERMINATE_STATE_FILE'] = os.path.join(
            tempfile.gettempdir(),
            'test-main-terminate-%d-%d' % (os.getpid(), id(self)))

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                             log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)
        os.unlink(terminate_file)
        # actor_client.sleep(1.2)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
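Examples #3, #4 and #14 call `self.add_state_file(...)`, which is not part of this listing. Example #6 above inlines the equivalent logic, so a minimal sketch of the helper, with the file-name format assumed from that example, could be:

    def add_state_file(self, environ_key):
        # Build a unique temp-file path, publish it through the given
        # environment variable so the loaded test plugin can pick it up, and
        # return it to the caller; mirrors the inline pattern in Example #6.
        import os
        import tempfile

        path = os.environ[environ_key] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ_key.lower(), os.getpid(), id(self)))
        return path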
Example #7
    def testIterativeTilingWithoutEtcd(self):
        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        rs = np.random.RandomState(0)
        raw = rs.rand(100)
        a = mt.tensor(raw, chunk_size=10)
        a.sort()
        c = a[:5]

        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = np.sort(raw)[:5]
        assert_allclose(loads(result), expected)

        with self.assertRaises(TypeError):
            session_ref.fetch_result(graph_key, a.key, check=False)

        raw1 = rs.rand(20)
        raw2 = rs.rand(20)
        a = mt.tensor(raw1, chunk_size=10)
        a.sort()
        b = mt.tensor(raw2, chunk_size=15) + 1
        c = mt.concatenate([a[:10], b])
        c.sort()
        d = c[:5]

        graph = d.build_graph()
        targets = [d.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, d.key)
        expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        assert_allclose(loads(result), expected)
Example #8
    def testMain(self):
        session_id = uuid.uuid1()
        scheduler_address = '127.0.0.1:' + self.scheduler_port
        actor_client = new_client()
        session_ref = actor_client.create_actor(
            SessionActor,
            uid=SessionActor.gen_name(session_id),
            address=scheduler_address,
            session_id=session_id)
        a = ones((100, 100), chunks=30) * 2 * 1 + 1
        b = ones((100, 100), chunks=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        check_time = time.time()
        while True:
            time.sleep(1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) == GraphState.SUCCEEDED:
                result = session_ref.fetch_result(graph_key, c.key)
                break

        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1

        assert_array_equal(loads(result), expected.sum())

        a = ones((100, 50), chunks=30) * 2 + 1
        b = ones((50, 200), chunks=30) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        check_time = time.time()
        while True:
            time.sleep(1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) == GraphState.SUCCEEDED:
                result = session_ref.fetch_result(graph_key, c.key)
                break

        assert_array_equal(loads(result), np.ones((100, 200)) * 450)
Example #9
    def testMainWithoutEtcd(self):
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)
Example #10
    def setUp(self):
        scheduler_port = str(get_next_port())
        proc_worker1 = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                         '-a', '127.0.0.1',
                                         '--cpu-procs', '2',
                                         '--level', 'debug',
                                         '--cache-mem', '16m',
                                         '--schedulers', '127.0.0.1:' + scheduler_port,
                                         '--ignore-avail-mem'])
        proc_worker2 = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                         '-a', '127.0.0.1',
                                         '--cpu-procs', '2',
                                         '--level', 'debug',
                                         '--cache-mem', '16m',
                                         '--schedulers', '127.0.0.1:' + scheduler_port,
                                         '--ignore-avail-mem'])
        proc_scheduler = subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                                           '-H', '127.0.0.1',
                                           '--level', 'debug',
                                           '-p', scheduler_port,
                                           '--format', '%(asctime)-15s %(message)s'])

        self.scheduler_port = scheduler_port
        self.proc_workers = [proc_worker1, proc_worker2]
        self.proc_scheduler = proc_scheduler

        time.sleep(2)
        actor_client = new_client()
        check_time = time.time()
        while True:
            try:
                resource_ref = actor_client.actor_ref(ResourceActor.default_name(), address='127.0.0.1:' + scheduler_port)
                if actor_client.has_actor(resource_ref):
                    break
                else:
                    raise SystemError('Check meta_timestamp timeout')
            except:
                if time.time() - check_time > 10:
                    raise
                time.sleep(1)

        check_time = time.time()
        while True:
            if resource_ref.get_worker_count() < 2:
                time.sleep(0.5)
                self.check_process_statuses()
                if time.time() - check_time > 20:
                    raise SystemError('Check meta_timestamp timeout')
            else:
                break

        self.exceptions = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)
Example #11
    def setUp(self):
        self.worker_plasma_sock = '/tmp/plasma_%d_%d.sock' % (os.getpid(),
                                                              id(Test))
        scheduler_port = str(get_next_port())
        proc_worker = subprocess.Popen([
            sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
            '--cpu-procs', '2', '--level', 'debug', '--cache-mem', '16m',
            '--schedulers', '127.0.0.1:' + scheduler_port, '--plasma-socket',
            self.worker_plasma_sock, '--ignore-avail-mem'
        ])
        proc_scheduler = subprocess.Popen([
            sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
            '--level', 'debug', '-p', scheduler_port, '--format',
            '%(asctime)-15s %(message)s'
        ])

        self.scheduler_port = scheduler_port
        self.proc_worker = proc_worker
        self.proc_scheduler = proc_scheduler

        time.sleep(2)
        actor_client = new_client()
        check_time = time.time()
        while True:
            try:
                kv_ref = actor_client.actor_ref(KVStoreActor.default_name(),
                                                address='127.0.0.1:' +
                                                scheduler_port)
                if actor_client.has_actor(kv_ref):
                    break
                else:
                    raise SystemError('Check meta_timestamp timeout')
            except:
                if time.time() - check_time > 10:
                    raise
                time.sleep(1)

        check_time = time.time()
        while True:
            content = kv_ref.read('/workers/meta_timestamp', silent=True)
            if not content:
                time.sleep(0.5)
                self.check_process_statuses()
                if time.time() - check_time > 20:
                    raise SystemError('Check meta_timestamp timeout')
            else:
                break

        self.exceptions = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
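The setUp methods in Examples #10 and #11 save the original gevent.hub.Hub.NOT_ERROR in self.exceptions and leave scheduler and worker subprocesses running, but the matching tearDown is not included in this listing. A minimal sketch of the cleanup for Example #11; the termination details are assumptions:

    def tearDown(self):
        # Terminate the worker and scheduler started in setUp, remove the
        # plasma socket file and restore the gevent hub's NOT_ERROR tuple.
        import os
        import gevent.hub

        for proc in (self.proc_worker, self.proc_scheduler):
            if proc.poll() is None:
                proc.terminate()
                proc.wait()
        if os.path.exists(self.worker_plasma_sock):
            os.unlink(self.worker_plasma_sock)
        gevent.hub.Hub.NOT_ERROR = self.exceptions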
Example #12
    def testDistributedContext(self):
        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        actor_client = new_client()
        rs = np.random.RandomState(0)

        context = DistributedContext(
            scheduler_address=self.scheduler_endpoints[0],
            session_id=session_id)

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))
        raw1 = rs.rand(10, 10)
        a = mt.tensor(raw1, chunk_size=4)

        graph = a.build_graph()
        targets = [a.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets,
                                          names=['test'])

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        tileable_infos = context.get_named_tileable_infos('test')
        self.assertEqual(a.key, tileable_infos.tileable_key)
        self.assertEqual(a.shape, tileable_infos.tileable_shape)

        nsplits = context.get_tileable_metas([a.key],
                                             filter_fields=['nsplits'])[0][0]
        self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

        r = context.get_tileable_data(a.key)
        np.testing.assert_array_equal(raw1, r)

        indexes = [slice(3, 9), slice(0, 7)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)
Example #13
    def testMain(self):
        session_id = uuid.uuid1()
        scheduler_address = '127.0.0.1:' + self.scheduler_port
        actor_client = new_client()
        session_ref = actor_client.create_actor(
            SessionActor,
            uid=SessionActor.gen_name(session_id),
            address=scheduler_address,
            session_id=session_id)
        a = ones((100, 100), chunks=30) * 2 * 1 + 1
        b = ones((100, 100), chunks=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())

        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        # todo this behavior may change when eager mode is introduced
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        a = ones((100, 50), chunks=30) * 2 + 1
        b = ones((50, 200), chunks=30) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_array_equal(loads(result), np.ones((100, 200)) * 450)
Example #14
    def testCommonOperandFailover(self):
        delay_file = self.add_state_file('OP_DELAY_STATE_FILE')
        open(delay_file, 'w').close()

        terminate_file = self.add_state_file('OP_TERMINATE_STATE_FILE')

        self.start_processes(
            modules=['mars.scheduler.tests.integrated.op_delayer'],
            log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        while not os.path.exists(terminate_file):
            actor_client.sleep(0.01)

        self.kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
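Examples #3 and #14 call `self.kill_process_tree(...)`, which appears in this listing only as local functions inside Examples #4 and #6. A method form adapted from the local definition in Example #4 (only the receiver changes) could be:

    def kill_process_tree(self, proc):
        # Kill a worker process together with all of its children, removing
        # any plasma store socket directory left behind (see Example #4).
        import os
        import shutil

        import psutil

        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        for p in proc.children(recursive=True):
            if 'plasma' in p.name():
                socks = [conn.laddr for conn in p.connections('unix')
                         if 'plasma' in conn.laddr]
                if socks:
                    plasma_sock_dir = os.path.dirname(socks[0])
            p.kill()
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)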
Example #15
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True):
        # Temporarily register Exception in gevent's NOT_ERROR tuple so hub
        # errors raised while the cluster comes up are not reported; the
        # original tuple is restored at the end of this method.
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args += ['-Dscheduler.dump_graph_data=true']

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '--level', 'debug' if log_scheduler else 'warning', '-p', p,
                '--format', '%(asctime)-15s %(message)s',
                '-Dscheduler.retry_delay=5'
            ] + append_args) for p in scheduler_ports
        ]
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--level',
                'debug' if log_worker else 'warning', '--cache-mem', '16m',
                '--ignore-avail-mem', '-Dworker.prepare_data_timeout=30'
            ] + append_args) for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(),
            address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError(
                        'Scheduler count does not meet requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_name(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError(
                        'Worker count does not meet requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors
Example #16
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '-p', p, '--log-level',
                'debug' if log_scheduler else 'warning', '--log-format',
                'SCH%d %%(asctime)-15s %%(message)s' % idx,
                '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                '-Dscheduler.status_timeout=10'
            ] + append_args + append_args_scheduler,
                             env=proc_env)
            for idx, p in enumerate(scheduler_ports)
        ]
        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--log-level',
                'debug' if log_worker else 'warning', '--log-format',
                'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                '16m', '--ignore-avail-mem', '--cuda-device',
                str(cuda_devices[idx % cuda_count]) if cuda_count else '0',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args + append_args_worker,
                             env=proc_env) for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(),
            address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(
                        'Failed to get scheduler numbers, %s' % e)
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Scheduler count does not meet requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Worker count does not meet requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors
Example #17
    def testMainTensorWithoutEtcd(self):
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)

        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        r = b.sum(axis=1)
        graph = r.build_graph()
        targets = [r.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, r.key)
        assert_allclose(loads(result), np.ones((27, 31)).sum(axis=1))

        raw = np.random.RandomState(0).rand(10, 10)
        a = mt.tensor(raw, chunk_size=(5, 4))
        b = a[a.argmin(axis=1), mt.tensor(np.arange(10))]
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, b.key)

        np.testing.assert_array_equal(loads(result), raw[raw.argmin(axis=1), np.arange(10)])
Example #18
    def testMainDataFrameWithoutEtcd(self):
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas_df(data2, chunk_size=6)

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=(10, 5))
        data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=(10, 6))

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        data1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=6)

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = from_pandas_series(s1)

        graph = series1.build_graph()
        targets = [series1.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, series1.key)
        pd.testing.assert_series_equal(s1, loads(result))
Example #19
    def setUp(self):
        self.worker_plasma_sock = '/tmp/plasma_%d_%d.sock' % (os.getpid(),
                                                              id(Test))
        scheduler_port = str(get_next_port())
        proc_worker = subprocess.Popen([
            sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1', '--level',
            'debug', '--cpu-procs', '2', '--cache-mem', '10m', '--schedulers',
            '127.0.0.1:' + scheduler_port, '--plasma-socket',
            self.worker_plasma_sock, '--ignore-avail-mem'
        ])
        proc_scheduler = subprocess.Popen([
            sys.executable, '-m', 'mars.scheduler', '--nproc', '1', '--level',
            'debug', '-H', '127.0.0.1', '-p', scheduler_port, '--format',
            '%(asctime)-15s %(message)s'
        ])

        self.scheduler_port = scheduler_port
        self.proc_worker = proc_worker
        self.proc_scheduler = proc_scheduler

        actor_client = new_client()
        time.sleep(2)
        check_time = time.time()
        while True:
            try:
                kv_ref = actor_client.actor_ref(KVStoreActor.default_name(),
                                                address='127.0.0.1:' +
                                                scheduler_port)
                if actor_client.has_actor(kv_ref):
                    break
                else:
                    raise SystemError('Check meta_timestamp timeout')
            except:
                if time.time() - check_time > 10:
                    raise
                time.sleep(1)

        check_time = time.time()
        while True:
            content = kv_ref.read('/workers/meta_timestamp', silent=True)
            if self.proc_scheduler.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' %
                                  self.proc_scheduler.poll())
            if self.proc_worker.poll() is not None:
                raise SystemError('Worker not started. exit code %s' %
                                  self.proc_worker.poll())
            if time.time() - check_time > 20:
                raise SystemError('Check meta_timestamp timeout')

            if not content:
                time.sleep(0.5)
            else:
                break

        web_port = str(get_next_port())
        self.web_port = web_port
        proc_web = subprocess.Popen([
            sys.executable, '-m', 'mars.web', '-H', '127.0.0.1', '--level',
            'debug', '--ui-port', web_port, '-s',
            '127.0.0.1:' + self.scheduler_port
        ])
        self.proc_web = proc_web

        service_ep = 'http://127.0.0.1:' + self.web_port
        check_time = time.time()
        while True:
            if time.time() - check_time > 30:
                raise SystemError('Wait for service start timeout')
            try:
                resp = requests.get(service_ep + '/api', timeout=1)
            except (requests.ConnectionError, requests.Timeout):
                time.sleep(1)
                continue
            if resp.status_code >= 400:
                time.sleep(1)
                continue
            break

        self.exceptions = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )