Example #1
0
def test_multi_output(setup):
    """Word-count across sentences using mappers that emit two outputs.

    Each mapper shards its per-sentence word counts into two partitions
    by hash parity; each partition is reduced independently and the two
    reduced dicts are merged into the final result.
    """
    sentences = ['word1 word2', 'word2 word3', 'word3 word2 word1']

    def mapper(s):
        # Count words in one sentence, then split the counts into two
        # shards keyed by the parity of each word's murmur hash.
        counts = defaultdict(lambda: 0)
        for token in s.split():
            counts[token] += 1

        shards = [defaultdict(lambda: 0), defaultdict(lambda: 0)]
        for token, cnt in counts.items():
            shards[mmh3_hash(token) % 2][token] += cnt
        return shards

    def reducer(count_dicts):
        # Merge per-sentence count dicts for one shard into a plain dict.
        merged = defaultdict(lambda: 0)
        for counts in count_dicts:
            for token, cnt in counts.items():
                merged[token] += cnt
        return dict(merged)

    outs = ([], [])
    for sentence in sentences:
        first, second = spawn(mapper, sentence, n_output=2)
        outs[0].append(first)
        outs[1].append(second)

    # One reducer per shard.
    reduced = [spawn(reducer, partition) for partition in outs]

    result = dict()
    for wc in ExecutableTuple(reduced).to_object():
        result.update(wc)

    assert result == {'word1': 2, 'word2': 3, 'word3': 2}
Example #2
0
    def testFetchLogWithoutEtcd(self):
        """Fetch remote-function logs from a cluster started without etcd.

        Covers: a single spawned function, multiple functions fetched with
        positive / negative / string-encoded offsets and sizes, a nested
        spawn, and custom logs from ``DataFrame.map_chunk``.
        """
        # test fetch log
        with tempfile.TemporaryDirectory() as temp_dir:
            self.start_processes(
                etcd=False,
                modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            # The custom-log meta actor records where each chunk's log file
            # lives; check the file content directly on disk first.
            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(
                    CustomLogMetaActor.default_uid()))

            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            log_path = paths[0][1]
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            # Fetch the same log through the distributed context API.
            context = DistributedContext(
                scheduler_address=self.session_manager_ref.address,
                session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            # And through the user-facing fetch_log helper.
            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)
                sys.stdout.flush()

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            # Assert non-emptiness before the all() checks: all() over an
            # empty sequence is vacuously True. This mirrors the
            # positive-offset block above.
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets which represented in string
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            # fetch_log before execution must raise
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            # The source DataFrame itself produced no log output.
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)
Example #3
0
    def testFetchLogWithoutEtcd(self):
        """Fetch remote-function logs from a cluster started without etcd.

        Covers: a single spawned function, multiple functions fetched with
        positive / negative / string-encoded offsets and sizes, nested
        spawns (including a tuple of hosts), custom logs from
        ``DataFrame.map_chunk``, and printing from a worker thread.
        """
        # test fetch log
        with tempfile.TemporaryDirectory() as temp_dir:
            self.start_processes(
                etcd=False,
                modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            # The custom-log meta actor records where each chunk's log file
            # lives; check the file content directly on disk first.
            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(
                    CustomLogMetaActor.default_uid())
            )

            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            log_path = paths[0][1]
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            # Fetch the same log through the distributed context API.
            context = DistributedContext(
                scheduler_address=self.session_manager_ref.address,
                session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            # And through the user-facing fetch_log helper.
            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)
                sys.stdout.flush()

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            # Assert non-emptiness before the all() checks: all() over an
            # empty sequence is vacuously True. This mirrors the
            # positive-offset block above.
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets which represented in string
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            # fetch_log before execution must raise
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            # The source DataFrame itself produced no log output.
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

            # Each host spawns a nested function whose log must surface
            # through the host's own fetch_log output.
            def test_host(rndf):
                rm = spawn(nested, rndf)
                rm.execute()
                print(rm.fetch_log())

            def nested(_rndf):
                print('log_content')

            ds = [spawn(test_host, n, retry_when_fail=False)
                  for n in np.random.rand(4)]
            xtp = ExecutableTuple(ds)
            xtp.execute(session=sess)
            for log in xtp.fetch_log(session=sess):
                self.assertEqual(str(log).strip(), 'log_content')

            # Printing from a worker thread must also be captured.
            def test_threaded():
                import threading

                exc_info = None

                def print_fun():
                    nonlocal exc_info
                    try:
                        print('inner')
                    except:  # noqa: E722  # nosec  # pylint: disable=bare-except
                        exc_info = sys.exc_info()

                print_thread = threading.Thread(target=print_fun)
                print_thread.start()
                print_thread.join()

                # Re-raise any failure that happened inside the thread so
                # the test surfaces it instead of silently passing.
                if exc_info is not None:
                    raise exc_info[1].with_traceback(exc_info[-1])

                print('after')

            rm = spawn(test_threaded)
            rm.execute(session=sess)
            logs = str(rm.fetch_log(session=sess)).strip()
            self.assertEqual(logs, 'inner\nafter')