Example No. 1
    def test_safe_shell_exec_interrupts_on_parent_shutdown(self):
        sleep = 20
        parent_script = os.path.join(os.path.dirname(__file__), 'data/run_safe_shell_exec.py')
        child_script = os.path.join(os.path.dirname(__file__), 'data/sleep.py')

        def get_pid(logfile):
            # Wait until the script has written its PID to the logfile
            wait(lambda: os.path.exists(logfile), timeout=5)
            with open(logfile, 'r') as f:
                return int(f.read())

        with temppath() as parent_logfile, temppath() as child_logfile:
            # It's important that this executes in an entirely different interpreter with as little shared
            # state as possible, to avoid issues with the semaphore tracker.
            cmd = ' '.join([sys.executable, parent_script, parent_logfile, child_script, str(sleep), child_logfile])
            p = subprocess.Popen(cmd, shell=True)

            parent = psutil.Process(get_pid(parent_logfile))
            child = psutil.Process(get_pid(child_logfile))

            self.assertTrue(parent.is_running())
            self.assertTrue(child.is_running())

            # Hard kill the parent process
            parent.kill()
            parent.wait(timeout=safe_shell_exec.GRACEFUL_TERMINATION_TIME_S)
            p.wait()

            # Child process will exit when pipe breaks
            child.wait(timeout=2 * safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 1)

            self.assertFalse(parent.is_running())
            self.assertFalse(child.is_running())
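Every example in this listing uses a temppath() context manager that yields a fresh temporary file path and cleans it up on exit. It comes from Horovod's shared test utilities; a minimal helper with equivalent behaviour (an assumption for illustration, not the actual implementation) could look like this:

# Sketch of a temppath() helper with the behaviour the tests rely on: it
# yields a path that does not exist yet and removes it afterwards.
# (Assumption: Horovod's real test utility may be implemented differently.)
import contextlib
import os
import tempfile


@contextlib.contextmanager
def temppath():
    with tempfile.TemporaryDirectory() as tmpdir:
        yield os.path.join(tmpdir, 'tmpfile')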
Example No. 2
import os
import stat
from contextlib import contextmanager


# Yields a SparkSession and stops it on exit; temppath comes from the shared
# test utilities (see the sketch after Example No. 1).
@contextmanager
def spark_session(app, cores=2, gpus=0, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    master = 'local-cluster[{},1,1024]'.format(cores) if gpus > 0 else 'local[{}]'.format(cores)
    conf = SparkConf().setAppName(app).setMaster(master)

    with temppath() as temp_filename:
        if gpus > 0:
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                addresses.encode('ascii') + b']}')

            os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                     stat.S_IROTH | stat.S_IXOTH)

            conf = conf.set("spark.test.home", os.environ.get('SPARK_HOME'))
            conf = conf.set("spark.worker.resource.gpu.discoveryScript", temp_filename)
            conf = conf.set("spark.worker.resource.gpu.amount", 1)
            conf = conf.set("spark.task.resource.gpu.amount", "1")
            conf = conf.set("spark.executor.resource.gpu.amount", "1")

        session = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
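Because the function yields the session, it is meant to be used as a context manager (Example No. 13 below uses it exactly that way). A hypothetical usage, assuming the @contextmanager decorator shown above:

# Hypothetical usage of the spark_session() helper defined above.
with spark_session('my_test_app', cores=2) as session:
    df = session.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'value'])
    assert df.count() == 2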
Example No. 3
    def test_load_model_custom_optimizers(self):
        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                custom_optimizers = [TestOptimizer]
                new_model = hvd.load_model(fname, custom_optimizers=custom_optimizers)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self._check_optimizer_weights(opt, new_opt)
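_check_optimizer_weights is a helper method defined elsewhere in this test class. A plausible sketch, assuming it simply compares the weight lists of the original and the reloaded optimizer element by element (numpy imported as np, as in the examples above):

    # Sketch of a _check_optimizer_weights helper (assumption: the real test
    # helper may compare optimizer state differently).
    def _check_optimizer_weights(self, opt, new_opt):
        self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
        for weights, new_weights in zip(opt.get_weights(), new_opt.get_weights()):
            if np.isscalar(weights):
                self.assertEqual(weights, new_weights)
            else:
                self.assertListEqual(weights.tolist(), new_weights.tolist())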
Example No. 4
    def test_load_model(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                new_model = hvd.load_model(fname)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self._check_optimizer_weights(opt, new_opt)
Example No. 5
    def test_load_model_broadcast(self):
        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with temppath() as fname:
            with self.session(config=self.config) as sess:
                K.set_session(sess)

                model = create_model()

                x = np.random.random((1, 3))
                y = np.random.random((1, 3, 3))
                model.train_on_batch(x, y)

                if hvd.rank() == 0:
                    model.save(fname)

            K.clear_session()
            with self.session(config=self.config) as sess:
                K.set_session(sess)

                weight = np.random.random((1, 3))

                if hvd.rank() == 0:
                    model = hvd.load_model(fname)
                else:
                    model = create_model()

                def generator():
                    while True:
                        yield (x, y, weight)

                if hvd.rank() == 0:
                    self.assertEqual(len(model.optimizer.weights), 5)
                else:
                    self.assertEqual(len(model.optimizer.weights), 0)

                # No assertions, we just need to verify that it doesn't hang
                callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
                model.fit_generator(generator(),
                                    steps_per_epoch=1,
                                    callbacks=callbacks,
                                    epochs=1,
                                    verbose=0,
                                    workers=4,
                                    initial_epoch=0)

                self.assertEqual(len(model.optimizer.weights), 5)
Example No. 6
    def test_generate_jsrun_rankfile(self):
        settings = hvd_settings.Settings(
            num_proc=5,
            hosts='host1:4,host2:4,host3:4',
        )

        with temppath() as rankfile_path:
            rankfile_path = generate_jsrun_rankfile(settings, rankfile_path)

            with open(rankfile_path, 'r') as file:
                gen_rankfile = file.read()

            expected_rankfile = (
"""overlapping_rs: allow
cpu_index_using: logical

rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }
rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }
rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }
rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }

rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }
""")

            self.assertMultiLineEqual(gen_rankfile, expected_rankfile)
Example No. 7
    def test_horovodrun_hostfile(self):
        with temppath() as host_filename:
            with open(host_filename, 'w+') as fp:
                fp.write('172.31.32.7 slots=8\n')
                fp.write('172.31.33.9 slots=8\n')

            hosts = parse_host_files(host_filename)
            self.assertEqual(hosts, '172.31.32.7:8,172.31.33.9:8')
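The assertion spells out the contract of parse_host_files: it converts an MPI-style hostfile with one '<host> slots=<n>' entry per line into the comma-separated '<host>:<n>' form that horovodrun expects. A minimal parser with that behaviour (a sketch, not the actual implementation) could be:

# Sketch of a hostfile parser with the behaviour the test asserts
# (assumption: the real function likely has more thorough error handling).
def parse_host_files(filename):
    hosts = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            hostname, slots = line.split()
            hosts.append('{}:{}'.format(hostname, slots.split('=')[1]))
    return ','.join(hosts)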
Example No. 8
    def _run(self,
             discovery_schedule=None,
             exit_schedule=None,
             exit_mode='exception',
             np=2,
             min_np=2,
             max_np=4,
             hosts=None):
        if not discovery_schedule and not hosts:
            raise ValueError(
                'at least one of discovery schedule or hosts must be given')

        with temppath() as logfile:
            with _temp_discovery_script(logfile, discovery_schedule or [(None, hosts.split(','))]) \
                    as discovery_script:
                command_args = [
                    'horovodrun', '-np',
                    str(np), '--min-np',
                    str(min_np), '--log-level', 'DEBUG'
                ]
                if hosts is not None:
                    command_args += ['-H', hosts]
                else:
                    command_args += [
                        '--host-discovery-script', discovery_script,
                        '--max-np',
                        str(max_np)
                    ]

                command_args += [
                    'python', self._training_script, '--logfile', logfile
                ]
                if discovery_schedule:
                    command_args += [
                        '--discovery-schedule',
                        json.dumps(discovery_schedule)
                    ]
                if exit_schedule:
                    command_args += [
                        '--exit-schedule',
                        json.dumps(exit_schedule), '--exit-mode', exit_mode
                    ]
                print(' '.join(command_args))

                with override_args(*command_args):
                    args = parse_args()
                    env = {}
                    config_parser.set_env_from_args(env, args)
                    _run_elastic(args)

                    with open(logfile, 'r') as f:
                        lines = f.readlines()

                    print('logfile:')
                    for line in lines:
                        print(line)

                    return [json.loads(line) for line in lines]
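override_args is used here so that parse_args() sees the horovodrun command line that was just assembled. A minimal sketch, assuming it simply swaps out sys.argv for the duration of the block:

# Sketch of an override_args() context manager (assumption: the real test
# utility may handle additional details).
import contextlib
import sys


@contextlib.contextmanager
def override_args(*args):
    old_argv = sys.argv
    sys.argv = list(args)
    try:
        yield
    finally:
        sys.argv = old_argv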
Example No. 9
    def _run(self,
             discovery_schedule=None,
             exit_schedule=None,
             hosts=None,
             discovery_wait=10,
             epoch_wait=None,
             epochs=None,
             num_proc=2,
             min_num_proc=None,
             max_num_proc=None,
             extra_conf=None):
        with temppath() as logfile:
            with spark_cluster(logfile=logfile,
                               discovery_schedule=discovery_schedule,
                               hosts=hosts,
                               extra_conf=extra_conf):
                command = [
                    sys.executable, self._training_script, '--logfile', logfile
                ]
                if discovery_schedule:
                    command += [
                        '--discovery-schedule',
                        "'{}'".format(json.dumps(discovery_schedule)),
                        '--discovery-wait',
                        str(discovery_wait)
                    ]
                if exit_schedule:
                    command += [
                        '--exit-schedule',
                        "'{}'".format(json.dumps(exit_schedule))
                    ]
                if epochs:
                    command += ['--epochs', str(epochs)]
                if epoch_wait:
                    command += ['--epoch-wait', str(epoch_wait)]

                cmd = ' '.join(command)
                run_elastic(self._exec, (cmd, ),
                            env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                            num_proc=num_proc,
                            min_num_proc=min_num_proc,
                            max_num_proc=max_num_proc,
                            stdout=sys.stdout,
                            stderr=sys.stderr,
                            start_timeout=10,
                            elastic_timeout=10,
                            verbose=2,
                            prefix_output_with_timestamp=True)

                with open(logfile, 'r') as f:
                    lines = f.readlines()

                print('logfile:')
                for line in lines:
                    print(line)

                return [json.loads(line) for line in lines]
Example No. 10
import os
import stat
from contextlib import contextmanager
from tempfile import TemporaryDirectory


# Like Example No. 2, but with max_failures and a per-run metastore,
# warehouse, and local directory under a temporary directory.
@contextmanager
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # Start a single worker with the given cores when GPUs are requested;
        # max_failures is ignored in that case.
        master = 'local-cluster[1,{},1024]'.format(cores) if gpus > 0 \
            else 'local[{},{}]'.format(cores, max_failures)
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll([
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ])

        with temppath() as temp_filename:
            if gpus > 0:
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i)
                                          for i in range(gpus))
                    temp_file.write(
                        b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                        addresses.encode('ascii') + b']}')

                os.chmod(
                    temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                    | stat.S_IROTH | stat.S_IXOTH)

                # the single worker takes all gpus discovered, and a single executor will get them
                # each task on that executor will get a single gpu
                conf = conf.setAll([
                    ('spark.worker.resource.gpu.discoveryScript',
                     temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ])

            session = SparkSession \
                .builder \
                .config(conf=conf) \
                .getOrCreate()

            try:
                yield session
            finally:
                session.stop()
Example No. 11
import os
from contextlib import contextmanager


# Used as a context manager in Example No. 8; DISCOVERY_SCRIPT_TEMPLATE and
# _get_discovery_lines are defined elsewhere and not shown here.
@contextmanager
def _temp_discovery_script(logfile, discovery_schedule):
    with temppath() as discovery_script:
        with open(discovery_script, 'w') as f:
            f.write(DISCOVERY_SCRIPT_TEMPLATE.format(logfile=logfile) + os.linesep)
            for i, schedule_step in enumerate(discovery_schedule):
                f.write(_get_discovery_lines(schedule_step,
                                             start=i == 0,
                                             end=i == len(discovery_schedule) - 1))
        os.chmod(discovery_script, 0o755)
        yield discovery_script
Example No. 12
    def test_timeline(self):
        with temppath() as t:
            with env(HOROVOD_TIMELINE=t, HOROVOD_TIMELINE_MARK_CYCLES='1'):
                hvd.init()

                # Perform a simple allreduce operation
                hvd.allreduce(torch.tensor([1, 2, 3], dtype=torch.float32), name='test_allreduce')

                # Wait for it to register in the timeline.
                time.sleep(0.1)

                if hvd.rank() == 0:
                    with open(t, 'r') as tf:
                        timeline_text = tf.read()
                        assert 'allreduce.test_allreduce' in timeline_text, timeline_text
                        assert 'NEGOTIATE_ALLREDUCE' in timeline_text, timeline_text
                        assert 'ALLREDUCE' in timeline_text, timeline_text
                        assert 'CYCLE_START' in timeline_text, timeline_text
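env(...) temporarily sets the HOROVOD_TIMELINE environment variables so that hvd.init() picks them up. A minimal sketch of such a helper, assuming it restores the previous values on exit:

# Sketch of an env() context manager (assumption: the real test helper
# restores the original environment in a similar way).
import contextlib
import os


@contextlib.contextmanager
def env(**kwargs):
    previous = {key: os.environ.get(key) for key in kwargs}
    os.environ.update({key: str(value) for key, value in kwargs.items()})
    try:
        yield
    finally:
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value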
Example No. 13
    def test_model_serialization(self, mock_remote_trainer):
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        # Stub remote trainer: skips training and returns the serialized model unchanged.
        def train(serialized_model, train_rows, val_rows, avg_row_size):
            return None, serialized_model, 2

        mock_remote_trainer.return_value = train

        with spark_session('test_model_serialization') as spark:
            df = create_xor_data(spark)

            keras_estimator = hvd.KerasEstimator(model=model,
                                                 optimizer=optimizer,
                                                 loss=loss,
                                                 feature_cols=['features'],
                                                 label_cols=['y'],
                                                 batch_size=1,
                                                 epochs=3,
                                                 verbose=2)

            backend = CallbackBackend()
            with local_store() as store:
                with temppath() as saved_path:
                    keras_estimator.save(saved_path)
                    keras_estimator_loaded = hvd.KerasEstimator.load(
                        saved_path)

                keras_model = keras_estimator_loaded.fit(
                    df,
                    params={
                        keras_estimator_loaded.backend: backend,
                        keras_estimator_loaded.store: store
                    })

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32