def test_spark_task_cuda_devices_env_support(num_workers, num_gpus_per_worker):
    def train_fn():
        import os
        return os.environ['CUDA_VISIBLE_DEVICES']

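    # Each requested slot should map to one distinct GPU; ids 10-13 reflect the GPUs
    # assumed to be exposed by the test cluster fixture.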
    for num_slots in [2, 3, 4]:
        runner = MirroredStrategyRunner(num_slots=num_slots)
        task_cuda_env = runner.run(train_fn)
        gpu_set = {int(i) for i in task_cuda_env.split(',')}
        assert len(gpu_set) == num_slots
        for gpu_id in gpu_set:
            assert gpu_id in [10, 11, 12, 13]
Example #2
def test_cpu_training_with_gpus(num_workers, num_gpus_per_worker):
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        if cuda_state:
            num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
        else:
            num_gpus = 0
        return [int(e) for e in context.allGather(str(num_gpus))]

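    # With use_gpu=False every barrier task should see an empty CUDA_VISIBLE_DEVICES,
    # i.e. zero GPUs per task.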
    runner = MirroredStrategyRunner(num_slots=2, use_gpu=False)
    assert runner.get_num_tasks() == 2
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [0, 0]
Example #3
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        if cuda_state:
            num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
        else:
            num_gpus = 0
        return [int(e) for e in context.allGather(str(num_gpus))]

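    # The number of barrier tasks is the ceiling of num_slots over
    # spark.task.resource.gpu.amount, and the slots are then spread as evenly as
    # possible across those tasks.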
    for num_slots in [2, 4, 6, 8]:
        runner = MirroredStrategyRunner(num_slots=num_slots)
        task_gpu_amount = int(
            runner.sc.getConf().get('spark.task.resource.gpu.amount'))
        expected_num_task = math.ceil(num_slots / task_gpu_amount)
        assert runner.get_num_tasks() == expected_num_task
        gpus_used_by_each_task = runner.run(train_fn)
        assert gpus_used_by_each_task == [(num_slots // expected_num_task) +
                                          (i < (num_slots % expected_num_task))
                                          for i in range(expected_num_task)]
Example #4
def test_local_run(num_workers, num_gpus_per_worker, num_slots,
                   old_cuda_state):
    def train_fn():
        import os
        return len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))

    if old_cuda_state is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = old_cuda_state
    result = MirroredStrategyRunner(num_slots=num_slots,
                                    local_mode=True,
                                    gpu_resource_name='gpu').run(train_fn)
    gpus_on_the_driver = [str(e) for e in range(num_slots)]
    assert result == num_slots
    new_cuda_state = os.environ.get('CUDA_VISIBLE_DEVICES')
    assert old_cuda_state == new_cuda_state
Example #5
def test_local_run(num_workers, num_gpus_per_worker, num_slots,
                   old_cuda_state):
    def train_fn():
        import os
        return os.environ['CUDA_VISIBLE_DEVICES']

    if old_cuda_state is not None:
        mock_env = {'CUDA_VISIBLE_DEVICES': old_cuda_state}
    else:
        mock_env = {}

    with mock.patch.dict(os.environ, mock_env, clear=True):
        task_cuda_env = MirroredStrategyRunner(
            num_slots=num_slots, local_mode=True,
            gpu_resource_name='gpu').run(train_fn)
        gpu_set = {int(i) for i in task_cuda_env.split(',')}
        assert len(gpu_set) == num_slots
        for gpu_id in gpu_set:
            if old_cuda_state is not None:
                assert gpu_id in [10, 11, 12, 13]
            else:
                assert gpu_id in [0, 1, 2, 3]
        new_cuda_state = os.environ.get('CUDA_VISIBLE_DEVICES')
        assert old_cuda_state == new_cuda_state
Example #6
def test_run_on_ssl_cluster_override(num_workers, num_gpus_per_worker,
                                     extra_spark_configs):
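    # With the override supplied through extra_spark_configs, running on an
    # SSL-enabled cluster is expected to succeed.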
    MirroredStrategyRunner(num_slots=2,
                           gpu_resource_name='gpu').run(lambda: None)
Example #7
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        if cuda_state:
            num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
        else:
            num_gpus = 0
        return [int(e) for e in context.allGather(str(num_gpus))]

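    # The assertions below assume 4 GPUs per Spark task: slots are packed onto as few
    # tasks as possible and split evenly across them.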
    runner = MirroredStrategyRunner(num_slots=2)
    assert runner.get_num_tasks() == 1
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [2]

    runner = MirroredStrategyRunner(num_slots=4)
    assert runner.get_num_tasks() == 1
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [4]

    runner = MirroredStrategyRunner(num_slots=6)
    assert runner.get_num_tasks() == 2
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [3, 3]

    runner = MirroredStrategyRunner(num_slots=8)
    assert runner.get_num_tasks() == 2
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [4, 4]
Example #8
def test_run_on_ssl_cluster(num_workers, num_gpus_per_worker,
                            extra_spark_configs):
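    # Without the override, running on an SSL/encryption-enabled cluster is expected
    # to raise.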
    with pytest.raises(Exception):
        MirroredStrategyRunner(num_slots=2,
                               gpu_resource_name='gpu').run(lambda: None)
Example #9
def test_zero_num_slots(num_workers, num_gpus_per_worker):
    with pytest.raises(ValueError):
        MirroredStrategyRunner(num_slots=0).run(lambda: None)
Example #10
def train():
    import tensorflow as tf

    # The snippet was captured starting mid-function; the wrapper, import, and the two
    # constants below are reconstructed assumptions based on the names used further down.
    BUFFER_SIZE = 10000
    BATCH_SIZE = 64

    def make_datasets():
        (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()
        dataset = tf.data.Dataset.from_tensor_slices((
            tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
            tf.cast(mnist_labels, tf.int64))
        )
        dataset = dataset.repeat().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
        return dataset

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])
        model.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'],
        )
        return model

    train_datasets = make_datasets()
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_datasets = train_datasets.with_options(options)
    multi_worker_model = build_and_compile_cnn_model()
    multi_worker_model.fit(x=train_datasets, epochs=3, steps_per_epoch=5)

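# Run train() on 8 GPU slots spread across the cluster's barrier tasks.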
MirroredStrategyRunner(num_slots=8).run(train)
Example #11
    def distributed_train(self, train_datasets):
        self.train_datasets = train_datasets
        MirroredStrategyRunner(num_slots=8).run(self.train)
Example #12
                                  activation='relu',
                                  kernel_initializer='he_uniform'))
        model2.add(tf.keras.layers.Dense(10, activation='softmax'))
        model2.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy'],
        )
        return model2

    train_datasets = make_datasets()
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_datasets = train_datasets.with_options(options)
    multi_worker_model = build_and_compile_cnn_model()
    multi_worker_model.fit(x=train_datasets,
                           epochs=10,
                           steps_per_epoch=60000 // 32)
    return multi_worker_model.get_weights()

spark = SparkSession.builder.master("spark://172.31.0.101:7077").appName("distributedTrain")\
    .config("spark.driver.memory", "2g")\
    .config("spark.executor.memory", "2g").enableHiveSupport().getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("Error")
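# Train on CPU-only tasks across 2 slots; run() returns the value returned by the
# train function (here the trained weights) on one of the workers.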
weights = MirroredStrategyRunner(num_slots=2, spark=spark,
                                 use_gpu=False).run(train)
model = build_and_compile_cnn_model()
model.set_weights(weights)
model.save("./trained_model.h5")
Example #13
                                  activation='relu',
                                  kernel_initializer='he_uniform'))
        model2.add(tf.keras.layers.Dense(10, activation='softmax'))
        model2.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy'],
        )
        return model2

    train_datasets = make_datasets()
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_datasets = train_datasets.with_options(options)
    multi_worker_model = build_and_compile_cnn_model()
    multi_worker_model.fit(x=train_datasets,
                           epochs=1,
                           steps_per_epoch=60000 // 32)
    return multi_worker_model.get_weights()

spark = SparkSession.builder.master("spark://192.168.1.38:7077").appName("distributedTrain")\
    .config("spark.driver.memory", "2g")\
    .config("spark.executor.memory", "2g").enableHiveSupport().getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("Error")
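# Use one slot per core of the cluster's default parallelism; training stays on the
# CPU since use_gpu=False.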
weights = MirroredStrategyRunner(num_slots=sc.defaultParallelism,
                                 spark=spark,
                                 use_gpu=False).run(train)
model = build_and_compile_cnn_model()
model.set_weights(weights)
model.save("./trained_model.h5")