def test_train_model(self):
        hvd.init()

        with self.test_session() as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.ThresholdedReLU(0.5))
            model.compile(loss=keras.losses.mean_squared_error,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))

            def generator():
                while 1:
                    yield (x, y)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)
Example #2
def init_gpu(args, logger):
    hvd.init()

    init_logger(
        full=hvd.rank() == 0,
        args=args,
        logger=logger
    )
    if args.affinity != 'disabled':
        gpu_id = hvd.local_rank()
        affinity = set_affinity(
            gpu_id=gpu_id,
            nproc_per_node=hvd.size(),
            mode=args.affinity
        )
        logger.warning(f'{gpu_id}: thread affinity: {affinity}')
    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)

    if args.xla:
        tf.config.optimizer.set_jit(True)
Example #3
def init_workers(distributed=False):
    """Initialize distributed worker"""
    rank, local_rank, n_ranks = 0, 0, 1
    if distributed:
        hvd.init()
        rank, local_rank, n_ranks = hvd.rank(), hvd.local_rank(), hvd.size()
    return rank, local_rank, n_ranks
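
A minimal usage sketch (not part of the original snippet): the values returned by init_workers() are typically used to shard the input data and scale the learning rate; the dataset and base learning rate below are illustrative assumptions.

import tensorflow as tf

# Illustrative only: shard the input across ranks and scale the learning rate
# by the number of workers.
rank, local_rank, n_ranks = init_workers(distributed=True)

dataset = tf.data.Dataset.range(10000)
dataset = dataset.shard(num_shards=n_ranks, index=rank).batch(32)

base_lr = 0.001
scaled_lr = base_lr * n_ranks  # Horovod convention: scale the LR with the world size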
Example #4
def init_workers(distributed=False):
    if distributed:
        hvd.init()
        return SimpleNamespace(rank=hvd.rank(), size=hvd.size(),
                               local_rank=hvd.local_rank(),
                               local_size=hvd.local_size())
    else:
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1)
Example #5
    def __init__(self, *args, **kwargs):
        super(TfKerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        self.config = tf.compat.v1.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.gpu_options.visible_device_list = str(hvd.local_rank())
Example #6
def main(_):
    hvd.init()
    print("After hvd init")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # K.set_session(tf.Session(config=config))
    print("After gpu_options visible_device_list")
    tf.enable_eager_execution(config=config)
    epochs = 20
    steps_per_epoch = 2
    batch_size = 32
    num_classes = 10
    full_model = 'image'
    image_model = 'efficientnet'
    image_training_type = 'finetuning'
    text_model = 'cnn'
    combined_embeddings = 'stack'
    learning_rate = 0.005
    width = 150
    height = 150
    input_shape = (height, width, 3)
    input_size = (224, 224, 3)
    train_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/train.tfrecords'])
    print(train_tfrecord)
    val_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/val.tfrecords'])
    test_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/test.tfrecords'])

    def read_tfrecord(serialized_example):
        feature_description = {
            'image_raw': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }
        example = tf.io.parse_single_example(serialized_example, feature_description)
        input_2 = tf.image.decode_png(example['image_raw'], channels=3, dtype=tf.dtypes.uint8)
        input_2 = tf.image.resize(input_2, [600, 600])
        return (input_2, example['label'])
    train_parsed_dataset = train_tfrecord.map(read_tfrecord)
    val_parsed_dataset = val_tfrecord.map(read_tfrecord)
    test_parsed_dataset = test_tfrecord.map(read_tfrecord)
    tf.keras.backend.clear_session()
    baseModel = EfficientNetB7(weights='imagenet', include_top=True)
    probs = baseModel.layers.pop()
    top_dropout = probs.input
    headModel = layers.Dense(10, activation='softmax')(top_dropout)
    model = models.Model(inputs=baseModel.input, outputs=headModel)
    SGD = optimizers.SGD(lr=0.01, decay=4e-05, momentum=0.9)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=adapt_optimizer(SGD), metrics=['accuracy'])
    train_dataset = train_parsed_dataset.batch(2).repeat()
    val_dataset = val_parsed_dataset.batch(2).repeat()
    test_dataset = test_parsed_dataset.batch(2).repeat()
    model.fit(train_dataset,
              epochs=adapt_epochs(epochs),
              steps_per_epoch=400,
              validation_data=val_dataset,
              validation_steps=100,
              verbose=(1 if hvd.rank() == 0 else 0),
              callbacks=adapt_callbacks([], True))
    if hvd.rank() == 0:
        model.save('saved_model.h5')
        test_loss, test_acc = model.evaluate(test_dataset, verbose=0, steps=1241)
        print('Test loss =', test_loss)
        print('Test acc =', test_acc)
Example #7
    def setup_horovod(self):
        import horovod.tensorflow.keras as hvd
        hvd.init()
        self.model = self.model_creator(self.config)
        compile_args = self.compile_args_creator(self.config)
        compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])

        self.model.compile(**compile_args)
        self.backend = "horovod"
    def __init__(self, *args, **kwargs):
        super(Tf2KerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
Example #9
def initialize_horovod():
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   "GPU")

    return hvd.size()
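
A minimal usage sketch (not part of the original snippet): the world size returned by initialize_horovod() is commonly used to scale the base learning rate before wrapping the optimizer; the optimizer choice and base learning rate are assumptions.

# Illustrative only.
size = initialize_horovod()
opt = tf.keras.optimizers.Adam(learning_rate=0.001 * size)
opt = hvd.DistributedOptimizer(opt)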
Example #10
def init():
    gpu_thread_count = 2
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    hvd.init()
    if hvd.rank() == 0:
        print('PY', sys.version)
        print('TF', tf.version.VERSION)
Example #11
def connect_GPU_to_horovod():
    import horovod.tensorflow.keras as hvd
    import tensorflow as tf
    tf.keras.backend.clear_session()
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
def train_hvd(learning_rate=1.0):
    # TensorFlow objects do not pickle reliably, so import its modules explicitly inside the worker
    from tensorflow.keras import backend as K
    from tensorflow.keras.models import Sequential
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(),
                                                       hvd.size())
    model = get_model(num_classes)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = keras.optimizers.Adadelta(lr=learning_rate * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(checkpoint_dir +
                                            '/checkpoint-{epoch}.ckpt',
                                            save_weights_only=True))

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))
Example #13
    def setup_horovod(self):
        import horovod.tensorflow.keras as hvd
        hvd.init()
        self.model = self.model_creator(self.config)
        compile_args = self.compile_args_creator(self.config)
        compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])

        self.model.compile(**compile_args)
        self.backend = "horovod"
        self.size = hvd.size()
        self.rank = hvd.rank()
        from tensorflow.python.distribute import distribution_strategy_context as ds_context
        self.strategy = ds_context.get_strategy()
Example #14
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf1", action="store_true")
    parser.add_argument(
        "--aggregation-frequency", dest="aggregation_frequency", default=0, type=int
    )
    parser.add_argument("--average-aggregated-gradients", action="store_true")
    args = parser.parse_args()

    hvd.init()
    if args.tf1:
        check_tf_1(args.aggregation_frequency, args.average_aggregated_gradients)
    else:
        check_tf_2(args.aggregation_frequency, args.average_aggregated_gradients)
Example #15
def simple_fn():

    hvd.init()

    rank = hvd.rank()

    # Get the hostname with socket.gethostname()
    hostname = socket.gethostname()
    # Resolve the IP address with socket.gethostbyname()
    ip_address = socket.gethostbyname(hostname)

    print(f"hvd rank[{ip_address}]", rank)

    return rank
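
A minimal usage sketch (not part of the original snippet): a function like simple_fn() is normally started once per process by the Horovod launcher, so every rank prints its own (ip_address, rank) pair.

# Illustrative only: launched with e.g. `horovodrun -np 4 python this_script.py`.
if __name__ == "__main__":
    simple_fn()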
Example #16
def setup(args, report):
    """Set up environment variables given the type of partition."""
    # Initialize Horovod
    hvd.init()
    # Set the environment variable required for h5py file read/write
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.log_device_placement = False
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))
    np.random.seed(args.random_seed)
    print('Rank ' + str(hvd.rank()) + ' session configured')
Example #17
    def train(cls, training_rows, training_steps_per_epoch, val_rows,
              val_steps_per_epoch, epochs, gen_workers):
        """
      Trains model over training / validation data generators.
      We measure the average time taken to train & validate the model for each epoch
      """
        from tensorflow.keras import backend as K
        from tensorflow import keras
        import horovod.tensorflow.keras as hvd

        hvd.init()
        model = Lenet5.get_model()

        train_imgs = ArrGenerator(img_size=np.array([training_rows, 32, 32,
                                                     3]),
                                  gen_cls=RandomArrCreator)
        train_labels = ArrGenerator(img_size=np.array([training_rows, 10]),
                                    gen_cls=RandomArrCreator)
        train_gen = DataGenerator.generate(img_gen=train_imgs,
                                           label_gen=train_labels)

        val_imgs = ArrGenerator(img_size=np.array([val_rows, 32, 32, 3]),
                                gen_cls=RandomArrCreator)
        val_labels = ArrGenerator(img_size=np.array([val_rows, 10]),
                                  gen_cls=RandomArrCreator)
        val_gen = DataGenerator.generate(img_gen=val_imgs,
                                         label_gen=val_labels)

        opt = keras.optimizers.Adadelta()
        opt = hvd.DistributedOptimizer(opt)

        model.compile(optimizer=opt,
                      loss="mean_squared_error",
                      metrics=['accuracy'])

        # For training
        model.fit_generator(generator=train_gen,
                            steps_per_epoch=training_steps_per_epoch,
                            epochs=epochs,
                            validation_data=val_gen,
                            validation_steps=val_steps_per_epoch,
                            max_queue_size=20,
                            workers=gen_workers,
                            use_multiprocessing=True,
                            callbacks=[cls.time_callback])

        hvd.shutdown()
        return
Example #18
def train_evaluate():

    # Initialize Horovod
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    train_dataset, eval_dataset = prepare_datasets()

    model = toy_resnet_model()

    # Wrap an optimizer in Horovod
    optimizer = hvd.DistributedOptimizer(optimizers.Adadelta())

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with loaded weights.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback()
    ]

    # Horovod: save checkpoints only on worker 0 (master) to prevent other workers from corrupting them.
    # Configure Tensorboard and Azure ML Tracking
    if hvd.rank() == 0:
        #callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS['job-dir'].value,
                                           update_freq='epoch'))

    model.fit(train_dataset,
              epochs=FLAGS.epochs,
              steps_per_epoch=1000,
              callbacks=callbacks,
              validation_data=eval_dataset,
              validation_steps=200)
Example #19
def init(en_mem_growth=False, set_visible_dev=False):
    """ This initializes the horovod package.
    :param en_mem_growth:
    :param set_visible_dev:
    """
    if hvd is not None:
        hvd.init()
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            if en_mem_growth:
                tf.config.experimental.set_memory_growth(gpu, True)
        if gpus and set_visible_dev:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')
    else:
        print("Horovod not supported on this system!")
Example #20
def init_gpu(args, logger):
    hvd.init()

    init_logger(full=hvd.rank() == 0, args=args, logger=logger)
    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        affinity = set_affinity(gpu_id=gpu_id,
                                nproc_per_node=hvd.size(),
                                mode=args.affinity)
        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

    if args.amp:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        tf.config.optimizer.set_jit(True)
Example #21
def init_hvd(args):
    if hvd:
        hvd.init()
        FORMAT = "[%%(levelname)s - P%i/%i - %%(filename)s:%%(lineno)s - %%(funcName)s] %%(message)s" % (
            hvd.rank(), hvd.size())
        # Remove all handlers associated with the root logger object.
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        logger.debug("Updated logger to print process")
    args.hvd_rank = hvd.rank() if hvd else 0
    args.hvd_size = hvd.size() if hvd else 1
Example #22
    def test_sparse_as_dense(self):
        hvd.init()

        with self.test_session() as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)

            model = keras.models.Sequential()
            model.add(keras.layers.Embedding(1000, 64, input_length=10))
            model.compile(loss=keras.losses.mean_squared_error, optimizer=opt)

            x = np.random.randint(1000, size=(32, 10))
            y = np.random.random((32, 10, 64))
            # No assertions, we just need to verify that it doesn't hang
            model.train_on_batch(x, y)
Example #23
def handle_distribution_strategy(distribution_strategy):
    """ Create distribution strategy. """
    strategy = None
    if distribution_strategy:
        strategy = distribution_strategy
        if isinstance(distribution_strategy, dict):
            strategy = distribution_strategy.get("distribution_strategy", None)
        if isinstance(distribution_strategy, str):
            strategy = distribution_strategy.lower()
        if is_third_party_allreduce(strategy):
            if strategy == "horovod":
                import horovod.tensorflow.keras as hvd
            else:
                import byteps.tensorflow.keras as hvd
            logging.info("import {} as hvd backend.".format(strategy))
            hvd.init()
            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            compat.register_distributed_worker_setting(hvd.rank(), hvd.size(),
                                                       strategy)
            if hvd.rank() != 0:
                logging.set_verbosity(logging.ERROR)
        else:
            if isinstance(distribution_strategy, str):
                strategy = distribution_utils.get_distribution_strategy(
                    distribution_strategy=distribution_strategy)
            elif isinstance(distribution_strategy, dict):
                strategy = distribution_utils.get_distribution_strategy(
                    **distribution_strategy)

    if strategy is None:
        logging.info("No distribution strategy was used.")
    else:
        try:
            logging.info(
                "Using distribution strategy: {} with num_replicas_in_sync={}".
                format(strategy, strategy.num_replicas_in_sync))
        except Exception:
            pass
    return strategy
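
A minimal usage sketch (not part of the original snippet): selecting the Horovod backend through the dict form that handle_distribution_strategy() unwraps; the surrounding framework modules (compat, distribution_utils) are assumed to be importable.

# Illustrative only.
strategy = handle_distribution_strategy({"distribution_strategy": "horovod"})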
Example #24
def main() -> None:
    """
    Start training Seq2Seq model.

    :return: None
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    gpu_list = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpu_list:
        tf.config.experimental.set_memory_growth(gpu, True)

    if gpu_list:
        print("Visible GPUs detected.")
        tf.config.experimental.set_visible_devices(gpu_list[hvd.local_rank()], "GPU")

    print("Loading input data.")
    subject_list, body_list = load_data()

    config: Seq2SeqConfig = fit_text(body_list, subject_list)
    summarizer: Seq2SeqSummarizer = Seq2SeqSummarizer(config)

    if not CONFIG.is_dev:
        if tf.io.gfile.exists(LOCAL_MODEL_WEIGHTS):
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)
    else:
        Path(CONFIG.bucket_summarization_model).mkdir(parents=True, exist_ok=True)
        if Path(LOCAL_MODEL_WEIGHTS).exists():
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)

    body_train, body_test, subject_train, subject_test = train_test_split(body_list, subject_list, test_size=0.2)

    print("Starting training.")
    summarizer.fit(
        body_train=body_train,
        subject_train=subject_train,
        body_test=body_test,
        subject_test=subject_test,
        epochs=int(math.ceil(100 / hvd.size())),
        batch_size=128,
    )
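
A short illustration (not part of the original snippet) of the epoch scaling used above, which divides the total epoch budget across workers; the worker count of 4 is an assumption.

import math

# Illustrative only: with 4 workers, each rank runs ceil(100 / 4) = 25 epochs.
workers = 4
epochs_per_worker = int(math.ceil(100 / workers))  # -> 25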
Example #25
def main(argv=None):
    tf.reset_default_graph()
    # init horovod
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    keras.backend.set_session(tf.Session(config=config))

    image, label = _get_dataset()

    model_input = keras.layers.Input(tensor=image)

    model_output = keras.layers.Flatten(input_shape=(-1, 299, 299, 3))(model_input)

    model_output = keras.layers.Dense(5, activation='relu')(model_output)

    model = keras.models.Model(inputs=model_input, outputs=model_output)

    # Horovod: scale the learning rate by the number of workers and wrap the optimizer
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())
    opt = hvd.DistributedOptimizer(opt)
    model.compile(optimizer=opt, loss='categorical_crossentropy',
                  metrics=['accuracy'], target_tensors=[label])

    # callback
    t_callback = keras.callbacks.TensorBoard(log_dir='./logs')
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        t_callback,
    ]

    epochs = int(math.ceil(FLAGS.num_epochs / hvd.size()))
    model.fit(epochs=epochs, steps_per_epoch=FLAGS.steps_one_epoch, callbacks=callbacks)

    # save to h5
    h5file = os.path.join(FLAGS.model_path, 'model.h5')

    if hvd.rank() == 0:
        keras.models.save_model(model, h5file)
Example #26
    def __init__(self, config):
        self.config = config
        self.checkpoint_path = config.get_attribute('checkpoint_path')
        self.epochs = config.get_attribute('epochs')
        self.checkpoint_save_period = config.get_attribute(
            'checkpoint_save_period')
        self.checkpoint_format = 'checkpoint-{epoch}.h5'
        self.learning_rate = config.get_attribute('learning_rate')
        self.models_train = []
        self.models_eval = []
        self.train_steps_per_epoch = 1
        self.eval_steps_per_epoch = 1
        self.resume_from_epoch = 0
        self.verbose = 1
        self.cur_epoch = 0
        hvd.init()
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')
        self.verbose = 1 if hvd.rank() == 0 else 0
        origin_train_model = get_model(config, is_training=True)
        origin_eval_model = get_model(config, is_training=False)
        self.models_train.append(origin_train_model)
        self.models_eval.append(origin_eval_model)
        train_model = tf.keras.models.clone_model(origin_train_model)
        eval_model = tf.keras.models.clone_model(origin_eval_model)
        self.models_train.append(train_model)
        self.models_eval.append(eval_model)
        self.train_dataset, self.eval_dataset, self.train_dataset_distill, self.eval_dataset_distill = \
            self.build_dataset()
        self.build_train()
        self.build_eval()
        self.load_model()
        self.save_model_path = config.get_attribute('checkpoint_eval_path')
        self.callbacks = []
Example #27
def train(num_epochs):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(
            gpus[hvd.local_rank()], 'GPU')

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64))
    )
    dataset = dataset.repeat().shuffle(10000).batch(128)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    scaled_lr = 0.001 * hvd.size()
    opt = tf.optimizers.Adam(scaled_lr)

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                        optimizer=opt,
                        metrics=['accuracy'],
                        experimental_run_tf_function=False)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            './checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks, epochs=num_epochs, verbose=verbose)
Example #28
    def train_fn(model_bytes):
        # Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
        # with TensorFlow libraries.  Use `pa` package reference to ensure it's loaded before
        # functions like `deserialize_model` which are implemented at the top level.
        # See https://jira.apache.org/jira/browse/ARROW-3346
        pa

        import atexit
        import horovod.tensorflow.keras as hvd
        from horovod.spark.task import get_available_devices
        import os
        from petastorm import make_batch_reader
        from petastorm.tf_utils import make_petastorm_dataset
        import tempfile
        import tensorflow as tf
        import tensorflow.keras.backend as K
        import shutil

        # Horovod: initialize Horovod inside the trainer.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per process), if GPUs are available.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = get_available_devices()[0]
        K.set_session(tf.Session(config=config))

        # Horovod: restore from checkpoint, use hvd.load_model under the hood.
        model = deserialize_model(model_bytes, hvd.load_model)

        # Horovod: adjust learning rate based on number of processes.
        scaled_lr = K.get_value(model.optimizer.lr) * hvd.size()
        K.set_value(model.optimizer.lr, scaled_lr)

        # Horovod: print summary logs on the first worker.
        verbose = 2 if hvd.rank() == 0 else 0

        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),

            # Reduce LR if the metric is not improved for 10 epochs, and stop training
            # if it has not improved for 20 epochs.
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_exp_rmspe', patience=10, verbose=verbose),
            tf.keras.callbacks.EarlyStopping(monitor='val_exp_rmspe', mode='min', patience=20, verbose=verbose),
            tf.keras.callbacks.TerminateOnNaN()
        ]

        # Model checkpoint location.
        ckpt_dir = tempfile.mkdtemp()
        ckpt_file = os.path.join(ckpt_dir, 'checkpoint.h5')
        atexit.register(lambda: shutil.rmtree(ckpt_dir))

        # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_file, monitor='val_exp_rmspe', mode='min',
                                                                save_best_only=True))

        # Make Petastorm readers.
        with make_batch_reader('%s/train_df.parquet' % args.data_dir, num_epochs=None,
                               cur_shard=hvd.rank(), shard_count=hvd.size(),
                               hdfs_driver=PETASTORM_HDFS_DRIVER) as train_reader:
            with make_batch_reader('%s/val_df.parquet' % args.data_dir, num_epochs=None,
                                   cur_shard=hvd.rank(), shard_count=hvd.size(),
                                   hdfs_driver=PETASTORM_HDFS_DRIVER) as val_reader:
                # Convert readers to tf.data.Dataset.
                train_ds = make_petastorm_dataset(train_reader) \
                    .apply(tf.data.experimental.unbatch()) \
                    .shuffle(int(train_rows / hvd.size())) \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

                val_ds = make_petastorm_dataset(val_reader) \
                    .apply(tf.data.experimental.unbatch()) \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

                history = model.fit(train_ds,
                                    validation_data=val_ds,
                                    steps_per_epoch=int(train_rows / args.batch_size / hvd.size()),
                                    validation_steps=int(val_rows / args.batch_size / hvd.size()),
                                    callbacks=callbacks,
                                    verbose=verbose,
                                    epochs=args.epochs)

        # Dataset API usage currently displays a wall of errors upon termination.
        # This global model registration ensures clean termination.
        # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
        globals()['_DATASET_FINALIZATION_HACK'] = model

        if hvd.rank() == 0:
            with open(ckpt_file, 'rb') as f:
                return history.history, f.read()
    def __init__(self,
                 timesteps,
                 includeAux,
                 folderI,
                 trainLoss,
                 includeModis,
                 includeVGG,
                 disLoss,
                 cloud_cov=0.4,
                 istransfer=False,
                 img_h=256,
                 img_width=256,
                 startT='01-01-2018',
                 endT='01-05-2019'):

        self.img_h = img_h
        self.img_w = img_width
        self.timesteps = timesteps
        self.includeModis = includeModis
        hvd.init()

        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        self.gen_schedule = ExponentialDecay(1e-4 * hvd.size(),
                                             decay_steps=10000,
                                             decay_rate=0.1,
                                             staircase=True)

        self.disc_schedule = ExponentialDecay(1e-4 * hvd.size() * 5,
                                              decay_steps=10000,
                                              decay_rate=0.1,
                                              staircase=True)
        self.istransfer = istransfer
        # self.disOp = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(1e-4 * hvd.size(), 0.5))
        # self.lstmOp = hvd.DistributedOptimizer(Adam(lr=1e-4 * hvd.size(), beta_1=0.9, beta_2=0.999, epsilon=1e-08))
        self.disOp = hvd.DistributedOptimizer(
            Adam(learning_rate=self.disc_schedule))
        self.lstmOp = hvd.DistributedOptimizer(
            Adam(learning_rate=self.gen_schedule))

        self.model_helpers = models.LSTM_GAN_MODEL(disOp=self.disOp,
                                                   lstmOp=self.lstmOp,
                                                   h=self.img_h,
                                                   w=self.img_w,
                                                   timeStep=timesteps,
                                                   includeAux=includeAux,
                                                   trainLoss=trainLoss,
                                                   disLoss=disLoss)

        # print("GOT MODIS======", includeModis)
        if includeVGG and includeModis == 0:
            if istransfer:
                self.dataloader = dataloaders.DatasetHandling(
                    self.img_w,
                    self.img_h,
                    no_of_timesteps=timesteps,
                    startT=startT,
                    endT=endT,
                    cloud_cov=cloud_cov,
                    album='foco-co-20km')

                self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = self.model_helpers.lstm_gan_with_vgg_transfer(
                    self.transferLear())
            else:
                self.dataloader = dataloaders.DatasetHandling(
                    self.img_w,
                    self.img_h,
                    no_of_timesteps=timesteps,
                    startT=startT,
                    endT=endT,
                    cloud_cov=cloud_cov)

                self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = self.model_helpers.lstm_gan_with_vgg(
                )
        elif not includeVGG and includeModis == 0:
            self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = self.model_helpers.lstm_gan_no_vgg(
            )
        elif includeModis == 1:
            self.lstm_gan, self.vgg, self.disciminator, self.lstm_generator = self.model_helpers.lstm_gan_with_vgg_multi_modis(
            )

        self.dirName = "/s/" + socket.gethostname(
        ) + "/a/nobackup/galileo/paahuni/" + str(folderI) + "/"
        if not includeModis == 2:
            self.img_itr = self.dataloader.get_non_random_image_iterator_new(
                batch_size=1,
                no_of_timesteps=self.timesteps,
                sendMetaInfo=True,
                includeModis=includeModis)
        else:
            self.dataloader = dataloaders.DatasetHandling(
                self.img_w,
                self.img_h,
                no_of_timesteps=timesteps,
                startT=startT,
                endT=endT,
                cloud_cov=cloud_cov)
        self.includeVGG = includeVGG
Example #30
IMAGE_HEIGHT = mlctx.get_param("image_height", 128)
IMAGE_CHANNELS = mlctx.get_param("image_channels", 3)  # RGB color
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)
IMAGE_SHAPE = (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS)
EPOCHS = mlctx.get_param("epochs", 1)
BATCH_SIZE = mlctx.get_param("batch_size", 16)
# RANDOM_STATE must be a parameter for reproducibility:
RANDOM_STATE = mlctx.get_param("random_state", 1)
TEST_SIZE = mlctx.get_param("test_size", 0.2)

# kubeflow outputs/inputs
categories_map = str(mlctx.get_input("categories_map").get())
df = pd.read_csv(str(mlctx.get_input("file_categories")))

# Horovod: initialize Horovod.
hvd.init()

# if gpus found, pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

if hvd.rank() == 0:
    mlctx.logger.info(
        f"Validating paths:\nData_path:\t{DATA_PATH}\nModel_dir:\t{MODEL_DIR}\n"
    )
    mlctx.logger.info(f"Categories map:{categories_map}")
Example #31
if __name__ == '__main__':
    
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    parser = argparse.ArgumentParser()

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(12.0 / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28