Example #1
def init_gpu(args, logger):
    hvd.init()

    init_logger(
        full=hvd.rank() == 0,
        args=args,
        logger=logger
    )
    if args.affinity != 'disabled':
        gpu_id = hvd.local_rank()
        affinity = set_affinity(
            gpu_id=gpu_id,
            nproc_per_node=hvd.size(),
            mode=args.affinity
        )
        logger.warning(f'{gpu_id}: thread affinity: {affinity}')
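    # Pin this process to a single GPU chosen by its local rank (one GPU per process).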
    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)

    if args.xla:
        tf.config.optimizer.set_jit(True)
Example #2
    def build(self) -> tf.data.Dataset:
        """Construct a dataset end-to-end and return it.

        Returns:
            A TensorFlow dataset yielding batched images and labels.
        """
        if self._use_dali:
            print("Using dali for {train} dataloading".format(
                train="training" if self.is_training else "validation"))
            tfrec_filenames = sorted(
                tf.io.gfile.glob(
                    os.path.join(self._data_dir, '%s-*' % self._split)))
            tfrec_idx_filenames = sorted(
                tf.io.gfile.glob(
                    os.path.join(self._index_file, '%s-*' % self._split)))

            # Create the DALI pipeline
            dali_pipeline = Dali.DaliPipeline(
                tfrec_filenames=tfrec_filenames,
                tfrec_idx_filenames=tfrec_idx_filenames,
                height=self._image_size,
                width=self._image_size,
                batch_size=self.local_batch_size,
                num_threads=1,
                device_id=hvd.local_rank(),
                shard_id=hvd.rank(),
                num_gpus=hvd.size(),
                num_classes=self.num_classes,
                deterministic=False,
                dali_cpu=False,
                training=self.is_training)

            # Define shapes and types of the outputs
            shapes = ((self.local_batch_size, self._image_size,
                       self._image_size, 3), (self.local_batch_size,
                                              self._num_classes))
            dtypes = (tf.float32, tf.float32)

            # Create dataset
            dataset = dali_tf.DALIDataset(pipeline=dali_pipeline,
                                          batch_size=self.local_batch_size,
                                          output_shapes=shapes,
                                          output_dtypes=dtypes,
                                          device_id=hvd.local_rank())
            # if self.is_training and self._augmenter:
            #     print('Augmenting with {}'.format(self._augmenter))
            #     dataset.unbatch().map(self.augment_pipeline, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(self.local_batch_size)
            return dataset
        else:
            print("Using tf native pipeline for {train} dataloading".format(
                train="training" if self.is_training else "validation"))
            dataset = self.load_records()
            dataset = self.pipeline(dataset)

            return dataset
Example #3
def check_tf_1(aggregation_frequency: int,
               average_aggregated_gradients: bool) -> None:
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    K.set_session(tf.Session(config=config))
    session = tf.compat.v1.keras.backend.get_session(op_input_list=())

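    # Accumulate gradients locally for `backward_passes_per_step` steps before allreducing them across ranks.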
    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        backward_passes_per_step=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    iterations = hvd_optimizer.iterations
    session.run(iterations.initializer)

    grads = [tf.constant([float(hvd.rank())])]
    variables = [tf.Variable([0.0])]
    session.run(variables[0].initializer)

    allreduce_op = hvd_optimizer._allreduce(grads)
    grads_and_vars = [(allreduce_op[0], variables[0])]
    apply_grads_op = hvd_optimizer.apply_gradients(grads_and_vars)

    for idx in range(10):
        _ = session.run(apply_grads_op)

        expected_value = compute_expected_variable_value(
            idx, aggregation_frequency, average_aggregated_gradients)

        assert idx + 1 == session.run(hvd_optimizer.iterations)
        assert expected_value == session.run(variables[0].read_value())
Example #4
def check_tf_2(aggregation_frequency: int,
               average_aggregated_gradients: bool) -> None:
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   "GPU")

    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        backward_passes_per_step=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    _ = hvd_optimizer.iterations

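    # Each rank contributes a gradient equal to its own rank, so the aggregated result is easy to verify.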
    gradients = [tf.constant([float(hvd.rank())])]
    variables = [tf.Variable([0.0])]
    for idx in range(10):
        if _PRE_TF_2_4_0:
            # In TF < 2.4 `_aggregate_gradients()` is called outside of `apply_gradients()`.
            updated_gradients = hvd_optimizer._aggregate_gradients(
                zip(gradients, variables))
            hvd_optimizer.apply_gradients(
                zip(updated_gradients, variables),
                experimental_aggregate_gradients=False)
        else:
            hvd_optimizer.apply_gradients(zip(gradients, variables))

        updated_variable_value = variables[0][0].numpy()
        expected_value = compute_expected_variable_value(
            idx, aggregation_frequency, average_aggregated_gradients)

        assert expected_value == updated_variable_value
        assert idx + 1 == hvd_optimizer.iterations.numpy()
Example #5
def init_workers(distributed=False):
    """Initialize distributed worker"""
    rank, local_rank, n_ranks = 0, 0, 1
    if distributed:
        hvd.init()
        rank, local_rank, n_ranks = hvd.rank(), hvd.local_rank(), hvd.size()
    return rank, local_rank, n_ranks
Example #6
def generate_stats_name(model, root):
    # Generates the name of the output stats file.
    # If Horovod distribution is enabled, the node and GPU ID are appended to the end
    return ('%s_%s%s%s.csv' %
           (model, root,
           ('_%s' % os.environ["HOSTNAME"] if hvd else ""),
           (('_gpu%s' % hvd.local_rank()) if hvd else "")))
Example #7
def run_model(args):
    if args.lms:
        tf.config.experimental.set_lms_enabled(True)

    image_dim = args.image_size
    opt = tf.keras.optimizers.RMSprop()
    if hvd:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.list_physical_devices('GPU')
        tf.config.experimental.set_memory_growth(gpus[hvd.local_rank()], True)
        tf.config.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

        # Horovod: add Horovod DistributedOptimizer.
        opt = hvd.DistributedOptimizer(opt)
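        # Split the requested number of steps across workers.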
        steps_per_epoch = max(1, args.steps // hvd.size())
        experimental_run_tf_function = False
    else:
        steps_per_epoch = args.steps
        experimental_run_tf_function = True

    if args.channels_last:
        K.set_image_data_format('channels_last')
        input_shape = (image_dim, image_dim, 3)
    else:
        K.set_image_data_format('channels_first')
        input_shape = (3, image_dim, image_dim)

    num_classes = 15
    batch_size = args.batch_size
    model_class = model_choices.get(args.model)
    model = model_class(weights=None,
                        include_top=True,
                        input_shape=input_shape,
                        classes=num_classes)

    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  experimental_run_tf_function=experimental_run_tf_function)

    random_generator = random_image_generator(batch_size, num_classes,
                                              input_shape)

    model.fit(random_generator,
              steps_per_epoch=steps_per_epoch,
              verbose=1 if not hvd or hvd.rank() == 0 else 0,
              epochs=args.epochs,
              callbacks=get_callbacks(args))
Example #8
def init_workers(distributed=False):
    if distributed:
        hvd.init()
        return SimpleNamespace(rank=hvd.rank(), size=hvd.size(),
                               local_rank=hvd.local_rank(),
                               local_size=hvd.local_size())
    else:
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1)
Example #9
    def __init__(self, *args, **kwargs):
        super(TfKerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        self.config = tf.compat.v1.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.gpu_options.visible_device_list = str(hvd.local_rank())
Example #10
def main(_):
    hvd.init()
    print("After hvd init")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # K.set_session(tf.Session(config=config))
    print("After gpu_options visible_device_list")
    tf.enable_eager_execution(config=config)
    epochs = 20
    steps_per_epoch = 2
    batch_size = 32
    num_classes = 10
    full_model = 'image'
    image_model = 'efficientnet'
    image_training_type = 'finetuning'
    text_model = 'cnn'
    combined_embeddings = 'stack'
    learning_rate = 0.005
    width = 150
    height = 150
    input_shape = (height, width, 3)
    input_size = (224, 224, 3)
    train_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/train.tfrecords'])
    print(train_tfrecord)
    val_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/val.tfrecords'])
    test_tfrecord = tf.data.TFRecordDataset(filenames=['tfrecords/test.tfrecords'])

    def read_tfrecord(serialized_example):
        feature_description = {
            'image_raw': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }
        example = tf.io.parse_single_example(serialized_example, feature_description)
        input_2 = tf.image.decode_png(example['image_raw'], channels=3, dtype=tf.dtypes.uint8)
        input_2 = tf.image.resize(input_2, [600, 600])
        return (input_2, example['label'])
    train_parsed_dataset = train_tfrecord.map(read_tfrecord)
    val_parsed_dataset = val_tfrecord.map(read_tfrecord)
    test_parsed_dataset = test_tfrecord.map(read_tfrecord)
    tf.keras.backend.clear_session()
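    # Replace EfficientNetB7's final softmax layer with a new 10-class classification head.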
    baseModel = EfficientNetB7(weights='imagenet', include_top=True)
    probs = baseModel.layers.pop()
    top_dropout = probs.input
    headModel = layers.Dense(10, activation='softmax')(top_dropout)
    model = models.Model(inputs=baseModel.input, outputs=headModel)
    SGD = optimizers.SGD(lr=0.01, decay=4e-05, momentum=0.9)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=adapt_optimizer(SGD), metrics=['accuracy'])
    train_dataset = train_parsed_dataset.batch(2).repeat()
    val_dataset = val_parsed_dataset.batch(2).repeat()
    test_dataset = test_parsed_dataset.batch(2).repeat()
    model.fit(train_dataset, epochs=adapt_epochs(epochs), steps_per_epoch=400, validation_data=val_dataset, validation_steps=100, verbose=(1 if (hvd.rank() == 0) else 0), callbacks=adapt_callbacks([], True))
    if (hvd.rank() == 0):
        model.save('saved_model.h5')
    if (hvd.rank() == 0):
        (test_loss, test_acc) = model.evaluate(test_dataset, verbose=0, steps=1241)
        print('Test loss =', test_loss)
        print('Test acc =', test_acc)
Example #11
def initialize_horovod():
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   "GPU")

    return hvd.size()
Example #12
    def __init__(self, *args, **kwargs):
        super(Tf2KerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
Example #13
def config_gpu():
    """"
    Setup the GPUs to support for running with more memory as opposed to pre allocated memory.
    Controls the percentage use of memory. Sets up process such that, typically one process acts on one gpu.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction
    session = tf.Session(config=config)
    K.set_session(session)
Example #14
def gpu_config():
    try:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
    except Exception:
        # Fall back to a default session without GPU pinning (e.g. if Horovod is not initialized).
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.gpu_options.allow_soft_placement = True
        session = tf.Session(config=config)
        K.set_session(session)
Example #15
def connect_GPU_to_horovod():
    import horovod.tensorflow.keras as hvd
    import tensorflow as tf
    tf.keras.backend.clear_session()
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
Example #16
def train_hvd(learning_rate=1.0):
    # TensorFlow modules are not picklable, so they must be imported inside the worker function.
    from tensorflow.keras import backend as K
    from tensorflow.keras.models import Sequential
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(),
                                                       hvd.size())
    model = get_model(num_classes)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = keras.optimizers.Adadelta(lr=learning_rate * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(checkpoint_dir +
                                            '/checkpoint-{epoch}.ckpt',
                                            save_weights_only=True))

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))
Example #17
def setup(args, report):
    """Set up environment variables given the type of partition."""
    # Initialize Horovod
    hvd.init()
    # Set environment variable necessary to use h5py for file read/write
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.log_device_placement = False
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))
    np.random.seed(args.random_seed)
    print('Rank ' + str(hvd.rank()) + ' session configured')
Example #18
def train_evaluate():

    # Initialize Horovod
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    train_dataset, eval_dataset = prepare_datasets()

    model = toy_resnet_model()

    # Wrap an optimizer in Horovod
    optimizer = hvd.DistributedOptimizer(optimizers.Adadelta())

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with loaded weights.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback()
    ]

    # Horovod: save checkpoints only on worker 0 (master) to prevent other workers from corrupting them.
    # Configure Tensorboard and Azure ML Tracking
    if hvd.rank() == 0:
        #callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS['job-dir'].value,
                                           update_freq='epoch'))

    model.fit(train_dataset,
              epochs=FLAGS.epochs,
              steps_per_epoch=1000,
              callbacks=callbacks,
              validation_data=eval_dataset,
              validation_steps=200)
Example #19
def init_gpu(args, logger):
    hvd.init()

    init_logger(full=hvd.rank() == 0, args=args, logger=logger)
    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        affinity = set_affinity(gpu_id=gpu_id,
                                nproc_per_node=hvd.size(),
                                mode=args.affinity)
        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

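    # Enable automatic mixed precision through the global Keras precision policy.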
    if args.amp:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        tf.config.optimizer.set_jit(True)
Example #20
def init(en_mem_growth=False, set_visible_dev=False):
    """ This initializes the horovod package.
    :param en_mem_growth:
    :param set_visible_dev:
    """
    if hvd is not None:
        hvd.init()
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            if en_mem_growth:
                tf.config.experimental.set_memory_growth(gpu, True)
        if gpus and set_visible_dev:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')
    else:
        print("Horovod not supported on this system!")
Example #21
def handle_distribution_strategy(distribution_strategy):
    """ Create distribution strategy. """
    strategy = None
    if distribution_strategy:
        strategy = distribution_strategy
        if isinstance(distribution_strategy, dict):
            strategy = distribution_strategy.get("distribution_strategy", None)
        if isinstance(distribution_strategy, str):
            strategy = distribution_strategy.lower()
        if is_third_party_allreduce(strategy):
            if strategy == "horovod":
                import horovod.tensorflow.keras as hvd
            else:
                import byteps.tensorflow.keras as hvd
            logging.info("import {} as hvd backend.".format(strategy))
            hvd.init()
            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            compat.register_distributed_worker_setting(hvd.rank(), hvd.size(),
                                                       strategy)
            if hvd.rank() != 0:
                logging.set_verbosity(logging.ERROR)
        else:
            if isinstance(distribution_strategy, str):
                strategy = distribution_utils.get_distribution_strategy(
                    distribution_strategy=distribution_strategy)
            elif isinstance(distribution_strategy, dict):
                strategy = distribution_utils.get_distribution_strategy(
                    **distribution_strategy)

    if strategy is None:
        logging.info("No distribution strategy was used.")
    else:
        try:
            logging.info(
                "Using distribution strategy: {} with num_replicas_in_sync={}".
                format(strategy, strategy.num_replicas_in_sync))
        except Exception:
            pass
    return strategy
Example #22
def main() -> None:
    """
    Start training Seq2Seq model.

    :return: None
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    gpu_list = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpu_list:
        tf.config.experimental.set_memory_growth(gpu, True)

    if gpu_list:
        print("Visible GPUs detected.")
        tf.config.experimental.set_visible_devices(gpu_list[hvd.local_rank()], "GPU")

    print("Loading input data.")
    subject_list, body_list = load_data()

    config: Seq2SeqConfig = fit_text(body_list, subject_list)
    summarizer: Seq2SeqSummarizer = Seq2SeqSummarizer(config)

    if not CONFIG.is_dev:
        if tf.io.gfile.exists(LOCAL_MODEL_WEIGHTS):
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)
    else:
        Path(CONFIG.bucket_summarization_model).mkdir(parents=True, exist_ok=True)
        if Path(LOCAL_MODEL_WEIGHTS).exists():
            summarizer.load_weights(weight_file_path=LOCAL_MODEL_WEIGHTS)

    body_train, body_test, subject_train, subject_test = train_test_split(body_list, subject_list, test_size=0.2)

    print("Starting training.")
    summarizer.fit(
        body_train=body_train,
        subject_train=subject_train,
        body_test=body_test,
        subject_test=subject_test,
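        # Horovod: scale the number of epochs down by the number of workers.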
        epochs=int(math.ceil(100 / hvd.size())),
        batch_size=128,
    )
Example #23
    def __init__(self,
                 filenames,
                 idx_filenames,
                 height,
                 width,
                 batch_size,
                 num_threads,
                 dtype=tf.uint8,
                 dali_cpu=True,
                 deterministic=False,
                 training=False):
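        # Each process runs its own DALI pipeline on its local GPU and reads its own shard of the TFRecords.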
        device_id = hvd.local_rank()
        shard_id = hvd.rank()
        num_gpus = hvd.size()
        self.pipe = get_dali_pipeline(
            tfrec_filenames=filenames,
            tfrec_idx_filenames=idx_filenames,
            height=height,
            width=width,
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=device_id,
            shard_id=shard_id,
            num_gpus=num_gpus,
            dali_cpu=dali_cpu,
            training=training,
            seed=7 * (1 + hvd.rank()) if deterministic else None)

        self.daliop = dali_tf.DALIIterator()

        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.device_id = device_id

        self.dalidataset = dali_tf.DALIDataset(
            pipeline=self.pipe,
            output_shapes=((batch_size, height, width, 3), (batch_size,)),
            batch_size=batch_size,
            output_dtypes=(tf.float32, tf.int64),
            device_id=device_id)
Example #24
def main(argv=None):
    tf.reset_default_graph()
    # init horovod
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    keras.backend.set_session(tf.Session(config=config))

    image, label = _get_dataset()

    model_input = keras.layers.Input(tensor=image)

    model_output = keras.layers.Flatten(input_shape=(-1, 299, 299, 3))(model_input)

    model_output = keras.layers.Dense(5, activation='relu')(model_output)

    model = keras.models.Model(inputs=model_input, outputs=model_output)

    # Horovod: scale the learning rate by the number of workers and wrap the optimizer.
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())
    opt = hvd.DistributedOptimizer(opt)
    model.compile(optimizer=opt, loss='categorical_crossentropy',
                  metrics=['accuracy'], target_tensors=[label])

    # callback
    t_callback = keras.callbacks.TensorBoard(log_dir='./logs')
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        t_callback,
    ]

    # fit model
    epochs = int(math.ceil(FLAGS.num_epochs / hvd.size()))
    model.fit(epochs=epochs, steps_per_epoch=FLAGS.steps_one_epoch, callbacks=callbacks)

    # save to h5
    h5file = os.path.join(FLAGS.model_path,'model.h5')

    if hvd.rank() == 0:
        keras.models.save_model(model, h5file)
Example #25
    def __init__(self, config):
        self.config = config
        self.checkpoint_path = config.get_attribute('checkpoint_path')
        self.epochs = config.get_attribute('epochs')
        self.checkpoint_save_period = config.get_attribute(
            'checkpoint_save_period')
        self.checkpoint_format = 'checkpoint-{epoch}.h5'
        self.learning_rate = config.get_attribute('learning_rate')
        self.models_train = []
        self.models_eval = []
        self.train_steps_per_epoch = 1
        self.eval_steps_per_epoch = 1
        self.resume_from_epoch = 0
        self.verbose = 1
        self.cur_epoch = 0
        hvd.init()
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')
        self.verbose = 1 if hvd.rank() == 0 else 0
        origin_train_model = get_model(config, is_training=True)
        origin_eval_model = get_model(config, is_training=False)
        self.models_train.append(origin_train_model)
        self.models_eval.append(origin_eval_model)
        train_model = tf.keras.models.clone_model(origin_train_model)
        eval_model = tf.keras.models.clone_model(origin_eval_model)
        self.models_train.append(train_model)
        self.models_eval.append(eval_model)
        self.train_dataset, self.eval_dataset, self.train_dataset_distill, self.eval_dataset_distill = \
            self.build_dataset()
        self.build_train()
        self.build_eval()
        self.load_model()
        self.save_model_path = config.get_attribute('checkpoint_eval_path')
        self.callbacks = []
Example #26
def check_tf_1(aggregation_frequency: int, average_aggregated_gradients: bool) -> None:
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    session = tf.compat.v1.keras.backend.get_session(op_input_list=())

    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        aggregation_frequency=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )

    constant_multiplier = 4.0
    grads = [tf.constant([hvd.rank() * constant_multiplier])]
    op = hvd_optimizer._allreduce(grads)
    for idx in range(10):
        value = session.run(op)[0][0]
        expected_value = compute_expected_value(
            idx, aggregation_frequency, constant_multiplier, average_aggregated_gradients, False
        )
        assert expected_value == value
Example #27
    def __init__(self, benchmark, output_dir):
        self._benchmark = benchmark
        self._output_dir = output_dir

        Path(self._output_dir).mkdir(parents=True, exist_ok=True)

        host_spec = HostSpec()
        self._node_name = host_spec.node_name

        # Log system information if on local rank 0
        if hvd.local_rank() == 0:

            # Log host information
            file_name = '{}_host.json'.format(self._node_name)
            db = TrackingClient(Path(self._output_dir) / file_name)

            host_info = {
                'name': host_spec.name,
                'node_name': host_spec.node_name,
                'ip': host_spec.node_name,
                'num_cores': host_spec.num_cores,
                'release': host_spec.release,
                'system': host_spec.system,
                'cpu_info': host_spec.cpu_info,
            }

            db.log_tag('host_info', host_info)

            # Log device information
            device_specs = DeviceSpecs()

            file_name = '{}_devices.json'.format(self._node_name)
            db = TrackingClient(Path(self._output_dir) / file_name)

            device_info = {}
            device_info['gpu_count'] = device_specs.device_count
            device_info.update({
                'gpu_{}'.format(i): device_specs.get_device_info(i)
                for i in range(device_specs.device_count)
            })

            db.log_tag('device_info', device_info)
Example #28
def check_tf_2(aggregation_frequency: int, average_aggregated_gradients: bool) -> None:
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        aggregation_frequency=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )

    constant_multiplier = 4.0
    grads_and_vars = [(tf.constant([hvd.rank() * constant_multiplier]), None)]
    for idx in range(10):
        grads = hvd_optimizer._aggregate_gradients(grads_and_vars)
        value = grads[0][0].numpy()
        expected_value = compute_expected_value(
            idx, aggregation_frequency, constant_multiplier, average_aggregated_gradients, True
        )
        assert expected_value == value
Example #29
RANDOM_STATE = mlctx.get_param("random_state", 1)
TEST_SIZE = mlctx.get_param("test_size", 0.2)

# kubeflow outputs/inputs
categories_map = str(mlctx.get_input("categories_map").get())
df = pd.read_csv(str(mlctx.get_input("file_categories")))

# Horovod: initialize Horovod.
hvd.init()

# if gpus found, pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

if hvd.rank() == 0:
    mlctx.logger.info(
        f"Validating paths:\nData_path:\t{DATA_PATH}\nModel_dir:\t{MODEL_DIR}\n"
    )
    mlctx.logger.info(f"Categories map:{categories_map}")
    mlctx.logger.info(f"Got {df.shape[0]} files in {DATA_PATH}")
    mlctx.logger.info(f"Training data has {df.size} samples")
    mlctx.logger.info(df.category.value_counts())

# artifact folders (deprecate these)
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
Example #30
def train(model_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

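    # Each worker processes 1/hvd.size() of the samples per epoch, so step counts are divided by the world size.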
    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // (batch_size * hvd.size())
        nstep_per_valid = num_valid_samples // (batch_size * hvd.size())
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // (batch_size * hvd.size()))
        nstep_per_valid = min(10,
                              num_valid_samples // (batch_size * hvd.size()))

    initial_epoch = 0
    if log_dir:
        # We save checkpoints only when using real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt_format = log_dir + "/model-{epoch:02d}-{val_top1:.2f}.hdf5"
        # Looks for the most recent checkpoint and sets the initial epoch from it.
        for filename in os.listdir(log_dir):
            if filename.startswith('model-'):
                initial_epoch = max(int(re.findall(r'\d+', filename)[0]),
                                    initial_epoch)

    if tensorboard_dir:
        assert os.path.exists(tensorboard_dir)

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    if precision == 'fp16':
        policy = keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale)
        keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=batch_size * hvd.size(),
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)
    # Horovod: add Horovod DistributedOptimizer. We use a modified version to
    # support the custom learning rate schedule.
    opt = hvd_patch.DistributedOptimizer(opt)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES)
    loss_func = 'sparse_categorical_crossentropy'

    top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top5')
    top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='top1')

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients. However, this option
    # disables the overlapping of data loading and compute, and can hurt
    # performance if the model is not under a distribution strategy scope.
    model.compile(optimizer=opt,
                  loss=loss_func,
                  metrics=[top1, top5],
                  experimental_run_tf_function=False)

    training_hooks = []
    training_hooks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    training_hooks.append(_ProfileKerasFitCallback(batch_size, display_every))

    if log_dir and hvd.rank() == 0:
        ckpt_callback = keras.callbacks.ModelCheckpoint(
            ckpt_format,
            monitor='val_top1',
            verbose=1,
            save_best_only=False,
            save_weights_only=False,
            save_frequency=1)
        training_hooks.append(ckpt_callback)

    if tensorboard_dir and hvd.rank() == 0:
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=tensorboard_dir)
        training_hooks.append(tensorboard_callback)

    if data_dir is not None:
        num_preproc_threads = params['dali_threads'] if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
        if dali_mode:
            train_input = train_input.get_device_dataset()
            valid_input = valid_input.get_device_dataset()
        valid_params = {
            'validation_data': valid_input,
            'validation_steps': nstep_per_valid,
            'validation_freq': 1
        }
    else:
        train_input = image_processing.fake_image_set(batch_size, image_height,
                                                      image_width)
        valid_params = {}

    try:
        verbose = 2 if hvd.rank() == 0 else 0
        model.fit(train_input,
                  epochs=num_epochs,
                  callbacks=training_hooks,
                  steps_per_epoch=nstep_per_epoch,
                  verbose=verbose,
                  initial_epoch=initial_epoch,
                  **valid_params)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(12.0 / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28

    # The data, shuffled and split between train and test sets

    x_train = np.load(os.path.join(args.train, 'train.npz'))['data']
    y_train = np.load(os.path.join(args.train, 'train.npz'))['labels']