Example #1
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
        one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
        gpu_config = tf.ConfigProto(gpu_options=one_gpu)
        with self.test_session(config=gpu_config) as session:
            with tf.device(device):
                # Same dims on every rank; the mismatch is the device (CPU vs. GPU) across ranks.
                dims = [17] * 3
                tensor = tf.ones(dims, dtype=tf.int32)
                with self.assertRaises(tf.errors.FailedPreconditionError):
                    session.run(hvd.allreduce(tensor))
Example #2
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
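The cnn_model_fn referenced above is not shown in this listing. A minimal, hedged sketch of its Horovod-specific part (learning-rate scaling plus the distributed optimizer wrapper; the helper name and optimizer choice are illustrative, not taken from the example):

def _horovod_train_op(loss):
    # Scale the learning rate by the number of workers, then wrap the optimizer
    # so that gradients are averaged across all ranks on every step.
    opt = tf.train.MomentumOptimizer(learning_rate=0.001 * hvd.size(), momentum=0.9)
    opt = hvd.DistributedOptimizer(opt)
    return opt.minimize(loss, global_step=tf.train.get_or_create_global_step())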
Example #3
def tensorflow_session():
    # Init session and params
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    sess = tf.Session(config=config)
    return sess
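A minimal usage sketch for the helper above (hedged; it assumes hvd.init() has already been called and that a TF 1.x Keras backend is in use):

import horovod.tensorflow as hvd
from tensorflow.keras import backend as K

hvd.init()                            # must run before tensorflow_session() so local_rank() is valid
K.set_session(tensorflow_session())   # register the GPU-pinned session with Keras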
Example #4
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [self._build_coco_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine.
            # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_coco_predictor(0)
                self.dataflow = get_eval_dataflow(shard=hvd.local_rank(), num_shards=hvd.local_size())

            # Dummy allreduce op used later as a barrier to synchronize all ranks
            # around evaluation.
            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #5
def init_backend_engine():
  """
  Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`.
  """
  BackendEngine.select_engine(config=config)
  if BackendEngine.is_theano_selected():
    print("Theano:", describe_theano_version(), file=log.v3)
    import TheanoUtil
    TheanoUtil.monkey_patches()
  elif BackendEngine.is_tensorflow_selected():
    print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
    if get_tensorflow_version_tuple()[0] == 0:
      print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2)
    if os.environ.get("TF_DEVICE"):
      print("Devices: Use %s via TF_DEVICE instead of %s." % (
        os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4)
      config.set("device", os.environ.get("TF_DEVICE"))
    if config.is_true("use_horovod"):
      import socket
      # noinspection PyPackageRequirements,PyUnresolvedReferences
      import horovod.tensorflow as hvd
      from TFUtil import init_horovod
      init_horovod()  # make sure it is initialized
      if "gpu" in config.value("device", "") or os.environ.get("CUDA_VISIBLE_DEVICES", ""):
        # We assume that we want to use a GPU.
        gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault("gpu_options", {})
        assert "visible_device_list" not in gpu_opts
        gpu_opts["visible_device_list"] = str(hvd.local_rank())
        print("Horovod: Hostname %s, pid %i, using GPU %s." % (
          socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3)
      else:
        if hvd.rank() == 0:  # Don't spam in all ranks.
          print("Horovod: Not using GPU.", file=log.v3)
      horovod_reduce_type = config.value("horovod_reduce_type", "")
      if horovod_reduce_type == "":
        horovod_reduce_type = "grad"
        config.set("horovod_reduce_type", horovod_reduce_type)
      else:
        assert horovod_reduce_type in ["grad", "param"], "config option 'horovod_reduce_type' invalid"
      if hvd.rank() == 0:  # Don't spam in all ranks.
        print("Horovod: Reduce type:", horovod_reduce_type, file=log.v3)
    from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
    tf_session_opts = config.typed_value("tf_session_opts", {})
    assert isinstance(tf_session_opts, dict)
    # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
    setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
    # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
    print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
    debug_register_better_repr()
  else:
    raise NotImplementedError
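The options read above come from a RETURNN config file, which is itself a Python file; a hedged fragment using only the option names referenced in this function might look like:

# Hedged RETURNN config fragment; only options referenced above are shown.
use_horovod = True            # checked via config.is_true("use_horovod")
horovod_reduce_type = "grad"  # "grad" or "param"; an empty value defaults to "grad"
device = "gpu"                # makes the code above pin visible_device_list to hvd.local_rank()
tf_session_opts = {}          # gpu_options are filled in by the Horovod logic above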
Example #6
def main(_):
    # Initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.01)

    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0
    # to all other processes. This is necessary to ensure consistent initialization
    # of all workers when training is started with random weights or restored
    # from a checkpoint.
    hooks = [hvd.BroadcastGlobalVariablesHook(0),
             tf.train.StopAtStepHook(last_step=100),
             tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                        every_n_iter=10),
             ]

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Save checkpoints only on worker 0 to prevent other workers from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
Example #7
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        # Each process gets two visible GPUs: 2 * local_rank and 2 * local_rank + 1.
        two_gpus = tf.GPUOptions(visible_device_list=(
            '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
        gpu_config = tf.ConfigProto(gpu_options=two_gpus)
        with self.test_session(config=gpu_config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                iter += 1
                with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "hvd.allreduce on GPU produces incorrect results")
Example #8
    def test_horovod_allreduce_gpu_fused(self):
        """Test that the allreduce works on GPUs with Tensor Fusion.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            tests = []
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/gpu:%d" % local_rank):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3 or dtype in [tf.int32, tf.int64]:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                test = max_difference <= threshold
                tests.append(test)
            self.assertTrue(session.run(tf.reduce_all(tests)),
                            "hvd.allreduce produces incorrect results")
Example #9
    def _eval(self):
        logdir = args.logdir
        if cfg.TRAINER == 'replicated':
            with ThreadPoolExecutor(max_workers=self.num_predictor, thread_name_prefix='EvalWorker') as executor, \
                    tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
                futures = []
                for dataflow, pred in zip(self.dataflows, self.predictors):
                    futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
                all_results = list(itertools.chain(*[fut.result() for fut in futures]))
        else:
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
                with open(output_partial, 'w') as f:
                    json.dump(local_results, f)
            # Wait at the barrier until every rank has written its partial results,
            # then let only rank 0 merge them.
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for k in range(hvd.local_size()):
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
                with open(output_partial, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(output_partial)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_evaluation_scores(output_file)
            for k, v in scores.items():
                self.trainer.monitors.put_scalar(k, v)
        except Exception:
            logger.exception("Exception in COCO evaluation.")
Example #10
def main(_):
    """
    Builds the model and runs.
    """

    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    tx.utils.maybe_create_dir(FLAGS.output_dir)

    bert_pretrain_dir = ('bert_pretrained_models'
                         '/%s') % FLAGS.config_bert_pretrain
    # Loads BERT model configuration
    if FLAGS.config_format_bert == "json":
        bert_config = model_utils.transform_bert_to_texar_config(
            os.path.join(bert_pretrain_dir, 'bert_config.json'))
    elif FLAGS.config_format_bert == 'texar':
        bert_config = importlib.import_module(
            ('bert_config_lib.'
             'config_model_%s') % FLAGS.config_bert_pretrain)
    else:
        raise ValueError('Unknown config_format_bert.')

    # Loads data

    num_classes = config_data.num_classes
    num_train_data = config_data.num_train_data

    # Configures distributed mode: each rank reads its own shard of the dataset,
    # and the per-rank batch size is the global batch size divided by the number
    # of ranks (e.g. a global batch of 32 becomes 8 per rank on 4 GPUs).
    if FLAGS.distributed:
        config_data.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_data.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_data.train_hparam["batch_size"] //= hvd.size()

    train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam)
    eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam)
    test_dataset = tx.data.TFRecordData(hparams=config_data.test_hparam)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset, 'eval': eval_dataset, 'test': test_dataset})
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)),
                                 axis=1)
    # Builds BERT
    with tf.variable_scope('bert'):
        embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.vocab_size,
            hparams=bert_config.embed)
        word_embeds = embedder(input_ids)

        # Creates segment embeddings for each type of tokens.
        segment_embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.type_vocab_size,
            hparams=bert_config.segment_embed)
        segment_embeds = segment_embedder(segment_ids)

        input_embeds = word_embeds + segment_embeds

        # The BERT model (a TransformerEncoder)
        encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
        output = encoder(input_embeds, input_length)

        # Builds layers for downstream classification, which is also
        # initialized with BERT pre-trained checkpoint.
        with tf.variable_scope("pooler"):
            # Uses the projection of the 1st-step hidden vector of BERT output
            # as the representation of the sentence
            bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
            bert_sent_output = tf.layers.dense(
                bert_sent_hidden, config_downstream.hidden_dim,
                activation=tf.tanh)
            output = tf.layers.dropout(
                bert_sent_output, rate=0.1, training=tx.global_mode_train())

    # Adds the final classification layer
    logits = tf.layers.dense(
        output, num_classes,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization

    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=batch["label_ids"], logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size
                          * config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(global_step, num_train_steps, # lr is a Tensor
                            num_warmup_steps, static_lr)

    opt = tx.core.get_optimizer(
        global_step=global_step,
        learning_rate=lr,
        hparams=config_downstream.opt
    )

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=None,
        optimizer=opt)

    # Train/eval/test routine

    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')
        
        fetches = {
            'train_op': train_op,
            'loss': loss,
            'batch_size': batch_size,
            'step': global_step
        }

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_data.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f' % (step, rets['loss']))

                eval_steps = config_data.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _eval_epoch(sess)

            except tf.errors.OutOfRangeError:
                break

    def _eval_epoch(sess):
        """Evaluates on the dev set.
        """
        iterator.restart_dataset(sess, 'eval')

        cum_acc = 0.0
        cum_loss = 0.0
        nsamples = 0
        fetches = {
            'accu': accu,
            'loss': loss,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'eval'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)

                cum_acc += rets['accu'] * rets['batch_size']
                cum_loss += rets['loss'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        tf.logging.info('eval accu: {}; loss: {}; nsamples: {}'.format(
            cum_acc / nsamples, cum_loss / nsamples, nsamples))

    def _test_epoch(sess):
        """Does predictions on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_preds = []
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                _preds = sess.run(preds, feed_dict=feed_dict)
                _all_preds.extend(_preds.tolist())
            except tf.errors.OutOfRangeError:
                break

        output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            writer.write('\n'.join(str(p) for p in _all_preds))

    # Loads pretrained BERT model parameters
    init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
    model_utils.init_bert_checkpoint(init_checkpoint)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for i in range(config_data.max_train_epoch):
                _train_epoch(sess)
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _eval_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
Example #11
def main(argv=None):
    '''
    '''
    # Reuse the module-level docstring as this function's docstring / CLI description.
    main.__doc__ = __doc__
    # Use sys.argv as-is when no argv is given; otherwise append the extra args to it.
    argv = sys.argv if argv is None else sys.argv + list(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if going for reduction in each iteration step. Using
    # tf-queue or dataset is better to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size: (i + 1) * digit_size,
                       j * digit_size: (j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
Example #12
def main(_):
    # Lazy XLA compilation causes memory fragmentation for BERT, leading to OOM,
    # so disable it explicitly.
    if os.environ.get("TF_XLA_FLAGS", None) is not None:
        os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation=false"
    else:
        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:

        tf.compat.v1.logging.info("Multi-GPU training with TF Horovod")
        tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d",
                                  hvd.size(), hvd.rank())
        global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size(
        )
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        save_summary_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        log_step_count_steps=FLAGS.display_loss_steps,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(
        LogTrainRunHook(global_batch_size,
                        hvd_rank,
                        FLAGS.save_checkpoints_steps,
                        num_steps_ignore_xla=10))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            # Spread the remainder across the first `remainder` ranks, so each rank
            # gets either num_examples_per_rank or num_examples_per_rank + 1 examples.
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + num_examples_per_rank

    model_fn = model_fn_builder(task_name=task_name,
                                bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:

        file_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)

        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
        num_sentences = (int(len(time_list) * 0.8)) * FLAGS.eval_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                dllogging.logger.log(step=(),
                                     data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d",
                                  FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        predict_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=predict_hooks,
                                                yield_single_examples=False):
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)

        predict_time_elapsed = time.time() - predict_start_time

        time_list = predict_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        predict_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
        num_sentences = (int(len(time_list) * 0.8)) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            predict_time_elapsed,
            predict_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            predict_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
Example #13
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments,
         LoggingArguments, PathArguments))
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings
               ) == 0, f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{hvd.size()}gpus"
                f"-{train_args.per_gpu_batch_size * hvd.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout")
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir,
                             f"{run_name}.log")),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments passed in properly, only after registering the alert_func and logging
        assert not (skip_sop
                    and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)

    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic")
    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining,
                         model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix,
                                       model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(
            checkpoint_path)
        if hvd.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt,
                                                   allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0]. Note the different subdirectories
    # Move to same folder structure and remove if/else
    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir,
                              "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix,
                                   path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(
            step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale()
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 1:
            if hvd.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)
            # Resume the step counter from the optimizer's iteration count
            # (its first weight), which is nonzero when optimizer state was loaded.
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step)
        # Squad requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results[
                    "f1"]
                logger.info(
                    f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}"
                )
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if hvd.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step)
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step)

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(
                        f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}"
                    )
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(path_args.filesystem_prefix,
                                                 path_args.checkpoint_dir,
                                                 f"{run_name}-step{i}")
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(
                    f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}"
                )
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix,
                                 path_args.log_dir, run_name))
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size":
                    train_args.per_gpu_batch_size * hvd.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
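This example depends on a train_input_generator helper that is not shown in the snippet. A minimal sketch of what such a generator might look like follows; the name and signature come from the call site above, but the body is an assumption, not the original implementation. Each Horovod process (typically launched with horovodrun -np <num_workers> python train.py) would run its own generator over an independently shuffled copy of the data.

def train_input_generator(x_train, y_train, batch_size=100):
    """Hypothetical batch generator matching the call site above (assumption)."""
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle once per pass over the data.
        perm = np.random.permutation(len(x_train))
        x_train, y_train = x_train[perm], y_train[perm]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size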
Beispiel #15
0
def train_main(dataset,
               model_name='117M',
               seed=None,
               batch_size=2,
               sample_length=1023,
               sample_num=1,
               sample_every=4500,
               run_name='run1',
               restore_from='latest',
               save_every=2000,
               combine=50000):

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if sample_length is None:
        sample_length = hparams.n_ctx // 2
    elif sample_length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    # TF config

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = model.model(hparams=hparams, X=context)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=context[:, 1:], logits=output['logits'][:, :-1]))

        tf_sample = sample.sample_sequence(hparams=hparams,
                                           length=sample_length,
                                           context=context,
                                           batch_size=batch_size,
                                           temperature=0.8,
                                           top_k=40)

        train_vars = [v for v in tf.trainable_variables() if 'model' in v.name]

        opt = tf.train.AdamOptimizer()
        opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, var_list=train_vars)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        bcast = hvd.broadcast_global_variables(0)

        saver = tf.train.Saver(var_list=train_vars,
                               max_to_keep=5,
                               keep_checkpoint_every_n_hours=2)

        sess.run(tf.global_variables_initializer())

        if restore_from == 'latest':
            ckpt = tf.train.latest_checkpoint(
                os.path.join(CHECKPOINT_DIR, run_name))
            if ckpt is None:
                # Get fresh GPT weights if new run.
                ckpt = tf.train.latest_checkpoint(
                    os.path.join('models', model_name))
        elif restore_from == 'fresh':
            ckpt = tf.train.latest_checkpoint(
                os.path.join('models', model_name))
        else:
            ckpt = tf.train.latest_checkpoint(restore_from)
        print(str(hvd.local_rank()), 'Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        bcast.run()

        print(str(hvd.local_rank()), 'Loading dataset...')
        chunks = load_dataset(enc, dataset, combine)
        data_sampler = Sampler(chunks)
        print(str(hvd.local_rank()), 'dataset has', data_sampler.total_size,
              'tokens')
        print(str(hvd.local_rank()), 'Training...')

        counter = 1
        if os.path.exists(os.path.join(CHECKPOINT_DIR, run_name, 'counter')):
            # Load the step number if we're resuming a run
            # Add 1 so we don't immediately try to save again
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'r') as fp:
                counter = int(fp.read()) + 1

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, run_name,
                             'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(CHECKPOINT_DIR, run_name, 'model'),
                       global_step=counter)
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'w') as fp:
                fp.write(str(counter) + '\n')

        def generate_samples():
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: batch_size * [context_tokens]})
                for i in range(min(sample_num - index, batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(
                        index + 1, text)
                    all_text.append(text)
                    index += 1
            print(text)
            maketree(os.path.join(SAMPLE_DIR, run_name))
            with open(
                    os.path.join(SAMPLE_DIR, run_name,
                                 'samples-{}').format(counter), 'w') as fp:
                fp.write('\n'.join(all_text))

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        try:
            while True:

                batch = [data_sampler.sample(1024) for _ in range(batch_size)]

                _, lv = sess.run((train_op, loss), feed_dict={context: batch})

                avg_loss = (avg_loss[0] * 0.99 + lv, avg_loss[1] * 0.99 + 1.0)

                if hvd.rank() == 0:
                    if counter % save_every == 0:
                        save()
                    if counter % sample_every == 0:
                        generate_samples()

                    print(
                        '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                        .format(counter=counter,
                                time=time.time() - start_time,
                                loss=lv,
                                avg=avg_loss[0] / avg_loss[1]))

                counter += 1

        except KeyboardInterrupt:
            print('interrupted')
            if hvd.rank() == 0:
                save()
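The avg_loss tuple in the loop above implements an exponentially weighted running average: both the decayed sum of losses and the decayed count shrink by 0.99 each step, so avg_loss[0] / avg_loss[1] is a bias-corrected moving average of recent losses. The same bookkeeping, pulled out into a standalone helper for clarity:

def make_running_average(decay=0.99):
    """Bias-corrected exponential moving average, as used for avg_loss above."""
    state = [0.0, 0.0]  # (decayed sum of values, decayed count)

    def update(value):
        state[0] = state[0] * decay + value
        state[1] = state[1] * decay + 1.0
        return state[0] / state[1]

    return update

# Example: running_avg = make_running_average(); running_avg(2.0) returns 2.0.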
Beispiel #16
0
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        # config = model_config_parser(FLAGS)

        print(FLAGS.train_size)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        multi_task_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 190
            num_eval_steps = 100
            num_train_steps = 200
        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
              format(num_train_steps, num_eval_steps, num_storage_steps))

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============")

        print("==init lr==", FLAGS.init_lr)

        opt_config = Bunch({
            "init_lr": FLAGS.init_lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "grad_clip": kargs.get("grad_clip", "global_norm"),
            "clip_norm": kargs.get("clip_norm", 1000.0),
            "opt_ema": kargs.get("opt_ema", "no")
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({
            "fix_lm": False,
            "ema_saver": kargs.get("opt_ema", "no")
        })

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}
        task_type_dict = {}
        model_type_lst = []
        label_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            multi_task_config[task_type]['buckets'] = FLAGS.buckets
            multi_task_config[task_type]['w2v_path'] = FLAGS.w2v_path
            model_config_dict[task_type] = model_config_parser(
                Bunch(multi_task_config[task_type]))
            num_labels_dict[task_type] = multi_task_config[task_type][
                "num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets, multi_task_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = multi_task_config[task_type][
                "load_pretrained"]
            exclude_scope_dict[task_type] = multi_task_config[task_type][
                "exclude_scope"]
            not_storage_params_dict[task_type] = multi_task_config[task_type][
                "not_storage_params"]
            target_dict[task_type] = multi_task_config[task_type]["target"]
            task_type_dict[task_type] = multi_task_config[task_type][
                "task_type"]
            label_dict[task_type] = json.load(
                tf.gfile.Open(
                    os.path.join(FLAGS.buckets,
                                 multi_task_config[task_type]["label_id"])))

        model_fn = multitask_model_fn(
            model_config_dict,
            num_labels_dict,
            task_type_dict,
            init_checkpoint_dict,
            load_pretrained_dict=load_pretrained_dict,
            opt_config=opt_config,
            model_io_config=model_io_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            output_type="estimator",
            checkpoint_dir=checkpoint_dir,
            num_storage_steps=num_storage_steps,
            anneal_config=anneal_config,
            task_layer_reuse=None,
            model_type_lst=model_type_lst,
            task_invariant=FLAGS.task_invariant,
            multi_task_config=multi_task_config,
            flags=FLAGS,
            **kargs)

        print("==succeeded in building model==")

        name_to_features = data_interface_dual_encoder(
            FLAGS, multi_task_config, FLAGS.multi_task_type.split(","))

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t

            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            return example

        params = Bunch({})
        params.epoch = epoch
        params.batch_size = FLAGS.batch_size

        # data_prior = []
        # for task_type in FLAGS.multi_task_type.split(","):
        # 	data_prior.append(multi_task_config[task_type].get("data_prior", None))
        # if None in data_prior:
        # 	data_prior = None
        # else:
        # 	print("===task prior===", data_prior)

        if kargs.get("parse_type", "parse_single") == "parse_single":

            train_file_lst = [
                multi_task_config[task_type]["train_result_file"]
                for task_type in FLAGS.multi_task_type.split(",")
            ]

            print(train_file_lst)

            train_features = lambda: tf_data_utils.all_reduce_multitask_train_input_fn(
                train_file_lst,
                _decode_record,
                name_to_features,
                params,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)

        elif kargs.get("parse_type", "parse_single") == "parse_batch":

            train_file_path_lst = []
            data_prior = []
            # train_file_lst = [multi_task_config[task_type]["train_result_file"] for task_type in FLAGS.multi_task_type.split(",")]
            # train_file_path_lst = [os.path.join(FLAGS.buckets, train_file) for train_file in train_file_lst]

            for task_type in FLAGS.multi_task_type.split(","):
                task_prior = multi_task_config[task_type].get(
                    "data_prior", None)
                task_path = multi_task_config[task_type][
                    "train_result_file"].split(',')
                for task_sub_path in task_path:
                    train_file_path_lst.append(
                        os.path.join(FLAGS.buckets, task_sub_path))
                    data_prior.append(task_prior)

            print(train_file_path_lst)
            if None in data_prior:
                data_prior = None
            else:
                print("===task prior===", data_prior)

            # train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(train_file_path_lst,
            # 							_decode_batch_record,
            # 							name_to_features,
            # 							params,
            # 							if_shard=FLAGS.if_shard,
            # 							worker_count=worker_count,
            # 							task_index=task_index)

            train_features = lambda: tf_data_utils.all_reduce_multitask_train_batch_input_fn_sample(
                train_file_path_lst,
                _decode_record,
                name_to_features,
                params,
                data_prior=data_prior,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)
        # elif kargs.get("parse_type", "parse_single") == "generator":
        # 	def train_features(): return train_eval_input_fn(FLAGS, multi_task_config, "train", 0)

        print("==succeeded in building data and model==")
        print("start training")

        train_hooks = []

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need fo hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"),
                )
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 config=run_config)

        print("==finish build model estimator==")

        train_being_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy",
                     "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps)

            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
            tf.logging.info("==training time=={}".format(train_end_time -
                                                         train_being_time))

        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except KeyError:
                print("==not tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                            eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
Beispiel #17
0
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = json.load(open(FLAGS.config_file, "r"))

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        config.model = FLAGS.model_type
        config.init_lr = 3e-4
        config.ln_type = FLAGS.ln_type

        config.loss = 'entropy'

        print('==init learning rate==', config.init_lr)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = config.init_lr

        label_dict = json.load(tf.gfile.Open(FLAGS.label_id))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10
        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
              format(num_train_steps, num_eval_steps, num_storage_steps))

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============")

        if worker_count * kargs.get("num_gpus", 1) >= 2:
            clip_norm_scale = 1.0
            lr_scale = 0.75
        else:
            clip_norm_scale = 1.0
            lr_scale = 1.0
        lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
        # if lr >= 1e-3:
        # 	lr = 1e-3
        lr = config.init_lr
        print('--training learning rate--', lr)

        opt_config = Bunch({
            "init_lr": lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "clip_norm": 1
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        print("==current task_index==", task_index)

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_fn = model_fn_builder(config,
                                    num_classes,
                                    init_checkpoint,
                                    model_reuse=None,
                                    load_pretrained=FLAGS.load_pretrained,
                                    model_io_config=model_io_config,
                                    opt_config=opt_config,
                                    model_io_fn=model_io_fn,
                                    exclude_scope="",
                                    not_storage_params=[],
                                    target=kargs.get("input_target", ""),
                                    output_type="estimator",
                                    checkpoint_dir=checkpoint_dir,
                                    num_storage_steps=num_storage_steps,
                                    task_index=task_index,
                                    anneal_config=anneal_config,
                                    **kargs)

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t

            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            # for name in list(example.keys()):
            # 	t = example[name]
            # 	if t.dtype == tf.int64:
            # 		t = tf.to_int32(t)
            # 	example[name] = t

            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
            train_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)
        eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
            dev_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need fo hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        train_hooks = []
        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"),
                )
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 model_dir=checkpoint_dir,
                                                 config=run_config)

        train_being_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy",
                     "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            # model_estimator.train(input_fn=train_features,
            # 				max_steps=num_train_steps,
            # 				hooks=train_hooks)

            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps,
                                  hooks=train_hooks)
            # tf.estimator.train(model_estimator, train_spec)
            # tf.estimator.evaluate(model_estimator, eval_spec)

            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
            tf.logging.info("==training time=={}".format(train_end_time -
                                                         train_being_time))
            eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                    steps=num_eval_steps)
            # print(eval_results)

        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except KeyError:
                print("==not tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                            eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
Beispiel #18
0
def _Horovod():
    import horovod.tensorflow as hvd
    return hvd.local_rank()
Beispiel #19
0
def train_eval_fn(FLAGS,
				worker_count, 
				task_index, 
				is_chief, 
				target,
				init_checkpoint,
				train_file,
				dev_file,
				checkpoint_dir,
				is_debug):

	graph = tf.Graph()
	with graph.as_default():
		import json
				
		config = json.load(open(FLAGS.config_file, "r"))

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "bert"
		config.dropout_prob = 0.1
		config.label_type = "single_label"
		
		if FLAGS.if_shard == "0":
			train_size = FLAGS.train_size
			epoch = int(FLAGS.epoch / worker_count)
		elif FLAGS.if_shard == "1":
			train_size = int(FLAGS.train_size/worker_count)
			epoch = FLAGS.epoch

		init_lr = 2e-5

		label_dict = json.load(open(FLAGS.label_id))

		num_train_steps = int(
			train_size / FLAGS.batch_size * epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(train_size / FLAGS.batch_size)

		num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

		if is_debug == "0":
			num_storage_steps = 2
			num_eval_steps = 10
			num_train_steps = 10
		print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(num_train_steps, num_eval_steps, num_storage_steps))

		print(" model type {}".format(FLAGS.model_type))

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":init_lr/worker_count, 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps,
							"worker_count":worker_count,
							"opt_type":FLAGS.opt_type})

		model_io_config = Bunch({"fix_lm":False})
		
		num_classes = FLAGS.num_classes

		model_fn = model_fn_builder(config, num_classes, init_checkpoint, 
												model_reuse=None, 
												load_pretrained=True,
												model_io_config=model_io_config,
												opt_config=opt_config,
												exclude_scope="",
												not_storage_params=[],
												target="",
												output_type="estimator")

		name_to_features = {
				"input_ids":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"input_mask":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"segment_ids":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"label_ids":
						tf.FixedLenFeature([], tf.int64),
		}

		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t

			return example 

		params = Bunch({})
		params.epoch = FLAGS.epoch
		params.batch_size = FLAGS.batch_size

		train_features = lambda: tf_data_utils.train_input_fn(train_file,
									_decode_record, name_to_features, params, if_shard=FLAGS.if_shard,
									worker_count=worker_count,
									task_index=task_index)

		eval_features = lambda: tf_data_utils.eval_input_fn(dev_file,
									_decode_record, name_to_features, params, if_shard=FLAGS.if_shard,
									worker_count=worker_count,
									task_index=task_index)

		print("===========begin to train============")
		sess_config = tf.ConfigProto(allow_soft_placement=False,
									log_device_placement=False)

		checkpoint_dir = checkpoint_dir if task_index == 0 else None

		print("start training")

		hooks = []
		if FLAGS.opt_type == "ps":
			sync_replicas_hook = optimizer_fn.opt.make_session_run_hook(is_chief, num_tokens=0)
			hooks.append(sync_replicas_hook)
		elif FLAGS.opt_type == "pai_soar" and pai:
			sess = tf.train.MonitoredTrainingSession(master=target,
												 is_chief=is_chief,
												 config=sess_config,
												 hooks=hooks,
												 checkpoint_dir=checkpoint_dir,
												 save_checkpoint_steps=num_storage_steps)
		elif FLAGS.opt_type == "hvd" and hvd:
			bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
			hooks.append(bcast_hook)
			sess_config.gpu_options.allow_growth = True
			sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
		else:
			print("==not supported==")

		run_config = tf.estimator.RunConfig(model_dir=checkpoint_dir, 
											save_checkpoints_steps=num_storage_steps,
											session_config=sess_config)
		
		model_estimator = tf.estimator.Estimator(
						model_fn=model_fn,
						config=run_config)

		train_spec = tf.estimator.TrainSpec(input_fn=train_features, 
										max_steps=num_train_steps)

		eval_spec = tf.estimator.EvalSpec(input_fn=eval_features, 
										steps=num_eval_steps)

		tf.estimator.train_and_evaluate(model_estimator, train_spec, eval_spec)
		
		
def main():
    """
    Run training/evaluation
    """
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        log_args(args)

    if args.seed is not None:
        tf.random.set_random_seed(args.seed)
        np.random.seed(args.seed)
        cp.random.seed(args.seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \
       and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1":
        args.fp16 = False

    # directory to store/read final checkpoint
    if args.mode == 'train' and hvd.rank() == 0:
        LOGGER.log("Saving best checkpoint to {}".format(args.checkpoint_dir))

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)
    final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data+'/train_ratings.pickle')
    test_df = pd.read_pickle(args.data+'/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)
    # Negatives indicator for negatives generation
    neg_mat = np.ones((nb_users, nb_items), dtype=bool)
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples
    )
    test_users, test_items = get_local_test_data(
        pos_test_users, pos_test_items
    )

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
        )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None,))
    items = tf.placeholder(tf.int32, shape=(None,))
    labels = tf.placeholder(tf.int32, shape=(None,))
    is_dup = tf.placeholder(tf.float32, shape=(None,))
    dropout = tf.placeholder_with_default(args.dropout, shape=())
    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'fp16': args.fp16,
            'val_batch_size': args.valid_negative+1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL'
    )
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manual initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
            in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
            sess.run(
                eval_op,
                feed_dict={
                    users: user_batch,
                    items: item_batch,
                    is_dup: dup_batch,
                    dropout: 0.0
                }
            )
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            LOGGER.log("Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                       .format(eval_duration, hit_rate, ndcg))

            eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
            LOGGER.log('Average Eval Throughput: {:.4f}'.format(eval_throughput))
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    time_to_best = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    if hvd.rank() == 0:
        LOGGER.log("Begin Training. Setup Time: {}".format(begin_train - script_start))
    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
            in zip(data_generator.train_users_batches,
                   data_generator.train_items_batches,
                   data_generator.train_labels_batches):
            sess.run(
                train_op,
                feed_dict={
                    users: user_batch.get(),
                    items: item_batch.get(),
                    labels: label_batch.get()
                }
            )
        train_duration = time.time() - train_start
        ## Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)
        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
                sess.run(
                    eval_op,
                    feed_dict={
                        users: user_batch,
                        items: item_batch,
                        is_dup: dup_batch,
                        dropout: 0.0
                    }
                )
            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers
            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)
            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start
            ## Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                if args.verbose:
                    log_string = "Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                    LOGGER.log(
                        log_string.format(
                            epoch,
                            train_duration,
                            eval_duration,
                            hit_rate,
                            ndcg
                        )
                    )

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best =  time.time() - begin_train
                    if not args.verbose:
                        log_string = "New Best Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                        LOGGER.log(
                            log_string.format(
                                epoch,
                                train_duration,
                                eval_duration,
                                hit_rate,
                                ndcg
                            )
                        )
                    # Save, if meets target
                    if hit_rate > args.target:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0]*(args.negative_samples+1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0]*(args.valid_negative+1) / eval_times
        LOGGER.log(' ')

        LOGGER.log('batch_size: {}'.format(args.batch_size))
        LOGGER.log('num_gpus: {}'.format(hvd.size()))
        LOGGER.log('AMP: {}'.format(1 if args.amp else 0))
        LOGGER.log('seed: {}'.format(args.seed))
        LOGGER.log('Minimum Train Time per Epoch: {:.4f}'.format(np.min(train_times)))
        LOGGER.log('Average Train Time per Epoch: {:.4f}'.format(np.mean(train_times)))
        LOGGER.log('Average Train Throughput:     {:.4f}'.format(np.mean(train_throughputs)))
        LOGGER.log('Minimum Eval Time per Epoch:  {:.4f}'.format(np.min(eval_times)))
        LOGGER.log('Average Eval Time per Epoch:  {:.4f}'.format(np.mean(eval_times)))
        LOGGER.log('Average Eval Throughput:      {:.4f}'.format(np.mean(eval_throughputs)))
        LOGGER.log('First Epoch to hit:           {}'.format(first_to_target))
        LOGGER.log('Time to Train:                {:.4f}'.format(time_to_train))
        LOGGER.log('Time to Best:                 {:.4f}'.format(time_to_best))
        LOGGER.log('Best HR:                      {:.4f}'.format(best_hr))
        LOGGER.log('Best Epoch:                   {}'.format(best_epoch))

    sess.close()
    return
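The evaluation loop above combines per-worker metrics by reducing each worker's local sum and count buffers with mpi_comm.Reduce and only then dividing on rank 0. Summing numerators and denominators separately is the correct way to get a global mean when workers evaluate different numbers of examples (averaging per-worker ratios would not be). A minimal standalone sketch of the same pattern with mpi4py:

from mpi4py import MPI
import numpy as np

def global_mean(local_sum, local_count, comm=MPI.COMM_WORLD):
    """Combine per-worker (sum, count) pairs into a single mean on rank 0."""
    global_sum = np.zeros(1)
    global_count = np.zeros(1)
    # Reduce defaults to op=MPI.SUM and root=0.
    comm.Reduce(np.array([local_sum], dtype=np.float64), global_sum)
    comm.Reduce(np.array([local_count], dtype=np.float64), global_count)
    if comm.rank == 0:
        return float(global_sum[0] / global_count[0])
    return None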
def set_conf(cfg):
    print(cfg.pretty())

    # debug
    if cfg.get("enable_check_numerics"):
        print("Enabling numerics debugging...")
        tf.debugging.enable_check_numerics(stack_height_limit=30,
                                           path_length_limit=50)

    # the model we are running
    setattr(sys.modules[__name__], "MODEL_NAME", cfg.MODEL_NAME)

    seed_offset = 0

    setattr(sys.modules[__name__], "horovod_worker", False)

    # distributed setup
    if horovod:
        # Initialize Horovod
        hvd.init()
        # Pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                       'GPU')

        seed_offset = hvd.rank()
        if seed_offset > 0:
            setattr(sys.modules[__name__], "horovod_worker", True)

    # RNG
    tf.random.set_seed(cfg.seed + seed_offset)
    rng_state = tf.Variable([0, 0, cfg.seed + seed_offset], dtype=tf.int64)
    setattr(sys.modules[__name__], "rng",
            tf.random.Generator.from_state(rng_state, alg='philox'))

    # RUN CONFIGURATION
    setattr(sys.modules[__name__], "N_sim_batch", cfg.N_sim_batch)
    setattr(sys.modules[__name__], "N_epochs_per_episode",
            cfg.N_epochs_per_episode)
    setattr(sys.modules[__name__], "N_minibatch_size", cfg.N_minibatch_size)
    setattr(sys.modules[__name__], "N_episode_length", cfg.N_episode_length)
    setattr(sys.modules[__name__], "N_episodes", cfg.N_episodes)
    setattr(sys.modules[__name__], "expectation_pseudo_draws",
            cfg.get('expectation_pseudo_draws', 5))
    setattr(sys.modules[__name__], "expectation_type",
            cfg.get('expectation_type', 'product'))

    # OUTPUT FILE FOR ERROR MEASURES
    setattr(sys.modules[__name__], "error_filename", cfg.error_filename)

    # VARIABLES
    try:
        import importlib
        variables = importlib.import_module(MODEL_NAME + ".Variables")
        config_states = variables.states
        config_policies = variables.policies
        config_definitions = variables.definitions
        config_constants = variables.constants
        # for backward compatibility in case constants are also in yaml
        if cfg.constants:
            config_constants.update(cfg.constants)
        print("Variables imported from Variables module")
        print(__name__)
    except ImportError:
        config_states = cfg.states
        config_policies = cfg.policies
        config_definitions = cfg.definitions
        config_constants = cfg.constants

    setattr(sys.modules[__name__], "states",
            [s['name'] for s in config_states])
    setattr(sys.modules[__name__], "policy_states",
            [s['name'] for s in config_policies])
    setattr(sys.modules[__name__], "definitions",
            [s['name'] for s in config_definitions])

    state_bounds = {
        "lower": {},
        "penalty_lower": {},
        "upper": {},
        "penalty_upper": {}
    }

    for s in config_states:
        if "bounds" in s.keys() and "lower" in s["bounds"].keys():
            state_bounds["lower"][s["name"]] = s["bounds"]["lower"]
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            state_bounds["penalty_lower"][s["name"]] = penalty

        if "bounds" in s.keys() and "upper" in s["bounds"].keys():
            state_bounds["upper"][s["name"]] = s["bounds"]["upper"]
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            state_bounds["penalty_upper"][s["name"]] = penalty

    setattr(sys.modules[__name__], "state_bounds_hard", state_bounds)

    policy_bounds = {
        'lower': {},
        'penalty_lower': {},
        'upper': {},
        'penalty_upper': {}
    }

    for s in config_policies:
        if 'bounds' in s.keys() and 'lower' in s['bounds'].keys():
            policy_bounds['lower'][s['name']] = s['bounds']['lower']
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            policy_bounds['penalty_lower'][s['name']] = penalty

        if 'bounds' in s.keys() and 'upper' in s['bounds'].keys():
            policy_bounds['upper'][s['name']] = s['bounds']['upper']
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            policy_bounds['penalty_upper'][s['name']] = penalty

    setattr(sys.modules[__name__], "policy_bounds_hard", policy_bounds)

    definition_bounds = {
        'lower': {},
        'penalty_lower': {},
        'upper': {},
        'penalty_upper': {}
    }

    for s in config_definitions:
        if 'bounds' in s.keys() and 'lower' in s['bounds'].keys():
            definition_bounds['lower'][s['name']] = s['bounds']['lower']
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            definition_bounds['penalty_lower'][s['name']] = penalty

        if 'bounds' in s.keys() and 'upper' in s['bounds'].keys():
            definition_bounds['upper'][s['name']] = s['bounds']['upper']
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            definition_bounds['penalty_upper'][s['name']] = penalty

    setattr(sys.modules[__name__], "definition_bounds_hard", definition_bounds)

    # NEURAL NET
    tf.keras.backend.set_floatx(cfg.get('keras_precision', 'float32'))

    layers = []

    for i, layer in enumerate(cfg.layers, start=1):
        if i < len(cfg.layers):
            if 'dropout_rate' in layer['hidden']:
                layers.append(
                    tf.keras.layers.Dropout(
                        rate=layer['hidden']['dropout_rate']))
            if 'batch_normalize' in layer['hidden']:
                layers.append(
                    tf.keras.layers.BatchNormalization(
                        **layer['hidden']['batch_normalize']))
            layers.append(
                tf.keras.layers.Dense(
                    units=layer['hidden']['units'],
                    activation=layer['hidden']['activation'],
                    kernel_initializer=tf.keras.initializers.VarianceScaling(
                        scale=layer['hidden'].get('init_scale', 1.0),
                        mode=cfg.get('net_initializer_mode', 'fan_in'),
                        distribution=cfg.get('net_initializer_distribution',
                                             'truncated_normal'),
                        seed=i)))
        else:
            layers.append(
                tf.keras.layers.Dense(
                    units=len(policy_states),
                    activation=layer['output']['activation'],
                    kernel_initializer=tf.keras.initializers.VarianceScaling(
                        scale=layer['output'].get('init_scale', 1.0),
                        mode=cfg.get('net_initializer_mode', 'fan_in'),
                        distribution=cfg.get('net_initializer_distribution',
                                             'truncated_normal'),
                        seed=i)))

    policy_net = tf.keras.models.Sequential(layers)
    policy_net.build(input_shape=(None, len(states)))

    learning_rate_multiplier = 1 if not horovod else hvd.size()

    optim = getattr(tf.keras.optimizers, cfg.optimizer)(
        learning_rate=cfg.learning_rate * learning_rate_multiplier,
        clipvalue=cfg.clipvalue)

    # apply post-processing per-variable
    def policy(s):
        raw_policy = policy_net(s)
        for i, pol in enumerate(config_policies):
            if 'activation' in pol.keys():
                activation_str = pol['activation']
                if pol['activation'] == 'implied':
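                    # 'implied' rescales a sigmoid onto the policy's
                    # [lower, upper] interval (only when both bounds are given).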
                    if ('lower' in pol['bounds'].keys()
                            and 'upper' in pol['bounds'].keys()):
                        activation_str = 'lambda x: {l} + ({u} - {l}) * tf.math.sigmoid(x)'.format(
                            l=str(pol['bounds']['lower']),
                            u=str(pol['bounds']['upper']))
                raw_policy = tf.tensor_scatter_nd_update(
                    raw_policy, [[j, i] for j in range(s.shape[0])],
                    eval(activation_str)(raw_policy[:, i]))

        if cfg.get('keras_precision', 'float32') == 'float64':
            return tf.cast(raw_policy, tf.dtypes.float32)

        return raw_policy

    setattr(sys.modules[__name__], "policy", policy)
    setattr(sys.modules[__name__], "policy_net", policy_net)

    # OPTIMIZER
    setattr(sys.modules[__name__], "optimizer", optim)

    # CONSTANTS
    for (key, value) in config_constants.items():
        setattr(sys.modules[__name__], key, value)

    # STATE INITIALIZATION
    def initialize_states(N_batch=N_sim_batch):
        # starting state
        init_val = tf.ones([N_batch, len(states)])
        # apply special inits if any
        for i, s in enumerate(config_states):
            if 'init' in s:
                init_val = tf.tensor_scatter_nd_update(
                    init_val, [[j, i] for j in range(init_val.shape[0])],
                    getattr(rng,
                            s["init"]["distribution"])(shape=(N_batch, ),
                                                       **s["init"]["kwargs"]))
        return init_val

    starting_state = tf.Variable(initialize_states())

    setattr(sys.modules[__name__], "starting_state", starting_state)
    setattr(sys.modules[__name__], "initialize_states", initialize_states)
    setattr(sys.modules[__name__], "initialize_each_episode",
            cfg.get("initialize_each_episode", False))
    setattr(sys.modules[__name__], "N_simulated_batch_size",
            cfg.get("N_simulated_batch_size", None))
    setattr(sys.modules[__name__], "N_simulated_episode_length",
            cfg.get("N_simulated_episode_length", None))

    # LOGGING
    setattr(sys.modules[__name__], "LOG_DIR", os.getcwd())

    if cfg.STARTING_POINT == 'NEW' and not horovod_worker:
        for file in os.scandir(os.getcwd()):
            if not ".hydra" in file.path:
                os.unlink(file.path)

    setattr(sys.modules[__name__], "writer",
            tf.summary.create_file_writer(os.getcwd()))

    setattr(sys.modules[__name__], "current_episode", tf.Variable(1))
    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               current_episode=current_episode,
                               optimizer=optimizer,
                               policy=policy_net,
                               rng_state=rng_state,
                               starting_state=starting_state)
    manager = tf.train.CheckpointManager(
        ckpt,
        os.getcwd(),
        max_to_keep=cfg.MAX_TO_KEEP_NUMBER,
        step_counter=current_episode,
        checkpoint_interval=cfg.CHECKPOINT_INTERVAL)

    if cfg.STARTING_POINT == 'LATEST' and manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
        ckpt.restore(manager.latest_checkpoint)

    if cfg.STARTING_POINT != 'LATEST' and cfg.STARTING_POINT != 'NEW':
        print("Restored from {}".format(cfg.STARTING_POINT))
        ckpt.restore(cfg.STARTING_POINT)

    setattr(sys.modules[__name__], "optimizer_starting_iteration",
            optimizer.iterations.numpy())
    setattr(sys.modules[__name__], "ckpt", ckpt)
    setattr(sys.modules[__name__], "manager", manager)

    tf.print("Optimizer configuration:")
    tf.print(optimizer.get_config())

    tf.print("Starting state:")
    tf.print(starting_state)
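
Side note: the snippet above multiplies the configured learning rate by hvd.size() whenever Horovod is active. Below is a minimal stand-alone sketch of that convention; the optimizer class and base rate are illustrative assumptions, since the real example reads both from its config (cfg.optimizer, cfg.learning_rate).

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each Horovod worker processes a full batch, so the effective global batch
# grows with hvd.size(); scaling the base rate by the same factor keeps the
# update per global batch roughly constant (a base rate of 1e-3 is assumed).
base_learning_rate = 1e-3
optimizer = tf.keras.optimizers.Adam(
    learning_rate=base_learning_rate * hvd.size())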
Example #22
def main(_):
    # Horovod: initialize Horovod if using multiple GPUs.
    if FLAGS.use_multi_gpu:
        hvd.init()
        FLAGS.output_dir = (FLAGS.output_dir if hvd.rank() == 0 else
                            os.path.join(FLAGS.output_dir, str(hvd.rank())))

    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.use_multi_gpu:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host),
            log_step_count_steps=25,
            session_config=config)
    else:
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Horovod: adjust number of steps based on number of GPUs.
        if FLAGS.use_multi_gpu:
            num_warmup_steps = num_warmup_steps // hvd.size()
            num_train_steps = num_train_steps // hvd.size()

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                use_multi_gpu=FLAGS.use_multi_gpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # Horovod: In the case of multi GPU training with Horovod, adding
        # hvd.BroadcastGlobalVariablesHook(0) hook, broadcasts the initial variable
        # states from rank 0 to all other processes. This is necessary to ensure
        # consistent initialization of all workers when training is started with
        # random weights or restored from a checkpoint.
        if FLAGS.use_multi_gpu:
            hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        else:
            hooks = []
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
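
Note on the example above: instead of disabling checkpoints on non-zero ranks, it redirects every extra rank into its own subdirectory so that concurrent workers never write to the same files. A minimal sketch of that redirection, with an assumed base directory, mirroring the FLAGS.output_dir logic at the top of main:

import os
import horovod.tensorflow as hvd

hvd.init()

output_dir = '/tmp/bert_output'  # assumed base directory
if hvd.rank() != 0:
    # Rank 0 keeps the canonical directory; every other rank gets an
    # isolated per-rank subdirectory for its own checkpoints.
    output_dir = os.path.join(output_dir, str(hvd.rank()))
os.makedirs(output_dir, exist_ok=True)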
Example #23
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if going for reduction in each iteration step. Using
    # tf-queue or dataset is better to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(original_img_size, img_chns,
                                                 img_rows, img_cols,
                                                 batch_size, filters, num_conv,
                                                 intermediate_dim, latent_dim,
                                                 epsilon_std)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    # callbacks = []
    callbacks = [hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0)]
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            verbose=hvd.local_rank() == 0,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
    init = tf.global_variables_initializer()

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bcast = hvd.broadcast_global_variables(0)

# Step 5: Begin training.

# Horovod: adjust number of steps based on number of GPUs.
num_steps = 100000 // hvd.size() + 1

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(graph=graph, config=config) as session:
    # We must initialize all variables before we use them.
    init.run()
    bcast.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        # simulate various sentence length by randomization
        batch_size = random.randint(max_batch_size // 2, max_batch_size)
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    parser.add_argument('--init_restore_dir', type=str,
                        default='check_points/pretrain_models/google_bert_base/bert_model.ckpt')
    parser.add_argument('--checkpoint_dir', type=str,
                        default='check_points/DRCD/google_bert_base/')
    parser.add_argument('--setting_file', type=str, default='setting.txt')
    parser.add_argument('--log_file', type=str, default='log.txt')

    # use some global vars for convenience
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    n_gpu = len(args.gpu_ids.split(','))
    if n_gpu > 1:
        assert hvd
        hvd.init()
        mpi_size = hvd.size()
        mpi_rank = hvd.local_rank()
        assert mpi_size == n_gpu
        training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        print_rank0('GPU NUM', n_gpu)
    else:
        hvd = None
        mpi_size = 1
        mpi_rank = 0
        training_hooks = None
        print('GPU NUM', n_gpu)

    args.checkpoint_dir += ('/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/'
                            .format(args.train_epochs, args.n_batch, args.lr, args.warmup_rate, args.max_ans_length))
    args = utils.check_args(args, mpi_rank)
    print_rank0('######## generating data ########')
Example #26
def main(_):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    processors = {'consensus': ConsensusProcessor}

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()

    label_list, label_map = processor.get_labels()
    inv_label_map = {v: k for k, v in label_map.items()}
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list, label_map,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank],
            FLAGS.replace_span_A, FLAGS.replace_span_B)
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        tf.compat.v1.logging.info("  Num of labels = %d", len(label_list))
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
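        # Two throughput figures are reported: one over total wall-clock time,
        # one over the hook-measured step time that excludes the hook's
        # skipped steps.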
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, label_map, FLAGS.max_seq_length,
            tokenizer, eval_file, FLAGS.replace_span_A, FLAGS.replace_span_B)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info(
            "  Num examples = %d (%d actual, %d padding)", len(eval_examples),
            num_actual_eval_examples,
            len(eval_examples) - num_actual_eval_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 label_map,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, predict_file,
                                                 FLAGS.replace_span_A,
                                                 FLAGS.replace_span_B)
        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info(
            "  Num examples = %d (%d actual, %d padding)",
            len(predict_examples), num_actual_predict_examples,
            len(predict_examples) - num_actual_predict_examples)
        tf.compat.v1.logging.info("  Batch size = %d",
                                  FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()
        output_class_file = os.path.join(FLAGS.output_dir,
                                         "test_output_labels.txt")
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file,
                               "w") as writer, tf.io.gfile.GFile(
                                   output_class_file, "w") as writer2:
            num_written_lines = 0
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                probabilities = prediction["probabilities"]
                logits = prediction["logits"]
                pr_res = np.argmax(logits, axis=-1)
                output = str(inv_label_map[pr_res]) + "\n"
                writer2.write(output)
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        num_sentences = (eval_hooks[-1].count -
                         eval_hooks[-1].skipped) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
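        # time_list is sorted ascending, so max() over the fastest p% of step
        # times approximates the p-th percentile latency reported below.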
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead,
            (eval_hooks[-1].count - eval_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        tf.compat.v1.logging.info("-----------------------------")
Example #27
def main(_):

    hvd.init()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))

        config = json.load(open(FLAGS.config_file, "r"))

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkoutpoint==={}".format(init_checkpoint))

        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "esim/bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.lstm_dim = 128
        config.num_heads = 12
        config.num_units = 768

        import json
        label_dict = json.load(open(FLAGS.label_id))

        # label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)
        label_tensor = None
        # config.loss = "focal_loss"

        json.dump(config, open(FLAGS.model_output + "/config.json", "w"))

        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)

        train_size = int(FLAGS.train_size / hvd.size())

        num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": (5e-5 / hvd.size()),
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "train_op": "adam"
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length * 2 + 3

        model_function = bert_esim.classifier_attn_model_fn_builder

        model_eval_fn = model_function(config,
                                       num_choice,
                                       init_checkpoint,
                                       model_reuse=None,
                                       load_pretrained=True,
                                       model_io_fn=model_io_fn,
                                       model_io_config=model_io_config,
                                       opt_config=opt_config,
                                       input_name=["a", "b"],
                                       label_tensor=label_tensor,
                                       not_storage_params=["adam", "adam_1"],
                                       exclude_scope_dict={"task": "esim"})

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
            """
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        # train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
        #                             _decode_record, name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features, params)

        # [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        model_io_fn.load_model(sess, init_checkpoint)
        print(" ==succeeded in loading model== ")

        sess.run(hvd.broadcast_global_variables(0))

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            # label_weight = []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    # for item in eval_result["label_ids"]:
                    #     label_weight.append(label_tensor[item])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            # f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight)
            # accuracy = accuracy_score(label_id, label, sample_weight=label_weight)
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy accuracy {} {} f1 {}".format(
                total_accuracy / i, accuracy, f1))
            return total_accuracy / i, f1

        # print("===========begin to train============")
        # train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
Example #28
def parallax_run_mpi(single_gpu_meta_graph_def, config):
    hostname = os.getenv(PARALLAX_HOSTNAME, 0)
    create_profile_directory(config.profile_config.profile_dir,
                             config.resource_info, hostname)

    mpi_meta_graph_def, tensor_or_op_name_to_replica_names = \
        graph_transform_mpi(single_gpu_meta_graph_def, config)
    worker_id = hvd.rank()
    num_workers = hvd.size()
        
    if config.profile_config.profile_dir:
        append_task_info(config.profile_config.profile_dir,
                         hostname,
                         ['worker:%d'%worker_id])

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)
        tf.train.import_meta_graph(mpi_meta_graph_def)

        if config.export_graph_path:
            export_meta_graph(config.export_graph_path, worker_id)

        if config.profile_config.profile_dir:
            path = os.path.join(config.profile_config.profile_dir, hostname,
                                'worker:%d'%worker_id)
            export_meta_graph(path, worker_id)
            
            if (config.profile_config.profile_worker is not None
                    and worker_id != config.profile_config.profile_worker):
                # Only one CUPTI profiler can run on a machine.
                # See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
                config.profile_config.profile_dir = None
            else:
                config.profile_config.profile_dir = \
                    os.path.join(config.profile_config.profile_dir, hostname,
                                 'worker:%d' % worker_id, 'run_meta')
 
        ckpt_hooks = build_ckpt_hooks(config.get_ckpt_config()) if worker_id == 0 else None

        sess_config = config.sess_config
        if sess_config is None:
            sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        sess = tf.train.MonitoredTrainingSession(
                is_chief=True,
                checkpoint_dir=config.get_ckpt_config().ckpt_dir if worker_id == 0 else None,
                # TODO: Allow user-defined hooks
                hooks=None,
                chief_only_hooks=ckpt_hooks,
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None,
                config=sess_config)

        parallax_log.debug(
            "Created MonitoredTrainingSession for worker %d" % worker_id)
        _init_global_vars(sess)
        parallax_log.debug(
            "Finished initialization process, start training on "
            "worker %d" % worker_id)
        step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0])
        sess_context = \
            ParallaxSessionContext(step,
                                   tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0],
                                   config.profile_config.profile_dir,
                                   config.profile_config.profile_steps,
                                   config.profile_config.profile_range,
                                   tensor_or_op_name_to_replica_names,
                                   1,
                                   config.resource_info['master'][0])
        sess_context.set_parallax_session_context()
        return sess, num_workers, worker_id, 1
Example #29
    def test_horovod_allgather_grad_gpu(self):
        """Test the correctness of the allgather gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_ALLGATHER.
            return

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) *
                        rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    gathered = hvd.allgather(tensor)
                    grad_list = []
                    for r, tensor_size in enumerate(tensor_sizes):
                        g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                        grad_list.append(g)
                    grad_ys = tf.concat(grad_list, axis=0)
                with tf.device("/gpu:%d" % local_rank):
                    grad_out = tape.gradient(gathered, tensor, grad_ys)
            else:
                tensor = tf.ones([tensor_sizes[rank]] + [17] *
                                 (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                with tf.device("/gpu:%d" % local_rank):
                    grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #30
def FastTextEstimator(model_dir, config=None):
    params = {
        "learning_rate": FLAGS.learning_rate,
    }

    def model_fn(features, labels, mode, params):
        features["text"] = tf.sparse_tensor_to_dense(features["text"],
                                                     default_value=" ")
        # if FLAGS.use_ngrams:
        #     if FLAGS.ngrams is not None:
        #         ngrams_list = text_utils.ParseNgramsOpts(FLAGS.ngrams)
        #         features["ngrams"] = tf.py_func(text_utils.GenerateNgrams,
        #                                         [features["text"], ngrams_list], tf.string)
        text_lookup_table = tf.contrib.lookup.index_table_from_file(
            FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size)
        text_ids = text_lookup_table.lookup(features["text"])
        text_embedding_w = tf.Variable(
            tf.random_uniform([
                FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets,
                FLAGS.embedding_dimension
            ], -0.1, 0.1))
        text_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
            text_embedding_w, text_ids),
                                        axis=-2)
        input_layer = text_embedding
        # if FLAGS.use_ngrams:
        #     ngram_hash = tf.string_to_hash_bucket(features["ngrams"],
        #                                           FLAGS.num_ngram_buckets)
        #     ngram_embedding_w = tf.Variable(tf.random_uniform(
        #         [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension], -0.1, 0.1))
        #     ngram_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
        #         ngram_embedding_w, ngram_hash), axis=-2)
        #     ngram_embedding = tf.expand_dims(ngram_embedding, -2)
        #     input_layer = tf.concat([text_embedding, ngram_embedding], -1)
        num_classes = FLAGS.num_labels
        logits = tf.contrib.layers.fully_connected(inputs=input_layer,
                                                   num_outputs=num_classes,
                                                   activation_fn=None)
        predictions = tf.argmax(logits, axis=-1)
        probs = tf.nn.softmax(logits)
        loss, train_op = None, None
        metrics = {}
        if mode != tf.estimator.ModeKeys.PREDICT:
            label_lookup_table = tf.contrib.lookup.index_table_from_file(
                FLAGS.label_file, vocab_size=FLAGS.num_labels)
            labels = label_lookup_table.lookup(labels)
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                               logits=logits))
            opt = tf.train.AdamOptimizer(params["learning_rate"])
            if FLAGS.horovod:
                opt = hvd.DistributedOptimizer(opt)
            train_op = opt.minimize(loss,
                                    global_step=tf.train.get_global_step())
            metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
        exports = {}
        if FLAGS.export_dir:
            exports = Exports(probs, text_embedding)
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=metrics,
                                          export_outputs=exports)

    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement)
    if FLAGS.horovod:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    config = tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=FLAGS.checkpoint_steps,
        session_config=session_config)
    return tf.estimator.Estimator(model_fn=model_fn,
                                  model_dir=model_dir,
                                  params=params,
                                  config=config)
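# Illustrative sketch, not part of the example above: conceptually,
# hvd.DistributedOptimizer averages each gradient across all ranks with an
# allreduce before the wrapped optimizer applies it. A rough TF1-style
# equivalent (the function name here is hypothetical) could look like this:
import tensorflow as tf
import horovod.tensorflow as hvd

def allreduce_and_apply(opt, loss, variables):
    # Compute local gradients, average them across workers, then apply.
    grads_and_vars = opt.compute_gradients(loss, var_list=variables)
    averaged = [(hvd.allreduce(grad), var)
                for grad, var in grads_and_vars if grad is not None]
    return opt.apply_gradients(averaged,
                               global_step=tf.train.get_global_step())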
Beispiel #31
0
        ds = get_val_dataflow(args.data, batch, fbresnet_augmentor(False))
        model = Model(args.depth)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
        sys.exit()

    logger.info("Training on {}".format(socket.gethostname()))
    # Print some information for sanity check.
    os.system("nvidia-smi")
    assert args.load is None

    hvd.init()

    if args.logdir is None:
        args.logdir = os.path.join(
            'train_log',
            'Horovod-{}GPUs-{}Batch'.format(hvd.size(), args.batch))

    if hvd.rank() == 0:
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Rank={}, Local Rank={}, Size={}".format(
        hvd.rank(), hvd.local_rank(), hvd.size()))

    model = Model(args.depth, loss_scale=1.0 / hvd.size())
    config = get_config(model, fake=args.fake)
    """
    Sec 3: standard communication primitives like
    allreduce [11] perform summing, not averaging
    """
    trainer = HorovodTrainer(average=False)
    launch_train_with_config(config, trainer)
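# Note on the example above: the trainer performs a summing allreduce
# (average=False), while the model scales its loss by 1 / hvd.size(). Summing
# N gradients of loss / N equals the mean of the N unscaled gradients, so the
# effective update matches the usual averaged-gradient behaviour, e.g. with
# N = 4 and per-worker gradients g1..g4: sum_i(g_i / 4) = mean(g1, ..., g4).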
Beispiel #32
0
	def set_model(self, model):
		self.model = model
		self.graph = tf.Graph()
		with self.graph.as_default():
			#Horovod added: Normal workflow
			config1 = tf.ConfigProto(log_device_placement=False)
			config1.gpu_options.allow_growth = True
			config1.gpu_options.visible_device_list = str(hvd.local_rank())
			self.sess = tf.Session(config=config1)
			#Horovod end
			with self.sess.as_default():
				initializer = tf.contrib.layers.xavier_initializer(uniform = True)
				with tf.variable_scope("model", reuse=tf.AUTO_REUSE, initializer = initializer):
					self.trainModel = self.model(config = self)
					#Horovod added: Vary the learning rate, dist optimizer
					if self.optimizer != None:
						pass
					elif self.opt_method == "Adagrad" or self.opt_method == "adagrad":
						self.optimizer = tf.train.AdagradOptimizer(learning_rate = self.alpha * hvd.size(), initial_accumulator_value=1e-20)
					elif self.opt_method == "Adadelta" or self.opt_method == "adadelta":
						self.optimizer = tf.train.AdadeltaOptimizer(self.alpha * hvd.size())
					elif self.opt_method == "Adam" or self.opt_method == "adam":
						self.optimizer = tf.train.AdamOptimizer(self.alpha * hvd.size())
					else:
						self.optimizer = tf.train.GradientDescentOptimizer(self.alpha * hvd.size() * self.sync_after)

					################################################################
					# Fetch a list of our network's trainable parameters.
					self.trainable_vars = tf.trainable_variables()
					#print("Shape of trainable vars: {}".format(np.array(self.trainable_vars)))

					# Create variables to store accumulated gradients
					self.accumulators = [
					    tf.Variable(
					        tf.zeros_like(tv.initialized_value()),
					        trainable=False
					    ) for tv in self.trainable_vars
					]
					#print("Shape of accumulators: {}".format(np.array(self.accumulators)))

					# Create a variable for counting the number of accumulations
					self.accumulation_counter = tf.Variable(0.0, trainable=False)

					# Compute gradients; grad_pairs contains (gradient, variable) pairs
					self.grad_pairs = self.optimizer.compute_gradients(self.trainModel.loss, self.trainable_vars)
					# print("Shape of grad_pairs: {}".format(np.array(self.grad_pairs)))
					# for g, v in self.grad_pairs:
					# 	print("Shape of grad: {}".format(np.array(g)))


					# Create operations which add a variable's gradient to its accumulator.
					self.accumulate_ops = [
					    accumulator.assign_add(
					        grad
					    ) for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs) if grad is not None
					]

					# The final accumulation operation is to increment the counter
					self.accumulate_ops.append(self.accumulation_counter.assign_add(1.0))

					# Update trainable variables by applying the accumulated gradients
					# divided by the counter. Note: apply_gradients takes in a list of 
					# (grad, var) pairs
					# self.apply_step = self.optimizer.apply_gradients(
					#     [(accumulator / self.accumulation_counter, var) \
					#         for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs)]
					# )

					# Accumulators must be zeroed once the accumulated gradient is applied.
					self.zero_ops = [
					    accumulator.assign(
					        tf.zeros_like(tv)
					    ) for (accumulator, tv) in zip(self.accumulators, self.trainable_vars)
					]

					# Add one last op for zeroing the counter
					self.zero_ops.append(self.accumulation_counter.assign(0.0))
					################################################################



					# self.dist_optimizer = hvd.DistributedOptimizer(self.optimizer)
					# self.train_op = self.dist_optimizer.minimize(self.trainModel.loss)
					#Horovod end
				self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
				if(hvd.rank() == 0):
					self.saver = tf.train.Saver()
					# self.logSummary = tf.summary.scalar('Train_loss', self.trainModel.loss)
					# self.train_writer = tf.summary.FileWriter('./train', self.sess.graph)
				self.sess.run(tf.global_variables_initializer())
				
				#Horovod added: Normal workflow
				self.sess.run(hvd.broadcast_global_variables(0))
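# Note on the example above: the intended cycle is to run accumulate_ops for a
# number of mini-batches, apply the accumulated gradients divided by
# accumulation_counter (the commented-out apply_step), and then run zero_ops
# before the next cycle; the hvd.DistributedOptimizer wrapping is likewise
# left commented out here.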
Beispiel #33
0
def main(device, input_path_train, input_path_validation, downsampling_fact,
         downsampling_mode, channels, data_format, label_id, weights,
         image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, model, decoder,
         fs_type, optimizer, batch, batchnorm, num_epochs, dtype,
         disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor):
    #init horovod
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))

    #downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #parameters
    per_rank_output = False
    loss_print_interval = 10

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,  #1
        intra_op_parallelism_threads=6,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank + 2)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data = load_data(input_path_train, True, trn_sz, horovod)
    val_data = load_data(input_path_validation, False, val_sz, horovod)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("Shape of val_data is {}".format(val_data.shape[0]))
        print("done.")

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Decoder: {}".format(decoder))
        print("Batch normalization: {}".format(batchnorm))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        #print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = trn_data.shape[0] // comm_local_size
    else:
        num_samples = trn_data.shape[0] // comm_size
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank,
                                                       num_steps_per_epoch))
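    # Each rank therefore trains on its own shard of the data: with fs_type
    # "local" the divisor is comm_local_size (presumably each node reads its
    # own local copy), otherwise comm_size, i.e. one shard per rank across the
    # whole job. For example, 1000 samples with 8 ranks and batch size 4 give
    # 1000 // 8 // 4 = 31 steps per epoch per rank.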

    with training_graph.as_default():
        #create readers
        trn_reader = h5_input_reader(input_path_train,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id,
                                     sample_target=output_sampling)
        val_reader = h5_input_reader(input_path_validation,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig,
              image_width_orig) if data_format == "channels_first" else
             (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig), (batch)))
        next_elem = iterator.get_next()

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                #do downsampling
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 2, 3, 1])

                #crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                             next_elem[3])

                #be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 3, 1, 2])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        #set up model
        model = deeplab_v3_plus_generator(num_classes=3,
                                          output_stride=8,
                                          base_architecture=model,
                                          decoder=decoder,
                                          batchnorm=batchnorm,
                                          pre_trained_model=None,
                                          batch_norm_decay=None,
                                          data_format=data_format)

        logit, prediction = model(next_elem[0], True, dtype)

        #set up loss
        loss = None

        #cast the logits to fp32
        logit = ensure_type(logit, tf.float32)
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "weighted_mean":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "focal":
            #one-hot-encode
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            #cast to FP32
            labels_one_hot = ensure_type(labels_one_hot, tf.float32)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.",
                             format(loss_type))

        # tf.summary.scalar('loss', loss)

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)
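        # hvd.allreduce averages across ranks by default, so loss_avg is the
        # mean (not the sum) of the per-rank training losses.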

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)
        # tf.summary.scalar('IOU', iou_avg)

        with tf.device(device):
            mem_usage_ops = [
                tf.contrib.memory_stats.MaxBytesInUse(),
                tf.contrib.memory_stats.BytesLimit()
            ]
        #hooks
        #these hooks are essential; the StopAtStepHook gets one additional step of slack at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)
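            # Running init_bcast after the rank-0 checkpoint restore below
            # makes every rank start from exactly the weights rank 0 holds.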
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if (not disable_checkpoints):
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        #tracing
        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)

        # instead of averaging losses over an entire epoch, use a moving
        #  window average
        recent_losses = []
        loss_window_size = 10
        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            with open('prm{}.log'.format(comm_rank), 'w') as f:
                f.write('{}\n'.format(hvd.rank()))
                for i, v in enumerate(tf.global_variables()):
                    f.write('{} {}\n'.format(i, v))
                print('Parameters checked')
            #broadcast loaded model variables
            if horovod:
                tb = time.time()
                sess.run(init_bcast)
                te = time.time()
                print('Model synchronization done in {} s'.format(te - tb))
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            print('Init iterations done')

            # figure out what step we're on (it won't be 0 if we are
            #  restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            #do the training
            epoch = 1
            step = 1

            prev_mem_usage = 0
            t_sustained_start = time.time()
            r_peak = 0

            #start training
            start_time = time.time()

            while not sess.should_stop():

                #training loop
                try:
                    #construct feed dict
                    t_inst_start = time.time()
                    _, tmp_loss, cur_lr = sess.run(
                        [
                            train_op,
                            (loss if per_rank_output else loss_avg), lr
                        ],
                        feed_dict={handle: trn_handle})
                    t_inst_end = time.time()
                    if "gpu" in device.lower():
                        mem_used = sess.run(mem_usage_ops)
                    else:
                        mem_used = [0, 0]
                    train_steps += 1
                    train_steps_in_epoch = train_steps % num_steps_per_epoch
                    recent_losses = [tmp_loss
                                     ] + recent_losses[0:loss_window_size - 1]
                    train_loss = sum(recent_losses) / len(recent_losses)
                    step += 1

                    r_inst = (t_inst_end - t_inst_start)
                    r_peak = max(r_peak, r_inst)

                    #print step report
                    eff_steps = train_steps_in_epoch if (
                        train_steps_in_epoch > 0) else num_steps_per_epoch
                    if (train_steps % loss_print_interval) == 0:
                        mem_used = sess.run(mem_usage_ops)
                        if per_rank_output:
                            print(
                                "REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}"
                                .format(comm_rank, train_steps, num_steps,
                                        train_loss,
                                        time.time() - start_time))
                        else:
                            if comm_rank == 0:
                                if True or mem_used[0] > prev_mem_usage:
                                    print(
                                        "memory usage: {:.2f} GB / {:.2f} GB".
                                        format(mem_used[0] / 2.0**30,
                                               mem_used[1] / 2.0**30))
                                    prev_mem_usage = mem_used[0]
                                print(
                                    "REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}, r_peak {:.3f}, lr {:.2g}"
                                    .format(train_steps, num_steps, train_loss,
                                            time.time() - start_time, r_inst,
                                            r_peak, cur_lr))

                    # summary = sess.run([merged], feed_dict=feed_dict(False))
                    # test_writer.add_summary(summary, train_steps)

                    #do the validation phase
                    if train_steps_in_epoch == 0:
                        end_time = time.time()
                        #print epoch report
                        if per_rank_output:
                            print(
                                "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                .format(
                                    comm_rank, epoch, num_epochs, train_loss,
                                    time.time() - start_time,
                                    1e-12 * flops * num_steps_per_epoch /
                                    (end_time - t_sustained_start)))
                        else:
                            if comm_rank == 0:
                                print(
                                    "COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                    .format(
                                        epoch, num_epochs, train_loss,
                                        time.time() - start_time,
                                        1e-12 * flops * num_steps_per_epoch /
                                        (end_time - t_sustained_start)))

                        #evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        while True:
                            try:
                                #construct feed dict
                                _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run(
                                    [
                                        iou_update_op,
                                        (loss
                                         if per_rank_output else loss_avg),
                                        prediction, next_elem[1], next_elem[3]
                                    ],
                                    feed_dict={handle: val_handle})
                                #print some images
                                if comm_rank == 0 and not disable_imsave:
                                    if have_imsave:
                                        imsave(
                                            image_dir + '/test_pred_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100)
                                        imsave(
                                            image_dir + '/test_label_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            val_model_labels[0, ...] * 100)
                                        imsave(
                                            image_dir +
                                            '/test_combined_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            plot_colormap[
                                                val_model_labels[0, ...],
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2)])
                                    else:
                                        np.savez(
                                            image_dir + '/test_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.npz',
                                            prediction=np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100,
                                            label=val_model_labels[0, ...] *
                                            100,
                                            filename=val_model_filenames[0])
                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                if per_rank_output:
                                    print(
                                        "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                eval_loss))
                                else:
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    eval_loss))
                                if per_rank_output:
                                    iou_score = sess.run(iou_op)
                                    print(
                                        "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                iou_score))
                                else:
                                    iou_score = sess.run(iou_avg)
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    iou_score))
                                sess.run(iou_reset_op)
                                sess.run(val_init_op,
                                         feed_dict={handle: val_handle})
                                break

                        #reset counters
                        epoch += 1
                        step = 0
                        t_sustained_start = time.time()

                except tf.errors.OutOfRangeError:
                    break

        # write any cached traces to disk
        if tracing is not None:
            tracing_hook.write_traces()
Beispiel #34
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    checkpoint_dir = './checkpoints'
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model,
                                     optimizer=opt,
                                     step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)
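        # Variables are created lazily on the first forward pass in eager
        # mode, which is presumably why the broadcast happens here, after the
        # first batch, rather than before the loop.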

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
    init = tf.global_variables_initializer()

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bcast = hvd.broadcast_global_variables(0)

# Step 5: Begin training.

# Horovod: adjust number of steps based on number of GPUs.
num_steps = 100000 // hvd.size() + 1

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(graph=graph, config=config) as session:
    # We must initialize all variables before we use them.
    init.run()
    bcast.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        # simulate various sentence length by randomization
        batch_size = random.randint(max_batch_size // 2, max_batch_size)
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                    skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement)
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize
    test_batches = test_samples // batch_size
    print(train_samples, 'train samples')
    print(test_samples, 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    print(x_train.shape, 'train shape')
    # with tf.device('/cpu:0'):
    model = make_model(x_train.shape, num_classes,
                       filepath if checkpt_flag else None)

    lr = 0.0001 * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)  # Required for tf.train based optimizers

    # ------------------------------------- HAVE TO GET SESSION AFTER OPTIMIZER
    # sess = KB.get_session()
    # -------------------------------------------------------------------------

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    if hvd.rank() == 0:
        model.summary()

    KB.get_session().run(hvd.broadcast_global_variables(0))
    if not data_augmentation:
        print('Not using data augmentation.')
        # model.fit(x_train, y_train,
        #           batch_size=batch_size,
        #           epochs=epochs,
        #           validation_data=(x_test, y_test),
        #           shuffle=True,
        #           callbacks=callbacks)

        train_gen = ImageDataGenerator()
        test_gen = ImageDataGenerator()
        # Train the model. The training will randomly sample 1 / N batches of
        # training data and 3 / N batches of validation data on every worker,
        # where N is the number of workers. Over-sampling of validation data
        # helps to increase the probability that every validation example will be
        # evaluated.
        start_time = time.time()
        model.fit_generator(
            train_gen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            callbacks=callbacks,
            epochs=epochs,
            verbose=hvd.rank() == 0,
            validation_data=test_gen.flow(x_test, y_test,
                                          batch_size=batch_size),
            validation_steps=3 * test_batches // hvdsize)
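        # validation_steps = 3 * test_batches // hvdsize implements the 3 / N
        # over-sampling of validation data described in the comment above.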

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        start_time = time.time()
        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(
            datagen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            verbose=hvd.rank() == 0,
            callbacks=callbacks)

    if hvd.rank() == 0:
        elapsed_time = time.time() - start_time
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))

        metrics = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
def main(argv=None):
    # Initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    KB.set_session(tf.Session(config=config))

    # print('LOCAL RANK, OVERAL RANK: {}, {}'.format(hvd.local_rank(),
    #                                                hvd.rank()))

    ngpus = hvd.size()

    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = _parser(desc)

    num_devices_tfrecord = 1
    height, width = 224, 224  # Image dimensions. Gets resized if not match.
    distort_color = args.distort_color
    data_dir = args.datadir
    batch_size = args.batch_size  # * ngpus
    epochs = args.epochs
    imgs_per_epoch = args.imgs_per_epoch

    # Fit the model using data from the TFRecord data tensors.
    device_minibatches = RecordInputImagenetPreprocessor.device_minibatches
    images_tfrecord, labels_tfrecord, nrecords = device_minibatches(
        num_devices_tfrecord, data_dir, batch_size,
        height, width, distort_color, val=False)
    images_tfrecord = images_tfrecord[0]
    labels_tfrecord = labels_tfrecord[0]

    # CASTING FOR KERAS
    # labels[device_num] = tf.cast(labels_tfrecord, dtype)
    nclasses = 1000
    labels_tfrecord = tf.one_hot(labels_tfrecord, nclasses)

    nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords
    steps_per_epoch = nimgs_to_use // batch_size // hvd.size()
    # steps_per_epoch = 100

    # batch_shape = images_tfrecord.get_shape().as_list()
    # images = Input(tensor=images_tfrecord, batch_shape=x_batch_shape)
    images = Input(tensor=images_tfrecord)
    model = ResNet50(input_tensor=images, weights=None)
    if hvd.rank() == 0:
        model.summary()

        print('Num images: {}'.format(nrecords))

        if nimgs_to_use < nrecords:
            print('Using {} images per epoch'.format(nimgs_to_use))

        # print('IMAGES_TFRECORD: {}'.format(images_tfrecord))
        # print('LABELS_TFRECORD: {}'.format(labels_tfrecord))

    # Add Horovod Distributed Optimizer from nvcnn.py
    # momentum = 0.9
    # lr = 0.1
    # learning_rate = tf.train.exponential_decay(
    #             lr,
    #             self.global_step,
    #             decay_steps=FLAGS.lr_decay_epochs * nstep_per_epoch,
    #             decay_rate=FLAGS.lr_decay_rate,
    #             staircase=True)
    # opt = tf.train.MomentumOptimizer(self.learning_rate, momentum,
    #                                  use_nesterov=True)

    # lr = 0.001 * ngpus
    # opt = tf.train.AdamOptimizer()
    # opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    # opt = KO.TFOptimizer(opt)  # Required for tf.train based optimizers

    opt = KO.Adam()
    opt = hvd_keras.DistributedOptimizer(opt)

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  # metrics=['accuracy'],
                  target_tensors=[labels_tfrecord])

    # Broadcast variables from rank 0 to all other processes.
    KB.get_session().run(hvd.broadcast_global_variables(0))

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(),
                      SamplesPerSec(ngpus * batch_size)]

    # RecordInput is a yield op which doesn't use queue runners or queues.
    # Start the queue runners.
    # sess = KB.get_session()

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(sess, coord)

    start_time = time.time()
    model.fit(
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1)
    # verbose=hvd.rank() == 0)
    elapsed_time = time.time() - start_time

    if hvd.rank() == 0:
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))
        # loss = model.evaluate(None, None, steps=steps_per_epoch_val)

        images_tfrecord_val, labels_tfrecord_val, nrecords_val = \
            device_minibatches(num_devices_tfrecord, data_dir, batch_size,
                               height, width, distort_color, val=True)
        images_tfrecord_val = images_tfrecord_val[0]
        labels_tfrecord_val = labels_tfrecord_val[0]
        labels_tfrecord_val = tf.one_hot(labels_tfrecord_val, nclasses)

        # print('IMAGES_TFRECORD_VAL: {}'.format(images_tfrecord_val))
        # print('labels_tfrecord_val: {}'.format(labels_tfrecord_val))

        steps_per_epoch_val = nrecords_val // batch_size

        images_val = Input(tensor=images_tfrecord_val)
        model_val = model
        model_val.layers[0] = KL.InputLayer(input_tensor=images_val)
        model_val.compile(
            loss='categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy'],
            target_tensors=[labels_tfrecord_val])
        # model.summary()
        loss = model_val.evaluate(x=None, y=None, steps=steps_per_epoch_val)

        print('\nNum images evaluated, steps: {}, {}'.
              format(nrecords_val, steps_per_epoch_val))
        print('\nTest loss, acc: {}'.format(loss))
        # print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    # coord.request_stop()
    # coord.join(threads)

    KB.clear_session()  # do this for Horovod
Beispiel #38
0
#!/usr/bin/env python3

import os
print("pid %i: Hello" % os.getpid())

import tensorflow as tf
import horovod.tensorflow as hvd


# Initialize Horovod
hvd.init()

print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" % (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
Beispiel #39
0
def train():
    """
    init dir and log config
    """
    init_cluster_ray()
    hvd.init()
    base_dir, ckpt_dir, summary_dir = init_dir_and_log()

    kwargs = FLAGS.flag_values_dict()
    kwargs["BASE_DIR"] = base_dir
    kwargs["ckpt_dir"] = ckpt_dir
    """
    get one seg from rollout worker for dtype and shapes

    :param kwargs rollout worker config
    """
    logging.info('get one seg from Evaluator for dtype and shapes')
    ps = AsyncPS.remote()
    small_data_collector = RolloutCollector(
        server_nums=1, ps=ps, policy_evaluator_build_func=build_policy_evaluator,
        **kwargs)
    cache_struct_path = '/tmp/ExploreGoalLocationsLarge_%s.pkl' % FLAGS.dir

    if hvd.local_rank() == 0:
        structure = fetch_one_structure(small_data_collector, cache_struct_path=cache_struct_path, is_head=True)
    else:
        structure = fetch_one_structure(small_data_collector, cache_struct_path=cache_struct_path, is_head=False)

    del small_data_collector

    """
        init data prefetch thread, prepare_input_pipe
    """
    keys = list(structure.keys())
    dtypes = [structure[k].dtype for k in keys]
    shapes = [structure[k].shape for k in keys]
    segBuffer = tf.queue.FIFOQueue(
        capacity=FLAGS.qsize * FLAGS.batch_size,
        dtypes=dtypes,
        shapes=shapes,
        names=keys,
        shared_name="buffer")

    server_nums = FLAGS.nof_evaluator
    server_nums_refine = server_nums * 2 // FLAGS.cpu_per_actor
    nof_server_gpus = FLAGS.nof_server_gpus
    server_nums_refine = server_nums_refine // nof_server_gpus
    data_collector = RolloutCollector(server_nums=server_nums_refine, ps=ps,
                                      policy_evaluator_build_func=build_policy_evaluator, **kwargs)

    config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=1))
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)
    reader = QueueReader(
        sess=sess,
        global_queue=segBuffer,
        data_collector=data_collector,
        keys=keys,
        dtypes=dtypes,
        shapes=shapes)
    reader.daemon = True
    reader.start()

    dequeued = segBuffer.dequeue_many(FLAGS.batch_size)
    prephs, postphs = dict(), dict()
    for k, v in dequeued.items():
        if k == "state_in":
            prephs[k] = v
        else:
            prephs[k], postphs[k] = tf.split(
                v, [FLAGS.burn_in, FLAGS.seqlen], axis=1)
    prekeys = list(prephs.keys())
    postkeys = list(postphs.keys())

    ##  count frame and total steps
    num_frames = tf.get_variable(
        'num_environment_frames',
        initializer=tf.zeros_initializer(),
        shape=[],
        dtype=tf.int32,
        trainable=False)
    tf.summary.scalar("frames", num_frames)
    global_step = tf.train.get_or_create_global_step()

    dur_time_tensor = tf.placeholder(dtype=tf.float32)
    tf.summary.scalar('time_per_step', dur_time_tensor)

    #  set stage_op and build learner
    with tf.device("/gpu"):
        if FLAGS.use_stage:
            area = tf.contrib.staging.StagingArea(
                [prephs[key].dtype for key in prekeys] + [postphs[key].dtype for key in postkeys],
                [prephs[key].shape for key in prekeys] + [postphs[key].shape for key in postkeys])
            stage_op = area.put([prephs[key] for key in prekeys] + [postphs[key] for key in postkeys])
            from_stage = area.get()
            predatas = {key: from_stage[i] for i, key in enumerate(prekeys)}
            postdatas = {key: from_stage[i + len(prekeys)] for i, key in enumerate(postkeys)}
        else:
            stage_op = []
            predatas, postdatas = prephs, postphs

        act_space = FLAGS.act_space
        num_frames_and_train, global_step_and_train = build_learner(
            pre=predatas, post=postdatas, act_space=act_space,
            num_frames=num_frames)

    """
        add summary
    """
    summary_ops = tf.summary.merge_all()
    if hvd.local_rank() == 0:
        summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    """
        initialize and save ckpt
    """
    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=6)
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    sess.run(hvd.broadcast_global_variables(0))
    if hvd.local_rank() == 0:
        saver.save(sess, os.path.join(ckpt_dir, "PPOcGAE"), global_step=global_step)

    """
        step
    """
    total_frames = 0
    sess.run(stage_op)

    dur_time = 0
    while total_frames < FLAGS.total_environment_frames:
        start = time.time()

        total_frames, gs, summary, _ = sess.run(
            [num_frames_and_train, global_step_and_train, summary_ops, stage_op],
            feed_dict={dur_time_tensor: dur_time})

        if hvd.local_rank() == 0:
            if gs % 1 == 0:
                summary_writer.add_summary(summary, global_step=gs)
                dur_time = time.time() - start
                msg = "Global Step %d, Total Frames %d,  Time Consume %.2f" % (
                    gs, total_frames, dur_time)
                logging.info(msg)
            if gs % 25 == 0:
                ws = Model.get_ws(sess)
                logging.info('pushing weight to ps')
                ray.get(ps.push.remote(ws))
            if gs % 1000 == 0:
                saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        steps_per_epoch = train_samples // batch_size
    else:
        steps_per_epoch = int(round(
            float(train_samples) / batch_size / hvdsize + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among the vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    if hvd.rank() == 0:
        x = Input(shape=original_img_size)
        vae_val = make_vae(ldict, x)
        vae_val.compile(optimizer=opt, loss=None)
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        x = Input(shape=original_img_size)
        z_mean, _ = get_encoded(ldict, x)
        encoder = Model(x, z_mean)
        # :  :type encoder: Model

        decoder_input = Input(shape=(latent_dim,))
        x_decoded_mean_squash = get_decoded(ldict, decoder_input)
        generator = Model(decoder_input, x_decoded_mean_squash)
        # :  :type generator: Model

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square are transformed
        # through the inverse CDF (ppf) of the Gaussian to produce values of
        # the latent variables z, since the prior of the latent space is
        # Gaussian.
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size: (i + 1) * digit_size,
                       j * digit_size: (j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
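# Sketch of the shared-layers idea noted above (illustrative layer names; the
# body of make_shared_layers_dict is assumed, not shown in this example): each
# Keras layer is instantiated once and stored in a dict, so every model built
# from that dict reuses the same weights.
from keras.layers import Dense, Input
from keras.models import Model


def make_shared_layers(latent_dim=2, intermediate_dim=128):
    return {
        'hidden': Dense(intermediate_dim, activation='relu'),
        'z_mean': Dense(latent_dim),
    }


shared = make_shared_layers()
x_a = Input(shape=(784,))
x_b = Input(shape=(784,))
enc_a = Model(x_a, shared['z_mean'](shared['hidden'](x_a)))
enc_b = Model(x_b, shared['z_mean'](shared['hidden'](x_b)))
# enc_a and enc_b are distinct Model objects, yet they share the same Dense
# weights, just as vae, encoder, and generator share the layers held in ldict.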
Beispiel #41
0
def main():
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    args.world_size = hvd.size()
    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is None:
        if hvd.rank() == 0:
            seed = int(time.time())
        else:
            seed = None

        seed = mpi_comm.bcast(seed, root=0)
    else:
        seed = args.seed

    tf.random.set_random_seed(seed)
    np.random.seed(seed)
    cp.random.seed(seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

    if args.checkpoint_dir is not None:
        os.makedirs(args.checkpoint_dir, exist_ok=True)
        final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')
    else:
        final_checkpoint_path = None

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data+'/train_ratings.pickle')
    test_df = pd.read_pickle(args.data+'/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)
    # Negatives indicator for negatives generation
    neg_mat = np.ones((nb_users, nb_items), dtype=np.bool)
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples
    )
    test_users, test_items = get_local_test_data(
        pos_test_users, pos_test_items
    )

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
        )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None,))
    items = tf.placeholder(tf.int32, shape=(None,))
    labels = tf.placeholder(tf.int32, shape=(None,))
    is_dup = tf.placeholder(tf.float32, shape=(None,))
    dropout = tf.placeholder_with_default(args.dropout, shape=())
    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'val_batch_size': args.valid_negative+1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL'
    )
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manual initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
            in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
            sess.run(
                eval_op,
                feed_dict={
                    users: user_batch,
                    items: item_batch,
                    is_dup: dup_batch,
                    dropout: 0.0
                }
            )
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
            dllogger.log(step=tuple(), data={'eval_throughput': eval_throughput,
                                             'eval_time': eval_duration,
                                             'hr@10': float(hit_rate),
                                             'ndcg': float(ndcg)})
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
            in zip(data_generator.train_users_batches,
                   data_generator.train_items_batches,
                   data_generator.train_labels_batches):
            sess.run(
                train_op,
                feed_dict={
                    users: user_batch.get(),
                    items: item_batch.get(),
                    labels: label_batch.get()
                }
            )
        train_duration = time.time() - train_start
        # Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)
        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
                sess.run(
                    eval_op,
                    feed_dict={
                        users: user_batch,
                        items: item_batch,
                        is_dup: dup_batch,
                        dropout: 0.0
                    }
                )
            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers

            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)

            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start
            # Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                dllogger.log(step=(epoch,), data={
                                'train_time': train_duration,
                                'eval_time': eval_duration,
                                'hr@10': hit_rate,
                                'ndcg': ndcg})

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best = time.time() - begin_train
                    if hit_rate > args.target and final_checkpoint_path:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0]*(args.negative_samples+1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0]*(args.valid_negative+1) / eval_times

        dllogger.log(step=tuple(), data={
            'average_train_time_per_epoch': np.mean(train_times),
            'average_train_throughput': np.mean(train_throughputs),
            'average_eval_time_per_epoch': np.mean(eval_times),
            'average_eval_throughput': np.mean(eval_throughputs),
            'first_epoch_to_hit': first_to_target,
            'time_to_train': time_to_train,
            'time_to_best': time_to_best,
            'best_hr': best_hr,
            'best_epoch': best_epoch})
        dllogger.flush()

    sess.close()
    return
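# Sketch (not part of the original script): the per-epoch metric aggregation
# above reduces local numpy buffers with mpi4py, while the test-mode branch
# uses hvd.allreduce(..., average=False); the two patterns are interchangeable
# for summing per-worker counters.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_sum = np.array([123.0])   # e.g. this worker's hit-rate numerator
local_cnt = np.array([456.0])   # e.g. this worker's hit-rate denominator
global_sum = np.zeros(1)
global_cnt = np.zeros(1)
# Default op is MPI.SUM and default root is rank 0, matching the code above.
comm.Reduce(local_sum, global_sum)
comm.Reduce(local_cnt, global_cnt)
if comm.Get_rank() == 0:
    print('hr@10 =', global_sum[0] / global_cnt[0])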
Beispiel #42
0
def main(_):

    if FLAGS.strategy == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        logging.info('Use horovod with multi gpus')
        hvd.init()
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
    import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
    tf.enable_v2_tensorshape()
    tf.disable_eager_execution()

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor, while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, both terms are used below.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)

    if FLAGS.strategy == 'horovod':
        model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    else:
        model_dir = FLAGS.model_dir

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    use_tpu = (FLAGS.strategy == 'tpu')
    logging.info(params)

    def _train(steps):
        """Build train estimator and run training if steps > 0."""
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            is_training=True,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=max_instances_per_image),
                              max_steps=steps)

    def _eval(steps):
        """Build estimator and eval the latest checkpoint if steps > 0."""
        eval_params = dict(
            params,
            strategy=FLAGS.strategy,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        eval_results = eval_estimator.evaluate(input_fn=dataloader.InputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            max_instances_per_image=max_instances_per_image),
                                               steps=steps,
                                               name=FLAGS.eval_name)
        logging.info('Evaluation results: %s', eval_results)
        return eval_results

    # start train/eval flow.
    if FLAGS.mode == 'train':
        total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
        _train(total_examples // FLAGS.train_batch_size)
        if FLAGS.eval_after_training:
            _eval(eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = _eval(eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        epochs_per_cycle = 1  # A higher number reduces graph-construction overhead.
        for e in range(1, config.num_epochs + 1, epochs_per_cycle):
            logging.info('Starting training, epoch: %d.', e)
            _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
            logging.info('Starting evaluation, epoch: %d.', e)
            eval_results = _eval(eval_steps)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode not found.')
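# Worked example of the partitionability rule described in the comment above
# (illustrative standalone check, using the numbers from that comment): with
# input_partition_dims = [1, 4, 2, 1] and image_size = 1536, the level-8 feature
# map is 1536 // 2**8 = 6, and 6 % 4 != 0, so cls_targets_8/box_targets_8 cannot
# be spatially partitioned; level 7 (12 x 12) is the highest partitionable level.
import numpy as np

input_partition_dims = [1, 4, 2, 1]
image_size = 1536
for level in range(3, 10):
    feat = image_size // 2 ** level
    partitionable = bool(np.all(feat % np.array(input_partition_dims) == 0))
    print('level %d: feature size %d -> partitionable: %s'
          % (level, feat, partitionable))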
Beispiel #43
0
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Disable TensorFlow logging.

import tensorflow as tf
import horovod.tensorflow as hvd

# Horovod: initialize Horovod.
# `dims` and `comm` are optional.
hvd.init([2, 4, 4])

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
Beispiel #44
0
 def minibatch(self):
     """
     Returns minibatch of images and labels from TF records file.
     """
     with tf.name_scope('pipeline'):
         ds = tf.data.Dataset.from_generator(self.generator, (tf.int64),
                                             (tf.TensorShape([])))
         if self.mode == 'train':
             max_num_records = self.params['num_epochs'] * self.params[
                 'NUM_EXAMPLES_PER_EPOCH']
             ds = ds.take(max_num_records)
             ds = ds.prefetch(min(1, self.num_samples))
             ds = ds.batch(self.params['batch_size'], drop_remainder=True)
             #ds = ds.map(self.wrapped_decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
             ds = ds.map(self.wrapped_decode)
             iterator = ds.make_one_shot_iterator()
             images, labels = [], []
             for _ in range(self.params['batch_size']):
                 image, label = iterator.get_next()
                 image = tf.reshape(image, self.data_specs['image_shape'])
                 images.append(
                     tf.reshape(image, self.data_specs['image_shape']))
                 labels.append(
                     tf.reshape(label, self.data_specs['label_shape']))
         elif self.mode == 'eval':
             ds = ds.take(self.num_samples)
             ds = ds.batch(self.params['batch_size'], drop_remainder=True)
             ds = ds.map(self.wrapped_decode)
             iterator = ds.make_one_shot_iterator()
             images, labels = [], []
             if self.params[self.mode + '_distort']:
                 print('images will be distorted')
             for _ in range(self.params['batch_size']):
                 image, label = iterator.get_next()
                 image = tf.reshape(image, self.data_specs['image_shape'])
                 images.append(
                     tf.reshape(image, self.data_specs['image_shape']))
                 labels.append(
                     tf.reshape(label, self.data_specs['label_shape']))
         if tf.executing_eagerly():
             images = tf.stack(images)
             labels = tf.stack(labels)
         else:
             images = tf.parallel_stack(images)
             labels = tf.parallel_stack(labels)
         # reshape them to the expected shape:
         labels_newshape = [self.params['batch_size']
                            ] + self.data_specs['label_shape']
         images_newshape = [self.params['batch_size']
                            ] + self.data_specs['image_shape']
         labels = tf.reshape(labels, labels_newshape)
         images = tf.reshape(images, images_newshape)
         #labels = self.image_scaling(labels)
         images = self.image_scaling(images)
     # data augmentation
     if self.params[self.mode + '_distort']:
         with tf.device('/gpu:%i' % hvd.local_rank()):
             if self.params.get('random_crop', False):
                 images = tf.transpose(images, perm=[0, 2, 3, 1])
                 images = self.random_crop_resize(images)
                 images = self.add_noise_image(images)
                 images = tf.transpose(images, perm=[0, 3, 1, 2])
             else:
                 images = self.add_noise_image(images)
     return images, labels
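# Sketch of a more conventional tf.data pipeline (an assumed alternative, not
# this project's code): decode per record and batch inside the dataset, so a
# single get_next() already yields [batch_size, ...] tensors instead of
# stacking batch_size separate get_next() calls as minibatch() does above.
# decode_fn is assumed to map one generator element to an (image, label) pair.
import tensorflow as tf


def make_batch(record_generator, decode_fn, batch_size, image_shape, label_shape):
    ds = tf.data.Dataset.from_generator(record_generator, tf.int64,
                                        tf.TensorShape([]))
    ds = ds.repeat()
    ds = ds.map(decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.prefetch(1)
    images, labels = ds.make_one_shot_iterator().get_next()
    images = tf.reshape(images, [batch_size] + list(image_shape))
    labels = tf.reshape(labels, [batch_size] + list(label_shape))
    return images, labels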
Beispiel #45
0
def train_ffn(model_cls, **model_kwargs):
    hvd.init()
    rank = hvd.rank()  # cache the global rank; also used below for chief-only work
    logging.info('Rank: %d' % rank)
    with tf.Graph().as_default():
        # The constructor might define TF ops/placeholders, so it is important
        # that the FFN is instantiated within the current context.
        model = model_cls(**model_kwargs)
        eval_shape_zyx = train_eval_size(model).tolist()[::-1]

        eval_tracker = EvalTracker(eval_shape_zyx)
        load_data_ops = h5_distributed_dataset(model, queue_batch=1)
        print(load_data_ops)

        prepare_ffn(model)
        merge_summaries_op = tf.summary.merge_all()

        if FLAGS.task == 0:
            save_flags()

        hooks = [
            # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
            # from rank 0 to all other processes. This is necessary to ensure consistent
            # initialization of all workers when training is started with random weights
            # or restored from a checkpoint.
            hvd.BroadcastGlobalVariablesHook(0),

            # Horovod: adjust number of steps based on number of GPUs.
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps // hvd.size()),
        ]

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        checkpoint_dir = FLAGS.train_dir if hvd.rank() == 0 else None
        summary_writer = None
        saver = tf.train.Saver(max_to_keep=None,
                               keep_checkpoint_every_n_hours=24)
        scaffold = tf.train.Scaffold(saver=saver)

        with tf.train.MonitoredTrainingSession(master=FLAGS.master,
                                               is_chief=(FLAGS.task == 0),
                                               checkpoint_dir=checkpoint_dir,
                                               hooks=hooks,
                                               save_checkpoint_secs=30,
                                               save_summaries_steps=None,
                                               config=config,
                                               scaffold=scaffold) as sess:
            eval_tracker.sess = sess

            step = int(sess.run(model.global_step))

            if FLAGS.task > 0:
                # To avoid early instabilities when using multiple replicas, we use
                # a launch schedule where new replicas are brought online gradually.
                logging.info('Delaying replica start.')
                while step < FLAGS.replica_step_delay * FLAGS.task:
                    time.sleep(5.0)

            if rank == 0:
                summary_writer = tf.summary.FileWriterCache.get(
                    FLAGS.train_dir)
                summary_writer.add_session_log(
                    tf.summary.SessionLog(status=tf.summary.SessionLog.START),
                    step)

            fov_shifts = list(model.shifts)  # x, y, z
            if FLAGS.shuffle_moves:
                random.shuffle(fov_shifts)

            policy_map = {
                'fixed': partial(fixed_offsets, fov_shifts=fov_shifts),
                'max_pred_moves': max_pred_offsets
            }
            batch_it = get_batch(
                lambda: sess.run(load_data_ops),
                eval_tracker,
                model,
                FLAGS.batch_size,
                # eval_tracker, model, 1,
                policy_map[FLAGS.fov_policy])

            t_last = time.time()
            while not sess.should_stop() and step < FLAGS.max_steps:
                # Run summaries periodically.
                t_curr = time.time()
                if t_curr - t_last > FLAGS.summary_rate_secs and FLAGS.task == 0:
                    summ_op = merge_summaries_op
                    t_last = t_curr
                else:
                    summ_op = None

                seed, patches, labels, weights = next(batch_it)

                updated_seed, step, summ = run_training_step(
                    sess,
                    model,
                    summ_op,
                    feed_dict={
                        model.loss_weights: weights,
                        model.labels: labels,
                        model.input_patches: patches,
                        model.input_seed: seed,
                    })

                # Save prediction results in the original seed array so that
                # they can be used in subsequent steps.
                mask.update_at(seed, (0, 0, 0), updated_seed)

                # Record summaries.
                if hvd.rank() == 0 and summ is not None:
                    logging.info('Saving summaries.')
                    summ = tf.Summary.FromString(summ)

                    # Compute a loss over the whole training patch (i.e. more than a
                    # single-step field of view of the network). This quantifies the
                    # quality of the final object mask.
                    summ.value.extend(eval_tracker.get_summaries())
                    eval_tracker.reset()

                    assert summary_writer is not None
                    summary_writer.add_summary(summ, step)

        if summary_writer is not None:
            summary_writer.flush()