Example #1
def main(hps):

    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Download and load dataset.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
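
The tensorflow_session() helper used above is not shown in this snippet. A minimal sketch of what such a helper typically looks like in Horovod code, assuming the usual one-GPU-per-process pinning (the original helper may differ):

import tensorflow as tf
import horovod.tensorflow as hvd

def tensorflow_session():
    # Pin this process to the GPU matching its Horovod local rank and
    # let GPU memory grow on demand, as in the other examples on this page.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    return tf.Session(config=config)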
Example #2
def get_data(hps, sess):
    if hps.image_size == -1:
        hps.image_size = {'mnist': 32, 'cifar10': 32, 'imagenet-oord': 64,
                          'imagenet': 256, 'celeba': 256, 'lsun_realnvp': 64, 'lsun': 256}[hps.problem]
    if hps.n_test == -1:
        hps.n_test = {'mnist': 10000, 'cifar10': 10000, 'imagenet-oord': 50000, 'imagenet': 50000,
                      'celeba': 3000, 'lsun_realnvp': 300*hvd.size(), 'lsun': 300*hvd.size()}[hps.problem]
    hps.n_y = {'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
               'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1, 'lsun': 1}[hps.problem]
    if hps.data_dir == "":
        hps.data_dir = {'mnist': None, 'cifar10': None, 'imagenet-oord': '/mnt/host/imagenet-oord-tfr', 'imagenet': '/mnt/host/imagenet-tfr',
                        'celeba': '/mnt/host/celeba-reshard-tfr', 'lsun_realnvp': '/mnt/host/lsun_realnvp', 'lsun': '/mnt/host/lsun'}[hps.problem]

    if hps.problem == 'lsun_realnvp':
        hps.rnd_crop = True
    else:
        hps.rnd_crop = False

    if hps.category:
        hps.data_dir += ('/%s' % hps.category)

    # Use anchor_size to rescale batch size based on image_size
    s = hps.anchor_size
    hps.local_batch_train = hps.n_batch_train * \
        s * s // (hps.image_size * hps.image_size)
    hps.local_batch_test = {64: 50, 32: 25, 16: 10, 8: 5, 4: 2, 2: 2, 1: 1}[
        hps.local_batch_train]  # round down to closest divisor of 50
    hps.local_batch_init = hps.n_batch_init * \
        s * s // (hps.image_size * hps.image_size)

    print("Rank {} Batch sizes Train {} Test {} Init {}".format(
        hvd.rank(), hps.local_batch_train, hps.local_batch_test, hps.local_batch_init))

    if hps.problem in ['imagenet-oord', 'imagenet', 'celeba', 'lsun_realnvp', 'lsun']:
        hps.direct_iterator = True
        import data_loaders.get_data as v
        train_iterator, test_iterator, data_init = \
            v.get_data(sess, hps.data_dir, hvd.size(), hvd.rank(), hps.pmap, hps.fmap, hps.local_batch_train,
                       hps.local_batch_test, hps.local_batch_init, hps.image_size, hps.rnd_crop)

    elif hps.problem in ['mnist', 'cifar10']:
        hps.direct_iterator = False
        import data_loaders.get_mnist_cifar as v
        train_iterator, test_iterator, data_init = \
            v.get_data(hps.problem, hvd.size(), hvd.rank(), hps.dal, hps.local_batch_train,
                       hps.local_batch_test, hps.local_batch_init, hps.image_size)

    else:
        raise Exception()

    return train_iterator, test_iterator, data_init
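
A quick sanity check of the anchor_size rescaling above, with purely illustrative numbers:

# With anchor_size = 32, n_batch_train = 64 and image_size = 64:
#   local_batch_train = 64 * 32 * 32 // (64 * 64) = 16
#   local_batch_test  = {64: 50, 32: 25, 16: 10, ...}[16] = 10
# i.e. the per-GPU batch shrinks as the image resolution grows.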
Example #3
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
        one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
        gpu_config = tf.ConfigProto(gpu_options=one_gpu)
        with self.test_session(config=gpu_config) as session:
            with tf.device(device):
                # Same rank, different dimension
                dims = [17] * 3
                tensor = tf.ones(dims, dtype=tf.int32)
                with self.assertRaises(tf.errors.FailedPreconditionError):
                    session.run(hvd.allreduce(tensor))
Example #4
    def test_horovod_broadcast(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
                try:
                    tensor = tf.ones([17] * dim) * rank
                    root_tensor = tf.ones([17] * dim) * root_rank
                    if dtype == tf.bool:
                        tensor = tensor % 2
                        root_tensor = root_tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    root_tensor = tf.cast(root_tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                    self.assertTrue(
                        session.run(tf.reduce_all(tf.equal(
                            tf.cast(root_tensor, tf.int32), tf.cast(broadcasted_tensor, tf.int32)))),
                        "hvd.broadcast produces incorrect broadcasted tensor")
                except Exception:
                    import traceback
                    traceback.print_exc()
Example #5
    def test_horovod_allreduce_cpu(self):
        """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        with self.test_session() as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/cpu:0"):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    break

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "hvd.allreduce produces incorrect results")
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(
                    dtypes, dims, root_ranks):
                tensor = tf.ones([5] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                c = size if rank == root_rank else 0
                expected = np.ones([5] * dim) * c
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" % (grad_out, expected, str(err)))
    def test_horovod_allreduce_grad(self):
        """Test the correctness of the allreduce gradient."""
        hvd.init()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/cpu:0"):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [5] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                grad = tf.gradients(summed, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones([5] * dim) * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" % (grad_out, expected, str(err)))
Example #8
    def test_horovod_allreduce_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different rank or dimension."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Same rank, different dimension
            tf.set_random_seed(1234)
            dims = [17 + rank] * 3
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))

            # Same number of elements, different rank
            tf.set_random_seed(1234)
            if rank == 0:
                dims = [17, 23 * 57]
            else:
                dims = [17, 23, 57]
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #9
def adam2_old(params, cost_or_grads, lr=3e-4, mom1=0.9, mom2=0.999, epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    # all-reduce
    grads1 = [Z.allreduce_mean(g) for g in gs]
    grads2 = [Z.allreduce_mean(tf.square(g)) for g in gs]
    mom2 = tf.maximum(0., 1. - (hvd.size() * (1 - mom2)))

    t = tf.Variable(1., 'adam_t')
    lr_t = lr * tf.sqrt((1. - tf.pow(mom2, t))) / (1. - tf.pow(mom1, t))
    updates.append(t.assign_add(1))

    for p, g1, g2 in zip(params, grads1, grads2):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g1
            updates.append(v.assign(v_t))
        else:
            v_t = g1
        mg_t = mom2 * mg + (1. - mom2) * g2
        delta_t = v_t / (tf.sqrt(mg_t) + epsilon)
        p_t = p - lr_t * delta_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    return tf.group(*updates)
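
Z.allreduce_mean is not defined in this snippet. A minimal sketch of what such a helper could look like, assuming it simply wraps Horovod's averaging allreduce (the actual tfops module may do more):

import horovod.tensorflow as hvd

def allreduce_mean(x):
    # Average the tensor across all Horovod ranks.
    return hvd.allreduce(x, average=True)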
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
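
run() expects a benchmark_step callable plus module-level args, device and log objects (shown in a later example). A hypothetical way to build such a step for TF1 graph mode, assuming hvd.DistributedOptimizer; the variable and loss here are placeholders for a real model:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# Toy "model": a single variable with a quadratic loss, purely illustrative.
x = tf.Variable(tf.random_normal([1000, 1000]))
loss = tf.reduce_sum(tf.square(x))
opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(1e-3))
train_op = opt.minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

def benchmark_step():
    sess.run(train_op)
# benchmark_step can then be passed to run(benchmark_step) above.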
Example #12
def init_config():
    if config.TRAINER == 'horovod':
        ngpu = hvd.size()
    else:
        ngpu = get_num_gpu()
    assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
    if config.NUM_GPUS is None:
        config.NUM_GPUS = ngpu
    else:
        if config.TRAINER == 'horovod':
            assert config.NUM_GPUS == ngpu
        else:
            assert config.NUM_GPUS <= ngpu
    print_config()
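
The assert above accepts GPU counts that either divide 8 or are a multiple of 8:

# ngpu in {1, 2, 4, 8, 16, 24, 32, ...}  passes
# ngpu = 3, 6 or 12                      fails the assert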
Example #13
    def test_horovod_broadcast_rank_error(self):
        """Test that the broadcast returns an error if different ranks
        specify different root rank."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor = tf.ones([17] * 3, dtype=tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, rank))
Example #14
    def on_batch_begin(self, batch, logs=None):
        if self.current_epoch > self.warmup_epochs:
            # Outside of adjustment scope.
            return

        if self.current_epoch == self.warmup_epochs and batch > 0:
            # Outside of adjustment scope, final adjustment is done on first batch.
            return

        old_lr = K.get_value(self.model.optimizer.lr)
        epoch = self.current_epoch + float(batch) / self.steps_per_epoch
        new_lr = self.initial_lr / hvd.size() * \
            (epoch * (hvd.size() - 1) / self.warmup_epochs + 1)
        K.set_value(self.model.optimizer.lr, new_lr)

        if self.current_epoch == self.warmup_epochs and self.verbose:
            print('Epoch %d: finished gradual learning rate warmup to %s.' %
                  (epoch + 1, new_lr))

        if hasattr(self.model.optimizer, 'momentum') and self.momentum_correction:
            # See the paper cited above for more information about momentum correction.
            self.restore_momentum = K.get_value(self.model.optimizer.momentum)
            K.set_value(self.model.optimizer.momentum,
                        self.restore_momentum * new_lr / old_lr)
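
This callback matches the gradual learning-rate warmup shipped with Horovod's Keras API. A hypothetical way to attach it, assuming the horovod.keras callbacks of the same era:

import horovod.keras as hvd

hvd.init()
callbacks = [
    # Broadcast initial weights from rank 0 so all workers start identically.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    # Ramp the learning rate up over the first epochs, as implemented above.
    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
]
# model.fit(..., callbacks=callbacks) would then apply the warmup per batch.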
Example #15
    def test_horovod_allgather_variable_size(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
        even if those tensors have different sizes along the first dim."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                # Support tests up to MPI Size of 35
                if size > 35:
                    break

                tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                expected_size = sum(tensor_sizes)
                self.assertEqual(list(gathered_tensor.shape),
                                 [expected_size] + [17] * (dim - 1))

                for i in range(size):
                    rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                    rank_tensor = tf.slice(
                        gathered, [sum(tensor_sizes[:i])] + [0] * (dim - 1),
                        rank_size)
                    self.assertEqual(list(rank_tensor.shape), rank_size)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    if dtype != tf.bool:
                        value = i
                    else:
                        value = i % 2
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #16
    def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0))
Example #17
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
        the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            tensor_size[1] = 10 * (rank + 1)
            tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0))
Example #18
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        two_gpus = tf.GPUOptions(visible_device_list=(
            '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
        gpu_config = tf.ConfigProto(gpu_options=two_gpus)
        with self.test_session(config=gpu_config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                iter += 1
                with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "hvd.allreduce on GPU produces incorrect results")
Example #19
    def test_horovod_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Same rank, different dimension
            dims = [17] * 3
            tensor = tf.ones(dims,
                             dtype=tf.int32 if rank % 2 == 0 else tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #20
 def init_by_config(self, config):
   """
   :param Config.Config config:
   """
   logs = config.list('log', [])
   log_verbosity = config.int_list('log_verbosity', [])
   log_format = config.list('log_format', [])
   if config.is_true("use_horovod"):
     # noinspection PyPackageRequirements,PyUnresolvedReferences
     import horovod.tensorflow as hvd
     from TFUtil import init_horovod
     init_horovod()  # make sure it is initialized
     new_logs = []
     for fn in logs:
       fn_prefix, fn_ext = os.path.splitext(fn)
       fn_ext = ".horovod-%i-%i%s" % (hvd.rank(), hvd.size(), fn_ext)
       new_logs.append(fn_prefix + fn_ext)
     logs = new_logs
   self.initialize(logs=logs, verbosity=log_verbosity, formatter=log_format)
    def test_horovod_allreduce_gpu_fused(self):
        """Test that the allreduce works on GPUs with Tensor Fusion.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
            dims = [1, 2, 3]
            tests = []
            for dtype, dim in itertools.product(dtypes, dims):
                with tf.device("/gpu:%d" % local_rank):
                    tf.set_random_seed(1234)
                    tensor = tf.random_uniform(
                        [17] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3 or dtype in [tf.int32, tf.int64]:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    return

                test = max_difference <= threshold
                tests.append(test)
            self.assertTrue(session.run(tf.reduce_all(tests)),
                            "hvd.allreduce produces incorrect results")
Example #22
def get_its(hps):
    # These run for a fixed amount of time. As anchored batch is smaller, we've actually seen fewer examples
    train_its = int(np.ceil(hps.n_train / (hps.n_batch_train * hvd.size())))
    test_its = int(np.ceil(hps.n_test / (hps.n_batch_train * hvd.size())))
    train_epoch = train_its * hps.n_batch_train * hvd.size()

    # Do a full validation run
    if hvd.rank() == 0:
        print(hps.n_test, hps.local_batch_test, hvd.size())
    assert hps.n_test % (hps.local_batch_test * hvd.size()) == 0
    full_test_its = hps.n_test // (hps.local_batch_test * hvd.size())

    if hvd.rank() == 0:
        print("Train epoch size: " + str(train_epoch))
    return train_its, test_its, full_test_its
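
Illustrative arithmetic for the iteration counts above (hypothetical numbers):

# With n_train = 50000, n_batch_train = 64 and hvd.size() = 8:
#   train_its   = ceil(50000 / (64 * 8)) = 98
#   train_epoch = 98 * 64 * 8 = 50176 examples per pass
# n_test must additionally divide evenly by local_batch_test * hvd.size(),
# which is what the assert above enforces.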
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones(
                    [tensor_sizes[rank]] + [17] * (dim - 1)
                ) * rank * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" %
                                (grad_out, expected, str(err)))
Example #24
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult

    if is_training:
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
        if _C.TRAIN.NUM_GPUS is None:
            _C.TRAIN.NUM_GPUS = ngpu
        else:
            if _C.TRAINER == 'horovod':
                assert _C.TRAIN.NUM_GPUS == ngpu
            else:
                assert _C.TRAIN.NUM_GPUS <= ngpu
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    logger.info("Config: ------------------------------------------\n" + str(_C))
Example #25
    def get_gradients(self, loss, params):
        """
        Compute gradients of all trainable variables.

        See Optimizer.get_gradients() for more info.

        In DistributedOptimizer, get_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        gradients = super(self.__class__, self).get_gradients(loss, params)
        if hvd.size() > 1:
            averaged_gradients = []
            with tf.name_scope(self._name + "_Allreduce"):
                for grad in gradients:
                    if grad is not None:
                        avg_grad = hvd.allreduce(grad, device_dense=self._device_dense,
                                                 device_sparse=self._device_sparse)
                        averaged_gradients.append(avg_grad)
                    else:
                        averaged_gradients.append(None)
                return averaged_gradients
        else:
            return gradients
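
A hypothetical usage of the pattern above with Horovod's Keras wrapper, assuming horovod.keras and a standard Keras optimizer:

import keras
import horovod.keras as hvd

hvd.init()
# Scale the learning rate by the number of workers, then wrap the optimizer
# so that get_gradients() allreduces the gradients as shown above.
opt = keras.optimizers.SGD(lr=0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)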
Example #26
 def get_current_step_learning_rate(self):
   """
   :rtype: tf.Tensor
   """
   lr = self.learning_rate_var
   if self.config.typed_dict.get("dynamic_learning_rate"):
     # To implement any kind of cyclic learning rate during the epoch. E.g.: https://arxiv.org/abs/1608.03983
     with tf.name_scope("dynamic_learning_rate"):
       from Util import CollectionReadCheckCovered
       opts = CollectionReadCheckCovered(self.config.typed_dict["dynamic_learning_rate"])
       # Currently all intervals of same step size.
       interval_steps = tf.constant(opts["interval"], name="interval", dtype=self.network.global_train_step.dtype)
       step_in_interval = tf.mod(self.network.global_train_step, interval_steps, name="step_in_interval")
       factor = tf.pow(
         tf.constant(opts["decay"], name="decay", dtype=tf.float32),
         tf.to_float(step_in_interval, name="step_in_interval_float"), name="factor")
       lr *= factor
       opts.assert_all_read()
   if self.config.is_true("use_horovod") and self.config.is_true("horovod_scale_lr"):
     # noinspection PyPackageRequirements,PyUnresolvedReferences
     import horovod.tensorflow as hvd
     lr *= hvd.size()
   return lr
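
A hypothetical RETURNN-style config fragment exercising the branches above; the key names mirror the reads in the code (interval, decay, use_horovod, horovod_scale_lr):

# Decay the learning rate by 0.9 every 1000 global train steps within an epoch.
dynamic_learning_rate = {"interval": 1000, "decay": 0.9}
# Scale the base learning rate by hvd.size() when running under Horovod.
use_horovod = True
horovod_scale_lr = True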
Example #27
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor = tf.ones([17] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                self.assertEqual(list(gathered_tensor.shape),
                                 [17 * size] + [17] * (dim - 1))

                for i in range(size):
                    rank_tensor = tf.slice(gathered_tensor,
                                           [i * 17] + [0] * (dim - 1),
                                           [17] + [-1] * (dim - 1))
                    self.assertEqual(list(rank_tensor.shape), [17] * dim)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    if dtype != tf.bool:
                        value = i
                    else:
                        value = i % 2
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #28
            loss, metrics = evaluate_step(samples)
            if batch % self.hparams.log_interval == 0 and hvd.local_rank() == 0:
                logging.info(self.metric_checker(loss, metrics, -2))
            loss_metric.update_state(loss)
        if hvd.local_rank() == 0:
            logging.info(
                self.metric_checker(loss_metric.result(),
                                    metrics,
                                    evaluate_epoch=epoch))
            self.model.reset_metrics()
        return loss_metric.result(), metrics


if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) < 2:
        logging.warning('Usage: python {} config_json_file'.format(
            sys.argv[0]))
        sys.exit()
    tf.random.set_seed(1)

    json_file = sys.argv[1]
    #config = None
    #with open(json_file) as f:
    #    config = json.load(f)
    #p = parse_config(config)
    HorovodSolver.initialize_devices()
    # multi-server training should use hvd.rank()
    train(json_file, HorovodSolver, hvd.size(), hvd.rank())
def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))

    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "cla": ClaProcessor,
        "pair": PairProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25,
        session_config=config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        num_train_steps = num_train_steps // hvd.size()
        num_warmup_steps = num_warmup_steps // hvd.size()

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        #######################################################################################################################
        # evaluate all checkpoints; you can use the checkpoint with the best dev accuracy
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        print("output_eval_file:", output_eval_file)
        tf.logging.info("output_eval_file:" + output_eval_file)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for global_step, filename in sorted(steps_and_files,
                                                key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)

                tf.logging.info("***** Eval results %s *****" % (filename))
                writer.write("***** Eval results %s *****\n" % (filename))
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        #######################################################################################################################

        # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        #
        # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        # with tf.gfile.GFile(output_eval_file, "w") as writer:
        #     tf.logging.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         tf.logging.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        true_labels = []
        with open(os.path.join(FLAGS.data_dir, "test.tsv"),
                  'r',
                  encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                true_labels.append(int(line.split('\t')[0]))

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        predictions = []
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                a = probabilities.tolist()
                predictions.append(a.index(max(a)))
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        count = 0
        for i in range(len(predictions)):
            if predictions[i] == true_labels[i]:
                count += 1
        print("Average accuracy: ", count / len(predictions))

        with open(os.path.join(FLAGS.data_dir, "id2label.json"),
                  'r',
                  encoding='utf-8') as f:
            ld2label = json.load(f)

        cla_labels = [i for i in range(FLAGS.cla_nums)]
        report = metrics.classification_report(
            y_true=true_labels,
            y_pred=predictions,
            labels=cla_labels,
            target_names=[ld2label[str(i)].split()[0] for i in cla_labels],
            digits=4)

        confusion_matrix = metrics.confusion_matrix(y_true=true_labels,
                                                    y_pred=predictions,
                                                    labels=cla_labels)
        print(report)
        print(confusion_matrix)
        with open(os.path.join(FLAGS.output_dir, "eval_report.txt"),
                  'w',
                  encoding='utf-8') as f:
            f.write(report)
Example #30
def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)
Example #31
def parallax_run_hybrid(single_gpu_meta_graph_def,
                        config):
    # Initialize horovod
    hvd.init()
    #worker_id = hvd.rank()
    local_worker_id = hvd.local_rank()

    num_workers = hvd.size()
    machine_id, hostname = _get_worker_info()
    create_profile_directory(config.profile_config.profile_dir,
                             config.profile_config.profile_worker,
                             config.resource_info, hostname)

    sess_config = config.sess_config
    if sess_config is None:
        sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    cluster_spec = get_tf_clusterspec_for_hybrid(config.resource_info)
    worker_id = 0
    for i in range(machine_id):
      worker_id += len(config.resource_info['worker'][i]['gpus'])
    worker_id += hvd.local_rank()
    if config.profile_config.profile_dir:
        for ps_i, ps in enumerate(config.resource_info['ps']):
            if ps['hostname'] == hostname:
                if local_worker_id == 0:
                    tasks = ['ps:%d'%ps_i, 'worker:%d'%worker_id]
                else:
                    tasks = ['worker:%d'%worker_id]
                append_task_info(config.profile_config.profile_dir,
                                 hostname, 
                                 tasks)
                break
    server = tf.train.Server(cluster_spec, job_name='worker',
                             task_index=worker_id, protocol=config.communication_config.ps_config.protocol,
                             config=sess_config)
    
    meta_graph_def, tensor_or_op_name_to_replica_names = graph_transform_hybrid(
        single_gpu_meta_graph_def,
        worker_id,
        local_worker_id,
        machine_id,
        hostname,
        config)

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)

        tf.train.import_meta_graph(meta_graph_def)
        if config.export_graph_path:
            export_meta_graph(config.export_graph_path, worker_id)
        if config.profile_config.profile_dir:
            path = os.path.join(config.profile_config.profile_dir, hostname,
                                'worker:%d'%worker_id)
            export_meta_graph(path, worker_id)

            if worker_id != config.profile_config.profile_worker:
                # Only one CUPTI profiler can run on a machine.
                # See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
                config.profile_config.profile_dir = None
            else:
                config.profile_config.profile_dir = \
                    os.path.join(config.profile_config.profile_dir, hostname,
                                 'worker:%d'%worker_id, 'run_meta')
        ckpt_hooks = \
            build_ckpt_hooks(config.get_ckpt_config()) \
            if worker_id == 0 else None

        sess = tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=True,
                checkpoint_dir=config.get_ckpt_config().ckpt_dir if worker_id == 0 else None,
                # TODO: Allow user-defined hooks
                hooks=None,
                chief_only_hooks=ckpt_hooks,
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None,
                config=sess_config)

        parallax_log.debug(
            "Created MonitoredTrainingSession for worker %d" % worker_id)
        _init_global_vars(sess)
        parallax_log.debug(
            "Finished initialization process, start training on worker %d"
            % worker_id)

        step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0])
        sess_context = \
            ParallaxSessionContext(step,
                                   config.profile_config.profile_dir,
                                   config.profile_config.profile_steps,
                                   config.profile_config.profile_range,
                                   tensor_or_op_name_to_replica_names,
                                   1)
        sess_context.set_parallax_session_context()
        return sess, num_workers, worker_id, 1
Example #32
                session_init=get_model_loader(args.load),
                input_names=MODEL.get_inference_tensor_names()[0],
                output_names=MODEL.get_inference_tensor_names()[1])
            if args.predict:
                predictor = OfflinePredictor(predcfg)
                for image_file in args.predict:
                    do_predict(predictor, image_file)
            elif args.evaluate:
                assert args.evaluate.endswith('.json'), args.evaluate
                do_evaluate(predcfg, args.evaluate)
    else:
        is_horovod = cfg.TRAINER == 'horovod'
        if is_horovod:
            hvd.init()
            logger.info("Horovod Rank={}, Size={}".format(
                hvd.rank(), hvd.size()))

        if not is_horovod or hvd.rank() == 0:
            logger.set_logger_dir(args.logdir, 'd')
        logger.info("Environment Information:\n" + collect_env_info())

        finalize_configs(is_training=True)
        stepnum = cfg.TRAIN.STEPS_PER_EPOCH

        # warmup is step based, lr is epoch based
        init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
        warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
        warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
        lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]

        factor = 8. / cfg.TRAIN.NUM_GPUS
def main(args):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data(
        "MNIST-data-%d" % hvd.rank()
    )

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    if not args.use_only_cpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        estimator_config = tf.estimator.RunConfig(session_config=config)
    else:
        estimator_config = None

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = args.model_dir if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir, config=estimator_config
    )

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True
    )

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn, steps=args.num_steps // hvd.size(), hooks=[bcast_hook]
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
    )
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
    )

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 28, 28, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
    )

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 7, 7, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 64]
    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
    )

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Horovod: scale learning rate by the number of workers.
        optimizer = tf.train.MomentumOptimizer(learning_rate=0.001 * hvd.size(), momentum=0.9)

        # Horovod: add Horovod Distributed Optimizer.
        optimizer = hvd.DistributedOptimizer(optimizer)

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
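The Estimator example above applies Horovod's linear scaling rule twice: the learning rate is multiplied by hvd.size() in cnn_model_fn, and the number of training steps is divided by hvd.size() when calling train(), so each worker does less work while the effective global batch grows. A minimal standalone sketch of that arithmetic, with assumed values for the worker count, base learning rate, and step budget:

# Illustrative only; these values are assumptions, not taken from the example above.
num_workers = 4                                         # what hvd.size() would return
base_lr = 0.001
single_worker_steps = 20000

scaled_lr = base_lr * num_workers                       # larger effective batch -> larger LR
per_worker_steps = single_worker_steps // num_workers   # each worker runs fewer steps
print(scaled_lr, per_worker_steps)                      # 0.004 5000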
Exemple #35
0
 def multiplier(epoch):
     # Adjust epoch to produce round numbers at the end of each epoch, so that TensorBoard
     # learning rate graphs look better.
     epoch += 1. / self.steps_per_epoch
     return 1. / hvd.size() * (epoch *
                               (hvd.size() - 1) / warmup_epochs + 1)
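The fragment above is a warmup multiplier of the kind passed to Horovod's learning-rate warmup callbacks: it rises from roughly 1/hvd.size() at the first epoch to 1.0 after warmup_epochs, so a learning rate that was pre-scaled by hvd.size() eases in from its single-worker value. A standalone sketch with assumed values for hvd.size(), steps_per_epoch, and warmup_epochs:

# Illustrative only; hvd_size, steps_per_epoch and warmup_epochs are assumptions.
hvd_size = 4
steps_per_epoch = 100
warmup_epochs = 5.0

def multiplier(epoch):
    # Small offset so the curve lands on round values at epoch boundaries.
    epoch += 1.0 / steps_per_epoch
    return 1.0 / hvd_size * (epoch * (hvd_size - 1) / warmup_epochs + 1)

for epoch in range(6):
    print(epoch, round(multiplier(epoch), 3))    # ~0.25 at epoch 0 -> ~1.0 at epoch 5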
Exemple #36
0
def main(_):
    """
    Builds the GPT-2 model and runs training, evaluation, and/or test as specified by the flags.
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    # Loads GPT-2 model configuration

    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    # Creates a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder(FLAGS.pretrain_model_dir)

    max_decoding_length = config_train.max_decoding_length
    assert max_decoding_length <= gpt2_config.position_size, (
        "max_decoding_length should not be greater than position_size. "
        "{}>{}".format(max_decoding_length, gpt2_config.position_size))

    # Loads data

    # Configures training data shard in distributed mode
    if FLAGS.distributed:
        config_train.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_train.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_train.train_hparam["batch_size"] //= hvd.size()

    datasets = {}
    if FLAGS.do_train:
        train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam)
        datasets['train'] = train_dataset
    if FLAGS.do_eval:
        dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam)
        datasets['dev'] = dev_dataset
    if FLAGS.do_test:
        test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam)
        datasets['test'] = test_dataset
    iterator = tx.data.FeedableDataIterator(datasets)
    batch = iterator.get_next()
    batch_size = tf.shape(batch['text_ids'])[0]

    # Builds the GPT-2 model

    word_embedder = tx.modules.WordEmbedder(vocab_size=gpt2_config.vocab_size,
                                            hparams=gpt2_config.embed)

    pos_embedder = tx.modules.PositionEmbedder(
        position_size=gpt2_config.position_size, hparams=gpt2_config.pos_embed)

    # Ties output layer with input word embedding
    output_layer = tf.transpose(word_embedder.embedding, (1, 0))

    decoder = tx.modules.TransformerDecoder(vocab_size=gpt2_config.vocab_size,
                                            output_layer=output_layer,
                                            hparams=gpt2_config.decoder)

    # For training
    seq_len = tf.fill([batch_size], tf.shape(batch['text_ids'])[1])
    pos_embeds = pos_embedder(sequence_length=seq_len)
    input_embeds = word_embedder(batch['text_ids']) + pos_embeds

    outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy')

    loss = tx.losses.sequence_sparse_softmax_cross_entropy(
        labels=batch['text_ids'][:, 1:],
        logits=outputs.logits[:, :-1, :],
        sequence_length=batch['length'] - 1,
        average_across_timesteps=True,
        sum_over_timesteps=False)
    ppl = tf.exp(loss)

    global_step = tf.Variable(0, trainable=False)
    opt = tx.core.get_optimizer(global_step=global_step,
                                hparams=config_train.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt)

    # For generation: generates continuations of test text
    def _embedding_fn(x, y):
        # `x` is token ids, `y` is time steps
        return word_embedder(x) + pos_embedder(y)

    end_token = proc.encoder['<|endoftext|>']
    start_tokens = batch['text_ids'][:, 0]
    helper = tx.modules.TopKSampleEmbeddingHelper(
        embedding=_embedding_fn,
        start_tokens=start_tokens,
        end_token=end_token,
        top_k=FLAGS.top_k,
        softmax_temperature=FLAGS.temperature)

    outputs_infer, _ = decoder(context=batch['text_ids'],
                               context_sequence_length=batch['length'],
                               max_decoding_length=max_decoding_length,
                               helper=helper)
    sample_id = outputs_infer.sample_id

    # Train/eval/test routine
    saver = tf.train.Saver()
    saver_best = tf.train.Saver(max_to_keep=1)
    dev_best = {'loss': 1e8, 'ppl': 1e8}

    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        fetches = {'loss': train_op, 'step': global_step}

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_train.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f' % (step, rets['loss']))

                eval_steps = config_train.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _dev_epoch(sess)

                ckpt_steps = config_train.checkpoint_steps
                if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0:
                    ckpt_fn = os.path.join(FLAGS.output_dir, 'model.ckpt')
                    ckpt_fn = saver.save(sess, ckpt_fn, global_step=step)
                    tf.logging.info('Checkpoint to {}'.format(ckpt_fn))

            except tf.errors.OutOfRangeError:
                break

    def _dev_epoch(sess):
        """Evaluates on the dev set.
        """
        iterator.restart_dataset(sess, 'dev')

        cum_loss = 0.
        cum_ppl = 0.
        nsamples = 0
        fetches = {
            'loss': loss,
            'ppl': ppl,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'dev'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)

                cum_loss += rets['loss'] * rets['batch_size']
                cum_ppl += rets['ppl'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        avg_loss = cum_loss / nsamples
        avg_ppl = cum_ppl / nsamples
        tf.logging.info('dev loss: {}; ppl: {}; nsamples: {}'.format(
            avg_loss, avg_ppl, nsamples))

        if FLAGS.do_train and avg_loss < dev_best['loss']:
            dev_best['loss'] = avg_loss
            dev_best['ppl'] = avg_ppl
            ckpt_fn = os.path.join(FLAGS.output_dir, 'model_best.ckpt')
            ckpt_fn = saver_best.save(sess, ckpt_fn)
            tf.logging.info('Checkpoint best to {}'.format(ckpt_fn))

    def _test_epoch(sess):
        """Generates samples on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_inputs = []
        _all_samples = []
        fetches = {
            'inputs': batch['text_ids'],
            'length': batch['length'],
            'samples': sample_id
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets = sess.run(fetches, feed_dict=feed_dict)

                _inputs = []
                for i, l in zip(rets['inputs'], rets['length']):
                    # Delete padding
                    _inputs.append(i[:l].tolist())
                _all_inputs.extend(_inputs)

                _samples = []
                for s, l in zip(rets['samples'], rets['length']):
                    # Delete inputs from samples
                    _samples.append(s[l:].tolist())
                _all_samples.extend(_samples)

            except tf.errors.OutOfRangeError:
                break

        # Parse samples and write to file

        eos_token_id = proc.encoder['<|endoftext|>']

        _all_input_text = []
        for i in _all_inputs:
            if i[0] == eos_token_id:
                # '<|endoftext|>' is used as the BOS token. Delete it here
                i = i[1:]
            i_text = proc.decode(i)
            _all_input_text.append(i_text)
        # '<|endoftext|>' is used as the PAD token. Delete them here
        _all_input_text = tx.utils.strip_eos(_all_input_text,
                                             eos_token='<|endoftext|>')

        _all_samples_text = []
        for i, s in zip(_all_inputs, _all_samples):
            s_text = proc.decode(s)
            s_text = s_text.replace('\n', ' ')
            _all_samples_text.append(s_text)
        _all_samples_text = tx.utils.strip_eos(_all_samples_text,
                                               eos_token='<|endoftext|>')

        output_file = os.path.join(FLAGS.output_dir, "test_samples.tsv")
        tf.logging.info('Write samples to {}'.format(output_file))
        tx.utils.write_paired_text(_all_input_text, _all_samples_text,
                                   output_file)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        if FLAGS.checkpoint:
            tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
            saver.restore(sess, FLAGS.checkpoint)
        elif FLAGS.pretrain_checkpoint:
            tf.logging.info('Restore from {}'.format(
                FLAGS.pretrain_checkpoint))
            model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
            print("\nFinished loading\n")

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for _ in range(config_train.max_train_epoch):
                _train_epoch(sess)
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _dev_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
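In the GPT-2 example above, distributed data loading is configured by sharding the training set (num_shards = hvd.size(), shard_id = hvd.rank()) and dividing the batch size by hvd.size(), which keeps the global batch size constant while each worker reads a disjoint slice of the data. A conceptual sketch of that split, with assumed values:

# Illustrative only; the batch size and worker counts are assumptions.
global_batch_size = 32
num_shards = 4                      # hvd.size()
shard_id = 1                        # hvd.rank() on this worker

per_worker_batch = global_batch_size // num_shards    # 8; the global batch stays 32
records = list(range(12))
my_shard = records[shard_id::num_shards]              # this worker's slice of the data
print(per_worker_batch, my_shard)                     # 8 [1, 5, 9]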
Exemple #37
0
def on_state_reset():
    optimizer.lr.assign(lr * hvd.size())
def main(_):
    '''Main routine for the Horovod TensorFlow MNIST example.'''
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(hvd.local_rank()))
    config = tf.ConfigProto(gpu_options=gpu_options)

    batch_size = 100

    # Download and load MNIST dataset.
    if hvd.rank() == 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # hvd.allreduce(tf.constant([0]), average=False)  # Barrier (not working)
    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd_keras.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # Build model...
    # with tf.name_scope('input'):
    #     image = tf.placeholder(tf.float32, [None, 784], name='image')
    #     label = tf.placeholder(tf.float32, [None], name='label')

    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    # global_step = tf.contrib.framework.get_or_create_global_step()
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            # image_, label_ = mnist.train.next_batch(100)
            # mon_sess.run(train_op, feed_dict={image: image_, label: label_})
            mon_sess.run(train_op)
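The example above lets only rank 0 download MNIST and then uses an allreduce as a crude barrier so the remaining ranks wait before reading the shared files. A minimal sketch of that pattern with horovod.tensorflow, assuming Horovod and TF1-style sessions are available:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

if hvd.rank() == 0:
    pass  # download / extract the shared dataset here, on one rank only

# Every rank must participate in the allreduce, so no rank can pass this point
# before rank 0 has reached it (i.e., finished downloading).
with tf.Session() as sess:
    sess.run(hvd.allreduce(tf.constant(0.0)))

# ...now all ranks can safely read the shared files.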
from ESN import EchoStateRNNCell
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from time import time
import tensorflow as tf  # needed below for tf.keras and tf.ConfigProto
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()

mnist = tf.keras.datasets.mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

X_train, y_train = (X_train[hvd.rank()::hvd.size()],
                    y_train[hvd.rank()::hvd.size()])

print("MNIST shape", X_train.shape, X_test.shape)

if False:
    # debug only
    X_train = X_train[:10000]
    y_train = y_train[:10000]

# Pin GPU to be used to process local rank (one GPU per process)
# takes only current needed GPU memory
config = tf.ConfigProto(intra_op_parallelism_threads=hvd.size(),
                        inter_op_parallelism_threads=hvd.size())
config.gpu_options.allow_growth = False
config.gpu_options.visible_device_list = str(hvd.local_rank())
Exemple #40
0
        attacker = PGDAttacker(
            args.attack_iter,
            args.attack_epsilon,
            args.attack_step_size,
            prob_start_from_clean=0.2 if not args.eval else 0.0)
        if args.use_fp16xla:
            attacker.USE_FP16 = True
            attacker.USE_XLA = True
    model.set_attacker(attacker)

    os.system("nvidia-smi")
    hvd.init()

    if args.eval:
        sessinit = SmartInit(args.load)
        if hvd.size() == 1:
            # single-GPU eval, slow
            ds = get_val_dataflow(args.data, args.batch)
            eval_on_ILSVRC12(model, sessinit, ds)
        else:
            logger.info("CMD: " + " ".join(sys.argv))
            cb = create_eval_callback("eval",
                                      model.get_inference_func(attacker),
                                      lambda e: True)
            trainer = HorovodTrainer()
            trainer.setup_graph(model.get_input_signature(),
                                PlaceholderInput(), model.build_graph,
                                model.get_optimizer)
            # train for an empty epoch, to reuse the distributed evaluation code
            trainer.train_with_defaults(
                callbacks=[cb],
def main(_):

    hvd.init()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json
        
        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
        
        config = json.load(open(FLAGS.config_file, "r"))

        label_dict = json.load(open(FLAGS.label_id))

        label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkoutpoint==={}".format(init_checkpoint))

        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        # config.loss = "focal_loss"
        
        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)

        train_size = int(FLAGS.train_size/hvd.size())

        num_train_steps = int(
            train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")
        
        opt_config = Bunch({"init_lr":(1e-5/hvd.size()), 
                            "num_train_steps":num_train_steps,
                            "num_warmup_steps":num_warmup_steps})

        model_io_config = Bunch({"fix_lm":False})
        
        model_io_fn = model_io.ModelIO(model_io_config)
        
        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        vib_config = {"kl_type":"original", "beta":0.1}

        model_train_fn = bert_order_classifier.classifier_vib_model_fn_builder(
                                config,
                                num_choice,
                                init_checkpoint,
                                model_reuse=None,
                                load_pretrained=True,
                                model_io_fn=model_io_fn,
                                model_io_config=model_io_config,
                                opt_config=opt_config,
                                input_name=["a", "b"],
                                vib_config=vib_config,
                                label_tensor=None)
        
        model_eval_fn = bert_order_classifier.classifier_vib_model_fn_builder(
                                config, 
                                num_choice, 
                                init_checkpoint, 
                                model_reuse=True, 
                                load_pretrained=True,
                                model_io_fn=model_io_fn,
                                model_io_config=model_io_config, 
                                opt_config=opt_config,
                                input_name=["a", "b"],
                                vib_config=vib_config,
                                label_tensor=None)
        
        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32)
            )
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy":accuracy, "loss":loss, "pred_label":pred_label, "label_ids":features["label_ids"]}
        
        name_to_features = {
                "input_ids_a":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "input_mask_a":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "segment_ids_a":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "input_ids_b":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "input_mask_b":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "segment_ids_b":
                        tf.FixedLenFeature([max_seq_length], tf.int64),
                "label_ids":
                        tf.FixedLenFeature([], tf.int64),
        }
        
        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
            """
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
                                    _decode_record, name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                    _decode_record, name_to_features, params)
        
        [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)
        
        model_io_fn.set_saver()
        
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        sess.run(hvd.broadcast_global_variables(0))
        
        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            label_weight = []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    for item in eval_result["label_ids"]:
                        label_weight.append(label_tensor[item])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight)
            accuracy = accuracy_score(label_id, label, sample_weight=label_weight)
            print("test accuracy accuracy {} {} f1 {}".format(total_accuracy/i, 
                accuracy, f1))
            return total_accuracy/ i, f1
        
        def train_fn(op, loss):
            i = 0
            cnt = 0
            total_loss = 0.0
            while True:
                try:
                    [_, train_loss] = sess.run([op, loss])
                    total_loss += train_loss
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        print(total_loss/cnt)
                        # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc_{}.ckpt".format(int(i/8000)))
                        if hvd.rank() == 0:
                            model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc_{}.ckpt".format(int(i/num_storage_steps)))
                            print("==successful storing model=={}".format(int(i/num_storage_steps)))
                        total_loss = 0
                        cnt = 0
                except tf.errors.OutOfRangeError:
                    break
        print("===========begin to train============")        
        train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
        # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc.ckpt")
        if hvd.rank() == 0:
            model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc.ckpt")
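The classifier example above splits the training set across workers before deriving its step counts, and (unlike most of the other examples here) it divides the initial learning rate by hvd.size() instead of multiplying it. A standalone sketch of the step bookkeeping, with assumed FLAGS-like values:

# Illustrative only; dataset size, worker count, batch size and epochs are assumptions.
total_train_size = 100000
num_workers = 4                                          # hvd.size()
batch_size = 32
epochs = 3

train_size = total_train_size // num_workers             # examples seen per worker
num_train_steps = int(train_size / batch_size * epochs)  # optimizer steps per worker
num_warmup_steps = int(num_train_steps * 0.01)
num_storage_steps = int(train_size / batch_size)         # checkpoint roughly once per epoch
print(train_size, num_train_steps, num_warmup_steps, num_storage_steps)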
Exemple #42
0
def parallel_train(training_dataset):
    import horovod.tensorflow as hvd

    hvd.init()  # Horovod

    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=hvd.size(), index=hvd.rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)

    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element,
                                              is_train=True,
                                              reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2                 # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * hvd.size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)
    opt = hvd.DistributedOptimizer(opt)  # Horovod
    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)

    config.gpu_options.allow_growth = True  # Horovod
    config.gpu_options.visible_device_list = str(hvd.local_rank())  # Horovod

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bcast = hvd.broadcast_global_variables(0)  # Horovod

    # Horovod: adjust number of steps based on number of GPUs.
    global n_step, lr_decay_every_step
    n_step = n_step // hvd.size() + 1  # Horovod
    lr_decay_every_step = lr_decay_every_step // hvd.size() + 1  # Horovod

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # Horovod
        print('Worker{}: Initialized'.format(hvd.rank()))
        print(
            'Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
            .format(hvd.rank(), n_step, batch_size, lr_init,
                    lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(
                                                  model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
                .format(hvd.rank(), step, n_step, _loss, lr, _l2,
                        time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}: Network# {} For Branch {} Loss: {}'.format(
                    hvd.rank(), ix, ix % 2 + 1, ll))

            # save intermediate results and model
            if hvd.rank() == 0:  # Horovod
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [
                        img_out, confs_ground, pafs_ground, conf_result,
                        paf_result, mask_out
                    ] = sess.run(
                        [x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result,
                                 pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path,
                                               'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path, 'pose.npz'),
                                           sess=sess)
Exemple #43
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=2000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=100)
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    builder = option_builder.ProfileOptionBuilder
    opts1 = builder(builder.time_and_memory()).\
            order_by('micros').\
            with_max_depth(10).\
            with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\
            build()
    opts2 = builder.trainable_variables_parameter()
    # with profile_context.ProfileContext("./pctx",
    #                                     trace_steps=range(100, 110),
    #                                     dump_steps=[110]) as pctx:
    with profile_context.ProfileContext("./pctx") as pctx:
        pctx.add_auto_profiling('op', opts1, [800, 900, 1000])
        pctx.add_auto_profiling('scope', opts2, [1000])
        with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                               hooks=hooks,
                                               config=config) as mon_sess:
            while not mon_sess.should_stop():
                # Run a training step synchronously.
                image_, label_ = next(training_batch_generator)
                mon_sess.run(train_op,
                             feed_dict={
                                 image: image_,
                                 label: label_
                             })
        pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)
def main(_):
    tf.get_logger().setLevel(logging.ERROR)

    hvd.init()

    FLAGS = PARSER.parse_args()
    backends = [StdOutBackend(Verbosity.DEFAULT)]

    if FLAGS.log_dir:
        backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]

    DLLogger.init(backends=backends)
    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if hvd.rank() == 0:
        DLLogger.log(step=tuple(), data={"mixed_precision": "ENABLED" if FLAGS.use_amp else "DISABLED"})

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir, 'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs

        if FLAGS.benchmark:
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [ProfilingHook(FLAGS.warmup_steps, FLAGS.batch_size * hvd.size(), DLLogger)]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(
            input_fn=lambda: dataset.train_fn(FLAGS.augment),
            steps=steps,
            hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Evaluation requires --train_split < 1.0")
            result = estimator.evaluate(
                input_fn=dataset.eval_fn,
                steps=dataset.eval_steps,
                hooks=[])
            DLLogger.log(step=tuple(), data={'background_dice': result['background dice']})
            DLLogger.log(step=tuple(), data={'anterior_dice': result['Anterior dice']})
            DLLogger.log(step=tuple(), data={'posterior_dice': result['Posterior dice']})

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []
        if hvd.rank() == 0:
            if FLAGS.benchmark:
                count = math.ceil((FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [ProfilingHook(FLAGS.warmup_steps, FLAGS.batch_size * hvd.size(), DLLogger, training=False)]

            predictions = estimator.predict(input_fn=lambda: dataset.test_fn(count=count),
                                            hooks=hooks)

            pred = [p['prediction'] for p in predictions]

            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)

            os.makedirs(predict_path)

            pickle.dump(pred, open(os.path.join(predict_path, 'predictions.pkl'), 'wb'))
Exemple #45
0
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     use_multi_gpu):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

        # Horovod: scale learning rate by the number of GPUs.
        if use_multi_gpu:
            learning_rate = learning_rate * hvd.size()

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_multi_gpu:
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             compression=hvd.Compression.fp16,
                                             sparse_as_dense=True)

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    if use_multi_gpu:
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        grads = [grad for grad, var in grads_and_vars]
        tvars = [var for grad, var in grads_and_vars]
    else:
        grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
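create_optimizer above combines a linear warmup (learning rate = global_step / num_warmup_steps * init_lr while warming up) with a linear polynomial decay to zero over num_train_steps, optionally multiplied by hvd.size() for multi-GPU runs. A pure-Python sketch of the resulting schedule shape, with assumed values:

# Illustrative only; init_lr, num_train_steps and num_warmup_steps are assumptions.
init_lr = 2e-5
num_train_steps = 1000
num_warmup_steps = 100

def lr_at(step):
    decayed = init_lr * (1.0 - min(step, num_train_steps) / num_train_steps)
    warmup = init_lr * step / num_warmup_steps
    return warmup if step < num_warmup_steps else decayed

for step in (0, 50, 100, 500, 1000):
    print(step, lr_at(step))         # ramps up toward init_lr, then decays linearly to 0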
Exemple #46
0
def train(*tf_records: "Records to train on"):
    """Train on examples."""
    tf.logging.set_verbosity(tf.logging.INFO)
    estimator = dual_net.get_estimator()

    effective_batch_size = FLAGS.train_batch_size
    if FLAGS.dist_train:
        effective_batch_size = int(FLAGS.train_batch_size / hvd.size())
    if FLAGS.use_tpu:
        effective_batch_size *= FLAGS.num_tpu_cores

    if FLAGS.use_tpu:
        if FLAGS.use_bt:

            def _input_fn(params):
                games = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                 FLAGS.cbt_instance,
                                                 FLAGS.cbt_table)
                games_nr = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                    FLAGS.cbt_instance,
                                                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    games,
                    games_nr,
                    params['batch_size'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:

            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'], tf_records, random_rotation=True)

        # Hooks are broken with TPUestimator at the moment.
        hooks = []
    else:

        def _input_fn():
            return preprocessing.get_input_tensors(
                effective_batch_size,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True,
                seed=FLAGS.training_seed,
                dist_train=FLAGS.dist_train)

        hooks = [
            UpdateRatioSessionHook(FLAGS.work_dir),
            EchoStepCounterHook(output_dir=FLAGS.work_dir)
        ]
        if FLAGS.dist_train:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    steps = FLAGS.steps_to_train
    logging.info("Training, steps = %s, batch = %s -> %s examples", steps
                 or '?', effective_batch_size,
                 (steps * effective_batch_size) if steps else '?')

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance,
                                         FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except:
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise
def BatchNormEidt(inputs,
                  axis=None,
                  training=None,
                  momentum=0.9,
                  epsilon=1e-5,
                  center=True,
                  scale=True,
                  beta_initializer=tf.zeros_initializer(),
                  gamma_initializer=tf.ones_initializer(),
                  virtual_batch_size=None,
                  data_format='channels_last',
                  ema_update='default',
                  sync_statistics=None,
                  internal_update=None,
                  bit_activation=2):
    """
    A more powerful version of `tf.layers.batch_normalization`. It differs from
    the official one in the following aspects:

    1. Accepts an alternative ``data_format`` option when ``axis`` is None. For 2D input, this argument will be ignored.
    2. Default value for ``momentum`` and ``epsilon`` is different.
    3. Default value for ``training`` is automatically obtained from tensorpack's ``TowerContext``.
       User-provided value can overwrite this behavior.
    4. Support the ``ema_update`` option, which covers broader use cases than the standard EMA update.
    5. Support the ``sync_statistics`` option, which implements "SyncBN" and is very useful in small-batch models.

    Args:
        training (bool): if True, use per-batch statistics to normalize. Otherwise, use stored EMA
            to normalize. By default, it is equal to `get_current_tower_context().is_training`.
            This is not a good argument name, but it is what the Tensorflow layer uses.
        ema_update (str): Only effective when ``training=True``. It has the following options:

          * "default": same as "collection". Because this is the default behavior in tensorflow.
          * "skip": do not update EMA. This can be useful when you reuse a batch norm layer in several places
            but do not want them to all update your EMA.
          * "collection": Add EMA update ops to collection `tf.GraphKeys.UPDATE_OPS`.
            The ops in the collection will be run automatically by the callback :class:`RunUpdateOps`, along with
            your training iterations. This can waste compute if your training iterations do not always depend
            on the BatchNorm layer.
          * "internal": EMA is updated inside this layer itself by control dependencies.
            In common cases, it has similar speed to "collection". But it covers more cases, e.g.:

            1. BatchNorm is used inside dynamic control flow.
               The collection-based update does not support dynamic control flows.
            2. BatchNorm layer is sometimes unused (e.g., in GANs you have two networks to train alternatively).
               Putting all update ops into a single collection will waste a lot of compute.
            3. Other part of the model relies on the "updated" EMA. The collection-based method does not update
               EMA immediately.

            Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod". It determines how to compute the
          "per-batch statistics" when ``training==True``.

          * None: it uses statistics of the input tensor to normalize during training.
            This is the standard way BatchNorm was implemented in most frameworks.

          * "nccl": this layer must be used under tensorpack's multi-GPU trainers.
            It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          * "horovod": this layer must be used under tensorpack's :class:`HorovodTrainer`.
            It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
            Note that on a single machine this is significantly slower than the "nccl" implementation.

          When not None, each GPU computes its own E[x] and E[x^2],
          which are then averaged among all GPUs to compute global mean & variance.
          Therefore each GPU needs to have the same batch size.

          The synchronization is based on the current variable scope + the name of the layer
          (`BatchNorm('name', input)`). Therefore, you need to make sure that:

          1. The BatchNorm layer on different GPUs needs to have the same name, so that
             statistics can be synchronized. If names do not match, this layer will hang.
          2. A BatchNorm layer cannot be reused within one tower.
          3. A BatchNorm layer needs to be executed the same number of times by all GPUs.
             If different GPUs execute one BatchNorm layer a different number of times
             (e.g., if some GPUs do not execute it), this layer may hang.

          This option is also known as "SyncBN" or "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

          When `sync_statistics` is enabled, `ema_update` is set to "internal" automatically.
          This is to avoid running `UPDATE_OPS`, which requires synchronization.

        internal_update: deprecated option. Don't use.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        This layer is more flexible than the standard "BatchNorm" layer and provides more features:

        1. Whether you are training or not, you can set the ``training`` argument
           to use batch statistics or EMA statistics,
           i.e., you can use batch statistics during inference, or EMA statistics during training.
           Using EMA statistics in training is useful when you load a pre-trained BN and
           don't want to update it.
        2. As long as ``training=True``, the ``sync_statistics`` and ``ema_update`` options take effect
           (see the short statistics-aggregation sketch after this function).
    """

    # parse training/ctx
    def get_quan_point():
        # midpoints between adjacent uniform quantization levels in [0, 1]
        return np.array([(2**bit_activation - i + 0.5) / (2**bit_activation - 1)
                         for i in range(2**bit_activation, 1, -1)])

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)

    # parse shapes
    data_format = get_data_format(data_format, keras_mode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    assert ema_update in ["default", "collection", "internal", "skip"]
    if internal_update is not None:
        log_deprecated("BatchNorm(internal_update=)",
                       "Use ema_update='internal' instead!", "2020-01-01")
        assert ema_update == 'default', \
            "Do not use internal_update and ema_update together! internal_update is deprecated"
        ema_update = "internal" if internal_update else "collection"
    if ema_update == "default":
        ema_update = "collection"
    # Logic:
    # 1. EMA update is possible only when we compute batch statistics (training=True).
    # 2. We know that in training, the non-main training tower does not need the EMA update.
    #    We don't know what to do in a prediction context, so be conservative and do the update.
    # 3. Users can explicitly disable the update with "skip".
    do_ema_update = training and \
        (ctx.is_main_training_tower or not ctx.is_training) \
        and (ema_update != "skip")

    if axis is None:
        if ndims == 2:
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    assert axis in [1, 3], axis
    num_chan = shape[axis]

    TF_version = get_tf_version_tuple()

    freeze_bn_backward = not training and ctx.is_training
    if freeze_bn_backward:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!"
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training means we loaded a
        # pre-trained BN and are only fine-tuning the affine part.

    do_sync_bn = (sync_statistics is not None) and training

    if not do_sync_bn:
        # Use the builtin layer for anything except for sync-bn
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'
        }):
            tf_args = dict(
                axis=axis,
                momentum=momentum,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429
                fused=(ndims == 4 and axis in [1, 3]
                       and not freeze_bn_backward),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            use_fp16 = inputs.dtype == tf.float16
            if use_fp16:
                # non-fused does not support fp16; fused does not support all layouts.
                # we made our best guess here
                tf_args['fused'] = True
            if training:
                layer = tf.layers.BatchNormalization(**tf_args)
                xn = layer.apply(inputs,
                                 training=training,
                                 scope=tf.get_variable_scope())
            else:
                layer = tf.layers.BatchNormalization(**tf_args)
                xnn = layer.apply(inputs,
                                  training=training,
                                  scope=tf.get_variable_scope())

                # Recover the per-channel affine transform applied by the frozen BN
                # from two spatial positions: xnn = (inputs - mean0) / var0, so var0
                # is the effective per-channel scale and mean0 the offset
                # (gamma/beta are folded into them).
                i1 = inputs[0, 0, 0, :]
                i2 = inputs[1, 1, 1, :]
                x1 = xnn[0, 0, 0, :]
                x2 = xnn[1, 1, 1, :]

                mean0 = i1 - x1 * (i1 - i2) / (x1 - x2)
                var0 = (i1 - i2) / (x1 - x2)
                # Quantize activations during inference: compute the quantization
                # thresholds (quan_points) and the value each bucket maps to.
                print('in quantize BN')
                quan_points = get_quan_point()

                # add_moving_summary(tf.identity(quan_points[3],name='origin_quan_points_3'))
                quan_values = np.array([
                    round((quan_points[i] - 0.005) * (2**bit_activation - 1))
                    / float(2**bit_activation - 1)
                    for i in range(len(quan_points))])
                quan_values = np.append(quan_values, np.array([1.]), axis=-1)

                moving_mean_ = tf.identity(mean0, name='moving_mean_')
                moving_mean_ = tf.expand_dims(moving_mean_, axis=-1)
                moving_var_ = tf.identity(var0, name='moving_var')
                moving_var_ = tf.expand_dims(moving_var_, axis=-1)

                quan_points = moving_var_ * quan_points + moving_mean_

                b, w, h, c = inputs.shape

                inputs = tf.transpose(tf.reshape(inputs, [-1, c]))

                # bucket masks: which quantization interval each activation falls into
                label1 = tf.cast(
                    tf.less_equal(inputs, tf.expand_dims(quan_points[:, 0], axis=-1)),
                    dtype=tf.float32)
                label2 = tf.cast(
                    tf.math.logical_and(
                        tf.math.less_equal(inputs, tf.expand_dims(quan_points[:, 1], axis=-1)),
                        tf.math.greater(inputs, tf.expand_dims(quan_points[:, 0], axis=-1))),
                    dtype=tf.float32)
                label3 = tf.cast(
                    tf.math.logical_and(
                        tf.math.less_equal(inputs, tf.expand_dims(quan_points[:, 2], axis=-1)),
                        tf.math.greater(inputs, tf.expand_dims(quan_points[:, 1], axis=-1))),
                    dtype=tf.float32)
                label4 = tf.cast(
                    tf.math.greater(inputs, tf.expand_dims(quan_points[:, 2], axis=-1)),
                    dtype=tf.float32)
                xn = (label1 * quan_values[0] + label2 * quan_values[1] +
                      label3 * quan_values[2] + label4 * quan_values[3])
                xn = tf.reshape(tf.transpose(xn), [-1, w, h, c])

        # Add EMA variables to the correct collection
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                if isinstance(v, tf.Variable):
                    tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v)

        if not do_ema_update:
            restore_collection(coll_bk)
        if do_ema_update and ema_update == "internal":
            # Implement "internal" update.
            restore_collection(coll_bk)
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta

    else:
        red_axis = [0] if ndims == 2 else (
            [0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='nccl') is used with only one tower!"
                )
            else:
                assert six.PY2 or TF_version >= (1, 10), \
                    "Cross-GPU BatchNorm is only supported in TF>=1.10. " \
                    "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360"

                if TF_version <= (1, 12):
                    try:
                        from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so
                    except Exception:
                        pass
                    else:
                        _validate_and_load_nccl_so()
                    from tensorflow.contrib.nccl.ops import gen_nccl_ops
                else:
                    from tensorflow.python.ops import gen_nccl_ops
                shared_name = re.sub('tower[0-9]+/', '',
                                     tf.get_variable_scope().name)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            if hvd.size() == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='horovod') is used with only one process!"
                )
            else:
                import horovod
                hvd_version = tuple(map(int, horovod.__version__.split('.')))
                assert hvd_version >= (
                    0, 13,
                    6), "sync_statistics=horovod needs horovod>=0.13.6 !"

                batch_mean = hvd.allreduce(batch_mean, average=True)
                batch_mean_square = hvd.allreduce(batch_mean_square,
                                                  average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                           tf.reshape(beta, new_shape),
                                           tf.reshape(gamma, new_shape),
                                           epsilon)
        else:
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta,
                                           gamma, epsilon)

        if do_ema_update:
            ret = internal_update_bn_ema(xn, batch_mean_vec, batch_var_vec,
                                         moving_mean, moving_var, momentum)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
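
Below is a short standalone NumPy sketch (an illustration added here, not part of the layer above) of the statistics aggregation that `sync_statistics` relies on: when every GPU sees the same batch size, averaging the per-GPU E[x] and E[x^2] and forming var = E[x^2] - E[x]^2 reproduces the mean and (biased) variance of the full batch.

import numpy as np

rng = np.random.RandomState(0)
per_gpu = [rng.randn(32, 8) for _ in range(4)]   # 4 "GPUs", batch 32, 8 channels

# each GPU contributes its own E[x] and E[x^2]; they are then averaged
mean = np.mean([x.mean(axis=0) for x in per_gpu], axis=0)
mean_sq = np.mean([(x ** 2).mean(axis=0) for x in per_gpu], axis=0)
var = mean_sq - mean ** 2

# matches the statistics of the whole (concatenated) batch
full = np.concatenate(per_gpu, axis=0)
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(var, full.var(axis=0))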
Exemple #48
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    # Convert params (dict) to Config for easier access.
    training_hooks = None
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs(inputs):
        return model(inputs, config=hparams_config.Config(params))

    cls_outputs, box_outputs = utils.build_model_with_precision(
        params['precision'], _model_outputs, features)

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            if params['strategy'] == 'tpu':
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'],
                                              **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
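
The model_fn above takes two extra arguments (`model` and `variable_filter_fn`) on top of the (features, labels, mode, params) signature an Estimator expects. Below is a hedged sketch of how it could be bound and handed to a TPUEstimator; `my_detection_model`, the params values, and the model_dir are placeholders, not values from the original code.

import functools
import tensorflow as tf

def my_detection_model(inputs, config=None):
    # hypothetical network; a real one returns per-level dicts of
    # (cls_outputs, box_outputs) as consumed by _model_fn above
    return {}, {}

model_fn = functools.partial(_model_fn,
                             model=my_detection_model,
                             variable_filter_fn=None)

estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=tf.estimator.tpu.RunConfig(model_dir='/tmp/effdet'),
    train_batch_size=64,
    params={'data_format': 'channels_last', 'precision': 'float32'},
    use_tpu=False)
# estimator.train(input_fn=..., max_steps=...)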
Exemple #49
0
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    if isinstance(_C.DATA.VAL, six.string_types):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )
    if isinstance(_C.DATA.TRAIN, six.string_types):  # support single string
        _C.DATA.TRAIN = (_C.DATA.TRAIN, )

    # finalize dataset definitions ...
    from dataset import DatasetRegistry
    datasets = list(_C.DATA.TRAIN) + list(_C.DATA.VAL)
    _C.DATA.CLASS_NAMES = DatasetRegistry.get_metadata(datasets[0], "class_names")
    _C.DATA.NUM_CATEGORY = len(_C.DATA.CLASS_NAMES) - 1

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN', 'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        lr = _C.TRAIN.LR_SCHEDULE
        if isinstance(lr, six.string_types):
            if lr.endswith("x"):
                LR_SCHEDULE_KITER = {
                    "{}x".format(k):
                    [180 * k - 120, 180 * k - 40, 180 * k]
                    for k in range(2, 10)}
                LR_SCHEDULE_KITER["1x"] = [120, 160, 180]
                _C.TRAIN.LR_SCHEDULE = [x * 1000 for x in LR_SCHEDULE_KITER[lr]]
            else:
                _C.TRAIN.LR_SCHEDULE = eval(lr)

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
            logger.info("Horovod Rank={}, Size={}, LocalRank={}".format(
                hvd.rank(), hvd.size(), hvd.local_rank()))
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to train with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" + str(_C))
Exemple #50
0
def train_ffn(model_cls, **model_kwargs):
    with tf.Graph().as_default():
        model = model_cls(**model_kwargs)  # initialize the model
        eval_shape_zyx = train_eval_size(model).tolist(
        )[::-1]  # size of the subvolume (within which the FOV moves)
        eval_tracker = EvalTracker(
            eval_shape_zyx)  # computes summary statistics inside EFOV
        load_data_ops = define_data_input(
            model,
            queue_batch=1)  # this creates a batch of training subvolumes
        prepare_ffn(model)  # here the tf graph is defined
        merge_summaries_op = tf.summary.merge_all(
        )  # merges all summaries defined in the graph.

        if hvd.rank() == 0:
            save_flags()

        var_to_reduce = tf.placeholder(tf.float32)
        bcast_op = hvd.broadcast_global_variables(0)
        avg_op = hvd.allreduce(var_to_reduce, average=True)

        # Start supervisor.
        sv = tf.train.Supervisor(
            logdir=(FLAGS.train_dir if hvd.rank() == 0 else None),
            is_chief=True,
            saver=(tf.train.Saver(max_to_keep=FLAGS.max_to_keep,
                                  keep_checkpoint_every_n_hours=1)
                   if hvd.rank() == 0 else None),
            save_model_secs=(FLAGS.save_model_secs if hvd.rank() == 0 else 0),
            summary_op=None,
            save_summaries_secs=0,  # will perform custom summaries instead
        )

        sess = sv.prepare_or_wait_for_session(
            FLAGS.master,
            config=tf.ConfigProto(
                log_device_placement=False,
                allow_soft_placement=True,
                intra_op_parallelism_threads=FLAGS.num_intra_threads,
                inter_op_parallelism_threads=FLAGS.num_inter_threads))

        # broadcast initial weights. This ensures that all horovod ranks
        # start at the same point in parameter space
        if hvd.rank() == 0:
            print("broadcasting initial weights")
        sess.run(bcast_op)

        eval_tracker.sess = sess  #--connect the eval tracker to the session
        eval_tracker.avg_op = avg_op
        fov_shifts = list(model.shifts)  # x, y, z
        if FLAGS.shuffle_moves:
            random.shuffle(
                fov_shifts
            )  #--this will shuffle the FOV positions that make up an extended FOV (EFOV)

        policy_map = {
            'fixed': partial(fixed_offsets, fov_shifts=fov_shifts),
            'max_pred_moves': max_pred_offsets
        }
        # batch iterator for getting the next batch
        batch_it = get_batch(lambda: sess.run(load_data_ops), eval_tracker,
                             model, FLAGS.batch_size,
                             policy_map[FLAGS.fov_policy])

        step = 0
        t_last = time.time()

        if hvd.rank() == 0:
            timing = []  # list of times for benchmarking

        steps_since_last_summary = 0
        if hvd.rank() == 0:
            print("starting training")
        while step < FLAGS.max_steps:
            time_step_start = time.time()

            if steps_since_last_summary == FLAGS.summary_every_steps:
                summ_op = merge_summaries_op
                steps_since_last_summary = 1
                if hvd.rank() == 0:
                    print("step ", step, "is a summary step")
            else:
                summ_op = None
                steps_since_last_summary += 1

            # get the next batch - this is reading the data from disk.
            seed, patches, labels, weights = next(batch_it)

            summaries = []

            scaled_lr = FLAGS.learning_rate
            if FLAGS.scaling_rule > 0:
                # scale the learning rate linearly (1) or by sqrt (2) with the number of ranks
                if FLAGS.scaling_rule == 1:
                    scaled_lr *= hvd.size()
                elif FLAGS.scaling_rule == 2:
                    scaled_lr *= np.sqrt(hvd.size())
                if step < FLAGS.warmup_steps:
                    # linear warmup from the base LR to the scaled LR
                    scaled_lr = FLAGS.learning_rate + (step / float(
                        FLAGS.warmup_steps)) * (scaled_lr - FLAGS.learning_rate)

            if FLAGS.decay_learning_rate_fraction > 0:
                # continuously decay the learning rate (exponential decay)
                scaled_lr *= FLAGS.decay_learning_rate_fraction**(
                    step / FLAGS.decay_learning_rate_steps)

            updated_seed, step, summ, my_loss = run_training_step( # run training step on a SINGLE FOV
                sess, model, summ_op,
                feed_dict={
                    model.loss_weights: weights,
                    model.labels: labels,
                    model.offset_label: 'off',
                    model.input_patches: patches,
                    model.input_seed: seed,
                    model.learning_rate: scaled_lr
                })

            # compute average loss
            avg_loss = sess.run(avg_op, feed_dict={var_to_reduce: my_loss})

            # Save prediction results in the original seed array so that
            # they can be used in subsequent steps.
            mask.update_at(
                seed, (0, 0, 0),
                updated_seed)  # updates the mask inside the subvolume batches
            if hvd.rank() == 0:
                this_time = time.time(
                ) - time_step_start  # how long did this step take
                timing.append(this_time)
                print("step %i took %.2f seconds" % (step - 1, this_time))

            if summ is not None:
                summaries.append(tf.Summary.FromString(
                    summ))  # this adds the summaries from the single FOV

                # Compute a loss over the whole training patch (i.e. more than a
                # single-step field of view of the network). This quantifies the
                # quality of the final object mask.

                tp, fp, tn, fn, num_patches = eval_tracker.get_summaries_scalar(
                )

                tp_sum = hvd.size() * sess.run(
                    avg_op, feed_dict={var_to_reduce: float(tp)})
                fp_sum = hvd.size() * sess.run(
                    avg_op, feed_dict={var_to_reduce: float(fp)})
                tn_sum = hvd.size() * sess.run(
                    avg_op, feed_dict={var_to_reduce: float(tn)})
                fn_sum = hvd.size() * sess.run(
                    avg_op, feed_dict={var_to_reduce: float(fn)})
                avg_num_patches = sess.run(
                    avg_op, feed_dict={var_to_reduce: float(num_patches)})

                accuracy = (tp_sum + tn_sum) / (tp_sum + fp_sum + tn_sum +
                                                fn_sum)
                precision = (tp_sum) / (tp_sum + fp_sum)
                recall = (tp_sum) / (tp_sum + fn_sum)
                f1 = 2.0 * precision * recall / (precision + recall)

                eval_tracker_summaries = ([
                    tf.Summary.Value(tag='eval/patches',
                                     simple_value=avg_num_patches),
                    tf.Summary.Value(tag='eval/accuracy',
                                     simple_value=accuracy),
                    tf.Summary.Value(tag='eval/precision',
                                     simple_value=precision),
                    tf.Summary.Value(tag='eval/recall', simple_value=recall),
                    tf.Summary.Value(tag='eval/f1', simple_value=f1)
                ])

                if hvd.rank() == 0:
                    logging.info('Saving summaries.')
                    summ = tf.Summary()  #initialize tensorflow summary
                    summ.value.extend(
                        eval_tracker_summaries)  # add EFOV metrics
                    summ.value.extend(eval_tracker.get_summaries_images()
                                      )  # add image summaries
                    for s in summaries:
                        summ.value.extend(s.value)  # add FOV metrics
                    # other custom summary items:
                    summ.value.extend([
                        tf.Summary.Value(tag='avg_pixel_loss',
                                         simple_value=avg_loss)
                    ])  #avg pixel loss
                    summ.value.extend([
                        tf.Summary.Value(tag='learning_rate',
                                         simple_value=scaled_lr)
                    ])  #(scaled) learning rate
                    summ.value.extend([
                        tf.Summary.Value(tag='avg_time_per_step',
                                         simple_value=np.mean(timing))
                    ])  #avg time per step
                    print("avg time per step: ", np.mean(timing),
                          np.std(timing))
                    print("avg throughput: ",
                          FLAGS.batch_size / np.mean(timing))
                    timing = []  # reset the timing array
                    sv.summary_computed(sess, summ, step)

                # reset eval tracker before the next training step.
                eval_tracker.reset()

        if hvd.rank() == 0:
            print("all steps done!")
            if FLAGS.do_benchmark_test == 1:
                print("benchmark result: ")
                print("steps, batch size, ranks, threads, mean, sigma:")
                string = "{},{},{},{},{},{}\n".format(
                    FLAGS.max_steps, FLAGS.batch_size, hvd.size(),
                    FLAGS.nthreads, np.mean(timing), np.std(timing))
                print(string)
                with open(FLAGS.timelog, "a") as myfile:
                    myfile.write(string)
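
The learning-rate handling inside the training loop above (rank scaling, linear warmup, exponential decay) can be summarized by the small standalone sketch below; the default argument values are placeholders, not values from the original flags.

import numpy as np

def scheduled_lr(base_lr, step, num_ranks, scaling_rule=1,
                 warmup_steps=1000, decay_fraction=0.5, decay_steps=10000):
    lr = base_lr
    if scaling_rule > 0:                      # 1: linear scaling, 2: sqrt scaling
        lr *= num_ranks if scaling_rule == 1 else np.sqrt(num_ranks)
        if step < warmup_steps:               # linear ramp from base_lr to the scaled LR
            lr = base_lr + (step / float(warmup_steps)) * (lr - base_lr)
    if decay_fraction > 0:                    # continuous exponential decay
        lr *= decay_fraction ** (step / float(decay_steps))
    return lr

print(scheduled_lr(1e-3, step=0, num_ranks=8))       # 0.001 (start of warmup)
print(scheduled_lr(1e-3, step=20000, num_ranks=8))    # 0.002 (scaled, then decayed twice)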
Exemple #51
0
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        steps_per_epoch = train_samples // batch_size
    else:
        steps_per_epoch = int(round(
            float(train_samples) / batch_size / hvdsize + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    if hvd.rank() == 0:
        x = Input(shape=original_img_size)
        vae_val = make_vae(ldict, x)
        vae_val.compile(optimizer=opt, loss=None)
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        x = Input(shape=original_img_size)
        z_mean, _ = get_encoded(ldict, x)
        encoder = Model(x, z_mean)
        # :  :type encoder: Model

        decoder_input = Input(shape=(latent_dim,))
        x_decoded_mean_squash = get_decoded(ldict, decoder_input)
        generator = Model(decoder_input, x_decoded_mean_squash)
        # :  :type generator: Model

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size: (i + 1) * digit_size,
                       j * digit_size: (j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
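
The steps_per_epoch choice above distinguishes two distributed-training intents; a tiny standalone sketch (illustration only) of the same arithmetic:

def steps_per_epoch(train_samples, batch_size, num_ranks, per_rank_full_epoch):
    if per_rank_full_epoch:
        # every rank runs a full epoch (useful for measuring images/sec)
        return train_samples // batch_size
    # the epoch is split across ranks (rounded up)
    return int(round(float(train_samples) / batch_size / num_ranks + 0.5))

print(steps_per_epoch(60000, 128, 4, True))    # 468 steps on every rank
print(steps_per_epoch(60000, 128, 4, False))   # 118 steps per rank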
Exemple #52
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Make sure the Fetcher worked
    mnist_filename = 'mnist.npz'
    mnist_path = os.path.join(cache_dir, mnist_filename)
    if not os.path.isfile(mnist_path):
        raise FileNotFoundError("Dataset not found. Looked in " + mnist_path)

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path=mnist_filename)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 50 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
            emit({"batch": str(batch), "train_loss": "%.6f" % loss_value})
Exemple #53
0
def train(sess, model, hps, logdir, visualise):
    _print(hps)
    _print('Starting training. Logging to', logdir)
    _print('epoch n_processed n_images ips dtrain dtest dsample dtot train_results test_results msg')

    # Train
    sess.graph.finalize()
    n_processed = 0
    n_images = 0
    train_time = 0.0
    test_loss_best = 999999

    if hvd.rank() == 0:
        train_logger = ResultLogger(logdir + "train.txt", **hps.__dict__)
        test_logger = ResultLogger(logdir + "test.txt", **hps.__dict__)

    tcurr = time.time()
    for epoch in range(1, hps.epochs):

        t = time.time()

        train_results = []
        for it in range(hps.train_its):

            # Set learning rate, linearly annealed from 0 in the first hps.epochs_warmup epochs.
            lr = hps.lr * min(1., n_processed /
                              (hps.n_train * hps.epochs_warmup))

            # Run a training step synchronously.
            _t = time.time()
            train_results += [model.train(lr)]
            if hps.verbose and hvd.rank() == 0:
                _print(n_processed, time.time()-_t, train_results[-1])
                sys.stdout.flush()

            # Images seen wrt anchor resolution
            n_processed += hvd.size() * hps.n_batch_train
            # Actual images seen at current resolution
            n_images += hvd.size() * hps.local_batch_train

        train_results = np.mean(np.asarray(train_results), axis=0)

        dtrain = time.time() - t
        ips = (hps.train_its * hvd.size() * hps.local_batch_train) / dtrain
        train_time += dtrain

        if hvd.rank() == 0:
            train_logger.log(epoch=epoch, n_processed=n_processed, n_images=n_images, train_time=int(
                train_time), **process_results(train_results))

        if epoch < 10 or (epoch < 50 and epoch % 10 == 0) or epoch % hps.epochs_full_valid == 0:
            test_results = []
            msg = ''

            t = time.time()
            # model.polyak_swap()

            if epoch % hps.epochs_full_valid == 0:
                # Full validation run
                for it in range(hps.full_test_its):
                    test_results += [model.test()]
                test_results = np.mean(np.asarray(test_results), axis=0)

                if hvd.rank() == 0:
                    test_logger.log(epoch=epoch, n_processed=n_processed,
                                    n_images=n_images, **process_results(test_results))

                    # Save checkpoint
                    if test_results[0] < test_loss_best:
                        test_loss_best = test_results[0]
                        model.save(logdir+"model_best_loss.ckpt")
                        msg += ' *'

            dtest = time.time() - t

            # Sample
            t = time.time()
            if epoch == 1 or epoch == 10 or epoch % hps.epochs_full_sample == 0:
                visualise(epoch)
            dsample = time.time() - t

            if hvd.rank() == 0:
                dcurr = time.time() - tcurr
                tcurr = time.time()
                _print(epoch, n_processed, n_images, "{:.1f} {:.1f} {:.1f} {:.1f} {:.1f}".format(
                    ips, dtrain, dtest, dsample, dcurr), train_results, test_results, msg)

            # model.polyak_swap()

    if hvd.rank() == 0:
        _print("Finished!")
Exemple #54
0
def main(_):
    """
    Builds the model and runs
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    if len(config_train.name) > 0:
        output_dir = os.path.join(FLAGS.output_dir, config_train.name)
    else:
        output_dir = FLAGS.output_dir
    tx.utils.maybe_create_dir(output_dir)


    ## Loads GPT-2 model configuration

    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(
            FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    # Creates a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder(FLAGS.pretrained_model_dir)

    max_decoding_length = config_train.max_decoding_length
    assert max_decoding_length <= gpt2_config.position_size, (
        "max_decoding_length should not be greater than position_size. "
        "{}>{}".format(max_decoding_length, gpt2_config.position_size))

    ## Loads data

    # Configures training data sharding in distributed mode
    if FLAGS.distributed:
        config_train.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_train.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_train.train_hparam["batch_size"] //= hvd.size()

    datasets = {}
    #if FLAGS.do_train:
    train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam)
    datasets['train'] = train_dataset
    #if FLAGS.do_eval:
    dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam)
    datasets['dev'] = dev_dataset
    #if FLAGS.do_test:
    test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam)
    datasets['test'] = test_dataset
    iterator = tx.data.FeedableDataIterator(datasets)
    batch = iterator.get_next()
    batch_size = tf.shape(batch['x1x4_ids'])[0]

    ## Builds the GPT-2 model
    vocab_size = gpt2_config.vocab_size

    word_embedder = tx.modules.WordEmbedder(
        vocab_size=vocab_size,
        hparams=gpt2_config.embed)

    pos_embedder = tx.modules.PositionEmbedder(
        position_size=gpt2_config.position_size,
        hparams=gpt2_config.pos_embed)

    # Ties output layer with input word embedding
    output_layer = tf.transpose(word_embedder.embedding, (1, 0))

    decoder = tx.modules.TransformerDecoder(
        vocab_size=vocab_size,
        output_layer=output_layer,
        hparams=gpt2_config.decoder)

    # For training
    def _get_recon_loss(ids, full_len, prefix_len, mask_prefix=True, do_print=False):
        ids = ids[:,:tf.reduce_max(full_len)]
        batch_size__ = tf.shape(ids)[0]
        seq_len = tf.fill([batch_size__], tf.shape(ids)[1])
        pos_embeds = pos_embedder(sequence_length=seq_len)
        input_embeds = word_embedder(ids) + pos_embeds

        outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy')

        max_full_len = tf.reduce_max(full_len)
        ids = ids[:, :max_full_len]
        logits = outputs.logits[:, :max_full_len]

        if mask_prefix:
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len-1,
                average_across_timesteps=False,
                sum_over_timesteps=False,
                average_across_batch=False,
                sum_over_batch=False)
            mask_recon = tf.sequence_mask(
                full_len-1,
                dtype=tf.float32)
            mask_recon_prefix = 1 - tf.sequence_mask(
                prefix_len-1,
                maxlen=max_full_len-1,#max_decoding_length-1,
                dtype=tf.float32)
            mask_recon = mask_recon * mask_recon_prefix

            if do_print:
                print_op_1 = tf.print(mask_recon)
                loss_recon_flat = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=False,
                    sum_over_remaining=False,
                    average_across_batch=False)
                print_op_2 = tf.print(loss_recon_flat)
                with tf.control_dependencies([print_op_1, print_op_2]):
                    loss_recon = tx.utils.reduce_with_weights(
                        tensor=loss_recon,
                        weights=mask_recon,
                        average_across_remaining=True,
                        sum_over_remaining=False)
                return loss_recon, mask_recon, loss_recon_flat
            else:
                loss_recon = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=True,
                    sum_over_remaining=False)
        else:
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len-1,
                average_across_timesteps=True,
                sum_over_timesteps=False,
                average_across_batch=True,
                sum_over_batch=False)

        return loss_recon


    ## ROC Loss-1: fine-tune loss
    x1_len = tf.placeholder(tf.int32, shape=[None], name='x1_len')
    x1x4_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1x4_ids')
    x1x4_len = tf.placeholder(tf.int32, shape=[None], name='x1x4_len')

    loss_fine = _get_recon_loss(x1x4_ids, x1x4_len, x1_len)

    tau = tf.placeholder(tf.float32, shape=[], name='tau')

    # generate soft yy
    def _soft_embedding_fn(soft_ids, times):
        return word_embedder(soft_ids=soft_ids) + pos_embedder(times)
    end_token = proc.encoder['<|endoftext|>']

    if not FLAGS.supervised:
        loss = config_train.w_fine * loss_fine

        loss_dict = {
            'loss': loss,
            'loss_fine': config_train.w_fine * loss_fine,
        }
    else:
        loss = loss_yy

        loss_dict = {
            'loss': loss,
            'loss_yy': loss_yy,
            # dummy placeholder values
            'loss_mask_recon': tf.constant(0),
            'loss_bt': tf.constant(0),
            'loss_d_xx2': tf.constant(0),
            'loss_d_x2': tf.constant(0),
            'loss_fine': tf.constant(0),
            'loss_xx2': tf.constant(0)
        }

    ## Inference
    def _embedding_fn(ids, times):
        return word_embedder(ids) + pos_embedder(times)

    def _infer(context_name):
        helper = tx.modules.TopKSampleEmbeddingHelper(
            embedding=_embedding_fn,
            start_tokens=batch['%s_ids' % context_name][:, 0],
            end_token=end_token,
            top_k=FLAGS.top_k,
            softmax_temperature=FLAGS.temperature)
        outputs_infer, len_infer = decoder(
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            helper=helper)
        yy_ids = tx.utils.varlength_roll(
            outputs_infer.sample_id, -batch['%s_len' % context_name])
        yy_len = len_infer - batch['%s_len' % context_name]
        yy_ids = yy_ids[:, :tf.reduce_max(yy_len)]
        # yy_logits = outputs_infer.logits
        # # yy_loss = _evaluate_loss_test(yy_logits, target_name, context_name)

        return yy_ids, yy_len

    def _evaluate_loss_test(target_name, context_name, bpe_loss=FLAGS.bpe_loss):
        ids = batch['%s_ids' % target_name]
        full_len = batch['%s_len' % target_name]
        ids = ids[:, :tf.reduce_max(full_len)]

        batch_size__ = tf.shape(ids)[0]
        seq_len = tf.fill([batch_size__], tf.shape(ids)[1])
        pos_embeds = pos_embedder(sequence_length=seq_len)
        input_embeds = word_embedder(ids) + pos_embeds

        # greedy output
        outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy')
        max_full_len = tf.reduce_max(full_len)
        logits = outputs.logits[:, :max_full_len]

        test_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
            labels=ids[:, 1:],
            logits=logits[:, :-1, :],
            sequence_length=full_len - 1,
            average_across_timesteps=False,
            sum_over_timesteps=False, # not bpe_loss, # True,
            average_across_batch=False,
            sum_over_batch=False)
        mask_recon = tf.sequence_mask(
            full_len - 1,
            dtype=tf.float32)
        mask_recon_prefix = 1 - tf.sequence_mask(
            batch['%s_len' % context_name] - 1,
            maxlen=max_full_len - 1,  # max_decoding_length-1,
            dtype=tf.float32)
        mask_recon = mask_recon * mask_recon_prefix

        test_loss = tx.utils.reduce_with_weights(
            tensor=test_loss,
            weights=mask_recon,
            average_across_batch=bpe_loss,
            average_across_remaining=bpe_loss,
            sum_over_remaining=not bpe_loss)

        return test_loss  # shape [batch_size] when bpe_loss is False, a scalar otherwise



    x4_ids_fine, x4_len_fine = _infer('x1')
    x4_loss_fine = _evaluate_loss_test('x1x4', 'x1')

    ## Optimization

    def _get_beam_ids(context_name):
        # beam-search
        predictions = decoder(
            beam_width=5,
            length_penalty=config_train.length_penalty,
            embedding=_embedding_fn,
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            mode=tf.estimator.ModeKeys.PREDICT)

        beam_output_ids = tx.utils.varlength_roll(predictions["sample_id"][:, :, 0], -batch['%s_len' % context_name])

        return beam_output_ids
    beam_search_ids = _get_beam_ids('x1')

    def _get_greedy_story(context_name):

        greedy_res, greedy_len = decoder(
            decoding_strategy='infer_greedy',
            embedding=_embedding_fn,
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            mode=tf.estimator.ModeKeys.PREDICT)

        greedy_ids = tx.utils.varlength_roll(greedy_res.sample_id, -batch['%s_len' % context_name])
        greedy_ids_len = greedy_len - batch['%s_len' % context_name]
        greedy_ids = greedy_ids[:, :tf.reduce_max(greedy_ids_len)]

        return greedy_ids, greedy_ids_len
    greedy_ids, greedy_len = _get_greedy_story('x1')

    trainable_variables = tx.utils.collect_trainable_variables(
        [word_embedder, pos_embedder, decoder])

    global_step = tf.Variable(0, trainable=False)
    opt = tx.core.get_optimizer(
        global_step=global_step,
        hparams=config_train.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=None,
        optimizer=opt,
        variables=trainable_variables)


    ## Train/eval/test routine
    saver = tf.train.Saver()
    saver_best = tf.train.Saver(max_to_keep=1)
    dev_best = {
        'loss': 1e8, 'loss_fine': 1e8}


    def _log_losses(losses, step=None):
        loss_str = 'loss: %.4f, loss_fine: %.4f' % \
            (losses['loss'], losses['loss_fine'])

        if step is not None:
            loss_str = 'step: %d, %s' % (step, loss_str)

        _log(loss_str)

    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess, initial=False):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        while True:
            try:
                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)


                # (2) Optimize loss
                feed_dict = {
                    #x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }

                fetches = {
                    'train_op': train_op,
                    'step': global_step,
                }
                fetches.update(loss_dict)

                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_train.display_steps

                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    _log_losses(rets, step)

                eval_steps = config_train.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _dev_epoch(sess)
                sample_steps = config_train.sample_steps
                if _is_head() and sample_steps > 0 and step % sample_steps == 0:
                    print('-----------testing-----------------')
                    _test_epoch(sess, step=step)

                ckpt_steps = config_train.checkpoint_steps
                if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0:
                    ckpt_fn = os.path.join(output_dir, 'model.ckpt')
                    ckpt_fn = saver.save(sess, ckpt_fn, global_step=step)
                    _log('Checkpoint to {}'.format(ckpt_fn))

            except tf.errors.OutOfRangeError:
                break

    def _dev_epoch(sess):
        """Evaluates on the dev set.
        """
        iterator.restart_dataset(sess, 'dev')

        results = tx.utils.AverageRecorder()
        nsamples = 0
        fetches = {}
        fetches.update(loss_dict)
        # i = 0

        while True:
            try:

                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'dev'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)


                # (2) eval loss
                feed_dict = {
                    #x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }

                rets = sess.run(fetches, feed_dict)

                results.add(rets, weight=rets_data['batch_size'])
                nsamples += rets_data['batch_size']
            except tf.errors.OutOfRangeError:
                break

        _log_losses(results.avg())
        _log('nsamples: %d' % nsamples)

        avg_loss = results.avg('loss')
        if FLAGS.do_train and avg_loss < dev_best['loss']:
            dev_best.update(results.avg())
            ckpt_fn = os.path.join(output_dir, 'model_best.ckpt')
            ckpt_fn = saver_best.save(sess, ckpt_fn)
            _log('Checkpoint best to {}'.format(ckpt_fn))

    def _test_epoch(sess, step=None):
        """Generates samples on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_inputs = []
        _all_samples = []
        _all_loss = []

        # if FLAGS.finetune and FLAGS.roc:
        #     raise ValueError('Cannot set --finetune and --roc at the same time')
        if FLAGS.finetune:
            _log('Generation input: x1')
            if FLAGS.greedy:
                fetches = {
                    'inputs': batch['x1_ids'],
                    'length': batch['x1_len'],
                    'samples_length': greedy_len,
                    'samples': greedy_ids
                }
            elif FLAGS.beam:
                fetches = {
                    'inputs': batch['x1_ids'],
                    'length': batch['x1_len'],
                    # 'samples_length': x4_len_fine,
                    'samples': beam_search_ids
                }
            else:
                fetches = {
                    'inputs': batch['x1_ids'],
                    'length': batch['x1_len'],
                    'samples_length': x4_len_fine,
                    'samples': x4_ids_fine,
                    'sample_loss': x4_loss_fine,
                    'outputs': batch['x1x4_ids'],
                    'out_length': batch['x1x4_len']
                }
            res_fn_appendix = "x1"

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets = sess.run(fetches, feed_dict=feed_dict)

                # ! ----
                _inputs = []
                for i, l in zip(rets['inputs'], rets['length']):
                    # Delete padding
                    _inputs.append(i[:l].tolist())
                _all_inputs.extend(_inputs)

                _samples = []

                if not FLAGS.beam:
                    for s, l in zip(rets['samples'], rets['samples_length']):
                        _samples.append(s[:l].tolist())

                else:
                    _samples.extend(h.tolist() for h in rets['samples'])
                    _samples = utils.list_strip_eos(_samples, eos_token=proc.encoder['<|endoftext|>'])
                _all_samples.extend(_samples)
                # ----!

                _loss = []
                if not FLAGS.bpe_loss:
                    for los in rets["sample_loss"]:
                        _loss.append(los)
                else:
                    _loss = [rets["sample_loss"]]

                _all_loss.extend(_loss)


            except tf.errors.OutOfRangeError:
                break

        # Parse samples and write to file

        eos_token_id = proc.encoder['<|endoftext|>']

        # !----
        _all_input_text = []
        for i in _all_inputs:
            if i[0] == eos_token_id:
                i = i[1:]
            i_text = proc.decode(i)
            _all_input_text.append(i_text)
        _all_input_text = tx.utils.strip_eos(_all_input_text,
                                             eos_token='<|endoftext|>')

        _all_samples_text = []
        for j, (i, s) in enumerate(zip(_all_inputs, _all_samples)):
            s_text = proc.decode(s)
            s_text = s_text.replace('\n', ' ')
            # print(s_text)
            _all_samples_text.append(s_text)
            if j % 1000 == 0:
                print("{} stories is process of total {}".format(j, len(_all_inputs)))

        _all_samples_text = tx.utils.strip_eos(_all_samples_text,
                                               eos_token='<|endoftext|>')

        if step is None:
            fn = "test_samples_%s_sample_k%d.tsv" % (res_fn_appendix, FLAGS.top_k)
        else:
            fn = "test_samples_%s_%d_beam.tsv" % (res_fn_appendix, step)
        output_file = os.path.join(output_dir, fn)
        _log('Write samples to {}'.format(output_file))
        if not FLAGS.beam:
            tx.utils.write_paired_text(
                _all_input_text, _all_samples_text, output_file)
            with open(output_file[:-4]+".txt", 'w') as f:
                for item in _all_samples_text:
                    f.write("%s\n" % item.strip(" | "))
        else:
            with open(output_file, 'w') as f:
                for item in _all_samples_text:
                    f.write("%s\n" % item)
        # ----!

        if FLAGS.ppl:
            if not FLAGS.bpe_loss:
                # load target file
                target = [i.strip().split() for i in open("emotion_evaluation/baselines/ground-truth/ground_truth_story-processed.txt")]
                for j, (txt, los) in enumerate(zip(target, _all_loss)):
                    _all_loss[j] = los/len(txt)

                np.save(os.path.join(output_dir, "test_loss_word.npy"), np.array(_all_loss))
                avg_loss = np.mean(np.array(_all_loss))
                ppl = np.exp(avg_loss)
                msg = 'test_loss (per word): %.4f, test_perplexity: %.4f' % \
                    (avg_loss, ppl)
            else:
                avg_loss = np.mean(np.array(_all_loss))
                ppl = np.exp(avg_loss)
                msg = 'test_loss (bpe): %.4f, test_perplexity: %.4f' % \
                    (avg_loss, ppl)

            _log(msg)


    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

#        smry_writer = tf.summary.FileWriter(FLAGS.output_dir, graph=sess.graph)

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        if FLAGS.checkpoint:
            _log('Restore from {}'.format(FLAGS.checkpoint))
            saver.restore(sess, FLAGS.checkpoint)
        elif FLAGS.pretrain_checkpoint:
            _log('Restore from {}'.format(FLAGS.pretrain_checkpoint))
            model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
            print("\nFinished loading\n")
            saver.save(sess, output_dir + '/gpt2_model.ckpt')




        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for epoch in range(config_train.max_train_epoch):
                print("Training epoch {}".format(epoch))
                _train_epoch(sess, epoch==0)
            saver.save(sess, output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _dev_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
Exemple #55
0
def do_train(model):
    batch = args.batch
    total_batch = batch * hvd.size()

    if args.fake:
        data = FakeData([[batch, 224, 224, 3], [batch]],
                        1000,
                        random=False,
                        dtype=['uint8', 'int32'])
        data = StagingInput(QueueInput(data))
        callbacks = []
        steps_per_epoch = 50
    else:
        logger.info("#Tower: {}; Batch size per tower: {}".format(
            hvd.size(), batch))
        zmq_addr = 'ipc://@imagenet-train-b{}'.format(batch)
        if args.no_zmq_ops:
            dataflow = RemoteDataZMQ(zmq_addr, hwm=150, bind=False)
            data = QueueInput(dataflow)
        else:
            data = ZMQInput(zmq_addr, 30, bind=False)
        data = StagingInput(data)

        steps_per_epoch = int(np.round(1281167 / total_batch))

    BASE_LR = 0.1 * (total_batch // 256)
    """
    ImageNet in 1 Hour, Sec 2.1:
    Linear Scaling Rule: When the minibatch size is
    multiplied by k, multiply the learning rate by k.
    """
    logger.info("Base LR: {}".format(BASE_LR))
    callbacks = [
        ModelSaver(max_to_keep=10),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                    (35, BASE_LR * 1e-1),
                                                    (70, BASE_LR * 1e-2),
                                                    (95, BASE_LR * 1e-3)])
    ]
    """
    Feature Denoising, Sec 5:
    Our models are trained for a total of
    110 epochs; we decrease the learning rate by 10× at the 35-
    th, 70-th, and 95-th epoch
    """
    max_epoch = 110

    if BASE_LR > 0.1:
        callbacks.append(
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 0.1),
                                       (5 * steps_per_epoch, BASE_LR)],
                                      interp='linear',
                                      step_based=True))
        """
        ImageNet in 1 Hour, Sec 2.2:
        we start from a learning rate of η and increment it by a constant amount at
        each iteration such that it reaches η̂ = kη after 5 epochs
        """

    if not args.fake:
        # add distributed evaluation for the various attackers that we care about.
        def add_eval_callback(name, attacker, condition):
            cb = create_eval_callback(
                name,
                model.get_inference_func(attacker),
                # always eval in the last 2 epochs no matter what
                lambda epoch_num: condition(epoch_num) or epoch_num > max_epoch
                - 2)
            callbacks.append(cb)

        add_eval_callback('eval-clean', NoOpAttacker(), lambda e: True)
        add_eval_callback(
            'eval-10step',
            PGDAttacker(10, args.attack_epsilon, args.attack_step_size),
            lambda e: True)
        add_eval_callback(
            'eval-50step',
            PGDAttacker(50, args.attack_epsilon, args.attack_step_size),
            lambda e: e % 20 == 0)
        add_eval_callback(
            'eval-100step',
            PGDAttacker(100, args.attack_epsilon, args.attack_step_size),
            lambda e: e % 10 == 0 or e > max_epoch - 5)
        for k in [20, 30, 40, 60, 70, 80, 90]:
            add_eval_callback(
                'eval-{}step'.format(k),
                PGDAttacker(k, args.attack_epsilon, args.attack_step_size),
                lambda e: False)

    trainer = HorovodTrainer(average=True)
    trainer.setup_graph(model.get_input_signature(), data, model.build_graph,
                        model.get_optimizer)
    trainer.train_with_defaults(callbacks=callbacks,
                                steps_per_epoch=steps_per_epoch,
                                session_init=SmartInit(args.load),
                                max_epoch=max_epoch,
                                starting_epoch=args.starting_epoch)
Exemple #56
0
 def test_horovod_size(self):
     """Test that the size returned by hvd.size() is correct."""
     _, true_size = mpi_env_rank_and_size()
     hvd.init()
     size = hvd.size()
     assert true_size == size
Exemple #57
0
import os
import json
import time

from math import pi
from typing import Optional, Tuple

import numpy as np
import tensorflow as tf

from tensorflow.python.keras import backend as K

try:
    import horovod.tensorflow as hvd

    NUM_RANKS = hvd.size()
    NUM_WORKERS = NUM_RANKS * hvd.local_size()
    HAS_HOROVOD = True
    print(f'hvd.size : {hvd.size()}')
    print(f'hvd.local_size: {hvd.local_size()}')

except (ImportError, ModuleNotFoundError):
    NUM_RANKS = 1
    NUM_WORKERS = NUM_RANKS
    HAS_HOROVOD = False

import utils.file_io as io

from config import BIN_DIR
from lattice.gauge_lattice import GaugeLattice
from utils.attr_dict import AttrDict
Exemple #58
0
def main(device, input_path_test, downsampling_fact, downsampling_mode,
         channels, data_format, label_id, weights, image_dir, checkpoint_dir,
         output_graph_file, tst_sz, loss_type, model, decoder, fs_type, batch,
         batchnorm, dtype, scale_factor, predmode):
    #init horovod
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size))

    #downsampling? recompute image dimensions
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=2,  #1
        intra_op_parallelism_threads=33,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    test_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    tst_data = load_data(input_path_test,
                         shuffle=False,
                         max_files=tst_sz,
                         use_horovod=False)
    if comm_rank == 0:
        print("Shape of tst_data is {}".format(tst_data.shape[0]))
        print("done.")

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Decoder: {}".format(decoder))
        print("Batch normalization: {}".format(batchnorm))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Num test samples: {}".format(tst_data.shape[0]))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = tst_data.shape[0] // comm_local_size
    else:
        num_samples = tst_data.shape[0] // comm_size

    with test_graph.as_default():
        #create readers
        tst_reader = h5_input_reader(input_path_test,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id,
                                     read_labels=(not predmode))
        #create datasets
        if fs_type == "local":
            tst_dataset = create_dataset(tst_reader,
                                         tst_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            tst_dataset = create_dataset(tst_reader,
                                         tst_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        if not predmode:
            #in evaluation mode, issue data, label, weight and filename
            iterator = tf.data.Iterator.from_string_handle(
                handle, (dtype, tf.int32, dtype, tf.string),
                ((batch, len(channels), image_height_orig,
                  image_width_orig) if data_format == "channels_first" else
                 (batch, image_height_orig, image_width_orig, len(channels)),
                 (batch, image_height_orig, image_width_orig),
                 (batch, image_height_orig, image_width_orig), (batch)))
        else:
            #in prediction mode, just issue data and filename
            iterator = tf.data.Iterator.from_string_handle(
                handle, (dtype, tf.string),
                ((batch, len(channels), image_height_orig,
                  image_width_orig) if data_format == "channels_first" else
                 (batch, image_height_orig, image_width_orig, len(channels)),
                 (batch)))
        next_elem = iterator.get_next()

        print(next_elem[0].shape, next_elem[1].shape)

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                if not predmode:
                    next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                                tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                  [1, downsampling_fact, downsampling_fact, 1], \
                                                                                  [1, downsampling_fact, downsampling_fact, 1], \
                                                                                  [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                                                                                  tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                                                                                  next_elem[3])
                else:
                    next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                                 next_elem[1])

            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order: get_next() yields a tuple, so
                #convert it to a list before writing the transposed tensor back
                if data_format == "channels_first":
                    next_elem = list(next_elem)
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 2, 3, 1])

                #crop
                if not predmode:
                    next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                                 ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                                 tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                                 next_elem[3])
                else:
                    next_elem = (tf.image.crop_and_resize(
                        next_elem[0],
                        boxes,
                        box_ind,
                        crop_size,
                        method='bilinear',
                        extrapolation_value=0,
                        name="data_cropping"), next_elem[1])

                #be careful with data order: next_elem was rebuilt as a tuple
                #above, so convert it to a list again before the in-place write
                if data_format == "channels_first":
                    next_elem = list(next_elem)
                    next_elem[0] = tf.transpose(next_elem[0],
                                                perm=[0, 3, 1, 2])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #tst
        tst_iterator = tst_dataset.make_initializable_iterator()
        tst_handle_string = tst_iterator.string_handle()
        tst_init_op = iterator.make_initializer(tst_dataset)

        print(next_elem[0].shape, next_elem[1])

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        #set up model
        model = deeplab_v3_plus_generator(num_classes=3,
                                          output_stride=8,
                                          base_architecture=model,
                                          decoder=decoder,
                                          batchnorm=batchnorm,
                                          pre_trained_model=None,
                                          batch_norm_decay=None,
                                          data_format=data_format)

        logit, prediction = model(next_elem[0], True, dtype)

        #cast the logits to fp32
        logit = ensure_type(logit, tf.float32)

        if not predmode:
            #set up loss
            loss = None
            if loss_type == "weighted":
                #cast weights to FP32
                w_cast = ensure_type(next_elem[2], tf.float32)
                loss = tf.losses.sparse_softmax_cross_entropy(
                    labels=next_elem[1],
                    logits=logit,
                    weights=w_cast,
                    reduction=tf.losses.Reduction.SUM)
                if scale_factor != 1.0:
                    loss *= scale_factor

            elif loss_type == "weighted_mean":
                #cast weights to FP32
                w_cast = ensure_type(next_elem[2], tf.float32)
                loss = tf.losses.sparse_softmax_cross_entropy(
                    labels=next_elem[1],
                    logits=logit,
                    weights=w_cast,
                    reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
                if scale_factor != 1.0:
                    loss *= scale_factor

            elif loss_type == "focal":
                #one-hot-encode
                labels_one_hot = tf.contrib.layers.one_hot_encoding(
                    next_elem[1], 3)
                #cast to FP32
                labels_one_hot = ensure_type(labels_one_hot, tf.float32)
                loss = focal_loss(onehot_labels=labels_one_hot,
                                  logits=logit,
                                  alpha=1.,
                                  gamma=2.)

            else:
                raise ValueError("Error, loss type {} not supported.",
                                 format(loss_type))

            #set up streaming metrics
            iou_op, iou_update_op = tf.metrics.mean_iou(
                labels=next_elem[1],
                predictions=tf.argmax(prediction, axis=3),
                num_classes=3,
                weights=None,
                metrics_collections=None,
                updates_collections=None,
                name="iou_score")
            iou_reset_op = tf.variables_initializer([
                i for i in tf.local_variables()
                if i.name.startswith('iou_score/')
            ])

        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #create image dir if not exists
        if not os.path.isdir(image_dir):
            os.makedirs(image_dir)

        #start session
        with tf.Session(config=sess_config) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            load_model(sess, tf.train.Saver(), checkpoint_dir)
            #create iterator handles
            tst_handle = sess.run(tst_handle_string)
            #init iterators
            sess.run(tst_init_op, feed_dict={handle: tst_handle})

            #remove training nodes
            if output_graph_file:
                print(
                    "Storing inference graph to {}.".format(output_graph_file))
                inference_graph_def = tf.graph_util.remove_training_nodes(
                    sess.graph_def, protected_nodes=None)
                #save the inference graph
                with open(output_graph_file, 'wb') as ogf:
                    ogf.write(inference_graph_def.SerializeToString())

            #start inference
            eval_loss = 0.
            eval_steps = 0
            print("Starting evaluation on test set")
            while True:
                try:
                    if not predmode:
                        #construct feed dict
                        _, tmp_loss, tst_model_predictions, tst_model_labels, tst_model_filenames = sess.run(
                            [
                                iou_update_op, loss, prediction, next_elem[1],
                                next_elem[3]
                            ],
                            feed_dict={handle: tst_handle})
                    else:
                        tst_model_predictions, tst_model_filenames = sess.run(
                            [prediction, next_elem[1]],
                            feed_dict={handle: tst_handle})

                    #print some images
                    if have_imsave:
                        for i in range(tst_model_predictions.shape[0]):
                            suf = '{}_rank{}_{}.png'.format(
                                eval_steps, comm_rank, i)
                            imsave(
                                image_dir + '/test_pred_estep' + suf,
                                np.argmax(tst_model_predictions[i, ...],
                                          axis=-1) * 100)
                            if not predmode:
                                imsave(image_dir + '/test_label_estep' + suf,
                                       tst_model_labels[i, ...] * 100)
                                imsave(
                                    image_dir + '/test_combined_estep' + suf,
                                    plot_colormap[
                                        tst_model_labels[i, ...],
                                        np.argmax(tst_model_predictions[i,
                                                                        ...],
                                                  axis=-1)])
                    else:
                        if not predmode:
                            np.savez(
                                image_dir + '/test_estep' + str(eval_steps) +
                                '_rank' + str(comm_rank) + '.npz',
                                prediction=np.argmax(
                                    tst_model_predictions[...], axis=-1) * 100,
                                label=tst_model_labels[...] * 100,
                                filename=tst_model_filenames)
                        else:
                            np.savez(
                                image_dir + '/test_estep' + str(eval_steps) +
                                '_rank' + str(comm_rank) + '.npz',
                                prediction=np.argmax(
                                    tst_model_predictions[...], axis=-1) * 100,
                                filename=tst_model_filenames)

                    #update loss
                    if not predmode:
                        eval_loss += tmp_loss
                    eval_steps += 1

                except tf.errors.OutOfRangeError:
                    eval_steps = np.max([eval_steps, 1])
                    if not predmode:
                        eval_loss /= eval_steps
                        print("COMPLETED: evaluation loss is {}".format(
                            eval_loss))
                        iou_score = sess.run(iou_op)
                        print("COMPLETED: evaluation IoU is {}".format(
                            iou_score))
                    break
Exemple #59
0
    # Construct the variables for the NCE loss
    # (initializer assumed from the standard TensorFlow word2vec example this snippet follows)
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
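    # A quick reading of the call above: each example is scored against its
    # true label plus num_sampled randomly drawn negative classes (resampled
    # every evaluation, per the comment above), giving a per-example loss of
    # shape [batch_size] that tf.reduce_mean collapses to a scalar batch loss.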

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())
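    # For example, a hypothetical 4-rank job would use an effective rate of
    # 1.0 * 4 = 4.0, the same linear scaling with worker count as above.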

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    train_op = optimizer.minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
Exemple #60
0
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))

with tf.device(device):
    # Warm-up
    log('Running warmup...')
    benchmark_step(first_batch=True)
    timeit.timeit(lambda: benchmark_step(first_batch=False),
                  number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time