def main(hps):
    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Download and load dataset.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
def get_data(hps, sess):
    if hps.image_size == -1:
        hps.image_size = {'mnist': 32, 'cifar10': 32, 'imagenet-oord': 64,
                          'imagenet': 256, 'celeba': 256, 'lsun_realnvp': 64,
                          'lsun': 256}[hps.problem]
    if hps.n_test == -1:
        hps.n_test = {'mnist': 10000, 'cifar10': 10000, 'imagenet-oord': 50000,
                      'imagenet': 50000, 'celeba': 3000,
                      'lsun_realnvp': 300 * hvd.size(),
                      'lsun': 300 * hvd.size()}[hps.problem]
    hps.n_y = {'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
               'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1,
               'lsun': 1}[hps.problem]
    if hps.data_dir == "":
        hps.data_dir = {'mnist': None, 'cifar10': None,
                        'imagenet-oord': '/mnt/host/imagenet-oord-tfr',
                        'imagenet': '/mnt/host/imagenet-tfr',
                        'celeba': '/mnt/host/celeba-reshard-tfr',
                        'lsun_realnvp': '/mnt/host/lsun_realnvp',
                        'lsun': '/mnt/host/lsun'}[hps.problem]

    if hps.problem == 'lsun_realnvp':
        hps.rnd_crop = True
    else:
        hps.rnd_crop = False

    if hps.category:
        hps.data_dir += ('/%s' % hps.category)

    # Use anchor_size to rescale batch size based on image_size
    s = hps.anchor_size
    hps.local_batch_train = hps.n_batch_train * \
        s * s // (hps.image_size * hps.image_size)
    hps.local_batch_test = {64: 50, 32: 25, 16: 10, 8: 5, 4: 2, 2: 2, 1: 1}[
        hps.local_batch_train]  # round down to closest divisor of 50
    hps.local_batch_init = hps.n_batch_init * \
        s * s // (hps.image_size * hps.image_size)
    print("Rank {} Batch sizes Train {} Test {} Init {}".format(
        hvd.rank(), hps.local_batch_train, hps.local_batch_test,
        hps.local_batch_init))

    if hps.problem in ['imagenet-oord', 'imagenet', 'celeba',
                       'lsun_realnvp', 'lsun']:
        hps.direct_iterator = True
        import data_loaders.get_data as v
        train_iterator, test_iterator, data_init = \
            v.get_data(sess, hps.data_dir, hvd.size(), hvd.rank(), hps.pmap,
                       hps.fmap, hps.local_batch_train, hps.local_batch_test,
                       hps.local_batch_init, hps.image_size, hps.rnd_crop)
    elif hps.problem in ['mnist', 'cifar10']:
        hps.direct_iterator = False
        import data_loaders.get_mnist_cifar as v
        train_iterator, test_iterator, data_init = \
            v.get_data(hps.problem, hvd.size(), hvd.rank(), hps.dal,
                       hps.local_batch_train, hps.local_batch_test,
                       hps.local_batch_init, hps.image_size)
    else:
        raise Exception()

    return train_iterator, test_iterator, data_init
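# The anchor_size rescaling above keeps per-GPU memory roughly constant across
# resolutions: the local batch shrinks quadratically as the image grows. A
# quick standalone check of that arithmetic (all values hypothetical):

s, n_batch_train = 32, 64
for image_size in (32, 64, 128):
    local_batch_train = n_batch_train * s * s // (image_size * image_size)
    print(image_size, local_batch_train)  # 32 -> 64, 64 -> 16, 128 -> 4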
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try
    to perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
    one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
    gpu_config = tf.ConfigProto(gpu_options=one_gpu)
    with self.test_session(config=gpu_config) as session:
        with tf.device(device):
            # Same tensor on every rank, but even ranks reduce on GPU
            # while odd ranks reduce on CPU.
            dims = [17] * 3
            tensor = tf.ones(dims, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
def test_horovod_broadcast(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            try:
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                self.assertTrue(
                    session.run(tf.reduce_all(tf.equal(
                        tf.cast(root_tensor, tf.int32),
                        tf.cast(broadcasted_tensor, tf.int32)))),
                    "hvd.broadcast produces incorrect broadcasted tensor")
            except Exception:
                import traceback
                traceback.print_exc()
def test_horovod_allreduce_cpu(self):
    """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    with self.test_session() as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce produces incorrect results")
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(
                dtypes, dims, root_ranks):
            tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)

            grad_ys = tf.ones([5] * dim)
            grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def test_horovod_allreduce_grad(self):
    """Test the correctness of the allreduce gradient."""
    hvd.init()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [5] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)

            grad_ys = tf.ones([5] * dim)
            grad = tf.gradients(summed, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            expected = np.ones([5] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same rank, different dimension
        tf.set_random_seed(1234)
        dims = [17 + rank] * 3
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))

        # Same number of elements, different rank
        tf.set_random_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def adam2_old(params, cost_or_grads, lr=3e-4, mom1=0.9, mom2=0.999,
              epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    # all-reduce
    grads1 = [Z.allreduce_mean(g) for g in gs]
    grads2 = [Z.allreduce_mean(tf.square(g)) for g in gs]
    mom2 = tf.maximum(0., 1. - (hvd.size() * (1 - mom2)))

    # Note: the second positional argument of tf.Variable is `trainable`,
    # not `name`, so these string arguments do not actually name the
    # variables (kept as in the original code).
    t = tf.Variable(1., 'adam_t')
    lr_t = lr * tf.sqrt((1. - tf.pow(mom2, t))) / (1. - tf.pow(mom1, t))
    updates.append(t.assign_add(1))

    for p, g1, g2 in zip(params, grads1, grads2):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g1
            updates.append(v.assign(v_t))
        else:
            v_t = g1
        mg_t = mom2 * mg + (1. - mom2) * g2
        delta_t = v_t / (tf.sqrt(mg_t) + epsilon)
        p_t = p - lr_t * delta_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    return tf.group(*updates)
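# A note on the mom2 rescaling above: averaging squared gradients across
# hvd.size() workers already smooths the second-moment estimate, so the decay
# is sharpened proportionally to keep the effective averaging horizon fixed.
# A minimal sketch of that formula in plain Python (sizes hypothetical):

def effective_mom2(mom2, size):
    # Mirrors tf.maximum(0., 1. - size * (1 - mom2)) in adam2_old above.
    return max(0.0, 1.0 - size * (1.0 - mom2))

for size in (1, 8, 64):
    print(size, effective_mom2(0.999, size))  # 0.999, 0.992, 0.936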
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=20000 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean,
         hvd.size() * img_sec_conf))
def init_config():
    if config.TRAINER == 'horovod':
        ngpu = hvd.size()
    else:
        ngpu = get_num_gpu()
    assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu

    if config.NUM_GPUS is None:
        config.NUM_GPUS = ngpu
    else:
        if config.TRAINER == 'horovod':
            assert config.NUM_GPUS == ngpu
        else:
            assert config.NUM_GPUS <= ngpu
    print_config()
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor = tf.ones([17] * 3, dtype=tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, rank))
def on_batch_begin(self, batch, logs=None):
    if self.current_epoch > self.warmup_epochs:
        # Outside of adjustment scope.
        return

    if self.current_epoch == self.warmup_epochs and batch > 0:
        # Outside of adjustment scope, final adjustment is done on first batch.
        return

    old_lr = K.get_value(self.model.optimizer.lr)
    epoch = self.current_epoch + float(batch) / self.steps_per_epoch
    new_lr = self.initial_lr / hvd.size() * \
        (epoch * (hvd.size() - 1) / self.warmup_epochs + 1)
    K.set_value(self.model.optimizer.lr, new_lr)

    if self.current_epoch == self.warmup_epochs and self.verbose:
        print('Epoch %d: finished gradual learning rate warmup to %s.' %
              (epoch + 1, new_lr))

    if hasattr(self.model.optimizer, 'momentum') and self.momentum_correction:
        # See the paper cited above for more information about momentum
        # correction.
        self.restore_momentum = K.get_value(self.model.optimizer.momentum)
        K.set_value(self.model.optimizer.momentum,
                    self.restore_momentum * new_lr / old_lr)
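# The warmup above ramps linearly from initial_lr / hvd.size() at epoch 0 to
# the full initial_lr once warmup_epochs is reached. A standalone sketch of
# the trajectory, with hypothetical values for initial_lr, size, and
# warmup_epochs:

initial_lr, size, warmup_epochs = 0.1, 8, 5
for epoch in (0.0, 2.5, 5.0):
    lr = initial_lr / size * (epoch * (size - 1) / warmup_epochs + 1)
    print(epoch, lr)  # 0.0125 -> 0.05625 -> 0.1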
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_size = sum(tensor_sizes)
            self.assertEqual(list(gathered_tensor.shape),
                             [expected_size] + [17] * (dim - 1))

            for i in range(size):
                rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                rank_tensor = tf.slice(
                    gathered, [sum(tensor_sizes[:i])] + [0] * (dim - 1),
                    rank_size)
                self.assertEqual(list(rank_tensor.shape), rank_size)
                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2
                self.assertTrue(
                    session.run(tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                    "hvd.allgather produces incorrect gathered tensor")
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being
    broadcasted differ among the processes."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    two_gpus = tf.GPUOptions(visible_device_list=(
        '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
    gpu_config = tf.ConfigProto(gpu_options=two_gpus)
    with self.test_session(config=gpu_config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce on GPU produces incorrect results")
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same shape on every rank, but alternating dtypes across ranks.
        dims = [17] * 3
        tensor = tf.ones(dims,
                         dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def init_by_config(self, config):
    """
    :param Config.Config config:
    """
    logs = config.list('log', [])
    log_verbosity = config.int_list('log_verbosity', [])
    log_format = config.list('log_format', [])
    if config.is_true("use_horovod"):
        # noinspection PyPackageRequirements,PyUnresolvedReferences
        import horovod.tensorflow as hvd
        from TFUtil import init_horovod
        init_horovod()  # make sure it is initialized
        new_logs = []
        for fn in logs:
            fn_prefix, fn_ext = os.path.splitext(fn)
            fn_ext = ".horovod-%i-%i%s" % (hvd.rank(), hvd.size(), fn_ext)
            new_logs.append(fn_prefix + fn_ext)
        logs = new_logs
    self.initialize(logs=logs, verbosity=log_verbosity, formatter=log_format)
def test_horovod_allreduce_gpu_fused(self):
    """Test that the allreduce works on GPUs with Tensor Fusion.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            test = max_difference <= threshold
            tests.append(test)
        self.assertTrue(session.run(tf.reduce_all(tests)),
                        "hvd.allreduce produces incorrect results")
def get_its(hps):
    # These run for a fixed amount of time. As the anchored batch is smaller,
    # we've actually seen fewer examples.
    train_its = int(np.ceil(hps.n_train / (hps.n_batch_train * hvd.size())))
    test_its = int(np.ceil(hps.n_test / (hps.n_batch_train * hvd.size())))
    train_epoch = train_its * hps.n_batch_train * hvd.size()

    # Do a full validation run
    if hvd.rank() == 0:
        print(hps.n_test, hps.local_batch_test, hvd.size())
    assert hps.n_test % (hps.local_batch_test * hvd.size()) == 0
    full_test_its = hps.n_test // (hps.local_batch_test * hvd.size())

    if hvd.rank() == 0:
        print("Train epoch size: " + str(train_epoch))
    return train_its, test_its, full_test_its
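# A quick sanity check of the iteration counts above with hypothetical numbers
# (n_train=50000, n_batch_train=64, local_batch_test=50, n_test=10000, 4
# workers):

import numpy as np

n_train, n_batch_train, local_batch_test, n_test, size = 50000, 64, 50, 10000, 4
train_its = int(np.ceil(n_train / (n_batch_train * size)))  # 196 iterations
assert n_test % (local_batch_test * size) == 0
full_test_its = n_test // (local_batch_test * size)         # 50 iterations
print(train_its, full_test_its)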
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                grad_list.append(g)
            grad_ys = tf.concat(grad_list, axis=0)

            grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            expected = np.ones(
                [tensor_sizes[rank]] + [17] * (dim - 1)
            ) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    # [3] because we build FPN with features r2,r3,r4,r5
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult

    if is_training:
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu

        if _C.TRAIN.NUM_GPUS is None:
            _C.TRAIN.NUM_GPUS = ngpu
        else:
            if _C.TRAINER == 'horovod':
                assert _C.TRAIN.NUM_GPUS == ngpu
            else:
                assert _C.TRAIN.NUM_GPUS <= ngpu
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    logger.info("Config: ------------------------------------------\n" + str(_C))
def get_gradients(self, loss, params):
    """
    Compute gradients of all trainable variables.

    See Optimizer.get_gradients() for more info.

    In DistributedOptimizer, get_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    gradients = super(self.__class__, self).get_gradients(loss, params)
    if hvd.size() > 1:
        averaged_gradients = []
        with tf.name_scope(self._name + "_Allreduce"):
            for grad in gradients:
                if grad is not None:
                    avg_grad = hvd.allreduce(grad,
                                             device_dense=self._device_dense,
                                             device_sparse=self._device_sparse)
                    averaged_gradients.append(avg_grad)
                else:
                    averaged_gradients.append(None)
        return averaged_gradients
    else:
        return gradients
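# In user code, wrapping is all that is needed to get this averaging behavior.
# A minimal Keras sketch (hedged: `model` and the data pipeline are assumed to
# be defined elsewhere):

import keras
import horovod.keras as hvd

hvd.init()
# Scale the learning rate by the number of workers, then wrap the optimizer so
# every gradient is allreduce-averaged across workers before being applied.
opt = hvd.DistributedOptimizer(keras.optimizers.SGD(lr=0.01 * hvd.size()))
model.compile(loss='categorical_crossentropy', optimizer=opt,
              metrics=['accuracy'])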
def get_current_step_learning_rate(self):
    """
    :rtype: tf.Tensor
    """
    lr = self.learning_rate_var
    if self.config.typed_dict.get("dynamic_learning_rate"):
        # To implement any kind of cyclic learning rate during the epoch.
        # E.g.: https://arxiv.org/abs/1608.03983
        with tf.name_scope("dynamic_learning_rate"):
            from Util import CollectionReadCheckCovered
            opts = CollectionReadCheckCovered(
                self.config.typed_dict["dynamic_learning_rate"])
            # Currently all intervals of same step size.
            interval_steps = tf.constant(
                opts["interval"], name="interval",
                dtype=self.network.global_train_step.dtype)
            step_in_interval = tf.mod(self.network.global_train_step,
                                      interval_steps, name="step_in_interval")
            factor = tf.pow(
                tf.constant(opts["decay"], name="decay", dtype=tf.float32),
                tf.to_float(step_in_interval, name="step_in_interval_float"),
                name="factor")
            lr *= factor
        opts.assert_all_read()
    if self.config.is_true("use_horovod") and self.config.is_true("horovod_scale_lr"):
        # noinspection PyPackageRequirements,PyUnresolvedReferences
        import horovod.tensorflow as hvd
        lr *= hvd.size()
    return lr
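# Given the options read above ("interval" and "decay"), a config could enable
# a per-interval exponential decay like the following sketch (values
# hypothetical); combined with horovod_scale_lr, the decayed rate is then
# multiplied by hvd.size():

use_horovod = True
horovod_scale_lr = True
dynamic_learning_rate = {"interval": 1000, "decay": 0.99}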
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = tf.ones([17] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            self.assertEqual(list(gathered_tensor.shape),
                             [17 * size] + [17] * (dim - 1))

            for i in range(size):
                rank_tensor = tf.slice(gathered_tensor,
                                       [i * 17] + [0] * (dim - 1),
                                       [17] + [-1] * (dim - 1))
                self.assertEqual(list(rank_tensor.shape), [17] * dim)
                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2
                self.assertTrue(
                    session.run(tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                    "hvd.allgather produces incorrect gathered tensor")
            loss, metrics = evaluate_step(samples)
            if batch % self.hparams.log_interval == 0 and hvd.local_rank() == 0:
                logging.info(self.metric_checker(loss, metrics, -2))
            loss_metric.update_state(loss)
        if hvd.local_rank() == 0:
            logging.info(self.metric_checker(loss_metric.result(), metrics,
                                             evaluate_epoch=epoch))
        self.model.reset_metrics()
        return loss_metric.result(), metrics


if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) < 2:
        logging.warning('Usage: python {} config_json_file'.format(sys.argv[0]))
        sys.exit()
    tf.random.set_seed(1)

    json_file = sys.argv[1]
    # config = None
    # with open(json_file) as f:
    #     config = json.load(f)
    # p = parse_config(config)
    HorovodSolver.initialize_devices()
    # multi-server training should use hvd.rank()
    train(json_file, HorovodSolver, hvd.size(), hvd.rank())
def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "cla": ClaProcessor,
        "pair": PairProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25,
        session_config=config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        num_train_steps = num_train_steps // hvd.size()
        num_warmup_steps = num_warmup_steps // hvd.size()

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on. These do NOT count towards the metric
            # (all tf.metrics support a per-instance weight, and these get a
            # weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        # Evaluate all checkpoints; you can use the checkpoint with the best
        # dev accuracy.
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        print("output_eval_file:", output_eval_file)
        tf.logging.info("output_eval_file:" + output_eval_file)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for global_step, filename in sorted(steps_and_files,
                                                key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)
                tf.logging.info("***** Eval results %s *****" % (filename))
                writer.write("***** Eval results %s *****\n" % (filename))
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        true_labels = []
        with open(os.path.join(FLAGS.data_dir, "test.tsv"), 'r',
                  encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                true_labels.append(int(line.split('\t')[0]))

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        predictions = []
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                a = probabilities.tolist()
                predictions.append(a.index(max(a)))
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        count = 0
        for i in range(len(predictions)):
            if predictions[i] == true_labels[i]:
                count += 1
        print("Average accuracy: ", count / len(predictions))

        with open(os.path.join(FLAGS.data_dir, "id2label.json"), 'r',
                  encoding='utf-8') as f:
            ld2label = json.load(f)
        cla_labels = [i for i in range(FLAGS.cla_nums)]
        report = metrics.classification_report(
            y_true=true_labels,
            y_pred=predictions,
            labels=cla_labels,
            target_names=[ld2label[str(i)].split()[0] for i in cla_labels],
            digits=4)
        confusion_matrix = metrics.confusion_matrix(y_true=true_labels,
                                                    y_pred=predictions,
                                                    labels=cla_labels)
        print(report)
        print(confusion_matrix)
        with open(os.path.join(FLAGS.output_dir, "eval_report.txt"), 'w',
                  encoding='utf-8') as f:
            f.write(report)
def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)
def parallax_run_hybrid(single_gpu_meta_graph_def, config):
    # Initialize horovod
    hvd.init()

    # worker_id = hvd.rank()
    local_worker_id = hvd.local_rank()
    num_workers = hvd.size()
    machine_id, hostname = _get_worker_info()
    create_profile_directory(config.profile_config.profile_dir,
                             config.profile_config.profile_worker,
                             config.resource_info, hostname)

    sess_config = config.sess_config
    if sess_config is None:
        sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    cluster_spec = get_tf_clusterspec_for_hybrid(config.resource_info)
    worker_id = 0
    for i in range(machine_id):
        worker_id += len(config.resource_info['worker'][i]['gpus'])
    worker_id += hvd.local_rank()

    if config.profile_config.profile_dir:
        for ps_i, ps in enumerate(config.resource_info['ps']):
            if ps['hostname'] == hostname:
                if local_worker_id == 0:
                    tasks = ['ps:%d' % ps_i, 'worker:%d' % worker_id]
                else:
                    tasks = ['worker:%d' % worker_id]
                append_task_info(config.profile_config.profile_dir,
                                 hostname, tasks)
                break

    server = tf.train.Server(
        cluster_spec, job_name='worker', task_index=worker_id,
        protocol=config.communication_config.ps_config.protocol,
        config=sess_config)

    meta_graph_def, tensor_or_op_name_to_replica_names = \
        graph_transform_hybrid(single_gpu_meta_graph_def, worker_id,
                               local_worker_id, machine_id, hostname, config)

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)
        tf.train.import_meta_graph(meta_graph_def)
        if config.export_graph_path:
            export_meta_graph(config.export_graph_path, worker_id)
        if config.profile_config.profile_dir:
            path = os.path.join(config.profile_config.profile_dir, hostname,
                                'worker:%d' % worker_id)
            export_meta_graph(path, worker_id)
            if worker_id != config.profile_config.profile_worker:
                # Only one CUPTI profiler can run in a machine. See
                # tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
                config.profile_config.profile_dir = None
            else:
                config.profile_config.profile_dir = \
                    os.path.join(config.profile_config.profile_dir, hostname,
                                 'worker:%d' % worker_id, 'run_meta')

        ckpt_hooks = \
            build_ckpt_hooks(config.get_ckpt_config()) \
            if worker_id == 0 else None

        sess = tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=True,
            checkpoint_dir=config.get_ckpt_config().ckpt_dir
            if worker_id == 0 else None,
            # TODO: Allow user-defined hooks
            hooks=None,
            chief_only_hooks=ckpt_hooks,
            save_checkpoint_secs=None,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=sess_config)

    parallax_log.debug(
        "Created MonitoredTrainingSession for worker %d" % worker_id)
    _init_global_vars(sess)
    parallax_log.debug(
        "Finished initialization process, start training on worker %d"
        % worker_id)

    step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0])
    sess_context = \
        ParallaxSessionContext(step,
                               config.profile_config.profile_dir,
                               config.profile_config.profile_steps,
                               config.profile_config.profile_range,
                               tensor_or_op_name_to_replica_names,
                               1)
    sess_context.set_parallax_session_context()
    return sess, num_workers, worker_id, 1
        session_init=get_model_loader(args.load),
        input_names=MODEL.get_inference_tensor_names()[0],
        output_names=MODEL.get_inference_tensor_names()[1])

    if args.predict:
        predictor = OfflinePredictor(predcfg)
        for image_file in args.predict:
            do_predict(predictor, image_file)
    elif args.evaluate:
        assert args.evaluate.endswith('.json'), args.evaluate
        do_evaluate(predcfg, args.evaluate)
else:
    is_horovod = cfg.TRAINER == 'horovod'
    if is_horovod:
        hvd.init()
        logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))

    if not is_horovod or hvd.rank() == 0:
        logger.set_logger_dir(args.logdir, 'd')
        logger.info("Environment Information:\n" + collect_env_info())

    finalize_configs(is_training=True)

    stepnum = cfg.TRAIN.STEPS_PER_EPOCH

    # warmup is step based, lr is epoch based
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
    warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]

    factor = 8. / cfg.TRAIN.NUM_GPUS
def main(args):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = \
        keras.datasets.mnist.load_data("MNIST-data-%d" % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape
    # it into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    if not args.use_only_cpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        estimator_config = tf.estimator.RunConfig(session_config=config)
    else:
        estimator_config = None

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = args.model_dir if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir, config=estimator_config)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn, steps=args.num_steps // hvd.size(),
        hooks=[bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 28, 28, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 7, 7, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 64]
    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024,
                            activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4,
        training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by
        # the `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Horovod: scale learning rate by the number of workers.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=0.001 * hvd.size(), momentum=0.9)

        # Horovod: add Horovod Distributed Optimizer.
        optimizer = hvd.DistributedOptimizer(optimizer)

        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels,
                                        predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      eval_metric_ops=eval_metric_ops)
def multiplier(epoch):
    # Adjust epoch to produce round numbers at the end of each epoch, so that
    # TensorBoard learning rate graphs look better.
    epoch += 1. / self.steps_per_epoch
    return 1. / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
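# This multiplier expresses the same warmup ramp as on_batch_begin above, but
# as a scale factor on the base learning rate rather than an absolute value.
# A hedged sketch of driving a plain Keras schedule with the same formula
# (base_lr, size, and warmup_epochs are hypothetical; Horovod's own
# LearningRateWarmupCallback packages this logic for you):

import keras

base_lr, size, warmup_epochs = 0.1, 8, 5

def warmup_schedule(epoch):
    # Ramp from base_lr / size up to base_lr over warmup_epochs epochs.
    if epoch >= warmup_epochs:
        return base_lr
    return base_lr / size * (epoch * (size - 1) / warmup_epochs + 1)

lr_callback = keras.callbacks.LearningRateScheduler(warmup_schedule)
# model.fit(..., callbacks=[lr_callback])  # attach alongside the Horovod hooks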
def main(_):
    """
    Builds the model and runs.
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    # Loads GPT-2 model configuration
    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    # Creates a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder(FLAGS.pretrain_model_dir)

    max_decoding_length = config_train.max_decoding_length
    assert max_decoding_length <= gpt2_config.position_size, (
        "max_decoding_length should not be greater than position_size. "
        "{}>{}".format(max_decoding_length, gpt2_config.position_size))

    # Loads data
    # Configures training data shard in distributed mode
    if FLAGS.distributed:
        config_train.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_train.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_train.train_hparam["batch_size"] //= hvd.size()

    datasets = {}
    if FLAGS.do_train:
        train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam)
        datasets['train'] = train_dataset
    if FLAGS.do_eval:
        dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam)
        datasets['dev'] = dev_dataset
    if FLAGS.do_test:
        test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam)
        datasets['test'] = test_dataset
    iterator = tx.data.FeedableDataIterator(datasets)
    batch = iterator.get_next()
    batch_size = tf.shape(batch['text_ids'])[0]

    # Builds the GPT-2 model
    word_embedder = tx.modules.WordEmbedder(vocab_size=gpt2_config.vocab_size,
                                            hparams=gpt2_config.embed)

    pos_embedder = tx.modules.PositionEmbedder(
        position_size=gpt2_config.position_size,
        hparams=gpt2_config.pos_embed)

    # Ties output layer with input word embedding
    output_layer = tf.transpose(word_embedder.embedding, (1, 0))

    decoder = tx.modules.TransformerDecoder(vocab_size=gpt2_config.vocab_size,
                                            output_layer=output_layer,
                                            hparams=gpt2_config.decoder)

    # For training
    seq_len = tf.fill([batch_size], tf.shape(batch['text_ids'])[1])
    pos_embeds = pos_embedder(sequence_length=seq_len)
    input_embeds = word_embedder(batch['text_ids']) + pos_embeds

    outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy')

    loss = tx.losses.sequence_sparse_softmax_cross_entropy(
        labels=batch['text_ids'][:, 1:],
        logits=outputs.logits[:, :-1, :],
        sequence_length=batch['length'] - 1,
        average_across_timesteps=True,
        sum_over_timesteps=False)
    ppl = tf.exp(loss)

    global_step = tf.Variable(0, trainable=False)
    opt = tx.core.get_optimizer(global_step=global_step,
                                hparams=config_train.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt)

    # For generation: generates continuations of test text
    def _embedding_fn(x, y):
        # `x` is token ids, `y` is time steps
        return word_embedder(x) + pos_embedder(y)

    end_token = proc.encoder['<|endoftext|>']
    start_tokens = batch['text_ids'][:, 0]
    helper = tx.modules.TopKSampleEmbeddingHelper(
        embedding=_embedding_fn,
        start_tokens=start_tokens,
        end_token=end_token,
        top_k=FLAGS.top_k,
        softmax_temperature=FLAGS.temperature)

    outputs_infer, _ = decoder(context=batch['text_ids'],
                               context_sequence_length=batch['length'],
                               max_decoding_length=max_decoding_length,
                               helper=helper)
    sample_id = outputs_infer.sample_id

    # Train/eval/test routine
    saver = tf.train.Saver()
    saver_best = tf.train.Saver(max_to_keep=1)
    dev_best = {'loss': 1e8, 'ppl': 1e8}
    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        fetches = {'loss': train_op, 'step': global_step}

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_train.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f' % (step, rets['loss']))

                eval_steps = config_train.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _dev_epoch(sess)

                ckpt_steps = config_train.checkpoint_steps
                if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0:
                    ckpt_fn = os.path.join(FLAGS.output_dir, 'model.ckpt')
                    ckpt_fn = saver.save(sess, ckpt_fn, global_step=step)
                    tf.logging.info('Checkpoint to {}'.format(ckpt_fn))
            except tf.errors.OutOfRangeError:
                break

    def _dev_epoch(sess):
        """Evaluates on the dev set.
        """
        iterator.restart_dataset(sess, 'dev')

        cum_loss = 0.
        cum_ppl = 0.
        nsamples = 0
        fetches = {
            'loss': loss,
            'ppl': ppl,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'dev'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)

                cum_loss += rets['loss'] * rets['batch_size']
                cum_ppl += rets['ppl'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        avg_loss = cum_loss / nsamples
        avg_ppl = cum_ppl / nsamples
        tf.logging.info('dev loss: {}; ppl: {}; nsamples: {}'.format(
            avg_loss, avg_ppl, nsamples))

        if FLAGS.do_train and avg_loss < dev_best['loss']:
            dev_best['loss'] = avg_loss
            dev_best['ppl'] = avg_ppl
            ckpt_fn = os.path.join(FLAGS.output_dir, 'model_best.ckpt')
            ckpt_fn = saver_best.save(sess, ckpt_fn)
            tf.logging.info('Checkpoint best to {}'.format(ckpt_fn))

    def _test_epoch(sess):
        """Generates samples on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_inputs = []
        _all_samples = []

        fetches = {
            'inputs': batch['text_ids'],
            'length': batch['length'],
            'samples': sample_id
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets = sess.run(fetches, feed_dict=feed_dict)

                _inputs = []
                for i, l in zip(rets['inputs'], rets['length']):
                    # Delete padding
                    _inputs.append(i[:l].tolist())
                _all_inputs.extend(_inputs)

                _samples = []
                for s, l in zip(rets['samples'], rets['length']):
                    # Delete inputs from samples
                    _samples.append(s[l:].tolist())
                _all_samples.extend(_samples)
            except tf.errors.OutOfRangeError:
                break

        # Parse samples and write to file
        eos_token_id = proc.encoder['<|endoftext|>']

        _all_input_text = []
        for i in _all_inputs:
            if i[0] == eos_token_id:
                # '<|endoftext|>' is used as the BOS token. Delete it here.
                i = i[1:]
            i_text = proc.decode(i)
            _all_input_text.append(i_text)
        # '<|endoftext|>' is used as the PAD token. Delete them here.
        _all_input_text = tx.utils.strip_eos(_all_input_text,
                                             eos_token='<|endoftext|>')

        _all_samples_text = []
        for i, s in zip(_all_inputs, _all_samples):
            s_text = proc.decode(s)
            s_text = s_text.replace('\n', ' ')
            _all_samples_text.append(s_text)
        _all_samples_text = tx.utils.strip_eos(_all_samples_text,
                                               eos_token='<|endoftext|>')

        output_file = os.path.join(FLAGS.output_dir, "test_samples.tsv")
        tf.logging.info('Write samples to {}'.format(output_file))
        tx.utils.write_paired_text(_all_input_text, _all_samples_text,
                                   output_file)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        if FLAGS.checkpoint:
            tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
            saver.restore(sess, FLAGS.checkpoint)
        elif FLAGS.pretrain_checkpoint:
            tf.logging.info('Restore from {}'.format(
                FLAGS.pretrain_checkpoint))
            model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
            print("\nFinished loading\n")

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for _ in range(config_train.max_train_epoch):
                _train_epoch(sess)
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _dev_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
def on_state_reset():
    # Re-derive the worker-count-scaled learning rate after the number of
    # workers changes.
    optimizer.lr.assign(lr * hvd.size())
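# This callback shape matches Horovod's elastic-training API: reset hooks
# re-derive worker-count-dependent values after the job grows or shrinks. A
# minimal sketch of registering it (hedged: `model`, `optimizer`, and `lr` are
# assumed to exist in the surrounding scope):

import horovod.tensorflow.keras as hvd

state = hvd.elastic.KerasState(model, batch=0, epoch=0)
state.register_reset_callbacks([on_state_reset])

@hvd.elastic.run
def train(state):
    # Training resumes from the synchronized state after a membership change;
    # the body (e.g. a model.fit loop) is omitted in this sketch.
    pass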
def main(_):
    '''Main routine for Horovod Tensorflow Mnist example.'''
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(hvd.local_rank()))
    config = tf.ConfigProto(gpu_options=gpu_options)

    batch_size = 100

    # Download and load MNIST dataset.
    if hvd.rank() == 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)
    # hvd.allreduce(tf.constant([0]), average=False)  # Barrier (not working)
    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd_keras.allreduce([0], name="Barrier")
    if hvd.rank() != 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # Build model...
    # with tf.name_scope('input'):
    #     image = tf.placeholder(tf.float32, [None, 784], name='image')
    #     label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    # global_step = tf.contrib.framework.get_or_create_global_step()
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step,
                                            'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            # image_, label_ = mnist.train.next_batch(100)
            # mon_sess.run(train_op, feed_dict={image: image_, label: label_})
            mon_sess.run(train_op)
import tensorflow as tf
from ESN import EchoStateRNNCell
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from time import time
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()

mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0
# Shard the training set: each rank takes every size()-th example.
X_train, y_train = X_train[hvd.rank()::hvd.size()], y_train[hvd.rank()::hvd.size()]
print("MNIST shape", X_train.shape, X_test.shape)

if False:  # debug only: train on a small subset
    X_train = X_train[:10000]
    y_train = y_train[:10000]

# Pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto(intra_op_parallelism_threads=hvd.size(),
                        inter_op_parallelism_threads=hvd.size())
config.gpu_options.allow_growth = False  # grab the full GPU memory up front
config.gpu_options.visible_device_list = str(hvd.local_rank())
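# Hedged alternative to the numpy slicing above: the same every-size()-th
# sharding expressed with tf.data (dataset.shard keeps the elements whose
# index % num_shards == index, matching X_train[hvd.rank()::hvd.size()]):
import tensorflow as tf
import horovod.tensorflow as hvd

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.shard(num_shards=hvd.size(), index=hvd.rank())
dataset = dataset.shuffle(10000).batch(128)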
attacker = PGDAttacker( args.attack_iter, args.attack_epsilon, args.attack_step_size, prob_start_from_clean=0.2 if not args.eval else 0.0) if args.use_fp16xla: attacker.USE_FP16 = True attacker.USE_XLA = True model.set_attacker(attacker) os.system("nvidia-smi") hvd.init() if args.eval: sessinit = SmartInit(args.load) if hvd.size() == 1: # single-GPU eval, slow ds = get_val_dataflow(args.data, args.batch) eval_on_ILSVRC12(model, sessinit, ds) else: logger.info("CMD: " + " ".join(sys.argv)) cb = create_eval_callback("eval", model.get_inference_func(attacker), lambda e: True) trainer = HorovodTrainer() trainer.setup_graph(model.get_input_signature(), PlaceholderInput(), model.build_graph, model.get_optimizer) # train for an empty epoch, to reuse the distributed evaluation code trainer.train_with_defaults( callbacks=[cb],
def main(_):
    hvd.init()
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json
        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
        config = json.load(open(FLAGS.config_file, "r"))

        label_dict = json.load(open(FLAGS.label_id))
        label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkpoint==={}".format(init_checkpoint))
        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        # config.loss = "focal_loss"

        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)

        train_size = int(FLAGS.train_size / hvd.size())

        num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)
        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({"init_lr": (1e-5 / hvd.size()),
                            "num_train_steps": num_train_steps,
                            "num_warmup_steps": num_warmup_steps})

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        vib_config = {"kl_type": "original", "beta": 0.1}

        model_train_fn = bert_order_classifier.classifier_vib_model_fn_builder(
            config, num_choice, init_checkpoint,
            model_reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"],
            vib_config=vib_config,
            label_tensor=None)

        model_eval_fn = bert_order_classifier.classifier_vib_model_fn_builder(
            config, num_choice, init_checkpoint,
            model_reuse=True,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"],
            vib_config=vib_config,
            label_tensor=None)

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy, "loss": loss,
                    "pred_label": pred_label,
                    "label_ids": features["label_ids"]}

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 to int32.
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example params = Bunch({}) params.epoch = FLAGS.epoch params.batch_size = FLAGS.batch_size # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords", # _decode_record, name_to_features, params) # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords", # _decode_record, name_to_features, params) train_features = tf_data_utils.train_input_fn(FLAGS.train_file, _decode_record, name_to_features, params) eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file, _decode_record, name_to_features, params) [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN) [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) result = metric_fn(eval_features, eval_logits, eval_loss) model_io_fn.set_saver() init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) sess.run(hvd.broadcast_global_variables(0)) def eval_fn(result): i = 0 total_accuracy = 0 label, label_id = [], [] label_weight = [] while True: try: eval_result = sess.run(result) total_accuracy += eval_result["accuracy"] label_id.extend(eval_result["label_ids"]) label.extend(eval_result["pred_label"]) for item in eval_result["label_ids"]: label_weight.append(label_tensor[item]) i += 1 except tf.errors.OutOfRangeError: print("End of dataset") break f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight) accuracy = accuracy_score(label_id, label, sample_weight=label_weight) print("test accuracy accuracy {} {} f1 {}".format(total_accuracy/i, accuracy, f1)) return total_accuracy/ i, f1 def train_fn(op, loss): i = 0 cnt = 0 total_loss = 0.0 while True: try: [_, train_loss] = sess.run([op, loss]) total_loss += train_loss i += 1 cnt += 1 if np.mod(i, num_storage_steps) == 0: print(total_loss/cnt) # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc_{}.ckpt".format(int(i/8000))) if hvd.rank() == 0: model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc_{}.ckpt".format(int(i/num_storage_steps))) print("==successful storing model=={}".format(int(i/num_storage_steps))) total_loss = 0 cnt = 0 except tf.errors.OutOfRangeError: break print("===========begin to train============") train_fn(train_op, train_loss) print("===========begin to eval============") accuracy, f1 = eval_fn(result) print("==accuracy {} f1 {}==".format(accuracy, f1)) # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc.ckpt") if hvd.rank() == 0: model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc.ckpt")
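# Hedged sketch: eval_fn above reports accuracy computed on each rank's local
# shard only. A placeholder-fed allreduce (built once, outside the eval loop)
# lets every rank see the globally averaged value:
metric_ph = tf.placeholder(tf.float32, shape=[])
metric_avg = hvd.allreduce(metric_ph, average=True)

def global_average(sess, local_value):
    """Average a per-rank scalar metric across all Horovod ranks."""
    return sess.run(metric_avg, feed_dict={metric_ph: float(local_value)})

# e.g. global_acc = global_average(sess, total_accuracy / i)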
def parallel_train(training_dataset): import horovod.tensorflow as hvd hvd.init() # Horovod ds = training_dataset.shuffle(buffer_size=4096) ds = ds.shard(num_shards=hvd.size(), index=hvd.rank()) ds = ds.repeat(n_epoch) ds = ds.map(_map_fn, num_parallel_calls=4) ds = ds.batch(batch_size) ds = ds.prefetch(buffer_size=1) iterator = ds.make_one_shot_iterator() one_element = iterator.get_next() net, total_loss, log_tensors = make_model(*one_element, is_train=True, reuse=False) x_ = net.img # net input last_conf = net.last_conf # net output last_paf = net.last_paf # net output confs_ = net.confs # GT pafs_ = net.pafs # GT mask = net.m1 # mask1, GT # net.m2 = m2 # mask2, GT stage_losses = net.stage_losses l2_loss = net.l2_loss global_step = tf.Variable(1, trainable=False) # scaled_lr = lr_init * hvd.size() # Horovod: scale the learning rate linearly scaled_lr = lr_init # Linear scaling rule is not working in openpose training. with tf.variable_scope('learning_rate'): lr_v = tf.Variable(scaled_lr, trainable=False) opt = tf.train.MomentumOptimizer(lr_v, 0.9) opt = hvd.DistributedOptimizer(opt) # Horovod train_op = opt.minimize(total_loss, global_step=global_step) config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) config.gpu_options.allow_growth = True # Horovod config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod # Add variable initializer. init = tf.global_variables_initializer() # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. bcast = hvd.broadcast_global_variables(0) # Horovod # Horovod: adjust number of steps based on number of GPUs. 
global n_step, lr_decay_every_step
n_step = n_step // hvd.size() + 1  # Horovod
lr_decay_every_step = lr_decay_every_step // hvd.size() + 1  # Horovod

# Start training
with tf.Session(config=config) as sess:
    init.run()
    bcast.run()  # Horovod
    print('Worker{}: Initialized'.format(hvd.rank()))
    print('Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
          .format(hvd.rank(), n_step, batch_size, lr_init, lr_decay_every_step))

    # restore pre-trained weights
    try:
        # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
        tl.files.load_and_assign_npz_dict(sess=sess,
                                          name=os.path.join(model_path, 'pose.npz'))
    except Exception:
        print("no pre-trained model")

    # train until the end
    while True:
        step = sess.run(global_step)
        if step == n_step:
            break

        tic = time.time()
        if step != 0 and (step % lr_decay_every_step == 0):
            new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
            sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

        [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
            sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

        # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
        lr = sess.run(lr_v)
        print('Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
              .format(hvd.rank(), step, n_step, _loss, lr, _l2, time.time() - tic))
        for ix, ll in enumerate(_stage_losses):
            print('Worker{}:'.format(hvd.rank()), 'Network#', ix,
                  'For Branch', ix % 2 + 1, 'Loss:', ll)

        # save intermediate results and model
        if hvd.rank() == 0:  # Horovod
            if (step != 0) and (step % save_interval == 0):
                # save some results
                [img_out, confs_ground, pafs_ground, conf_result, paf_result, mask_out] = \
                    sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                draw_results(img_out, confs_ground, conf_result, pafs_ground,
                             paf_result, mask_out, 'train_%d_' % step)

                # save model
                # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                tl.files.save_npz_dict(net.all_params,
                                       os.path.join(model_path, 'pose' + str(step) + '.npz'),
                                       sess=sess)
                tl.files.save_npz_dict(net.all_params,
                                       os.path.join(model_path, 'pose.npz'),
                                       sess=sess)
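# Hedged aside: the manual decay in the loop above implements a staircase
# exponential schedule,
#   lr(step) = scaled_lr * lr_decay_factor ** (step // lr_decay_every_step),
# which TF can also express directly. The optimizer would then consume this
# tensor instead of the lr_v variable plus tf.assign:
lr_schedule = tf.train.exponential_decay(learning_rate=scaled_lr,
                                         global_step=global_step,
                                         decay_steps=lr_decay_every_step,
                                         decay_rate=lr_decay_factor,
                                         staircase=True)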
def main(_): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (x_train, y_train), (x_test, y_test) = \ keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank()) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. x_train = np.reshape(x_train, (-1, 784)) / 255.0 x_test = np.reshape(x_test, (-1, 784)) / 255.0 # Build model... with tf.name_scope('input'): image = tf.placeholder(tf.float32, [None, 784], name='image') label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=100) ] # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None training_batch_generator = train_input_generator(x_train, y_train, batch_size=100) # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. builder = option_builder.ProfileOptionBuilder opts1 = builder(builder.time_and_memory()).\ order_by('micros').\ with_max_depth(10).\ with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\ build() opts2 = builder.trainable_variables_parameter() # with profile_context.ProfileContext("./pctx", # trace_steps=range(100, 110), # dump_steps=[110]) as pctx: with profile_context.ProfileContext("./pctx") as pctx: pctx.add_auto_profiling('op', opts1, [800, 900, 1000]) pctx.add_auto_profiling('scope', opts2, [1000]) with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. image_, label_ = next(training_batch_generator) mon_sess.run(train_op, feed_dict={ image: image_, label: label_ }) pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)
def main(_): tf.get_logger().setLevel(logging.ERROR) hvd.init() FLAGS = PARSER.parse_args() backends = [StdOutBackend(Verbosity.DEFAULT)] if FLAGS.log_dir: backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)] DLLogger.init(backends=backends) os.environ['CUDA_CACHE_DISABLE'] = '0' os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ADJUST_HUE_FUSED'] = '1' os.environ['TF_ADJUST_SATURATION_FUSED'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' os.environ['TF_SYNC_ON_FINISH'] = '0' os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' os.environ['TF_DISABLE_NVTX_RANGES'] = '1' if hvd.rank() == 0: DLLogger.log(step=tuple(), data={"mixed_precision": "ENABLED" if FLAGS.use_amp else "DISABLED"}) dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir, 'dataset.json'), dst_size=FLAGS.input_shape, seed=FLAGS.seed, interpolator=FLAGS.resize_interpolator, data_normalization=FLAGS.data_normalization, batch_size=FLAGS.batch_size, train_split=FLAGS.train_split, split_seed=FLAGS.split_seed) FLAGS.labels = dataset.labels gpu_options = tf.GPUOptions() config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True) config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) run_config = tf.estimator.RunConfig( save_summary_steps=None, save_checkpoints_steps=dataset.train_steps * FLAGS.train_epochs, save_checkpoints_secs=None, tf_random_seed=None, session_config=config, keep_checkpoint_max=1) estimator = tf.estimator.Estimator( model_fn=vnet_v2, model_dir=FLAGS.model_dir if hvd.rank() == 0 else None, config=run_config, params=FLAGS) train_hooks = [hvd.BroadcastGlobalVariablesHook(0)] if 'train' in FLAGS.exec_mode: steps = dataset.train_steps * FLAGS.train_epochs if FLAGS.benchmark: steps = FLAGS.warmup_steps * 2 if hvd.rank() == 0: train_hooks += [ProfilingHook(FLAGS.warmup_steps, FLAGS.batch_size * hvd.size(), DLLogger)] else: if hvd.rank() == 0: train_hooks += [TrainHook(FLAGS.log_every, DLLogger)] estimator.train( input_fn=lambda: dataset.train_fn(FLAGS.augment), steps=steps, hooks=train_hooks) if 'evaluate' in FLAGS.exec_mode: if hvd.rank() == 0: if FLAGS.train_split >= 1.0: raise ValueError("Missing argument: --train_split < 1.0") result = estimator.evaluate( input_fn=dataset.eval_fn, steps=dataset.eval_steps, hooks=[]) DLLogger.log(step=tuple(), data={'background_dice': result['background dice']}) DLLogger.log(step=tuple(), data={'anterior_dice': result['Anterior dice']}) DLLogger.log(step=tuple(), data={'posterior_dice': result['Posterior dice']}) if 'predict' in FLAGS.exec_mode: count = 1 hooks = [] if hvd.rank() == 0: if FLAGS.benchmark: count = math.ceil((FLAGS.warmup_steps * 2) / dataset.test_steps) hooks += [ProfilingHook(FLAGS.warmup_steps, FLAGS.batch_size * hvd.size(), DLLogger, training=False)] predictions = estimator.predict(input_fn=lambda: dataset.test_fn(count=count), hooks=hooks) pred = [p['prediction'] for p in predictions] predict_path = os.path.join(FLAGS.model_dir, 'predictions') if os.path.exists(predict_path): shutil.rmtree(predict_path) os.makedirs(predict_path) pickle.dump(pred, open(os.path.join(predict_path, 'predictions.pkl'), 'wb'))
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, use_multi_gpu): """Creates an optimizer training op.""" global_step = tf.train.get_or_create_global_step() learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) # Implements linear decay of the learning rate. learning_rate = tf.train.polynomial_decay(learning_rate, global_step, num_train_steps, end_learning_rate=0.0, power=1.0, cycle=False) # Implements linear warmup. I.e., if global_step < num_warmup_steps, the # learning rate will be `global_step/num_warmup_steps * init_lr`. if num_warmup_steps: global_steps_int = tf.cast(global_step, tf.int32) warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) global_steps_float = tf.cast(global_steps_int, tf.float32) warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) warmup_percent_done = global_steps_float / warmup_steps_float warmup_learning_rate = init_lr * warmup_percent_done is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) # Horovod: scale learning rate by the number of GPUs. if use_multi_gpu: learning_rate = learning_rate * hvd.size() # It is recommended that you use this optimizer for fine tuning, since this # is how the model was trained (note that the Adam m/v variables are NOT # loaded from init_checkpoint.) optimizer = AdamWeightDecayOptimizer( learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) if use_multi_gpu: optimizer = hvd.DistributedOptimizer(optimizer, compression=hvd.Compression.fp16, sparse_as_dense=True) if use_tpu: optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) tvars = tf.trainable_variables() if use_multi_gpu: grads_and_vars = optimizer.compute_gradients(loss, tvars) grads = [grad for grad, var in grads_and_vars] tvars = [var for grad, var in grads_and_vars] else: grads = tf.gradients(loss, tvars) # This is how the model was pre-trained. (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # Normally the global step update is done inside of `apply_gradients`. # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use # a different optimizer, you should probably take this line out. new_global_step = global_step + 1 train_op = tf.group(train_op, [global_step.assign(new_global_step)]) return train_op
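# Hedged restatement of the schedule built above as a pure function, to make
# the warmup/decay arithmetic easy to check (the numbers below are assumed
# values for illustration only):
def bert_lr(step, init_lr, num_train_steps, num_warmup_steps):
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps         # linear warmup
    frac = 1.0 - min(step, num_train_steps) / num_train_steps
    return init_lr * frac                                # linear decay to 0.0

# With init_lr=5e-5, num_train_steps=10000, num_warmup_steps=1000:
#   bert_lr(500, 5e-5, 10000, 1000)  -> 2.5e-05  (halfway through warmup)
#   bert_lr(5500, 5e-5, 10000, 1000) -> 2.25e-05 (45% of the decay remains)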
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.dist_train: effective_batch_size = int(FLAGS.train_batch_size / hvd.size()) if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], tf_records, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( effective_batch_size, tf_records, filter_amount=FLAGS.filter_amount, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True, seed=FLAGS.training_seed, dist_train=FLAGS.dist_train) hooks = [ UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir) ] if FLAGS.dist_train: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) steps = FLAGS.steps_to_train logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise
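# Hedged note on the arithmetic above: with --dist_train the *global* batch
# stays constant because each of hvd.size() workers steps with an equal slice.
def effective_batch_size(train_batch_size, dist_train, num_workers):
    # Mirrors the logic in train(): split the global batch across workers
    # when distributed training is enabled.
    return train_batch_size // num_workers if dist_train else train_batch_size

# effective_batch_size(256, True, 8) -> 32 examples per worker per step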
def BatchNormEdit(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
                  center=True, scale=True,
                  beta_initializer=tf.zeros_initializer(),
                  gamma_initializer=tf.ones_initializer(),
                  virtual_batch_size=None, data_format='channels_last',
                  ema_update='default', sync_statistics=None,
                  internal_update=None, bit_activation=2):
    """
    A more powerful version of `tf.layers.batch_normalization`. It differs
    from the official one in the following aspects:

    1. Accepts an alternative ``data_format`` option when ``axis`` is None.
       For 2D input, this argument will be ignored.
    2. Default values for ``momentum`` and ``epsilon`` are different.
    3. Default value for ``training`` is automatically obtained from
       tensorpack's ``TowerContext``. A user-provided value can overwrite
       this behavior.
    4. Supports the ``ema_update`` option, which covers broader use cases
       than the standard EMA update.
    5. Supports the ``sync_statistics`` option, which implements "SyncBN"
       and is very useful in small-batch models.

    Args:
        training (bool): if True, use per-batch statistics to normalize.
            Otherwise, use stored EMA to normalize. By default, it is equal
            to `get_current_tower_context().is_training`. This is not a good
            argument name, but it is what the TensorFlow layer uses.
        ema_update (str): Only effective when ``training=True``. It has the
            following options:

            * "default": same as "collection", because this is the default
              behavior in TensorFlow.
            * "skip": do not update EMA. This can be useful when you reuse a
              batch norm layer in several places but do not want them to all
              update your EMA.
            * "collection": add EMA update ops to collection
              `tf.GraphKeys.UPDATE_OPS`. The ops in the collection will be
              run automatically by the callback :class:`RunUpdateOps`, along
              with your training iterations. This can waste compute if your
              training iterations do not always depend on the BatchNorm
              layer.
            * "internal": EMA is updated inside this layer itself by control
              dependencies. In common cases, it has similar speed to
              "collection". But it covers more cases, e.g.:

              1. BatchNorm is used inside dynamic control flow. The
                 collection-based update does not support dynamic control
                 flows.
              2. The BatchNorm layer is sometimes unused (e.g., in GANs you
                 have two networks to train alternately). Putting all update
                 ops into a single collection will waste a lot of compute.
              3. Other parts of the model rely on the "updated" EMA. The
                 collection-based method does not update the EMA immediately.

              Corresponding TF issue:
              https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod". It
            determines how to compute the "per-batch statistics" when
            ``training==True``.

            * None: use statistics of the input tensor to normalize during
              training. This is the standard way BatchNorm was implemented
              in most frameworks.
            * "nccl": this layer must be used under tensorpack's multi-GPU
              trainers. It uses the aggregated statistics of the whole batch
              (across all GPUs) to normalize.
            * "horovod": this layer must be used under tensorpack's
              :class:`HorovodTrainer`. It uses the aggregated statistics of
              the whole batch (across all MPI ranks) to normalize. Note that
              on a single machine this is significantly slower than the
              "nccl" implementation.

            When not None, each GPU computes its own E[x] and E[x^2], which
            are then averaged among all GPUs to compute the global mean and
            variance. Therefore each GPU needs to have the same batch size.

            The synchronization is based on the current variable scope plus
            the name of the layer (`BatchNorm('name', input)`). Therefore,
            you need to make sure that:

            1. The BatchNorm layer on different GPUs needs to have the same
               name, so that statistics can be synchronized. If names do not
               match, this layer will hang.
            2. A BatchNorm layer cannot be reused within one tower.
            3. A BatchNorm layer needs to be executed the same number of
               times by all GPUs. If different GPUs execute one BatchNorm
               layer a different number of times (e.g., if some GPUs do not
               execute it), this layer may hang.

            This option is also known as "SyncBN" or "Cross-GPU BatchNorm"
            as mentioned in `MegDet: A Large Mini-Batch Object Detector
            <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue:
            https://github.com/tensorflow/tensorflow/issues/18222.

            When `sync_statistics` is enabled, `ema_update` is set to
            "internal" automatically. This is to avoid running `UPDATE_OPS`,
            which requires synchronization.
        internal_update: deprecated option. Don't use.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        This layer is more flexible than the standard "BatchNorm" layer and
        provides more features:

        1. No matter whether you're doing training or not, you can set the
           ``training`` argument to use batch statistics or EMA statistics,
           i.e., you can use batch statistics during inference, or use EMA
           statistics during training. Using EMA statistics in training is
           useful when you load a pre-trained BN and don't want to update it.
        2. As long as ``training=True``, the ``sync_statistics`` and
           ``ema_update`` options will take effect.
    """
    # parse training/ctx
    def get_quan_point():
        return np.array([(2**bit_activation - i + 0.5) / (2**bit_activation - 1)
                         for i in range(2**bit_activation, 1, -1)])

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)

    # parse shapes
    data_format = get_data_format(data_format, keras_mode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    assert ema_update in ["default", "collection", "internal", "skip"]
    if internal_update is not None:
        log_deprecated("BatchNorm(internal_update=)",
                       "Use ema_update='internal' instead!", "2020-01-01")
        assert ema_update == 'default', \
            "Do not use internal_update and ema_update together! internal_update is deprecated"
        ema_update = "internal" if internal_update else "collection"
    if ema_update == "default":
        ema_update = "collection"
    # Logic:
    # 1. EMA update is possible only when we compute batch statistics (training=True).
    # 2. We know that in training, the non-main training tower does not need the EMA
    #    update. We don't know what to do in a prediction context, so be conservative
    #    and do the update.
    # 3. The user can explicitly disable the update with "skip".
    do_ema_update = training and \
        (ctx.is_main_training_tower or not ctx.is_training) \
        and (ema_update != "skip")

    if axis is None:
        if ndims == 2:
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    assert axis in [1, 3], axis
    num_chan = shape[axis]

    TF_version = get_tf_version_tuple()

    freeze_bn_backward = not training and ctx.is_training
    if freeze_bn_backward:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics needs TF>=1.4!"
if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. do_sync_bn = (sync_statistics is not None) and training if not do_sync_bn: # Use the builtin layer for anything except for sync-bn coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): tf_args = dict( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, # https://github.com/tensorflow/tensorflow/issues/10857#issuecomment-410185429 fused=(ndims == 4 and axis in [1, 3] and not freeze_bn_backward), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" use_fp16 = inputs.dtype == tf.float16 if use_fp16: # non-fused does not support fp16; fused does not support all layouts. # we made our best guess here tf_args['fused'] = True if training: layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) else: layer = tf.layers.BatchNormalization(**tf_args) xnn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) i1 = inputs[0, 0, 0, :] i2 = inputs[1, 1, 1, :] x1 = xnn[0, 0, 0, :] x2 = xnn[1, 1, 1, :] mean0 = i1 - x1 * (i1 - i2) / (x1 - x2) var0 = (i1 - i2) / (x1 - x2) #quantize BN during inference print('in quantize BN') quan_points = get_quan_point() #add_moving_summary(tf.identity(quan_points[3],name='origin_quan_points_3')) quan_values = np.array([round((quan_points[i]-0.005)*(2**bit_activation-1))\ /(float(2**bit_activation-1)) for i in range(len(quan_points))]) quan_values = np.append(quan_values, np.array([1.]), axis=-1) moving_mean_ = tf.identity(mean0, name='moving_mean_') moving_mean_ = tf.expand_dims(moving_mean_, axis=-1) moving_var_ = tf.identity(var0, name='moving_var') moving_var_ = tf.expand_dims(moving_var_, axis=-1) quan_points = moving_var_ * quan_points + moving_mean_ b, w, h, c = inputs.shape inputs = tf.transpose(tf.reshape(inputs, [-1, c])) label1 = tf.cast(tf.less_equal( inputs, tf.expand_dims(quan_points[:, 0], axis=-1)), dtype=tf.float32) label2 = tf.cast(tf.math.logical_and(tf.math.less_equal(inputs,tf.expand_dims(quan_points[:,1],axis=-1)),\ tf.math.greater(inputs,tf.expand_dims(quan_points[:,0],axis=-1))),dtype=tf.float32) label3 = tf.cast(tf.math.logical_and(tf.math.less_equal(inputs,tf.expand_dims(quan_points[:,2],axis=-1)),\ tf.math.greater(inputs,tf.expand_dims(quan_points[:,1],axis=-1))),dtype=tf.float32) label4 = tf.cast(tf.math.greater( inputs, tf.expand_dims(quan_points[:, 2], axis=-1)), dtype=tf.float32) xn = label1*quan_values[0]+label2*quan_values[1]+label3*quan_values[2]+\ label4*quan_values[3] xn = tf.reshape(tf.transpose(xn), [-1, w, h, c]) # Add EMA variables to the correct collection if ctx.is_main_training_tower: for v in layer.non_trainable_variables: if isinstance(v, tf.Variable): tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) if not do_ema_update: restore_collection(coll_bk) if do_ema_update and ema_update == "internal": # Implement "internal" update. 
restore_collection(coll_bk) assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ( [0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': num_dev = ctx.total if num_dev == 1: logger.warn( "BatchNorm(sync_statistics='nccl') is used with only one tower!" ) else: assert six.PY2 or TF_version >= (1, 10), \ "Cross-GPU BatchNorm is only supported in TF>=1.10 ." \ "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360" if TF_version <= (1, 12): try: from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so except Exception: pass else: _validate_and_load_nccl_so() from tensorflow.contrib.nccl.ops import gen_nccl_ops else: from tensorflow.python.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd if hvd.size() == 1: logger.warn( "BatchNorm(sync_statistics='horovod') is used with only one process!" ) else: import horovod hvd_version = tuple(map(int, horovod.__version__.split('.'))) assert hvd_version >= ( 0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !" batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) if do_ema_update: ret = internal_update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
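# Hedged distillation of the sync_statistics='horovod' path above: each rank
# reduces its local E[x] and E[x^2], the values are averaged across ranks,
# and the global variance follows from Var[x] = E[x^2] - E[x]^2. All ranks
# must use the same per-GPU batch size for the average to be unbiased.
import tensorflow as tf
import horovod.tensorflow as hvd

def sync_moments(x, red_axis):
    batch_mean = tf.reduce_mean(x, axis=red_axis)
    batch_mean_square = tf.reduce_mean(tf.square(x), axis=red_axis)
    batch_mean = hvd.allreduce(batch_mean, average=True)
    batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
    return batch_mean, batch_mean_square - tf.square(batch_mean)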
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. Raises: RuntimeError: if both ckpt and backbone_ckpt are set. """ # Convert params (dict) to Config for easier access. training_hooks = None if params['data_format'] == 'channels_first': features = tf.transpose(features, [0, 3, 1, 2]) def _model_outputs(inputs): return model(inputs, config=hparams_config.Config(params)) cls_outputs, box_outputs = utils.build_model_with_precision( params['precision'], _model_outputs, features) levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. 
det_loss, cls_loss, box_loss, box_iou_loss = detection_loss( cls_outputs, box_outputs, labels, params) reg_l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + reg_l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/box_iou_loss', box_iou_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/reg_l2_loss', reg_l2loss) utils.scalar('trainloss/loss', total_loss) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if params['strategy'] == 'horovod': import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top learning_rate = learning_rate * hvd.size() if mode == tf.estimator.ModeKeys.TRAIN: if params['optimizer'].lower() == 'sgd': optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) elif params['optimizer'].lower() == 'adam': optimizer = tf.train.AdamOptimizer(learning_rate) else: raise ValueError('optimizers should be adam or sgd') if params['strategy'] == 'tpu': optimizer = tf.tpu.CrossShardOptimizer(optimizer) elif params['strategy'] == 'horovod': optimizer = hvd.DistributedOptimizer(optimizer) training_hooks = [hvd.BroadcastGlobalVariablesHook(0)] # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list) if params.get('clip_gradients_norm', 0) > 0: logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] clipped_grads, gnorm = tf.clip_by_global_norm( grads, params['clip_gradients_norm']) utils.scalar('gnorm', gnorm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] if params['strategy'] == 'tpu': batch_size = params['batch_size'] * params['num_shards'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) coco_metrics = coco_metric_fn( batch_size, anchor_labeler, params['val_json_file'], testdev_dir=params['testdev_dir'], disable_pyfun=params.get('disable_pyfun', None), **kwargs) else: logging.info('Eval val with groudtruths %s.', params['val_json_file']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to 
output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], } add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs) eval_metrics = (metric_fn, metric_fn_inputs) checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError( '--backbone_ckpt and --checkpoint are mutually exclusive') if params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, var_exclude_expr=params.get( 'var_exclude_expr', None)) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call( global_step, params), scaffold_fn=scaffold_fn, training_hooks=training_hooks)
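# Hedged distillation of the strategy == 'horovod' branch in the model_fn
# above, pulled out for readability (base_learning_rate and the momentum
# value stand in for the schedule and params computed earlier):
import tensorflow as tf
import horovod.tensorflow as hvd

learning_rate = base_learning_rate * hvd.size()         # linear scaling rule
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
optimizer = hvd.DistributedOptimizer(optimizer)         # allreduce on gradients
training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # sync initial weights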
def finalize_configs(is_training): """ Run some sanity checks, and populate some configs from others """ _C.freeze(False) # populate new keys now if isinstance(_C.DATA.VAL, six.string_types): # support single string (the typical case) as well _C.DATA.VAL = (_C.DATA.VAL, ) if isinstance(_C.DATA.TRAIN, six.string_types): # support single string _C.DATA.TRAIN = (_C.DATA.TRAIN, ) # finalize dataset definitions ... from dataset import DatasetRegistry datasets = list(_C.DATA.TRAIN) + list(_C.DATA.VAL) _C.DATA.CLASS_NAMES = DatasetRegistry.get_metadata(datasets[0], "class_names") _C.DATA.NUM_CATEGORY = len(_C.DATA.CLASS_NAMES) - 1 assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN', 'None'], _C.BACKBONE.NORM if _C.BACKBONE.NORM != 'FreezeBN': assert not _C.BACKBONE.FREEZE_AFFINE assert _C.BACKBONE.FREEZE_AT in [0, 1, 2] _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS) assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES) # image size into the backbone has to be multiple of this number _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3] # [3] because we build FPN with features r2,r3,r4,r5 if _C.MODE_FPN: size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1. _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint'] assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head') assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head') assert _C.FPN.NORM in ['None', 'GN'] if _C.FPN.CASCADE: # the first threshold is the proposal sampling threshold assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS) if is_training: train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100: # don't autotune if augmentation is on os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' os.environ['TF_AUTOTUNE_THRESHOLD'] = '1' assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER lr = _C.TRAIN.LR_SCHEDULE if isinstance(lr, six.string_types): if lr.endswith("x"): LR_SCHEDULE_KITER = { "{}x".format(k): [180 * k - 120, 180 * k - 40, 180 * k] for k in range(2, 10)} LR_SCHEDULE_KITER["1x"] = [120, 160, 180] _C.TRAIN.LR_SCHEDULE = [x * 1000 for x in LR_SCHEDULE_KITER[lr]] else: _C.TRAIN.LR_SCHEDULE = eval(lr) # setup NUM_GPUS if _C.TRAINER == 'horovod': import horovod.tensorflow as hvd ngpu = hvd.size() logger.info("Horovod Rank={}, Size={}, LocalRank={}".format( hvd.rank(), hvd.size(), hvd.local_rank())) else: assert 'OMPI_COMM_WORLD_SIZE' not in os.environ ngpu = get_num_gpu() assert ngpu > 0, "Has to train with GPU!" assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(ngpu) else: # autotune is too slow for inference os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' ngpu = get_num_gpu() if _C.TRAIN.NUM_GPUS is None: _C.TRAIN.NUM_GPUS = ngpu else: if _C.TRAINER == 'horovod': assert _C.TRAIN.NUM_GPUS == ngpu else: assert _C.TRAIN.NUM_GPUS <= ngpu _C.freeze() logger.info("Config: ------------------------------------------\n" + str(_C))
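# Worked example of the "{k}x" learning-rate shorthand in finalize_configs
# above (values are in thousands of iterations before the final *1000 scale):
#   "1x" -> [120, 160, 180]   LR drops at 120k and 160k, training ends at 180k
#   "2x" -> [240, 320, 360]   since 180*2-120=240, 180*2-40=320, 180*2=360
#   "3x" -> [420, 500, 540]
LR_SCHEDULE_KITER = {"{}x".format(k): [180 * k - 120, 180 * k - 40, 180 * k]
                     for k in range(2, 10)}
assert LR_SCHEDULE_KITER["2x"] == [240, 320, 360]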
def train_ffn(model_cls, **model_kwargs): with tf.Graph().as_default(): model = model_cls(**model_kwargs) # initialize the model eval_shape_zyx = train_eval_size(model).tolist( )[::-1] # size of the subvolume (within which the FOV moves) eval_tracker = EvalTracker( eval_shape_zyx) # computes summary statistics inside EFOV load_data_ops = define_data_input( model, queue_batch=1) # this creates a batch of training subvolumes prepare_ffn(model) # here the tf graph is defined merge_summaries_op = tf.summary.merge_all( ) # merges all summaries defined in the graph. if hvd.rank() == 0: save_flags() var_to_reduce = tf.placeholder(tf.float32) bcast_op = hvd.broadcast_global_variables(0) avg_op = hvd.allreduce(var_to_reduce, average=True) # Start supervisor. sv = tf.train.Supervisor( logdir=(FLAGS.train_dir if hvd.rank() == 0 else None), is_chief=True, saver=(tf.train.Saver(max_to_keep=FLAGS.max_to_keep, keep_checkpoint_every_n_hours=1) if hvd.rank() == 0 else None), save_model_secs=(FLAGS.save_model_secs if hvd.rank() == 0 else 0), summary_op=None, save_summaries_secs=0, # will perform custom summaries instead ) sess = sv.prepare_or_wait_for_session( FLAGS.master, config=tf.ConfigProto( log_device_placement=False, allow_soft_placement=True, intra_op_parallelism_threads=FLAGS.num_intra_threads, inter_op_parallelism_threads=FLAGS.num_inter_threads)) # broadcast initial weights. This ensures that all horovod ranks # start at the same point in parameter space if hvd.rank() == 0: print("broadcasting initial weights") sess.run(bcast_op) eval_tracker.sess = sess #--connect the eval tracker to the session eval_tracker.avg_op = avg_op fov_shifts = list(model.shifts) # x, y, z if FLAGS.shuffle_moves: random.shuffle( fov_shifts ) #--this will shuffle the FOV positions that make up an extended FOV (EFOV) policy_map = { 'fixed': partial(fixed_offsets, fov_shifts=fov_shifts), 'max_pred_moves': max_pred_offsets } # batch iterator for getting the next batch batch_it = get_batch(lambda: sess.run(load_data_ops), eval_tracker, model, FLAGS.batch_size, policy_map[FLAGS.fov_policy]) step = 0 t_last = time.time() if hvd.rank() == 0: timing = [] # list of times for benchmarking steps_since_last_summary = 0 if hvd.rank() == 0: print("starting training") while step < FLAGS.max_steps: time_step_start = time.time() if steps_since_last_summary == FLAGS.summary_every_steps: summ_op = merge_summaries_op steps_since_last_summary = 1 if hvd.rank() == 0: print("step ", step, "is a summary step") else: summ_op = None steps_since_last_summary += 1 # get the next batch - this is reading the data from disk. 
seed, patches, labels, weights = next(batch_it)

summaries = []

scaled_lr = FLAGS.learning_rate
if FLAGS.scaling_rule > 0:  # scale the learning rate linearly (1) or by sqrt (2)
    if FLAGS.scaling_rule == 1:
        scaled_lr *= hvd.size()
    elif FLAGS.scaling_rule == 2:
        scaled_lr *= np.sqrt(hvd.size())
    if step < FLAGS.warmup_steps:
        scaled_lr = FLAGS.learning_rate + (step / float(FLAGS.warmup_steps)) * \
            (scaled_lr - FLAGS.learning_rate)
if FLAGS.decay_learning_rate_fraction > 0:
    # constantly decay the learning rate using exponential decay
    scaled_lr *= FLAGS.decay_learning_rate_fraction**(step / FLAGS.decay_learning_rate_steps)

# run a training step on a SINGLE FOV
updated_seed, step, summ, my_loss = run_training_step(
    sess, model, summ_op,
    feed_dict={
        model.loss_weights: weights,
        model.labels: labels,
        model.offset_label: 'off',
        model.input_patches: patches,
        model.input_seed: seed,
        model.learning_rate: scaled_lr
    })

# compute the average loss across ranks
avg_loss = sess.run(avg_op, feed_dict={var_to_reduce: my_loss})

# Save prediction results in the original seed array so that
# they can be used in subsequent steps.
mask.update_at(seed, (0, 0, 0), updated_seed)  # updates the mask inside the subvolume batches

if hvd.rank() == 0:
    this_time = time.time() - time_step_start  # how long this step took
    timing.append(this_time)
    print("step %i took %.2f seconds" % (step - 1, this_time))

if summ is not None:
    summaries.append(tf.Summary.FromString(summ))  # summaries from the single FOV

    # Compute a loss over the whole training patch (i.e. more than a
    # single-step field of view of the network). This quantifies the
    # quality of the final object mask.
    tp, fp, tn, fn, num_patches = eval_tracker.get_summaries_scalar()
    tp_sum = hvd.size() * sess.run(avg_op, feed_dict={var_to_reduce: float(tp)})
    fp_sum = hvd.size() * sess.run(avg_op, feed_dict={var_to_reduce: float(fp)})
    tn_sum = hvd.size() * sess.run(avg_op, feed_dict={var_to_reduce: float(tn)})
    fn_sum = hvd.size() * sess.run(avg_op, feed_dict={var_to_reduce: float(fn)})
    avg_num_patches = sess.run(avg_op, feed_dict={var_to_reduce: float(num_patches)})

    accuracy = (tp_sum + tn_sum) / (tp_sum + fp_sum + tn_sum + fn_sum)
    precision = tp_sum / (tp_sum + fp_sum)
    recall = tp_sum / (tp_sum + fn_sum)
    f1 = 2.0 * precision * recall / (precision + recall)

    eval_tracker_summaries = [
        tf.Summary.Value(tag='eval/patches', simple_value=avg_num_patches),
        tf.Summary.Value(tag='eval/accuracy', simple_value=accuracy),
        tf.Summary.Value(tag='eval/precision', simple_value=precision),
        tf.Summary.Value(tag='eval/recall', simple_value=recall),
        tf.Summary.Value(tag='eval/f1', simple_value=f1)
    ]

    if hvd.rank() == 0:
        logging.info('Saving summaries.')
        summ = tf.Summary()  # initialize a TensorFlow summary
        summ.value.extend(eval_tracker_summaries)  # add EFOV metrics
        summ.value.extend(eval_tracker.get_summaries_images())  # add image summaries
        for s in summaries:
            summ.value.extend(s.value)  # add FOV metrics

        # other custom summary items:
        summ.value.extend([tf.Summary.Value(tag='avg_pixel_loss',
                                            simple_value=avg_loss)])   # avg pixel loss
        summ.value.extend([tf.Summary.Value(tag='learning_rate',
                                            simple_value=scaled_lr)])  # (scaled) learning rate
        summ.value.extend([tf.Summary.Value(tag='avg_time_per_step',
                                            simple_value=np.mean(timing))])
        print("avg time per step: ", np.mean(timing), np.std(timing))
        print("avg throughput: ", FLAGS.batch_size / np.mean(timing))
        timing = []  # reset the timing array

        sv.summary_computed(sess, summ, step)

    # reset the eval tracker before the next training step
    eval_tracker.reset()

if hvd.rank() == 0:
    print("all steps done!")
    if FLAGS.do_benchmark_test == 1:
        print("benchmark result: ")
        print("steps, batch_size, ranks, threads, mean, sigma:")
        string = str(FLAGS.max_steps) + "," + str(FLAGS.batch_size) + "," + \
            str(hvd.size()) + "," + str(FLAGS.nthreads) + "," + \
            str(np.mean(timing)) + "," + str(np.std(timing)) + "\n"
        print(string)
        with open(FLAGS.timelog, "a") as myfile:
            myfile.write(string)
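# Hedged aside: the snippet above reconstructs global sums as hvd.size()
# times the averaged value. A non-averaging allreduce returns the sum across
# ranks directly; built once next to avg_op, it could replace those
# multiplications:
sum_op = hvd.allreduce(var_to_reduce, average=False)
# tp_sum = sess.run(sum_op, feed_dict={var_to_reduce: float(tp)})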
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv

    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        steps_per_epoch = train_samples // batch_size
    else:
        steps_per_epoch = int(round(
            float(train_samples) / batch_size / hvdsize + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # : :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)
    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)
    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=callbacks) if hvd.rank() == 0: x = Input(shape=original_img_size) vae_val = make_vae(ldict, x) vae_val.compile(optimizer=opt, loss=None) loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size) print('\n\nVAE VALIDATION LOSS: {}'.format(loss)) x = Input(shape=original_img_size) z_mean, _ = get_encoded(ldict, x) encoder = Model(x, z_mean) # : :type encoder: Model decoder_input = Input(shape=(latent_dim,)) x_decoded_mean_squash = get_decoded(ldict, decoder_input) generator = Model(decoder_input, x_decoded_mean_squash) # : :type generator: Model # display a 2D plot of the digit classes in the latent space x_test_encoded = encoder.predict(x_test, batch_size=batch_size) plt.figure(figsize=(6, 6)) plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test) plt.colorbar() # plt.show() plt.savefig('vae_scatter.ps') plt.close() # display a 2D manifold of the digits n = 15 # figure with 15x15 digits digit_size = 28 figure = np.zeros((digit_size * n, digit_size * n)) # Linearly spaced coordinates on the unit square are transformed # through the inverse CDF (ppf) of the Gaussian to produce values of the # latent variables z, since the prior of the latent space is Gaussian grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) grid_y = norm.ppf(np.linspace(0.05, 0.95, n)) for i, yi in enumerate(grid_x): for j, xi in enumerate(grid_y): z_sample = np.array([[xi, yi]]) z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2) x_decoded = generator.predict(z_sample, batch_size=batch_size) digit = x_decoded[0].reshape(digit_size, digit_size) figure[i * digit_size: (i + 1) * digit_size, j * digit_size: (j + 1) * digit_size] = digit plt.figure(figsize=(10, 10)) plt.imshow(figure, cmap='Greys_r') # plt.show() plt.savefig('vae_digit.ps') plt.close() K.clear_session()
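# --- Added sketch: the two steps_per_epoch policies chosen above. With the
# 'imgspersec' speedup option every rank runs a full epoch's worth of steps
# (throughput benchmarking); otherwise one epoch of samples is split across
# ranks, rounded up. Hypothetical helper; names assumed.
import math

def steps_per_epoch(train_samples, batch_size, num_ranks, benchmark_throughput=False):
    if benchmark_throughput:
        return train_samples // batch_size  # each rank processes a full epoch
    # split one epoch across ranks, rounding up so no samples are dropped
    return int(math.ceil(train_samples / float(batch_size * num_ranks)))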
def main(_): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) tf.enable_eager_execution(config=config) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.GlobalAveragePooling2D(), tf.keras.layers.Dense(10) ]) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Make sure the Fetcher worked mnist_filename = 'mnist.npz' mnist_path = os.path.join(cache_dir, mnist_filename) if not os.path.isfile(mnist_path): raise FileNotFoundError("Dataset not found. Looked in " + mnist_path) (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data(path=mnist_filename) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))) dataset = dataset.shuffle(1000).batch(32) # Horovod: adjust number of steps based on number of GPUs. for (batch, (images, labels)) in enumerate(dataset.take(20000 // hvd.size())): with tf.GradientTape() as tape: logits = mnist_model(images, training=True) loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits) # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. if batch == 0: hvd.broadcast_variables(mnist_model.variables, root_rank=0) # Horovod: add Horovod Distributed GradientTape. tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(loss_value, mnist_model.variables) opt.apply_gradients(zip(grads, mnist_model.variables), global_step=tf.train.get_or_create_global_step()) if batch % 50 == 0 and hvd.local_rank() == 0: print('Step #%d\tLoss: %.6f' % (batch, loss_value)) emit({"batch": str(batch), "train_loss": "%.6f" % loss_value})
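# --- Added note: on Python 3 the mkdir race among workers that share a
# filesystem (handled above by catching errno.EEXIST) can be collapsed into a
# single call (sketch):
import os

cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
os.makedirs(cache_dir, exist_ok=True)  # no-op if another rank created it first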
def train(sess, model, hps, logdir, visualise): _print(hps) _print('Starting training. Logging to', logdir) _print('epoch n_processed n_images ips dtrain dtest dsample dtot train_results test_results msg') # Train sess.graph.finalize() n_processed = 0 n_images = 0 train_time = 0.0 test_loss_best = 999999 if hvd.rank() == 0: train_logger = ResultLogger(logdir + "train.txt", **hps.__dict__) test_logger = ResultLogger(logdir + "test.txt", **hps.__dict__) tcurr = time.time() for epoch in range(1, hps.epochs): t = time.time() train_results = [] for it in range(hps.train_its): # Set learning rate, linearly annealed from 0 in the first hps.epochs_warmup epochs. lr = hps.lr * min(1., n_processed / (hps.n_train * hps.epochs_warmup)) # Run a training step synchronously. _t = time.time() train_results += [model.train(lr)] if hps.verbose and hvd.rank() == 0: _print(n_processed, time.time()-_t, train_results[-1]) sys.stdout.flush() # Images seen wrt anchor resolution n_processed += hvd.size() * hps.n_batch_train # Actual images seen at current resolution n_images += hvd.size() * hps.local_batch_train train_results = np.mean(np.asarray(train_results), axis=0) dtrain = time.time() - t ips = (hps.train_its * hvd.size() * hps.local_batch_train) / dtrain train_time += dtrain if hvd.rank() == 0: train_logger.log(epoch=epoch, n_processed=n_processed, n_images=n_images, train_time=int( train_time), **process_results(train_results)) if epoch < 10 or (epoch < 50 and epoch % 10 == 0) or epoch % hps.epochs_full_valid == 0: test_results = [] msg = '' t = time.time() # model.polyak_swap() if epoch % hps.epochs_full_valid == 0: # Full validation run for it in range(hps.full_test_its): test_results += [model.test()] test_results = np.mean(np.asarray(test_results), axis=0) if hvd.rank() == 0: test_logger.log(epoch=epoch, n_processed=n_processed, n_images=n_images, **process_results(test_results)) # Save checkpoint if test_results[0] < test_loss_best: test_loss_best = test_results[0] model.save(logdir+"model_best_loss.ckpt") msg += ' *' dtest = time.time() - t # Sample t = time.time() if epoch == 1 or epoch == 10 or epoch % hps.epochs_full_sample == 0: visualise(epoch) dsample = time.time() - t if hvd.rank() == 0: dcurr = time.time() - tcurr tcurr = time.time() _print(epoch, n_processed, n_images, "{:.1f} {:.1f} {:.1f} {:.1f} {:.1f}".format( ips, dtrain, dtest, dsample, dcurr), train_results, test_results, msg) # model.polyak_swap() if hvd.rank() == 0: _print("Finished!")
def main(_): """ Builds the model and runs """ if FLAGS.distributed: import horovod.tensorflow as hvd hvd.init() tf.logging.set_verbosity(tf.logging.INFO) if len(config_train.name) > 0: output_dir = os.path.join(FLAGS.output_dir, config_train.name) else: output_dir = FLAGS.output_dir tx.utils.maybe_create_dir(output_dir) ## Loads GPT-2 model configuration if FLAGS.config_type == "json": gpt2_config = model_utils.transform_gpt2_to_texar_config( FLAGS.config_model) elif FLAGS.config_type == 'texar': gpt2_config = importlib.import_module( FLAGS.config_model) else: raise ValueError('Unknown config_type.') # Creates a data pre-processor for, e.g., BPE encoding proc = processor.get_encoder(FLAGS.pretrained_model_dir) max_decoding_length = config_train.max_decoding_length assert max_decoding_length <= gpt2_config.position_size, ( "max_decoding_length should not be greater than position_size. " "{}>{}".format(max_decoding_length, gpt2_config.position_size)) ## Loads data # Configures training data shard in distribued mode if FLAGS.distributed: config_train.train_hparam["dataset"]["num_shards"] = hvd.size() config_train.train_hparam["dataset"]["shard_id"] = hvd.rank() config_train.train_hparam["batch_size"] //= hvd.size() datasets = {} #if FLAGS.do_train: train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam) datasets['train'] = train_dataset #if FLAGS.do_eval: dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam) datasets['dev'] = dev_dataset #if FLAGS.do_test: test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam) datasets['test'] = test_dataset iterator = tx.data.FeedableDataIterator(datasets) batch = iterator.get_next() batch_size = tf.shape(batch['x1x4_ids'])[0] ## Builds the GPT-2 model vocab_size = gpt2_config.vocab_size word_embedder = tx.modules.WordEmbedder( vocab_size=vocab_size, hparams=gpt2_config.embed) pos_embedder = tx.modules.PositionEmbedder( position_size=gpt2_config.position_size, hparams=gpt2_config.pos_embed) # Ties output layer with input word embedding output_layer = tf.transpose(word_embedder.embedding, (1, 0)) decoder = tx.modules.TransformerDecoder( vocab_size=vocab_size, output_layer=output_layer, hparams=gpt2_config.decoder) # For training def _get_recon_loss(ids, full_len, prefix_len, mask_prefix=True, do_print=False): ids = ids[:,:tf.reduce_max(full_len)] batch_size__ = tf.shape(ids)[0] seq_len = tf.fill([batch_size__], tf.shape(ids)[1]) pos_embeds = pos_embedder(sequence_length=seq_len) input_embeds = word_embedder(ids) + pos_embeds outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy') max_full_len = tf.reduce_max(full_len) ids = ids[:, :max_full_len] logits = outputs.logits[:, :max_full_len] if mask_prefix: loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy( labels=ids[:, 1:], logits=logits[:, :-1, :], sequence_length=full_len-1, average_across_timesteps=False, sum_over_timesteps=False, average_across_batch=False, sum_over_batch=False) mask_recon = tf.sequence_mask( full_len-1, dtype=tf.float32) mask_recon_prefix = 1 - tf.sequence_mask( prefix_len-1, maxlen=max_full_len-1,#max_decoding_length-1, dtype=tf.float32) mask_recon = mask_recon * mask_recon_prefix if do_print: print_op_1 = tf.print(mask_recon) loss_recon_flat = tx.utils.reduce_with_weights( tensor=loss_recon, weights=mask_recon, average_across_remaining=False, sum_over_remaining=False, average_across_batch=False) print_op_2 = tf.print(loss_recon_flat) with tf.control_dependencies([print_op_1, print_op_2]): loss_recon = 
tx.utils.reduce_with_weights( tensor=loss_recon, weights=mask_recon, average_across_remaining=True, sum_over_remaining=False) return loss_recon, mask_recon, loss_recon_flat else: loss_recon = tx.utils.reduce_with_weights( tensor=loss_recon, weights=mask_recon, average_across_remaining=True, sum_over_remaining=False) else: loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy( labels=ids[:, 1:], logits=logits[:, :-1, :], sequence_length=full_len-1, average_across_timesteps=True, sum_over_timesteps=False, average_across_batch=True, sum_over_batch=False) return loss_recon ## ROC Loss-1: fine-tune loss x1_len = tf.placeholder(tf.int32, shape=[None], name='x1_len') x1x4_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1x4_ids') x1x4_len = tf.placeholder(tf.int32, shape=[None], name='x1x4_len') loss_fine = _get_recon_loss(x1x4_ids, x1x4_len, x1_len) tau = tf.placeholder(tf.float32, shape=[], name='tau') # generate soft yy def _soft_embedding_fn(soft_ids, times): return word_embedder(soft_ids=soft_ids) + pos_embedder(times) end_token = proc.encoder['<|endoftext|>'] if not FLAGS.supervised: loss = config_train.w_fine * loss_fine loss_dict = { 'loss': loss, 'loss_fine': config_train.w_fine * loss_fine, } else: loss = loss_yy loss_dict = { 'loss': loss, 'loss_yy': loss_yy, # dumb 'loss_mask_recon': tf.constant(0), 'loss_bt': tf.constant(0), 'loss_d_xx2': tf.constant(0), 'loss_d_x2': tf.constant(0), 'loss_fine': tf.constant(0), 'loss_xx2': tf.constant(0) } ## Inference def _embedding_fn(ids, times): return word_embedder(ids) + pos_embedder(times) def _infer(context_name): helper = tx.modules.TopKSampleEmbeddingHelper( embedding=_embedding_fn, start_tokens=batch['%s_ids' % context_name][:, 0], end_token=end_token, top_k=FLAGS.top_k, softmax_temperature=FLAGS.temperature) outputs_infer, len_infer = decoder( context=batch['%s_ids' % context_name], context_sequence_length=batch['%s_len' % context_name], max_decoding_length=max_decoding_length, helper=helper) yy_ids = tx.utils.varlength_roll( outputs_infer.sample_id, -batch['%s_len' % context_name]) yy_len = len_infer - batch['%s_len' % context_name] yy_ids = yy_ids[:, :tf.reduce_max(yy_len)] # yy_logits = outputs_infer.logits # # yy_loss = _evaluate_loss_test(yy_logits, target_name, context_name) return yy_ids, yy_len def _evaluate_loss_test(target_name, context_name, bpe_loss=FLAGS.bpe_loss): ids = batch['%s_ids' % target_name] full_len = batch['%s_len' % target_name] ids = ids[:, :tf.reduce_max(full_len)] batch_size__ = tf.shape(ids)[0] seq_len = tf.fill([batch_size__], tf.shape(ids)[1]) pos_embeds = pos_embedder(sequence_length=seq_len) input_embeds = word_embedder(ids) + pos_embeds # greedy output outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy') max_full_len = tf.reduce_max(full_len) logits = outputs.logits[:, :max_full_len] test_loss = tx.losses.sequence_sparse_softmax_cross_entropy( labels=ids[:, 1:], logits=logits[:, :-1, :], sequence_length=full_len - 1, average_across_timesteps=False, sum_over_timesteps=False, # not bpe_loss, # True, average_across_batch=False, sum_over_batch=False) mask_recon = tf.sequence_mask( full_len - 1, dtype=tf.float32) mask_recon_prefix = 1 - tf.sequence_mask( batch['%s_len' % context_name] - 1, maxlen=max_full_len - 1, # max_decoding_length-1, dtype=tf.float32) mask_recon = mask_recon * mask_recon_prefix test_loss = tx.utils.reduce_with_weights( tensor=test_loss, weights=mask_recon, average_across_batch=bpe_loss, average_across_remaining=bpe_loss, sum_over_remaining=not bpe_loss) 
return test_loss # [bs,] ? x4_ids_fine, x4_len_fine = _infer('x1') x4_loss_fine = _evaluate_loss_test('x1x4', 'x1') ## Optimization def _get_beam_ids(context_name): # beam-search predictions = decoder( beam_width=5, length_penalty=config_train.length_penalty, embedding=_embedding_fn, context=batch['%s_ids' % context_name], context_sequence_length=batch['%s_len' % context_name], max_decoding_length=max_decoding_length, end_token=end_token, mode=tf.estimator.ModeKeys.PREDICT) beam_output_ids = tx.utils.varlength_roll(predictions["sample_id"][:, :, 0], -batch['%s_len' % context_name]) return beam_output_ids beam_search_ids = _get_beam_ids('x1') def _get_greedy_story(context_name): greedy_res, greedy_len = decoder( decoding_strategy='infer_greedy', embedding=_embedding_fn, context=batch['%s_ids' % context_name], context_sequence_length=batch['%s_len' % context_name], max_decoding_length=max_decoding_length, end_token=end_token, mode=tf.estimator.ModeKeys.PREDICT) greedy_ids = tx.utils.varlength_roll(greedy_res.sample_id, -batch['%s_len' % context_name]) greedy_ids_len = greedy_len - batch['%s_len' % context_name] greedy_ids = greedy_ids[:, :tf.reduce_max(greedy_ids_len)] return greedy_ids, greedy_ids_len greedy_ids, greedy_len = _get_greedy_story('x1') trainable_variables = tx.utils.collect_trainable_variables( [word_embedder, pos_embedder, decoder]) global_step = tf.Variable(0, trainable=False) opt = tx.core.get_optimizer( global_step=global_step, hparams=config_train.opt) if FLAGS.distributed: opt = hvd.DistributedOptimizer(opt) train_op = tf.contrib.layers.optimize_loss( loss=loss, global_step=global_step, learning_rate=None, optimizer=opt, variables=trainable_variables) ## Train/eval/test routine saver = tf.train.Saver() saver_best = tf.train.Saver(max_to_keep=1) dev_best = { 'loss': 1e8, 'loss_fine': 1e8} def _log_losses(losses, step=None): loss_str = 'loss: %.4f, loss_fine: %.4f' % \ (losses['loss'], losses['loss_fine']) if step is not None: loss_str = 'step: %d, %s' % (step, loss_str) _log(loss_str) def _is_head(): if not FLAGS.distributed: return True else: return hvd.rank() == 0 def _train_epoch(sess, initial=False): """Trains on the training set, and evaluates on the dev set periodically. 
""" iterator.restart_dataset(sess, 'train') while True: try: # (1) Get data and yy sample fetches_data = { 'batch': batch, 'batch_size': batch_size, } feed_dict_data = { iterator.handle: iterator.get_handle(sess, 'train'), tx.global_mode(): tf.estimator.ModeKeys.PREDICT, } rets_data = sess.run(fetches_data, feed_dict_data) # (2) Optimize loss feed_dict = { #x1_ids: rets_data['batch']['x1_ids'], x1_len: rets_data['batch']['x1_len'], x1x4_ids: rets_data['batch']['x1x4_ids'], x1x4_len: rets_data['batch']['x1x4_len'], tau: config_train.tau, tx.global_mode(): tf.estimator.ModeKeys.TRAIN, } fetches = { 'train_op': train_op, 'step': global_step, } fetches.update(loss_dict) rets = sess.run(fetches, feed_dict) step = rets['step'] dis_steps = config_train.display_steps if _is_head() and dis_steps > 0 and step % dis_steps == 0: _log_losses(rets, step) eval_steps = config_train.eval_steps if _is_head() and eval_steps > 0 and step % eval_steps == 0: _dev_epoch(sess) sample_steps = config_train.sample_steps if _is_head() and sample_steps > 0 and step % sample_steps == 0: print('-----------testing-----------------') _test_epoch(sess, step=step) ckpt_steps = config_train.checkpoint_steps if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0: ckpt_fn = os.path.join(output_dir, 'model.ckpt') ckpt_fn = saver.save(sess, ckpt_fn, global_step=step) _log('Checkpoint to {}'.format(ckpt_fn)) except tf.errors.OutOfRangeError: break def _dev_epoch(sess): """Evaluates on the dev set. """ iterator.restart_dataset(sess, 'dev') results = tx.utils.AverageRecorder() nsamples = 0 fetches = {} fetches.update(loss_dict) # i = 0 while True: try: # (1) Get data and yy sample fetches_data = { 'batch': batch, 'batch_size': batch_size, } feed_dict_data = { iterator.handle: iterator.get_handle(sess, 'dev'), tx.global_mode(): tf.estimator.ModeKeys.PREDICT, } rets_data = sess.run(fetches_data, feed_dict_data) # (2) eval loss feed_dict = { #x1_ids: rets_data['batch']['x1_ids'], x1_len: rets_data['batch']['x1_len'], x1x4_ids: rets_data['batch']['x1x4_ids'], x1x4_len: rets_data['batch']['x1x4_len'], tau: config_train.tau, tx.global_mode(): tf.estimator.ModeKeys.PREDICT, } rets = sess.run(fetches, feed_dict) results.add(rets, weight=rets_data['batch_size']) nsamples += rets_data['batch_size'] except tf.errors.OutOfRangeError: break _log_losses(results.avg()) _log('nsamples: %d' % nsamples) avg_loss = results.avg('loss') if FLAGS.do_train and avg_loss < dev_best['loss']: dev_best.update(results.avg()) ckpt_fn = os.path.join(output_dir, 'model_best.ckpt') ckpt_fn = saver_best.save(sess, ckpt_fn) _log('Checkpoint best to {}'.format(ckpt_fn)) def _test_epoch(sess, step=None): """Generates samples on the test set. 
""" iterator.restart_dataset(sess, 'test') _all_inputs = [] _all_samples = [] _all_loss = [] # if FLAGS.finetune and FLAGS.roc: # raise ValueError('Cannot set --finetune and --roc at the same time') if FLAGS.finetune: _log('Generation input: x1') if FLAGS.greedy: fetches = { 'inputs': batch['x1_ids'], 'length': batch['x1_len'], 'samples_length': greedy_len, 'samples': greedy_ids } elif FLAGS.beam: fetches = { 'inputs': batch['x1_ids'], 'length': batch['x1_len'], # 'samples_length': x4_len_fine, 'samples': beam_search_ids } else: fetches = { 'inputs': batch['x1_ids'], 'length': batch['x1_len'], 'samples_length': x4_len_fine, 'samples': x4_ids_fine, 'sample_loss': x4_loss_fine, 'outputs': batch['x1x4_ids'], 'out_length': batch['x1x4_len'] } res_fn_appendix = "x1" while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'test'), tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT, } rets = sess.run(fetches, feed_dict=feed_dict) # ! ---- _inputs = [] for i, l in zip(rets['inputs'], rets['length']): # Delete padding _inputs.append(i[:l].tolist()) _all_inputs.extend(_inputs) _samples = [] if not FLAGS.beam: for s, l in zip(rets['samples'], rets['samples_length']): _samples.append(s[:l].tolist()) else: _samples.extend(h.tolist() for h in rets['samples']) _samples = utils.list_strip_eos(_samples, eos_token=proc.encoder['<|endoftext|>']) _all_samples.extend(_samples) # ----! _loss = [] if not FLAGS.bpe_loss: for los in rets["sample_loss"]: _loss.append(los) else: _loss = [rets["sample_loss"]] _all_loss.extend(_loss) except tf.errors.OutOfRangeError: break # Parse samples and write to file eos_token_id = proc.encoder['<|endoftext|>'] # !---- _all_input_text = [] for i in _all_inputs: if i[0] == eos_token_id: i = i[1:] i_text = proc.decode(i) _all_input_text.append(i_text) _all_input_text = tx.utils.strip_eos(_all_input_text, eos_token='<|endoftext|>') _all_samples_text = [] for j, (i, s) in enumerate(zip(_all_inputs, _all_samples)): s_text = proc.decode(s) s_text = s_text.replace('\n', ' ') # print(s_text) _all_samples_text.append(s_text) if j % 1000 == 0: print("{} stories is process of total {}".format(j, len(_all_inputs))) _all_samples_text = tx.utils.strip_eos(_all_samples_text, eos_token='<|endoftext|>') if step is None: fn = "test_samples_%s_sample_k%d.tsv" % (res_fn_appendix, FLAGS.top_k) else: fn = "test_samples_%s_%d_beam.tsv" % (res_fn_appendix, step) output_file = os.path.join(output_dir, fn) _log('Write samples to {}'.format(output_file)) if not FLAGS.beam: tx.utils.write_paired_text( _all_input_text, _all_samples_text, output_file) with open(output_file[:-4]+".txt", 'w') as f: for item in _all_samples_text: f.write("%s\n" % item.strip(" | ")) else: with open(output_file, 'w') as f: for item in _all_samples_text: f.write("%s\n" % item) # ----! 
if FLAGS.ppl: if not FLAGS.bpe_loss: # load target file target = [i.strip().split() for i in open("emotion_evaluation/baselines/ground-truth/ground_truth_story-processed.txt")] for j, (txt, los) in enumerate(zip(target, _all_loss)): _all_loss[j] = los/len(txt) np.save(os.path.join(output_dir, "test_loss_word.npy"), np.array(_all_loss)) avg_loss = np.mean(np.array(_all_loss)) ppl = np.exp(avg_loss) msg = 'test_loss (per word): %.4f, test_perplexity: %.4f' % \ (avg_loss, ppl ) else: avg_loss = np.mean(np.array(_all_loss)) ppl = np.exp(avg_loss) msg = 'test_loss (bpe): %.4f, test_perplexity: %.4f' % \ (avg_loss, ppl ) _log(msg) # Broadcasts global variables from rank-0 process if FLAGS.distributed: bcast = hvd.broadcast_global_variables(0) session_config = tf.ConfigProto() if FLAGS.distributed: session_config.gpu_options.visible_device_list = str(hvd.local_rank()) with tf.Session(config=session_config) as sess: sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) # smry_writer = tf.summary.FileWriter(FLAGS.output_dir, graph=sess.graph) if FLAGS.distributed: bcast.run() #Restores trained model if specified if FLAGS.checkpoint: _log('Restore from {}'.format(FLAGS.checkpoint)) saver.restore(sess, FLAGS.checkpoint) elif FLAGS.pretrain_checkpoint: _log('Restore from {}'.format(FLAGS.pretrain_checkpoint)) model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint) print("\nFinished loading\n") saver.save(sess, output_dir + '/gpt2_model.ckpt') iterator.initialize_dataset(sess) if FLAGS.do_train: for epoch in range(config_train.max_train_epoch): print("Training epoch {}".format(epoch)) _train_epoch(sess, epoch==0) saver.save(sess, output_dir + '/model.ckpt') if FLAGS.do_eval: _dev_epoch(sess) if FLAGS.do_test: _test_epoch(sess)
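# --- Added sketch: the sharding configured above via num_shards=hvd.size()
# and shard_id=hvd.rank() corresponds to this plain tf.data pattern
# (TF 1.x, Horovod initialized; 'train.tfrecord' and the batch size are
# placeholders):
import tensorflow as tf
import horovod.tensorflow as hvd

dataset = tf.data.TFRecordDataset('train.tfrecord')  # placeholder path
dataset = dataset.shard(hvd.size(), hvd.rank())      # each rank reads 1/size of the records
dataset = dataset.shuffle(10000).repeat().batch(32)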
def do_train(model): batch = args.batch total_batch = batch * hvd.size() if args.fake: data = FakeData([[batch, 224, 224, 3], [batch]], 1000, random=False, dtype=['uint8', 'int32']) data = StagingInput(QueueInput(data)) callbacks = [] steps_per_epoch = 50 else: logger.info("#Tower: {}; Batch size per tower: {}".format( hvd.size(), batch)) zmq_addr = 'ipc://@imagenet-train-b{}'.format(batch) if args.no_zmq_ops: dataflow = RemoteDataZMQ(zmq_addr, hwm=150, bind=False) data = QueueInput(dataflow) else: data = ZMQInput(zmq_addr, 30, bind=False) data = StagingInput(data) steps_per_epoch = int(np.round(1281167 / total_batch)) BASE_LR = 0.1 * (total_batch // 256) """ ImageNet in 1 Hour, Sec 2.1: Linear Scaling Rule: When the minibatch size is multiplied by k, multiply the learning rate by k. """ logger.info("Base LR: {}".format(BASE_LR)) callbacks = [ ModelSaver(max_to_keep=10), EstimatedTimeLeft(), ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR), (35, BASE_LR * 1e-1), (70, BASE_LR * 1e-2), (95, BASE_LR * 1e-3)]) ] """ Feature Denoising, Sec 5: Our models are trained for a total of 110 epochs; we decrease the learning rate by 10× at the 35- th, 70-th, and 95-th epoch """ max_epoch = 110 if BASE_LR > 0.1: callbacks.append( ScheduledHyperParamSetter('learning_rate', [(0, 0.1), (5 * steps_per_epoch, BASE_LR)], interp='linear', step_based=True)) """ ImageNet in 1 Hour, Sec 2.2: we start from a learning rate of η and increment it by a constant amount at each iteration such that it reaches ηˆ = kη after 5 epochs """ if not args.fake: # add distributed evaluation, for various attackers that we care. def add_eval_callback(name, attacker, condition): cb = create_eval_callback( name, model.get_inference_func(attacker), # always eval in the last 2 epochs no matter what lambda epoch_num: condition(epoch_num) or epoch_num > max_epoch - 2) callbacks.append(cb) add_eval_callback('eval-clean', NoOpAttacker(), lambda e: True) add_eval_callback( 'eval-10step', PGDAttacker(10, args.attack_epsilon, args.attack_step_size), lambda e: True) add_eval_callback( 'eval-50step', PGDAttacker(50, args.attack_epsilon, args.attack_step_size), lambda e: e % 20 == 0) add_eval_callback( 'eval-100step', PGDAttacker(100, args.attack_epsilon, args.attack_step_size), lambda e: e % 10 == 0 or e > max_epoch - 5) for k in [20, 30, 40, 60, 70, 80, 90]: add_eval_callback( 'eval-{}step'.format(k), PGDAttacker(k, args.attack_epsilon, args.attack_step_size), lambda e: False) trainer = HorovodTrainer(average=True) trainer.setup_graph(model.get_input_signature(), data, model.build_graph, model.get_optimizer) trainer.train_with_defaults(callbacks=callbacks, steps_per_epoch=steps_per_epoch, session_init=SmartInit(args.load), max_epoch=max_epoch, starting_epoch=args.starting_epoch)
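# --- Added sketch: the effective LR schedule assembled above ("ImageNet in
# 1 Hour"): gradual warmup from 0.1 to BASE_LR over the first 5 epochs when
# BASE_LR > 0.1, then 10x drops at epochs 35/70/95. The original drives this
# through tensorpack's ScheduledHyperParamSetter; this helper is hypothetical.
def lr_at_epoch(epoch, base_lr):
    if base_lr > 0.1 and epoch < 5:
        return 0.1 + (base_lr - 0.1) * epoch / 5.0  # linear warmup
    for boundary, mult in [(95, 1e-3), (70, 1e-2), (35, 1e-1)]:
        if epoch >= boundary:
            return base_lr * mult
    return base_lr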
def test_horovod_size(self): """Test that the size returned by hvd.size() is correct.""" _, true_size = mpi_env_rank_and_size() hvd.init() size = hvd.size() assert true_size == size
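# --- Added sketch: an mpi_env_rank_and_size-style helper, shown for context.
# It reads the rank/size exported by the MPI launcher so the test can compare
# them against Horovod's view. The env var names cover OpenMPI- and PMI-style
# launchers (an assumption; the real helper may check more) and (0, 1) is
# returned outside an MPI run.
import os

def mpi_env_rank_and_size():
    for rank_var, size_var in [('OMPI_COMM_WORLD_RANK', 'OMPI_COMM_WORLD_SIZE'),
                               ('PMI_RANK', 'PMI_SIZE')]:
        rank, size = os.environ.get(rank_var), os.environ.get(size_var)
        if rank is not None and size is not None:
            return int(rank), int(size)
    return 0, 1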
import os import json import time from math import pi from typing import Optional, Tuple import numpy as np import tensorflow as tf from tensorflow.python.keras import backend as K try: import horovod.tensorflow as hvd hvd.init() # hvd.size()/hvd.local_size() raise unless Horovod has been initialized NUM_RANKS = hvd.size() NUM_WORKERS = NUM_RANKS * hvd.local_size() HAS_HOROVOD = True print(f'hvd.size : {hvd.size()}') print(f'hvd.local_size: {hvd.local_size()}') except (ImportError, ModuleNotFoundError): NUM_RANKS = 1 NUM_WORKERS = NUM_RANKS HAS_HOROVOD = False import utils.file_io as io from config import BIN_DIR from lattice.gauge_lattice import GaugeLattice from utils.attr_dict import AttrDict
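# --- Added sketch: with Horovod initialized, hvd.local_size() is the number
# of ranks on this node, so the node count follows from the total size
# (assuming a homogeneous launch with the same number of ranks per node):
def rank_geometry():
    total = hvd.size()
    per_node = hvd.local_size()
    return {'ranks': total, 'ranks_per_node': per_node, 'nodes': total // per_node}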
def main(device, input_path_test, downsampling_fact, downsampling_mode, channels, data_format, label_id, weights, image_dir, checkpoint_dir, output_graph_file, tst_sz, loss_type, model, decoder, fs_type, batch, batchnorm, dtype, scale_factor, predmode): #init horovod comm_rank = 0 comm_local_rank = 0 comm_size = 1 comm_local_size = 1 if horovod: hvd.init() comm_rank = hvd.rank() comm_local_rank = hvd.local_rank() comm_size = hvd.size() #not all horovod versions have that implemented try: comm_local_size = hvd.local_size() except AttributeError: comm_local_size = 1 if comm_rank == 0: print("Using distributed computation with Horovod: {} total ranks".format(comm_size)) #downsampling? recompute image dimensions image_height = image_height_orig // downsampling_fact image_width = image_width_orig // downsampling_fact #session config sess_config = tf.ConfigProto( inter_op_parallelism_threads=2, intra_op_parallelism_threads=33, log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(comm_local_rank) sess_config.gpu_options.force_gpu_compatible = True #get data test_graph = tf.Graph() if comm_rank == 0: print("Loading data...") tst_data = load_data(input_path_test, shuffle=False, max_files=tst_sz, use_horovod=False) if comm_rank == 0: print("Loaded {} test samples".format(tst_data.shape[0])) print("done.") #print some stats if comm_rank == 0: print("Num workers: {}".format(comm_size)) print("Local batch size: {}".format(batch)) if dtype == tf.float32: print("Precision: {}".format("FP32")) else: print("Precision: {}".format("FP16")) print("Decoder: {}".format(decoder)) print("Batch normalization: {}".format(batchnorm)) print("Channels: {}".format(channels)) print("Loss type: {}".format(loss_type)) print("Loss weights: {}".format(weights)) print("Loss scale factor: {}".format(scale_factor)) print("Num test samples: {}".format(tst_data.shape[0])) #compute epochs and stuff: if fs_type == "local": num_samples = tst_data.shape[0] // comm_local_size else: num_samples = tst_data.shape[0] // comm_size with test_graph.as_default(): #create readers tst_reader = h5_input_reader(input_path_test, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, label_id=label_id, read_labels=(not predmode)) #create datasets if fs_type == "local": tst_dataset = create_dataset(tst_reader, tst_data, batch, 1, comm_local_size, comm_local_rank, dtype, shuffle=False) else: tst_dataset = create_dataset(tst_reader, tst_data, batch, 1, comm_size, comm_rank, dtype, shuffle=False) #create iterators handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder") if not predmode: #in evaluation mode, issue data, label, weight and filename iterator = tf.data.Iterator.from_string_handle( handle, (dtype, tf.int32, dtype, tf.string), ((batch, len(channels), image_height_orig, image_width_orig) if data_format == "channels_first" else (batch, image_height_orig, image_width_orig, len(channels)), (batch, image_height_orig, image_width_orig), (batch, image_height_orig, image_width_orig), (batch))) else: #in prediction mode, just issue data and filename iterator = tf.data.Iterator.from_string_handle( handle, (dtype, tf.string), ((batch, len(channels), image_height_orig, image_width_orig) if data_format == "channels_first" else (batch, image_height_orig, image_width_orig, len(channels)), (batch))) next_elem = iterator.get_next() print(next_elem[0].shape, next_elem[1].shape) #if downsampling, do some preprocessing if downsampling_fact 
!= 1: if downsampling_mode == "scale": rand_select = tf.cast(tf.one_hot(tf.random_uniform( (batch, image_height, image_width), minval=0, maxval=downsampling_fact * downsampling_fact, dtype=tf.int32), depth=downsampling_fact * downsampling_fact, axis=-1), dtype=tf.int32) if not predmode: next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \ tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \ [1, downsampling_fact, downsampling_fact, 1], \ [1, downsampling_fact, downsampling_fact, 1], \ [1,1,1,1], 'VALID'), rand_select), axis=-1), \ tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \ next_elem[3]) else: next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \ next_elem[1]) elif downsampling_mode == "center-crop": #some parameters length = 1. / float(downsampling_fact) offset = length / 2. boxes = [[offset, offset, offset + length, offset + length] ] * batch box_ind = list(range(0, batch)) crop_size = [image_height, image_width] #be careful with data order if data_format == "channels_first": next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),) + tuple(next_elem[1:]) # tuples are immutable; rebuild instead of item-assigning #crop if not predmode: next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \ ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \ tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \ next_elem[3]) else: next_elem = (tf.image.crop_and_resize( next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), next_elem[1]) #be careful with data order if data_format == "channels_first": next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]),) + tuple(next_elem[1:]) # rebuild the tuple here as well else: raise ValueError( "Error, downsampling mode {} not supported. 
Supported are [center-crop, scale]".format(downsampling_mode)) #create init handles #tst tst_iterator = tst_dataset.make_initializable_iterator() tst_handle_string = tst_iterator.string_handle() tst_init_op = iterator.make_initializer(tst_dataset) print(next_elem[0].shape, next_elem[1]) #compute the input filter number based on number of channels used num_channels = len(channels) #set up model model = deeplab_v3_plus_generator(num_classes=3, output_stride=8, base_architecture=model, decoder=decoder, batchnorm=batchnorm, pre_trained_model=None, batch_norm_decay=None, data_format=data_format) logit, prediction = model(next_elem[0], True, dtype) #cast the logits to fp32 logit = ensure_type(logit, tf.float32) if not predmode: #set up loss loss = None if loss_type == "weighted": #cast weights to FP32 w_cast = ensure_type(next_elem[2], tf.float32) loss = tf.losses.sparse_softmax_cross_entropy( labels=next_elem[1], logits=logit, weights=w_cast, reduction=tf.losses.Reduction.SUM) if scale_factor != 1.0: loss *= scale_factor elif loss_type == "weighted_mean": #cast weights to FP32 w_cast = ensure_type(next_elem[2], tf.float32) loss = tf.losses.sparse_softmax_cross_entropy( labels=next_elem[1], logits=logit, weights=w_cast, reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS) if scale_factor != 1.0: loss *= scale_factor elif loss_type == "focal": #one-hot-encode labels_one_hot = tf.contrib.layers.one_hot_encoding( next_elem[1], 3) #cast to FP32 labels_one_hot = ensure_type(labels_one_hot, tf.float32) loss = focal_loss(onehot_labels=labels_one_hot, logits=logit, alpha=1., gamma=2.) else: raise ValueError("Error, loss type {} not supported.".format(loss_type)) #set up streaming metrics iou_op, iou_update_op = tf.metrics.mean_iou( labels=next_elem[1], predictions=tf.argmax(prediction, axis=3), num_classes=3, weights=None, metrics_collections=None, updates_collections=None, name="iou_score") iou_reset_op = tf.variables_initializer([ i for i in tf.local_variables() if i.name.startswith('iou_score/') ]) #initializers: init_op = tf.global_variables_initializer() init_local_op = tf.local_variables_initializer() #create image dir if not exists if not os.path.isdir(image_dir): os.makedirs(image_dir) #start session with tf.Session(config=sess_config) as sess: #initialize sess.run([init_op, init_local_op]) #restore from checkpoint: load_model(sess, tf.train.Saver(), checkpoint_dir) #create iterator handles tst_handle = sess.run(tst_handle_string) #init iterators sess.run(tst_init_op, feed_dict={handle: tst_handle}) #remove training nodes if output_graph_file: print( "Storing inference graph to {}.".format(output_graph_file)) inference_graph_def = tf.graph_util.remove_training_nodes( sess.graph_def, protected_nodes=None) #save the inference graph with open(output_graph_file, 'wb') as ogf: ogf.write(inference_graph_def.SerializeToString()) #start inference eval_loss = 0. 
eval_steps = 0 print("Starting evaluation on test set") while True: try: if not predmode: #construct feed dict _, tmp_loss, tst_model_predictions, tst_model_labels, tst_model_filenames = sess.run( [ iou_update_op, loss, prediction, next_elem[1], next_elem[3] ], feed_dict={handle: tst_handle}) else: tst_model_predictions, tst_model_filenames = sess.run( [prediction, next_elem[1]], feed_dict={handle: tst_handle}) #print some images if have_imsave: for i in range(tst_model_predictions.shape[0]): # labels are not fetched in prediction mode suf = '{}_rank{}_{}.png'.format( eval_steps, comm_rank, i) imsave( image_dir + '/test_pred_estep' + suf, np.argmax(tst_model_predictions[i, ...], axis=-1) * 100) if not predmode: imsave(image_dir + '/test_label_estep' + suf, tst_model_labels[i, ...] * 100) imsave( image_dir + '/test_combined_estep' + suf, plot_colormap[ tst_model_labels[i, ...], np.argmax(tst_model_predictions[i, ...], axis=-1)]) else: if not predmode: np.savez( image_dir + '/test_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npz', prediction=np.argmax( tst_model_predictions[...], axis=-1) * 100, label=tst_model_labels[...] * 100, filename=tst_model_filenames) else: np.savez( image_dir + '/test_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npz', prediction=np.argmax( tst_model_predictions[...], axis=-1) * 100, filename=tst_model_filenames) #update loss if not predmode: eval_loss += tmp_loss eval_steps += 1 except tf.errors.OutOfRangeError: eval_steps = np.max([eval_steps, 1]) if not predmode: eval_loss /= eval_steps print("COMPLETED: evaluation loss is {}".format( eval_loss)) iou_score = sess.run(iou_op) print("COMPLETED: evaluation IoU is {}".format( iou_score)) break
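# --- Added sketch: the streaming-metric pattern used in the evaluation above
# (TF 1.x), reduced to its essentials: run the update op per batch, read the
# value op at the end, and reset by re-initializing the metric's local
# variables. Toy values, standalone.
import tensorflow as tf

labels = tf.placeholder(tf.int32, [None])
preds = tf.placeholder(tf.int32, [None])
iou, iou_update = tf.metrics.mean_iou(labels, preds, num_classes=3, name="iou_score")
iou_reset = tf.variables_initializer(
    [v for v in tf.local_variables() if v.name.startswith("iou_score/")])

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(iou_update, feed_dict={labels: [0, 1, 2], preds: [0, 1, 1]})
    print(sess.run(iou))  # IoU accumulated so far
    sess.run(iou_reset)   # start a fresh accumulation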
stddev=1.0 / math.sqrt(embedding_size))) nce_biases = tf.Variable(tf.zeros([vocabulary_size])) # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. loss = tf.reduce_mean( tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size)) # Horovod: adjust learning rate based on number of GPUs. optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. optimizer = hvd.DistributedOptimizer(optimizer) train_op = optimizer.minimize(loss) # Compute the cosine similarity between minibatch examples and all embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup( normalized_embeddings, valid_dataset) similarity = tf.matmul( valid_embeddings, normalized_embeddings, transpose_b=True) # Add variable initializer.
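# --- Added sketch: the word2vec snippet above trails off at "Add variable
# initializer."; in a TF 1.x Horovod script the remaining boilerplate is
# typically variable initialization plus a broadcast of rank 0's initial
# values so all workers start identically (assumed continuation, not the
# original file's code):
init = tf.global_variables_initializer()
bcast = hvd.broadcast_global_variables(0)

with tf.Session() as session:
    session.run(init)
    session.run(bcast)
    # ... run train_op in a loop, with rank 0 handling logging/checkpoints ...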
# initialization. if first_batch: hvd.broadcast_variables(model.variables, root_rank=0) hvd.broadcast_variables(opt.variables(), root_rank=0) def log(s, nl=True): if hvd.rank() != 0: return print(s, end='\n' if nl else '') log('Model: %s' % args.model) log('Batch size: %d' % args.batch_size) device = 'GPU' if args.cuda else 'CPU' log('Number of %ss: %d' % (device, hvd.size())) with tf.device(device): # Warm-up log('Running warmup...') benchmark_step(first_batch=True) timeit.timeit(lambda: benchmark_step(first_batch=False), number=args.num_warmup_batches) # Benchmark log('Running benchmark...') img_secs = [] for x in range(args.num_iters): time = timeit.timeit(lambda: benchmark_step(first_batch=False), number=args.num_batches_per_iter) img_sec = args.batch_size * args.num_batches_per_iter / time
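# --- Added sketch: turning the per-rank img/sec measured above into a
# cluster-wide figure by averaging across ranks (hvd.allreduce averages by
# default) and scaling by the rank count. Under eager execution the result
# can be read directly; in graph mode it would need a session run.
img_sec_mean = np.mean(img_secs)
total_img_sec = hvd.size() * float(hvd.allreduce(tf.constant(img_sec_mean)))
print('Total img/sec on %d %ss: %.1f' % (hvd.size(), device, total_img_sec))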