def main(hps):

    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Download and load dataset.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def get_data(hps, sess):
    if hps.image_size == -1:
        hps.image_size = {'mnist': 32, 'cifar10': 32, 'imagenet-oord': 64,
                          'imagenet': 256, 'celeba': 256, 'lsun_realnvp': 64,
                          'lsun': 256}[hps.problem]
    if hps.n_test == -1:
        hps.n_test = {'mnist': 10000, 'cifar10': 10000, 'imagenet-oord': 50000,
                      'imagenet': 50000, 'celeba': 3000,
                      'lsun_realnvp': 300*hvd.size(),
                      'lsun': 300*hvd.size()}[hps.problem]
    hps.n_y = {'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
               'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1,
               'lsun': 1}[hps.problem]
    if hps.data_dir == "":
        hps.data_dir = {'mnist': None, 'cifar10': None,
                        'imagenet-oord': '/mnt/host/imagenet-oord-tfr',
                        'imagenet': '/mnt/host/imagenet-tfr',
                        'celeba': '/mnt/host/celeba-reshard-tfr',
                        'lsun_realnvp': '/mnt/host/lsun_realnvp',
                        'lsun': '/mnt/host/lsun'}[hps.problem]

    if hps.problem == 'lsun_realnvp':
        hps.rnd_crop = True
    else:
        hps.rnd_crop = False

    if hps.category:
        hps.data_dir += ('/%s' % hps.category)

    # Use anchor_size to rescale batch size based on image_size
    s = hps.anchor_size
    hps.local_batch_train = hps.n_batch_train * \
        s * s // (hps.image_size * hps.image_size)
    hps.local_batch_test = {64: 50, 32: 25, 16: 10, 8: 5, 4: 2, 2: 2, 1: 1}[
        hps.local_batch_train]  # round down to closest divisor of 50
    hps.local_batch_init = hps.n_batch_init * \
        s * s // (hps.image_size * hps.image_size)

    print("Rank {} Batch sizes Train {} Test {} Init {}".format(
        hvd.rank(), hps.local_batch_train, hps.local_batch_test,
        hps.local_batch_init))

    if hps.problem in ['imagenet-oord', 'imagenet', 'celeba', 'lsun_realnvp', 'lsun']:
        hps.direct_iterator = True
        import data_loaders.get_data as v
        train_iterator, test_iterator, data_init = \
            v.get_data(sess, hps.data_dir, hvd.size(), hvd.rank(), hps.pmap,
                       hps.fmap, hps.local_batch_train, hps.local_batch_test,
                       hps.local_batch_init, hps.image_size, hps.rnd_crop)
    elif hps.problem in ['mnist', 'cifar10']:
        hps.direct_iterator = False
        import data_loaders.get_mnist_cifar as v
        train_iterator, test_iterator, data_init = \
            v.get_data(hps.problem, hvd.size(), hvd.rank(), hps.dal,
                       hps.local_batch_train, hps.local_batch_test,
                       hps.local_batch_init, hps.image_size)
    else:
        raise Exception()

    return train_iterator, test_iterator, data_init
def init_backend_engine():
  """
  Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`.
  """
  BackendEngine.select_engine(config=config)
  if BackendEngine.is_theano_selected():
    print("Theano:", describe_theano_version(), file=log.v3)
    import TheanoUtil
    TheanoUtil.monkey_patches()
  elif BackendEngine.is_tensorflow_selected():
    print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
    if get_tensorflow_version_tuple()[0] == 0:
      print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2)
    if os.environ.get("TF_DEVICE"):
      print("Devices: Use %s via TF_DEVICE instead of %s." % (
        os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4)
      config.set("device", os.environ.get("TF_DEVICE"))
    if config.is_true("use_horovod"):
      import socket
      # noinspection PyPackageRequirements,PyUnresolvedReferences
      import horovod.tensorflow as hvd
      from TFUtil import init_horovod
      init_horovod()  # make sure it is initialized
      if "gpu" in config.value("device", "") or os.environ.get("CUDA_VISIBLE_DEVICES", ""):
        # We assume that we want to use a GPU.
        gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault("gpu_options", {})
        assert "visible_device_list" not in gpu_opts
        gpu_opts["visible_device_list"] = str(hvd.local_rank())
        print("Horovod: Hostname %s, pid %i, using GPU %s." % (
          socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3)
      else:
        if hvd.rank() == 0:  # Don't spam in all ranks.
          print("Horovod: Not using GPU.", file=log.v3)
      horovod_reduce_type = config.value("horovod_reduce_type", "")
      if horovod_reduce_type == "":
        horovod_reduce_type = "grad"
        config.set("horovod_reduce_type", horovod_reduce_type)
      else:
        assert horovod_reduce_type in ["grad", "param"], "config option 'horovod_reduce_type' invalid"
      if hvd.rank() == 0:  # Don't spam in all ranks.
        print("Horovod: Reduce type:", horovod_reduce_type, file=log.v3)
    from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
    tf_session_opts = config.typed_value("tf_session_opts", {})
    assert isinstance(tf_session_opts, dict)
    # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
    setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
    # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
    print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
    debug_register_better_repr()
  else:
    raise NotImplementedError
def main(_):
    # Initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.01)

    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0
    # to all other processes. This is necessary to ensure consistent initialization
    # of all workers when training is started with random weights or restored
    # from a checkpoint.
    hooks = [hvd.BroadcastGlobalVariablesHook(0),
             tf.train.StopAtStepHook(last_step=100),
             tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                        every_n_iter=10),
             ]

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Save checkpoints only on worker 0 to prevent other workers from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def get_its(hps):
    # These run for a fixed amount of time. As anchored batch is smaller, we've actually seen fewer examples
    train_its = int(np.ceil(hps.n_train / (hps.n_batch_train * hvd.size())))
    test_its = int(np.ceil(hps.n_test / (hps.n_batch_train * hvd.size())))
    train_epoch = train_its * hps.n_batch_train * hvd.size()

    # Do a full validation run
    if hvd.rank() == 0:
        print(hps.n_test, hps.local_batch_test, hvd.size())
    assert hps.n_test % (hps.local_batch_test * hvd.size()) == 0
    full_test_its = hps.n_test // (hps.local_batch_test * hvd.size())

    if hvd.rank() == 0:
        print("Train epoch size: " + str(train_epoch))
    return train_its, test_its, full_test_its
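# --- Added sketch (not from the original script) ---
# A minimal, self-contained illustration of the iteration arithmetic in
# get_its() above, using hypothetical hyperparameter values; none of these
# numbers come from the original configuration.
import math

n_train, n_test = 50000, 10000            # hypothetical dataset sizes
n_batch_train, local_batch_test = 64, 25  # hypothetical batch sizes
num_workers = 8                           # stands in for hvd.size()

train_its = math.ceil(n_train / (n_batch_train * num_workers))  # 98 steps per epoch
test_its = math.ceil(n_test / (n_batch_train * num_workers))    # 20 steps
train_epoch = train_its * n_batch_train * num_workers           # 50176 examples per "epoch"
assert n_test % (local_batch_test * num_workers) == 0           # 10000 % 200 == 0
full_test_its = n_test // (local_batch_test * num_workers)      # 50 full-test steps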
def add_edge_padding(x, filter_size):
    assert filter_size[0] % 2 == 1
    if filter_size[0] == 1 and filter_size[1] == 1:
        return x
    a = (filter_size[0] - 1) // 2  # vertical padding size
    b = (filter_size[1] - 1) // 2  # horizontal padding size
    if True:
        x = tf.pad(x, [[0, 0], [a, a], [b, b], [0, 0]])
        name = "_".join([str(dim) for dim in [a, b, *int_shape(x)[1:3]]])
        pads = tf.get_collection(name)
        if not pads:
            if hvd.rank() == 0:
                print("Creating pad", name)
            pad = np.zeros([1] + int_shape(x)[1:3] + [1], dtype='float32')
            pad[:, :a, :, 0] = 1.
            pad[:, -a:, :, 0] = 1.
            pad[:, :, :b, 0] = 1.
            pad[:, :, -b:, 0] = 1.
            pad = tf.convert_to_tensor(pad)
            tf.add_to_collection(name, pad)
        else:
            pad = pads[0]
        pad = tf.tile(pad, [tf.shape(x)[0], 1, 1, 1])
        x = tf.concat([x, pad], axis=3)
    else:
        pad = tf.pad(tf.zeros_like(x[:, :, :, :1]) - 1,
                     [[0, 0], [a, a], [b, b], [0, 0]]) + 1
        x = tf.pad(x, [[0, 0], [a, a], [b, b], [0, 0]])
        x = tf.concat([x, pad], axis=3)
    return x
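# --- Added sketch (not from the original script) ---
# Plain-NumPy illustration of the edge-indicator mask that add_edge_padding()
# concatenates as an extra channel, assuming a hypothetical 4x4 feature map
# and a 3x3 filter (so a = b = 1).
import numpy as np

a = b = 1                                  # padding for a 3x3 filter
h, w = 4 + 2 * a, 4 + 2 * b                # 4x4 map zero-padded to 6x6
mask = np.zeros((h, w), dtype='float32')
mask[:a, :] = 1.                           # top border
mask[-a:, :] = 1.                          # bottom border
mask[:, :b] = 1.                           # left border
mask[:, -b:] = 1.                          # right border
print(mask)                                # 1s on the outer ring, 0s in the 4x4 interior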
def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same rank, different dimension
        tf.set_random_seed(1234)
        dims = [17 + rank] * 3
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))

        # Same number of elements, different rank
        tf.set_random_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def test_horovod_broadcast(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            try:
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                self.assertTrue(
                    session.run(tf.reduce_all(tf.equal(
                        tf.cast(root_tensor, tf.int32),
                        tf.cast(broadcasted_tensor, tf.int32)))),
                    "hvd.broadcast produces incorrect broadcasted tensor")
            except Exception:
                import traceback
                traceback.print_exc()
def draw_samples(epoch):
    if hvd.rank() != 0:
        return
    rows = 10 if hps.image_size <= 64 else 4
    cols = rows
    n_batch = rows*cols
    y = np.asarray([_y % hps.n_y for _y in (
        list(range(cols)) * rows)], dtype='int32')

    # temperatures = [0., .25, .5, .626, .75, .875, 1.] #previously
    temperatures = [0., .25, .5, .6, .7, .8, .9, 1.]

    x_samples = []
    x_samples.append(sample_batch(y, [.0]*n_batch))
    x_samples.append(sample_batch(y, [.25]*n_batch))
    x_samples.append(sample_batch(y, [.5]*n_batch))
    x_samples.append(sample_batch(y, [.6]*n_batch))
    x_samples.append(sample_batch(y, [.7]*n_batch))
    x_samples.append(sample_batch(y, [.8]*n_batch))
    x_samples.append(sample_batch(y, [.9] * n_batch))
    x_samples.append(sample_batch(y, [1.]*n_batch))
    # previously: 0, .25, .5, .625, .75, .875, 1.

    for i in range(len(x_samples)):
        x_sample = np.reshape(
            x_samples[i], (n_batch, hps.image_size, hps.image_size, 3))
        graphics.save_raster(x_sample, logdir +
                             'epoch_{}_sample_{}.png'.format(epoch, i))
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(
                dtypes, dims, root_ranks):
            tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)

            grad_ys = tf.ones([5] * dim)
            grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean),
             tf.reduce_max(x_mean), tf.reduce_min(tf.sqrt(x_var)),
             tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)
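# --- Added sketch (not from the original script) ---
# Hypothetical use of print_act_stats(): wrap an activation so that rank 0
# prints summary statistics of the per-channel mean/std whenever the tensor
# is evaluated; on other ranks (or with do_print_act_stats = False) the
# tensor is returned unchanged. Assumes a TF1-style graph; the layer name
# and sizes are made up.
x = tf.placeholder(tf.float32, [None, 784])
h = tf.layers.dense(x, 512, activation=tf.nn.relu, name='fc1')
h = print_act_stats(h, _str='fc1')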
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor = tf.ones([17] * 3, dtype=tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, rank))
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_size = sum(tensor_sizes)
            self.assertEqual(list(gathered_tensor.shape),
                             [expected_size] + [17] * (dim - 1))

            for i in range(size):
                rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                rank_tensor = tf.slice(
                    gathered, [sum(tensor_sizes[:i])] + [0] * (dim - 1),
                    rank_size)
                self.assertEqual(list(rank_tensor.shape), rank_size)
                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2
                self.assertTrue(
                    session.run(tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                    "hvd.allgather produces incorrect gathered tensor")
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu * 2
        self.predictors = [self._build_coco_predictor(k % num_gpu)
                           for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine.
        # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.local_rank(),
                                              num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        # Same rank, different dimension
        dims = [17] * 3
        tensor = tf.ones(dims,
                         dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
def init_by_config(self, config):
  """
  :param Config.Config config:
  """
  logs = config.list('log', [])
  log_verbosity = config.int_list('log_verbosity', [])
  log_format = config.list('log_format', [])
  if config.is_true("use_horovod"):
    # noinspection PyPackageRequirements,PyUnresolvedReferences
    import horovod.tensorflow as hvd
    from TFUtil import init_horovod
    init_horovod()  # make sure it is initialized
    new_logs = []
    for fn in logs:
      fn_prefix, fn_ext = os.path.splitext(fn)
      fn_ext = ".horovod-%i-%i%s" % (hvd.rank(), hvd.size(), fn_ext)
      new_logs.append(fn_prefix + fn_ext)
    logs = new_logs
  self.initialize(logs=logs, verbosity=log_verbosity, formatter=log_format)
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                grad_list.append(g)
            grad_ys = tf.concat(grad_list, axis=0)

            grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            expected = np.ones(
                [tensor_sizes[rank]] + [17] * (dim - 1)
            ) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(itertools.chain(*[fut.result() for fut in futures]))
    else:
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
            with open(output_partial, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for k in range(hvd.local_size()):
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
            with open(output_partial, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(output_partial)

    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = tf.ones([17] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            self.assertEqual(list(gathered_tensor.shape),
                             [17 * size] + [17] * (dim - 1))

            for i in range(size):
                rank_tensor = tf.slice(gathered_tensor,
                                       [i * 17] + [0] * (dim - 1),
                                       [17] + [-1] * (dim - 1))
                self.assertEqual(list(rank_tensor.shape), [17] * dim)
                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2
                self.assertTrue(
                    session.run(tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                    "hvd.allgather produces incorrect gathered tensor")
def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')
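# --- Added sketch (not from the original script) ---
# Minimal usage of the rank-0-only log() helper above, assuming hvd.init()
# has already been called; the messages are made up.
log('Running on %d workers' % hvd.size())  # printed exactly once, by rank 0
log('step 10: ', nl=False)                 # stays on the same line
log('loss = 0.42')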
def main(argv=None): ''' ''' main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = parser_(desc) nranks_per_gpu = args.nranks_per_gpu local_rank = hvd.local_rank() gpu_local_rank = local_rank // nranks_per_gpu print('local_rank, GPU_LOCAL_RANK: {}, {}'.format( local_rank, gpu_local_rank)) # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True # config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.visible_device_list = str(gpu_local_rank) K.set_session(tf.Session(config=config)) # input image dimensions img_rows, img_cols, img_chns = 28, 28, 1 # number of convolutional filters to use filters = 64 # convolution kernel size num_conv = 3 hvdsize = hvd.size() batch_size = 128 # 100 if K.image_data_format() == 'channels_first': original_img_size = (img_chns, img_rows, img_cols) else: original_img_size = (img_rows, img_cols, img_chns) latent_dim = 2 intermediate_dim = 128 epsilon_std = 1.0 epochs = args.epochs # 5 # train the VAE on MNIST digits (x_train, _), (x_test, y_test) = mnist.load_data() x_train = x_train.astype('float32') / 255. x_train = x_train.reshape((x_train.shape[0],) + original_img_size) x_test = x_test.astype('float32') / 255. x_test = x_test.reshape((x_test.shape[0],) + original_img_size) if hvd.rank() == 0: print('x_train.shape:', x_train.shape) train_samples = x_train.shape[0] # steps_per_epoch = train_samples // batch_size // hvdsize speedupopt = args.speedup if speedupopt == SpeedupOpts.imgspersec: steps_per_epoch = train_samples // batch_size else: steps_per_epoch = int(round( float(train_samples) / batch_size / hvdsize + 0.5)) # Create the dataset and its associated one-shot iterator. buffer_size = 10000 dataset = Dataset.from_tensor_slices(x_train) dataset = dataset.repeat() dataset = dataset.shuffle(buffer_size) dataset = dataset.batch(batch_size) iterator = dataset.make_one_shot_iterator() x_train_batch = iterator.get_next() ldict = make_shared_layers_dict( img_chns, img_rows, img_cols, batch_size, filters, num_conv, intermediate_dim, latent_dim, epsilon_std) # ldict is a dictionary that holds all layers. Since these layers are # instantiated once, they are shared amongs vae, encoder, and generator. x = Input(tensor=x_train_batch) vae = make_vae(ldict, x) # : :type vae: Model lr = 0.001 # * hvdsize opt = tf.train.RMSPropOptimizer(lr) # Add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # , use_locking=True) opt = TFOptimizer(opt) # opt = RMSprop(lr) # Add Horovod Distributed Optimizer. # opt = hvd_keras.DistributedOptimizer(opt) # , use_locking=True) vae.compile(optimizer=opt, loss=None) if hvd.rank() == 0: vae.summary() callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)] sess = K.get_session() sess.run(hvd.broadcast_global_variables(0)) # Fit the model using data from the TF data tensors. 
vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=callbacks) if hvd.rank() == 0: x = Input(shape=original_img_size) vae_val = make_vae(ldict, x) vae_val.compile(optimizer=opt, loss=None) loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size) print('\n\nVAE VALIDATION LOSS: {}'.format(loss)) x = Input(shape=original_img_size) z_mean, _ = get_encoded(ldict, x) encoder = Model(x, z_mean) # : :type encoder: Model decoder_input = Input(shape=(latent_dim,)) x_decoded_mean_squash = get_decoded(ldict, decoder_input) generator = Model(decoder_input, x_decoded_mean_squash) # : :type generator: Model # display a 2D plot of the digit classes in the latent space x_test_encoded = encoder.predict(x_test, batch_size=batch_size) plt.figure(figsize=(6, 6)) plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test) plt.colorbar() # plt.show() plt.savefig('vae_scatter.ps') plt.close() # display a 2D manifold of the digits n = 15 # figure with 15x15 digits digit_size = 28 figure = np.zeros((digit_size * n, digit_size * n)) # Linearly spaced coordinates on the unit square were transformed # through the inverse CDF (ppf) of the Gaussian # To produce values of the latent variables z, since the prior of the # latent space is Gaussian grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) grid_y = norm.ppf(np.linspace(0.05, 0.95, n)) for i, yi in enumerate(grid_x): for j, xi in enumerate(grid_y): z_sample = np.array([[xi, yi]]) z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2) x_decoded = generator.predict(z_sample, batch_size=batch_size) digit = x_decoded[0].reshape(digit_size, digit_size) figure[i * digit_size: (i + 1) * digit_size, j * digit_size: (j + 1) * digit_size] = digit plt.figure(figsize=(10, 10)) plt.imshow(figure, cmap='Greys_r') # plt.show() plt.savefig('vae_digit.ps') plt.close() K.clear_session()
def test_horovod_rank(self):
    """Test that the rank returned by hvd.rank() is correct."""
    true_rank, _ = mpi_env_rank_and_size()
    hvd.init()
    rank = hvd.rank()
    self.assertEqual(true_rank, rank)
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.io.gfile.makedirs(FLAGS.output_dir) input_files = [] for input_file_dir in FLAGS.input_files_dir.split(","): input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*"))) if FLAGS.horovod and len(input_files) < hvd.size(): raise ValueError("Input Files must be sharded") if FLAGS.use_fp16 and FLAGS.manual_fp16: raise ValueError( "AMP and Manual Mixed Precision Training are both activated! Error" ) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.rank() == 0: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") # config.gpu_options.per_process_gpu_memory_fraction = 0.7 if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. # When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. 
log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps)) if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.train_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_files = [] for eval_file_dir in FLAGS.eval_files_dir.split(","): eval_files.extend( tf.io.gfile.glob(os.path.join(eval_file_dir, "*"))) eval_input_fn = input_fn_builder( input_files=eval_files, batch_size=FLAGS.eval_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** 
Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for Estimator.""" def metric_fn(per_example_loss, label_ids, logits): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) if task_name == "cola": FN, FN_op = tf.metrics.false_negatives(labels=label_ids, predictions=predictions) FP, FP_op = tf.metrics.false_positives(labels=label_ids, predictions=predictions) TP, TP_op = tf.metrics.true_positives(labels=label_ids, predictions=predictions) TN, TN_op = tf.metrics.true_negatives(labels=label_ids, predictions=predictions) MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))**0.5 MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op, tf.identity(MCC, name="MCC")) return {"MCC": (MCC, MCC_op)} else: accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions) loss = tf.metrics.mean(values=per_example_loss) return { "eval_accuracy": accuracy, "eval_loss": loss, } tf.compat.v1.logging.info("*** Features ***") tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) if not is_training and FLAGS.use_trt: trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, num_labels, use_one_hot_embeddings, init_checkpoint) (total_loss, per_example_loss, logits, probabilities) = tf.import_graph_def( trt_graph, input_map={ 'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids, 'label_ids': label_ids }, return_elements=[ 'loss/cls_loss:0', 'loss/cls_per_example_loss:0', 'loss/cls_logits:0', 'loss/cls_probabilities:0' ], name='') if mode == tf.estimator.ModeKeys.PREDICT: predictions = {"probabilities": probabilities} output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) elif mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = metric_fn(per_example_loss, label_ids, logits) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops) return output_spec (total_loss, per_example_loss, logits, probabilities) = create_model(bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.verbose_logging: tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps) output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = metric_fn(per_example_loss, label_ids, logits) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops) else: output_spec = 
tf.estimator.EstimatorSpec(mode=mode, predictions=probabilities) return output_spec
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train, y_train,
          batch_size=64,
          callbacks=callbacks,
          epochs=5,
          verbose=1 if hvd.rank() == 0 else 0,
          validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
def main(_): hvd.init() # Read/download local dataset. Different copy for each process. mnist = tf.contrib.learn.datasets.mnist.read_data_sets( "mnist_data_{}".format(hvd.rank())) # Name images placeholder to be able to retrieve it from saved meta graph. images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME) dense_dropout_placeholder = tf.placeholder_with_default(1.0, []) labels_placeholder = tf.placeholder(tf.int64, [None]) logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder) # Exporting meta graph right now takes care of removing Horovod specific ops before serving. Graph right now # also does not contain any training specific ops, so it is optimized for serving too. tf.train.export_meta_graph("graph.meta", as_text=True) loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits) accuracy = tf.reduce_mean( tf.cast(tf.equal(predictions, labels_placeholder), tf.float32)) # Define summary ops to save summaries for later use in tensorboard. tf.summary.scalar("accuracy", accuracy) tf.summary.scalar("loss", loss) summary_op = tf.summary.merge_all() # Horovod: adjust learning rate based on number of workers. optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size()) global_step = tf.contrib.framework.get_or_create_global_step() # Wrap standard optimizer in Horovod distributed one. train = hvd.DistributedOptimizer(optimizer).minimize( loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of workers. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Only master saves summaries. if hvd.rank() == 0: hooks += [ # As previously mentioned summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be discovered by # tensorboard. tf.train.SummarySaverHook(save_steps=1, output_dir=os.path.join( EXPERIMENT_OUTPUT_PATH, "tensorboard"), summary_op=summary_op) ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. As previously mentioned # checkpoints are saved to EXPERIMNET_OUTPUT_PATH which makes them accessible by user. checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints") if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as mon_sess: while not mon_sess.should_stop(): images, labels = mnist.train.next_batch(64) _, loss_val, accuracy_val, global_step_val = mon_sess.run( [train, loss, accuracy, global_step], feed_dict={ images_placeholder: images, labels_placeholder: labels, dense_dropout_placeholder: 0.5 }) # Only master publishes metrics. if hvd.rank() == 0: # Publish metrics just like in the single node example. publish({ "loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val) }) # Save servable model only from Horovod master. if hvd.rank() == 0: # Create a new graph to import the previously exported one. with tf.Graph().as_default(): # Import previously saved meta graph. 
restorer = tf.train.import_meta_graph("graph.meta") with tf.Session() as session: checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) restorer.restore(session, checkpoint_file) # Get handlers for images placeholder and scores op with names defined before. images_placeholder = tf.get_default_graph().get_tensor_by_name( INPUT_NAME + ":0") scores = tf.get_default_graph().get_tensor_by_name( SCORES_NAME + ":0") # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user. builder = tf.saved_model.builder.SavedModelBuilder( os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001")) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={ MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info( images_placeholder) }, outputs={ MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores) }, method_name=tf.saved_model.signature_constants. PREDICT_METHOD_NAME)) builder.add_meta_graph_and_variables( session, [tf.saved_model.tag_constants.SERVING], signature_def_map={ MODEL_SIGNATURE_NAME: prediction_signature }, main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save()
def train_ctl(model_func, params): image_width = params['image_width'] image_height = params['image_height'] image_format = params['image_format'] distort_color = params['distort_color'] momentum = params['momentum'] loss_scale = params['loss_scale'] data_dir = params['data_dir'] data_idx_dir = params['data_idx_dir'] batch_size = params['batch_size'] num_iter = params['num_iter'] iter_unit = params['iter_unit'] log_dir = params['log_dir'] export_dir = params['export_dir'] tensorboard_dir = params['tensorboard_dir'] display_every = params['display_every'] precision = params['precision'] dali_mode = params['dali_mode'] use_xla = params['use_xla'] if data_dir is not None: file_format = os.path.join(data_dir, '%s-*') train_files = sorted(tf.io.gfile.glob(file_format % 'train')) valid_files = sorted(tf.io.gfile.glob(file_format % 'validation')) num_train_samples = common.get_num_records(train_files) num_valid_samples = common.get_num_records(valid_files) else: num_train_samples = 1281982 num_valid_samples = 5000 train_idx_files = None valid_idx_files = None if data_idx_dir is not None: file_format = os.path.join(data_idx_dir, '%s-*') train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train')) valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation')) if iter_unit.lower() == 'epoch': num_epochs = num_iter nstep_per_epoch = num_train_samples // (batch_size * hvd.size()) nstep_per_valid = num_valid_samples // (batch_size * hvd.size()) else: assert iter_unit.lower() == 'batch' num_epochs = 1 nstep_per_epoch = min(num_iter, num_train_samples // (batch_size * hvd.size())) nstep_per_valid = min(10, num_valid_samples // (batch_size * hvd.size())) if export_dir: assert os.path.exists(export_dir) save_format = export_dir + "/saved_model_rn50.h5" if use_xla: tf.config.optimizer.set_jit(True) # Horovod: pin GPU to be used to process local rank (one GPU per process) gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') if tensorboard_dir and hvd.rank() == 0: assert os.path.exists(tensorboard_dir) summary_writer = tf.summary.create_file_writer(tensorboard_dir) else: summary_writer = None if precision == 'fp16': policy = keras.mixed_precision.experimental.Policy( 'mixed_float16', loss_scale) keras.mixed_precision.experimental.set_policy(policy) lr_schedule = common.create_piecewise_constant_decay_with_warmup( batch_size=batch_size * hvd.size(), epoch_size=num_train_samples, warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum) backend.set_image_data_format(image_format) dtype = 'float16' if precision == 'fp16' else 'float32' backend.set_floatx(dtype) model = model_func(num_classes=image_processing.NUM_CLASSES, batch_size=batch_size) loss_func = keras.losses.SparseCategoricalCrossentropy() train_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy( k=1, name='train_top1') train_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy( k=5, name='train_top5') val_loss = tf.keras.metrics.Mean(name='val_loss', dtype=tf.float32) val_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='val_top1') val_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='val_top5') if log_dir: # We save check points only when using the real data. 
assert data_dir, "--data_dir cannot be empty when using --log_dir" assert os.path.exists(log_dir) ckpt = tf.train.Checkpoint(epoch=tf.Variable(0), optimizer=opt, net=model) manager = tf.train.CheckpointManager(ckpt, log_dir, max_to_keep=3, checkpoint_name="model-ckpt") @tf.function def train_step(inputs, first_batch): images, labels = inputs with tf.GradientTape() as tape: predictions = model(images, training=True) loss = loss_func(labels, predictions) loss += tf.reduce_sum(model.losses) loss_copy = loss # Scale the losses if precision == 'fp16': loss = loss * tf.cast(loss_scale, loss.dtype) tape = hvd.DistributedGradientTape(tape) old_grads = tape.gradient(loss, model.trainable_variables) # Unscale the grads if precision == 'fp16': loss_scale_reciprocal = 1. / loss_scale grads = [ g * tf.cast(loss_scale_reciprocal, g.dtype) if g is not None else None for g in old_grads ] else: grads = old_grads opt.apply_gradients(zip(grads, model.trainable_variables)) train_top1.update_state(labels, predictions) train_top5.update_state(labels, predictions) if hvd.size() > 1 and first_batch: hvd.broadcast_variables(model.variables, root_rank=0) hvd.broadcast_variables(opt.variables(), root_rank=0) return loss_copy @tf.function def valid_step(inputs): images, labels = inputs predictions = model(images, training=False) loss = loss_func(labels, predictions) val_loss.update_state(loss) val_top1.update_state(labels, predictions) val_top5.update_state(labels, predictions) if data_dir is not None: num_preproc_threads = 4 if dali_mode else 10 train_input = image_processing.image_set( train_files, batch_size, image_height, image_width, training=True, distort_color=distort_color, deterministic=False, num_threads=num_preproc_threads, use_dali=dali_mode, idx_filenames=train_idx_files) valid_input = image_processing.image_set( valid_files, batch_size, image_height, image_width, training=False, distort_color=False, deterministic=False, num_threads=num_preproc_threads, use_dali=dali_mode, idx_filenames=valid_idx_files) else: if dali_mode: raise ValueError("Must provide --data_dir if Dali is enabled") else: train_input = image_processing.fake_image_set( batch_size, image_height, image_width) global_steps = 0 log_steps = display_every try: initial_epoch = 0 if log_dir: ckpt.restore(manager.latest_checkpoint) if manager.latest_checkpoint: if hvd.rank() == 0: print("Restored from {}".format(manager.latest_checkpoint)) initial_epoch = max( int(re.findall(r'\d+', manager.latest_checkpoint)[0]), initial_epoch) else: if hvd.rank() == 0: print("Initializing from scratch.") # Training Loop for epoch in range(num_epochs): if epoch < initial_epoch: continue # on_epoch_begin epoch_start = time.time() total_loss = 0.0 num_batches = 0 train_top1.reset_states() train_top5.reset_states() if not dali_mode: train_iter = iter(train_input) for _ in range(nstep_per_epoch): # on_batch_begin global_steps += 1 if global_steps == 1: start_time = time.time() if global_steps == 1 and hvd.rank() == 0 and summary_writer: tf.summary.trace_on(graph=True, profiler=True) if not dali_mode: x = next(train_iter) else: x = train_input.get_device_minibatches() total_loss += train_step(x, global_steps == 1) if global_steps == 1 and hvd.rank() == 0 and summary_writer: with summary_writer.as_default(): tf.summary.trace_export( name="train_step", step=0, profiler_outdir=tensorboard_dir) # on_batch_end if global_steps % log_steps == 0: timestamp = time.time() elapsed_time = timestamp - start_time examples_per_second = \ (batch_size * hvd.size() * log_steps) / 
elapsed_time if hvd.rank() == 0: print("global_step: %d images_per_sec: %.1f" % (global_steps, examples_per_second)) start_time = timestamp num_batches += 1 train_loss = total_loss / num_batches # on_epoch_end epoch_run_time = time.time() - epoch_start if hvd.rank() == 0: print("epoch: %d time_taken: %.1f" % (epoch, epoch_run_time)) if data_dir is not None: val_loss.reset_states() val_top1.reset_states() val_top5.reset_states() if not dali_mode: test_iter = iter(valid_input) for _ in range(nstep_per_valid): if not dali_mode: x = next(test_iter) else: x = valid_input.get_device_minibatches() valid_step(x) if log_dir: ckpt.epoch.assign_add(1) if hvd.rank() == 0: save_path = manager.save() print("Saved checkpoint for epoch {}: {}".format( int(ckpt.epoch), save_path)) if hvd.rank() == 0: output_str = ( "loss: {} - top1: {} - top5: {} - val_loss: {} - " "val_top1: {} - val_top5: {}") print( output_str.format(train_loss, train_top1.result(), train_top5.result(), val_loss.result(), val_top1.result(), val_top5.result())) if hvd.rank() == 0 and summary_writer: with summary_writer.as_default(): tf.summary.scalar('train_loss', train_loss, global_steps) tf.summary.scalar('train_top1', train_top1.result(), global_steps) tf.summary.scalar('train_top5', train_top5.result(), global_steps) tf.summary.scalar('val_loss', val_loss.result(), global_steps) tf.summary.scalar('val_top1', val_top1.result(), global_steps) tf.summary.scalar('val_top5', val_top5.result(), global_steps) if hvd.rank() == 0 and summary_writer: summary_writer.close() except KeyboardInterrupt: print("Keyboard interrupt") if export_dir and hvd.rank() == 0: model.save(save_format) print(f"The model is saved to {save_format}")
def test_hvd():
    import horovod.tensorflow as hvd
    hvd.init()
    print('rank', hvd.rank(), 'local', hvd.local_rank(), 'size', hvd.size())
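# --- Added note (not from the original script) ---
# These snippets are meant to be started once per worker by an MPI-style
# launcher (for example `horovodrun -np 4 python this_script.py`); a single
# unlaunched process simply sees size() == 1. A hypothetical entry point:
if __name__ == '__main__':
    test_hvd()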
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=100),

        tf.train.ProfilerHook(save_steps=1000, output_dir="./phook",
                              show_memory=True)
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) if FLAGS.horovod: hvd.init() if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" processors = {'consensus': ConsensusProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) tf.io.gfile.makedirs(FLAGS.output_dir) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: global_batch_size = FLAGS.train_batch_size * hvd.size() master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) estimator = 
tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: filed_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) tf.compat.v1.logging.info(" Num of labels = %d", len(label_list)) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. 
eval_steps = None eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict and master_process: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.compat.v1.logging.info("***** Running prediction*****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] eval_start_time = time.time() output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.io.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.compat.v1.logging.info("***** Predict results *****") for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks, yield_single_examples=True): probabilities = prediction["probabilities"] output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time time_list = eval_hooks[-1].time_list time_list.sort() num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics") tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) 
tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------")
def model_fn(features, labels, mode, params): tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] is_real_example = None if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf.estimator.ModeKeys.TRAIN) (total_loss, per_example_loss, logits, probabilities) = create_model(bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16) output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int64) accuracy = tf.compat.v1.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, } eval_metric_ops = metric_fn(per_example_loss, label_ids, logits, is_real_example) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops) else: output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={"probabilities": probabilities}) return output_spec
logger.warn( "TF<1.6 has a bug which may lead to crash in FasterRCNN if you're unlucky." ) args = parser.parse_args() if args.config: cfg.update_args(args.config) register_coco(cfg.DATA.BASEDIR) # add COCO datasets to the registry register_balloon( cfg.DATA.BASEDIR) # add the demo balloon datasets to the registry # Setup logger ... is_horovod = cfg.TRAINER == 'horovod' if is_horovod: hvd.init() logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size())) if not is_horovod or hvd.rank() == 0: logger.set_logger_dir(args.logdir, 'd') logger.info("Environment Information:\n" + collect_env_info()) finalize_configs(is_training=True) # Compute the training schedule from the number of GPUs ... stepnum = cfg.TRAIN.STEPS_PER_EPOCH # warmup is step based, lr is epoch based init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.) warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)] warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
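The schedule arithmetic above is compact, so a worked example helps; the numbers below are purely illustrative stand-ins for the cfg.TRAIN values, chosen only to show what each line evaluates to:

# Hypothetical config values, not the real defaults.
STEPS_PER_EPOCH = 500
NUM_GPUS = 4
WARMUP = 1000              # warmup length, in steps
WARMUP_INIT_LR = 0.0025
BASE_LR = 0.01

init_lr = WARMUP_INIT_LR * min(8.0 / NUM_GPUS, 1.0)    # 0.0025 (no scaling at or below 8 GPUs)
warmup_schedule = [(0, init_lr), (WARMUP, BASE_LR)]     # step-based ramp from init_lr to BASE_LR
warmup_end_epoch = WARMUP * 1.0 / STEPS_PER_EPOCH       # 2.0 epochs
lr_schedule = [(int(warmup_end_epoch + 0.5), BASE_LR)]  # [(2, 0.01)], epoch-based from here on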
def maybe_download(filename, expected_bytes): """Download a file if not present, and make sure it's the right size.""" if not os.path.exists(filename): filename, _ = urllib.request.urlretrieve(url, filename) statinfo = os.stat(filename) if statinfo.st_size == expected_bytes: print('Found and verified', filename) else: print(statinfo.st_size) raise Exception( 'Failed to verify ' + url + '. Can you get to it with a browser?') return filename filename = maybe_download('text8-%d.zip' % hvd.rank(), 31344016) # Read the data into a list of strings. def read_data(filename): """Extract the first file enclosed in a zip file as a list of words.""" with zipfile.ZipFile(filename) as f: data = tf.compat.as_str(f.read(f.namelist()[0])).split() return data vocabulary = read_data(filename) print('Data size', len(vocabulary)) # Step 2: Build the dictionary and replace rare words with UNK token. vocabulary_size = 50000
def main(args, config): if args.horovod: verbose = hvd.rank() == 0 local_rank = hvd.local_rank() else: verbose = True local_rank = 0 global_batch_size = args.batch_size * hvd.size( ) if args.horovod else args.batch_size timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime()) logdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'runs', timestamp) global_step = 0 tf.reset_default_graph() # ------------------------------------------------------------------------------------------# # DATASET data_path = os.path.join(args.dataset_root, f'{args.image_size}x{args.image_size}/') # retrieve dataset npy_data = NumpyPathDataset(data_path, args.scratch_path, copy_files=local_rank == 0, is_correct_phase=True) dataset = tf.data.Dataset.from_tensor_slices(npy_data.scratch_files) if args.horovod: dataset = dataset.shard(hvd.size(), hvd.rank()) if args.data_format == "NCDHW": current_shape = [ args.batch_size, args.image_channels, args.image_size // 4, args.image_size, args.image_size ] else: current_shape = [ args.batch_size, args.image_size // 4, args.image_size, args.image_size, args.image_channels ] real_image_input = tf.placeholder(shape=current_shape, dtype=tf.float32) # ------------------ NOISE ---------------- rand_batch1 = np.random.rand(*real_image_input.shape) * 0.5 noise_black_patches1 = rand_batch1.copy() # x_input = image_input + tf.random.normal(shape=image_input.shape) * args.noise_strength # x_input = image_input + tf.random.gamma(shape=x_input.shape, alpha=0.05) # x_input = x_input + tf.random.uniform(shape=x_input.shape) * args.noise_strength # x_input = x_input + tf.random.poisson(lam=0.5, shape=x_input.shape) #add box_sampler noise which mimics conebeam noise for i in range(real_image_input.shape[0]): for _ in range(100): arr_slices = uniform_box_sampler(noise_black_patches1, min_width=(1, 1, 1, 3, 3), max_width=(1, 1, 3, 6, 6))[0] noise_black_patches1[arr_slices] = 0 x_input = real_image_input + noise_black_patches1 y = real_image_input # ------------------ NETWORK ---------------- prediction = forward(x_input, args) # ------------------ OPTIM ----------------- if args.loss_fn == "mean_squared_error": loss = tf.losses.mean_squared_error(labels=y, predictions=prediction) else: raise ValueError(f"Unknown loss_fn {args.loss_fn}: choose one of the available args.loss_fn") lr_scaler = hvd.size() if args.horovod else 1 optimizer = tf.train.AdamOptimizer(args.learning_rate * lr_scaler) if args.horovod: optimizer = hvd.DistributedOptimizer(optimizer) train_step = optimizer.minimize(loss) # ------------- SUMMARIES ------------- if args.data_format == "NCDHW": train_input = tf.transpose(x_input[0], (1, 2, 3, 0)) prediction_input = tf.transpose(prediction[0], (1, 2, 3, 0)) real_input = tf.transpose(y[0], (1, 2, 3, 0)) else: train_input = tf.transpose(x_input[0], (0, 1, 2, 3)) prediction_input = tf.transpose(prediction[0], (0, 1, 2, 3)) real_input = tf.transpose(y[0], (0, 1, 2, 3)) prediction_input = tf.clip_by_value(prediction_input, clip_value_min=args.clip_value_min, clip_value_max=args.clip_value_max) #transform images into grid shape = train_input.get_shape().as_list() image_shape = shape[1:3] print(shape) print(image_shape) grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2))) grid_rows = shape[0] // grid_cols grid_shape = [grid_rows, grid_cols] train_input = image_grid(train_input, grid_shape, image_shape=shape[1:3], num_channels=shape[-1]) shape = prediction_input.get_shape().as_list() grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2))) grid_rows = shape[0] // grid_cols grid_shape = [grid_rows, grid_cols] prediction_input = image_grid(prediction_input, grid_shape, image_shape=shape[1:3], num_channels=shape[-1]) shape = real_input.get_shape().as_list() grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2))) grid_rows = shape[0] // grid_cols grid_shape = [grid_rows, grid_cols] real_input = image_grid(real_input, grid_shape, image_shape=shape[1:3], num_channels=shape[-1]) with tf.variable_scope("train_summaries"): train_loss = tf.summary.scalar('train_loss', loss) train_imageNoise = tf.summary.image('train_imageNoise', train_input) train_imageRemake = tf.summary.image('train_imageRemake', prediction_input) train_imageReal = tf.summary.image('train_imageReal', real_input) image_summary_train = tf.summary.merge( [train_loss, train_imageReal, train_imageRemake, train_imageNoise]) with tf.variable_scope("test_summaries"): test_loss = tf.summary.scalar('test_loss', loss) test_imageNoise = tf.summary.image('test_imageNoise', train_input) test_imageRemake = tf.summary.image('test_imageRemake', prediction_input) test_imageReal = tf.summary.image('test_imageReal', real_input) image_summary_test = tf.summary.merge( [test_loss, test_imageNoise, test_imageRemake, test_imageReal]) # -------------- SESSION ------------- with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) # Horovod: broadcast initial variable states from rank 0 so all workers start from the same weights. if args.horovod: sess.run(hvd.broadcast_global_variables(0)) if verbose: writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph, session=sess) #calculate percentage testset and trainingset train_size = int(len(npy_data) * args.train_size) test_size = int(len(npy_data) * (1 - args.train_size) + 1) num_train_steps = train_size // global_batch_size num_test_steps = test_size // global_batch_size for epoch in range(args.epochs): epoch_loss_train = 0 epoch_loss_test = 0 # TRAINING for i in range(num_train_steps): #prepare trainingbatch batch_loc = np.random.randint(num_test_steps, len(npy_data) - args.batch_size) batch_paths = npy_data[batch_loc:batch_loc + args.batch_size] batch = np.stack([np.load(path) for path in batch_paths]) batch = batch[:, np.newaxis, ...].astype(np.float32) / 1024 - 1 if args.data_format == "NDHWC": batch = np.transpose(batch, (0, 2, 3, 4, 1)) _, summary, c = sess.run( [train_step, image_summary_train, loss], feed_dict={real_image_input: batch}) if i % args.logging_interval == 0 and verbose: global_step = (epoch * num_train_steps * global_batch_size) + i * global_batch_size writer.add_summary(summary, global_step) writer.flush() epoch_loss_train += c # TESTING for i in range(num_test_steps): #prepare testbatch batch_loc = np.random.randint(0, num_test_steps - args.batch_size) batch_paths = npy_data[batch_loc:batch_loc + args.batch_size] batch = np.stack([np.load(path) for path in batch_paths]) batch = batch[:, np.newaxis, ...].astype(np.float32) / 1024 - 1 if args.data_format == "NDHWC": batch = np.transpose(batch, (0, 2, 3, 4, 1)) c = sess.run(loss, feed_dict={real_image_input: batch}) if i % args.logging_interval == 0 and verbose: epoch_loss_test += c if verbose: # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='loss_test', simple_value=epoch_loss_test / num_test_steps)]), global_step) test_image_summary = sess.run( image_summary_test, feed_dict={real_image_input: batch}) writer.add_summary(test_image_summary, global_step) writer.flush() if verbose: print(f'Epoch [{epoch}/{args.epochs}]\t' f'Train Loss: {epoch_loss_train / num_train_steps}\t' f'Test Loss: {epoch_loss_test / num_test_steps}\t')
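uniform_box_sampler, used above to create the cone-beam-like dropout noise, is not shown in the snippet; a plausible implementation, assuming it returns a list whose first element is a tuple of slices selecting one random box whose per-axis width lies between min_width and max_width:

def uniform_box_sampler(arr, min_width, max_width):
    # One random box inside arr: per axis, pick a width in
    # [min_width[d], max_width[d]) and a start offset so the box fits.
    box = []
    for d in range(arr.ndim):
        if max_width[d] > min_width[d]:
            width = np.random.randint(min_width[d], max_width[d])
        else:
            width = min_width[d]
        start = np.random.randint(0, arr.shape[d] - width + 1)
        box.append(slice(start, start + width))
    return [tuple(box)]

With this return shape, noise_black_patches1[arr_slices] = 0 zeroes out one small random 3-D patch per call, repeated 100 times per sample in the loop above.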
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(bert_config_file.name) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) config = tf.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. # When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, disable_nsp=FLAGS.disable_nsp, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.report_loss: global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, 1, -1 if not FLAGS.horovod else hvd.rank())) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(_): #liangaws: Print the arguments that SageMaker passes to this Python program. import sys print(sys.argv) #liangaws: initialize Horovod. hvd.init() #------check Arguments------ if FLAGS.dt_dir == "": FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d') #FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir #FLAGS.data_dir = FLAGS.data_dir + FLAGS.dt_dir print('task_type ', FLAGS.task_type) print('model_dir ', FLAGS.model_dir) print('data_dir ', FLAGS.data_dir) print('dt_dir ', FLAGS.dt_dir) print('num_epochs ', FLAGS.num_epochs) print('feature_size ', FLAGS.feature_size) print('field_size ', FLAGS.field_size) print('embedding_size ', FLAGS.embedding_size) print('batch_size ', FLAGS.batch_size) print('deep_layers ', FLAGS.deep_layers) print('dropout ', FLAGS.dropout) print('loss_type ', FLAGS.loss_type) print('optimizer ', FLAGS.optimizer) print('learning_rate ', FLAGS.learning_rate) print('batch_norm_decay ', FLAGS.batch_norm_decay) print('batch_norm ', FLAGS.batch_norm) print('l2_reg ', FLAGS.l2_reg) #------init Envs------ #liangaws: glob.glob collects all training file names under data_dir into a list, which can then be passed directly to TextLineDataset. tr_files = glob.glob("%s/tr*libsvm" % FLAGS.data_dir) random.shuffle(tr_files) print("tr_files:", tr_files) va_files = glob.glob("%s/va*libsvm" % FLAGS.data_dir) print("va_files:", va_files) te_files = glob.glob("%s/te*libsvm" % FLAGS.data_dir) print("te_files:", te_files) if FLAGS.clear_existing_model: try: shutil.rmtree(FLAGS.model_dir) except Exception as e: print(e, "at clear_existing_model") else: print("existing model cleaned at %s" % FLAGS.model_dir) #liangaws: The call that sets up parameter-server style distributed training is commented out here, because this training environment is managed by SageMaker. #set_dist_env() #------build Tasks------ model_params = { "field_size": FLAGS.field_size, "feature_size": FLAGS.feature_size, "embedding_size": FLAGS.embedding_size, "learning_rate": FLAGS.learning_rate, "batch_norm_decay": FLAGS.batch_norm_decay, "l2_reg": FLAGS.l2_reg, "deep_layers": FLAGS.deep_layers, "dropout": FLAGS.dropout } #liangaws: The RunConfig below is commented out and not used for now. """ config = tf.estimator.RunConfig().replace(session_config = tf.ConfigProto(device_count={'GPU':0, 'CPU':FLAGS.num_threads}), log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps) """ #liangaws: Configure the checkpoint interval and the maximum number of checkpoints to keep. #config = tf.estimator.RunConfig().replace(save_checkpoints_secs = 5, # keep_checkpoint_max = 5, #log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps) #liangaws: With Horovod, pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # liangaws: With Horovod, save checkpoints only on worker 0 to prevent other workers from corrupting them. print('current horovod rank is ', hvd.rank()) print('input model dir is ', FLAGS.model_dir) print("host is ", FLAGS.hosts) print('current host is ', FLAGS.current_host) if hvd.rank() == 0: DeepFM = tf.estimator.Estimator( model_fn=model_fn, model_dir=FLAGS.model_dir, params=model_params, config=tf.estimator.RunConfig().replace(session_config=config)) else: DeepFM = tf.estimator.Estimator( model_fn=model_fn, model_dir=None, params=model_params, config=tf.estimator.RunConfig().replace(session_config=config)) # liangaws: With Horovod, BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0 to all other processes. This is necessary to ensure consistent initialization of all workers when training is started with random weights or restored from a checkpoint.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0) #liangaws: To run multiple Horovod worker processes on one machine under SageMaker Pipe mode, multiple channels have to be passed when calling the SageMaker estimator's fit; each worker on the machine needs at least one channel of its own. All current channel names can be read from the SM_CHANNELS environment variable set by SageMaker, and each worker then reads its data from a separate channel. #Note that the order of the channel names differs from the order in which they were written when calling the SageMaker estimator's fit. For example, for the three channels {'training':train_s3, 'training-2':train2_s3, 'evaluation': validate_s3}, SageMaker sets the environment variable to ['evaluation', 'training', 'training-2'], i.e. the last channel 'evaluation' appears first in SM_CHANNELS while the remaining channels keep their original order. channel_names = json.loads(os.environ['SM_CHANNELS']) print("channel name", channel_names) print("first channel", channel_names[0]) print("last channel name", channel_names[-1]) eval_channel = channel_names[0] if FLAGS.task_type == 'train': #liangaws: Add the broadcast hook to the TrainSpec """ train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(tr_files, channel='training', num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size), hooks=[bcast_hook]) eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(va_files, channel='evaluation', num_epochs=1, batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1000, throttle_secs=1200) tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec) """ if FLAGS.pipe_mode == 0: #file mode for _ in range(FLAGS.num_epochs): DeepFM.train(input_fn=lambda: input_fn( tr_files, num_epochs=1, batch_size=FLAGS.batch_size), hooks=[bcast_hook]) if hvd.rank() == 0: #Model evaluation only needs to run on the Horovod master DeepFM.evaluate(input_fn=lambda: input_fn( va_files, num_epochs=1, batch_size=FLAGS.batch_size)) else: #pipe mode #liangaws: With Horovod + Pipe mode, when a worker enters input_fn a second time during training, reading from the same FIFO again with PipeModeDataset causes problems. """ train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(channel=channel_names[1 + hvd.local_rank()], num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size), hooks=[bcast_hook]) eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(channel=eval_channel, num_epochs=1, batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1000, throttle_secs=1200) tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec) """ DeepFM.train(input_fn=lambda: input_fn( channel=channel_names[1 + hvd.local_rank()], num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size), hooks=[bcast_hook]) if hvd.rank() == 0: #Model evaluation only needs to run on the Horovod master DeepFM.evaluate( input_fn=lambda: input_fn(channel=eval_channel, num_epochs=1, batch_size=FLAGS.batch_size)) elif FLAGS.task_type == 'eval': DeepFM.evaluate(input_fn=lambda: input_fn( va_files, num_epochs=1, batch_size=FLAGS.batch_size)) elif FLAGS.task_type == 'infer': preds = DeepFM.predict(input_fn=lambda: input_fn( te_files, num_epochs=1, batch_size=FLAGS.batch_size), predict_keys="prob") with open(FLAGS.data_dir + "/pred.txt", "w") as fo: for prob in preds: fo.write("%f\n" % (prob['prob'])) #liangaws: Changed here so that the model is saved when the task type is either train or export if FLAGS.task_type == 'export' or FLAGS.task_type == 'train': #feature_spec = tf.feature_column.make_parse_example_spec(feature_columns) #feature_spec = { # 'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]), # 'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size]) #} #serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec) feature_spec = { 'feat_ids': tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.field_size], name='feat_ids'), 'feat_vals': tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.field_size], name='feat_vals') } serving_input_receiver_fn =
tf.estimator.export.build_raw_serving_input_receiver_fn( feature_spec) #liangaws: With Horovod: save the model and history only on worker 0 (i.e. the master) if hvd.rank() == 0: DeepFM.export_savedmodel(FLAGS.servable_model_dir, serving_input_receiver_fn)
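The input_fn referenced throughout this script is not part of the snippet. For the libsvm-style records implied by the tr*/va*/te* file names and the feat_ids/feat_vals features, a file-mode sketch could look like the following (parsing details assumed; a pipe-mode variant would build the dataset from a SageMaker PipeModeDataset for the given channel instead of TextLineDataset):

def input_fn(filenames=None, channel=None, num_epochs=1, batch_size=256):
    # Parse lines of the form "label idx:val idx:val ..." into id/value tensors.
    def parse_libsvm(line):
        columns = tf.string_split([line], ' ')
        label = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, label

    dataset = (tf.data.TextLineDataset(filenames)
               .map(parse_libsvm, num_parallel_calls=4)
               .shuffle(buffer_size=10000)
               .repeat(num_epochs)
               .batch(batch_size)
               .prefetch(1))
    return dataset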
def dataset_fn(self, batch_size, training, input_shape, mask_shape, num_threads, use_gpu_prefetch, normalize_data_method, only_defective_images, augment_data, seed=None): super(DAGM2007_Dataset, self).dataset_fn( batch_size=batch_size, training=training, input_shape=input_shape, mask_shape=mask_shape, num_threads=num_threads, use_gpu_prefetch=use_gpu_prefetch, normalize_data_method= normalize_data_method, # [None, "zero_centered", "zero_one"] only_defective_images=only_defective_images, augment_data=augment_data, seed=seed) shuffle_buffer_size = 10000 def decode_csv(line): input_image_name, image_mask_name, label = tf.decode_csv( line, record_defaults=[[""], [""], [0]], field_delim=',') def decode_image(filepath, resize_shape, normalize_data_method): image_content = tf.read_file(filepath) # image = tf.image.decode_image(image_content, channels=resize_shape[-1]) image = tf.image.decode_png(contents=image_content, channels=resize_shape[-1], dtype=tf.uint8) image = tf.image.resize_images( image, size=resize_shape[:2], method=tf.image.ResizeMethod. BILINEAR, # [BILINEAR, NEAREST_NEIGHBOR, BICUBIC, AREA] align_corners=False, preserve_aspect_ratio=True) image.set_shape(resize_shape) image = tf.cast(image, tf.float32) if normalize_data_method == "zero_centered": image = tf.divide(image, 127.5) - 1 elif normalize_data_method == "zero_one": image = tf.divide(image, 255.0) return image input_image = decode_image( filepath=tf.strings.join([image_dir, input_image_name], separator='/'), resize_shape=input_shape, normalize_data_method=normalize_data_method, ) mask_image = tf.cond( tf.equal(image_mask_name, ""), true_fn=lambda: tf.zeros(mask_shape, dtype=tf.float32), false_fn=lambda: decode_image( filepath=tf.strings.join([mask_image_dir, image_mask_name], separator='/'), resize_shape=mask_shape, normalize_data_method="zero_one", ), ) if augment_data: if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0: LOGGER.log("Using data augmentation ...") #input_image = tf.image.per_image_standardization(input_image) horizontal_flip = tf.random_uniform(shape=(), seed=seed) > 0.5 input_image = tf.cond( horizontal_flip, lambda: tf.image.flip_left_right(input_image), lambda: input_image) mask_image = tf.cond( horizontal_flip, lambda: tf.image.flip_left_right(mask_image), lambda: mask_image) n_rots = tf.random_uniform(shape=(), dtype=tf.int32, minval=0, maxval=3, seed=seed) input_image = tf.image.rot90(input_image, k=n_rots) mask_image = tf.image.rot90(mask_image, k=n_rots) label = tf.cast(label, tf.int32) return (input_image, mask_image), label image_dir, csv_file = self._get_data_dirs(training=training) mask_image_dir = os.path.join(image_dir, "Label") dataset = tf.data.TextLineDataset(csv_file) dataset = dataset.skip(1) # Skip CSV Header if only_defective_images: dataset = dataset.filter( lambda line: tf.not_equal(tf.strings.substr(line, -1, 1), "0")) dataset = dataset.cache() if training: dataset = dataset.apply( tf.data.experimental.shuffle_and_repeat( buffer_size=shuffle_buffer_size, seed=seed)) if hvd_utils.is_using_hvd(): dataset = dataset.shard(hvd.size(), hvd.rank()) else: dataset = dataset.repeat() dataset = dataset.apply( tf.data.experimental.map_and_batch( map_func=decode_csv, num_parallel_calls=num_threads, batch_size=batch_size, drop_remainder=True, )) dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE) if use_gpu_prefetch: dataset.apply( tf.data.experimental.prefetch_to_device( device="/gpu:0", buffer_size=batch_size * 8)) return dataset
def main(_): os.environ[ "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if FLAGS.horovod: hvd.init() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.io.gfile.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: tf.compat.v1.logging.info("Multi-GPU training with TF Horovod") tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(task_name=task_name, bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not 
FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: file_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time time_list = eval_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. 
eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)]) num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, num_sentences) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log( step=(), data={"throughput_train": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): dllogging.logger.log(step=(), data={key: float(result[key])}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict and master_process: predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.compat.v1.logging.info("***** Running prediction*****") tf.compat.v1.logging.info(" Num examples = %d", len(predict_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] predict_start_time = time.time() output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.io.gfile.GFile(output_predict_file, "w") as writer: tf.compat.v1.logging.info("***** Predict results *****") for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks, yield_single_examples=False): output_line = "\t".join( str(class_probability) for class_probability in prediction) + "\n" writer.write(output_line) predict_time_elapsed = 
time.time() - predict_start_time predict_time_wo_overhead = predict_hooks[-1].total_time time_list = predict_hooks[-1].time_list time_list.sort() num_sentences = (predict_hooks[-1].count - predict_hooks[-1].skipped) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", predict_time_elapsed, predict_hooks[-1].count * FLAGS.predict_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", predict_time_wo_overhead, (predict_hooks[-1].count - predict_hooks[-1].skipped) * FLAGS.predict_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET") tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------")
def train(sess, model, hps, logdir, visualise): _print(hps) _print('Starting training. Logging to', logdir) _print('epoch n_processed n_images ips dtrain dtest dsample dtot train_results test_results msg') # Train sess.graph.finalize() n_processed = 0 n_images = 0 train_time = 0.0 test_loss_best = 999999 if hvd.rank() == 0: train_logger = ResultLogger(logdir + "train.txt", **hps.__dict__) test_logger = ResultLogger(logdir + "test.txt", **hps.__dict__) tcurr = time.time() for epoch in range(1, hps.epochs): t = time.time() train_results = [] for it in range(hps.train_its): # Set learning rate, linearly annealed from 0 in the first hps.epochs_warmup epochs. lr = hps.lr * min(1., n_processed / (hps.n_train * hps.epochs_warmup)) # Run a training step synchronously. _t = time.time() train_results += [model.train(lr)] if hps.verbose and hvd.rank() == 0: _print(n_processed, time.time()-_t, train_results[-1]) sys.stdout.flush() # Images seen wrt anchor resolution n_processed += hvd.size() * hps.n_batch_train # Actual images seen at current resolution n_images += hvd.size() * hps.local_batch_train train_results = np.mean(np.asarray(train_results), axis=0) dtrain = time.time() - t ips = (hps.train_its * hvd.size() * hps.local_batch_train) / dtrain train_time += dtrain if hvd.rank() == 0: train_logger.log(epoch=epoch, n_processed=n_processed, n_images=n_images, train_time=int( train_time), **process_results(train_results)) if epoch < 10 or (epoch < 50 and epoch % 10 == 0) or epoch % hps.epochs_full_valid == 0: test_results = [] msg = '' t = time.time() # model.polyak_swap() if epoch % hps.epochs_full_valid == 0: # Full validation run for it in range(hps.full_test_its): test_results += [model.test()] test_results = np.mean(np.asarray(test_results), axis=0) if hvd.rank() == 0: test_logger.log(epoch=epoch, n_processed=n_processed, n_images=n_images, **process_results(test_results)) # Save checkpoint if test_results[0] < test_loss_best: test_loss_best = test_results[0] model.save(logdir+"model_best_loss.ckpt") msg += ' *' dtest = time.time() - t # Sample t = time.time() if epoch == 1 or epoch == 10 or epoch % hps.epochs_full_sample == 0: visualise(epoch) dsample = time.time() - t if hvd.rank() == 0: dcurr = time.time() - tcurr tcurr = time.time() _print(epoch, n_processed, n_images, "{:.1f} {:.1f} {:.1f} {:.1f} {:.1f}".format( ips, dtrain, dtest, dsample, dcurr), train_results, test_results, msg) # model.polyak_swap() if hvd.rank() == 0: _print("Finished!")
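The learning-rate line in the loop above anneals lr linearly over the first hps.epochs_warmup epochs' worth of data; a tiny worked example with assumed hyperparameters makes the schedule concrete:

# Illustrative values only; the real ones come from hps.
lr_base = 0.001        # hps.lr
n_train = 50000        # hps.n_train
epochs_warmup = 5      # hps.epochs_warmup
for n_processed in (0, 50000, 125000, 250000, 500000):
    lr = lr_base * min(1.0, n_processed / (n_train * epochs_warmup))
    # -> 0.0, 0.0002, 0.0005, 0.001, 0.001 (capped at lr_base after warmup)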
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-v", "--version", action="version", version="OpenNMT-tf %s" % __version__) parser.add_argument("run", choices=[ "train_and_eval", "train", "eval", "infer", "export", "score" ], help="Run type.") parser.add_argument("--config", required=True, nargs="+", help="List of configuration files.") parser.add_argument("--auto_config", default=False, action="store_true", help="Enable automatic configuration values.") parser.add_argument("--model_type", default="", choices=list( classes_in_module(catalog, public_only=True)), help="Model type from the catalog.") parser.add_argument("--model", default="", help="Custom model configuration file.") parser.add_argument( "--run_dir", default="", help="If set, model_dir will be created relative to this location.") parser.add_argument( "--data_dir", default="", help="If set, data files are expected to be relative to this location." ) parser.add_argument("--features_file", default=[], nargs="+", help="Run inference on this file.") parser.add_argument( "--predictions_file", default="", help= ("File used to save predictions. If not set, predictions are printed " "on the standard output.")) parser.add_argument("--log_prediction_time", default=False, action="store_true", help="Logs some prediction time metrics.") parser.add_argument( "--checkpoint_path", default=None, help=("Checkpoint or directory to use for inference or export " "(when a directory is set, the latest checkpoint is used).")) parser.add_argument("--source_scope", default=None, help=("Checkpoint scope name to restore to model.")) parser.add_argument("--target_scope", default=None, help=("target scope name to restore to model.")) parser.add_argument("--export_dir_base", default=None, help="The base directory of the exported model.") parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use for in-graph replication.") parser.add_argument( "--chief_host", default="", help="hostname:port of the chief worker (for distributed training).") parser.add_argument( "--worker_hosts", default="", help=("Comma-separated list of hostname:port of workers " "(for distributed training).")) parser.add_argument( "--ps_hosts", default="", help=("Comma-separated list of hostname:port of parameter servers " "(for distributed training).")) parser.add_argument( "--task_type", default="chief", choices=["chief", "worker", "ps", "evaluator"], help="Type of the task to run (for distributed training).") parser.add_argument("--task_index", type=int, default=0, help="ID of the task (for distributed training).") parser.add_argument("--horovod", default=False, action="store_true", help="Enable Horovod support for this run.") parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "ERROR", "FATAL", "INFO", "WARN"], help="Logs verbosity.") parser.add_argument("--seed", type=int, default=None, help="Random seed.") parser.add_argument("--gpu_allow_growth", default=False, action="store_true", help="Allocate GPU memory dynamically.") parser.add_argument( "--intra_op_parallelism_threads", type=int, default=0, help=("Number of intra op threads (0 means the system picks " "an appropriate number).")) parser.add_argument( "--inter_op_parallelism_threads", type=int, default=0, help=("Number of inter op threads (0 means the system picks " "an appropriate number).")) parser.add_argument( "--session_config", default=None, help=( "Path to a file containing a tf.ConfigProto message in text 
format " "and used to create the TensorFlow sessions.")) args = parser.parse_args() tf.logging.set_verbosity(getattr(tf.logging, args.log_level)) # Setup cluster if defined. if args.chief_host: if args.run != "train_and_eval": raise ValueError( "Distributed training is only supported with the train_and_eval run type" ) os.environ["TF_CONFIG"] = json.dumps({ "cluster": { "chief": [args.chief_host], "worker": args.worker_hosts.split(","), "ps": args.ps_hosts.split(",") }, "task": { "type": args.task_type, "index": args.task_index } }) # Initialize Horovd if defined. if args.horovod: import horovod.tensorflow as hvd hvd.init() is_chief = hvd.rank() == 0 else: hvd = None is_chief = args.task_type == "chief" # Load and merge run configurations. config = load_config(args.config) if args.run_dir: config["model_dir"] = os.path.join(args.run_dir, config["model_dir"]) if args.data_dir: config["data"] = _prefix_paths(args.data_dir, config["data"]) if is_chief and not tf.gfile.Exists(config["model_dir"]): tf.logging.info("Creating model directory %s", config["model_dir"]) tf.gfile.MakeDirs(config["model_dir"]) model = load_model(config["model_dir"], model_file=args.model, model_name=args.model_type, serialize_model=is_chief) session_config = tf.ConfigProto( intra_op_parallelism_threads=args.intra_op_parallelism_threads, inter_op_parallelism_threads=args.inter_op_parallelism_threads, gpu_options=tf.GPUOptions(allow_growth=args.gpu_allow_growth)) if args.session_config is not None: with open(args.session_config, "rb") as session_config_file: text_format.Merge(session_config_file.read(), session_config) runner = Runner(model, config, seed=args.seed, num_devices=args.num_gpus, session_config=session_config, auto_config=args.auto_config, hvd=hvd) if args.run == "train_and_eval": runner.train_and_evaluate(checkpoint_path=args.checkpoint_path, source_scope=args.source_scope, target_scope=args.target_scope) elif args.run == "train": runner.train(checkpoint_path=args.checkpoint_path, source_scope=args.source_scope, target_scope=args.target_scope) elif args.run == "eval": runner.evaluate(checkpoint_path=args.checkpoint_path) elif args.run == "infer": if not args.features_file: parser.error("--features_file is required for inference.") elif len(args.features_file) == 1: args.features_file = args.features_file[0] runner.infer(args.features_file, predictions_file=args.predictions_file, checkpoint_path=args.checkpoint_path, log_time=args.log_prediction_time) elif args.run == "export": runner.export(checkpoint_path=args.checkpoint_path, export_dir_base=args.export_dir_base) elif args.run == "score": if not args.features_file: parser.error("--features_file is required for scoring.") runner.score(args.features_file, args.predictions_file, checkpoint_path=args.checkpoint_path)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-v", "--version", action="version", version="OpenNMT-tf %s" % __version__) parser.add_argument("--config", required=True, nargs="+", help="List of configuration files.") parser.add_argument( "--auto_config", default=False, action="store_true", help="Enable automatic configuration values.", ) parser.add_argument( "--model_type", default="", choices=list(sorted(catalog.list_model_names_from_catalog())), help="Model type from the catalog.", ) parser.add_argument("--model", default="", help="Custom model configuration file.") parser.add_argument( "--run_dir", default="", help="If set, model_dir will be created relative to this location.", ) parser.add_argument( "--data_dir", default="", help="If set, data files are expected to be relative to this location.", ) parser.add_argument( "--checkpoint_path", default=None, help=("Specific checkpoint or model directory to load " "(when a directory is set, the latest checkpoint is used)."), ) parser.add_argument( "--log_level", default="INFO", choices=["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"], help="Logs verbosity.", ) parser.add_argument("--seed", type=int, default=None, help="Random seed.") parser.add_argument( "--gpu_allow_growth", default=False, action="store_true", help="Allocate GPU memory dynamically.", ) parser.add_argument( "--intra_op_parallelism_threads", type=int, default=0, help=("Number of intra op threads (0 means the system picks " "an appropriate number)."), ) parser.add_argument( "--inter_op_parallelism_threads", type=int, default=0, help=("Number of inter op threads (0 means the system picks " "an appropriate number)."), ) parser.add_argument( "--mixed_precision", default=False, action="store_true", help="Enable mixed precision.", ) parser.add_argument( "--eager_execution", default=False, action="store_true", help="Enable TensorFlow eager execution.", ) subparsers = parser.add_subparsers(help="Run type.", dest="run_type") subparsers.required = True parser_train = subparsers.add_parser("train", help="Training.") parser_train.add_argument( "--with_eval", default=False, action="store_true", help="Enable automatic evaluation.", ) parser_train.add_argument( "--num_gpus", type=int, default=1, help="Number of GPUs to use for in-graph replication.", ) parser_train.add_argument( "--horovod", default=False, action="store_true", help="Enable Horovod training mode.", ) parser_eval = subparsers.add_parser("eval", help="Evaluation.") parser_eval.add_argument("--features_file", nargs="+", default=None, help="Input features files.") parser_eval.add_argument("--labels_file", default=None, help="Output labels files.") parser_infer = subparsers.add_parser("infer", help="Inference.") parser_infer.add_argument("--features_file", nargs="+", required=True, help="Run inference on this file.") parser_infer.add_argument( "--predictions_file", default="", help= ("File used to save predictions. 
If not set, predictions are printed " "on the standard output."), ) parser_infer.add_argument( "--log_prediction_time", default=False, action="store_true", help="Logs some prediction time metrics.", ) parser_export = subparsers.add_parser("export", help="Model export.") parser_export.add_argument( "--output_dir", "--export_dir", required=True, help="The directory of the exported model.", ) parser_export.add_argument( "--format", "--export_format", choices=exporters.list_exporters(), default="saved_model", help="Format of the exported model.", ) parser_score = subparsers.add_parser("score", help="Scoring.") parser_score.add_argument("--features_file", nargs="+", required=True, help="Features file.") parser_score.add_argument("--predictions_file", default=None, help="Predictions to score.") parser_average_checkpoints = subparsers.add_parser( "average_checkpoints", help="Checkpoint averaging.") parser_average_checkpoints.add_argument( "--output_dir", required=True, help="The output directory for the averaged checkpoint.", ) parser_average_checkpoints.add_argument( "--max_count", type=int, default=8, help="The maximal number of checkpoints to average.", ) parser_update_vocab = subparsers.add_parser( "update_vocab", help="Update model vocabularies in checkpoint.") parser_update_vocab.add_argument( "--output_dir", required=True, help="The output directory for the updated checkpoint.", ) parser_update_vocab.add_argument("--src_vocab", default=None, help="Path to the new source vocabulary.") parser_update_vocab.add_argument("--tgt_vocab", default=None, help="Path to the new target vocabulary.") # When using an option that takes multiple values just before the run type, # the run type is treated as a value of this option. To fix this issue, we # inject a placeholder option just before the run type to clearly separate it. parser.add_argument("--placeholder", action="store_true", help=argparse.SUPPRESS) run_types = set(subparsers.choices.keys()) args = sys.argv[1:] for i, arg in enumerate(args): if arg in run_types: args.insert(i, "--placeholder") break args = parser.parse_args(args) if (hasattr(args, "features_file") and args.features_file and len(args.features_file) == 1): args.features_file = args.features_file[0] _initialize_logging(getattr(logging, args.log_level)) tf.config.threading.set_intra_op_parallelism_threads( args.intra_op_parallelism_threads) tf.config.threading.set_inter_op_parallelism_threads( args.inter_op_parallelism_threads) if args.eager_execution: tf.config.run_functions_eagerly(True) gpus = tf.config.list_physical_devices(device_type="GPU") if hasattr(args, "horovod") and args.horovod: import horovod.tensorflow as hvd hvd.init() is_master = hvd.rank() == 0 if gpus: local_gpu = gpus[hvd.local_rank()] tf.config.set_visible_devices(local_gpu, device_type="GPU") gpus = [local_gpu] else: hvd = None is_master = True if args.gpu_allow_growth: for device in gpus: tf.config.experimental.set_memory_growth(device, enable=True) # Load and merge run configurations. 
config = load_config(args.config) if args.run_dir: config["model_dir"] = os.path.join(args.run_dir, config["model_dir"]) if args.data_dir: config["data"] = _prefix_paths(args.data_dir, config["data"]) if is_master and not tf.io.gfile.exists(config["model_dir"]): tf.get_logger().info("Creating model directory %s", config["model_dir"]) tf.io.gfile.makedirs(config["model_dir"]) model = load_model( config["model_dir"], model_file=args.model, model_name=args.model_type, serialize_model=is_master, as_builder=True, ) runner = Runner( model, config, auto_config=args.auto_config, mixed_precision=args.mixed_precision, seed=args.seed, ) if args.run_type == "train": runner.train( num_devices=args.num_gpus, with_eval=args.with_eval, checkpoint_path=args.checkpoint_path, hvd=hvd, ) elif args.run_type == "eval": metrics = runner.evaluate( checkpoint_path=args.checkpoint_path, features_file=args.features_file, labels_file=args.labels_file, ) print(metrics) elif args.run_type == "infer": runner.infer( args.features_file, predictions_file=args.predictions_file, checkpoint_path=args.checkpoint_path, log_time=args.log_prediction_time, ) elif args.run_type == "export": runner.export( args.output_dir, checkpoint_path=args.checkpoint_path, exporter=exporters.make_exporter(args.format), ) elif args.run_type == "score": runner.score( args.features_file, args.predictions_file, checkpoint_path=args.checkpoint_path, ) elif args.run_type == "average_checkpoints": runner.average_checkpoints(args.output_dir, max_count=args.max_count) elif args.run_type == "update_vocab": runner.update_vocab(args.output_dir, src_vocab=args.src_vocab, tgt_vocab=args.tgt_vocab)
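# Hypothetical launch sketch for the CLI defined above (not part of the original file):
# when the `train` sub-command is given --horovod, each launched process calls hvd.init(),
# pins one GPU via hvd.local_rank(), and only rank 0 creates the model directory and
# serializes the model. Assuming the standard OpenNMT-tf entry point name, a launch
# could look like:
#
#   horovodrun -np 4 onmt-main --config data.yml --auto_config train --with_eval --horovod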
def main(): tf.set_random_seed(1234) np.random.seed(4321) # initiate horovod hvd.init() cmdline = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Basic options cmdline.add_argument('--num_batches', default=2, type=int, help="""number of each minibatch.""") cmdline.add_argument('--batch_size', default=None, type=int, help="""Size of each minibatch.""") cmdline.add_argument('--log_frequency', default=None, type=int, help="""Logging frequency.""") cmdline.add_argument('--max_steps', default=None, type=int, help="""Maximum steps.""") cmdline.add_argument('--network_config', default=None, type=str, help="""Neural net architecture.""") cmdline.add_argument('--data_dir', default=None, type=str, help="""Data directory [train/test].""") cmdline.add_argument('--checkpt_dir', default=None, type=str, help="""Checkpoint directory.""") cmdline.add_argument('--input_flags', default=None, type=str, help="""Input json.""") cmdline.add_argument('--hyper_params', default=None, type=str, help="""Hyper parameters.""") cmdline.add_argument('--ilr', default=None, type=float, help="""Initial learning rate ( hyper parameter).""") cmdline.add_argument( '--epochs_per_decay', default=None, type=float, help="""Number of epochs per lr decay ( hyper parameter).""") cmdline.add_argument('--scaling', default=None, type=float, help="""Scaling (hyper parameter).""") cmdline.add_argument('--bn_decay', default=None, type=float, help="""Batch norm decay (hyper parameter).""") cmdline.add_argument('--save_epochs', default=0.5, type=float, help="""Number of epochs to save checkpoint. """) cmdline.add_argument('--mode', default='train', type=str, help="""train or eval (:validates from checkpoint)""") cmdline.add_argument('--cpu_threads', default=10, type=int, help="""cpu threads per rank""") add_bool_argument(cmdline, '--fp16', default=None, help="""Train with half-precision.""") add_bool_argument(cmdline, '--fp32', default=None, help="""Train with single-precision.""") add_bool_argument(cmdline, '--restart', default=None, help="""Restart training from checkpoint.""") add_bool_argument(cmdline, '--nvme', default=None, help="""Copy data to burst buffer.""") FLAGS, unknown_args = cmdline.parse_known_args() if len(unknown_args) > 0: for bad_arg in unknown_args: if hvd.rank() == 0: print('<ERROR> Unknown command line arg: %s' % bad_arg) raise ValueError('Invalid command line arg(s)') # Load input flags if FLAGS.input_flags is not None: params = io_utils.get_dict_from_json(FLAGS.input_flags) params['input_flags'] = FLAGS.input_flags else: params = io_utils.get_dict_from_json('input_flags.json') params['input_flags'] = 'input_flags.json' params['start_time'] = time.time() params['cmdline'] = 'unknown' if FLAGS.batch_size is not None: params['batch_size'] = FLAGS.batch_size if FLAGS.log_frequency is not None: params['log_frequency'] = FLAGS.log_frequency if FLAGS.max_steps is not None: params['max_steps'] = FLAGS.max_steps if FLAGS.network_config is not None: params['network_config'] = FLAGS.network_config if FLAGS.data_dir is not None: params['data_dir'] = FLAGS.data_dir if FLAGS.checkpt_dir is not None: params['checkpt_dir'] = FLAGS.checkpt_dir if FLAGS.hyper_params is not None: params['hyper_params'] = FLAGS.hyper_params if FLAGS.fp16 is not None: params['IMAGE_FP16'] = True if FLAGS.fp32 is not None: params['IMAGE_FP16'] = False if FLAGS.restart is not None: params['restart'] = True if FLAGS.save_epochs is not None: params['epochs_per_saving'] = FLAGS.save_epochs if FLAGS.mode == 'train': 
params['mode'] = 'train' if FLAGS.mode == 'eval': params['mode'] = 'eval' if FLAGS.cpu_threads is not None: params['IO_threads'] = FLAGS.cpu_threads if FLAGS.nvme is not None: params = nvme_staging(params['data_dir'], params, mode=params['mode']) benchmark_io(params, filetype='lmdb', num_batches=FLAGS.num_batches)
import json import os import horovod.tensorflow as hvd if __name__ == '__main__': hvd.init() with open(os.path.join('/opt/ml/model/rank-%s' % hvd.rank()), 'w+') as f: basic_info = {'rank': hvd.rank(), 'size': hvd.size()} json.dump(basic_info, f) print('Saved file "rank-%s": %s' % (hvd.rank(), basic_info))
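# A minimal companion sketch (an assumption, not taken from the original script): the
# inverse of the per-rank files written above is the rank-0-only guard used throughout
# this document for shared artifacts such as checkpoints, so that workers do not
# overwrite each other's files.
import os
import horovod.tensorflow as hvd

hvd.init()
if hvd.rank() == 0:
    os.makedirs('/opt/ml/model', exist_ok=True)
    # save checkpoints / exported models here, on rank 0 only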
def main(argv=None): # Initialize Horovod. hvd.init() # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) KB.set_session(tf.Session(config=config)) # print('LOCAL RANK, OVERAL RANK: {}, {}'.format(hvd.local_rank(), # hvd.rank())) ngpus = hvd.size() main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = _parser(desc) num_devices_tfrecord = 1 height, width = 224, 224 # Image dimensions. Gets resized if not match. distort_color = args.distort_color data_dir = args.datadir batch_size = args.batch_size # * ngpus epochs = args.epochs imgs_per_epoch = args.imgs_per_epoch # Fit the model using data from the TFRecord data tensors. device_minibatches = RecordInputImagenetPreprocessor.device_minibatches images_tfrecord, labels_tfrecord, nrecords = device_minibatches( num_devices_tfrecord, data_dir, batch_size, height, width, distort_color, val=False) images_tfrecord = images_tfrecord[0] labels_tfrecord = labels_tfrecord[0] # CASTING FOR KERAS # labels[device_num] = tf.cast(labels_tfrecord, dtype) nclasses = 1000 labels_tfrecord = tf.one_hot(labels_tfrecord, nclasses) nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords steps_per_epoch = nimgs_to_use // batch_size // hvd.size() # steps_per_epoch = 100 # batch_shape = images_tfrecord.get_shape().as_list() # images = Input(tensor=images_tfrecord, batch_shape=x_batch_shape) images = Input(tensor=images_tfrecord) model = ResNet50(input_tensor=images, weights=None) if hvd.rank() == 0: model.summary() print('Num images: {}'.format(nrecords)) if nimgs_to_use < nrecords: print('Using {} images per epoch'.format(nimgs_to_use)) # print('IMAGES_TFRECORD: {}'.format(images_tfrecord)) # print('LABELS_TFRECORD: {}'.format(labels_tfrecord)) # Add Horovod Distributed Optimizer from nvcnn.py # momentum = 0.9 # lr = 0.1 # learning_rate = tf.train.exponential_decay( # lr, # self.global_step, # decay_steps=FLAGS.lr_decay_epochs * nstep_per_epoch, # decay_rate=FLAGS.lr_decay_rate, # staircase=True) # opt = tf.train.MomentumOptimizer(self.learning_rate, momentum, # use_nesterov=True) # lr = 0.001 * ngpus # opt = tf.train.AdamOptimizer() # opt = hvd.DistributedOptimizer(opt) # , use_locking=True) # opt = KO.TFOptimizer(opt) # Required for tf.train based optimizers opt = KO.Adam() opt = hvd_keras.DistributedOptimizer(opt) model.compile(loss='categorical_crossentropy', optimizer=opt, # metrics=['accuracy'], target_tensors=[labels_tfrecord]) # Broadcast variables from rank 0 to all other processes. KB.get_session().run(hvd.broadcast_global_variables(0)) callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(ngpus * batch_size)] # RecordInput is a yield op which doesn't use queue runners or queues. # Start the queue runners. 
# sess = KB.get_session() # sess.run([tf.local_variables_initializer(), # tf.global_variables_initializer()]) # coord = tf.train.Coordinator() # threads = tf.train.start_queue_runners(sess, coord) start_time = time.time() model.fit( steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=callbacks, verbose=1) # verbose=hvd.rank() == 0) elapsed_time = time.time() - start_time if hvd.rank() == 0: print('[{}] finished in {} s' .format('TRAINING', round(elapsed_time, 3))) # loss = model.evaluate(None, None, steps=steps_per_epoch_val) images_tfrecord_val, labels_tfrecord_val, nrecords_val = \ device_minibatches(num_devices_tfrecord, data_dir, batch_size, height, width, distort_color, val=True) images_tfrecord_val = images_tfrecord_val[0] labels_tfrecord_val = labels_tfrecord_val[0] labels_tfrecord_val = tf.one_hot(labels_tfrecord_val, nclasses) # print('IMAGES_TFRECORD_VAL: {}'.format(images_tfrecord_val)) # print('labels_tfrecord_val: {}'.format(labels_tfrecord_val)) steps_per_epoch_val = nrecords_val // batch_size images_val = Input(tensor=images_tfrecord_val) model_val = model model_val.layers[0] = KL.InputLayer(input_tensor=images_val) model_val.compile( loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'], target_tensors=[labels_tfrecord_val]) # model.summary() loss = model_val.evaluate(x=None, y=None, steps=steps_per_epoch_val) print('\nNum images evaluated, steps: {}, {}'. format(nrecords_val, steps_per_epoch_val)) print('\nTest loss, acc: {}'.format(loss)) # print('\nTest accuracy: {0}'.format(acc)) # Clean up the TF session. # coord.request_stop() # coord.join(threads) KB.clear_session() # do this for Horovod
parser.add_argument('--load', help='Load a model to start training from. It overwrites BACKBONE.WEIGHTS') parser.add_argument('--logdir', help='Log directory. Will remove the old one if already exists.', default='train_log/maskrcnn') parser.add_argument('--config', help="A list of KEY=VALUE to overwrite those defined in config.py", nargs='+') args = parser.parse_args() if args.config: cfg.update_args(args.config) register_coco(cfg.DATA.BASEDIR) # add COCO datasets to the registry register_balloon(cfg.DATA.BASEDIR) # add the demo balloon datasets to the registry # Setup logging ... is_horovod = cfg.TRAINER == 'horovod' if is_horovod: hvd.init() if not is_horovod or hvd.rank() == 0: logger.set_logger_dir(args.logdir, 'd') logger.info("Environment Information:\n" + collect_env_info()) finalize_configs(is_training=True) # Create model MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model() # Compute the training schedule from the number of GPUs ... stepnum = cfg.TRAIN.STEPS_PER_EPOCH # warmup is step based, lr is epoch based init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.) warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)] warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for Estimator.""" if FLAGS.verbose_logging: tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) if not is_training and FLAGS.use_trt: trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, use_one_hot_embeddings, init_checkpoint) (start_logits, end_logits) = tf.import_graph_def( trt_graph, input_map={ 'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids }, return_elements=['unstack:0', 'unstack:1'], name='') predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.TPUEstimatorSpec( mode=mode, predictions=predictions) return output_spec (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.verbose_logging: tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16, FLAGS.num_accumulation_steps) output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) else: raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def input_fn(filenames='', channel='training', batch_size=32, num_epochs=1, perform_shuffle=False): print('Parsing', filenames) def decode_libsvm(line): #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS) #features = dict(zip(CSV_COLUMNS, columns)) #labels = features.pop(LABEL_COLUMN) columns = tf.string_split([line], ' ') labels = tf.string_to_number(columns.values[0], out_type=tf.float32) splits = tf.string_split(columns.values[1:], ':') id_vals = tf.reshape(splits.values, splits.dense_shape) feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1) feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32) feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32) #feat_ids = tf.reshape(feat_ids,shape=[-1,FLAGS.field_size]) #for i in range(splits.dense_shape.eval()[0]): # feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32)) # feat_vals.append(tf.string_to_number(splits.values[2*i+1])) #return tf.reshape(feat_ids,shape=[-1,field_size]), tf.reshape(feat_vals,shape=[-1,field_size]), labels return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels # Extract lines from input files using the Dataset API, can pass one filename or filename list print("pipe mode ", FLAGS.pipe_mode) if FLAGS.pipe_mode == 0: """ dataset = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(500000) # multi-thread pre-process then prefetch # Randomizes input using a window of 256 elements (read into memory) if perform_shuffle: dataset = dataset.shuffle(buffer_size=256) # epochs from blending together. dataset = dataset.repeat(num_epochs) dataset = dataset.batch(batch_size, drop_remainder=True) # Batch size to use """ dataset = tf.data.TextLineDataset(filenames) #liangaws: This assumes SageMaker uses the S3 FullyReplicated distribution, i.e. SageMaker copies each channel's data to every training instance, so we shard here directly by each worker's rank. dataset = dataset.shard(hvd.size(), hvd.rank()) dataset = dataset.map(decode_libsvm, num_parallel_calls=10) dataset = dataset.prefetch( 500000) # multi-thread pre-process then prefetch if perform_shuffle: dataset = dataset.shuffle(buffer_size=256) # epochs from blending together. if num_epochs > 1: dataset = dataset.repeat(num_epochs) dataset = dataset.batch(batch_size, drop_remainder=True) # Batch size to use #return dataset.make_one_shot_iterator() iterator = dataset.make_one_shot_iterator() batch_features, batch_labels = iterator.get_next() #return tf.reshape(batch_ids,shape=[-1,field_size]), tf.reshape(batch_vals,shape=[-1,field_size]), batch_labels return batch_features, batch_labels else: print("-------enter into pipe mode branch!------------") dataset = PipeModeDataset(channel, record_format='TextLine') number_host = len(FLAGS.hosts) #liangaws: Under Horovod + Pipe mode, if each training instance runs multiple workers, every worker needs its own channel, so the dataset in each channel should be pre-split. Sharding the same channel across training instances is only needed when there are multiple instances and each instance runs multiple worker processes. if number_host > 1 and hvd.size() > number_host: #liangaws: In SageMaker's Horovod mode, current_host turns out to be the same on every worker. #index = FLAGS.hosts.index(FLAGS.current_host) index = hvd.rank() // FLAGS.worker_per_host dataset = dataset.shard(number_host, index) if num_epochs > 1: dataset = dataset.repeat(num_epochs) dataset = dataset.prefetch(500000) dataset = dataset.map(decode_libsvm, num_parallel_calls=10) dataset = dataset.batch(batch_size, drop_remainder=True) return dataset
def _print(*args, **kwargs): if hvd.rank() == 0: print(*args, **kwargs)
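# Hypothetical variant of the rank-0 print guard above, applied to the standard logging
# module instead of print(); a sketch that assumes hvd.init() has already been called.
import logging
import horovod.tensorflow as hvd

def get_rank0_logger(name):
    logger = logging.getLogger(name)
    # Non-zero ranks only emit errors, so multi-worker logs stay readable.
    logger.setLevel(logging.INFO if hvd.rank() == 0 else logging.ERROR)
    return logger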
def log_training_step(opts, model, file_writer, x, y, loss, pred, step, metrics, optimizer, steptime, epoch): """ Log to file writer during training""" if hvd.local_rank() == 0 and hvd.rank() == 0: # Make y [batch_size,image_size,image_size,1], prepare for metrics y = tf.argmax(y, axis=-1)[..., None] compute_loss, compute_miou, compute_auc = metrics compute_miou.update_state(y, pred) compute_auc.update_state(y, pred) # Training Prints tf.print('\nEpoch:', epoch, 'Step', step, '/', opts.steps_per_epoch, ': loss', loss, ': miou', compute_miou.result().numpy(), ': auc', compute_auc.result().numpy(), '\n') with file_writer.as_default(): image = tf.cast(255 * x, tf.uint8) mask = tf.cast(255 * y, tf.uint8) summary_predictions = tf.cast(tf.expand_dims(pred * 255, axis=-1), tf.uint8) tf.summary.scalar('Training StepTime', steptime, step=tf.cast(step, tf.int64)) tf.summary.image('Train_image', image, step=tf.cast(step, tf.int64), max_outputs=2) tf.summary.image('Train_mask', mask, step=tf.cast(step, tf.int64), max_outputs=2) tf.summary.image('Train_prediction', summary_predictions, step=tf.cast(step, tf.int64), max_outputs=2) tf.summary.scalar('Training Loss', loss, step=tf.cast(step, tf.int64)) tf.summary.scalar('Training mIoU', compute_miou.result().numpy(), step=tf.cast(step, tf.int64)) tf.summary.scalar('Training AUC', compute_auc.result().numpy(), step=tf.cast(step, tf.int64)) # Logging the optimizer's hyperparameters for key in optimizer._hyper: tf.summary.scalar(key, optimizer._hyper[key].numpy(), step=tf.cast(step, tf.int64)) # Extract weights and filter out empty entries for ASPP layers without weights weights = filter(None, [x.weights for x in model.layers]) for var in weights: tf.summary.histogram('%s' % var[0].name, var[0], step=tf.cast(step, tf.int64)) file_writer.flush() return
def init_backend_engine(): """ Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`. """ BackendEngine.select_engine(config=config) if BackendEngine.is_theano_selected(): print("Theano:", describe_theano_version(), file=log.v3) import TheanoUtil TheanoUtil.monkey_patches() elif BackendEngine.is_tensorflow_selected(): print("TensorFlow:", describe_tensorflow_version(), file=log.v3) if get_tensorflow_version_tuple()[0] == 0: print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2) if os.environ.get("TF_DEVICE"): print("Devices: Use %s via TF_DEVICE instead of %s." % (os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4) config.set("device", os.environ.get("TF_DEVICE")) if config.is_true("use_horovod"): import socket # noinspection PyPackageRequirements,PyUnresolvedReferences import horovod.tensorflow as hvd from TFUtil import init_horovod init_horovod() # make sure it is initialized if "gpu" in config.value("device", "") or os.environ.get( "CUDA_VISIBLE_DEVICES", ""): # We assume that we want to use a GPU. gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault( "gpu_options", {}) assert "visible_device_list" not in gpu_opts gpu_opts["visible_device_list"] = str(hvd.local_rank()) print("Horovod: Hostname %s, pid %i, using GPU %s." % (socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3) else: if hvd.rank() == 0: # Don't spam in all ranks. print("Horovod: Not using GPU.", file=log.v3) horovod_reduce_type = config.value("horovod_reduce_type", "") if horovod_reduce_type == "": horovod_reduce_type = "grad" config.set("horovod_reduce_type", horovod_reduce_type) else: assert horovod_reduce_type in [ "grad", "param" ], "config option 'horovod_reduce_type' invalid" if hvd.rank() == 0: # Don't spam in all ranks. print("Horovod: Reduce type:", horovod_reduce_type, file=log.v3) from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices tf_session_opts = config.typed_value("tf_session_opts", {}) assert isinstance(tf_session_opts, dict) # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch. setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts) # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts. print_available_devices(tf_session_opts=tf_session_opts, file=log.v2) debug_register_better_repr() if config.is_true("distributed_tf"): import TFDistributed TFDistributed.init_distributed_tf(config) else: raise NotImplementedError
def get_model_and_optimizer(opts): """ Load the model and optimizer """ if opts.evaluate: assert opts.model_dir, "WARNING: Please provide --model_dir when --evaluate" if opts.model_dir: print(f'Resuming model from {opts.model_dir}...') model = tf.keras.models.load_model(opts.model_dir) else: model = Deeplabv3(input_shape=(opts.image_size, opts.image_size, 3), classes=2, backbone='xception', opts=opts) if opts.horovod: # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if opts.fp16_allreduce else hvd.Compression.none if opts.optimizer == 'Adam': opt = tf.optimizers.Adam(opts.base_lr * hvd.size(), epsilon=opts.epsilon) elif opts.optimizer == 'SGD': opt = tf.optimizers.SGD(opts.base_lr * hvd.size(), opts.momentum, opts.nesterov) else: raise NotImplementedError( 'Only SGD and Adam are supported for now') # opt = mixed_precision.LossScaleOptimizer(opt, loss_scale='dynamic') # Horovod: add Horovod DistributedOptimizer. # opt = hvd.DistributedOptimizer(opt, backward_passes_per_step=5, op=hvd.Adasum) else: if opts.optimizer == 'Adam': opt = tf.optimizers.Adam(opts.base_lr, epsilon=opts.epsilon) elif opts.optimizer == 'SGD': opt = tf.optimizers.SGD(opts.base_lr, opts.momentum, opts.nesterov) else: raise NotImplementedError( 'Only SGD and Adam are supported for now') compression = None if hvd.rank() == 0: print("Compiling model...") model.layers[0].build(input_shape=(None, opts.image_size, opts.image_size, 3)) # for layer in model.layers[0].layers: # for var in layer.variables: # print(var.name, var.shape, var.device) if hvd.rank() == 0: model.summary() # if opts.model == 'deeplab': # for layer in model.layers: print(layer.name,layer.dtype) # else: # for layer in model.layers[0].layers: print(layer.name,layer.dtype) return model, opt, compression
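# Since hvd.DistributedOptimizer is commented out above, the surrounding training loop
# presumably averages gradients itself. A minimal sketch of that TF2 pattern, assuming
# opts.horovod is set so `model`, `opt` and `compression` come from
# get_model_and_optimizer; `loss_fn`, `x`, `y` and `first_batch` are hypothetical
# stand-ins for the real training-loop inputs.
import tensorflow as tf
import horovod.tensorflow as hvd

loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # assumed loss

@tf.function
def train_step(model, opt, compression, x, y, first_batch):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    # Horovod: allreduce gradients across all ranks before applying them.
    tape = hvd.DistributedGradientTape(tape, compression=compression)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    # Horovod: after the first step, broadcast weights and optimizer state from rank 0
    # so every worker starts from an identical model.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss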
def test(args): import filelock with filelock.FileLock('/tmp/robotstify.lock'): import gym import sys try: import goexplore_py.complex_fetch_env except Exception: print('Could not import complex_fetch_env, is goexplore_py in PYTHONPATH?') import tensorflow as tf import horovod.tensorflow as hvd hvd.init() print('initialized worker %d' % hvd.rank(), flush=True) from baselines.common import set_global_seeds set_global_seeds(hvd.rank()) from baselines import bench from baselines.common import set_global_seeds from atari_reset.wrappers import VecFrameStack, VideoWriter, my_wrapper,\ EpsGreedyEnv, StickyActionEnv, NoopResetEnv, SubprocVecEnv, PreventSlugEnv, FetchSaveEnv, TanhWrap from atari_reset.ppo import learn from atari_reset.policies import CnnPolicy, GRUPolicy, FFPolicy set_global_seeds(hvd.rank()) ncpu = 2 config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) tf.Session(config=config).__enter__() max_noops = 30 if args.noops else 0 print('SAVE PATH', args.save_path) def make_env(rank): def env_fn(): if args.game == 'fetch': assert args.fetch_target_location is not None, 'For now, we require a target location for fetch' kwargs = {} dargs = vars(args) for attr in dargs: if attr.startswith('fetch_'): if attr == 'fetch_type': kwargs['model_file'] = f'teleOp_{args.fetch_type}.xml' elif attr != 'fetch_total_timestep': kwargs[attr[len('fetch_'):]] = dargs[attr] env = goexplore_py.complex_fetch_env.ComplexFetchEnv( **kwargs ) elif args.game == 'fetch_dumb': env = goexplore_py.dumb_fetch_env.ComplexFetchEnv() else: env = gym.make(args.game + 'NoFrameskip-v4') if args.seed_env: env.seed(0) # if args.unlimited_score: # # This removes the TimeLimit wrapper around the env # env = env.env # env = PreventSlugEnv(env) # change for long runs # env._max_episode_steps *= 1000 env = bench.Monitor(env, "{}.monitor.json".format(rank), allow_early_resets=True) if False and rank%nenvs == 0 and hvd.local_rank()==0: os.makedirs(args.save_path + '/vids/' + args.game, exist_ok=True) videofile_prefix = args.save_path + '/vids/' + args.game env = VideoWriter(env, videofile_prefix) if 'fetch' not in args.game: if args.noops: os.makedirs(args.save_path, exist_ok=True) env = NoopResetEnv(env, 30, nenvs, args.save_path, num_per_noop=args.num_per_noop, unlimited_score=args.unlimited_score) env = my_wrapper(env, clip_rewards=True, sticky=args.sticky) if args.epsgreedy: env = EpsGreedyEnv(env) else: os.makedirs(f'{args.save_path}', exist_ok=True) env = FetchSaveEnv(env, rank=rank, n_ranks=nenvs, save_path=f'{args.save_path}/', demo_path=args.demo) env = TanhWrap(env) # def print_rec(e): # print(e.__class__.__name__) # if hasattr(e, 'env'): # print_rec(e.env) # import time # import random # time.sleep(random.random() * 10) # print('\tSHOWING STUFF') # print_rec(env) # print('\n\n\n') return env return env_fn nenvs = args.nenvs env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)]) env = VecFrameStack(env, 1 if 'fetch' in args.game else 4) if 'fetch' in args.game: print('Fetch environment, using the feedforward policy.') args.policy = FFPolicy else: args.policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[args.policy] args.sil_pg_weight_by_value = False args.sil_vf_relu = False args.sil_vf_coef = 0 args.sil_coef = 0 args.sil_ent_coef = 0 args.ent_coef = 0 args.vf_coef = 0 args.cliprange = 1 args.l2_coef = 0 args.adam_epsilon = 1e-8 args.gamma 
= 0.99 args.lam = 0.10 args.scale_rewards = 1.0 args.sil_weight_success_rate = True args.norm_adv = 1.0 args.log_interval = 1 args.save_interval = 100 args.subtract_rew_avg = True args.clip_rewards = False learn(env, args, True)
weight_decay=RUNNING_CONFIG.weight_decay, learning_rate=RUNNING_CONFIG.learning_rate, learning_rate_decay_factor=RUNNING_CONFIG. learning_rate_decay_factor, learning_rate_decay_steps=RUNNING_CONFIG.learning_rate_decay_steps, rmsprop_decay=RUNNING_CONFIG.rmsprop_decay, rmsprop_momentum=RUNNING_CONFIG.rmsprop_momentum, use_auto_loss_scaling=FLAGS.use_auto_loss_scaling, augment_data=RUNNING_CONFIG.augment_data, is_benchmark=RUNNING_CONFIG.exec_mode == 'training_benchmark') if RUNNING_CONFIG.exec_mode in [ "train_and_evaluate", 'evaluate', 'inference_benchmark' ]: if RUNNING_CONFIG.exec_mode == 'inference_benchmark' and hvd_utils.is_using_hvd( ): raise NotImplementedError( "Only single GPU inference is implemented.") elif not hvd_utils.is_using_hvd() or hvd.rank() == 0: runner.evaluate( iter_unit=RUNNING_CONFIG.iter_unit if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else "epoch", num_iter=RUNNING_CONFIG.num_iter if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else 1, warmup_steps=RUNNING_CONFIG.warmup_steps, batch_size=RUNNING_CONFIG.batch_size, is_benchmark=RUNNING_CONFIG.exec_mode == 'inference_benchmark')
batch_inputs, batch_labels = generate_batch( batch_size, num_skips, skip_window) feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels} # We perform one update step by evaluating the optimizer op (including it # in the list of returned values for session.run() _, loss_val = session.run([train_op, loss], feed_dict=feed_dict) average_loss += loss_val if step % 2000 == 0: if step > 0: average_loss /= 2000 # The average loss is an estimate of the loss over the last 2000 batches. print('Average loss at step ', step, ': ', average_loss) run.log("Loss", average_loss) average_loss = 0 final_embeddings = normalized_embeddings.eval() # Evaluate similarity in the end on worker 0. if hvd.rank() == 0: sim = similarity.eval() for i in xrange(valid_size): valid_word = reverse_dictionary[valid_examples[i]] top_k = 8 # number of nearest neighbors nearest = (-sim[i, :]).argsort()[1:top_k + 1] log_str = 'Nearest to %s:' % valid_word for k in xrange(top_k): close_word = reverse_dictionary[nearest[k]] log_str = '%s %s,' % (log_str, close_word) print(log_str)
def main(argv=None): ''' ''' main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = parser_(desc) # Initialize Horovod. hvd.init() logdevp = args.logdevp # For debugging log_device_placement, allow_soft_placement = (True, True) \ if _DEVPROF or logdevp else (False, False) nranks_per_gpu = args.nranks_per_gpu local_rank = hvd.local_rank() gpu_local_rank = local_rank // nranks_per_gpu print('local_rank, GPU_LOCAL_RANK: {}, {}'.format( local_rank, gpu_local_rank)) # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto(log_device_placement=log_device_placement, allow_soft_placement=allow_soft_placement) config.gpu_options.allow_growth = True # config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.visible_device_list = str(gpu_local_rank) KB.set_session(tf.Session(config=config)) hvdsize = hvd.size() checkpt = getattr(args, 'checkpt', None) checkpt_flag = False if checkpt is None else True filepath = checkpt # print('CHECKPT:', checkpt) batch_size = args.batch_size num_classes = 10 epochs = args.epochs data_augmentation = args.aug datadir = getattr(args, 'datadir', None) # The data, shuffled and split between train and test sets: (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \ if datadir is not None else cifar10.load_data() train_samples = x_train.shape[0] test_samples = x_test.shape[0] steps_per_epoch = train_samples // batch_size // hvdsize test_batches = test_samples // batch_size print(train_samples, 'train samples') print(test_samples, 'test samples') x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 # Convert class vectors to binary class matrices. y_train = to_categorical(y_train, num_classes).squeeze() y_test = to_categorical(y_test, num_classes).squeeze() callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)] print(x_train.shape, 'train shape') # with tf.device('/cpu:0'): model = make_model(x_train.shape, num_classes, filepath if checkpt_flag else None) lr = 0.0001 * hvdsize opt = tf.train.RMSPropOptimizer(lr) # Add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # , use_locking=True) opt = TFOptimizer(opt) # Required for tf.train based optimizers # ------------------------------------- HAVE TO GET SESSION AFTER OPTIMIZER # sess = KB.get_session() # ------------------------------------------------------------------------- # Let's train the model using RMSprop model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) if hvd.rank() == 0: model.summary() KB.get_session().run(hvd.broadcast_global_variables(0)) if not data_augmentation: print('Not using data augmentation.') # model.fit(x_train, y_train, # batch_size=batch_size, # epochs=epochs, # validation_data=(x_test, y_test), # shuffle=True, # callbacks=callbacks) train_gen = ImageDataGenerator() test_gen = ImageDataGenerator() # Train the model. The training will randomly sample 1 / N batches of # training data and 3 / N batches of validation data on every worker, # where N is the number of workers. Over-sampling of validation data # helps to increase probability that every validation example will be # evaluated. 
start_time = time.time() model.fit_generator( train_gen.flow(x_train, y_train, batch_size=batch_size), steps_per_epoch=steps_per_epoch, callbacks=callbacks, epochs=epochs, verbose=hvd.rank() == 0, validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), validation_steps=3 * test_batches // hvdsize) else: print('Using real-time data augmentation.') # This will do preprocessing and realtime data augmentation: datagen = ImageDataGenerator( featurewise_center=False, # set input mean to 0 over the dataset samplewise_center=False, # set each sample mean to 0 # divide inputs by std of the dataset featurewise_std_normalization=False, samplewise_std_normalization=False, # divide each input by its std zca_whitening=False, # apply ZCA whitening # randomly rotate images in the range (degrees, 0 to 180) rotation_range=0, # randomly shift images horizontally (fraction of total width) width_shift_range=0.1, # randomly shift images vertically (fraction of total height) height_shift_range=0.1, horizontal_flip=True, # randomly flip images vertical_flip=False) # randomly flip images # Compute quantities required for feature-wise normalization # (std, mean, and principal components if ZCA whitening is applied). datagen.fit(x_train) start_time = time.time() # Fit the model on the batches generated by datagen.flow(). model.fit_generator( datagen.flow(x_train, y_train, batch_size=batch_size), steps_per_epoch=steps_per_epoch, epochs=epochs, validation_data=(x_test, y_test), verbose=hvd.rank() == 0, callbacks=callbacks) if hvd.rank() == 0: elapsed_time = time.time() - start_time print('[{}] finished in {} s' .format('TRAINING', round(elapsed_time, 3))) metrics = model.evaluate(x=x_test, y=y_test, batch_size=batch_size) print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics)) KB.clear_session()
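# A sketch of the callback-based alternative to the manual
# hvd.broadcast_global_variables(0) session call used above; it assumes the
# horovod.keras API (imported here as hvd_keras, matching the alias used earlier
# in this document).
import horovod.keras as hvd_keras

callbacks = [
    # Broadcast initial variable states from rank 0 so all workers start identically.
    hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0),
    # Average metrics across ranks at the end of every epoch.
    hvd_keras.callbacks.MetricAverageCallback(),
]
# Rank-0-only callbacks (checkpointing, timing) would still be appended behind an
# `if hvd.rank() == 0:` guard, as in the snippet above.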
def save_adv(image_orig, image_adv, label, target_label, logits, data_obj): # print("********---------", data_obj.self.step) session = data_obj.trainer.sess # print(hvd.rank(), hvd.local_rank()) output_filename = 'rank-%.5d-%.5d' % (hvd.rank(), data_obj.self.step) output_file = os.path.join(data_obj.save_dir, output_filename) writer = tf.python_io.TFRecordWriter(output_file) data_obj.self.step += 1 count = len(label) out_image_orig = (np.transpose(image_orig, [0, 2, 3, 1]) + 1.0) / IMAGE_SCALE out_image_adv = (np.transpose(image_adv, [0, 2, 3, 1]) + 1.0) / IMAGE_SCALE out_image_orig = np.clip(out_image_orig, 0, 255).round() out_image_adv = np.clip(out_image_adv, 0, 255) out_image_adv_float = out_image_adv out_image_adv_floor = np.floor(out_image_adv) out_image_adv_ceil = np.ceil(out_image_adv) out_image_adv_round = np.round(out_image_adv) diff_data = (out_image_adv_round - out_image_orig) / 255 diff_data = diff_data.reshape([count, -1]) dist_l0 = np.linalg.norm(diff_data, 0, axis=1) dist_l1 = np.linalg.norm(diff_data, 1, axis=1) dist_l2 = np.linalg.norm(diff_data, 2, axis=1) dist_l_inf = np.linalg.norm(diff_data, np.inf, axis=1) # convert image to uint8 type out_image_adv_floor = out_image_adv_floor.astype(np.uint8) out_image_adv_ceil = out_image_adv_ceil.astype(np.uint8) out_image_adv_round = out_image_adv_round.astype(np.uint8) _img_compressed = data_obj.op_img_compressed _img_raw_data = data_obj.op_img_raw_data _image_size = data_obj.image_size # print(np.shape(logits), logits.dtype, type(logits), type(logits[0])) for i in range(0, count): new_feature_map = { "image/orig": _bytes_feature( session.run(_img_compressed, feed_dict={_img_raw_data: out_image_orig[i]})), "image/float": _bytes_feature(out_image_adv_float[i].tobytes()), "image/floor": _bytes_feature( session.run(_img_compressed, feed_dict={_img_raw_data: out_image_adv_floor[i]})), "image/ceil": _bytes_feature( session.run(_img_compressed, feed_dict={_img_raw_data: out_image_adv_ceil[i]})), "image/round": _bytes_feature( session.run(_img_compressed, feed_dict={_img_raw_data: out_image_adv_round[i]})), "image/shape": _int64_feature([_image_size, _image_size, 3]), "diff/l0": _float_feature(dist_l0[i]), "diff/l1": _float_feature(dist_l1[i]), "diff/l2": _float_feature(dist_l2[i]), "diff/l_inf": _float_feature(dist_l_inf[i]), "label/adv": _int64_feature(target_label[i]), "label/orig": _int64_feature(label[i]), "label/pred": _float_feature(logits[i]) } example = tf.train.Example(features=tf.train.Features( feature=new_feature_map)) writer.write(example.SerializeToString()) writer.close() return image_orig, image_adv, label, target_label, logits
type=float, help="percentage of the size of training set, " "eg: 0.9 (90%)") args = parser.parse_args() print('------------------ RUN CONFIRURATION --------------------\n') print('KEY\t\t\tVALUE') for arg in vars(args): print(f'{arg:<20}\t{getattr(args, arg):<40}') print('---------------------------------------------------------\n') assert float(np.log2(args.image_size)) == int(np.log2(args.image_size)) if args.horovod: hvd.init() np.random.seed(args.seed + hvd.rank()) tf.random.set_random_seed(args.seed + hvd.rank()) random.seed(args.seed + hvd.rank()) print(f"Rank {hvd.rank()}:{hvd.local_rank()} reporting!") else: np.random.seed(args.seed) tf.random.set_random_seed(args.seed) random.seed(args.seed) gopts = tf.GraphOptions(place_pruned_graph=True) config = tf.ConfigProto(graph_options=gopts, allow_soft_placement=True) if args.gpu: config.gpu_options.allow_growth = True