def main(hps):
    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Download and load dataset.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.repeat().shuffle(10000).batch(128)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    loss = tf.losses.SparseCategoricalCrossentropy()

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.optimizers.Adam(0.001 * hvd.size())

    checkpoint_dir = './checkpoints'
    checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)

    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            probs = mnist_model(images, training=True)
            loss_value = loss(labels, probs)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.trainable_variables)
        opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if first_batch:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        return loss_value

    # Horovod: adjust number of steps based on number of GPUs.
    for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
        loss_value = training_step(images, labels, batch == 0)

        if batch % 10 == 0 and hvd.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
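# Illustrative addition (not part of the original MNIST example above): the training
# loop only reports the rank-0 training loss. A common companion pattern is to average
# a held-out metric across all Horovod workers with hvd.allreduce before reporting it.
# This is a minimal sketch; `test_images` and `test_labels` are hypothetical per-worker
# evaluation tensors, not something defined in the script above.
def evaluate_allreduce(mnist_model, loss, test_images, test_labels):
    probs = mnist_model(test_images, training=False)
    local_loss = loss(test_labels, probs)
    # hvd.allreduce averages the value across ranks by default, so every worker
    # (and in particular rank 0, which does the logging) sees the global mean.
    global_loss = hvd.allreduce(local_loss)
    if hvd.rank() == 0:
        print('Mean test loss across %d workers: %.6f' % (hvd.size(), global_loss))
    return global_loss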
def main(unused_argv): tf.logging.set_verbosity(tf.logging.INFO) hvd.init() flags.mark_flag_as_required('model_dir') flags.mark_flag_as_required('pipeline_config_path') session_config = tf.ConfigProto() session_config.gpu_options.per_process_gpu_memory_fraction=0.9 session_config.gpu_options.visible_device_list = str(hvd.local_rank()) if FLAGS.amp: session_config.graph_options.rewrite_options.auto_mixed_precision = True if FLAGS.allow_xla: session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 model_dir = FLAGS.model_dir if hvd.rank() == 0 else None config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config) train_and_eval_dict = model_lib.create_estimator_and_inputs( run_config=config, eval_count=FLAGS.eval_count, hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), pipeline_config_path=FLAGS.pipeline_config_path, train_steps=FLAGS.num_train_steps, sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples, sample_1_of_n_eval_on_train_examples=( FLAGS.sample_1_of_n_eval_on_train_examples), use_amp=FLAGS.amp, ) estimator = train_and_eval_dict['estimator'] train_input_fn = train_and_eval_dict['train_input_fn'] eval_input_fns = train_and_eval_dict['eval_input_fns'] eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] predict_input_fn = train_and_eval_dict['predict_input_fn'] train_steps = train_and_eval_dict['train_steps'] if FLAGS.checkpoint_dir: if FLAGS.eval_training_data: name = 'training_data' input_fn = eval_on_train_input_fn else: name = 'validation_data' # The first eval input will be evaluated. input_fn = eval_input_fns[0] if FLAGS.run_once: estimator.evaluate(input_fn, steps=None, checkpoint_path=tf.train.latest_checkpoint( FLAGS.checkpoint_dir)) else: model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn, train_steps, name) else: train_spec, eval_specs = model_lib.create_train_and_eval_specs( train_input_fn, eval_input_fns, eval_on_train_input_fn, predict_input_fn, train_steps, eval_on_train_data=False) train_hooks = [hvd.BroadcastGlobalVariablesHook(0), DLLoggerHook(hvd.size()*train_and_eval_dict['train_batch_size'], hvd.rank())] eval_hooks = [] for x in range(FLAGS.eval_count): estimator.train(train_input_fn, hooks=train_hooks, steps=train_steps // FLAGS.eval_count) if hvd.rank() == 0: eval_input_fn = eval_input_fns[0] results = estimator.evaluate(eval_input_fn, steps=None, hooks=eval_hooks)
def train_retinaface(cfg): # init os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' if cfg['distributed']: import horovod.tensorflow as hvd # Initialize Horovod hvd.init() else: hvd = [] os.environ['CUDA_VISIBLE_DEVICES'] = '0' reset_random_seeds() logger = tf.get_logger() logger.disabled = True logger.setLevel(logging.FATAL) set_memory_growth(hvd) # define network model = RetinaFaceModel(cfg, training=True) model.summary(line_length=80) # define prior box priors = prior_box((cfg['input_size'], cfg['input_size']), cfg['min_sizes'], cfg['steps'], cfg['clip']) # load dataset train_dataset = load_dataset(cfg, priors, 'train', hvd) if cfg['evaluation_during_training']: val_dataset = load_dataset(cfg, priors, 'val', []) # define optimizer if cfg['distributed']: init_lr = cfg['init_lr'] * hvd.size() min_lr = cfg['min_lr'] * hvd.size() steps_per_epoch = cfg['dataset_len'] // (cfg['batch_size'] * hvd.size()) else: init_lr = cfg['init_lr'] min_lr = cfg['min_lr'] steps_per_epoch = cfg['dataset_len'] // cfg['batch_size'] learning_rate = MultiStepWarmUpLR( initial_learning_rate=init_lr, lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']], lr_rate=cfg['lr_rate'], warmup_steps=cfg['warmup_epoch'] * steps_per_epoch, min_lr=min_lr) optimizer = tf.keras.optimizers.SGD( learning_rate=learning_rate, momentum=0.9, nesterov=True) # define losses function multi_box_loss = MultiBoxLoss(num_class=cfg['num_class']) # load checkpoint checkpoint_dir = os.path.join(cfg['output_path'], 'checkpoints', cfg['sub_name']) checkpoint = tf.train.Checkpoint(epoch=tf.Variable(0, name='epoch'), optimizer=optimizer, model=model) manager = tf.train.CheckpointManager(checkpoint=checkpoint, directory=checkpoint_dir, max_to_keep=3) os.makedirs(checkpoint_dir, exist_ok=True) with open(os.path.join(checkpoint_dir, 'cfg.pickle'), 'wb') as handle: pickle.dump(cfg, handle, protocol=pickle.HIGHEST_PROTOCOL) if manager.latest_checkpoint: checkpoint.restore(manager.latest_checkpoint) print('[*] load ckpt from {}'.format(manager.latest_checkpoint)) else: print("[*] training from scratch.") # define training step function @tf.function def train_step(inputs, labels, first_batch, epoch): with tf.GradientTape() as tape: predictions = model(inputs, training=True) losses = {} losses['reg'] = tf.reduce_sum(model.losses) losses['loc'], losses['landm'], losses['class'] = \ multi_box_loss(labels, predictions) total_loss = tf.add_n([l for l in losses.values()]) if cfg['distributed']: # Horovod: add Horovod Distributed GradientTape. tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(total_loss, model.trainable_variables) optimizer.apply_gradients(zip(grads, model.trainable_variables)) if cfg['distributed'] and first_batch and epoch: hvd.broadcast_variables(model.variables, root_rank=0) hvd.broadcast_variables(optimizer.variables(), root_rank=0) return total_loss, losses def test_step(inputs, img_name): _, img_height_raw, img_width_raw, _ = inputs.shape # pad input image to avoid unmatched shape problem img = inputs[0].numpy() # if img_name == '6_Funeral_Funeral_6_618': # resize = 0.5 # this image is too big to avoid OOM problem # img = cv2.resize(img, None, None, fx=resize, fy=resize, # interpolation=cv2.INTER_LINEAR) img, pad_params = pad_input_image(img, max_steps=max(cfg['steps'])) input_img = img[np.newaxis, ...] 
predictions = model(input_img, training=False) outputs = pred_to_outputs(cfg, predictions, input_img.shape).numpy() # recover padding effect outputs = recover_pad_output(outputs, pad_params) bboxs = outputs[:, :4] confs = outputs[:, -1] pred_boxes = [] for box, conf in zip(bboxs, confs): x = int(box[0] * img_width_raw) y = int(box[1] * img_height_raw) w = int(box[2] * img_width_raw) - int(box[0] * img_width_raw) h = int(box[3] * img_height_raw) - int(box[1] * img_height_raw) pred_boxes.append([x, y, w, h, conf]) pred_boxes = np.array(pred_boxes).astype('float') return pred_boxes #training loop summary_writer = tf.summary.create_file_writer(os.path.join(cfg['output_path'], 'logs', cfg['sub_name'])) prog_bar = ProgressBar(steps_per_epoch, 0) if cfg['evaluation_during_training']: widerface_eval_hard = WiderFaceEval(split='hard') for epoch in range(cfg['epoch']): try: actual_epoch = epoch + 1 if cfg['distributed']: if hvd.rank() == 0: print("\nStart of epoch %d" % (actual_epoch,)) else: print("\nStart of epoch %d" % (actual_epoch,)) checkpoint.epoch.assign_add(1) start_time = time.time() #Iterate over the batches of the dataset. for batch, (x_batch_train, y_batch_train, img_name) in enumerate(train_dataset): total_loss, losses = train_step(x_batch_train, y_batch_train, batch == 0, epoch == 0) if cfg['distributed']: if hvd.rank() == 0: # prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format( # checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32))) if batch % 100 == 0: print("batch={}/{}, epoch={}/{}, loss={:.4f}, lr={:.1e}".format( batch, steps_per_epoch, checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32))) else: prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format( checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32))) # Display metrics at the end of each epoch. # train_acc = train_acc_metric.result() # print("\nTraining loss over epoch: %.4f" % (float(total_loss.numpy()),)) if cfg['distributed']: if hvd.rank() == 0: print("Time taken: %.2fs" % (time.time() - start_time)) manager.save() print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint)) else: print("Time taken: %.2fs" % (time.time() - start_time)) manager.save() print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint)) if cfg['evaluation_during_training']: # Run a validation loop at the end of each epoch. 
for batch, (x_batch_val, y_batch_val, img_name) in enumerate(val_dataset.take(500)): if '/' in img_name.numpy()[0].decode(): img_name = img_name.numpy()[0].decode().split('/')[1].split('.')[0] else: img_name = [] pred_boxes = test_step(x_batch_val, img_name) gt_boxes = labels_to_boxes(y_batch_val) widerface_eval_hard.update(pred_boxes, gt_boxes, img_name) ap_hard = widerface_eval_hard.calculate_ap() widerface_eval_hard.reset() if cfg['distributed']: if hvd.rank() == 0: print("Validation acc: %.4f" % (float(ap_hard),)) else: print("Validation acc: %.4f" % (float(ap_hard),)) def tensorboard_writer(): with summary_writer.as_default(): tf.summary.scalar('loss/total_loss', total_loss, step=actual_epoch) for k, l in losses.items(): tf.summary.scalar('loss/{}'.format(k), l, step=actual_epoch) tf.summary.scalar('learning_rate', optimizer._decayed_lr(tf.float32), step=actual_epoch) if cfg['evaluation_during_training']: tf.summary.scalar('Val AP', ap_hard, step=actual_epoch) if cfg['distributed']: if hvd.rank() == 0: tensorboard_writer() else: tensorboard_writer() except Exception as E: print(E) continue if cfg['distributed']: if hvd.rank() == 0: manager.save() print("\n[*] training done! save ckpt file at {}".format( manager.latest_checkpoint)) else: manager.save() print("\n[*] training done! save ckpt file at {}".format( manager.latest_checkpoint))
def train(action_set, level_names): """Train.""" local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' actor_hosts = FLAGS.actor_hosts.split(',') num_actors = len(actor_hosts) learner_host = FLAGS.learner_host.split(',') assert (len(learner_host) == 1) if is_learner: assert (FLAGS.task == 0) assert (has_horovod == True) hvd.init() # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = '/job:learner/task:0' + '/cpu' filters = [shared_job_device, local_job_device] cluster = tf.train.ClusterSpec({ 'actor': actor_hosts, 'learner': learner_host }) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) if is_learner: config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task, config=config) # Only used to find the actor output structure. Agent = agent_factory(FLAGS.agent_name) with tf.Graph().as_default(): agent = Agent(len(action_set)) env = create_environment(level_names[0], seed=1) structure = build_actor(agent, env, level_names[0], action_set) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] # build graph for actor or learner with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(FLAGS.queue_capacity, dtypes, shapes, shared_name='buffer') agent = Agent(len(action_set)) # Build actors and ops to enqueue their output. enqueue_ops = [] for i in range(num_actors): if is_actor_fn(i): level_name = level_names[i % len(level_names)] tf.logging.info('Creating actor %d with level %s', i, level_name) env = create_environment(level_name, seed=i + 1) actor_output = build_actor(agent, env, level_name, action_set) with tf.device(shared_job_device): enqueue_ops.append( queue.enqueue(nest.flatten(actor_output))) # Build learner. if is_learner: # Create global step, which is the number of environment frames # processed. g_step = tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) # Create batch (time major) and recreate structure. dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list( range(t.shape.ndims))[2:]), s) dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device("/gpu"): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 # step policy lag. 
flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) # Unroll agent on sequence, create losses and update ops. if hasattr(data_from_actors, 'agent_state'): agent_state = data_from_actors.agent_state else: agent_state = agent.initial_state(1) output, optimizer = build_learner( agent, agent_state=agent_state, env_outputs=data_from_actors.env_outputs, agent_outputs=data_from_actors.agent_outputs, g_step=g_step) # Create MonitoredSession (to run the graph, checkpoint and log). is_chief = is_learner # MonitoredTrainingSession inits all global variables hooks = [py_process.PyProcessHook()] if is_learner: # for variable initialization across learners hooks.append(hvd.BroadcastGlobalVariablesHook(0)) tf.logging.info('Creating MonitoredSession, is_chief %s', is_chief) if is_learner: tf.logging.info('At rank %d', hvd.rank()) # rank 0 takes care of ckpt saving checkpoint_dir = FLAGS.logdir if is_learner and hvd.rank( ) == 0 else None with tf.train.MonitoredTrainingSession(server.target, is_chief=is_chief, checkpoint_dir=checkpoint_dir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=hooks) as session: if is_learner: # tb Logging summary_writer = (tf.summary.FileWriterCache.get(FLAGS.logdir) if hvd.rank() == 0 else None) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. num_env_frames_v = 0 while num_env_frames_v < FLAGS.total_environment_frames: level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run( (data_from_actors.level_name, ) + output + (stage_op, )) level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) for (level_name, episode_return, episode_step, episode_raw_return, episode_raw_step) in zip( level_names_v[done_v], infos_v.episode_return[done_v], infos_v.episode_step[done_v], infos_v.episode_raw_return[done_v], infos_v.episode_raw_step[done_v]): episode_frames = episode_step tf.logging.info( 'learner rank: %d, Env: %s Episode return: %f ' 'Episode raw return: %f', hvd.rank(), level_name, episode_return, episode_raw_return) if hvd.rank() == 0: # tb Logging summary = tf.summary.Summary() summary.value.add(tag=level_name + '/episode_return', simple_value=episode_return) summary.value.add(tag=level_name + '/episode_frames', simple_value=episode_frames) summary.value.add(tag=level_name + '/episode_raw_return', simple_value=episode_raw_return) summary.value.add(tag=level_name + '/episode_raw_frames', simple_value=episode_raw_step) summary_writer.add_summary(summary, num_env_frames_v) else: # Execute actors (they just need to enqueue their output). while True: session.run(enqueue_ops)
def main(): gpu_thread_count = 2 os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' hvd.init() # random.seed(5 * (1 + hvd.rank())) # np.random.seed(7 * (1 + hvd.rank())) # tf.set_random_seed(31 * (1 + hvd.rank())) cmdline = add_cli_args() FLAGS, unknown_args = cmdline.parse_known_args() if len(unknown_args) > 0: for bad_arg in unknown_args: print("ERROR: Unknown command line arg: %s" % bad_arg) raise ValueError("Invalid command line arg(s)") FLAGS.data_dir = None if FLAGS.data_dir == "" else FLAGS.data_dir FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir #+ FLAGS.log_dir_suffix filename_pattern = os.path.join(FLAGS.data_dir, '%s-*') train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation')) num_training_samples = get_num_records(train_filenames) training_samples_per_rank = num_training_samples // hvd.size() height, width = 224, 224 global_batch_size = FLAGS.batch_size * hvd.size() if FLAGS.num_epochs is not None: if FLAGS.data_dir is None: raise ValueError("num_epochs requires --data_dir to be specified") nstep = num_training_samples * FLAGS.num_epochs // global_batch_size decay_steps = nstep else: nstep = FLAGS.num_batches FLAGS.num_epochs = max( nstep * global_batch_size // num_training_samples, 1) decay_steps = 90 * num_training_samples // global_batch_size nstep_per_epoch = num_training_samples // global_batch_size if FLAGS.lr_decay_mode == 'steps': steps = [ int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',') ] lr_steps = [FLAGS.lr] for i in range(len(FLAGS.lr_decay_steps.split(','))): lr_steps.append(FLAGS.lr * pow(FLAGS.lr_decay_factor, i + 1)) else: steps = [] lr_steps = [] if not FLAGS.save_checkpoints_steps: # default to save one checkpoint per epoch FLAGS.save_checkpoints_steps = nstep_per_epoch if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = nstep_per_epoch warmup_it = nstep_per_epoch * FLAGS.warmup_epochs if not FLAGS.log_name: FLAGS.log_name = "aws_tf_resnet" if FLAGS.eval: FLAGS.log_name = 'eval' + FLAGS.log_name logger = logging.getLogger(FLAGS.log_name) logger.setLevel(logging.INFO) # INFO, ERROR # file handler which logs debug messages if not os.path.isdir(FLAGS.log_dir): try: os.makedirs(FLAGS.log_dir) except FileExistsError: # if log_dir is common for multiple ranks like on nfs pass fh = logging.FileHandler(os.path.join(FLAGS.log_dir, FLAGS.log_name)) fh.setLevel(logging.DEBUG) # console handler ch = logging.StreamHandler() ch.setLevel(logging.INFO) # add formatter to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(message)s') fh.setFormatter(formatter) ch.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) logger.addHandler(ch) rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = 5 rank0log(logger, "Horovod size: ", hvd.size()) if FLAGS.local_ckpt: do_checkpoint = hvd.local_rank() == 0 else: do_checkpoint = hvd.rank() == 0 classifier = tf.estimator.Estimator( 
model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ 'model': FLAGS.model, 'decay_steps': decay_steps, 'n_classes': 1000, 'dtype': tf.float16 if FLAGS.fp16 else tf.float32, 'format': 'channels_first', 'device': '/gpu:0', 'lr': FLAGS.lr, 'mom': FLAGS.mom, 'wdecay': FLAGS.wdecay, 'steps': steps, 'lr_steps': lr_steps, 'lr_decay_mode': FLAGS.lr_decay_mode, 'warmup_it': warmup_it, 'warmup_lr': FLAGS.warmup_lr, 'loss_scale': FLAGS.loss_scale, 'adv_bn_init': FLAGS.adv_bn_init, 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None, }, config=tf.estimator.RunConfig( # tf_random_seed=31 * (1 + hvd.rank()), session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, keep_checkpoint_max=None)) if not FLAGS.eval: num_preproc_threads = 5 rank0log(logger, "Preproc threads", num_preproc_threads) training_hooks = [ hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook() ] if hvd.rank() == 0: training_hooks.append( LogSessionRunHook(global_batch_size, num_training_samples, FLAGS.display_every, logger)) try: start_time = time.time() classifier.train( input_fn=lambda: make_dataset(train_filenames, training_samples_per_rank, FLAGS.batch_size, height, width, training=True, num_threads=num_preproc_threads, shard=True), max_steps=nstep, hooks=training_hooks) rank0log(logger, "Finished in ", time.time() - start_time) except KeyboardInterrupt: print("Keyboard interrupt") if True: rank0log(logger, "Evaluating") rank0log( logger, "Validation dataset size: {}".format( get_num_records(eval_filenames))) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) time.sleep(5) # a little extra margin... if not FLAGS.eval: FLAGS.num_gpus = hvd.size() if FLAGS.num_gpus == 1: rank0log( logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU, ensure you set --num_gpus to the number of GPUs it was trained on. This will ensure that the epoch number is accurately displayed in the below logs.""" ) try: ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if i < len(ckpts) - 1: if (not FLAGS.eval_interval) or \ (i % FLAGS.eval_interval != 0): continue eval_result = classifier.evaluate( input_fn=lambda: make_dataset(eval_filenames, get_num_records( eval_filenames), FLAGS.batch_size, height, width, training=False, shard=True), checkpoint_path=c['path']) c['epoch'] = (c['step'] * FLAGS.num_gpus) / (nstep_per_epoch * hvd.size()) c['top1'] = eval_result['val-top1acc'] c['top5'] = eval_result['val-top5acc'] c['loss'] = eval_result['loss'] rank0log(logger, ' step epoch top1 top5 loss time(h)') barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) for i, c in enumerate(ckpts): tf.Session(config=config).run(barrier) if 'top1' not in c: continue rank0log( logger, '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {:10.3f}'. format(c['step'], c['epoch'], c['top1'] * 100, c['top5'] * 100, c['loss'], c['mtime'])) rank0log(logger, "Finished evaluation") except KeyboardInterrupt: logger.error("Keyboard interrupt")
def parse_args(): global DEBUG global MODE global INFERENCE_ONLY global INFERENCE_OUT global DATA_THREADS global RANK global RANKS global DTYPE #global hvd global DISTRIBUTED global BATCH_SIZE global BATCH_SIZE_TEST # Parse command line args. parser = argparse.ArgumentParser(prog='ML-Dock-GN using Tensorflow + graph_nets Backend', description='Processing input flags for Training Run.') parser.add_argument('--batch_size', type=int, default=4, help='The (local) minibatch size used in training.') parser.add_argument('--batch_size_test', type=int, default=8, help='The (local) minibatch size used in testing.') parser.add_argument('--map_train', type=str, required=True, help='Path to .map file for training set.') parser.add_argument('--map_test', type=str, required=True, help='Path to .map file for test set.') parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.') parser.add_argument('--mlp_layers', type=str, default="4,4", help='Number of layers in each MLP.') parser.add_argument('--mlp_latent', type=str, default="32,16", help='Number of neurons in each MLP layer.') parser.add_argument('--num_features', type=str, default="64,64", help='Number of output protein features, ligand features.') parser.add_argument('--gnn_layers', type=str, default="4,8", help='Number of message passing steps.') parser.add_argument('--lr_init', type=float, default=0.01, help='Initial learning rate.') parser.add_argument('--hvd', type=bool, default=False, help='Enable the use of Horovod.') parser.add_argument('--debug', type=bool, default=True, help='Enable debug tests / prints.') parser.add_argument('--use_clr', type=bool, default=False, help='Use Cyclic Learning Rate if true else constant.') parser.add_argument('--inference_only', type=bool, default=False, help='Ignore training, only use test set.') parser.add_argument('--inference_out', type=str, default=None, help='Write test set predictions to file.') parser.add_argument('--data_threads', type=int, default=1, help='Number of data loading threads.') parser.add_argument('--mode', type=str, default="regression", help='Training mode: "regression" or "classification".') parser.add_argument('--restore', type=str, default=None, help='Path to checkpoint file.') parser.add_argument('--plot_history', type=bool, default=False, help='Save training/testing history images') parser.add_argument('--use_fp16', type=bool, default=False, help='Use half-precision (tf.float16)') args = parser.parse_args() DEBUG = args.debug MODE = args.mode INFERENCE_ONLY = args.inference_only INFERENCE_OUT = args.inference_out DATA_THREADS = args.data_threads DTYPE = tf.float16 if args.use_fp16 else tf.float32 BATCH_SIZE=args.batch_size BATCH_SIZE_TEST=args.batch_size_test print(args) if args.hvd: print("Starting horovod...") import horovod.tensorflow as hvd #hvd = hvd_temp hvd.init() RANK = hvd.rank() RANKS = hvd.size() DISTRIBUTED=True print("Initialization of horovod complete...") #Index the output filenames for inference output data by rank ID if(args.inference_out != None): INFERENCE_OUT = str(args.inference_out).split(".")[0] + "_%s.map"%str(RANK) print("Rank %s"%str(RANK), " is saving inference output to %s"%str(INFERENCE_OUT)) if RANK != 0: #Only rank 0 should print debug info DEBUG = False #Reduce logging for all ranks other than 0 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # banner_print("PharML.Bind-GNN: Version 1.0.1 - Framework for Open Therapeutics with Graph Neural Networks.") 
banner_print("============================================================================================") banner_print(" Developed by") banner_print(" Jacob Balma: [email protected]") banner_print(" Aaron Vose: [email protected]") banner_print(" Yuri Petersen: [email protected]") banner_print("This work is supported by collaboration with Cray, Inc, Medical University of South Carolina (MUSC) and Hewlett Packard Enterprise (HPE). ") banner_print("============================================================================================") if DEBUG: print(args) # Return parsed args. return args
def main(_): # Horovod: initialize Horovod. hvd.init() hvd_size = hvd.size() print("hvd size: {}".format(hvd_size)) parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, default='/tensorflow/mnist/', help='Directory which contains dataset') parser.add_argument('--steps', type=int, default=100, help='steps') FLAGS, _ = parser.parse_known_args() # Ensure data directory passed to the script contains proper dataset dir_content = os.listdir(FLAGS.data_dir) for file in FILENAMES: if file not in dir_content: print( "Directory provided by user does not contains proper dataset") FLAGS.data_dir = os.path.join(FLAGS.data_dir, "input_data_{}".format(hvd.rank())) break # Read/download local dataset. Different copy for each process. mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir) # Name images placeholder to be able to retrieve it from saved meta graph. images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME) dense_dropout_placeholder = tf.placeholder_with_default(1.0, []) labels_placeholder = tf.placeholder(tf.int64, [None]) logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder) # Exporting meta graph right now takes care of removing Horovod specific ops before serving. Graph right now # also does not contain any training specific ops, so it is optimized for serving too. tf.train.export_meta_graph("graph.meta", as_text=True) loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits) accuracy = tf.reduce_mean( tf.cast(tf.equal(predictions, labels_placeholder), tf.float32)) # Define summary ops to save summaries for later use in tensorboard. tf.summary.scalar("accuracy", accuracy) tf.summary.scalar("loss", loss) summary_op = tf.summary.merge_all() # Horovod: adjust learning rate based on number of workers. optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size()) global_step = tf.contrib.framework.get_or_create_global_step() # Wrap standard optimizer in Horovod distributed one. train = hvd.DistributedOptimizer(optimizer).minimize( loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of workers. tf.train.StopAtStepHook(FLAGS.steps // hvd_size), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Only master saves summaries. if hvd.rank() == 0: hooks += [ # As previously mentioned summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be discovered by # tensorboard. tf.train.SummarySaverHook(save_steps=1, output_dir=os.path.join( EXPERIMENT_OUTPUT_PATH, "tensorboard"), summary_op=summary_op) ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. As previously mentioned # checkpoints are saved to EXPERIMNET_OUTPUT_PATH which makes them accessible by user. checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints") if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. 
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as mon_sess: while not mon_sess.should_stop(): images, labels = mnist.train.next_batch(64) _, loss_val, accuracy_val, global_step_val = mon_sess.run( [train, loss, accuracy, global_step], feed_dict={ images_placeholder: images, labels_placeholder: labels, dense_dropout_placeholder: 0.5 }) # Only master publishes metrics. if hvd.rank() == 0: # Publish metrics just like in the single node example. publish({ "loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val) }) # Save servable model only from Horovod master. if hvd.rank() == 0: # Create a new graph to import the previously exported one. with tf.Graph().as_default(): # Import previously saved meta graph. restorer = tf.train.import_meta_graph("graph.meta") with tf.Session() as session: checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) restorer.restore(session, checkpoint_file) # Get handlers for images placeholder and scores op with names defined before. images_placeholder = tf.get_default_graph().get_tensor_by_name( INPUT_NAME + ":0") scores = tf.get_default_graph().get_tensor_by_name( SCORES_NAME + ":0") # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user. builder = tf.saved_model.builder.SavedModelBuilder( os.path.join(EXPERIMENT_OUTPUT_PATH, "1")) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={ MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info( images_placeholder) }, outputs={ MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores) }, method_name=tf.saved_model.signature_constants. PREDICT_METHOD_NAME)) builder.add_meta_graph_and_variables( session, [tf.saved_model.tag_constants.SERVING], signature_def_map={ MODEL_SIGNATURE_NAME: prediction_signature }, main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save()
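# Illustrative addition: a minimal sketch of loading the SavedModel exported above for
# inference, using the same TF1-style API and the constants already defined in this
# script (EXPERIMENT_OUTPUT_PATH, INPUT_NAME, SCORES_NAME). Not part of the original
# training script.
def load_and_predict(images):
    export_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "1")
    with tf.Graph().as_default(), tf.Session() as session:
        # Restore the graph and variables saved by SavedModelBuilder.
        tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], export_dir)
        graph = tf.get_default_graph()
        images_placeholder = graph.get_tensor_by_name(INPUT_NAME + ":0")
        scores = graph.get_tensor_by_name(SCORES_NAME + ":0")
        return session.run(scores, feed_dict={images_placeholder: images})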
def main():
    hvd.init()

    n_epochs = 10
    batch_size = 5
    step = len(im) // batch_size

    params = parse_args(PARSER.parse_args())
    optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
    ce_loss = tf.keras.metrics.Mean(name='ce_loss')
    f1_loss = tf.keras.metrics.Mean(name='dice_loss')
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    pb_i = Progbar(step, stateful_metrics=metrics_names)

    count = 0
    for epoch in range(n_epochs):
        if count >= step:
            count = 0

        # Slice the next minibatch of images and masks and move channels last.
        features = im[epoch * batch_size:(epoch * batch_size) + batch_size]
        features = np.reshape(features,
                              (len(features), features[0].shape[1],
                               features[0].shape[2], features[0].shape[0]))
        features = features.astype('float32')

        labels = lb[epoch * batch_size:(epoch * batch_size) + batch_size]
        labels = np.reshape(
            labels, (len(labels), labels[0].shape[0], labels[0].shape[1], 1))
        labels = labels.astype('float32')

        print(features.shape, labels.shape)
        print('Epoch {} out of epochs {}'.format(epoch, n_epochs))

        for i, (features_, labels_) in enumerate(zip(features, labels)):
            with tf.GradientTape() as tape:
                # Feed one example at a time, adding an explicit batch dimension.
                output_map = model(features_[np.newaxis, ...])
                crossentropy_loss, dice_loss = partial_losses(
                    output_map, labels_[np.newaxis, ...])
                added_losses = tf.add(crossentropy_loss,
                                      dice_loss,
                                      name='total_loss_ref')
                values = [('Xent', crossentropy_loss),
                          ('added_losses', added_losses)]
                pb_i.add(1, values=values)

            # Calculate the gradients using our tape and then update the model
            # weights; the Horovod tape averages gradients across workers.
            tape = hvd.DistributedGradientTape(tape)
            gradients = tape.gradient(added_losses, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Validation loop still needs fixing.
        # val_total_loss = 0
        # val_total_acc = 0
        # total_val_num = 0
        # for bIdx, (val_X, val_y) in enumerate(val_batch):
        #     if bIdx >= features.shape[0]:
        #         break
        #     y_pred = model(val_X, training=False)

        print('Xen: ', crossentropy_loss, dice_loss, added_losses)
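# Illustrative addition: the loop above wraps the tape with hvd.DistributedGradientTape
# but never broadcasts the initial variable state, so workers can start from different
# random weights. A minimal sketch of the usual remedy, mirroring the broadcast pattern
# used by the other Horovod examples in this file, is shown below (hypothetical helper,
# not part of the original script).
def broadcast_initial_state(model, optimizer, first_batch):
    # Must run after the first apply_gradients so the optimizer slot variables exist.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)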
def __init__( self, # Model Params input_format, # NCHW or NHWC compute_format, # NCHW or NHWC n_channels, activation_fn, weight_init_method, model_variant, input_shape, mask_shape, input_normalization_method, # Training HParams augment_data, loss_fn_name, # Runtime HParams use_tf_amp, use_xla, # Directory Params model_dir=None, log_dir=None, sample_dir=None, data_dir=None, dataset_name=None, dataset_hparams=None, # Debug Params log_every_n_steps=1, debug_verbosity=0, seed=None): if dataset_hparams is None: dataset_hparams = dict() if compute_format not in ["NHWC", 'NCHW']: raise ValueError( "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])" % compute_format) if input_format not in ["NHWC", 'NCHW']: raise ValueError( "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])" % input_format) if n_channels not in [1, 3]: raise ValueError( "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))" % n_channels) if data_dir is not None and not os.path.exists(data_dir): raise ValueError("The `data_dir` received does not exists: %s" % data_dir) LOGGER.set_model_name('UNet_TF') LOGGER.set_backends([ dllg.JsonBackend(log_file=os.path.join(model_dir, 'dlloger_out.json'), logging_scope=dllg.Scope.TRAIN_ITER, iteration_interval=log_every_n_steps), dllg.StdOutBackend(log_file=None, logging_scope=dllg.Scope.TRAIN_ITER, iteration_interval=log_every_n_steps) ]) if hvd_utils.is_using_hvd(): hvd.init() if hvd.rank() != 0: os.environ['WANDB_MODE'] = 'dryrun' wandb_id = os.environ.get('WANDB_ID', None) if wandb_id is None: wandb.init(config={ 'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None) }) else: wandb.init(config={ 'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None) }, id=f"{wandb_id}{hvd.rank()}") wandb.tensorboard.patch(save=False) if hvd.local_rank() == 0: LOGGER.log("Horovod successfully initialized ...") tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None else: tf_seed = 2 * seed if seed is not None else None # ============================================ # Optimisation Flags - Do not remove # ============================================ os.environ['CUDA_CACHE_DISABLE'] = '0' os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL' # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd( ) else str(hvd.size()) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ADJUST_HUE_FUSED'] = '1' os.environ['TF_ADJUST_SATURATION_FUSED'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' os.environ['TF_SYNC_ON_FINISH'] = '0' os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # os.environ['TF_DISABLE_NVTX_RANGES'] = '1' # ================================================= self.use_xla = use_xla if use_tf_amp: if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0: LOGGER.log("TF AMP is activated - Experimental Feature") os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" # ================================================= model_hparams = tf.contrib.training.HParams( # Model Params input_format=input_format, compute_format=compute_format, input_shape=input_shape, mask_shape=mask_shape, n_channels=n_channels, activation_fn=activation_fn, weight_init_method=weight_init_method, model_variant=model_variant, input_normalization_method=input_normalization_method, # Training HParams augment_data=augment_data, loss_fn_name=loss_fn_name, # Runtime Params use_tf_amp=use_tf_amp, # Debug Params log_every_n_steps=log_every_n_steps, 
debug_verbosity=debug_verbosity, seed=tf_seed) run_config_additional = tf.contrib.training.HParams( dataset_hparams=dataset_hparams, model_dir=model_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None, log_dir=log_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None, sample_dir=sample_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None, data_dir=data_dir, num_preprocessing_threads=32, ) if not hvd_utils.is_using_hvd() or hvd.rank() == 0: try: os.makedirs(sample_dir) except FileExistsError: pass self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional) if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0: LOGGER.log('Defining Model Estimator ...\n') self._model = UNet_v1( model_name="UNet_v1", input_format=self.run_hparams.input_format, compute_format=self.run_hparams.compute_format, n_output_channels=1, unet_variant=self.run_hparams.model_variant, weight_init_method=self.run_hparams.weight_init_method, activation_fn=self.run_hparams.activation_fn) if self.run_hparams.seed is not None: if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0: LOGGER.log("Deterministic Run - Seed: %d\n" % seed) tf.set_random_seed(self.run_hparams.seed) np.random.seed(self.run_hparams.seed) random.seed(self.run_hparams.seed) if dataset_name not in known_datasets.keys(): raise RuntimeError( "The dataset `%s` is unknown, allowed values: %s ..." % (dataset_name, list(known_datasets.keys()))) self.dataset = known_datasets[dataset_name]( data_dir=data_dir, **self.run_hparams.dataset_hparams) self.num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
def main(_):
    """
    Starting point of the application
    """
    hvd.init()
    set_flags()
    params = parse_args(PARSER.parse_args())
    model_dir = prepare_model_dir(params)
    logger = get_logger(params)

    estimator = build_estimator(params, model_dir)

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.crossvalidation_idx,
                      augment=params.augment,
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params.seed)

    if 'train' in params.exec_mode:
        max_steps = params.max_steps // (1 if params.benchmark else hvd.size())
        hooks = [hvd.BroadcastGlobalVariablesHook(0),
                 TrainingHook(logger,
                              max_steps=max_steps,
                              log_every=params.log_every)]

        if params.benchmark and hvd.rank() == 0:
            hooks.append(ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode='train'))

        estimator.train(
            input_fn=dataset.train_fn,
            steps=max_steps,
            hooks=hooks)

    if 'evaluate' in params.exec_mode:
        if hvd.rank() == 0:
            results = estimator.evaluate(input_fn=dataset.eval_fn, steps=dataset.eval_size)
            logger.log(step=(),
                       data={"eval_ce_loss": float(results["eval_ce_loss"]),
                             "eval_dice_loss": float(results["eval_dice_loss"]),
                             "eval_total_loss": float(results["eval_total_loss"]),
                             "eval_dice_score": float(results["eval_dice_score"])})

    if 'predict' in params.exec_mode:
        if hvd.rank() == 0:
            predict_steps = dataset.test_size
            hooks = None
            if params.benchmark:
                hooks = [ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode="test")]
                predict_steps = params.warmup_steps * 2 * params.batch_size

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=math.ceil(predict_steps / dataset.test_size)),
                hooks=hooks)
            binary_masks = [np.argmax(p['logits'], axis=-1).astype(np.uint8) * 255
                            for p in predictions]

            if not params.benchmark:
                multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                                 for mask in binary_masks]

                output_dir = os.path.join(params.model_dir, 'pred')

                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                      compression="tiff_deflate",
                                      save_all=True,
                                      append_images=multipage_tif[1:])
def main(args):
    # Initialize horovod
    hvd.init()

    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Generate local filename.
    # Assume the dataset has been split in advance.
    local_file = args.data_filename_prefix + str(hvd.local_rank()) + ".file"

    # Generate local batch size.
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()

    dataset = utility.TFDataset(filename=local_file,
                                batchsize=local_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    # Because there is no tensorflow distribute strategy, sok.Init() will call horovod to
    # broadcast nccl id and random seed, so it must be called after hvd.init()
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers)

    embedding_optimizer = utility.get_embedding_optimizer(
        args.optimizer)(learning_rate=0.1)
    dense_optimizer = utility.get_dense_optimizer(
        args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)

        # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
        tape = hvd.DistributedGradientTape(tape)
        # There is no need to wrap the emb_tape because the communication is done by sok
        # emb_tape = hvd.DistributedGradientTape(emb_tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)

        # type(emb_tape) here is tf.GradientTape
        # type(tape) here is hvd.DistributedGradientTape
        emb_grads = emb_tape.gradient(replica_loss, emb_variable)
        grads = tape.gradient(replica_loss, other_variable)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure the
        # optimizer has been initialized. There is no need to broadcast emb_variable
        # and embedding_optimizer, because the parallel mode inside sok is model
        # parallel and the communication is done by sok itself.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")

        total_loss = _train_step(inputs, labels, i == 0)

        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))
def main(input_path,blocks,weights,image_dir,checkpoint_dir,trn_sz,learning_rate): #init horovod comm_rank = 0 comm_local_rank = 0 comm_size = 1 if horovod: hvd.init() comm_rank = hvd.rank() comm_local_rank = hvd.local_rank() comm_size = hvd.size() if comm_rank == 0: print("Using distributed computation with Horovod: {} total ranks".format(comm_size,comm_rank)) #parameters batch = 1 channels = [0,1,2,10] num_epochs = 150 dtype = tf.float32 #session config sess_config=tf.ConfigProto(inter_op_parallelism_threads=2, #1 intra_op_parallelism_threads=33, #6 log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(comm_local_rank) #get data training_graph = tf.Graph() if comm_rank == 0: print("Loading data...") path, trn_data, val_data, tst_data = load_data(input_path,comm_size,comm_rank,trn_sz) if comm_rank == 0: print("Shape of trn_data is {}".format(trn_data.shape[0])) print("done.") with training_graph.as_default(): #create datasets #files = tf.placeholder(tf.string, shape=[None]) trn_manager = h5_input_manager(path, trn_data, channels, update_on_read=True) trn_dataset = create_dataset(trn_manager, batch, num_epochs, shuffle=True) val_manager = h5_input_manager(path, val_data, channels, update_on_read=False) val_dataset = create_dataset(val_manager, batch, 1, shuffle=False) #create iterators handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder") iterator = tf.data.Iterator.from_string_handle(handle, (tf.float32, tf.int32), ((batch, len(channels), image_height, image_width), (batch, image_height, image_width)) ) next_elem = iterator.get_next() #create init handles #trn trn_iterator = trn_dataset.make_initializable_iterator() trn_handle_string = trn_iterator.string_handle() trn_init_op = iterator.make_initializer(trn_dataset) #val val_iterator = val_dataset.make_initializable_iterator() val_handle_string = val_iterator.string_handle() val_init_op = iterator.make_initializer(val_dataset) #set up model logit, prediction, weight = create_tiramisu(3, next_elem[0], image_height, image_width, len(channels), loss_weights=weights, nb_layers_per_block=blocks, p=0.2, wd=1e-4, dtype=dtype) #set up loss labels_one_hot = tf.contrib.layers.one_hot_encoding(next_elem[1], 3) weighted_labels_one_hot = tf.multiply(labels_one_hot, weight) loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_labels_one_hot,logits=logit) #loss = tf.losses.sparse_softmax_cross_entropy(labels=next_elem[1],logits=logit,weights=weight) #set up global step global_step = tf.train.get_or_create_global_step() #set up optimizer opt = tf.train.AdamOptimizer(learning_rate=learning_rate) if horovod: opt = hvd.DistributedOptimizer(opt) train_op = opt.minimize(loss, global_step=global_step) #set up streaming metrics iou_op, iou_update_op = tf.metrics.mean_iou(prediction,labels_one_hot,3,weights=None,metrics_collections=None,updates_collections=None,name="iou_score") #compute epochs and stuff: num_samples = trn_data.shape[0] // comm_size num_steps_per_epoch = num_samples // batch num_steps = num_epochs*num_steps_per_epoch #hooks #these hooks are essential. 
regularize the step hook by adding one additional step at the end hooks = [tf.train.StopAtStepHook(last_step=num_steps+1)] if horovod: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) #initializers: init_op = tf.global_variables_initializer() init_local_op = tf.local_variables_initializer() #checkpointing if comm_rank == 0: checkpoint_save_freq = num_steps_per_epoch * 10 checkpoint_saver = tf.train.Saver(max_to_keep = 1000) hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir, save_steps=checkpoint_save_freq, saver=checkpoint_saver)) #create image dir if not exists if not os.path.isdir(image_dir): os.makedirs(image_dir) ##DEBUG ##summary #if comm_rank == 0: # print("write graph for debugging") # tf.summary.scalar("loss",loss) # summary_op = tf.summary.merge_all() # #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op)) # with tf.Session(config=sess_config) as sess: # sess.run([init_op, init_local_op]) # #create iterator handles # trn_handle = sess.run(trn_handle_string) # #init iterators # sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels}) # #summary: # sess.run(summary_op, feed_dict={handle: trn_handle}) # #summary file writer # summary_writer = tf.summary.FileWriter('./logs', sess.graph) ##DEBUG #start session with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess: #initialize sess.run([init_op, init_local_op]) #create iterator handles trn_handle, val_handle = sess.run([trn_handle_string, val_handle_string]) #init iterators sess.run(trn_init_op, feed_dict={handle: trn_handle}) sess.run(val_init_op, feed_dict={handle: val_handle}) #do the training epoch = 1 train_loss = 0. start_time = time.time() while not sess.should_stop(): #training loop try: #construct feed dict _, _, train_steps, tmp_loss = sess.run([train_op, iou_update_op, global_step, loss], feed_dict={handle: trn_handle}) train_steps_in_epoch = train_steps%num_steps_per_epoch train_loss += tmp_loss if train_steps_in_epoch > 0: #print step report print("REPORT: rank {}, training loss for step {} (of {}) is {}".format(comm_rank, train_steps, num_steps, train_loss/train_steps_in_epoch)) else: end_time = time.time() #print epoch report train_loss /= num_steps_per_epoch print("COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, epoch duration {} s".format(comm_rank, epoch, num_epochs, train_loss, end_time - start_time)) iou_score = sess.run(iou_op) print("COMPLETED: rank {}, training IoU for epoch {} (of {}) is {}, epoch duration {} s".format(comm_rank, epoch, num_epochs, iou_score, end_time - start_time)) start_time = time.time() #evaluation loop eval_loss = 0. 
eval_steps = 0 #update the input reader val_manager.minvals = trn_manager.minvals val_manager.maxvals = trn_manager.maxvals while True: try: #construct feed dict _, tmp_loss, val_model_predictions, val_model_labels = sess.run([iou_update_op, loss, prediction, next_elem[1]], feed_dict={handle: val_handle}) if use_scipy: imsave(image_dir+'/test_pred_epoch'+str(epoch)+'_estep' +str(eval_steps)+'_rank'+str(comm_rank)+'.png',np.argmax(val_model_predictions[0,...],axis=2)*100) imsave(image_dir+'/test_label_epoch'+str(epoch)+'_estep' +str(eval_steps)+'_rank'+str(comm_rank)+'.png',val_model_labels[0,...]*100) else: np.save(image_dir+'/test_pred_epoch'+str(epoch)+'_estep' +str(eval_steps)+'_rank'+str(comm_rank)+'.npy',np.argmax(val_model_predictions[0,...],axis=2)*100) np.save(image_dir+'/test_label_epoch'+str(epoch)+'_estep' +str(eval_steps)+'_rank'+str(comm_rank)+'.npy',val_model_labels[0,...]*100) eval_loss += tmp_loss eval_steps += 1 except tf.errors.OutOfRangeError: eval_steps = np.max([eval_steps,1]) eval_loss /= eval_steps print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}".format(comm_rank, epoch-1, num_epochs, eval_loss)) iou_score = sess.run(iou_op) print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}".format(comm_rank, epoch-1, num_epochs, iou_score)) sess.run(val_init_op, feed_dict={handle: val_handle}) break #reset counters epoch += 1 train_loss = 0. except tf.errors.OutOfRangeError: break
def horovod():
    hvd.init()
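# Illustrative addition: the stub above only initializes Horovod. A typical TF2 setup
# also pins one GPU per process and scales the learning rate by the number of workers.
# The sketch below shows that common pattern; `base_lr` is a hypothetical parameter and
# the helper is not defined anywhere else in this file.
def horovod_setup(base_lr=0.001):
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    # Horovod convention: scale the learning rate with the number of workers.
    return tf.optimizers.Adam(base_lr * hvd.size())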
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( 'At least one of `do_train` or `do_eval` must be True.') if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() tmp_outptu_dir = ('{0}-tmp-{1}').format(FLAGS.output_dir, hvd.local_rank()) FLAGS.output_dir = FLAGS.output_dir if hvd.rank( ) == 0 else tmp_outptu_dir bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( 'Cannot use sequence length %d because the BERT model was only trained up to sequence length %d' % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(','): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info( ('*** Input Files ***: {0} files').format(len(input_files))) if len(input_files) == 0: raise ValueError('No input file is found') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) config = tf.ConfigProto() # from tensorflow.core.protobuf import rewriter_config_pb2 # rewrite_config = rewriter_config_pb2.RewriterConfig( # auto_mixed_precision=rewriter_config_pb2.RewriterConfig.ON, # # do not remove duplicated nodes # arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF) # rewrite_config.min_graph_nodes = -1 # graph_options = tf.GraphOptions( # rewrite_options=rewrite_config, build_cost_model=1) # # config = tf.ConfigProto(graph_options=graph_options) # # config.graph_options.optimizer_options.opt_level = -1 # if not FLAGS.use_fp16: # config.graph_options.rewrite_options.auto_mixed_precision = ( # rewriter_config_pb2.RewriterConfig.ON) if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if FLAGS.use_xla: config.graph_options.optimizer_options.do_common_subexpression_elimination = True config.graph_options.optimizer_options.do_constant_folding = True config.graph_options.optimizer_options.do_function_inlining = True config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2 config.gpu_options.allow_growth = True is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, keep_checkpoint_max=1000 if not FLAGS.horovod or hvd.rank() == 0 else 1, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, session_config=config, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), log_step_count_steps=10000 * FLAGS.iterations_per_loop if FLAGS.report_loss else 100 * FLAGS.iterations_per_loop) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size 
if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, 100, -1 if not FLAGS.horovod else hvd.rank())) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info('***** Running training *****') tf.logging.info(' Batch size = %d', FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.logging.info('***** Running evaluation *****') tf.logging.info(' Batch size = %d', FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt') with tf.gfile.GFile(output_eval_file, 'w') as (writer): tf.logging.info('***** Eval results *****') for key in sorted(result.keys()): tf.logging.info(' %s = %s', key, str(result[key])) writer.write('%s = %s\n' % (key, str(result[key]))) return
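The pretraining script above routes every non-zero rank's checkpoints and summaries into a throwaway per-rank directory, so only rank 0 writes the real output while the other workers still have somewhere to spill Estimator state. A minimal sketch of that pattern, separated from the BERT specifics; the make_output_dir helper and the "./output" default are illustrative names, not part of the original script:

import os
import horovod.tensorflow as hvd

hvd.init()

def make_output_dir(base_dir="./output"):
    # Rank 0 keeps the real directory; every other rank gets a per-rank scratch dir.
    if hvd.rank() == 0:
        out_dir = base_dir
    else:
        out_dir = "{}-tmp-{}".format(base_dir, hvd.local_rank())
    os.makedirs(out_dir, exist_ok=True)
    return out_dir

output_dir = make_output_dir()

The scratch directories can be discarded after the run; only the rank-0 directory holds checkpoints worth keeping.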
def main(e2e_start_time): # Parse essential argumentss parser = argparse.ArgumentParser() parser.add_argument("--model_name", required=True) parser.add_argument("--model_size", default="base", type=str, help="base or large") parser.add_argument("--pretrain_tfrecords", type=str) parser.add_argument("--phase2", action='store_true') parser.add_argument("--fp16_compression", action='store_true') parser.add_argument("--amp", action='store_true', help="Whether to use fp16.") parser.add_argument("--xla", action='store_true', help="Whether to use xla.") parser.add_argument("--seed", default=42, type=int) parser.add_argument("--num_train_steps", type=int) parser.add_argument("--num_warmup_steps", type=int) parser.add_argument("--learning_rate", type=float) parser.add_argument("--train_batch_size", type=int) parser.add_argument("--max_seq_length", type=int) parser.add_argument("--mask_prob", type=float) parser.add_argument("--disc_weight", type=float) parser.add_argument("--generator_hidden_size", type=float) parser.add_argument("--log_freq", type=int, default=10, help="Training metrics logging frequency") parser.add_argument("--save_checkpoints_steps", type=int) parser.add_argument("--keep_checkpoint_max", type=int) parser.add_argument("--restore_checkpoint", default=None, type=str) parser.add_argument("--load_weights", action='store_true') parser.add_argument("--weights_dir") parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb") parser.add_argument( "--skip_adaptive", action='store_true', help="Whether to apply adaptive LR on LayerNorm and biases") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of Gradient Accumulation steps") parser.add_argument("--lr_decay_power", type=float, default=0.5, help="LR decay power") parser.add_argument("--opt_beta_1", type=float, default=0.878, help="Optimizer beta1") parser.add_argument("--opt_beta_2", type=float, default=0.974, help="Optimizer beta2") parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR") parser.add_argument("--log_dir", type=str, default=None, help="Path to store logs") parser.add_argument("--results_dir", type=str, default=None, help="Path to store all model results") parser.add_argument("--skip_checkpoint", action='store_true', default=False, help="Path to store logs") parser.add_argument( '--json-summary', type=str, default=None, help= 'If provided, the json summary will be written to the specified file.') args = parser.parse_args() config = PretrainingConfig(**args.__dict__) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) # Set up tensorflow hvd.init() args.log_dir = config.log_dir # DLLogger setup_logger(args) set_affinity(hvd.local_rank()) gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') tf.config.optimizer.set_jit(config.xla) #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp}) if config.amp: policy = tf.keras.mixed_precision.experimental.Policy( "mixed_float16", loss_scale="dynamic") tf.keras.mixed_precision.experimental.set_policy(policy) print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 #tf.random.set_seed(config.seed) # Set up config cont' if config.load_weights and config.restore_checkpoint: 
raise ValueError( "`load_weights` and `restore_checkpoint` should not be on at the same time." ) if config.phase2 and not config.restore_checkpoint: raise ValueError( "`phase2` cannot be used without `restore_checkpoint`.") utils.heading("Config:") log_config(config) # Save pretrain configs pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json') if is_main_process(): utils.write_json(config.__dict__, pretrain_config_json) log("Configuration saved in {}".format(pretrain_config_json)) # Set up model model = PretrainingModel(config) # Set up metrics metrics = dict() metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf") metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss") metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy( name="masked_lm_accuracy") metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss") if config.electra_objective: metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy( name="sampled_masked_lm_accuracy") if config.disc_weight > 0: metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss") metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc") metrics["disc_accuracy"] = tf.keras.metrics.Accuracy( name="disc_accuracy") metrics["disc_precision"] = tf.keras.metrics.Accuracy( name="disc_precision") metrics["disc_recall"] = tf.keras.metrics.Accuracy( name="disc_recall") # Set up tensorboard current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") train_log_dir = os.path.join( config.log_dir, current_time, 'train_' + str(get_rank()) + '_of_' + str(get_world_size())) train_summary_writer = tf.summary.create_file_writer(train_log_dir) # Set up dataset dataset = pretrain_utils.get_dataset(config, config.train_batch_size, world_size=get_world_size(), rank=get_rank()) train_iterator = iter(dataset) # Set up optimizer optimizer = create_optimizer(init_lr=config.learning_rate, num_train_steps=config.num_train_steps, num_warmup_steps=config.num_warmup_steps, weight_decay_rate=config.weight_decay_rate, optimizer=config.optimizer, skip_adaptive=config.skip_adaptive, power=config.lr_decay_power, beta_1=config.opt_beta_1, beta_2=config.opt_beta_2, end_lr=config.end_lr) accumulator = GradientAccumulator() if config.amp: optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, "dynamic") # Set up model checkpoint checkpoint = tf.train.Checkpoint(step=tf.Variable(0), phase2=tf.Variable(False), optimizer=optimizer, model=model) manager = tf.train.CheckpointManager( checkpoint, config.checkpoints_dir, max_to_keep=config.keep_checkpoint_max) if config.restore_checkpoint and config.restore_checkpoint != "latest": checkpoint.restore(config.restore_checkpoint) log(" ** Restored model checkpoint from {}".format( config.restore_checkpoint)) elif config.restore_checkpoint and config.restore_checkpoint == "latest" and manager.latest_checkpoint: checkpoint.restore(manager.latest_checkpoint) log(" ** Restored model checkpoint from {}".format( manager.latest_checkpoint)) elif config.load_weights: model.generator(model.generator.dummy_inputs) model.discriminator(model.discriminator.dummy_inputs) model.generator.load_weights( os.path.join(config.weights_dir, 'generator', 'tf_model.h5')) model.discriminator.load_weights( os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5')) else: log(" ** Initializing from scratch.") restore_iterator = bool( config.restore_checkpoint) and config.restore_checkpoint == "latest" # Initialize global step for phase2 if config.phase2 and not 
bool(checkpoint.phase2): optimizer.iterations.assign(0) checkpoint.step.assign(0) checkpoint.phase2.assign(True) restore_iterator = False if bool(checkpoint.phase2): manager = tf.train.CheckpointManager( checkpoint, config.checkpoints_dir, checkpoint_name='ckpt-p2', max_to_keep=config.keep_checkpoint_max) # Set up iterator checkpoint iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator, world_size=tf.Variable( get_world_size()), rank=tf.Variable(get_rank())) iter_manager = tf.train.CheckpointManager( iter_checkpoint, os.path.join(config.checkpoints_dir, 'iter_ckpt_rank_' + '{:02}'.format(get_rank())), checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()), max_to_keep=config.keep_checkpoint_max) if restore_iterator and iter_manager.latest_checkpoint: ckpt_world_size = tf.train.load_variable( iter_manager.latest_checkpoint, 'world_size/.ATTRIBUTES/VARIABLE_VALUE') if ckpt_world_size == get_world_size(): iter_checkpoint.restore(iter_manager.latest_checkpoint) log(" ** Restored iterator checkpoint from {}".format( iter_manager.latest_checkpoint), all_rank=True) utils.heading("Running training") accumulator.reset() train_start, start_step = time.time(), int(checkpoint.step) - 1 local_step = 0 saved_ckpt = False while int(checkpoint.step) <= config.num_train_steps: saved_ckpt = False step = int(checkpoint.step) features = next(train_iterator) iter_start = time.time() # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir) total_loss, eval_fn_inputs = train_one_step( config, model, optimizer, features, accumulator, local_step == 1, take_step=local_step % args.gradient_accumulation_steps == 0) # if step == 300: tf.profiler.experimental.stop() metrics["train_perf"].update_state(config.train_batch_size * get_world_size() / (time.time() - iter_start)) metrics["total_loss"].update_state(values=total_loss) metric_fn(config, metrics, eval_fn_inputs) if (step % args.log_freq == 0) and (local_step % args.gradient_accumulation_steps == 0): log_info_dict = { k: float(v.result().numpy() * 100) if "accuracy" in k else float(v.result().numpy()) for k, v in metrics.items() } dllogger.log(step=(step, ), data=log_info_dict, verbosity=0) log('Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, ' .format(step=step, **log_info_dict, loss_scale=optimizer.loss_scale if config.amp else 1, elapsed=utils.get_readable_time(time.time() - train_start), eta=utils.get_readable_time( (time.time() - train_start) / (step - start_step) * (config.num_train_steps - step))), all_rank=True) with train_summary_writer.as_default(): for key, m in metrics.items(): tf.summary.scalar(key, m.result(), step=step) if int(checkpoint.step) < config.num_train_steps: for m in metrics.values(): m.reset_states() #Print allreduced metrics on the last step if int(checkpoint.step) == config.num_train_steps and ( local_step % args.gradient_accumulation_steps == 0): log_info_dict = { k: float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy" in k else float(hvd.allreduce(v.result()).numpy()) for k, v in metrics.items() } log_info_dict["training_sequences_per_second"] = log_info_dict[ "train_perf"] log_info_dict["final_loss"] = log_info_dict["total_loss"] log_info_dict["e2e_train_time"] = time.time() - e2e_start_time dllogger.log(step=(), data=log_info_dict, verbosity=0) log('<FINAL STEP METRICS> Step:{step:6d}, 
Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'. format(step=step, **log_info_dict), all_rank=False) if local_step % args.gradient_accumulation_steps == 0: checkpoint.step.assign(int(optimizer.iterations)) local_step += 1 if not config.skip_checkpoint and ( local_step % (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0): saved_ckpt = True if is_main_process(): save_path = manager.save(checkpoint_number=step) log(" ** Saved model checkpoint for step {}: {}".format( step, save_path)) iter_save_path = iter_manager.save(checkpoint_number=step) log(" ** Saved iterator checkpoint for step {}: {}".format( step, iter_save_path), all_rank=True) step = (int(checkpoint.step) - 1) dllogger.flush() if not config.skip_checkpoint and not saved_ckpt: if is_main_process(): save_path = manager.save(checkpoint_number=step) log(" ** Saved model checkpoint for step {}: {}".format( step, save_path)) iter_save_path = iter_manager.save(checkpoint_number=step) log(" ** Saved iterator checkpoint for step {}: {}".format( step, iter_save_path), all_rank=True) return args
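The ELECTRA pretraining loop above builds its per-rank input pipeline through pretrain_utils.get_dataset(..., world_size=get_world_size(), rank=get_rank()), whose implementation is not shown here. One common way to realize such a pipeline is to shard the list of TFRecord files by Horovod rank; the sketch below assumes that approach, and the file pattern, shuffle buffer size, and parse-free interleave are illustrative choices rather than the project's actual code:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

def make_sharded_dataset(file_pattern, batch_size):
    # List the TFRecord files and give each Horovod rank its own shard so
    # workers do not read the same examples.
    files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    files = files.shard(num_shards=hvd.size(), index=hvd.rank())
    dataset = files.interleave(
        tf.data.TFRecordDataset,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(10000).repeat().batch(batch_size)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)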
def train(model, loss_fn, Dataset=None, dataset=None, valid_dataset=None, valid_dataset2=None, test_dataset=None, evaluate_fn=None, inference_fn=None, eval_fn=None, write_valid=True, valid_names=None, infer_names=None, infer_debug_names=None, valid_write_fn=None, infer_write_fn=None, valid_suffix='.valid', infer_suffix='.infer', write_streaming=False, optimizer=None, param_groups=None, init_fn=None, sep=','): use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ if Dataset is None: assert dataset logging.info('Dataset', Dataset, 'dataset', dataset, 'valid_dataset', valid_dataset, 'test_dataset', test_dataset, loss_fn) if FLAGS.torch: torch.manual_seed(FLAGS.seed or 0) if torch.cuda.device_count(): torch.cuda.manual_seed(FLAGS.seed or 0) if use_horovod: import horovod.torch as hvd hvd.init() #print('-----------------', hvd, hvd.size()) assert hvd.mpi_threads_supported() assert hvd.size() == comm.Get_size() # hvd.init already done on apps.train.py init torch.cuda.set_device(hvd.local_rank()) # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html else: if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) model.to(device) input_ = FLAGS.train_input inputs = gezi.list_files(input_) inputs.sort() all_inputs = inputs #batch_size = FLAGS.batch_size batch_size = melt.batch_size() num_gpus = melt.num_gpus() #batch_size = max(batch_size, 1) #batch_size_ = batch_size if not FLAGS.batch_sizes else int(FLAGS.batch_sizes.split(',')[-1]) batch_size_ = FLAGS.eval_batch_size or batch_size if dataset is None: if FLAGS.fold is not None: inputs = [ x for x in inputs if not x.endswith('%d.record' % FLAGS.fold) and not x.endswith('%d.tfrecord' % FLAGS.fold) ] # if FLAGS.valid_input: # inputs += [x for x in gezi.list_files(FLAGS.valid_input) if not x.endswith('%d.record' % FLAGS.fold)] logging.info('inputs', len(inputs), inputs[:100]) num_folds = FLAGS.num_folds or len(inputs) + 1 if dataset is None: dataset = Dataset('train') assert len(inputs) > 0 train_dataset = dataset.make_batch(batch_size, inputs, simple_parse=FLAGS.simple_parse) num_examples = dataset.num_examples_per_epoch('train') else: assert FLAGS.torch_only, 'only torch only currently support input dataset not Dataset class type, because we do not have len function there' train_dataset = dataset num_examples = len(train_dataset) num_all_examples = num_examples if valid_dataset is None: valid_inputs = None if FLAGS.valid_input: valid_inputs = gezi.list_files(FLAGS.valid_input) else: if FLAGS.fold is not None: #valid_inputs = [x for x in all_inputs if x not in inputs] if not FLAGS.test_aug: valid_inputs = [ x for x in all_inputs if not 'aug' in x and x not in inputs ] else: valid_inputs = [ x for x in all_inputs if 'aug' in x and x not in inputs ] logging.info('valid_inputs', valid_inputs) num_valid_examples = None if valid_dataset is not None: num_valid_examples = len(valid_dataset) else: if valid_inputs: valid_dataset = dataset.make_batch(batch_size_, valid_inputs, subset='valid', hvd_shard=FLAGS.horovod_eval) valid_dataset2 = dataset.make_batch(batch_size, valid_inputs, subset='valid', repeat=True, initializable=False, hvd_shard=False) valid_dataset2_iter = iter(valid_dataset2) else: valid_datsset = None valid_dataset2 = None if num_examples: if FLAGS.fold is not None: num_examples = int(num_examples * (num_folds - 1) / num_folds) num_steps_per_epoch = -(-num_examples // batch_size) else: num_steps_per_epoch = None logging.info('num_train_examples:', num_examples) if use_horovod and num_examples: num_steps_per_epoch 
= -(-num_examples // (batch_size * hvd.size())) if num_valid_examples is None: if FLAGS.valid_input: num_valid_examples = dataset.num_examples_per_epoch('valid') num_valid_steps_per_epoch = -(-num_valid_examples // batch_size_ ) if num_valid_examples else None else: if FLAGS.fold is not None: if num_examples: num_valid_examples = int(num_all_examples * (1 / num_folds)) num_valid_steps_per_epoch = -(-num_valid_examples // batch_size_) else: num_valid_steps_per_epoch = None if use_horovod and FLAGS.horovod_eval and num_valid_examples: num_valid_steps_per_epoch = -(-num_valid_examples // (batch_size_ * hvd.size())) logging.info('num_valid_examples:', num_valid_examples) if test_dataset is None: if FLAGS.test_input: test_inputs = gezi.list_files(FLAGS.test_input) #test_inputs = [x for x in test_inputs if not 'aug' in x] logging.info('test_inputs', test_inputs) else: test_inputs = None num_test_examples = None if test_dataset is not None: num_test_examples = len(test_dataset) else: if test_inputs: test_dataset = dataset.make_batch(batch_size_, test_inputs, subset='test') num_test_examples = dataset.num_examples_per_epoch('test') else: test_dataset = None num_test_steps_per_epoch = -(-num_test_examples // batch_size_) if num_test_examples else None if use_horovod and FLAGS.horovod_eval and num_test_examples: num_test_steps_per_epoch = -(-num_test_examples // (batch_size_ * hvd.size())) logging.info('num_test_examples:', num_test_examples) summary = tf.contrib.summary # writer = summary.create_file_writer(FLAGS.log_dir + '/epoch') # writer_train = summary.create_file_writer(FLAGS.log_dir + '/train') # writer_valid = summary.create_file_writer(FLAGS.log_dir + '/valid') writer = summary.create_file_writer(FLAGS.log_dir) writer_train = summary.create_file_writer(FLAGS.log_dir) writer_valid = summary.create_file_writer(FLAGS.log_dir) global_step = tf.train.get_or_create_global_step() ## RuntimeError: tf.summary.FileWriter is not compatible with eager execution. Use tf.contrib.summary instead. 
#logger = gezi.SummaryWriter(FLAGS.log_dir) learning_rate = tfe.Variable(FLAGS.learning_rate, name="learning_rate") tf.add_to_collection('learning_rate', learning_rate) learning_rate_weight = tf.get_collection('learning_rate_weight')[-1] try: learning_rate_weights = tf.get_collection('learning_rate_weights')[-1] except Exception: learning_rate_weights = None # ckpt dir save models one per epoch ckpt_dir = os.path.join(FLAGS.model_dir, 'ckpt') os.system('mkdir -p %s' % ckpt_dir) # HACK ckpt dir is actually save mini epoch like when you set save_interval_epochs=0.1, this is usefull when you training large dataset ckpt_dir2 = os.path.join(FLAGS.model_dir, 'ckpt2') os.system('mkdir -p %s' % ckpt_dir2) #TODO FIXME now I just changed tf code so to not by default save only latest 5 # refer to https://github.com/tensorflow/tensorflow/issues/22036 # manager = tf.contrib.checkpoint.CheckpointManager( # checkpoint, directory=ckpt_dir, max_to_keep=5) # latest_checkpoint = manager.latest_checkpoint latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir) if latest_checkpoint: logging.info('Latest checkpoint:', latest_checkpoint) else: latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir2) logging.info('Latest checkpoint:', latest_checkpoint) if os.path.exists(FLAGS.model_dir + '.index'): latest_checkpoint = FLAGS.model_dir if 'test' in FLAGS.work_mode or 'valid' in FLAGS.work_mode: #assert not os.path.isdir(FLAGS.model_dir), FLAGS.model_dir latest_checkpoint = FLAGS.model_dir #assert os.path.exists(latest_checkpoint) and os.path.isfile(latest_checkpoint) checkpoint_prefix = os.path.join(ckpt_dir, 'ckpt') checkpoint_prefix2 = os.path.join(ckpt_dir2, 'ckpt') if not FLAGS.torch: try: optimizer = optimizer or melt.get_optimizer( FLAGS.optimizer)(learning_rate) except Exception: logging.warning( f'Fail to using {FLAGS.optimizer} use adam instead') optimizer = melt.get_optimizer('adam')(learning_rate) # TODO... 
if learning_rate_weights is None: checkpoint = tf.train.Checkpoint( learning_rate=learning_rate, learning_rate_weight=learning_rate_weight, model=model, optimizer=optimizer, global_step=global_step) else: checkpoint = tf.train.Checkpoint( learning_rate=learning_rate, learning_rate_weight=learning_rate_weight, learning_rate_weights=learning_rate_weights, model=model, optimizer=optimizer, global_step=global_step) checkpoint.restore(latest_checkpoint) checkpoint2 = copy.deepcopy(checkpoint) start_epoch = int( latest_checkpoint.split('-') [-1]) if latest_checkpoint and 'ckpt' in latest_checkpoint else 0 start_step = 0 # TODO else: # TODO torch with learning rate adjust # https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py # TODO full support for pytorch now not work if optimizer is None: import lele is_dynamic_opt = True if FLAGS.optimizer == 'noam': optimizer_ = torch.optim.Adamax(model.parameters(), lr=0) if use_horovod: optimizer_ = hvd.DistributedOptimizer(optimizer_) optimizer = lele.training.optimizers.NoamOpt( 128, 2, 4000, optimizer_) elif FLAGS.optimizer == 'bert': num_train_steps = int( num_steps_per_epoch * (FLAGS.num_decay_epochs or FLAGS.num_epochs)) if FLAGS.warmup_steps and use_horovod: FLAGS.warmup_steps = max( int(FLAGS.warmup_steps / hvd.size()), 1) num_warmup_steps = FLAGS.warmup_steps or int( num_steps_per_epoch * FLAGS.warmup_epochs) or int( num_train_steps * FLAGS.warmup_proportion) logging.info('num_train_steps', num_train_steps, 'num_warmup_steps', num_warmup_steps, 'warmup_proportion', FLAGS.warmup_proportion) optimizer_ = torch.optim.Adamax(model.parameters(), lr=0) if use_horovod: optimizer_ = hvd.DistributedOptimizer(optimizer_) optimizer = lele.training.optimizers.BertOpt( FLAGS.learning_rate, FLAGS.min_learning_rate, num_train_steps, num_warmup_steps, optimizer_) else: is_dynamic_opt = False optimizer = torch.optim.Adamax( param_groups if param_groups else model.parameters(), lr=FLAGS.learning_rate) if use_horovod: optimizer = hvd.DistributedOptimizer(optimizer) start_epoch = 0 latest_path = latest_checkpoint + '.pyt' if latest_checkpoint else os.path.join( FLAGS.model_dir, 'latest.pyt') if not os.path.exists(latest_path): latest_path = os.path.join(FLAGS.model_dir, 'latest.pyt') if os.path.exists(latest_path): logging.info('loading torch model from', latest_path) checkpoint = torch.load(latest_path) if not FLAGS.torch_finetune: start_epoch = checkpoint['epoch'] step = checkpoint['step'] global_step.assign(step + 1) load_torch_model(model, latest_path) if FLAGS.torch_load_optimizer: optimizer.load_state_dict(checkpoint['optimizer']) # TODO by this way restart can not change learning rate..
if learning_rate_weights is None: checkpoint = tf.train.Checkpoint( learning_rate=learning_rate, learning_rate_weight=learning_rate_weight, global_step=global_step) else: checkpoint = tf.train.Checkpoint( learning_rate=learning_rate, learning_rate_weight=learning_rate_weight, learning_rate_weights=learning_rate_weights, global_step=global_step) try: checkpoint.restore(latest_checkpoint) checkpoint2 = copy.deepcopy(checkpoint) except Exception: pass if FLAGS.torch and is_dynamic_opt: optimizer._step = global_step.numpy() #model.load_weights(os.path.join(ckpt_dir, 'ckpt-1')) #model.save('./weight3.hd5') logging.info('optimizer:', optimizer) if FLAGS.torch_lr: learning_rate.assign(optimizer.rate(1)) if FLAGS.torch: learning_rate.assign(optimizer.param_groups[0]['lr']) logging.info('learning rate got from pytorch latest.py as', learning_rate.numpy()) learning_rate.assign(learning_rate * FLAGS.learning_rate_start_factor) if learning_rate_weights is not None: learning_rate_weights.assign(learning_rate_weights * FLAGS.learning_rate_start_factor) # TODO currently not support 0.1 epoch.. like this num_epochs = FLAGS.num_epochs if FLAGS.num_epochs != 0 else 1024 will_valid = valid_dataset and not FLAGS.work_mode == 'test' and not 'SHOW' in os.environ and not 'QUICK' in os.environ if global_step.numpy() == 0: will_valid = False if gezi.get_env('EVFIRST') == '1': will_valid = True if gezi.get_env('EVFIRST') == '0': will_valid = False if will_valid: logging.info('----------valid') if hasattr(model, 'eval'): model.eval() names = None if evaluate_fn is not None: vals, names = evaluate_fn(model, valid_dataset, tf.train.latest_checkpoint(ckpt_dir), num_valid_steps_per_epoch) elif eval_fn: model_path = None if not write_valid else latest_checkpoint names = valid_names if valid_names is not None else [ infer_names[0] ] + [x + '_y' for x in infer_names[1:] ] + infer_names[1:] if infer_names else None logging.info('model_path:', model_path, 'model_dir:', FLAGS.model_dir) vals, names = evaluate(model, valid_dataset, eval_fn, model_path, names, valid_write_fn, write_streaming, num_valid_steps_per_epoch, num_valid_examples, suffix=valid_suffix, sep=sep) if names: logging.info2( 'epoch:%.2f/%d step:%d' % (global_step.numpy() / num_steps_per_epoch, num_epochs, global_step.numpy()), ['%s:%.4f' % (name, val) for name, val in zip(names, vals)]) if FLAGS.work_mode == 'valid' or gezi.get_env('METRIC') == '1': exit(0) if 'test' in FLAGS.work_mode or gezi.get_env( 'TEST') == '1' or gezi.get_env('INFER') == '1': logging.info('--------test/inference') if test_dataset: if hasattr(model, eval): model.eval() if inference_fn is None: # model_path = FLAGS.model_dir + '.pyt' if not latest_checkpoint else latest_checkpoint # logging.info('model_path', model_path) assert latest_checkpoint inference(model, test_dataset, latest_checkpoint, infer_names, infer_debug_names, infer_write_fn, write_streaming, num_test_steps_per_epoch, num_test_examples, suffix=infer_suffix) else: inference_fn(model, test_dataset, tf.train.latest_checkpoint(ckpt_dir), num_test_steps_per_epoch) exit(0) if 'SHOW' in os.environ: num_epochs = start_epoch + 1 class PytObj(object): def __init__(self, x): self.x = x def numpy(self): return self.x class PytMean(object): def __init__(self): self._val = 0. 
self.count = 0 self.is_call = True def clear(self): self._val = 0 self.count = 0 def __call__(self, val): if not self.is_call: self.clear() self.is_call = True self._val += val.item() self.count += 1 def result(self): if self.is_call: self.is_call = False if not self.count: val = 0 else: val = self._val / self.count # TODO just for compact with tf .. return PytObj(val) Mean = tfe.metrics.Mean if not FLAGS.torch else PytMean num_insts = 0 if FLAGS.learning_rate_decay_factor > 0: #assert FLAGS.learning_rate_values is None, 'use exponential_decay or piecewise_constant?' #NOTICE if you do finetune or other things which might change batch_size then you'd better direclty set num_steps_per_decay #since global step / decay_steps will not be correct epoch as num_steps per epoch changed #so if if you change batch set you have to reset global step as fixed step assert FLAGS.num_steps_per_decay or ( FLAGS.num_epochs_per_decay and num_steps_per_epoch ), 'must set num_steps_per_epoch or num_epochs_per_decay and num_steps_per_epoch' decay_steps = FLAGS.num_steps_per_decay or int( num_steps_per_epoch * FLAGS.num_epochs_per_decay) decay_start_step = FLAGS.decay_start_step or int( num_steps_per_epoch * FLAGS.decay_start_epoch) # decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) logging.info( 'learning_rate_decay_factor:{} decay_epochs:{} decay_steps:{} decay_start_epoch:{} decay_start_step:{}' .format(FLAGS.learning_rate_decay_factor, FLAGS.num_epochs_per_decay, decay_steps, FLAGS.decay_start_epoch, decay_start_step)) #-------------------------start training if hasattr(model, 'train'): model.train() timer = gezi.Timer() loss_avg = Mean() valid_loss_avg = Mean() num_epochs = num_epochs if num_epochs else 0 loops = min(num_epochs, 1) if FLAGS.torch_only else 1 for _ in range(loops): for i, (x, y) in enumerate(train_dataset): #print('-------------------', i) print(len(x['index']), len(x['value']), len(x['id'])) print(x['index'][0].size(), x['index'][1].size(), y.size()) print(x['value'][0].size(), x['value'][1].size(), y.size()) print(x['id'][0], x['id'][1], y.size()) if i == 3: exit(0) continue if FLAGS.torch: x, y = to_torch(x, y) if is_dynamic_opt: learning_rate.assign(optimizer.rate()) def loss_fn_(x, y): if not FLAGS.torch and 'training' in inspect.getargspec( model.call).args: y_ = model(x, training=True) else: y_ = model(x) if not FLAGS.torch: return loss_fn(y, y_) else: return loss_fn(y_, y) if not FLAGS.torch: loss, grads = melt.eager.grad(model, x, y, loss_fn) grads, _ = tf.clip_by_global_norm(grads, FLAGS.clip_gradients) #optimizer.apply_gradients(zip(grads, model.variables)) optimizer.apply_gradients(zip(grads, model.trainable_variables)) # https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_eager.py # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. # Note: broadcast should be done after the first gradient step to ensure optimizer # initialization. 
# TODO check eager mode if use_horovod and epoch == start_epoch and i == 0: hvd.broadcast_variables(model.variables, root_rank=0) hvd.broadcast_variables(optimizier.variables(), root_rank=0) else: optimizer.zero_grad() loss = loss_fn_(x, y) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), FLAGS.clip_gradients) optimizer.step() global_step.assign_add(1) loss_avg(loss) ## https://discuss.pytorch.org/t/calling-loss-backward-reduce-memory-usage/2735 # if FLAGS.torch: # del loss batch_size_ = list( x.values())[0].shape[FLAGS.batch_size_dim] if type(x) == type( {}) else x.shape[FLAGS.batch_size_dim] num_insts += int(batch_size_) if global_step.numpy() % FLAGS.interval_steps == 0: #checkpoint.save(checkpoint_prefix) elapsed = timer.elapsed() steps_per_second = FLAGS.interval_steps / elapsed instances_per_second = num_insts / elapsed num_insts = 0 if num_steps_per_epoch is None: epoch_time_info = '' else: hours_per_epoch = num_steps_per_epoch / FLAGS.interval_steps * elapsed / 3600 epoch_time_info = '1epoch:[{:.2f}h]'.format( hours_per_epoch) if valid_dataset2: # try: # x, y = next(iter(valid_dataset2)) # except Exception: # # TODO FIXME how.. iterate stop restart.., here hack for my iterator see projects/lm/dataset # x, y = next(iter(valid_dataset2)) ## valid dataset2 is repeated ## NOTICE will always the first batch ... as below #x, y = next(iter(valid_dataset2)) x, y = next(valid_dataset2_iter) #print(x['id'][0]) if FLAGS.torch: x, y = to_torch(x, y) if hasattr(model, 'eval'): model.eval() valid_loss = loss_fn_(x, y) valid_loss = valid_loss.numpy( ) if not FLAGS.torch else valid_loss.item() if hasattr(model, 'train'): model.train() if not use_horovod or hvd.rank() == 0: # 'train_loss:[%.4f]' % loss_avg.result().numpy(), # 'valid_loss:[%.4f]' % valid_loss_avg.result().numpy() logging.info2( 'epoch:%.2f/%d' % ((global_step.numpy() / num_steps_per_epoch), num_epochs), 'step:%d' % global_step.numpy(), 'elapsed:[%.2f]' % elapsed, 'batch_size:[%d]' % batch_size_, 'gpus:[%d]' % num_gpus, 'batches/s:[%.2f]' % steps_per_second, 'insts/s:[%d]' % instances_per_second, '%s' % epoch_time_info, 'lr:[%.6f]' % learning_rate.numpy(), 'train_loss:[%.4f]' % loss_avg.result().numpy(), 'valid_loss:[%.4f]' % valid_loss) if global_step.numpy( ) % FLAGS.valid_interval_steps == 0: with writer_valid.as_default( ), summary.always_record_summaries(): summary.scalar('loss/valid', valid_loss) writer_valid.flush() else: if not use_horovod or hvd.rank() == 0: #'train_loss:[%.4f]' % loss_avg.result().numpy() logging.info2( 'epoch:%.2f/%d' % ((epoch + i / num_steps_per_epoch), num_epochs), 'step:%d' % global_step.numpy(), 'elapsed:[%.2f]' % elapsed, 'batch_size:[%d]' % batch_size_, 'gpus:[%d]' % num_gpus, 'batches/s:[%.2f]' % steps_per_second, 'insts/s:[%d]' % instances_per_second, '%s' % epoch_time_info, 'lr:[%.6f]' % learning_rate.numpy(), 'train_loss:[%.4f]' % loss_avg.result().numpy()) if not use_horovod or hvd.rank() == 0: if global_step.numpy() % FLAGS.valid_interval_steps == 0: with writer_train.as_default( ), summary.always_record_summaries(): summary.scalar('loss/train_avg', loss_avg.result().numpy()) summary.scalar('learning_rate', learning_rate.numpy()) summary.scalar('other/batch_size', batch_size_) summary.scalar('other/epoch', melt.epoch()) summary.scalar('perf/steps_per_second', steps_per_second) summary.scalar('perf/instances_per_second', instances_per_second) writer_train.flush() if valid_dataset and FLAGS.metric_eval_interval_steps and global_step.numpy( ) and global_step.numpy() % 
FLAGS.metric_eval_interval_steps == 0: if hasattr(model, eval): model.eval() vals, names = None, None if evaluate_fn is not None: vals, names = evaluate_fn(model, valid_dataset, None, num_valid_steps_per_epoch) elif eval_fn: names = valid_names if valid_names is not None else [ infer_names[0] ] + [x + '_y' for x in infer_names[1:] ] + infer_names[1:] if infer_names else None vals, names = evaluate(model, valid_dataset, eval_fn, None, names, valid_write_fn, write_streaming, num_valid_steps_per_epoch, num_valid_examples, sep=sep) if not use_horovod or hvd.rank() == 0: if vals and names: with writer_valid.as_default( ), summary.always_record_summaries(): for name, val in zip(names, vals): summary.scalar(f'step_eval/{name}', val) writer_valid.flush() if FLAGS.torch: if not FLAGS.torch_lr: # control learning rate by tensorflow learning rate for param_group in optimizer.param_groups: # important learning rate decay param_group['lr'] = learning_rate.numpy() if hasattr(model, 'train'): model.train() if not use_horovod or hvd.rank() == 0: if names and vals: logging.info2( 'epoch:%.2f/%d' % ((global_step.numpy() / num_steps_per_epoch), num_epochs), 'valid_step:%d' % global_step.numpy(), 'valid_metrics', [ '%s:%.5f' % (name, val) for name, val in zip(names, vals) ]) if not use_horovod or hvd.rank() == 0: # TODO save ok ? if global_step.numpy() % FLAGS.save_interval_steps == 0: if FLAGS.torch: state = { 'epoch': int(global_step.numpy() / num_steps_per_epoch), 'step': global_step.numpy(), 'state_dict': model.state_dict() if not hasattr(model, 'module') else model.module.state_dict(), 'optimizer': optimizer.state_dict(), } torch.save(state, os.path.join(FLAGS.model_dir, 'latest.pyt')) # TODO fixme why if both checpoint2 and chekpoint used... not ok.. if FLAGS.save_interval_epochs and global_step.numpy() % int( num_steps_per_epoch * FLAGS.save_interval_epochs) == 0: checkpoint2.save(checkpoint_prefix2) if FLAGS.torch: state = { 'epoch': int(global_step.numpy() / num_steps_per_epoch), 'step': global_step.numpy(), 'state_dict': model.state_dict() if not hasattr(model, 'module') else model.module.state_dict(), 'optimizer': optimizer.state_dict(), } torch.save( state, tf.train.latest_checkpoint(ckpt_dir2) + '.pyt') if FLAGS.learning_rate_decay_factor > 0: if global_step.numpy( ) >= decay_start_step and global_step.numpy( ) % decay_steps == 0: lr = max( learning_rate.numpy() * FLAGS.learning_rate_decay_factor, FLAGS.min_learning_rate) if lr < learning_rate.numpy(): learning_rate.assign(lr) if FLAGS.torch: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate.numpy() if i == 0: try: if not FLAGS.torch: logging.info(model.summary()) # #tf.keras.utils.plot_model(model, to_file='/home/gezi/model.png', show_shapes=False, show_layer_names=True, rankdir='TB') # import keras # keras.utils.plot_model(model, to_file='/home/gezi/model.png', show_shapes=False, show_layer_names=True, rankdir='LR', expand_nested=True, dpi=96) else: logging.info(model) except Exception: traceback.print_exc() logging.info( 'Fail to do model.summary() may be you have layer define in init but not used in call' ) if 'SHOW' in os.environ: exit(0) if valid_dataset and global_step.numpy() % int( num_steps_per_epoch * FLAGS.valid_interval_epochs) == 0: if hasattr(model, 'eval'): model.eval() vals, names = None, None if evaluate_fn is not None: vals, names = evaluate_fn( model, valid_dataset, tf.train.latest_checkpoint(ckpt_dir), num_valid_steps_per_epoch) elif eval_fn: model_path = None if not write_valid else 
tf.train.latest_checkpoint( ckpt_dir) print('---------metric evaluate step', global_step.numpy(), 'model_path:', model_path) names = valid_names if valid_names is not None else [ infer_names[0] ] + [x + '_y' for x in infer_names[1:] ] + infer_names[1:] if infer_names else None vals, names = evaluate(model, valid_dataset, eval_fn, model_path, names, valid_write_fn, write_streaming, num_valid_steps_per_epoch, num_valid_examples, suffix=valid_suffix, sep=sep) if not use_horovod or hvd.rank() == 0: if vals and names: logging.info2( 'epoch:%.2f/%d' % (global_step.numpy() / num_steps_per_epoch, num_epochs), 'step:%d' % global_step.numpy(), 'valid_metrics', [ '%s:%.5f' % (name, val) for name, val in zip(names, vals) ]) if not use_horovod or hvd.rank() == 0: with writer.as_default(), summary.always_record_summaries( ): temp = global_step.value() global_step.assign( int(global_step.numpy() / int(num_steps_per_epoch * FLAGS.valid_interval_epochs))) if valid_dataset: if hasattr(model, 'eval'): model.eval() if vals and names: for name, val in zip(names, vals): summary.scalar(f'eval/{name}', val) writer.flush() global_step.assign(temp) if test_dataset and global_step.numpy() % int( num_steps_per_epoch * FLAGS.inference_interval_epochs) == 0: if hasattr(model, 'eval'): model.eval() if inference_fn is None: inference(model, test_dataset, tf.train.latest_checkpoint(ckpt_dir), infer_names, infer_debug_names, infer_write_fn, write_streaming, num_test_steps_per_epoch, num_test_examples, suffix=infer_suffix, sep=sep) else: inference_fn(model, test_dataset, tf.train.latest_checkpoint(ckpt_dir), num_test_steps_per_epoch) if num_epochs and (global_step.numpy() % num_steps_per_epoch) == 0 and int( global_step.numpy() / num_steps_per_epoch) == num_epochs: logging.info(f'Finshed training of {num_epochs} epochs') exit(0)
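The PyTorch branch of the train() function above wraps its Adamax optimizer in hvd.DistributedOptimizer and synchronizes state from rank 0 before the first update. Stripped of the melt/gezi plumbing, the core Horovod-for-PyTorch recipe looks roughly like this; the toy Linear model, random data, and fixed step count are stand-ins, not the original pipeline:

import torch
import horovod.torch as hvd

hvd.init()
if torch.cuda.is_available():
    # One GPU per process, selected by local rank.
    torch.cuda.set_device(hvd.local_rank())

model = torch.nn.Linear(10, 1)                        # toy stand-in for the real model
optimizer = torch.optim.Adamax(model.parameters(), lr=0.001 * hvd.size())

# Make every worker start from rank 0's weights and optimizer state, then wrap
# the optimizer so gradients are allreduced across ranks on each step.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())

for step in range(10):
    x = torch.randn(32, 10)
    y = torch.randn(32, 1)
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()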
def main(_): hvd.init() os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.rank()) if hvd.rank() == 0: tf.logging.set_verbosity(tf.logging.INFO) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file) label_list = tokenizer.get_labels() model_dir = FLAGS.output_dir if hvd.rank() == 0 else None run_config = tf.estimator.RunConfig( model_dir=model_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, log_step_count_steps=10) train_examples = None num_train_steps = 0 num_warmup_steps = 0 if FLAGS.do_train: train_examples = read_datagrand_examples(input_file=FLAGS.data_dir + FLAGS.train_file, tokenizer=tokenizer, has_label=True) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_epochs_per_eval) num_train_steps = num_train_steps // hvd.size() num_warmup_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_epochs_warmup) num_warmup_steps = num_warmup_steps // hvd.size() # different gpu has different shuffled data rng = random.Random(hvd.rank()) rng.shuffle(train_examples) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, model_dir=model_dir, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train_shard{}.tf_record".format(hvd.rank())) file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) # prepare eval dataset if hvd.rank() == 0: eval_gold_examples = read_datagrand_examples( input_file=FLAGS.data_dir + FLAGS.eval_gold_file, tokenizer=tokenizer, has_label=True) num_actual_eval_examples = len(eval_gold_examples) eval_gold_file = os.path.join(FLAGS.output_dir, "eval_gold.tf_record") file_based_convert_examples_to_features(eval_gold_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_gold_file) eval_examples = read_datagrand_examples(input_file=FLAGS.data_dir + FLAGS.eval_file, tokenizer=tokenizer, has_label=False) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, has_label=False) bcast_hook = hvd.BroadcastGlobalVariablesHook(0) f1_history = [] max_train_iter = 3 train_iter = 0 while train_iter < max_train_iter: # early stopping condition: # f1 score doesn't increase within 5 iterations estimator.train(input_fn=train_input_fn, steps=num_train_steps, hooks=[bcast_hook]) train_iter += 1 # evaluation if hvd.rank() == 0: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. 
eval_steps = None eval_input_fn = file_based_input_fn_builder( input_file=eval_gold_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) results = estimator.predict(input_fn=eval_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "eval_predictions.txt") from_predictions_to_file(eval_examples, results, output_predict_file, tokenizer) metrics, metrics_val = compute_metrics( output_predict_file, FLAGS.data_dir + FLAGS.eval_gold_file) tf.logging.info("eval predictions result: ") tf.logging.info("{}: {:.2f}".format(metrics, metrics_val)) f1_history.append(metrics_val) if len(f1_history) >= 5 and f1_history[-1] <= f1_history[-5]: tf.logging.info("***** training converges ******") break
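The fine-tuning driver above alternates estimator.train() on every rank with an evaluation pass on rank 0, keeps a history of the F1 score, and stops once the metric has not improved over the last few rounds. A condensed sketch of that control flow; train_one_round and evaluate_f1 are placeholders for the estimator.train and estimator.evaluate / compute_metrics calls, and, as in the script, only rank 0 checks the stopping condition:

import horovod.tensorflow as hvd

hvd.init()

def fit_with_early_stopping(train_one_round, evaluate_f1, max_rounds=20, patience=5):
    history = []
    for _ in range(max_rounds):
        train_one_round()        # every rank runs the same number of training steps
        if hvd.rank() != 0:
            continue             # only rank 0 evaluates, mirroring the script above
        history.append(evaluate_f1())
        if len(history) >= patience and history[-1] <= history[-patience]:
            break                # metric stopped improving on rank 0
    return history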
def main(argv): # read args from command `mpirun ... python hvd_run_mnist_training.py inputFilePath outputFilePath` inputFilePath = argv[1] #outputFilePath = argv[2] exportModelDir = argv[2] # Horovod: initialize Horovod. hvd.init() # Load training and eval data table = load_pyarrow_table(inputFilePath) # later I will change code to avoid using `to_pandas` pdf = table.to_pandas() train_data = np.reshape( np.array(np.concatenate(pdf['features']), dtype=np.float32), (-1, 784)) train_labels = np.array(pdf['label'], dtype=np.float32) # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. model_dir = './mnist_convnet_model_' + str(random.randint(0, 2<<30))\ if hvd.rank() == 0 else None # Create the Estimator mnist_classifier = tf.estimator.Estimator( model_fn=cnn_model_fn, model_dir=model_dir, config=tf.estimator.RunConfig(session_config=config)) # Set up logging for predictions # Log the values in the "Softmax" tensor with label "probabilities" tensors_to_log = {"probabilities": "softmax_tensor"} logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=500) # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from # rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights or # restored from a checkpoint. bcast_hook = hvd.BroadcastGlobalVariablesHook(0) # Train the model train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True) # Horovod: adjust number of steps based on number of GPUs. mnist_classifier.train(input_fn=train_input_fn, steps=5 // hvd.size(), hooks=[logging_hook, bcast_hook]) """ feature_x = tf.feature_column.numeric_column("x", [784]) feature_columns = [feature_x] feature_spec = tf.feature_column.make_parse_example_spec(feature_columns) serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec) """ """ def serving_input_receiver_fn(): serialized_tf_example = tf.placeholder(dtype=tf.string, shape=[None], name='input_tensors') receiver_tensors = {'inputs': serialized_tf_example} feature_spec = {'x': tf.FixedLenFeature([784],tf.float32)} features = tf.parse_example(serialized_tf_example, feature_spec) return tf.estimator.export.ServingInputReceiver(features, receiver_tensors) """ """ with open(outputFilePath, "w") as f: varlist = mnist_classifier.get_variable_names() vars = {} for var in varlist: vars[var] = mnist_classifier.get_variable_value(var).tolist() # the result is large (135MB). only store keys to output file for now. f.write(str(varlist)) """ def serving_input_receiver_fn(): # The outer dimension (None) allows us to batch up inputs for # efficiency. However, it also means that if we want a prediction # for a single instance, we'll need to wrap it in an outer list. inputs = {"x": tf.placeholder(shape=[None, 784], dtype=tf.float32)} return tf.estimator.export.ServingInputReceiver(inputs, inputs) mnist_classifier.export_savedmodel(exportModelDir, serving_input_receiver_fn)
def main(unused_argv): # Horovod: initialize Horovod. hvd.init() filename_train = get_filenames(True, data_dir) filename_test = get_filenames(False, data_dir) # Load training and eval data #mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank()) #train_data = mnist.train.images # Returns np.array #train_labels = np.asarray(mnist.train.labels, dtype=np.int32) #eval_data = mnist.test.images # Returns np.array #eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. model_dir1 = model_dir if hvd.rank() == 0 else None # Create the Estimator classifier = tf.estimator.Estimator( model_fn=cnn_model_fn, model_dir=model_dir1, config=tf.estimator.RunConfig(session_config=config)) # Set up logging for predictions # Log the values in the "Softmax" tensor with label "probabilities" tensors_to_log = {"probabilities": "softmax_tensor"} logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=500) # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from # rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights or # restored from a checkpoint. bcast_hook = hvd.BroadcastGlobalVariablesHook(0) input_function = input_fn # Train the model def input_fn_train(): return input_function(is_training=True, data_dir=data_dir, batch_size=batch_size, filenames=filename_train, num_epochs=epochs_between_evals) # Horovod: adjust number of steps based on number of GPUs. classifier.train(input_fn=input_fn_train, steps=20000 // hvd.size(), hooks=[logging_hook, bcast_hook]) # Evaluate the model and print results def input_fn_eval(): return input_function(is_training=False, data_dir=flags_obj.data_dir, batch_size=batch_size, filenames=filename_test, num_epochs=1) eval_results = classifier.evaluate(input_fn=input_fn_eval) print(eval_results)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) config = tf.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if len(input_files) < hvd.size(): raise ValueError("Input Files must be sharded") if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.7 if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. # When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.report_loss: global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, 1, -1 if not FLAGS.horovod else hvd.rank())) training_hooks = [] if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size( ) training_hooks.append(_LogSessionRunHook(global_batch_size, 100)) if FLAGS.horovod: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.train_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", 
FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.eval_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
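Two small details of the pretraining script above are easy to miss: mixed precision is switched on through the TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE environment variable rather than an optimizer wrapper, and training refuses to start unless there is at least one input shard per Horovod process. Isolated below, with an illustrative glob pattern standing in for --input_file:

import os
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Equivalent of --use_fp16: let TF's graph rewriter run eligible ops in float16.
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

input_files = tf.gfile.Glob("/data/pretrain/part-*")   # illustrative path
# Each worker reads a disjoint subset of files, so there must be at least
# hvd.size() of them.
if len(input_files) < hvd.size():
    raise ValueError("Input Files must be sharded")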
def main(_): # Horovod: initialize Horovod. hvd.init() os.environ['KMP_SETTINGS'] = str(1) os.environ['KMP_BLOCKTIME'] = str(0) os.environ['OMP_NUM_THREADS'] = str(threads) os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0' config = tf.ConfigProto() config.intra_op_parallelism_threads = threads config.inter_op_parallelism_threads = pools config.gpu_options.visible_device_list = str(hvd.local_rank()) tf.enable_eager_execution(config=config) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.GlobalAveragePooling2D(), tf.keras.layers.Dense(10) ]) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data(path=os.path.join(args.datadir, 'mnist.npz')) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))) dataset = dataset.shuffle(1000).batch(args.batch_size) checkpoint_dir = os.path.join(args.modeldir, 'checkpoints') step_counter = tf.train.get_or_create_global_step() checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt, step_counter=step_counter) # Horovod: adjust number of steps based on number of GPUs. for (batch, (images, labels)) in enumerate(dataset.take(2000 // hvd.size())): with tf.GradientTape() as tape: logits = mnist_model(images, training=True) loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits) # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. if batch == 0: hvd.broadcast_variables(mnist_model.variables, root_rank=0) # Horovod: add Horovod Distributed GradientTape. tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(loss_value, mnist_model.variables) opt.apply_gradients(zip(grads, mnist_model.variables), global_step=tf.train.get_or_create_global_step()) if batch % 10 == 0 and hvd.local_rank() == 0: print('Step #%d\tLoss: %.6f' % (batch, loss_value)) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting it. if hvd.rank() == 0: checkpoint.save(checkpoint_dir)
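The eager MNIST example above pins each process to a fixed CPU budget through the KMP/OMP environment variables and the intra/inter-op thread pools (threads and pools are defined elsewhere in that script). One reasonable way to derive those numbers when several Horovod processes share a host is sketched below; dividing the core count by hvd.local_size() and the 2-thread inter-op pool are assumptions, not values taken from the original:

import os
import multiprocessing
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Split the host's cores evenly across the Horovod processes on this machine.
threads = max(1, multiprocessing.cpu_count() // hvd.local_size())
os.environ['OMP_NUM_THREADS'] = str(threads)
os.environ['KMP_BLOCKTIME'] = '0'
os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'

config = tf.ConfigProto()
config.intra_op_parallelism_threads = threads
config.inter_op_parallelism_threads = 2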
def main(): # Parse essential args parser = argparse.ArgumentParser() parser.add_argument("--data_dir", required=True, help="Location of data files (model weights, etc).") parser.add_argument("--model_name", required=True, help="The name of the model being fine-tuned.") parser.add_argument("--pretrain_tfrecords", type=str) parser.add_argument("--seed", type=int) parser.add_argument("--num_train_steps", type=int) parser.add_argument("--num_warmup_steps", type=int) parser.add_argument("--learning_rate", type=float) parser.add_argument("--train_batch_size", type=int) parser.add_argument("--max_seq_length", type=int) parser.add_argument("--mask_prob", type=float) parser.add_argument("--disc_weight", type=float) parser.add_argument("--generator_hidden_size", type=float) parser.add_argument("--save_checkpoints_steps", type=int) parser.add_argument("--keep_checkpoint_max", type=int) parser.add_argument("--restore_checkpoint", action='store_true') parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb") args = parser.parse_args() config = PretrainingConfig(**args.__dict__) # Set up tensorflow hvd.init() gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') tf.config.optimizer.set_jit(config.xla) tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": config.amp}) tf.random.set_seed(config.seed) # Set up config if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug and config.do_train: utils.rmkdir(config.model_dir) utils.heading("Config:") log_config(config) # Save pretrain configs pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json') if is_main_process(): utils.write_json(config.__dict__, pretrain_config_json) log("Configuration saved in {}".format(pretrain_config_json)) # Set up model model = PretrainingModel(config) # Set up metrics perf_metrics = dict() perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf") eval_metrics = dict() eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss") eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy( name="masked_lm_accuracy") eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean( name="masked_lm_loss") if config.electra_objective: eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy( name="sampled_masked_lm_accuracy") if config.disc_weight > 0: eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss") eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc") eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy( name="disc_accuracy") eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy( name="disc_precision") eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy( name="disc_recall") # Set up tensorboard current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") train_log_dir = os.path.join( config.log_dir, current_time, 'train_' + str(get_rank()) + '_of_' + str(get_world_size())) train_summary_writer = tf.summary.create_file_writer(train_log_dir) # Set up dataset dataset = pretrain_utils.get_dataset(config, config.train_batch_size, world_size=get_world_size(), rank=get_rank()) train_iterator = iter(dataset) # Set up optimizer optimizer = create_optimizer(init_lr=config.learning_rate, num_train_steps=config.num_train_steps, num_warmup_steps=config.num_warmup_steps, 
weight_decay_rate=config.weight_decay_rate, optimizer=config.optimizer) if config.amp: optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, "dynamic") if config.do_train: # Set up checkpoint manager checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, model=model) manager = tf.train.CheckpointManager( checkpoint, config.checkpoints_dir, max_to_keep=config.keep_checkpoint_max) iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator) iter_manager = tf.train.CheckpointManager( iter_checkpoint, os.path.join(config.checkpoints_dir, 'iter_ckpt_rank_' + '{:02}'.format(get_rank())), checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()), max_to_keep=config.keep_checkpoint_max) if config.restore_checkpoint and manager.latest_checkpoint: checkpoint.restore(manager.latest_checkpoint) log(" ** Restored model checkpoint from {}".format( manager.latest_checkpoint)) if iter_manager.latest_checkpoint: iter_checkpoint.restore(iter_manager.latest_checkpoint) log(" ** Restored iterator checkpoint from {}".format( iter_manager.latest_checkpoint), all_rank=True) else: log(" ** Initializing from scratch.") utils.heading("Running training") train_start, start_step = time.time(), int(checkpoint.step) - 1 while int(checkpoint.step) <= config.num_train_steps: step = int(checkpoint.step) features = next(train_iterator) iter_start = time.time() # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir) total_loss, eval_fn_inputs = train_one_step( config, model, optimizer, features, step <= 1) # if step == 300: tf.profiler.experimental.stop() perf_metrics["train_perf"].update_state(config.train_batch_size * get_world_size() / (time.time() - iter_start)) eval_metrics["total_loss"].update_state(values=total_loss) metric_fn(config, eval_metrics, eval_fn_inputs) if step % 100 == 0: log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, ' 'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '. format( step, total_loss, eval_metrics["masked_lm_loss"].result().numpy(), eval_metrics["disc_loss"].result().numpy(), eval_metrics["masked_lm_accuracy"].result().numpy() * 100, eval_metrics["disc_accuracy"].result().numpy() * 100, perf_metrics["train_perf"].result().numpy(), utils.get_readable_time(time.time() - train_start), utils.get_readable_time( (time.time() - train_start) / (step - start_step) * (config.num_train_steps - step))), all_rank=True) with train_summary_writer.as_default(): for key, m in eval_metrics.items(): tf.summary.scalar(key, m.result(), step=step) for m in eval_metrics.values(): m.reset_states() checkpoint.step.assign_add(1) if step % config.save_checkpoints_steps == 0: if is_main_process(): save_path = manager.save() log(" ** Saved model checkpoint for step {}: {}".format( step, save_path)) iter_save_path = iter_manager.save() log(" ** Saved iterator checkpoint for step {}: {}".format( step, iter_save_path), all_rank=True) if config.do_eval: pass
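A sketch of the per-rank iterator checkpointing used above, with illustrative paths: each worker saves and restores its own position in the input pipeline, so a restart does not replay already-consumed data.

import os
import tensorflow as tf

def make_iterator_manager(train_iterator, checkpoints_dir, rank, max_to_keep=3):
    # tf.data iterators are trackable, so they can live in their own Checkpoint.
    iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
    name = 'iter_ckpt_rank_{:02d}'.format(rank)
    manager = tf.train.CheckpointManager(
        iter_checkpoint,
        os.path.join(checkpoints_dir, name),
        checkpoint_name=name,
        max_to_keep=max_to_keep)
    if manager.latest_checkpoint:
        iter_checkpoint.restore(manager.latest_checkpoint)
    return manager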
def main(): # Parse args and create config args, base_config, base_model, config_module = get_base_config(sys.argv[1:]) if args.mode == "interactive_infer": raise ValueError( "Interactive infer is meant to be run from an IPython", "notebook not from run.py." ) # restore_best_checkpoint = base_config.get('restore_best_checkpoint', False) # # Check logdir and create it if necessary # checkpoint = check_logdir(args, base_config, restore_best_checkpoint) load_model = base_config.get('load_model', None) restore_best_checkpoint = base_config.get('restore_best_checkpoint', False) base_ckpt_dir = check_base_model_logdir(load_model, args, restore_best_checkpoint) base_config['load_model'] = base_ckpt_dir # Check logdir and create it if necessary checkpoint = check_logdir(args, base_config, restore_best_checkpoint) # Initilize Horovod if base_config['use_horovod']: import horovod.tensorflow as hvd hvd.init() if hvd.rank() == 0: deco_print("Using horovod") from mpi4py import MPI MPI.COMM_WORLD.Barrier() else: hvd = None if args.enable_logs: if hvd is None or hvd.rank() == 0: old_stdout, old_stderr, stdout_log, stderr_log = create_logdir( args, base_config ) base_config['logdir'] = os.path.join(base_config['logdir'], 'logs') if args.mode == 'train' or args.mode == 'train_eval' or args.benchmark: if hvd is None or hvd.rank() == 0: if checkpoint is None or args.benchmark: if base_ckpt_dir: deco_print("Starting training from the base model") else: deco_print("Starting training from scratch") else: deco_print( "Restored checkpoint from {}. Resuming training".format(checkpoint), ) elif args.mode == 'eval' or args.mode == 'infer': if hvd is None or hvd.rank() == 0: deco_print("Loading model from {}".format(checkpoint)) # Create model and train/eval/infer with tf.Graph().as_default(): model = create_model( args, base_config, config_module, base_model, hvd, checkpoint) if args.mode == "train_eval": train(model[0], model[1], debug_port=args.debug_port) elif args.mode == "train": train(model, None, debug_port=args.debug_port) elif args.mode == "eval": evaluate(model, checkpoint) elif args.mode == "infer": infer(model, checkpoint, args.infer_output_file) if args.enable_logs and (hvd is None or hvd.rank() == 0): sys.stdout = old_stdout sys.stderr = old_stderr stdout_log.close() stderr_log.close()
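The run script above creates directories and logs on rank 0 only, then synchronizes the remaining workers with an MPI barrier. A minimal sketch of that pattern (the directory name is an example, and `hvd` may be None when Horovod is disabled):

import os

def prepare_logdir(hvd, logdir):
    # Only one process touches the filesystem; the rest wait at the barrier
    # so the directory is guaranteed to exist before anyone writes into it.
    if hvd is None or hvd.rank() == 0:
        os.makedirs(logdir, exist_ok=True)
    if hvd is not None:
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()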
def main(_): print(FLAGS) print(tf.__version__, "==tensorflow version==") hvd.init() init_checkpoint = os.path.join(FLAGS.buckets, FLAGS.init_checkpoint) train_file = os.path.join(FLAGS.buckets, FLAGS.train_file) dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file) checkpoint_dir = os.path.join(FLAGS.buckets, FLAGS.model_output) print(init_checkpoint, train_file, dev_file, checkpoint_dir) worker_count = hvd.size() task_index = hvd.local_rank() is_chief = task_index == 0 print("==worker_count==", worker_count, "==local_rank==", task_index, "==is is_chief==", is_chief) cluster = "" target = "" # FLAGS.config_file = os.path.join(FLAGS.buckets, FLAGS.config_file) FLAGS.label_id = os.path.join(FLAGS.buckets, FLAGS.label_id) if FLAGS.mode == "single_task": train_eval_api = hvd_train_eval elif FLAGS.mode == "multi_task": train_eval_api = multitask_hvd_train_eval if FLAGS.run_type == "sess": train_eval_api.monitored_sess( FLAGS=FLAGS, worker_count=worker_count, task_index=task_index, cluster=cluster, is_chief=is_chief, target=target, init_checkpoint=init_checkpoint, train_file=train_file, dev_file=dev_file, checkpoint_dir=checkpoint_dir, distribution_strategy=FLAGS.distribution_strategy, rule_model=FLAGS.rule_model, parse_type=FLAGS.parse_type, train_op=FLAGS.train_op, running_type=FLAGS.running_type, input_target=FLAGS.input_target, decay=FLAGS.decay, warmup=FLAGS.warmup, distillation=FLAGS.distillation, temperature=FLAGS.temperature, distillation_ratio=FLAGS.distillation_ratio) elif FLAGS.run_type == "estimator": train_eval_api.monitored_estimator( FLAGS=FLAGS, worker_count=worker_count, task_index=task_index, cluster=cluster, is_chief=is_chief, target=target, init_checkpoint=init_checkpoint, train_file=train_file, dev_file=dev_file, checkpoint_dir=checkpoint_dir, distribution_strategy=FLAGS.distribution_strategy, rule_model=FLAGS.rule_model, parse_type=FLAGS.parse_type, train_op=FLAGS.train_op, running_type=FLAGS.running_type, input_target=FLAGS.input_target, decay=FLAGS.decay, warmup=FLAGS.warmup, distillation=FLAGS.distillation, temperature=FLAGS.temperature, distillation_ratio=FLAGS.distillation_ratio)
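One detail worth noting in the snippet above is that `is_chief` is derived from `hvd.local_rank()`, which is zero on every node of a multi-node job. A short sketch of the usual rank bookkeeping:

import horovod.tensorflow as hvd

hvd.init()
worker_count = hvd.size()      # total workers across all nodes
task_index = hvd.rank()        # globally unique id in [0, size)
is_chief = task_index == 0     # exactly one chief in the whole job
local_gpu = hvd.local_rank()   # GPU index to pin on this particular node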
def main(): gpu_thread_count = 2 os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' hvd.init() config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = 5 #config.gpu_options.allow_growth = True log_name = 'hvd_train.txt' ''' training stratey ''' training_strategy = [{ 'epoch': [0, 4], 'lr': [1.0, 3.0], 'lr_method': 'linear', 'batch_size': 768, 'image_size': (128, 128), 'data_dir': '160', 'prefix': 'train' }, { 'epoch': [4, 15], 'lr': [3.0, 0.01], 'lr_method': 'linear', 'batch_size': 768, 'image_size': (128, 128), 'data_dir': '160', 'prefix': 'train' }, { 'epoch': [15, 32], 'lr': [0.2, 0.002], 'lr_method': 'exp', 'batch_size': 256, 'image_size': (224, 224), 'data_dir': '320', 'prefix': 'train' }, { 'epoch': [32, 37], 'lr': [0.003, 0.0005], 'lr_method': 'linear', 'batch_size': 128, 'image_size': (288, 288), 'data_dir': '320', 'prefix': 'train' }] training_strategy = [{ 'epoch': [0, 6], 'lr': [1.0, 2.0], 'lr_method': 'linear', 'batch_size': 740, 'image_size': (128, 128), 'data_dir': '160', 'prefix': 'train' }, { 'epoch': [6, 21], 'lr': [2.0, 0.45], 'lr_method': 'linear', 'batch_size': 740, 'image_size': (128, 128), 'data_dir': '160', 'prefix': 'train' }, { 'epoch': [21, 32], 'lr': [0.45, 0.02], 'lr_method': 'exp', 'batch_size': 256, 'image_size': (224, 224), 'data_dir': '320', 'prefix': 'train' }, { 'epoch': [32, 36], 'lr': [0.02, 0.004], 'lr_method': 'exp', 'batch_size': 196, 'image_size': (224, 224), 'data_dir': '320', 'prefix': 'train' }, { 'epoch': [36, 40], 'lr': [0.004, 0.002], 'lr_method': 'exp', 'batch_size': 128, 'image_size': (288, 288), 'data_dir': '320', 'prefix': 'train' }] num_training_samples = 1281167 num_eval_samples = 50000 cmdline = add_cli_args() FLAGS, unknown_args = cmdline.parse_known_args() do_checkpoint = hvd.rank() == 0 barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) if hvd.local_rank() == 0 and not os.path.isdir(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) logger = logging.getLogger(log_name) logger.setLevel(logging.INFO) # INFO, ERROR ch = logging.StreamHandler() ch.setLevel(logging.INFO) formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) logger.addHandler(ch) fh = logging.FileHandler(os.path.join(FLAGS.log_dir, log_name)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) if not FLAGS.save_checkpoints_steps: # default to save one checkpoint per epoch FLAGS.save_checkpoints_steps = 625 if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = 625 data_strategy, lr_strategy = dynamicpipe.lr_strategy_parsing( training_strategy, num_training_samples, FLAGS.num_gpus) num_steps = lr_strategy[-1]['steps'][-1] + FLAGS.display_every rank0log(logger, 'Data strategy: ' + str(data_strategy)) rank0log(logger, 'Learning rate strategy:' + str(lr_strategy)) rank0log(logger, 'Total Max Training Steps: ' + str(num_steps)) rank0log( logger, 'Checkpointing every ' + str(FLAGS.save_checkpoints_steps) + ' steps') rank0log( logger, 'Saving 
summary every ' + str(FLAGS.save_summary_steps) + ' steps') rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) rank0log(logger, "Horovod size: ", hvd.size()) classifier = tf.estimator.Estimator( model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ 'n_classes': 1000, 'mom': FLAGS.mom, 'num_steps': num_steps, 'wdecay': FLAGS.wdecay, 'loss_scale': FLAGS.loss_scale, 'num_training_samples': num_training_samples, 'lr_strategy': lr_strategy }, config=tf.estimator.RunConfig( session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, keep_checkpoint_max=None)) num_preproc_threads = 6 rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads) training_hooks = [ hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook() ] if hvd.rank() == 0: training_hooks.append( LogSessionRunHook(num_training_samples, FLAGS.num_gpus, FLAGS.display_every, logger)) start_time = time.time() classifier.train( input_fn=lambda: dynamicpipe.data_pipeline(num_training_samples, FLAGS.num_gpus, data_strategy, FLAGS.data_dir, mode="TRAINING"), max_steps=num_steps, hooks=training_hooks) rank0log(logger, "Log: Finished in ", time.time() - start_time) rank0log(logger, "Log: Evaluating") rank0log(logger, "Log: Validation dataset size: 50000") eval_strategy = [{ 'epoch': 1, 'batch_size': 128, 'image_size': (288, 288), 'data_dir': '320', 'prefix': 'validation' }] #evaluation on single GPU #if hvd.rank() == 0: rank0log(logger, ' step top1 top5 loss checkpoint_time(UTC)') ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if hvd.rank() == i % FLAGS.num_gpus: eval_result = classifier.evaluate( input_fn=lambda: dynamicpipe.data_pipeline(num_eval_samples, 1, eval_strategy, FLAGS.data_dir, mode="EVAL"), checkpoint_path=c['path']) c['epoch'] = i c['top1'] = eval_result['val-top1acc'] c['top5'] = eval_result['val-top5acc'] c['loss'] = eval_result['loss'] logger.info( 'Log @eval: count@{:5d} step@{:5d} top1@{:5.3f} top5@{:6.2f} loss@{:6.2f} time@{time}' .format(c['epoch'], c['step'], c['top1'] * 100, c['top5'] * 100, c['loss'], time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['mtime'])))) rank0log(logger, "Log Finished evaluation")
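A sketch of the round-robin checkpoint evaluation used at the end of the run above: checkpoints are spread over the workers so validating a long training run is parallelized. `evaluate_fn` is a placeholder for the Estimator evaluate call, and the sketch keys on `hvd.size()` rather than the configured GPU count.

import horovod.tensorflow as hvd

def evaluate_round_robin(checkpoint_paths, evaluate_fn):
    results = []
    for i, path in enumerate(checkpoint_paths):
        # Worker r evaluates the checkpoints with index i % size == r.
        if hvd.rank() == i % hvd.size():
            results.append((path, evaluate_fn(path)))
    return results  # each worker returns only its own share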
def main(_): hvd.init() sess_config = tf.ConfigProto() sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) graph = tf.Graph() with graph.as_default(): import json config = json.load(open(FLAGS.config_file, "r")) init_checkpoint = FLAGS.init_checkpoint config = Bunch(config) config.use_one_hot_embeddings = True config.scope = "bert" config.dropout_prob = 0.1 config.label_type = "single_label" if FLAGS.if_shard == "0": train_size = FLAGS.train_size epoch = int(FLAGS.epoch / hvd.size()) elif FLAGS.if_shard == "1": train_size = int(FLAGS.train_size / hvd.size()) epoch = FLAGS.epoch init_lr = 2e-5 num_train_steps = int(train_size / FLAGS.batch_size * epoch) num_warmup_steps = int(num_train_steps * 0.1) num_storage_steps = int(train_size / FLAGS.batch_size) print(" model type {}".format(FLAGS.model_type)) print(num_train_steps, num_warmup_steps, "=============") opt_config = Bunch({ "init_lr": init_lr / hvd.size(), "num_train_steps": num_train_steps, "num_warmup_steps": num_warmup_steps }) sess = tf.Session(config=sess_config) model_io_config = Bunch({"fix_lm": False}) model_io_fn = model_io.ModelIO(model_io_config) optimizer_fn = optimizer.Optimizer(opt_config) num_classes = FLAGS.num_classes model_train_fn = bert_classifier.classifier_model_fn_builder( config, num_classes, init_checkpoint, reuse=None, load_pretrained=True, model_io_fn=model_io_fn, optimizer_fn=optimizer_fn, model_io_config=model_io_config, opt_config=opt_config) model_eval_fn = bert_classifier.classifier_model_fn_builder( config, num_classes, init_checkpoint, reuse=True, load_pretrained=True, model_io_fn=model_io_fn, optimizer_fn=optimizer_fn, model_io_config=model_io_config, opt_config=opt_config) def metric_fn(features, logits, loss): print(logits.get_shape(), "===logits shape===") pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32) prob = tf.nn.softmax(logits) accuracy = correct = tf.equal( tf.cast(pred_label, tf.int32), tf.cast(features["label_ids"], tf.int32)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) return { "accuracy": accuracy, "loss": loss, "pred_label": pred_label, "label_ids": features["label_ids"] } name_to_features = { "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "label_ids": tf.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example. """ example = tf.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. 
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example params = Bunch({}) params.epoch = FLAGS.epoch params.batch_size = FLAGS.batch_size train_features = tf_data_utils.train_input_fn(FLAGS.train_file, _decode_record, name_to_features, params, if_shard=FLAGS.if_shard) eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file, _decode_record, name_to_features, params, if_shard=FLAGS.if_shard) [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN) [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) result = metric_fn(eval_features, eval_logits, eval_loss) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) sess.run(hvd.broadcast_global_variables(0)) model_io_fn.set_saver() print("===horovod rank==={}".format(hvd.rank())) def eval_fn(result): i = 0 total_accuracy = 0 label, label_id = [], [] while True: try: eval_result = sess.run(result) total_accuracy += eval_result["accuracy"] label_id.extend(eval_result["label_ids"]) label.extend(eval_result["pred_label"]) i += 1 except tf.errors.OutOfRangeError: print("End of dataset") break macro_f1 = f1_score(label_id, label, average="macro") micro_f1 = f1_score(label_id, label, average="micro") macro_precision = precision_score(label_id, label, average="macro") micro_precision = precision_score(label_id, label, average="micro") macro_recall = recall_score(label_id, label, average="macro") micro_recall = recall_score(label_id, label, average="micro") accuracy = accuracy_score(label_id, label) print("test accuracy {} macro_f1 score {} micro_f1 {} accuracy {}". format(total_accuracy / i, macro_f1, micro_f1, accuracy)) return total_accuracy / i, label_id, label def train_fn(op, loss): i = 0 total_loss = 0 cnt = 0 while True: try: [_, train_loss] = sess.run([op, loss]) i += 1 cnt += 1 total_loss += train_loss # print("==device id {} global step {}".format(hvd.rank(), step)) if np.mod(i, num_storage_steps) == 0: print(total_loss / cnt) if hvd.rank() == 0: model_io_fn.save_model( sess, FLAGS.model_output + "/oqmrc_{}.ckpt".format( int(i / num_storage_steps))) cnt = 0 total_loss = 0 except tf.errors.OutOfRangeError: print("End of dataset") break import time import time start = time.time() train_fn(train_op, train_loss) acc, true_label, pred_label = eval_fn(result) end = time.time() print("==total time {} numbers of devices {}".format( end - start, hvd.size())) if hvd.rank() == 0: model_io_fn.save_model(sess, FLAGS.model_output + "/oqmrc.ckpt") import _pickle as pkl pkl.dump({ "true_label": true_label, "pred_label": pred_label }, open(FLAGS.model_output + "/eval_result.json", "wb"))
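The snippet above offers two sharding modes (`if_shard`): either each worker sees 1/size of the data per epoch, or each worker runs 1/size of the epochs. A sketch of the resulting per-worker schedule; argument values are illustrative:

def per_worker_schedule(train_size, batch_size, epochs, world_size, shard_data):
    if shard_data:                       # if_shard == "1": split the data
        train_size = train_size // world_size
    else:                                # if_shard == "0": split the epochs
        epochs = max(1, epochs // world_size)
    num_train_steps = int(train_size / batch_size * epochs)
    num_warmup_steps = int(num_train_steps * 0.1)
    num_storage_steps = int(train_size / batch_size)   # save once per "epoch"
    return num_train_steps, num_warmup_steps, num_storage_steps

# e.g. per_worker_schedule(120000, 32, 4, hvd.size(), shard_data=True)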
else: pred = OfflinePredictor(PredictConfig( model=MODEL, session_init=get_model_loader(args.load), input_names=MODEL.get_inference_tensor_names()[0], output_names=MODEL.get_inference_tensor_names()[1])) if args.evaluate: assert args.evaluate.endswith('.json'), args.evaluate offline_evaluate(pred, args.evaluate) elif args.predict: COCODetection(cfg.DATA.BASEDIR, 'val2014') # Only to load the class names into caches predict(pred, args.predict) else: is_horovod = cfg.TRAINER == 'horovod' if is_horovod: hvd.init() logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size())) if not is_horovod or hvd.rank() == 0: logger.set_logger_dir(args.logdir, 'd') finalize_configs(is_training=True) stepnum = cfg.TRAIN.STEPS_PER_EPOCH # warmup is step based, lr is epoch based init_lr = cfg.TRAIN.BASE_LR * 0.33 * min(8. / cfg.TRAIN.NUM_GPUS, 1.) warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)] warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)] factor = 8. / cfg.TRAIN.NUM_GPUS
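The fragment above ends mid-schedule; a sketch of its warmup bookkeeping, where warmup is expressed in steps while the later schedule is expressed in epochs, so the warmup end point has to be converted (the arguments stand in for the `cfg.TRAIN` values):

def warmup_lr_schedule(base_lr, num_gpus, warmup_steps, steps_per_epoch):
    # Start from a lower LR when training with fewer than 8 GPUs.
    init_lr = base_lr * 0.33 * min(8.0 / num_gpus, 1.0)
    warmup_schedule = [(0, init_lr), (warmup_steps, base_lr)]
    # Convert the step-based warmup end into the epoch-based schedule.
    warmup_end_epoch = warmup_steps * 1.0 / steps_per_epoch
    lr_schedule = [(int(warmup_end_epoch + 0.5), base_lr)]
    return warmup_schedule, lr_schedule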
def main(_): hvd.init() sess_config = tf.ConfigProto() sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) graph = tf.Graph() from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score with graph.as_default(): import json # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r")) config = json.load(open(FLAGS.config_file, "r")) init_checkpoint = FLAGS.init_checkpoint print("===init checkoutpoint==={}".format(init_checkpoint)) config = Bunch(config) config.use_one_hot_embeddings = True config.scope = "bert" config.dropout_prob = 0.1 config.label_type = "single_label" config.lm_ratio = 0.1 config.task_ratio = 1.0 json.dump(config, open(FLAGS.model_output+"/config.json", "w")) init_lr = 2e-5 if FLAGS.if_shard == "0": train_size = FLAGS.train_size epoch = int(FLAGS.epoch / hvd.size()) elif FLAGS.if_shard == "1": train_size = int(FLAGS.train_size/hvd.size()) epoch = FLAGS.epoch sess = tf.Session(config=sess_config) num_train_steps = int( train_size / FLAGS.batch_size * epoch) num_warmup_steps = int(num_train_steps * 0.1) num_storage_steps = int(train_size / FLAGS.batch_size) print(num_train_steps, num_warmup_steps, "=============") opt_config = Bunch({"init_lr":init_lr/(hvd.size()), "num_train_steps":num_train_steps, "num_warmup_steps":num_warmup_steps}) model_io_config = Bunch({"fix_lm":False}) model_io_fn = model_io.ModelIO(model_io_config) optimizer_fn = optimizer.Optimizer(opt_config) num_choice = FLAGS.num_classes max_seq_length = FLAGS.max_length max_predictions_per_seq = FLAGS.max_predictions_per_seq model_train_fn = classifier_fn.classifier_model_fn_builder(config, num_choice, init_checkpoint, reuse=None, load_pretrained=True, model_io_fn=model_io_fn, optimizer_fn=optimizer_fn, model_io_config=model_io_config, opt_config=opt_config) model_eval_fn = classifier_fn.classifier_model_fn_builder(config, num_choice, init_checkpoint, reuse=True, load_pretrained=True, model_io_fn=model_io_fn, optimizer_fn=optimizer_fn, model_io_config=model_io_config, opt_config=opt_config) name_to_features = { "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64), "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64), "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64), "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64), "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64), "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32), "label_ids": tf.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example. """ example = tf.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. 
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example params = Bunch({}) params.epoch = epoch params.batch_size = FLAGS.batch_size def parse_folder(path): files = os.listdir(path) output = [] for file_name in files: output.append(os.path.join(path, file_name)) random.shuffle(output) return output train_features = tf_data_utils.train_input_fn( parse_folder(FLAGS.train_file), _decode_record, name_to_features, params) train_dict = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN) eval_features = tf_data_utils.eval_input_fn( parse_folder(FLAGS.dev_file), _decode_record, name_to_features, params) eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) model_io_fn.set_saver() init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) sess.run(hvd.broadcast_global_variables(0)) def eval_fn(op_dict): def eval_fn(op_dict): i = 0 eval_total_dict = {} while True: try: eval_result = sess.run(op_dict) for key in eval_result: if key in ["probabilities", "label_ids"]: if key in eval_total_dict: eval_total_dict[key].extend(eval_result[key]) else: eval_total_dict[key] = [] eval_total_dict[key].extend(eval_result[key]) i += 1 except tf.errors.OutOfRangeError: print("End of dataset") break for key in eval_result: if key not in ["probabilities", "label_ids"]: eval_total_dict[key] = eval_result[key] label_id = eval_total_dict["label_ids"] label = np.argmax(np.array(eval_total_dict["probabilities"]), axis=-1) macro_f1 = f1_score(label_id, label, average="macro") micro_f1 = f1_score(label_id, label, average="micro") accuracy = accuracy_score(label_id, label) print("test accuracy {} macro_f1 score {} micro_f1 {} masked_lm_accuracy {} sentence_f {}".format(accuracy, macro_f1, micro_f1, eval_total_dict["masked_lm_accuracy"], eval_total_dict["sentence_f"])) return eval_total_dict def run_eval(steps): import _pickle as pkl eval_features = tf_data_utils.eval_input_fn( parse_folder(FLAGS.dev_file), _decode_record, name_to_features, params) eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) sess.run(tf.local_variables_initializer()) eval_finial_dict = eval_fn(eval_dict) if hvd.rank() == 0: pkl.dump(eval_finial_dict, open(FLAGS.model_output+"/eval_dict_{}.pkl".format(steps), "wb")) return eval_finial_dict def train_fn(op_dict): i = 0 cnt = 0 loss_dict = {} monitoring_train = [] monitoring_eval = [] while True: try: train_result = sess.run(op_dict) for key in train_result: if key == "train_op": continue else: if np.isnan(train_result[key]): print(train_loss, "get nan loss") break else: if key in loss_dict: loss_dict[key] += train_result[key] else: loss_dict[key] = train_result[key] i += 1 cnt += 1 if np.mod(i, num_storage_steps) == 0: string = "" for key in loss_dict: tmp = key + " " + str(loss_dict[key]/cnt) + "\t" string += tmp print(string) monitoring_train.append(loss_dict) eval_finial_dict = run_eval(int(i/num_storage_steps)) monitoring_eval.append(eval_finial_dict) for key in loss_dict: loss_dict[key] = 0.0 if hvd.rank() == 0: model_io_fn.save_model(sess, FLAGS.model_output+"/model_{}.ckpt".format(int(i/num_storage_steps))) print("==successful storing model=={}".format(int(i/num_storage_steps))) cnt = 0 except tf.errors.OutOfRangeError: if hvd.rank() == 0: import _pickle as pkl pkl.dump({"train":monitoring_train, "eval":monitoring_eval}, open(FLAGS.model_output+"/monitoring.pkl", "wb")) break print("===========begin to train============") 
train_fn(train_dict) if hvd.rank() == 0: model_io_fn.save_model(sess, FLAGS.model_output+"/model.ckpt") print("===========begin to eval============") eval_finial_dict = run_eval("final") if __name__ == "__main__": tf.app.run()
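Both the training and evaluation loops above run a one-shot input pipeline until it raises OutOfRangeError. A minimal sketch of that drain pattern (`sess` and `fetches` are assumed to exist):

import tensorflow as tf

def drain(sess, fetches):
    outputs = []
    while True:
        try:
            outputs.append(sess.run(fetches))
        except tf.errors.OutOfRangeError:
            break   # the input pipeline is exhausted
    return outputs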
def main(_): hvd.init() FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join( FLAGS.output_dir, str(hvd.rank())) FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size() FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size() tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(bert_config_file.name) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), log_step_count_steps=25, session_config=config) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True) hooks = [hvd.BroadcastGlobalVariablesHook(0)] estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=hooks) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
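A sketch of two adjustments made above: the step budget is divided by the world size (each step now consumes size() batches), and every non-zero rank writes its Estimator artifacts into a scratch subdirectory so rank 0's checkpoints are not clobbered (paths and the helper name are illustrative):

import os
import horovod.tensorflow as hvd

def scale_for_horovod(output_dir, num_train_steps, num_warmup_steps):
    if hvd.rank() != 0:
        output_dir = os.path.join(output_dir, str(hvd.rank()))
    return (output_dir,
            num_train_steps // hvd.size(),
            num_warmup_steps // hvd.size())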
def main() -> None: # Horovod Init hvd.init() size = hvd.size() # Config GPUs gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') # get optimizer & loss function loss_function = tf.keras.losses.SparseCategoricalCrossentropy() opt = tf.keras.optimizers.Adam(lr=Config.LEARNING_RATE * size) # Data imagenet = ImageNet(take=20) train_ds, val_ds = imagenet.train_ds, imagenet.val_ds n_train_batches = train_ds.cardinality().numpy() n_val_batches = val_ds.cardinality().numpy() # Callbacks callbacks = [] callbacks.append(hvdK.callbacks.BroadcastGlobalVariablesCallback(0)) callbacks.append(hvdK.callbacks.MetricAverageCallback()) callbacks.append( hvdK.callbacks.LearningRateWarmupCallback( warmup_epochs=5, initial_lr=Config.LEARNING_RATE)) callbacks.append( tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)) if hvd.rank() == 0: ckpt_dir = Config.SAVED_WEIGHTS_DIR + "/" + Config.RUN_NAME if not os.path.exists(ckpt_dir): os.makedirs(ckpt_dir) ckpt = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_dir+ \ "/epoch-{epoch:02d}-loss={val_loss:.2f}.h5", monitor='val_loss', save_best_only=True, mode='min') log_dir = Config.LOG_DIR + "/" + Config.RUN_NAME tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir) callbacks.append(ckpt) callbacks.append(tensorboard) callbacks.append(tfa.callbacks.TQDMProgressBar()) # Model model = ResNet50() model.loss_function = loss_function model.train_step = types.MethodType(distributed_train_step, model) model.compile(optimizer=opt, loss=loss_function) # Train model.fit(train_ds, steps_per_epoch=n_train_batches // size, validation_data=val_ds, validation_steps=n_val_batches // size, epochs=Config.EPOCHS, verbose=0, callbacks=callbacks)
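The Keras example above patches `Model.train_step` with a custom distributed step. A more common alternative, sketched here with a placeholder model and learning rate (not the snippet's approach), is to wrap the optimizer with Horovod's Keras DistributedOptimizer and leave `fit` unchanged:

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
opt = tf.keras.optimizers.Adam(learning_rate=0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)   # allreduces gradients inside apply_gradients

model = tf.keras.applications.ResNet50(weights=None, classes=1000)
model.compile(optimizer=opt,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
# model.fit(...) then proceeds as usual, with BroadcastGlobalVariablesCallback(0)
# in the callbacks list so all workers start from identical weights.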