def main(_):
    """Train the activity-recognition LSTM with Horovod and export it.

    In order, this function:

    1. initialises Horovod and removes a previously exported model directory;
    2. dumps the ``sensorData`` table to a CSV file and loads it with pandas;
    3. cuts the six sensor channels into windows of ``n_time_steps`` rows,
       one window every 20 rows, labelled with the most frequent activity;
    4. pickles the label decoding table and the held-out test split;
    5. trains inside a ``MonitoredTrainingSession``;
    6. on rank 0 only, freezes the latest checkpoint and writes a SavedModel
       with a ``predict_activity`` signature.

    Args:
        _: Unused. Presumably the argv list passed by ``tf.app.run`` --
            confirm against the caller.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    data_dir = os.path.join(home, 'data')
    model_dir = os.path.join(data_dir, 'model')

    # delete previous saving checkpoints and model
    # if os.path.exists('./checkpoints') and os.path.isdir('./checkpoints'):
    #     shutil.rmtree('./checkpoints')
    # SavedModelBuilder (used at the end) is given this same path, so a stale
    # export is removed first. isdir() already implies exists().
    # NOTE(review): every rank executes this removal, not only rank 0.
    if os.path.isdir(model_dir):
        shutil.rmtree(model_dir)

    # Data set sources : http://archive.ics.uci.edu/ml/datasets/ \
    # Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions
    # sensorData_timestamp.txt is pre-processed data and is based on UCI datasets.
    # load dataset from DB
    # NOTE(review): connection details and credentials are hard-coded here.
    mysql_to_csv(sql='Select * From sensorData',
                 file_path='./sensorData_timestamp1.csv',
                 host='163.180.117.202',
                 port=3847,
                 user='******',
                 password='******',
                 dbName='hardbnew')

    columns = [
        'user', 'activity', 'timestamp', 'acc_x-axis', 'acc_y-axis',
        'acc_z-axis', 'gyro_x-axis', 'gyro_y-axis', 'gyro_z-axis'
    ]
    df = pd.read_csv('./sensorData_timestamp1.csv',
                     header=None,
                     names=columns,
                     lineterminator='\n')
    df = df.dropna()

    # Sliding-window segmentation. The per-channel arrays and the activity
    # column are looked up once instead of on every loop iteration.
    step = 20
    channel_values = [
        df[name].values
        for name in ('acc_x-axis', 'acc_y-axis', 'acc_z-axis',
                     'gyro_x-axis', 'gyro_y-axis', 'gyro_z-axis')
    ]
    activity = df['activity']
    segments = []
    labels = []
    for i in range(0, len(df) - n_time_steps, step):
        window = slice(i, i + n_time_steps)
        # NOTE(review): the [0][0] indexing relies on the array-returning
        # behaviour of older scipy.stats.mode -- confirm the pinned SciPy.
        label = stats.mode(activity[window])[0][0]
        segments.append([values[window] for values in channel_values])
        labels.append(label)

    # NOTE(review): each segment is appended as (channels, time) and then
    # reshaped (not transposed) to (-1, n_time_steps, n_features). Kept as-is
    # because already exported models depend on this layout -- confirm intent.
    reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(
        -1, n_time_steps, n_features)

    tmp_df = pd.get_dummies(labels)
    labels = np.asarray(tmp_df, dtype=np.float32)
    reverse_one_hot_encode = tmp_df.idxmax().reset_index().rename(columns={
        'index': 'activity',
        0: 'idx'
    })
    # Context managers make sure each pickle file is flushed and closed.
    with open(os.path.join(data_dir, 'reverse_one_hot_encode'), "wb") as f:
        pickle.dump(reverse_one_hot_encode, f)

    # Data split train : test = 80 : 20
    # This split method causes overfit. We need a K-fold training method.
    x_train, x_test, y_train, y_test = train_test_split(
        reshaped_segments, labels, test_size=0.2, random_state=random_seed)
    with open(os.path.join(data_dir, 'x_test'), "wb") as f:
        pickle.dump(x_test, f)
    with open(os.path.join(data_dir, 'y_test'), "wb") as f:
        pickle.dump(y_test, f)

    # Build model...
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, n_time_steps, n_features],
                           name="inputs")
        y = tf.placeholder(tf.float32, [None, n_classes], name="label")
    predict, loss = create_lstm_model(x, y)
    tf.summary.scalar("loss", loss)
    # correct_pred = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    # accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    global_step = tf.train.get_or_create_global_step()
    train_op = optimizer.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=8000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
        tf.train.SummarySaverHook(save_secs=10,
                                  output_dir='/tmp/tf',
                                  summary_op=tf.summary.merge_all())
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=batch_size)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            input_batch, target = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={x: input_batch, y: target})

    # save model
    # Only rank 0 has a checkpoint directory, so only rank 0 exports.
    if hvd.rank() != 0:
        return

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    optGraph = optimize_for_inference_lib.optimize_for_inference(
        tf.get_default_graph().as_graph_def(), ["input/inputs"], ["y_"],
        dtypes.float32.as_datatype_enum)
    frozenGraph = freeze_graph.freeze_graph_with_def_protos(
        optGraph, None, checkpoint_file, "y_", None, None, "frozen.pb", True,
        None)

    # Re-import the frozen graph into a fresh graph and wrap it in a
    # SavedModel whose output is the argmax over the "y_" tensor.
    with tf.Graph().as_default():
        importer.import_graph_def(frozenGraph, name="")
        with tf.Session() as sess:
            inputs = tf.get_default_graph().get_tensor_by_name(
                "input/inputs:0")
            model = tf.get_default_graph().get_tensor_by_name("y_:0")
            predictor = tf.argmax(model, 1, name="predictor")
            inputs_classes = tf.saved_model.utils.build_tensor_info(
                inputs)  # input
            outputs_classes = tf.saved_model.utils.build_tensor_info(
                predictor)  # output
            signature = (tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    inputs_classes
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    outputs_classes
                },
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))
            builder = tf.saved_model.builder.SavedModelBuilder(model_dir)
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={'predict_activity': signature},
                legacy_init_op=legacy_init_op)
            builder.save()
def main():
    """Build the energy-based-model graph, then run training and testing.

    The function selects dataset, placeholders and network from
    ``FLAGS.dataset``, builds one tower per ``FLAGS.num_gpus`` (positive
    energy, a ``tf.while_loop`` sampler producing negative samples, and the
    loss selected by ``FLAGS.objective``), creates the session, optionally
    restores weights on rank 0, broadcasts variables from rank 0, and finally
    calls ``train`` (only when ``FLAGS.train``) followed by ``test``.

    Raises:
        AssertionError: if ``FLAGS.proj_norm`` is non-zero and
            ``FLAGS.proj_norm_type`` is neither ``'l2'`` nor ``'li'``.
        ValueError: if ``FLAGS.train`` is set and ``FLAGS.objective`` is not
            one of ``'logsumexp'``, ``'cd'`` or ``'softplus'``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    print("Local rank: ", hvd.local_rank(), hvd.size())

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    # Only rank 0 creates the log directory and owns a logger.
    if hvd.rank() == 0:
        if not osp.exists(logdir):
            os.makedirs(logdir)
        logger = TensorBoardOutputFormat(logdir)
    else:
        logger = None

    LABEL = None
    print("Loading data...")
    if FLAGS.dataset == 'cifar10':
        dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale)
        test_dataset = Cifar10(train=False, rescale=FLAGS.rescale)
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        if FLAGS.large_model:
            model = ResNet32Large(num_channels=channel_num,
                                  num_filters=128,
                                  train=True)
        elif FLAGS.larger_model:
            model = ResNet32Larger(num_channels=channel_num, num_filters=128)
        elif FLAGS.wider_model:
            model = ResNet32Wider(num_channels=channel_num, num_filters=192)
        else:
            model = ResNet32(num_channels=channel_num, num_filters=128)

    elif FLAGS.dataset == 'imagenet':
        dataset = Imagenet(train=True)
        test_dataset = Imagenet(train=False)
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet32Wider(num_channels=channel_num, num_filters=256)

    elif FLAGS.dataset == 'imagenetfull':
        # No `dataset` object here: this case uses TFImagenetLoader below.
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet128(num_channels=channel_num, num_filters=64)

    elif FLAGS.dataset == 'mnist':
        dataset = Mnist(rescale=FLAGS.rescale)
        test_dataset = dataset
        channel_num = 1
        X_NOISE = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        model = MnistNet(num_channels=channel_num,
                         num_filters=FLAGS.num_filters)

    elif FLAGS.dataset == 'dsprites':
        dataset = DSprites(cond_shape=FLAGS.cond_shape,
                           cond_size=FLAGS.cond_size,
                           cond_pos=FLAGS.cond_pos,
                           cond_rot=FLAGS.cond_rot)
        test_dataset = dataset
        channel_num = 1

        X_NOISE = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)

        # The label width depends on which conditioning flag is set.
        if FLAGS.dpos_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.dsize_only:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.drot_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.cond_shape:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_rot:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)

        model = DspritesNet(num_channels=channel_num,
                            num_filters=FLAGS.num_filters,
                            cond_size=FLAGS.cond_size,
                            cond_shape=FLAGS.cond_shape,
                            cond_pos=FLAGS.cond_pos,
                            cond_rot=FLAGS.cond_rot)

    print("Done loading...")

    if FLAGS.dataset == "imagenetfull":
        # In the case of full imagenet, use custom_tensorflow dataloader
        data_loader = TFImagenetLoader('train',
                                       FLAGS.batch_size,
                                       hvd.rank(),
                                       hvd.size(),
                                       rescale=FLAGS.rescale)
    else:
        data_loader = DataLoader(dataset,
                                 batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers,
                                 drop_last=True,
                                 shuffle=True)

    batch_size = FLAGS.batch_size

    weights = [model.construct_weights('context_0')]

    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # Variables to run in training
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_gen_grads = []
    x_mod_list = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer = hvd.DistributedOptimizer(optimizer)

    for j in range(FLAGS.num_gpus):

        if FLAGS.model_cclass:
            # Evaluate each image against all 10 one-hot labels, then sample
            # a label from the resulting energies.
            ind_batch_size = FLAGS.batch_size // FLAGS.num_gpus
            label_tensor = tf.Variable(tf.convert_to_tensor(np.reshape(
                np.tile(np.eye(10), (FLAGS.batch_size, 1, 1)),
                (FLAGS.batch_size * 10, 10)),
                                                            dtype=tf.float32),
                                       trainable=False,
                                       dtype=tf.float32)
            x_split = tf.tile(
                tf.reshape(X_SPLIT[j], (ind_batch_size, 1, 32, 32, 3)),
                (1, 10, 1, 1, 1))
            x_split = tf.reshape(x_split, (ind_batch_size * 10, 32, 32, 3))
            energy_pos = model.forward(x_split,
                                       weights[0],
                                       label=label_tensor,
                                       stop_at_grad=False)

            energy_pos_full = tf.reshape(energy_pos, (ind_batch_size, 10))
            energy_partition_est = tf.reduce_logsumexp(energy_pos_full,
                                                       axis=1,
                                                       keepdims=True)
            uniform = tf.random_uniform(tf.shape(energy_pos_full))
            label_tensor = tf.argmax(-energy_pos_full -
                                     tf.log(-tf.log(uniform)) -
                                     energy_partition_est,
                                     axis=1)
            label = tf.one_hot(label_tensor, 10, dtype=tf.float32)
            label = tf.Print(label, [label_tensor, energy_pos_full])
            LABEL_SPLIT[j] = label
            energy_pos = tf.concat(energy_pos, axis=0)
        else:
            energy_pos = [
                model.forward(X_SPLIT[j],
                              weights[0],
                              label=LABEL_POS_SPLIT[j],
                              stop_at_grad=False)
            ]
            energy_pos = tf.concat(energy_pos, axis=0)

        print("Building graph...")
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []

        energy_negs = []
        loss_energys = []

        # Energy of the initial noise sample, before any sampler step.
        energy_negs.extend([
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True)
        ])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        c = lambda i, x: tf.less(i, FLAGS.num_steps)

        def langevin_step(counter, x_mod):
            """Body of the sampling while-loop: noise, gradient step, clip."""
            x_mod = x_mod + tf.random_normal(
                tf.shape(x_mod),
                mean=0.0,
                stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat([
                model.forward(x_mod,
                              weights[0],
                              label=LABEL_SPLIT[j],
                              reuse=True,
                              stop_at_grad=False,
                              stop_batch=True)
            ],
                                                    axis=0)

            x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                              [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm,
                                              FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    # An explicit raise keeps this check alive under
                    # `python -O`, which strips `assert` statements.
                    raise AssertionError(
                        "unsupported proj_norm_type: {!r}".format(
                            FLAGS.proj_norm_type))

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        energy_eval = model.forward(x_mod,
                                    weights[0],
                                    label=LABEL_SPLIT[j],
                                    stop_at_grad=False,
                                    reuse=True)
        x_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        # Energy of the final sample; stop_gradient keeps the loss from
        # back-propagating through the sampler.
        energy_negs.append(
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True))

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        x_off = tf.reduce_mean(
            tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        loss_energy = model.forward(x_mod,
                                    weights[0],
                                    reuse=True,
                                    label=LABEL,
                                    stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(
                label_prob * tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:

            if FLAGS.objective == 'logsumexp':
                pos_term = temp * energy_pos
                energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'cd':
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = -tf.reduce_mean(temp * energy_neg)
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'softplus':
                loss_ml = FLAGS.ml_coeff * \
                    tf.nn.softplus(temp * (energy_pos - energy_neg))
            else:
                # Without this branch `loss_ml` would be unbound and the next
                # statement would fail with an unhelpful NameError.
                raise ValueError("unsupported objective: {!r}".format(
                    FLAGS.objective))

            loss_total = tf.reduce_mean(loss_ml)

            if not FLAGS.zero_kl:
                loss_total = loss_total + tf.reduce_mean(loss_energy)

            loss_total = loss_total + \
                FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) +
                                  tf.reduce_mean(tf.square((energy_neg))))

            print("Started gradient computation...")
            gvs = optimizer.compute_gradients(loss_total)
            # Drop pairs whose first element (the gradient) is None.
            gvs = [(k, v) for (k, v) in gvs if k is not None]

            print("Applying gradients...")

            tower_grads.append(gvs)

            print("Finished applying gradients.")

            target_vars['loss_ml'] = loss_ml
            target_vars['total_loss'] = loss_total
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin

    if FLAGS.train:
        grads = average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads)
        target_vars['train_op'] = train_op

    config = tf.ConfigProto()

    # Pin each process to the GPU matching its local rank when distributed.
    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30,
                                    keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    # Only rank 0 restores; the broadcast below copies its variables to the
    # other ranks.
    # NOTE(review): `resume_itr` is updated on rank 0 only -- confirm that
    # `train` tolerates differing values across ranks.
    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    sess.run(hvd.broadcast_global_variables(0))
    print("Initializing variables...")

    print("Start broadcast")
    print("End broadcast")

    if FLAGS.train:
        print("Training phase")
        train(target_vars, saver, sess, logger, data_loader, resume_itr,
              logdir)
    print("Testing phase")
    test(target_vars, saver, sess, logger, data_loader)
# Output layer: no activation, so `logits` are unnormalised scores.
logits = fully_connected(hidden2, n_outputs, scope="outputs", activation_fn=None)
# Named no-op; it is not referenced again in the visible code.
no_op = tf.no_op(name="no_op")

# Mean sparse softmax cross-entropy between labels `y` and `logits`.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01

# Plain gradient descent wrapped in Horovod's distributed optimizer.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer = hvd.DistributedOptimizer(optimizer)
    training_op = optimizer.minimize(loss, name='optimize')

# Accuracy: fraction of examples whose label is the top-1 logit.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
# Op that broadcasts rank 0's global variables to the other ranks.
bcast = hvd.broadcast_global_variables(0)

n_epochs = 20
batch_size = 100


# NOTE(review): only the two statements below are visible in this chunk; the
# function appears to continue elsewhere -- confirm against the full file.
def shuffle_batch(X, y, batch_size):
    # Random permutation of the row indices of X.
    rnd_idx = np.random.permutation(len(X))
    # Number of whole batches that fit into X.
    n_batches = len(X) // batch_size
epochs = 1000
# Dropout keep probability. Comment history: 0.5 per the paper, 0.7 was tried
# because 0.5 did not converge with regularization, then reverted to 0.5.
keep_probability = 0.5
# NOTE(review): the earlier comment history ended with "back to 0.01", but the
# value in the code is 0.001 -- confirm which one is intended.
starter_learning_rate = 0.001

global_step = tf.train.get_or_create_global_step()
# Staircase exponential decay: multiply the rate by 0.96 every 100000 steps.
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate,
                                                     global_step,
                                                     100000,
                                                     0.96,
                                                     staircase=True)

acc, cost = model(x, y, keep_probability)

# `train_op` is rebound three times: Adam optimizer -> Horovod distributed
# wrapper -> the minimize op that is actually run.
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                  beta1=0.9,
                                  beta2=0.999,
                                  epsilon=0.1)
train_op = hvd.DistributedOptimizer(train_op)
# NOTE(review): the original trailing comment here read "weight decay", but no
# weight-decay term is visible in this statement -- confirm.
train_op = train_op.minimize(cost, global_step=global_step)

# Pin this process to the GPU matching its Horovod local rank.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

# NOTE(review): `iter` shadows the Python builtin of the same name; it is left
# unchanged because later code outside this view may reference it.
iter = train_ds.make_initializable_iterator()
val_iter = val_ds.make_initializable_iterator()

iter_op = iter.get_next()
val_iter_op = val_iter.get_next()

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # Broadcast rank 0's freshly initialised variables to the other ranks.
    bcast = hvd.broadcast_global_variables(0)
    sess.run(bcast)
def start_training(config):
    """Run the training loop described by ``config``, optionally via Horovod.

    The function builds the network graph from ``config.NETWORK_CLASS``,
    restores pretrained weights and/or the last checkpoint on rank 0, and
    iterates from the restored global step up to the computed ``max_steps``.
    Inside the loop it periodically writes training summaries, saves
    checkpoints (rank 0 only) and evaluates on the validation dataset.

    Args:
        config: Configuration object. The attributes read here include
            ``IS_DISTRIBUTION``, ``NETWORK_CLASS``, ``NETWORK``, ``DATASET``,
            ``IS_DEBUG``, ``IS_PRETRAIN``, ``BATCH_SIZE``, ``MAX_EPOCHS`` or
            ``MAX_STEPS``, ``SUMMARISE_STEPS``, ``SAVE_STEPS`` and
            ``TEST_STEPS``.

    Raises:
        AssertionError: in distributed mode, if MPI multi-threading is not
            supported or Horovod and mpi4py disagree on size or rank.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    if config.IS_DISTRIBUTION:
        import horovod.tensorflow as hvd

        # initialize Horovod.
        hvd.init()
        num_worker = hvd.size()
        rank = hvd.rank()
        # verify that MPI multi-threading is supported.
        assert hvd.mpi_threads_supported()
        # make sure MPI is not re-initialized.
        import mpi4py.rc
        mpi4py.rc.initialize = False
        # import mpi4py
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        # check size and rank are synchronized
        assert num_worker == comm.Get_size()
        assert rank == comm.Get_rank()
    else:
        num_worker = 1
        rank = 0

    ModelClass = config.NETWORK_CLASS
    # Network options are stored upper-case; the model takes lower-case kwargs.
    network_kwargs = dict(
        (key.lower(), val) for key, val in config.NETWORK.items())
    if "train_validation_saving_size".upper() in config.DATASET.keys():
        use_train_validation_saving = config.DATASET.TRAIN_VALIDATION_SAVING_SIZE > 0
    else:
        use_train_validation_saving = False

    if use_train_validation_saving:
        top_train_validation_saving_set_accuracy = 0

    train_dataset = setup_dataset(config, "train", rank)
    print("train dataset num:", train_dataset.num_per_epoch)

    if use_train_validation_saving:
        train_validation_saving_dataset = setup_dataset(
            config, "train_validation_saving", rank)
        print("train_validation_saving dataset num:",
              train_validation_saving_dataset.num_per_epoch)

    validation_dataset = setup_dataset(config, "validation", rank)
    print("validation dataset num:", validation_dataset.num_per_epoch)

    graph = tf.Graph()
    with graph.as_default():
        # The constructor arguments depend on the network family, which is
        # identified by the module that defines the model class.
        if ModelClass.__module__.startswith("lmnet.networks.object_detection"):
            model = ModelClass(
                classes=train_dataset.classes,
                num_max_boxes=train_dataset.num_max_boxes,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )
        elif ModelClass.__module__.startswith("lmnet.networks.segmentation"):
            model = ModelClass(
                classes=train_dataset.classes,
                label_colors=train_dataset.label_colors,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )
        else:
            model = ModelClass(
                classes=train_dataset.classes,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )

        global_step = tf.Variable(0, name="global_step", trainable=False)
        is_training_placeholder = tf.placeholder(
            tf.bool, name="is_training_placeholder")

        images_placeholder, labels_placeholder = model.placeholderes()

        output = model.inference(images_placeholder, is_training_placeholder)
        if ModelClass.__module__.startswith("lmnet.networks.object_detection"):
            loss = model.loss(output, labels_placeholder,
                              is_training_placeholder)
        else:
            loss = model.loss(output, labels_placeholder)
        opt = model.optimizer(global_step)
        if config.IS_DISTRIBUTION:
            # add Horovod Distributed Optimizer
            opt = hvd.DistributedOptimizer(opt)
        train_op = model.train(loss, opt, global_step)
        metrics_ops_dict, metrics_update_op = model.metrics(
            output, labels_placeholder)
        # TODO(wakisaka): Deal with many networks.
        model.summary(output, labels_placeholder)

        summary_op = tf.summary.merge_all()

        metrics_summary_op, metrics_placeholders = executor.prepare_metrics(
            metrics_ops_dict)

        init_op = tf.global_variables_initializer()
        # Re-initialising local variables is used to reset the metrics.
        reset_metrics_op = tf.local_variables_initializer()
        if config.IS_DISTRIBUTION:
            # add Horovod broadcasting variables from rank 0 to all
            bcast_global_variables_op = hvd.broadcast_global_variables(0)

        if use_train_validation_saving:
            saver = tf.train.Saver(max_to_keep=1)
        else:
            saver = tf.train.Saver(max_to_keep=None)

        if config.IS_PRETRAIN:
            all_vars = tf.global_variables()
            pretrain_var_list = [
                var for var in all_vars
                if var.name.startswith(tuple(config.PRETRAIN_VARS))
            ]
            print("pretrain_vars", [var.name for var in pretrain_var_list])
            pretrain_saver = tf.train.Saver(pretrain_var_list,
                                            name="pretrain_saver")

    if config.IS_DISTRIBUTION:
        # For distributed training
        session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True, visible_device_list=str(hvd.local_rank())))
    else:
        # TODO(wakisaka): For debug.
        # session_config = tf.ConfigProto(
        #     gpu_options=tf.GPUOptions(
        #         allow_growth=True,
        #         per_process_gpu_memory_fraction=0.1
        #     )
        # )
        session_config = tf.ConfigProto(
        )  # tf.ConfigProto(log_device_placement=True)
    # TODO(wakisaka): XLA JIT
    # session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    # NOTE(review): the session and the summary writers below are never
    # closed explicitly in this function.
    sess = tf.Session(graph=graph, config=session_config)
    sess.run([init_op, reset_metrics_op])

    if rank == 0:
        train_writer = tf.summary.FileWriter(
            environment.TENSORBOARD_DIR + "/train", sess.graph)
        if use_train_validation_saving:
            train_val_saving_writer = tf.summary.FileWriter(
                environment.TENSORBOARD_DIR + "/train_validation_saving")
        val_writer = tf.summary.FileWriter(environment.TENSORBOARD_DIR +
                                           "/validation")

        if config.IS_PRETRAIN:
            print("------- Load pretrain data ----------")
            pretrain_saver.restore(
                sess, os.path.join(config.PRETRAIN_DIR, config.PRETRAIN_FILE))
            sess.run(tf.assign(global_step, 0))

        last_step = 0

        # for recovery
        ckpt = tf.train.get_checkpoint_state(environment.CHECKPOINTS_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            print("--------- Restore last checkpoint -------------")
            saver.restore(sess, ckpt.model_checkpoint_path)
            # saver.recover_last_checkpoints(ckpt.model_checkpoint_path)
            last_step = sess.run(global_step)
            # TODO(wakisaka): tensorflow v1.3 remain previous event log in tensorboard.
            # https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/python/training/supervisor.py#L1072
            train_writer.add_session_log(SessionLog(status=SessionLog.START),
                                         global_step=last_step + 1)
            val_writer.add_session_log(SessionLog(status=SessionLog.START),
                                       global_step=last_step + 1)
            print("recovered. last step", last_step)

    if config.IS_DISTRIBUTION:
        # broadcast variables from rank 0 to all other processes
        sess.run(bcast_global_variables_op)
        # calculate step per epoch for each nodes
        train_num_per_epoch = train_dataset.num_per_epoch
        num_per_nodes = (train_num_per_epoch + num_worker - 1) // num_worker
        # Clamp to 1: when a node's share is smaller than one batch the
        # integer division yields 0 and `step % step_per_epoch` in the loop
        # would raise ZeroDivisionError.
        step_per_epoch = max(1, num_per_nodes // config.BATCH_SIZE)
        begin_index = (train_num_per_epoch * rank) // num_worker
        end_index = begin_index + num_per_nodes

    # After the broadcast every rank reads the same global step.
    last_step = sess.run(global_step)

    # Calculate max steps. The priority of config.MAX_EPOCHS is higher than config.MAX_STEPS.
    if "MAX_EPOCHS" in config:
        max_steps = int(train_dataset.num_per_epoch / config.BATCH_SIZE *
                        config.MAX_EPOCHS)
    else:
        max_steps = config.MAX_STEPS
    print("max_steps: {}".format(max_steps))

    for step in range(last_step, max_steps):
        print("step", step)

        if config.IS_DISTRIBUTION:
            # scatter dataset
            # NOTE(review): after a restore, `last_step` may not be a multiple
            # of `step_per_epoch`, so no scatter happens until the next epoch
            # boundary -- confirm this is intended.
            if step % step_per_epoch == 0:
                indices = train_dataset.get_shuffle_index(
                ) if rank == 0 else None
                # broadcast shuffled indices
                indices = comm.bcast(indices, 0)
                feed_indices = indices[begin_index:end_index]
                # update each dataset by split indices
                train_dataset.update_dataset(feed_indices)

        images, labels = train_dataset.feed()

        feed_dict = {
            is_training_placeholder: True,
            images_placeholder: images,
            labels_placeholder: labels,
        }

        # The product is zero at step 0 and whenever (step + 1) is a multiple
        # of SUMMARISE_STEPS; summaries are written by rank 0 only.
        if step * ((step + 1) % config.SUMMARISE_STEPS) == 0 and rank == 0:
            # Runtime statistics for develop.
            # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            # run_metadata = tf.RunMetadata()

            sess.run(reset_metrics_op)
            _, summary, _ = sess.run(
                [train_op, summary_op, metrics_update_op],
                feed_dict=feed_dict,
                # options=run_options,
                # run_metadata=run_metadata,
            )
            # train_writer.add_run_metadata(run_metadata, "step: {}".format(step + 1))
            train_writer.add_summary(summary, step + 1)

            metrics_values = sess.run(list(metrics_ops_dict.values()))
            metrics_feed_dict = {
                placeholder: value
                for placeholder, value in zip(metrics_placeholders,
                                              metrics_values)
            }

            metrics_summary, = sess.run(
                [metrics_summary_op],
                feed_dict=metrics_feed_dict,
            )
            train_writer.add_summary(metrics_summary, step + 1)
        else:
            sess.run([train_op], feed_dict=feed_dict)

        to_be_saved = step == 0 or (
            step + 1) == max_steps or (step + 1) % config.SAVE_STEPS == 0

        if to_be_saved and rank == 0:
            if use_train_validation_saving:
                # Evaluate on the train_validation_saving set and save a
                # checkpoint only when its accuracy improves on the best so far.
                sess.run(reset_metrics_op)
                train_validation_saving_step_size = int(
                    math.ceil(train_validation_saving_dataset.num_per_epoch /
                              config.BATCH_SIZE))
                print("train_validation_saving_step_size",
                      train_validation_saving_step_size)

                current_train_validation_saving_set_accuracy = 0

                for train_validation_saving_step in range(
                        train_validation_saving_step_size):
                    print("train_validation_saving_step",
                          train_validation_saving_step)

                    images, labels = train_validation_saving_dataset.feed()
                    feed_dict = {
                        is_training_placeholder: False,
                        images_placeholder: images,
                        labels_placeholder: labels,
                    }

                    if train_validation_saving_step % config.SUMMARISE_STEPS == 0:
                        summary, _ = sess.run([summary_op, metrics_update_op],
                                              feed_dict=feed_dict)
                        train_val_saving_writer.add_summary(summary, step + 1)
                    else:
                        sess.run([metrics_update_op], feed_dict=feed_dict)

                metrics_values = sess.run(list(metrics_ops_dict.values()))
                metrics_feed_dict = {
                    placeholder: value
                    for placeholder, value in zip(metrics_placeholders,
                                                  metrics_values)
                }
                metrics_summary, = sess.run(
                    [metrics_summary_op],
                    feed_dict=metrics_feed_dict,
                )
                train_val_saving_writer.add_summary(metrics_summary, step + 1)

                current_train_validation_saving_set_accuracy = sess.run(
                    metrics_ops_dict["accuracy"])

                if current_train_validation_saving_set_accuracy > top_train_validation_saving_set_accuracy:
                    top_train_validation_saving_set_accuracy = current_train_validation_saving_set_accuracy
                    print("New top train_validation_saving accuracy is: ",
                          top_train_validation_saving_set_accuracy)

                    _save_checkpoint(saver, sess, global_step, step)

            else:
                _save_checkpoint(saver, sess, global_step, step)

            if step == 0:
                # check create pb on only first step.
                minimal_graph = tf.graph_util.convert_variables_to_constants(
                    sess,
                    sess.graph.as_graph_def(add_shapes=True),
                    ["output"],
                )
                pb_name = "minimal_graph_with_shape_{}.pb".format(step + 1)
                pbtxt_name = "minimal_graph_with_shape_{}.pbtxt".format(step +
                                                                        1)
                tf.train.write_graph(minimal_graph,
                                     environment.CHECKPOINTS_DIR,
                                     pb_name,
                                     as_text=False)
                tf.train.write_graph(minimal_graph,
                                     environment.CHECKPOINTS_DIR,
                                     pbtxt_name,
                                     as_text=True)

        # Validation runs on every rank; only rank 0 writes the summaries.
        if step == 0 or (step + 1) % config.TEST_STEPS == 0:
            # init metrics values
            sess.run(reset_metrics_op)
            test_step_size = int(
                math.ceil(validation_dataset.num_per_epoch /
                          config.BATCH_SIZE))
            print("test_step_size", test_step_size)

            for test_step in range(test_step_size):
                print("test_step", test_step)

                images, labels = validation_dataset.feed()
                feed_dict = {
                    is_training_placeholder: False,
                    images_placeholder: images,
                    labels_placeholder: labels,
                }

                if test_step % config.SUMMARISE_STEPS == 0:
                    summary, _ = sess.run([summary_op, metrics_update_op],
                                          feed_dict=feed_dict)
                    if rank == 0:
                        val_writer.add_summary(summary, step + 1)
                else:
                    sess.run([metrics_update_op], feed_dict=feed_dict)

            metrics_values = sess.run(list(metrics_ops_dict.values()))
            metrics_feed_dict = {
                placeholder: value
                for placeholder, value in zip(metrics_placeholders,
                                              metrics_values)
            }
            metrics_summary, = sess.run(
                [metrics_summary_op],
                feed_dict=metrics_feed_dict,
            )
            if rank == 0:
                val_writer.add_summary(metrics_summary, step + 1)

    # training loop end.
    print("reach max step")
def main(argv):
    """Train the MNIST model with Horovod inside a MonitoredTrainingSession.

    Builds the input placeholders and the graph returned by ``conv_model``,
    wraps an RMSProp optimizer with Horovod's distributed optimizer, and runs
    training steps until the stop hook ends the session.

    Args:
        argv: Unused.
    """
    # Initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets(training_data_dir)

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    # Only the loss is needed for training; the prediction tensor is unused.
    _, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.01)

    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # Save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = checkpoint_path if hvd.rank() == 0 else None

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Stop at a fixed global step. NOTE: unlike the usual Horovod
        # recipe, this value is not divided by hvd.size().
        tf.train.StopAtStepHook(last_step=20),

        tf.train.LoggingTensorHook(tensors={'step': global_step,
                                            'loss': loss},
                                   every_n_iter=10),
    ]

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def _cnn_model_function(features, labels, mode, params):
    """Estimator ``model_fn`` for the CNN: handles PREDICT, EVAL and TRAIN.

    Args:
        features: Input tensor passed to ``params['model']``. The
            ``[0, 3, 1, 2]`` transpose applied for ``'channels_first'``
            implies a rank-4 input (presumably NHWC -- confirm with the
            input pipeline).
        labels: Class-id tensor; flattened to rank 1. May be ``None`` (as it
            is in PREDICT mode), in which case it is left untouched.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Dict of hyper-parameters. Keys read: ``model``, ``format``,
            ``dtype``, ``momentum``, ``learning_rate_init``,
            ``learning_rate_power``, ``decay_steps``, ``weight_decay``,
            ``loss_scale``, ``larc_eta``, ``larc_mode``, ``deterministic``,
            ``n_classes``, ``dali_cpu``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` matching ``mode``.

    Raises:
        KeyError: If a required key is missing from ``params``.
    """
    model_func = params['model']
    model_format = params['format']
    model_dtype = params['dtype']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    decay_steps = params['decay_steps']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    deterministic = params['deterministic']
    num_classes = params['n_classes']
    # Not used below; the lookup is kept so a missing key still raises.
    dali_cpu = params['dali_cpu']

    device = '/gpu:0'
    if labels is not None:
        # Squash unnecessary unary dim. Guarded because labels are absent
        # in PREDICT mode and tf.reshape(None, ...) would raise.
        labels = tf.reshape(labels, (-1,))
    inputs = features  # TODO: Should be using feature columns?
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    with tf.device(device):
        inputs = tf.cast(inputs, model_dtype)
        if model_format == 'channels_first':
            inputs = tf.transpose(inputs, [0, 3, 1, 2])
        with nvutils.fp32_trainable_vars(
                regularizer=tf.contrib.layers.l2_regularizer(weight_decay)):
            top_layer = model_func(inputs, training=is_training)
            logits = tf.layers.dense(top_layer, num_classes)
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        logits = tf.cast(logits, tf.float32)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # tf.softmax does not exist in the TF1 API; softmax lives in
            # tf.nn.
            probabilities = tf.nn.softmax(logits)
            predictions = {
                'class_ids': predicted_classes[:, None],
                'probabilities': probabilities,
                'logits': logits,
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                      labels=labels)
        # For access by logger (TODO: Better way to access it?)
        loss = tf.identity(loss, name='loss')
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([loss] + reg_losses, name='total_loss')

        # Allow fallback to CPU if no GPU support for these ops
        with tf.device(None):
            top1_accuracy = tf.metrics.accuracy(
                labels=labels, predictions=predicted_classes)
            top5_accuracy = tf.metrics.mean(
                tf.nn.in_top_k(predictions=logits, targets=labels, k=5))
            tf.summary.scalar('top1_accuracy', top1_accuracy[1])
            tf.summary.scalar('top5_accuracy', top5_accuracy[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            metrics = {
                'top1_accuracy': top1_accuracy,
                'top5_accuracy': top5_accuracy,
            }
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)

        assert (mode == tf.estimator.ModeKeys.TRAIN)

        learning_rate = tf.train.polynomial_decay(
            learning_rate_init,
            tf.train.get_global_step(),
            decay_steps=decay_steps,
            end_learning_rate=0.,
            power=learning_rate_power,
            cycle=False,
            name='learning_rate')
        # Wrapping order: Momentum -> Horovod -> LARC -> loss scaling.
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         momentum,
                                         use_nesterov=True)
        opt = hvd.DistributedOptimizer(opt)
        opt = nvutils.LarcOptimizer(opt,
                                    learning_rate,
                                    larc_eta,
                                    clip=larc_mode)
        opt = nvutils.LossScalingOptimizer(opt, scale=loss_scale)
        gate_gradients = (tf.train.Optimizer.GATE_OP
                          if deterministic else tf.train.Optimizer.GATE_NONE)
        train_op = opt.minimize(loss,
                                global_step=tf.train.get_global_step(),
                                gate_gradients=gate_gradients,
                                name='step_update')
        # Ops in the UPDATE_OPS collection must run together with the
        # train step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        train_op = tf.group(train_op, *update_ops)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def training_deploy(self, model, funcs=None):
    """Build the training graph and session for ``model`` and initialize it.

    Creates a fresh graph and session, builds the model clones through
    ``tfmodel_deploy``, configures the learning rate and optimizer, collects
    summaries, initializes variables, starts queue runners, restores
    checkpoints and, when ``self.is_distribute_training`` is set, broadcasts
    rank 0's global variables with Horovod.

    NOTE(review): the block nesting below was reconstructed from a flattened
    source; confirm the extent of the ``tf.device`` scopes and of the rank-0
    restore guard against the upstream file.

    Args:
        model: Object exposing ``model_fn``; also handed to the
            checkpoint-restore helpers.
        funcs: Optional iterable of zero-argument callables. Each one is
            called once after the clones are created and its result is
            appended to ``self._create_funcs``. Defaults to no callables.
    """
    # Horovod: initialize Horovod (prepare MPI environment)
    if self.is_distribute_training:
        import horovod.tensorflow as hvd
        hvd.init()

        # reset num_clones = 1
        self.num_clones = 1
        self.rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        # Only rank 0 is allowed to be verbose.
        get_global_context().quiet = self.rank != 0

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default() as graph:
        # Default graph
        self.graph = graph

        # Session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        devices = self.ctx.devices if len(self.ctx.devices) > 0 else self.devices
        config.gpu_options.visible_device_list = \
            ','.join(str(x) for x in devices) if len(devices) > 0 else ''
        self.sess = tf.Session(graph=graph, config=config)

        #######################
        # Config model deploy #
        #######################
        deploy_config = tfmodel_deploy.DeploymentConfig(
            num_clones=self.num_clones,
            devices=[],
            clone_on_cpu=self.clone_on_cpu,
            replica_id=self.replica_id,
            num_replicas=self.worker_replicas,
            num_ps_tasks=self.num_ps_tasks,
            clone_id_map={0: self.local_rank} if self.is_distribute_training else {})

        # init some info
        with tf.device(deploy_config.inputs_device()):
            # Create global_step
            with tf.device(deploy_config.variables_device()):
                global_step = slim.get_or_create_global_step()

            ###################################
            ####  define model input (CPU)   ##
            ###################################
            with tf.variable_scope('input'):
                data_queue = self.ctx.model.model_input(self.is_training)
                if data_queue is not None:
                    self._has_model_input = True

            ###################################
            ####  define model (CPU or GPU)   #
            ###################################
            func = model.model_fn

            @functools.wraps(func)
            def network_fn(*args, **kwargs):
                res = func(self.is_training, *args, **kwargs)
                if kwargs['clone'] == 0:
                    # Save the graph definition once, for the first clone.
                    tf.train.write_graph(self.sess.graph_def,
                                         self.dump_dir,
                                         'graph.pbtxt')
                return res

            ####################################
            #######    Create summary   ########
            ####################################
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            ####################################
            ####### Create model clones ########
            ####################################
            self.clones = tfmodel_deploy.create_clones(
                deploy_config,
                network_fn,
                [data_queue] if data_queue is not None else None)
            first_clone_scope = deploy_config.clone_scope(0)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                           first_clone_scope)

            for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                          first_clone_scope):
                summaries.add(
                    tf.summary.scalar('losses/%s' % loss.op.name, loss))

            # create other func
            for create_func in funcs or ():
                self._create_funcs.append(create_func())

            #########################################
            # Configure the optimization procedure. #
            #########################################
            with tf.device(deploy_config.optimizer_device()):
                # samples total number
                num_samples = (self.num_samples if self.num_samples > 0
                               else self.ctx.data_source.size)

                # Horovod: adjust learning rate based on number of GPUs
                self.lr = _configure_learning_rate(self, num_samples,
                                                   global_step)
                summaries.add(tf.summary.scalar('learning_rate', self.lr))

                # config optimizer
                optimizer = _configure_optimizer(self, self.lr)

                # Horovod: add Horovod Distributed Optimizer
                if self.is_distribute_training:
                    optimizer = hvd.DistributedOptimizer(optimizer)

            # Variables to train.
            variables_to_train = _get_variables_to_train(self)

            with tf.control_dependencies(self.model_dependence):
                # Train_tensor
                total_loss, clones_gradients = tfmodel_deploy.optimize_clones(
                    self.clones,
                    optimizer,
                    regularization_losses=None if self.regularization_loss else [],
                    var_list=variables_to_train)
                summaries.add(tf.summary.scalar('total_loss', total_loss))

                # Create gradient updates.
                grad_updates = optimizer.apply_gradients(
                    clones_gradients, global_step=global_step)

                # Value ops
                update_ops.append(grad_updates)
                update_op = tf.group(*update_ops)
                with tf.control_dependencies([update_op]):
                    # Fetching this tensor forces the update ops to run.
                    train_tensor = tf.identity(total_loss, name='train_op')

            # Fetch list: loss tensor first, then the first clone's outputs.
            # Exact type checks (not isinstance) are kept on purpose so that
            # list/tuple subclasses are still appended as a single fetch.
            val_ops = [train_tensor]
            clone_outputs = self.clones[0].outputs
            if clone_outputs is not None:
                if type(clone_outputs) in (list, tuple):
                    val_ops.extend(clone_outputs)
                else:
                    val_ops.append(clone_outputs)
            self.val_ops = val_ops

            summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                               first_clone_scope))

            # Merge all summaries together.
            self.summary_op = tf.summary.merge(list(summaries),
                                               name='summary_op')
            if self.summary_op is not None:
                # The summary op goes first in the fetch list.
                self.val_ops = [self.summary_op] + self.val_ops

            # summary write
            summary_dir = os.path.join(self.dump_dir, 'summary')
            if not os.path.exists(summary_dir):
                os.makedirs(summary_dir)
            self.train_writer = tf.summary.FileWriter(summary_dir, graph)

            # Global initialization
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(tf.local_variables_initializer())

            # coord
            self.coord = tf.train.Coordinator()
            self.threads = tf.train.start_queue_runners(sess=self.sess,
                                                        coord=self.coord)

            custom_dataset_queue = tf.get_collection('CUSTOM_DATASET_QUEUE')
            if len(custom_dataset_queue) > 0:
                custom_dataset_queue[0].coord = self.coord
                custom_threads = custom_dataset_queue[0].start_threads(self.sess)
                self.threads.extend(custom_threads)

            # Training saver
            self.saver = tf.train.Saver(max_to_keep=2)

            # Restore from checkpoint. In distributed mode only rank 0
            # restores; the other ranks receive the values via the
            # broadcast below.
            # NOTE(review): the flattened source does not show whether the
            # auxiliary restores sit inside this guard -- confirm.
            if not self.is_distribute_training or self.rank == 0:
                restore_fns = _get_init_fn(self, model, self.dump_dir,
                                           self.ctx)
                if restore_fns is not None:
                    for restore_fn in restore_fns:
                        restore_fn(self.sess)

                # Restore from custom auxiliary init funcs
                for func in self._aux_init_funcs:
                    func(self.sess)

                # Restore from auxiliary checkpoint
                for auxilary_scope, auxilary_checkpoint in \
                        self.auxilary_checkpoints.items():
                    self.restore_scopy_from(model, auxilary_scope,
                                            auxilary_checkpoint)

            # Horovod broadcast global variables. The hook is driven by
            # hand because a plain tf.Session is used.
            if self.is_distribute_training:
                bgv = hvd.BroadcastGlobalVariablesHook(0)
                bgv.begin()
                bgv.after_create_session(self.sess, self.coord)
def create_optimizer(hparams, loss):
    """Build the training op for ``loss`` as selected by ``hparams.optimizer``.

    Supported values are ``"sgd"``, ``"adam"`` and ``"bert_adam"``. For
    ``"bert_adam"`` the learning rate follows a linear decay with optional
    linear warmup, and when ``hparams.lr_bert`` is set the variables whose
    name starts with ``'bert'`` are updated by a second optimizer whose
    learning rate is scaled by ``lr_bert / learning_rate``.

    Args:
        hparams: Hyper-parameter object. Attributes read: ``optimizer``,
            ``learning_rate``, ``num_train_steps``, ``num_warmup_steps``,
            ``lr_bert``, ``use_horovod`` (bert_adam only) and
            ``max_gradient_norm`` (sgd/adam only).
        loss: Scalar loss tensor to minimize.

    Returns:
        Tuple ``(train_op, grad_norm, learning_rate)``.

    Raises:
        ValueError: If ``hparams.optimizer`` is not a supported name.
    """
    trainable = tf.trainable_variables()

    # Print trainable variables and count the elements of the BERT ones.
    # NOTE(review): the per-variable print is assumed to cover every
    # trainable variable, not only the 'bert' ones -- confirm.
    print("# Trainable variables")
    bert_param_count = 0
    for variable in trainable:
        if variable.name.startswith('bert'):
            element_count = 1
            for dim in variable.get_shape():
                element_count *= dim
            bert_param_count += element_count
        print(" %s, %s, %s" % (variable.name, str(variable.get_shape()),
                               variable.op.device))
    print('total bert parameters:', bert_param_count)

    # Optimizer parameters
    base_lr = hparams.learning_rate
    train_steps = hparams.num_train_steps
    warmup_steps = hparams.num_warmup_steps
    lr_bert = hparams.lr_bert

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.constant(value=base_lr, shape=[], dtype=tf.float32)

    if hparams.optimizer != "bert_adam":
        # Plain optimizers: constant learning rate, global-norm clipping.
        if hparams.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        elif hparams.optimizer == "adam":
            opt = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError(
                "Only support sgd/adam/bert_adam as optimizer option")

        raw_grads = tf.gradients(loss, trainable,
                                 colocate_gradients_with_ops=True)
        clipped, grad_norm = tf.clip_by_global_norm(
            raw_grads, hparams.max_gradient_norm)
        train_op = opt.apply_gradients(zip(clipped, trainable),
                                       global_step=global_step)
        return train_op, grad_norm, learning_rate

    # ---- bert_adam: optimizer following BERT's implementation ----

    def _new_adam(rate):
        # Both optimizers share every setting except the learning rate.
        return AdamWeightDecayOptimizer(
            learning_rate=rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    # Linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warmup: while global_step < num_warmup_steps the learning rate
    # is `global_step / num_warmup_steps * init_lr`.
    if warmup_steps:
        step_int = tf.cast(global_step, tf.int32)
        warmup_int = tf.constant(warmup_steps, dtype=tf.int32)

        step_float = tf.cast(step_int, tf.float32)
        warmup_float = tf.cast(warmup_int, tf.float32)

        fraction_done = step_float / warmup_float
        warmup_lr = base_lr * fraction_done

        in_warmup = tf.cast(step_int < warmup_int, tf.float32)
        learning_rate = ((1.0 - in_warmup) * learning_rate +
                         in_warmup * warmup_lr)

    # It is recommended that you use this optimizer for fine tuning, since
    # this is how the model was trained (note that the Adam m/v variables
    # are NOT loaded from init_checkpoint.)
    optimizer = _new_adam(learning_rate)

    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        # Horovod's distributed optimizer handles allreduce calls,
        # synchronous only
        optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
        pairs = optimizer.compute_gradients(loss, trainable)
        grads = [g for g, _ in pairs]
        trainable = [v for _, v in pairs]
    else:
        grads = tf.gradients(loss, trainable)
    grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)

    if lr_bert is None:
        # No separate learning rate for BERT: every component shares one.
        apply_op = optimizer.apply_gradients(zip(grads, trainable),
                                             global_step=global_step)
        # Normally the global step update is done inside of
        # `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't do
        # this. But if you use a different optimizer, you should probably
        # take this line out.
        next_step = global_step + 1
        train_op = tf.group(apply_op, [global_step.assign(next_step)])
        return train_op, grad_norm, learning_rate

    # The BERT components use another learning rate.
    optimizer_bert = _new_adam(learning_rate * lr_bert / base_lr)
    if hparams.use_horovod:
        # Treat the bert optimizer the same as the original optimizer:
        # wrapped with horovod
        optimizer_bert = hvd.DistributedOptimizer(optimizer_bert,
                                                  sparse_as_dense=True)

    bert_grads, bert_vars = [], []
    other_grads, other_vars = [], []
    for grad, var in zip(grads, trainable):
        if var is None or grad is None:
            continue
        if var.name.startswith('bert'):
            bert_grads.append(grad)
            bert_vars.append(var)
            print('****bert param:', var.name)
        else:
            other_grads.append(grad)
            other_vars.append(var)
            print('****other param:', var.name)
    print('--------------\n', '# of bert', len(bert_grads), '# of other',
          len(other_grads), '\n--------------')

    bert_train_op = optimizer_bert.apply_gradients(
        zip(bert_grads, bert_vars), global_step=global_step)
    other_train_op = optimizer.apply_gradients(
        zip(other_grads, other_vars), global_step=global_step)

    next_step = global_step + 1
    train_op = tf.group(bert_train_op, other_train_op,
                        [global_step.assign(next_step)])
    return train_op, grad_norm, learning_rate
def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Relies on names from the enclosing scope (``detection_model_fn``,
    ``train_config``, ``eval_config``, ``eval_input_config``, ``configs``,
    ``hparams``, ``use_tpu``, ``postprocess_on_cpu``).

    Args:
        features: Dictionary of feature tensors, returned from `input_fn`.
        labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
        mode: Mode key from tf.estimator.ModeKeys.
        params: Parameter dictionary passed from the estimator.

    Returns:
        An `EstimatorSpec` (or `TPUEstimatorSpec` when running on TPU outside
        of EVAL) that encapsulates the model and its serving configurations.
    """
    params = params or {}
    total_loss = train_op = detections = export_outputs = None

    mode_keys = tf.estimator.ModeKeys
    is_training = mode == mode_keys.TRAIN
    is_eval = mode == mode_keys.EVAL
    is_predict = mode == mode_keys.PREDICT
    input_fields = fields.InputDataFields

    # Make sure to set the Keras learning phase. True during training,
    # False for inference.
    tf.keras.backend.set_learning_phase(is_training)
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    if is_training:
        labels = unstack_batch(
            labels,
            unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif is_eval:
        # For evaling on train data, it is necessary to check whether
        # groundtruth must be unpadded.
        boxes_shape = labels[
            input_fields.groundtruth_boxes].get_shape().as_list()
        should_unpad = boxes_shape[1] is not None and not use_tpu
        labels = unstack_batch(labels,
                               unpad_groundtruth_tensors=should_unpad)

    if is_training or is_eval:
        # Boxes and classes are mandatory; the other fields are optional
        # and passed as None when absent from `labels`.
        detection_model.provide_groundtruth(
            groundtruth_boxes_list=labels[input_fields.groundtruth_boxes],
            groundtruth_classes_list=labels[input_fields.groundtruth_classes],
            groundtruth_confidences_list=labels.get(
                input_fields.groundtruth_confidences),
            groundtruth_masks_list=labels.get(
                input_fields.groundtruth_instance_masks),
            groundtruth_keypoints_list=labels.get(
                input_fields.groundtruth_keypoints),
            groundtruth_weights_list=labels.get(
                input_fields.groundtruth_weights),
            groundtruth_is_crowd_list=labels.get(
                input_fields.groundtruth_is_crowd))

    preprocessed_images = features[input_fields.image]
    if use_tpu and train_config.use_bfloat16:
        with tf.contrib.tpu.bfloat16_scope():
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[input_fields.true_image_shape])
            # Cast bfloat16 predictions back to float32.
            for key, tensor in prediction_dict.items():
                if tensor.dtype == tf.bfloat16:
                    prediction_dict[key] = tf.cast(tensor, tf.float32)
    else:
        prediction_dict = detection_model.predict(
            preprocessed_images, features[input_fields.true_image_shape])

    def postprocess_wrapper(args):
        return detection_model.postprocess(args[0], args[1])

    if is_eval or is_predict:
        postprocess_args = (prediction_dict,
                            features[input_fields.true_image_shape])
        if use_tpu and postprocess_on_cpu:
            detections = tf.contrib.tpu.outside_compilation(
                postprocess_wrapper, postprocess_args)
        else:
            detections = postprocess_wrapper(postprocess_args)

    if (is_training and train_config.fine_tune_checkpoint
            and hparams.load_pretrained):
        if not train_config.fine_tune_checkpoint_type:
            # train_config.from_detection_checkpoint field is deprecated. For
            # backward compatibility, set
            # train_config.fine_tune_checkpoint_type based on
            # train_config.from_detection_checkpoint.
            if train_config.from_detection_checkpoint:
                train_config.fine_tune_checkpoint_type = 'detection'
            else:
                train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map,
                train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(
                    train_config.fine_tune_checkpoint, available_var_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)

    if is_training or is_eval:
        losses_dict = detection_model.loss(
            prediction_dict, features[input_fields.true_image_shape])
        losses = list(losses_dict.values())
        if train_config.add_regularization_loss:
            regularization_losses = detection_model.regularization_losses()
            if regularization_losses:
                regularization_loss = tf.add_n(regularization_losses,
                                               name='regularization_loss')
                losses.append(regularization_loss)
                losses_dict['Loss/regularization_loss'] = regularization_loss
        total_loss = tf.add_n(losses, name='total_loss')
        losses_dict['Loss/total_loss'] = total_loss

        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=is_training)
            graph_rewriter_fn()

        # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode
        # once we can write learning rate summaries on TPU without host
        # calls.
        global_step = tf.train.get_or_create_global_step()
        training_optimizer, optimizer_summary_vars = optimizer_builder.build(
            train_config.optimizer)

    if is_training:
        if use_tpu:
            training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
                training_optimizer)

        ## ADDED for multi-gpu
        # NOTE(review): assumed to apply to every TRAIN run, not only under
        # `use_tpu` -- the flattened source is ambiguous; confirm.
        training_optimizer = hvd.DistributedOptimizer(training_optimizer,
                                                      device_dense='/cpu:0')

        # Optionally freeze some layers by setting their gradients to be
        # zero.
        include_variables = (train_config.update_trainable_variables
                             if train_config.update_trainable_variables
                             else None)
        exclude_variables = (train_config.freeze_variables
                             if train_config.freeze_variables else None)
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            include_patterns=include_variables,
            exclude_patterns=exclude_variables)

        if train_config.gradient_clipping_by_norm > 0:
            clip_gradients_value = train_config.gradient_clipping_by_norm
        else:
            clip_gradients_value = None

        if not use_tpu:
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var)

        if train_config.summarize_gradients:
            summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
        elif use_tpu:
            summaries = []
        else:
            summaries = None

        train_op = tf.contrib.layers.optimize_loss(
            loss=total_loss,
            global_step=global_step,
            learning_rate=None,
            clip_gradients=clip_gradients_value,
            optimizer=training_optimizer,
            update_ops=detection_model.updates(),
            variables=trainable_variables,
            summaries=summaries,
            name='')  # Preventing scope prefix on all variables.

    if is_predict:
        exported_output = exporter_lib.add_output_tensor_nodes(detections)
        export_outputs = {
            tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
        }

    eval_metric_ops = None
    scaffold = None
    if is_eval:
        class_agnostic = (fields.DetectionResultFields.detection_classes
                          not in detections)
        groundtruth = _prepare_groundtruth_for_eval(
            detection_model, class_agnostic,
            eval_input_config.max_number_of_boxes)

        use_original_images = input_fields.original_image in features
        if use_original_images:
            eval_images = features[input_fields.original_image]
            true_image_shapes = tf.slice(
                features[input_fields.true_image_shape], [0, 0], [-1, 3])
            original_image_spatial_shapes = features[
                input_fields.original_image_spatial_shape]
        else:
            eval_images = features[input_fields.image]
            true_image_shapes = None
            original_image_spatial_shapes = None

        eval_dict = eval_util.result_dict_for_batched_example(
            eval_images,
            features[inputs.HASH_KEY],
            detections,
            groundtruth,
            class_agnostic=class_agnostic,
            scale_to_absolute=True,
            original_image_spatial_shapes=original_image_spatial_shapes,
            true_image_shapes=true_image_shapes)

        if class_agnostic:
            category_index = (
                label_map_util.create_class_agnostic_category_index())
        else:
            category_index = (
                label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path))

        vis_metric_ops = None
        if not use_tpu and use_original_images:
            eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                category_index,
                max_examples_to_draw=eval_config.num_visualizations,
                max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                min_score_thresh=eval_config.min_score_threshold,
                use_normalized_coordinates=False)
            vis_metric_ops = (
                eval_metric_op_vis.get_estimator_eval_metric_ops(eval_dict))

        # Eval metrics on a single example.
        eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
            eval_config, list(category_index.values()), eval_dict)
        for loss_key, loss_tensor in losses_dict.items():
            eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
        for var in optimizer_summary_vars:
            eval_metric_ops[var.op.name] = (var, tf.no_op())
        if vis_metric_ops is not None:
            eval_metric_ops.update(vis_metric_ops)
        # Metric names must be plain strings.
        eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

        if eval_config.use_moving_averages:
            variable_averages = tf.train.ExponentialMovingAverage(0.0)
            variables_to_restore = variable_averages.variables_to_restore()
            saver = tf.train.Saver(
                variables_to_restore,
                keep_checkpoint_every_n_hours=(
                    train_config.keep_checkpoint_every_n_hours))
            scaffold = tf.train.Scaffold(saver=saver)

    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and not is_eval:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            scaffold_fn=scaffold_fn,
            predictions=detections,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metric_ops,
            export_outputs=export_outputs)

    if scaffold is None:
        saver = tf.train.Saver(
            sharded=True,
            keep_checkpoint_every_n_hours=(
                train_config.keep_checkpoint_every_n_hours),
            save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        scaffold = tf.train.Scaffold(saver=saver)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=detections,
                                      loss=total_loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops,
                                      export_outputs=export_outputs,
                                      scaffold=scaffold)
def main(_):
    """Builds the GPT-2 model, then trains / evaluates / tests it.

    The graph combines a maximum-likelihood loss on the `x1x4` sequence with a
    reward-weighted loss on a sampled continuation (used when `FLAGS.sc_rl` is
    set).  What is run afterwards is selected by `FLAGS.do_train`,
    `FLAGS.do_eval` and `FLAGS.do_test`.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If `FLAGS.config_type` is neither "json" nor "texar".
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    if len(config_train.name) > 0:
        output_dir = os.path.join(FLAGS.output_dir, config_train.name)
    else:
        output_dir = FLAGS.output_dir
    tx.utils.maybe_create_dir(output_dir)

    ## Loads GPT-2 model configuration

    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    # Creates a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder(FLAGS.pretrained_model_dir)
    end_token = proc.encoder['<|endoftext|>']

    max_decoding_length = config_train.max_decoding_length
    assert max_decoding_length <= gpt2_config.position_size, (
        "max_decoding_length should not be greater than position_size. "
        "{}>{}".format(max_decoding_length, gpt2_config.position_size))

    ## Loads data

    # Configures training data shard in distributed mode
    if FLAGS.distributed:
        config_train.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_train.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_train.train_hparam["batch_size"] //= hvd.size()

    # All three datasets are always created, regardless of the do_* flags.
    datasets = {}
    train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam)
    datasets['train'] = train_dataset
    dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam)
    datasets['dev'] = dev_dataset
    test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam)
    datasets['test'] = test_dataset

    iterator = tx.data.FeedableDataIterator(datasets)
    batch = iterator.get_next()
    batch_size = tf.shape(batch['x1x4_ids'])[0]

    ## Builds the GPT-2 model
    vocab_size = gpt2_config.vocab_size

    word_embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                            hparams=gpt2_config.embed)

    pos_embedder = tx.modules.PositionEmbedder(
        position_size=gpt2_config.position_size,
        hparams=gpt2_config.pos_embed)

    # Ties output layer with input word embedding
    output_layer = tf.transpose(word_embedder.embedding, (1, 0))

    decoder = tx.modules.TransformerDecoder(vocab_size=vocab_size,
                                            output_layer=output_layer,
                                            hparams=gpt2_config.decoder)

    def _embedding_fn(ids, times):
        """Sums word and position embeddings for the decoder."""
        return word_embedder(ids) + pos_embedder(times)

    # For training
    def _get_recon_loss(ids,
                        full_len,
                        prefix_len=None,
                        mask_prefix=True,
                        do_print=False):
        """Teacher-forced cross-entropy of `ids` under the decoder.

        With `mask_prefix=True` the first `prefix_len - 1` target positions
        are excluded and the remaining token losses are averaged into one
        value.  With `mask_prefix=False` the loss is averaged over timesteps
        only, leaving one value per batch element.  With `do_print=True`
        (and `mask_prefix=True`) a tuple
        `(loss, mask, unreduced_masked_loss)` is returned instead.
        """
        ids = ids[:, :tf.reduce_max(full_len)]
        batch_size__ = tf.shape(ids)[0]
        seq_len = tf.fill([batch_size__], tf.shape(ids)[1])
        pos_embeds = pos_embedder(sequence_length=seq_len)
        input_embeds = word_embedder(ids) + pos_embeds

        # greedy output
        outputs = decoder(inputs=input_embeds,
                          decoding_strategy='train_greedy')

        max_full_len = tf.reduce_max(full_len)
        ids = ids[:, :max_full_len]
        logits = outputs.logits[:, :max_full_len]

        if mask_prefix:
            # Per-token loss; labels are the inputs shifted left by one.
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len - 1,
                average_across_timesteps=False,
                sum_over_timesteps=False,
                average_across_batch=False,
                sum_over_batch=False)
            mask_recon = tf.sequence_mask(full_len - 1, dtype=tf.float32)
            mask_recon_prefix = 1 - tf.sequence_mask(
                prefix_len - 1,
                maxlen=max_full_len - 1,  # max_decoding_length-1,
                dtype=tf.float32)
            mask_recon = mask_recon * mask_recon_prefix

            if do_print:
                print_op_1 = tf.print(mask_recon)
                loss_recon_flat = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=False,
                    sum_over_remaining=False,
                    average_across_batch=False)
                print_op_2 = tf.print(loss_recon_flat)
                with tf.control_dependencies([print_op_1, print_op_2]):
                    loss_recon = tx.utils.reduce_with_weights(
                        tensor=loss_recon,
                        weights=mask_recon,
                        average_across_remaining=True,
                        sum_over_remaining=False)
                return loss_recon, mask_recon, loss_recon_flat
            else:
                loss_recon = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=True,
                    sum_over_remaining=False)
        else:
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len - 1,
                average_across_timesteps=True,
                sum_over_timesteps=False,
                average_across_batch=False,
                sum_over_batch=False)

        return loss_recon

    # For RL fine-tuning
    def _get_sample_story(context_ids, context_len):
        """Samples a continuation of the context ('infer_sample')."""
        sample_output, sample_len = decoder(
            decoding_strategy='infer_sample',
            embedding=_embedding_fn,
            context=context_ids,
            context_sequence_length=context_len,
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            softmax_temperature=FLAGS.temperature,
            mode=tf.estimator.ModeKeys.PREDICT)

        return sample_output, sample_len

    def _get_sample_rolled(output, length, context_len):
        """Rolls sampled ids by `-context_len` and trims to the new length."""
        ids = output.sample_id
        ids = tx.utils.varlength_roll(ids, -context_len)
        ids_len = length - context_len
        ids = ids[:, :tf.reduce_max(ids_len)]

        return ids, ids_len

    # NOTE(review): compute_batch_loss is not called anywhere in main().  It
    # passes labels `ids[:, 1:]` together with unsliced `logits`, unlike
    # _get_recon_loss which uses `logits[:, :-1, :]` -- confirm before reuse.
    def compute_batch_loss(output, sample_len, context_len):
        max_full_len = tf.reduce_max(sample_len)
        ids = output.sample_id[:, :max_full_len]
        logits = output.logits[:, :max_full_len]  # (bs, sl, vocab)

        sampleLogprobs = tx.losses.sequence_sparse_softmax_cross_entropy(
            labels=ids[:, 1:],
            logits=logits,
            sequence_length=sample_len - 1,
            average_across_timesteps=False,
            sum_over_timesteps=False,
            average_across_batch=False,
            sum_over_batch=False)

        mask = tf.sequence_mask(sample_len - 1, dtype=tf.float32)
        mask_prefix = 1 - tf.sequence_mask(
            context_len - 1,
            maxlen=max_full_len - 1,  # max_decoding_length-1,
            dtype=tf.float32)
        mask = mask * mask_prefix

        batch_loss = tx.utils.reduce_with_weights(
            tensor=sampleLogprobs,
            weights=mask,
            average_across_batch=False,
            average_across_remaining=True,
            sum_over_remaining=False)

        return batch_loss

    def _get_greedy_story(context_ids, context_len):
        """Greedy-decodes a continuation and rolls it by `-context_len`."""
        greedy_res, greedy_len = decoder(
            decoding_strategy='infer_greedy',
            embedding=_embedding_fn,
            context=context_ids,
            context_sequence_length=context_len,
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            mode=tf.estimator.ModeKeys.PREDICT)

        greedy_ids = tx.utils.varlength_roll(greedy_res.sample_id,
                                             -context_len)
        greedy_ids_len = greedy_len - context_len
        greedy_ids = greedy_ids[:, :tf.reduce_max(greedy_ids_len)]

        return greedy_ids, greedy_ids_len

    ## ROC Loss-1: ML loss
    x1_len = tf.placeholder(tf.int32, shape=[None], name='x1_len')
    x1x4_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1x4_ids')
    x1x4_len = tf.placeholder(tf.int32, shape=[None], name='x1x4_len')

    loss_fine = _get_recon_loss(x1x4_ids, x1x4_len, x1_len)

    x1_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1_ids')
    reward = tf.placeholder(tf.float32, shape=[None], name="reward")
    sampled_story = tf.placeholder(tf.int32,
                                   shape=[None, None],
                                   name="sampled_story")
    sampled_story_len = tf.placeholder(tf.int32,
                                       shape=[None],
                                       name='sample_story_len')

    ## Loss-2: RL loss
    symbols_output, symbols_len = _get_sample_story(x1_ids, x1_len)
    symbols_rl, len_rl = _get_sample_rolled(symbols_output, symbols_len,
                                            x1_len)
    symbols_gr, len_gr = _get_greedy_story(x1_ids, x1_len)

    # Per-example loss of the fed-back sampled story, weighted by the reward.
    batch_loss_rl = _get_recon_loss(sampled_story,
                                    sampled_story_len,
                                    mask_prefix=False)
    rl_loss_fine = tf.reduce_mean(batch_loss_rl * reward)

    # Not called anywhere in main(); kept as is.
    def _get_beam_ids(context_ids, context_len, target):
        # beam-search
        predictions = decoder(beam_width=5,
                              length_penalty=config_train.length_penalty,
                              embedding=_embedding_fn,
                              context=context_ids,
                              context_sequence_length=context_len,
                              max_decoding_length=max_decoding_length,
                              end_token=end_token,
                              mode=tf.estimator.ModeKeys.PREDICT)

        beam_output_ids = tx.utils.varlength_roll(
            predictions["sample_id"][:, :, 0], -context_len)
        target_ids = tx.utils.varlength_roll(target, -context_len)

        return beam_output_ids, target_ids

    target_ids = tx.utils.varlength_roll(x1x4_ids, -x1_len)

    # `tau` is fed during training/evaluation but no op built here reads it.
    tau = tf.placeholder(tf.float32, shape=[], name='tau')

    if not FLAGS.sc_rl:
        loss = config_train.w_fine * loss_fine
        loss_dict = {
            'loss': loss,
            'loss_fine': config_train.w_fine * loss_fine,
        }
    else:
        loss = (1 - config_train.w_rl
                ) * config_train.w_fine * loss_fine + config_train.w_rl * (
                    config_train.w_fine_rl * rl_loss_fine)
        loss_dict = {
            'loss': loss,
            'loss_fine':
            (1 - config_train.w_rl) * config_train.w_fine * loss_fine,
            'rl_loss_fine':
            config_train.w_rl * config_train.w_fine_rl * rl_loss_fine,
        }

    ## Inference
    def _infer(context_name):
        """Top-k samples a continuation of `batch['<context_name>_ids']`."""
        helper = tx.modules.TopKSampleEmbeddingHelper(
            embedding=_embedding_fn,
            start_tokens=batch['%s_ids' % context_name][:, 0],
            end_token=end_token,
            top_k=FLAGS.top_k,
            softmax_temperature=FLAGS.temperature)
        outputs_infer, len_infer = decoder(
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            helper=helper)  # outputs_infer contains sample_id and logits
        # shift beginning indices (context) to end
        yy_ids = tx.utils.varlength_roll(outputs_infer.sample_id,
                                         -batch['%s_len' % context_name])
        yy_len = len_infer - batch['%s_len' % context_name]
        yy_ids = yy_ids[:, :tf.reduce_max(yy_len)]
        return yy_ids, yy_len

    x4_ids_fine, x4_len_fine = _infer('x1')

    def _infer_beam_ids(context_name):
        # beam-search
        predictions = decoder(
            beam_width=5,
            length_penalty=config_train.length_penalty,
            embedding=_embedding_fn,
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            mode=tf.estimator.ModeKeys.PREDICT)

        beam_output_ids = tx.utils.varlength_roll(
            predictions["sample_id"][:, :, 0],
            -batch['%s_len' % context_name])

        return beam_output_ids

    # Built but currently not fetched (the fetch in _dev_epoch is disabled).
    beam_search_ids = _infer_beam_ids('x1')

    ## Optimization
    trainable_variables = tx.utils.collect_trainable_variables(
        [word_embedder, pos_embedder, decoder])
    global_step = tf.Variable(0, trainable=False)
    opt = tx.core.get_optimizer(global_step=global_step,
                                hparams=config_train.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt,
                                               variables=trainable_variables)

    ## Train/eval/test routine
    saver = tf.train.Saver()
    saver_best = tf.train.Saver(max_to_keep=1)
    dev_best = {
        'loss': 1e8,
        'loss_fine': 1e8,
        'rl_loss_fine': 1e8,
        'best_reward': -1e8,
        'bleu': 0.,
        'meteor': 0.
    }

    def _log_losses(losses, step=None):
        """Logs whichever of the known loss values are present in `losses`.

        'rl_loss_fine' only exists in `loss_dict` when FLAGS.sc_rl is set, so
        it is skipped when missing instead of raising KeyError.
        """
        loss_names = ('loss', 'loss_fine', 'rl_loss_fine')
        loss_str = ', '.join('%s: %.4f' % (name, losses[name])
                             for name in loss_names if name in losses)

        if step is not None:
            loss_str = 'step: %d, %s' % (step, loss_str)

        _log(loss_str)

    def _is_head():
        """True when not distributed, or on the Horovod rank-0 process."""
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess, initial=False):
        """Trains on the training set, and evaluates on the dev set
        periodically.

        `initial` is accepted for the caller's convenience but is not used.
        """
        # load train arc label data (closed as soon as it has been read)
        with open(os.path.join(config_train.arc_data,
                               "train_mapped.txt")) as arc_fp:
            train_arc_file = [line.strip().split() for line in arc_fp]

        iterator.restart_dataset(sess, 'train')

        while True:
            try:
                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)

                # Sampled and greedy continuations for the same contexts.
                reward_fetches = {
                    'sample_rl': symbols_rl,
                    'sample_len': len_rl,
                    'greedy_sym': symbols_gr,
                    'greedy_len': len_gr,
                }
                reward_rets = sess.run(
                    reward_fetches,
                    feed_dict={
                        x1_ids: rets_data['batch']['x1_ids'],
                        x1_len: rets_data['batch']['x1_len'],
                        tx.global_mode(): tf.estimator.ModeKeys.PREDICT
                    })

                # prepare sample stories for classification
                ids_rl, text_rl = _get_text(
                    proc, reward_rets['sample_rl'],
                    reward_rets['sample_len'])  # list of list
                story_rl = format_generated_stories_for_clf(
                    text_rl, FLAGS.rl_method)

                _, text_base = _get_text(proc, reward_rets['greedy_sym'],
                                         reward_rets['greedy_len'])
                story_base = format_generated_stories_for_clf(
                    text_base, FLAGS.rl_method)

                # add reward calculation here
                reward_rl = get_reward(predictor,
                                       story_rl,
                                       rets_data['batch']['unique_id'],
                                       train_arc_file,
                                       method=FLAGS.rl_method)
                reward_base = get_reward(predictor,
                                         story_base,
                                         rets_data['batch']['unique_id'],
                                         train_arc_file,
                                         method=FLAGS.rl_method)

                # self-critical reward: sampled reward minus greedy baseline
                reward_sc = [
                    rr - rb for rr, rb in zip(reward_rl, reward_base)
                ]

                ids_rl = utils.list_strip_eos(ids_rl, end_token)
                new_in_sample_ids, new_in_sample_len = _fix(
                    ids_rl, end_token)

                # (2) Optimize loss
                feed_dict = {
                    x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    sampled_story: new_in_sample_ids,
                    sampled_story_len: new_in_sample_len,
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                    reward: np.array(reward_sc)
                }

                fetches = {
                    'train_op': train_op,
                    'step': global_step,
                }
                fetches.update(loss_dict)

                rets = sess.run(fetches, feed_dict, options=run_opts)
                step = rets['step']

                dis_steps = config_train.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    _log_losses(rets, step)

                eval_steps = config_train.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _dev_epoch(sess, evaluate_func=evaluate_full)

                sample_steps = config_train.sample_steps
                if (_is_head() and sample_steps > 0
                        and step % sample_steps == 0):
                    print('-----------testing-----------------')
                    _test_epoch(sess, step=step)

                ckpt_steps = config_train.checkpoint_steps
                if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0:
                    ckpt_fn = os.path.join(output_dir, 'model.ckpt')
                    ckpt_fn = saver.save(sess, ckpt_fn, global_step=step)
                    _log('Checkpoint to {}'.format(ckpt_fn))

            except tf.errors.OutOfRangeError:
                # Training data exhausted: the epoch is over.
                break

    def _dev_epoch(sess, evaluate_func=evaluate_full):
        """Evaluates on the dev set and saves the best checkpoint.

        Which quantity defines "best" is chosen by FLAGS.best_model:
        "emotion" uses metrics["best_reward"], "bleu" uses metrics["bleu"],
        anything else uses the average `loss_fine`.  A checkpoint is only
        written when FLAGS.do_train is set.
        """
        with open(os.path.join(config_train.arc_data,
                               "dev_mapped.txt")) as arc_fp:
            dev_arc_file = [line.strip().split() for line in arc_fp]

        with open(
                os.path.join(config_train.tfrecord_data_dir,
                             "x4_emo_features.dev"), 'rb') as fp:
            emotion_feats = np.array(pickle.load(fp))

        iterator.restart_dataset(sess, 'dev')

        nsamples = 0
        hypotheses = []
        references = []
        reward_score = []
        losses = []
        hypotheses_dict = {}

        while True:
            try:
                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'dev'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)

                # (2) eval loss
                feed_dict = {
                    x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }

                fetches = {
                    'loss_fine': loss_dict['loss_fine'],
                    # 'beam_search_ids': beam_search_ids,
                    'greedy_sym': symbols_gr,
                    'greedy_len': len_gr,
                    'target_ids': target_ids
                }
                rets = sess.run(fetches, feed_dict)

                losses.append(rets['loss_fine'])

                # Despite the name, the hypothesis comes from greedy decoding.
                _, beam_text = _get_text(proc, rets['greedy_sym'],
                                         rets['greedy_len'])
                beam_story = format_generated_stories_for_clf(
                    beam_text, FLAGS.rl_method)
                _, target_text = _get_text(proc, rets['target_ids'],
                                           rets_data['batch']['x1x4_len'])

                hypotheses.extend(beam_text)
                references.extend(target_text)

                hypotheses_dict_ = generate_all_valid_sample_dict(
                    predictor,
                    rets_data['batch']['unique_id'],
                    beam_story,
                    method=FLAGS.rl_method)
                # Keep the first entry seen for every key.
                for key, react in hypotheses_dict_.items():
                    if key not in hypotheses_dict:
                        hypotheses_dict[key] = react

                nsamples += rets_data['batch_size']
            except tf.errors.OutOfRangeError:
                break

        avg_loss = np.mean(losses)
        metrics = evaluate_func(references,
                                hypotheses,
                                hypotheses_dict,
                                dev_arc_file,
                                emotion_feats,
                                method=FLAGS.rl_method)
        msg = 'loss_fine: %.4f, bleu: %.4f, meteor: %.4f, reward: %.4f' % \
            (avg_loss, metrics['bleu'], metrics['meteor'],
             metrics["best_reward"])
        _log('nsamples validation: %d' % nsamples)
        _log(msg)

        if FLAGS.best_model == "emotion":
            improved = metrics["best_reward"] > dev_best['best_reward']
        elif FLAGS.best_model == "bleu":
            improved = metrics["bleu"] > dev_best['bleu']
        else:
            improved = avg_loss < dev_best['loss']

        if FLAGS.do_train and improved:
            dev_best['loss_fine'] = avg_loss
            dev_best.update(metrics)
            dev_best['best_reward'] = metrics["best_reward"]
            # Record the loss baseline as well; it was previously never
            # updated from 1e8, so the loss criterion always reported an
            # improvement.
            dev_best['loss'] = avg_loss
            ckpt_fn = os.path.join(output_dir, 'model_best.ckpt')
            ckpt_fn = saver_best.save(sess, ckpt_fn)
            _log('Checkpoint best to {}'.format(ckpt_fn))

    def _test_epoch(sess, step=None):
        """Generates samples on the test set and writes them to a TSV file.

        NOTE(review): `fetches` and `res_fn_appendix` are only assigned when
        FLAGS.finetune is set; with the flag off this function fails with an
        unbound-name error -- confirm whether that flag is always on.
        """
        iterator.restart_dataset(sess, 'test')

        _all_inputs = []
        _all_samples = []

        if FLAGS.finetune:
            _log('Generation input: x1')
            fetches = {
                'inputs': batch['x1_ids'],
                'length': batch['x1_len'],
                'samples_length': x4_len_fine,
                'samples': x4_ids_fine
            }
            res_fn_appendix = "x1"

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets = sess.run(fetches, feed_dict=feed_dict)

                _inputs = []
                for i, l in zip(rets['inputs'], rets['length']):
                    # Delete padding
                    _inputs.append(i[:l].tolist())
                _all_inputs.extend(_inputs)

                _samples = []
                for s, l in zip(rets['samples'], rets['samples_length']):
                    _samples.append(s[:l].tolist())
                _all_samples.extend(_samples)

            except tf.errors.OutOfRangeError:
                break

        # Parse samples and write to file

        eos_token_id = proc.encoder['<|endoftext|>']

        _all_input_text = []
        for i in _all_inputs:
            # Drop a leading end-of-text token before decoding.
            if i[0] == eos_token_id:
                i = i[1:]
            i_text = proc.decode(i)
            _all_input_text.append(i_text)
        _all_input_text = tx.utils.strip_eos(_all_input_text,
                                             eos_token='<|endoftext|>')

        _all_samples_text = []
        for i, s in zip(_all_inputs, _all_samples):
            s_text = proc.decode(s)
            s_text = s_text.strip(" |").replace('\n', ' ')
            _all_samples_text.append(s_text)
        _all_samples_text = tx.utils.strip_eos(_all_samples_text,
                                               eos_token='<|endoftext|>')

        if step is None:
            fn = "test_samples_%s.tsv" % res_fn_appendix
        else:
            fn = "test_samples_%s_%d.tsv" % (res_fn_appendix, step)
        output_file = os.path.join(output_dir, fn)
        _log('Write samples to {}'.format(output_file))
        tx.utils.write_paired_text(_all_input_text, _all_samples_text,
                                   output_file)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Set the field on the existing sub-message: protobuf does not allow
    # assigning a new GPUOptions to `gpu_options`, and doing so would also
    # drop the visible_device_list configured above.
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        if FLAGS.checkpoint:
            _log('Restore from {}'.format(FLAGS.checkpoint))
            saver.restore(sess, FLAGS.checkpoint)
        elif FLAGS.pretrain_checkpoint:
            _log('Restore from {}'.format(FLAGS.pretrain_checkpoint))
            model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
            print("\nFinished loading\n")
            saver.save(sess, output_dir + '/gpt2_model.ckpt')

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for epoch in range(config_train.max_train_epoch):
                print("Training epoch {}".format(epoch))
                _train_epoch(sess, epoch == 0)
            saver.save(sess, output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _dev_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
def cnn_model_fn(features, labels, mode):
    """Estimator model function for the MNIST CNN.

    Builds two conv/max-pool stages, a 1024-unit dense layer with dropout and
    a 10-way logits layer, then returns the `EstimatorSpec` that matches
    `mode` (PREDICT, TRAIN, or otherwise EVAL).
    """
    mode_keys = tf.estimator.ModeKeys

    # features["x"] is reshaped to [batch_size, 28, 28, 1]
    # (MNIST images are 28x28 pixels with one color channel).
    net = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Two identical stages apart from the filter count: a 5x5 "same"-padded
    # ReLU convolution followed by 2x2 max pooling with stride 2.
    #   [batch, 28, 28, 1] -> [batch, 14, 14, 32] -> [batch, 7, 7, 64]
    for num_filters in (32, 64):
        net = tf.layers.conv2d(inputs=net,
                               filters=num_filters,
                               kernel_size=[5, 5],
                               padding="same",
                               activation=tf.nn.relu)
        net = tf.layers.max_pooling2d(inputs=net,
                                      pool_size=[2, 2],
                                      strides=2)

    # Flatten to [batch, 7 * 7 * 64] and apply the 1024-unit dense layer.
    flattened = tf.reshape(net, [-1, 7 * 7 * 64])
    hidden = tf.layers.dense(inputs=flattened,
                             units=1024,
                             activation=tf.nn.relu)

    # Dropout with rate 0.4, active only when training.
    hidden = tf.layers.dropout(inputs=hidden,
                               rate=0.4,
                               training=(mode == mode_keys.TRAIN))

    # Logits: [batch, 1024] -> [batch, 10]
    logits = tf.layers.dense(inputs=hidden, units=10)

    predicted_classes = tf.argmax(input=logits, axis=1)
    # The softmax op carries an explicit name, "softmax_tensor".
    class_probabilities = tf.nn.softmax(logits, name="softmax_tensor")
    predictions = {
        "classes": predicted_classes,
        "probabilities": class_probabilities,
    }

    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss is shared by TRAIN and EVAL.
    label_indices = tf.cast(labels, tf.int32)
    onehot_labels = tf.one_hot(indices=label_indices, depth=10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits)

    if mode != mode_keys.TRAIN:
        # EVAL: report accuracy alongside the loss.
        accuracy = tf.metrics.accuracy(labels=labels,
                                       predictions=predicted_classes)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops={"accuracy": accuracy})

    # TRAIN.  Horovod: the learning rate is scaled by the number of workers
    # and the optimizer is wrapped in the Horovod DistributedOptimizer.
    base_optimizer = tf.train.MomentumOptimizer(
        learning_rate=0.001 * hvd.size(), momentum=0.9)
    distributed_optimizer = hvd.DistributedOptimizer(base_optimizer)
    train_op = distributed_optimizer.minimize(
        loss=loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def run_mnist(_):
    """Trains a softmax-regression MNIST model with Horovod and nGraph.

    Reads MNIST into a per-rank directory, builds a single linear layer with
    a cross-entropy loss, and runs it in a MonitoredTrainingSession that
    stops at global step 10.  After each training step the test-set accuracy
    is printed unless the session has been asked to stop.  Rank 0 prints the
    elapsed training time.

    Args:
        _: Unused positional argument.
    """
    # Import data
    mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir +
                                                'MNIST-data-%d' % hvd.rank(),
                                                one_hot=True)

    # Create the model
    with tf.name_scope("mnist_placholder"):
        x = tf.placeholder(tf.float32, [None, 784])
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        y = tf.matmul(x, W) + b

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

    #global_step = tf.train.get_or_create_global_step()
    global_step = tf.contrib.framework.get_or_create_global_step()

    opt = tf.train.GradientDescentOptimizer(0.5)

    # Add MPI Distributed Optimizer
    with tf.name_scope("horovod_opt"):
        opt = hvd.DistributedOptimizer(opt)
    train_step = opt.minimize(cross_entropy, global_step=global_step)

    # The StopAtStepHook handles stopping after running given steps.
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        tf.train.StopAtStepHook(last_step=10)
    ]

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Enable soft placement and tracing as needed
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True,
                            inter_op_parallelism_threads=1)
    config_ngraph_enabled = ngraph_bridge.update_config(config)

    #config.graph_options.optimizer_options.global_jit_level = jit_level
    #init_op = tf.global_variables_initializer()
    print("Variables initialized ...")

    # The MonitoredTrainingSession takes care of session initialization
    with tf.train.MonitoredTrainingSession(
            hooks=hooks, config=config_ngraph_enabled) as mon_sess:
        start = time.time()
        # The writer is only used to dump the graph; close it when done so the
        # event file is flushed and its handle released.
        train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
        try:
            while not mon_sess.should_stop():
                # Train
                batch_xs, batch_ys = mnist.train.next_batch(100)
                mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

                # Test trained model
                if not mon_sess.should_stop():
                    print(
                        "Accuracy: ",
                        mon_sess.run(accuracy,
                                     feed_dict={
                                         x: mnist.test.images,
                                         y_: mnist.test.labels
                                     }))

            end = time.time()
        finally:
            train_writer.close()

    if hvd.rank() == 0:
        print("Training time: %f seconds" % (end - start))
def main(_):
    """Trains a two-layer CNN on MNIST with Horovod.

    Builds the graph, wraps Adam in Horovod's DistributedOptimizer and runs
    the train op in a MonitoredTrainingSession until the StopAtStepHook fires.
    Every worker draws a batch of `BATCH_SIZE * hvd.size()` examples and
    trains on the slice selected by its own rank.
    """
    # they has been normalized to range (0,1)
    mnist = input_data.read_data_sets('./mnist', one_hot=True)
    test_x = mnist.test.images[:2000]
    test_y = mnist.test.labels[:2000]

    # plot one example
    print(mnist.train.images.shape)  # (55000, 28 * 28)
    print(mnist.train.labels.shape)  # (55000, 10)

    # Init horovod
    hvd.init()
    num_workers = hvd.size()
    worker_rank = hvd.rank()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # tf_x is the output of the division (not the raw placeholder); this is
    # the tensor that gets fed below.
    raw_pixels = tf.placeholder(tf.float32, [None, 28 * 28])
    tf_x = raw_pixels / 255.
    image = tf.reshape(tf_x, [-1, 28, 28, 1])  # (batch, height, width, channel)
    tf_y = tf.placeholder(tf.int32, [None, 10])  # input y

    # get global step
    global_step = tf.train.get_or_create_global_step()

    # CNN: (28, 28, 1) -> (28, 28, 16) -> (14, 14, 16)
    #               -> (14, 14, 32) -> (7, 7, 32)
    net = tf.layers.conv2d(inputs=image,
                           filters=16,
                           kernel_size=5,
                           strides=1,
                           padding='same',
                           activation=tf.nn.relu)
    net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
    net = tf.layers.conv2d(net,
                           filters=32,
                           kernel_size=5,
                           strides=1,
                           padding='same',
                           activation=tf.nn.relu)
    net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
    flat = tf.reshape(net, [-1, 7 * 7 * 32])
    output = tf.layers.dense(flat, 10)  # output layer

    # tf.metrics.accuracy returns (acc, update_op) and creates 2 local
    # variables; only the update op is kept.
    true_classes = tf.argmax(tf_y, axis=1)
    predicted_classes = tf.argmax(output, axis=1)
    accuracy = tf.metrics.accuracy(labels=true_classes,
                                   predictions=predicted_classes)[1]

    # compute cost
    loss = tf.losses.softmax_cross_entropy(onehot_labels=tf_y, logits=output)

    # Learning rate is increased by the worker count, then the optimizer is
    # wrapped in the Horovod Distributed Optimizer.
    adam = tf.train.AdamOptimizer(LR * num_workers)
    optimizer = hvd.DistributedOptimizer(adam)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # define hooks; only rank 0 logs step and loss
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        tf.train.StopAtStepHook(last_step=600 // num_workers),
    ]
    if worker_rank == 0:
        logging_hook = tf.train.LoggingTensorHook(
            tensors={
                'step': global_step,
                'loss': loss
            }, every_n_iter=10)
        hooks.append(logging_hook)

    # Use MonitoredTrainingSession
    with tf.train.MonitoredTrainingSession(config=config,
                                           hooks=hooks) as mon_sess:
        global_batch = BATCH_SIZE * num_workers
        start = BATCH_SIZE * worker_rank
        end = BATCH_SIZE * (worker_rank + 1)
        print(start, end, global_batch)

        shard = slice(start, end)
        while not mon_sess.should_stop():
            images, labels = mnist.train.next_batch(global_batch)
            feed = {tf_x: images[shard], tf_y: labels[shard]}
            mon_sess.run([train_op, loss], feed)
def __init__(self,
             league_mgr_addr,
             model_pool_addrs,
             learner_ports,
             rm_size,
             batch_size,
             ob_space,
             ac_space,
             policy,
             gpu_id,
             policy_config=None,
             ent_coef=1e-2,
             distill_coef=1e-2,
             vf_coef=0.5,
             max_grad_norm=0.5,
             rwd_shape=False,
             pub_interval=500,
             log_interval=100,
             save_interval=0,
             total_timesteps=5e7,
             burn_in_timesteps=0,
             learner_id='',
             batch_worker_num=4,
             pull_worker_num=2,
             unroll_length=32,
             rollout_length=1,
             use_mixed_precision=False,
             use_sparse_as_dense=True,
             adam_beta1=0.9,
             adam_beta2=0.999,
             adam_eps=1e-5,
             data_type=PGData,
             data_server_version='v1',
             decode=False,
             log_infos_interval=20,
             **kwargs):
    """Build the learner: session, data server, policy net, losses and train ops.

    Args:
        policy: module-like object providing ``net_config_cls`` and
            ``net_build_fun``.
        gpu_id: id of the single GPU made visible to the TF session.
        policy_config: keyword arguments forwarded to
            ``policy.net_config_cls``. ``None`` (the default) means an empty
            dict; a fresh dict is created per call so instances never share
            a mutable default.
        use_mixed_precision: try to enable TF's mixed-precision graph
            rewrite on both optimizers (logs a warning if that fails).
        use_sparse_as_dense: forwarded to ``hvd.DistributedOptimizer``.
        **kwargs: accepted and ignored here.

    The remaining arguments are forwarded to the base class, to
    ``_init_const``, to ``DataServer`` or to the Adam optimizers as shown in
    the body.
    """
    # Was `policy_config={}`: a mutable default shared across all calls.
    if policy_config is None:
        policy_config = {}

    super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                    learner_ports, learner_id)

    # Learning rate, fed at run time.
    self.LR = tf.placeholder(tf.float32, [])
    # Clip range, fed at run time and handed to the net config below.
    self.CLIPRANGE = tf.placeholder(tf.float32, [])
    # Coefficients for those losses from the endpoints. Override it in
    # derived class.
    self.ep_loss_coef = {}

    self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                     unroll_length, rwd_shape, ent_coef, vf_coef,
                     pub_interval, log_interval, save_interval, policy,
                     distill_coef, policy_config, rollout_length)

    # allow_soft_placement=True can fix issue when some op cannot be defined
    # on GPUs for tf-1.8.0; tf-1.13.1 does not have this issue
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(gpu_id)
    self.sess = tf.Session(config=config)
    self.rank = hvd.rank() if has_hvd else 0

    # Prepare dataset
    ds = data_type(ob_space,
                   ac_space,
                   self.n_v,
                   use_lstm=self.rnn,
                   hs_len=self.hs_len,
                   distillation=self.distillation,
                   version='v2')
    self._data_server = DataServer(self._pull_data,
                                   rm_size,
                                   unroll_length,
                                   batch_size,
                                   ds,
                                   gpu_id_list=(0, ),
                                   batch_worker_num=batch_worker_num,
                                   pull_worker_num=pull_worker_num,
                                   rollout_length=rollout_length,
                                   prefetch_buffer_size=2,
                                   version=data_server_version,
                                   decode=decode,
                                   log_infos_interval=log_infos_interval)

    # prepare net config
    net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
    net_config.clip_range = self.CLIPRANGE
    if rwd_shape:
        # make net_config.reward-shaping-weights a tf.placeholder so as to
        # change it during training.
        # NOTE: Assume there is reward_weights_shape in net_config
        # TODO(pengsun): use NetInputsData instead of this quick-and-dirty
        # hacking?
        reward_weights_shape = net_config.reward_weights_shape
        self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
        net_config.reward_weights = self.rwd_weights
    if hasattr(net_config, 'lam'):
        # make net_config.lambda-for-td-lambda a tf.placeholder so as to
        # change it during training.
        # TODO(pengsun): use NetInputsData instead of this quick-and-dirty
        # hacking?
        self.LAM = tf.placeholder(tf.float32, [])
        net_config.lam = self.LAM
    else:
        self.LAM = None

    # build the policy net
    # The scope is opened once only to capture `model_scope` for reuse.
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
        pass

    def create_policy(inputs, nc):
        return policy.net_build_fun(inputs=inputs, nc=nc, scope=model_scope)

    device = '/gpu:{}'.format(0)
    with tf.device(device):
        input_data = self._data_server.input_datas[0]
        if 'use_xla' in policy_config and policy_config['use_xla']:
            try:
                # Use tensorflow's accelerated linear algebra compile method
                with tf.xla.experimental.jit_scope(True):
                    model = create_policy(input_data, net_config)
            except Exception:
                # Was a bare `except:`, which also trapped
                # KeyboardInterrupt/SystemExit. Fall back to a plain build.
                logger.log(
                    "WARNING: using tf.xla requires tf version>=1.15.")
                model = create_policy(input_data, net_config)
        else:
            model = create_policy(input_data, net_config)
        loss, vf_loss, losses = self.build_loss(model, input_data)
    if has_hvd:
        # Distinct loop name so the total `loss` used below is never shadowed.
        self.losses = [hvd.allreduce(ep_loss) for ep_loss in losses]
    else:
        self.losses = list(losses)
    self.params = tf.trainable_variables(scope='model')
    self.params_vf = tf.trainable_variables(scope='model/vf')
    self.param_norm = tf.global_norm(self.params)

    self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                          beta1=adam_beta1,
                                          beta2=adam_beta2,
                                          epsilon=adam_eps)
    self.burn_in_trainer = tf.train.AdamOptimizer(
        learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
    if use_mixed_precision:
        try:
            rewrite = (tf.compat.v1.train.experimental.
                       enable_mixed_precision_graph_rewrite)
            self.trainer = rewrite(self.trainer)
            self.burn_in_trainer = rewrite(self.burn_in_trainer)
        except Exception:
            # Was a bare `except:`; keep the best-effort fallback but let
            # KeyboardInterrupt/SystemExit propagate.
            logger.warn(
                "using tf mixed_precision requires tf version>=1.15.")

    if has_hvd:
        self.trainer = hvd.DistributedOptimizer(
            self.trainer, sparse_as_dense=use_sparse_as_dense)
        self.burn_in_trainer = hvd.DistributedOptimizer(
            self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)
    grads_and_vars = self.trainer.compute_gradients(loss, self.params)
    grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
        vf_loss, self.params_vf)
    clip_vars = model.vars.lstm_vars
    (grads_and_vars, self.clip_grad_norm,
     self.nonclip_grad_norm) = self.clip_grads_vars(
         grads_and_vars, clip_vars, max_grad_norm)
    (grads_and_vars_vf, self.clip_grad_norm_vf,
     self.nonclip_grad_norm_vf) = self.clip_grads_vars(
         grads_and_vars_vf, clip_vars, max_grad_norm)

    self._train_batch = self.trainer.apply_gradients(grads_and_vars)
    self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
    self.loss_endpoints_names = model.loss.loss_endpoints.keys()
    self._build_ops()
    if has_hvd:
        # Both ops must exist before the graph is finalized below.
        barrier_op = hvd.allreduce(tf.Variable(0.))
        broadcast_op = hvd.broadcast_global_variables(0)
    tf.global_variables_initializer().run(session=self.sess)
    self.sess.graph.finalize()

    # Without Horovod these are no-ops returning None; the ops are only
    # referenced when has_hvd is true.
    self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
    self.broadcast = lambda: self.sess.run(broadcast_op
                                           ) if has_hvd else None
    self.broadcast()
    # logging stuff
    # NOTE: both branches currently yield the same list of formats.
    format_strs = (['stdout', 'log', 'tensorboard', 'csv'] if self.rank == 0
                   else ['stdout', 'log', 'tensorboard', 'csv'])
    logger.configure(dir='training_log/{}rank{}'.format(
        self._learner_id, self.rank),
                     format_strs=format_strs)
# Fit y = slope * x + offset by gradient descent, with gradients combined
# across workers through Horovod's DistributedOptimizer.
# `dataset` is defined before this fragment; it is repeated 100 times here.
dataset = dataset.repeat(100)
iterator = dataset.make_one_shot_iterator()
next_item = iterator.get_next()

# Define the model
# Both parameters start from independent standard-normal draws.
slope = tf.Variable(np.random.randn())
offset = tf.Variable(np.random.randn())

x, y = next_item  # The model is the continuation of the pipeline

y_hat = slope * x + offset

loss = tf.losses.mean_squared_error(y_hat, y)

opt = tf.train.GradientDescentOptimizer(.5)
train = hvd.DistributedOptimizer(opt).minimize(loss)

# Broadcast the initial variable values from rank 0 to every worker.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

# One [slope, offset, loss] entry is appended per training step.
history = []

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    # Initialization of the variables `slope` and `offset`
    # is done automatically by tf.train.MonitoredTrainingSession
    print(
        'rank', hvd.rank(),
        'inital slope = %12.6f\n initial offset = %12.6f' % sess.run(
            (slope, offset)))
    while not sess.should_stop():
        # `m` and `n` (slope/offset fetched with the train step) are unused;
        # the values stored in `history` come from separate sess.run calls
        # issued after the step has completed.
        _, loss_val, m, n = sess.run((train, loss, slope, offset))
        history.append([sess.run(slope), sess.run(offset), loss_val])
def __call__(self, features, labels, mode, params):
    """Estimator model function: build the graph for TRAIN, EVAL or PREDICT.

    Args:
        features: input tensor; cast to ``self.model_hparams.dtype`` when
            its dtype differs.
        labels: class-index tensor, one-hot encoded here with depth 1001.
            Not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict of hyper-parameters. In TRAIN mode the keys listed in
            ``mandatory_params`` are validated. The TRAIN path additionally
            reads ``mixup``, ``lr_warmup_epochs``, ``num_decay_steps``,
            ``use_cosine_lr`` and ``apply_loss_scaling``, which are NOT
            validated and raise ``KeyError`` if absent. ``weight_decay`` is
            also read in EVAL mode.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.

    Raises:
        RuntimeError: a mandatory training parameter is missing.
        NotImplementedError: ``mode`` is not TRAIN, EVAL or PREDICT.
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
                            "momentum", "weight_decay", "loss_scale", "label_smoothing"]
        for p in mandatory_params:
            if p not in params:
                raise RuntimeError("Parameter {} is missing.".format(p))

    # The prefetch ops created here are grouped into the train op at the end.
    if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:

        with tf.device('/cpu:0'):
            # Stage inputs on the host
            cpu_prefetch_op, (features, labels) = self._stage([features, labels])

        with tf.device('/gpu:0'):
            # Stage inputs to the device
            gpu_prefetch_op, (features, labels) = self._stage([features, labels])

    with tf.device("/gpu:0"):

        if features.dtype != self.model_hparams.dtype:
            features = tf.cast(features, self.model_hparams.dtype)

        # Subtract mean per channel
        # and enforce values between [-1, 1]
        if not self.model_hparams.use_dali:
            features = normalized_inputs(features)

        # Label smoothing (eta) and mixup are only active while training.
        mixup = 0
        eta = 0

        if mode == tf.estimator.ModeKeys.TRAIN:
            eta = params['label_smoothing']
            mixup = params['mixup']

        if mode != tf.estimator.ModeKeys.PREDICT:
            # With eta == 0 this is a plain one-hot encoding.
            one_hot_smoothed_labels = tf.one_hot(labels, 1001,
                                                 on_value = 1 - eta + eta/1001,
                                                 off_value = eta/1001)
            if mixup != 0:

                print("Using mixup training with beta=", params['mixup'])
                beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])

                # One mixing coefficient per sample, shaped for broadcasting
                # against the feature tensor.
                feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])

                reversed_feature_coefficients = tf.subtract(tf.ones(shape=feature_coefficients.shape), feature_coefficients)

                # Each sample is blended with its mirror in the
                # batch-reversed tensor.
                rotated_features = tf.reverse(features, axis=[0])

                features = feature_coefficients * features + reversed_feature_coefficients * rotated_features

                # Apply the same coefficients to the labels.
                label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])

                rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])

                reversed_label_coefficients = tf.subtract(tf.ones(shape=label_coefficients.shape), label_coefficients)

                one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels

        # Update Global Step
        global_step = tf.train.get_or_create_global_step()
        # The tf.identity calls below only attach stable "*_ref" names to
        # tensors; their return values are intentionally discarded.
        tf.identity(global_step, name="global_step_ref")

        tf.identity(features, name="features_ref")

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.identity(labels, name="labels_ref")

        probs, logits = self.build_model(
            features,
            training=mode == tf.estimator.ModeKeys.TRAIN,
            reuse=False
        )

        y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

        # Check the output dtype, shall be FP32 in training
        assert (probs.dtype == tf.float32)
        assert (logits.dtype == tf.float32)
        assert (y_preds.dtype == tf.int32)

        tf.identity(logits, name="logits_ref")
        tf.identity(probs, name="probs_ref")
        tf.identity(y_preds, name="y_preds_ref")

        #if mode == tf.estimator.ModeKeys.TRAIN:
        #
        #    assert (len(tf.trainable_variables()) == 161)
        #
        #else:
        #
        #    assert (len(tf.trainable_variables()) == 0)

    if mode == tf.estimator.ModeKeys.PREDICT:

        predictions = {'classes': y_preds, 'probabilities': probs}

        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={'predict': tf.estimator.export.PredictOutput(predictions)}
        )

    else:

        with tf.device("/gpu:0"):

            if mode == tf.estimator.ModeKeys.TRAIN:
                # Per-batch boolean hit tensors (no streaming metric).
                acc_top1 = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
                acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)

            else:
                # Streaming means; the update ops are returned through
                # eval_metric_ops in the EVAL branch.
                acc_top1, acc_top1_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=1))
                acc_top5, acc_top5_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=5))

            tf.identity(acc_top1, name="acc_top1_ref")
            tf.identity(acc_top5, name="acc_top5_ref")

            predictions = {
                'classes': y_preds,
                'probabilities': probs,
                'accuracy_top1': acc_top1,
                'accuracy_top5': acc_top5
            }

            cross_entropy = tf.losses.softmax_cross_entropy(
                logits=logits, onehot_labels=one_hot_smoothed_labels)

            assert (cross_entropy.dtype == tf.float32)
            tf.identity(cross_entropy, name='cross_entropy_loss_ref')

            def loss_filter_fn(name):
                """Return True when variable `name` should receive L2 regularization.

                Batch-norm variables are excluded. Bias variables are NOT
                excluded: the list containing "bias" is commented out.
                """
                return all([
                    tensor_name not in name.lower()
                    # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
                    for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
                ])

            # Regularized variables are cast to FP32 before the L2 term.
            filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]

            if len(filtered_params) != 0:

                l2_loss_per_vars = [tf.nn.l2_loss(v) for v in filtered_params]
                l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars), params["weight_decay"])

            else:
                l2_loss = tf.zeros(shape=(), dtype=tf.float32)

            assert (l2_loss.dtype == tf.float32)
            tf.identity(l2_loss, name='l2_loss_ref')

            total_loss = tf.add(cross_entropy, l2_loss, name="total_loss")

            assert (total_loss.dtype == tf.float32)
            tf.identity(total_loss, name='total_loss_ref')

            tf.summary.scalar('cross_entropy', cross_entropy)
            tf.summary.scalar('l2_loss', l2_loss)
            tf.summary.scalar('total_loss', total_loss)

            if mode == tf.estimator.ModeKeys.TRAIN:

                with tf.device("/cpu:0"):

                    learning_rate = learning_rate_scheduler(
                        lr_init=params["lr_init"],
                        lr_warmup_epochs=params["lr_warmup_epochs"],
                        global_step=global_step,
                        batch_size=params["batch_size"],
                        num_batches_per_epoch=params["steps_per_epoch"],
                        num_decay_steps=params["num_decay_steps"],
                        num_gpus=params["num_gpus"],
                        use_cosine_lr=params["use_cosine_lr"]
                    )

                tf.identity(learning_rate, name='learning_rate_ref')
                tf.summary.scalar('learning_rate', learning_rate)

                optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                if params["apply_loss_scaling"]:

                    optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                if hvd_utils.is_using_hvd():
                    optimizer = hvd.DistributedOptimizer(optimizer)

                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                # NOTE(review): this branch only runs when mode == TRAIN, so
                # the condition below is never true here.
                if mode != tf.estimator.ModeKeys.TRAIN:
                    update_ops += [acc_top1_update_op, acc_top5_update_op]

                # `deterministic` is hard-coded, so GATE_OP is always used.
                deterministic = True
                gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

                backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                # The prefetch ops only exist when DALI is not used (see the
                # staging block at the top of this method).
                if self.model_hparams.use_dali:
                    train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                else:
                    train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')

                return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)

            elif mode == tf.estimator.ModeKeys.EVAL:
                eval_metrics = {
                    "top1_accuracy": (acc_top1, acc_top1_update_op),
                    "top5_accuracy": (acc_top5, acc_top5_update_op)
                }

                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    predictions=predictions,
                    loss=total_loss,
                    eval_metric_ops=eval_metrics
                )

            else:
                raise NotImplementedError('Unknown mode {}'.format(mode))
def _build_optimizer(self, loss):
    """Create the training op for ``loss``.

    An Adam optimizer configured from ``self.lr``, ``self.beta1`` and
    ``self.beta2`` is wrapped in Horovod's ``DistributedOptimizer``; the
    returned op minimizes ``loss`` and increments the global step.
    """
    adam = tf.train.AdamOptimizer(
        learning_rate=self.lr,
        beta1=self.beta1,
        beta2=self.beta2,
    )
    distributed_adam = hvd.DistributedOptimizer(adam)
    step = tf.train.get_or_create_global_step()
    return distributed_adam.minimize(loss, global_step=step)
def __init__(self,
             policy,
             ob_space,
             ac_space,
             nenv,
             nsteps,
             ent_coef,
             vf_coef,
             l2_coef,
             cliprange,
             adam_epsilon=1e-6,
             load_path=None,
             test_mode=False):
    """Build the acting/training policies, the clipped PPO loss and the train op.

    Exposes ``train``, ``save`` and ``load`` closures plus the acting
    model's ``step``, ``value`` and ``initial_state`` as attributes. After
    variable initialization, rank 0 optionally loads ``load_path``; rank 0's
    variables are then broadcast to all workers and the graph is finalized.
    """
    sess = tf.get_default_session()

    # Both models share weights: the second is built with reuse=True.
    act_model = policy(sess, ob_space, ac_space, nenv, 1,
                       test_mode=test_mode, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenv, nsteps,
                         test_mode=test_mode, reuse=True)

    # Placeholders: one entry per (environment, step) pair.
    actions_ph = train_model.pdtype.sample_placeholder([nenv * nsteps],
                                                       name='action')
    adv_ph = tf.placeholder(tf.float32, [nenv * nsteps], name='advantage')
    valid_ph = tf.placeholder(tf.float32, [nenv * nsteps], name='valid')
    returns_ph = tf.placeholder(tf.float32, [nenv * nsteps], name='return')
    old_neglogp_ph = tf.placeholder(tf.float32, [nenv * nsteps],
                                    name='neglogprob')
    old_vpred_ph = tf.placeholder(tf.float32, [nenv * nsteps],
                                  name='valuepred')
    lr_ph = tf.placeholder(tf.float32, [], name='lr')

    neglogpac = train_model.pd.neglogp(actions_ph)
    entropy = tf.reduce_mean(valid_ph * train_model.pd.entropy())

    # Value loss: worse of the clipped and unclipped squared errors.
    vpred = train_model.vf
    vpred_clipped = old_vpred_ph + tf.clip_by_value(
        vpred - old_vpred_ph, -cliprange, cliprange)
    vf_err_plain = tf.square(vpred - returns_ph)
    vf_err_clipped = tf.square(vpred_clipped - returns_ph)
    vf_loss = .5 * tf.reduce_mean(
        valid_ph * tf.maximum(vf_err_plain, vf_err_clipped))

    # Policy loss: worse of the clipped and unclipped surrogate objectives.
    ratio = tf.exp(old_neglogp_ph - neglogpac)
    pg_plain = -adv_ph * ratio
    pg_clipped = -adv_ph * tf.clip_by_value(ratio, 1.0 - cliprange,
                                            1.0 + cliprange)
    pg_loss = tf.reduce_mean(valid_ph * tf.maximum(pg_plain, pg_clipped))

    # Diagnostics, normalized by the mean of the validity mask.
    valid_mean = tf.reduce_mean(valid_ph)
    approxkl = .5 * tf.reduce_mean(
        valid_ph * tf.square(neglogpac - old_neglogp_ph)) / valid_mean
    clipfrac = tf.reduce_mean(valid_ph * tf.to_float(
        tf.greater(tf.abs(ratio - 1.0), cliprange))) / valid_mean

    params = tf.trainable_variables()
    l2_loss = .5 * sum([tf.reduce_sum(tf.square(var)) for var in params])
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_coef * l2_loss

    optimizer = tf.train.AdamOptimizer(lr_ph, epsilon=adam_epsilon)
    optimizer = hvd.DistributedOptimizer(optimizer)
    train_op = optimizer.minimize(loss)

    def train(lr, obs, returns, advs, masks, actions, values, neglogpacs,
              valids, increase_ent, states=None):
        """Run one optimization step; return the metrics named in loss_names."""
        feed = {
            lr_ph: lr,
            train_model.X: obs,
            actions_ph: actions,
            adv_ph: advs,
            valid_ph: valids,
            returns_ph: returns,
            old_neglogp_ph: neglogpacs,
            old_vpred_ph: values,
            train_model.E: increase_ent
        }
        # State and mask are only fed when a state is supplied.
        if states is not None:
            feed[train_model.S] = states
            feed[train_model.M] = masks
        fetched = sess.run(
            [pg_loss, vf_loss, l2_loss, entropy, approxkl, clipfrac, train_op],
            feed_dict=feed)
        # Drop the trailing train_op result.
        return fetched[:-1]

    self.loss_names = [
        'policy_loss', 'value_loss', 'l2_loss', 'policy_entropy', 'approxkl',
        'clipfrac'
    ]

    def save(save_path):
        """Dump the current values of all trainable variables with joblib."""
        values = sess.run(params)
        joblib.dump(values, save_path)

    def load(load_path):
        """Assign joblib-stored values to the trainable variables, in order."""
        stored = joblib.load(load_path)
        assign_ops = [var.assign(value) for var, value in zip(params, stored)]
        sess.run(assign_ops)

    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = save
    self.load = load

    sess.run(tf.global_variables_initializer())
    # Only rank 0 reads the checkpoint; the broadcast distributes it.
    if load_path and hvd.rank() == 0:
        self.load(load_path)
    sess.run(hvd.broadcast_global_variables(0))
    tf.get_default_graph().finalize()
def train(*,
          flow_constructor,
          logdir,
          lr_schedule,
          dropout_p,
          seed,
          init_bs,
          total_bs,
          ema_decay,
          steps_per_log,
          epochs_per_val,
          max_grad_norm,
          dtype=tf.float32,
          scale_loss=None,
          restore_checkpoint=None,
          scale_grad=None,
          dataset='cifar10',
          steps_per_extra_samples=None):
    """Train a flow model with Horovod data parallelism; runs until interrupted.

    Args:
        flow_constructor: callable returning ``(dequant_flow, flow)``.
        logdir: base log directory; the worker count and a timestamp are
            appended to it.
        lr_schedule: callable mapping the global step to a learning rate.
        dropout_p: dropout probability for the init and training graphs
            (validation/sampling use 0).
        seed: base random seed; each rank seeds with
            ``hvd.rank() + hvd.size() * seed``.
        init_bs: batch size for the data-dependent initialization pass.
        total_bs: global batch size; must be divisible by ``hvd.size()``.
        ema_decay: decay of the exponential moving average of the parameters.
        steps_per_log: log every this many steps; also the averaging window.
        epochs_per_val: validate (and checkpoint on root) every this many
            epochs.
        max_grad_norm: global-norm clipping threshold, or None to disable.
        dtype: TensorFlow dtype of the data and model.
        scale_loss: optional factor applied to the loss before
            differentiation; gradients are divided by it afterwards.
        restore_checkpoint: checkpoint path ending in ``-<step>``, or None to
            start with data-dependent initialization.
        scale_grad: optional divisor applied to all gradients.
        dataset: dataset name forwarded to ``load_data``.
        steps_per_extra_samples: if set, draw and log samples every this many
            steps.
    """
    hvd, MPI, is_root, mpi_average = setup_horovod()

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()

    logger = None
    logdir = '{}_mpi{}_{}'.format(os.path.expanduser(logdir), hvd.size(),
                                  time.time())
    checkpointdir = os.path.join(logdir, 'checkpoints')
    if is_root:
        print('Floating point format:', dtype)
        pprint(locals())
        os.makedirs(logdir)
        os.makedirs(checkpointdir)
        logger = TensorBoardOutput(logdir)

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    # Multiplier applied to losses to report them as 'bpd' (bits per dim).
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root:
        print('Building graph')
    dequant_flow, flow = flow_constructor()

    # Data-dependent init
    if is_root:
        print('===== Init graph =====')
    x_init_sym = tf.placeholder(dtype, [init_bs] + img_shp)
    _, _, init_loss_sym, _ = build_forward(
        x=x_init_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=dict(vcfg=VarConfig(init=True, ema=None, dtype=dtype),
                         dropout_p=dropout_p,
                         verbose=is_root))

    # Training
    if is_root:
        print('===== Training graph =====')
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    _, y_sym, loss_sym, _ = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=dict(vcfg=VarConfig(init=False, ema=None, dtype=dtype),
                         dropout_p=dropout_p,
                         verbose=is_root))

    # EMA
    params = tf.trainable_variables()
    if is_root:
        # for p in params:
        #     print(p.name, p.shape)
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    maintain_averages_op = tf.group(ema.apply(params))
    # Op for setting the ema params to the current non-ema params (for use
    # after data-dependent init)
    name2var = {v.name: v for v in tf.global_variables()}
    copy_params_to_ema = tf.group([
        name2var[p.name.replace(':0', '') +
                 '/ExponentialMovingAverage:0'].assign(p) for p in params
    ])

    # Validation and sampling (with EMA)
    if is_root:
        print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, _ = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)
    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root:
        print('===== Sampling graph =====')
    samples_sym, _ = flow.inverse(
        tf.random_normal(y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))

    # Building the validation/sampling graphs must not add trainable vars.
    assert len(tf.trainable_variables()) == len(params)

    def run_validation(sess, i_step):
        """Evaluate this rank's validation shard; root logs averaged results."""
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('val_bpd', bpd_scale_factor * val_loss),
                 ('val_inverr', inv_err),
                 ('num_val_examples', total_count * local_bs),
                 ('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    def run_sampling_only(sess, i_step):
        """Gather samples from all ranks; root logs them as a tiled image."""
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    # Optimization
    lr_sym = tf.placeholder(dtype, [], 'lr')
    optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(lr_sym))

    if scale_loss is None:
        grads_and_vars = optimizer.compute_gradients(loss_sym,
                                                     var_list=params)
    else:
        # Differentiate the scaled loss, then undo the scaling on the grads.
        grads_and_vars = [(g / scale_loss, v)
                          for (g, v) in optimizer.compute_gradients(
                              loss_sym * scale_loss, var_list=params)]

    if scale_grad is not None:
        grads_and_vars = [(g / scale_grad, v) for (g, v) in grads_and_vars]
    if max_grad_norm is not None:
        clipped_grads, grad_norm_sym = tf.clip_by_global_norm(
            [g for (g, _) in grads_and_vars], max_grad_norm)
        grads_and_vars = [
            (cg, v) for (cg, (_, v)) in zip(clipped_grads, grads_and_vars)
        ]
    else:
        grad_norm_sym = tf.constant(0.)
    opt_sym = tf.group(optimizer.apply_gradients(grads_and_vars),
                       maintain_averages_op)

    def loop(sess: tf.Session):
        """Initialize or restore, broadcast from rank 0, then train forever."""
        i_step = 0

        if is_root:
            print('Initializing')
        sess.run(tf.global_variables_initializer())
        if restore_checkpoint is not None:
            # Restore from checkpoint.
            # Every rank must resume from the same step: i_step drives the
            # learning-rate schedule and decides on which iteration the
            # collective calls (MPI gather, Horovod allgather) are issued.
            # Previously only root advanced i_step, so ranks could use
            # different learning rates and hit the collectives on different
            # iterations. The step is parsed from the path, so every rank
            # can compute it without touching the checkpoint files.
            restore_step = int(restore_checkpoint.split('-')[-1])
            i_step = restore_step
            if is_root:
                saver = tf.train.Saver()
                print('Restoring checkpoint:', restore_checkpoint)
                print('Restoring from step:', restore_step)
                saver.restore(sess, restore_checkpoint)
            else:
                saver = None
        else:
            # No checkpoint: perform data dependent initialization
            if is_root:
                print('Data dependent init')
            init_loss = sess.run(
                init_loss_sym, {
                    x_init_sym:
                    data_train[np.random.randint(0, data_train.shape[0],
                                                 init_bs)]
                })
            if is_root:
                print('Init loss:', init_loss * bpd_scale_factor)
            sess.run(copy_params_to_ema)
            saver = tf.train.Saver() if is_root else None
        if is_root:
            print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if is_root:
            print('Training')

        # Sliding windows over the last `steps_per_log` steps.
        loss_hist = deque(maxlen=steps_per_log)
        gnorm_hist = deque(maxlen=steps_per_log)
        for i_epoch in range(99999999999):
            if i_epoch % epochs_per_val == 0:
                run_validation(sess, i_step=i_step)
                # Only root holds a saver, so only root writes checkpoints.
                if saver is not None:
                    saver.save(sess,
                               os.path.join(checkpointdir, 'model'),
                               global_step=i_step)

            epoch_start_t = time.time()
            for i_epoch_step, (batch, ) in enumerate(
                    iterbatches(  # non-sharded: each gpu goes through the whole dataset
                        [data_train],
                        batch_size=local_bs,
                        include_final_partial_batch=False,
                    )):

                if (steps_per_extra_samples is not None
                        and i_step % steps_per_extra_samples == 0):
                    run_sampling_only(sess, i_step)

                lr = lr_schedule(i_step)
                loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym],
                                          {
                                              x_sym: batch,
                                              lr_sym: lr
                                          })
                loss_hist.append(loss)
                gnorm_hist.append(gnorm)

                # Skip timing the very first step, which will be unusually
                # slow due to TF initialization
                if i_epoch == i_epoch_step == 0:
                    epoch_start_t = time.time()

                if i_step % steps_per_log == 0:
                    # Collective calls: every rank must reach them together.
                    loss_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(loss_hist)), root=0)
                    gnorm_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(gnorm_hist)), root=0)
                    steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                          epoch_start_t)
                    if is_root:
                        kvs = [
                            ('iter', i_step),
                            ('epoch', i_epoch + i_epoch_step * local_bs /
                             data_train.shape[0]),  # epoch for this gpu
                            ('bpd',
                             float(
                                 np.mean(loss_hist_means) *
                                 bpd_scale_factor)),
                            ('gnorm', float(np.mean(gnorm_hist_means))),
                            ('lr', float(lr)),
                            ('fps', steps_per_sec * total_bs
                             ),  # fps calculated over all gpus (this epoch)
                            ('sps', steps_per_sec),
                        ]
                        logger.writekvs(kvs, i_step)

                i_step += 1
            # End of epoch

    # Train
    config = tf.ConfigProto()
    # config.log_device_placement = True
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        loop(sess)
def main(_):
    """Train the MNIST conv model with Horovod until the step budget is spent.

    Reads ``args.lr``, ``args.use_adasum`` and ``args.num_steps`` from the
    module-level ``args``. Checkpoints are written to ``./checkpoints`` by
    rank 0 only.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. Workers that share a filesystem race
    # to create it. `exist_ok=True` makes the creation idempotent (no
    # check-then-create window) and, unlike the previous `os.mkdir`, also
    # creates a missing `~/.keras` parent. It still raises if the path exists
    # but is not a directory.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    os.makedirs(cache_dir, exist_ok=True)

    # Download and load MNIST dataset.
    # Each rank uses its own cache file name.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size. If
    # used with NCCL, scale lr by local_size
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs.
      variable_filter_fn: the filter function that takes trainable_variables and
        returns the variable list after applying the filter rule.

    Returns:
      In PREDICT mode, a tf.estimator.EstimatorSpec carrying the input image
      and the per-level class/box outputs; otherwise a TPUEstimatorSpec to run
      training or evaluation.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set.
      ValueError: in TRAIN mode, if params['optimizer'] is neither 'sgd' nor
        'adam'.
    """
    if params.get('img_summary_steps', None):
        utils.image('input_image', features)
    # Only set when the Horovod strategy is used for training (see below).
    training_hooks = None
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    cls_outputs, box_outputs = utils.build_model_with_precision(
        params['precision'], _model_outputs, features, params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        # NOTE(review): outputs are cast to float64 here; confirm this is the
        # dtype the loss functions and labels expect (float32 would be the
        # more usual choice after a mixed-precision build).
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float64)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float64)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        # The logged learning rate is the value before the Horovod scaling
        # applied further down.
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if box_iou_loss:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)

    # `ema` and `ema_vars` only exist when moving_average_decay is truthy;
    # every later use is guarded by the same condition.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        # Scale the learning rate by the number of Horovod workers.
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            # Broadcast rank 0's initial variables to all workers.
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            # The EMA update runs after the optimizer step and becomes the
            # train op that is returned.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            if params['strategy'] == 'tpu':
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'], **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # The scalar losses are tiled to shape [batch_size, 1] before being
        # passed to metric_fn as inputs.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)
            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))
            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
def main(_):
    """Builds the BERT classifier graph and runs the requested phases.

    The phases are selected by ``FLAGS.do_train``, ``FLAGS.do_eval`` and
    ``FLAGS.do_test``. When ``FLAGS.distributed`` is set, Horovod is
    initialised, the training dataset is sharded across workers and the
    per-worker batch size is divided by the number of workers.

    In distributed mode only the head worker (Horovod rank 0) writes files:
    the final checkpoint and the test predictions. Letting every rank write
    the same path concurrently can corrupt the output.

    Args:
        _: Unused positional argument (presumably argv from the app runner).
    """
    if FLAGS.distributed:
        # Imported lazily so that non-distributed runs do not need Horovod.
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    tx.utils.maybe_create_dir(FLAGS.output_dir)

    # Loads data
    num_train_data = config_data.num_train_data

    # Configures distributed mode
    if FLAGS.distributed:
        config_data.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_data.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_data.train_hparam["batch_size"] //= hvd.size()

    train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam)
    eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam)
    test_dataset = tx.data.TFRecordData(hparams=config_data.test_hparam)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_dataset
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    # Sequence length = number of ids that are not equal to 0.
    input_length = tf.reduce_sum(1 - tf.cast(tf.equal(input_ids, 0), tf.int32),
                                 axis=1)

    # Builds BERT
    hparams = {'clas_strategy': 'cls_time'}
    model = tx.modules.BERTClassifier(
        pretrained_model_name=FLAGS.pretrained_model_name, hparams=hparams)
    logits, preds = model(input_ids, input_length, segment_ids)

    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization
    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size *
                          config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(
        global_step,
        num_train_steps,  # lr is a Tensor
        num_warmup_steps,
        static_lr)

    opt = tx.core.get_optimizer(global_step=global_step,
                                learning_rate=lr,
                                hparams=config_downstream.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt)

    # Train/eval/test routine

    def _is_head():
        """Returns True on the only worker, or on Horovod rank 0."""
        if not FLAGS.distributed:
            return True
        return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        fetches = {
            'train_op': train_op,
            'loss': loss,
            'batch_size': batch_size,
            'step': global_step
        }

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_data.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f;' % (step, rets['loss']))

                eval_steps = config_data.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _eval_epoch(sess)

            except tf.errors.OutOfRangeError:
                # The training dataset is exhausted: the epoch is over.
                break

    def _eval_epoch(sess):
        """Evaluates on the dev set and logs sample-weighted averages."""
        iterator.restart_dataset(sess, 'eval')

        cum_acc = 0.0
        cum_loss = 0.0
        nsamples = 0
        fetches = {
            'accu': accu,
            'loss': loss,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'eval'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)

                # Weight each batch by its size so the last (possibly
                # smaller) batch does not skew the averages.
                cum_acc += rets['accu'] * rets['batch_size']
                cum_loss += rets['loss'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        if nsamples == 0:
            # Avoid dividing by zero when the eval dataset yields nothing.
            tf.logging.warning('eval dataset produced no samples')
            return

        tf.logging.info('eval accu: {}; loss: {}; nsamples: {}'.format(
            cum_acc / nsamples, cum_loss / nsamples, nsamples))

    def _test_epoch(sess):
        """Does predictions on the test set and writes them to
        ``test_results.tsv`` in ``FLAGS.output_dir``, one per line.
        """
        iterator.restart_dataset(sess, 'test')

        _all_preds = []
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                _preds = sess.run(preds, feed_dict=feed_dict)
                _all_preds.extend(_preds.tolist())
            except tf.errors.OutOfRangeError:
                break

        output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            writer.write('\n'.join(str(p) for p in _all_preds))

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        # Pin each process to the GPU matching its local rank.
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for i in range(config_data.max_train_epoch):
                _train_epoch(sess)
            # Only the head worker saves, so that concurrent writers cannot
            # corrupt the checkpoint files.
            if _is_head():
                saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _eval_epoch(sess)

        # The test dataset is not sharded, so every rank would write the
        # same predictions to the same file; let only the head worker do it.
        if FLAGS.do_test and _is_head():
            _test_epoch(sess)
def create_optimizer(loss,
                     init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     use_tpu,
                     use_hvd=False):
    """Builds the training op: AdamW with linear warmup and linear decay.

    Args:
        loss: Scalar loss tensor to minimise.
        init_lr: Peak learning rate (Python float).
        num_train_steps: Step at which the linearly decayed rate reaches 0.
        num_warmup_steps: Number of linear warmup steps; falsy disables
            warmup.
        use_tpu: If true, wrap the optimizer in ``CrossShardOptimizer``.
        use_hvd: If true, wrap the optimizer in Horovod's
            ``DistributedOptimizer`` and compute gradients through it.

    Returns:
        An op that applies the clipped gradients and increments the global
        step.
    """
    step = tf.train.get_or_create_global_step()

    # Linear decay from `init_lr` down to 0 over `num_train_steps`.
    base_lr = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
    lr = tf.train.polynomial_decay(base_lr,
                                   step,
                                   num_train_steps,
                                   end_learning_rate=0.0,
                                   power=1.0,
                                   cycle=False)

    # Linear warmup: while `step < num_warmup_steps` the rate is
    # `step / num_warmup_steps * init_lr`; afterwards the decayed rate is
    # used. `in_warmup` is a 0/1 float that selects between the two.
    if num_warmup_steps:
        step_i = tf.cast(step, tf.int32)
        warmup_i = tf.constant(num_warmup_steps, dtype=tf.int32)

        step_f = tf.cast(step_i, tf.float32)
        warmup_f = tf.cast(warmup_i, tf.float32)

        warmup_lr = init_lr * (step_f / warmup_f)

        in_warmup = tf.cast(step_i < warmup_i, tf.float32)
        lr = (1.0 - in_warmup) * lr + in_warmup * warmup_lr

    # This optimizer is recommended for fine tuning because it is how the
    # model was trained (the Adam m/v variables are NOT loaded from
    # init_checkpoint).
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=lr,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_hvd:
        # [HVD] Horovod's distributed optimizer takes care of the allreduce
        # calls; it only performs synchronized parameter updates.
        optimizer = hvd.DistributedOptimizer(optimizer)

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    if use_hvd:
        # [HVD] Gradients must come from the distributed optimizer.
        pairs = optimizer.compute_gradients(loss, tvars)
        grads = [pair[0] for pair in pairs]
        tvars = [pair[1] for pair in pairs]
    else:
        # Plain TF gradients.
        grads = tf.gradients(loss, tvars)

    # Global-norm clipping at 1.0, matching how the model was pre-trained.
    clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)

    apply_op = optimizer.apply_gradients(zip(clipped, tvars),
                                         global_step=step)

    # `AdamWeightDecayOptimizer` does not bump the global step inside
    # `apply_gradients`, so it is done here. Drop this if you switch to an
    # optimizer that increments the step itself.
    bumped_step = tf.identity(step + 1, name='step_update')
    return tf.group(apply_op, [step.assign(bumped_step)])
def train_main(dataset,
               model_name='117M',
               seed=None,
               batch_size=2,
               sample_length=1023,
               sample_num=1,
               sample_every=4500,
               run_name='run1',
               restore_from='latest',
               save_every=2000,
               combine=50000):
    """Trains the model on `dataset` with Horovod until interrupted.

    Args:
        dataset: Passed to ``load_dataset`` together with the encoder.
        model_name: Sub-directory of ``models`` holding ``hparams.json`` and
            the pretrained checkpoint.
        seed: Seed for NumPy and TensorFlow random number generators.
        batch_size: Number of sampled sequences per training step.
        sample_length: Length of generated samples; ``None`` means half of
            ``hparams.n_ctx``.
        sample_num: Number of samples produced each time samples are
            generated.
        sample_every: Generate samples every this many steps (rank 0 only).
        run_name: Sub-directory name under ``CHECKPOINT_DIR`` and
            ``SAMPLE_DIR`` for this run.
        restore_from: ``'latest'`` (resume the run, falling back to the
            pretrained weights), ``'fresh'`` (pretrained weights), or a
            checkpoint directory.
        save_every: Save a checkpoint every this many steps (rank 0 only).
        combine: Passed through to ``load_dataset``.

    Raises:
        ValueError: If `sample_length` exceeds ``hparams.n_ctx``.
    """
    enc = encoder_sp.get_encoder(model_name)
    hparams = model.default_hparams()
    hparams_path = os.path.join('models', model_name, 'hparams.json')
    with open(hparams_path) as hparams_file:
        hparams.override_from_dict(json.load(hparams_file))

    if sample_length is None:
        sample_length = hparams.n_ctx // 2
    elif sample_length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: %s" % hparams.n_ctx)

    # TF config: one GPU per process, chosen by the local Horovod rank.
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = model.model(hparams=hparams, X=context)
        # Next-token loss: position t's logits are scored against token t+1.
        token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=context[:, 1:], logits=output['logits'][:, :-1])
        loss = tf.reduce_mean(token_losses)

        tf_sample = sample.sample_sequence(hparams=hparams,
                                           length=sample_length,
                                           context=context,
                                           batch_size=batch_size,
                                           temperature=0.8,
                                           top_k=40)

        train_vars = [
            var for var in tf.trainable_variables() if 'model' in var.name
        ]

        opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer())
        train_op = opt.minimize(loss, var_list=train_vars)

        # Horovod: broadcast initial variable states from rank 0 to all
        # other processes, so every worker starts from the same weights
        # whether they were random or restored from a checkpoint.
        bcast = hvd.broadcast_global_variables(0)

        saver = tf.train.Saver(var_list=train_vars,
                               max_to_keep=5,
                               keep_checkpoint_every_n_hours=2)
        sess.run(tf.global_variables_initializer())

        run_dir = os.path.join(CHECKPOINT_DIR, run_name)
        pretrained_dir = os.path.join('models', model_name)

        if restore_from in ('latest', 'fresh'):
            ckpt = None
            if restore_from == 'latest':
                ckpt = tf.train.latest_checkpoint(run_dir)
            if ckpt is None:
                # New run (or 'fresh'): start from the released weights.
                ckpt = tf.train.latest_checkpoint(pretrained_dir)
        else:
            ckpt = tf.train.latest_checkpoint(restore_from)
        print(str(hvd.local_rank()), 'Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        bcast.run()

        print(str(hvd.local_rank()), 'Loading dataset...')
        data_sampler = Sampler(load_dataset(enc, dataset, combine))
        print(str(hvd.local_rank()), 'dataset has', data_sampler.total_size,
              'tokens')

        print(str(hvd.local_rank()), 'Training...')

        counter_path = os.path.join(run_dir, 'counter')
        counter = 1
        if os.path.exists(counter_path):
            # Resuming: continue one past the saved step so we do not
            # immediately save again.
            with open(counter_path, 'r') as counter_file:
                counter = int(counter_file.read()) + 1

        def save():
            """Writes a checkpoint and records the current step number."""
            maketree(run_dir)
            print('Saving', os.path.join(run_dir, 'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(run_dir, 'model'),
                       global_step=counter)
            with open(counter_path, 'w') as counter_file:
                counter_file.write(str(counter) + '\n')

        def generate_samples():
            """Generates `sample_num` samples, prints and stores them."""
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: batch_size * [context_tokens]})
                remaining = min(sample_num - index, batch_size)
                for row in range(remaining):
                    decoded = enc.decode(out[row])
                    text = '======== SAMPLE {} ========\n{}\n'.format(
                        index + 1, decoded)
                    all_text.append(text)
                    index += 1
                    print(text)
            maketree(os.path.join(SAMPLE_DIR, run_name))
            samples_path = os.path.join(SAMPLE_DIR, run_name,
                                        'samples-{}').format(counter)
            with open(samples_path, 'w') as samples_file:
                samples_file.write('\n'.join(all_text))

        # Exponentially weighted running loss: numerator and normaliser.
        loss_sum, loss_weight = 0.0, 0.0
        start_time = time.time()

        try:
            while True:
                batch = [data_sampler.sample(1024) for _ in range(batch_size)]

                _, lv = sess.run((train_op, loss), feed_dict={context: batch})

                loss_sum = loss_sum * 0.99 + lv
                loss_weight = loss_weight * 0.99 + 1.0

                # Only rank 0 saves, samples and reports progress.
                if hvd.rank() == 0:
                    if counter % save_every == 0:
                        save()
                    if counter % sample_every == 0:
                        generate_samples()

                    print(
                        '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                        .format(counter=counter,
                                time=time.time() - start_time,
                                loss=lv,
                                avg=loss_sum / loss_weight))

                counter += 1
        except KeyboardInterrupt:
            print('interrupted')
            if hvd.rank() == 0:
                save()
def main():
    """Create the model and start the training.

    Builds the DeepCoral model on a source/target domain pair, trains it
    with Horovod-distributed momentum SGD, and periodically evaluates on
    both domains. The rank-0 worker is the only one that writes summaries,
    prints progress, evaluates and saves checkpoints; a checkpoint is saved
    whenever the target-domain accuracy improves.

    Gradients are obtained through ``DistributedOptimizer.compute_gradients``
    because that is where Horovod performs the allreduce. Passing the output
    of plain ``tf.gradients`` to ``apply_gradients`` would skip the
    allreduce and leave every worker training on its own.
    """
    hvd.init()
    args = get_arguments()
    args.snapshot_dir = args.snapshot_dir.replace(
        'DeepDICD/', 'DeepDICD/' + args.model_name + '-' + args.domain + '-')
    print(toMagenta(args.snapshot_dir))

    # `args.domain` has the form "<source>-<target>"; swap the default list
    # files (amazon -> dslr) for the requested domains.
    ss = args.domain.split('-')
    if ss[0] == 'D':
        args.list1 = args.list1.replace('amazon.txt', 'dslr.txt')
    elif ss[0] == 'W':
        args.list1 = args.list1.replace('amazon.txt', 'webcam.txt')
    if ss[1] == 'A':
        args.list2 = args.list2.replace('dslr.txt', 'amazon.txt')
    elif ss[1] == 'W':
        args.list2 = args.list2.replace('dslr.txt', 'webcam.txt')
    print(toMagenta(args.list1))
    print(toMagenta(args.list2))
    start_steps = args.start_steps

    # construct data generator
    # Count the examples per domain; `with` closes the files even on error.
    with open(args.list1) as src_list:
        num1 = len(src_list.readlines())
    with open(args.list2) as tar_list:
        num2 = len(tar_list.readlines())

    steps_per_epoch = int((num1 / (args.batch_size)))
    num_steps = int(steps_per_epoch * args.num_epochs)
    val_num_steps = int(num2 / args.batch_size)

    print(toCyan('src domain: {:d}, tar domain {:d}'.format(num1, num2)))
    print(
        toCyan('steps_per_epoch x num_epochs:{:d} x {:d}'.format(
            steps_per_epoch, args.num_epochs)))

    myDataloader = Dataloader(args.img_dir, args.list1, args.list2,
                              args.batch_size, args.h, args.w,
                              args.num_threads)

    src_img = myDataloader.simg_batch
    src_label = myDataloader.slabel_batch
    tar_img = myDataloader.timg_batch
    tar_label = myDataloader.tlabel_batch

    coord = tf.train.Coordinator()

    # Learning-rate schedule: base / (1 + 0.001 * epochs_elapsed) ** 0.75,
    # where the current step is fed through `step_ph`.
    baseLR1 = tf.constant(args.lr1)
    baseLR2 = tf.constant(args.lr2)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    lr1 = baseLR1 / tf.pow(1 + 0.001 * step_ph / steps_per_epoch, 0.75)
    lr2 = baseLR2 / tf.pow(1 + 0.001 * step_ph / steps_per_epoch, 0.75)

    keep_prob = tf.placeholder(dtype=tf.float32, shape=())
    loss_balance = tf.constant(1.)

    model = DeepCoralModel(args, keep_prob, src_img, src_label, tar_img,
                           tar_label)
    model.build_losses(loss_balance)  # loss_balance
    model.build_outputs()
    summary_ = model.build_summary()
    loss = model.loss

    # Gets moving_mean and moving_variance update operations from
    # tf.GraphKeys.UPDATE_OPS
    # The explicit comparison is kept because the flag's type is defined
    # outside this function.
    if args.no_update_mean_var == True:
        update_ops = None
    else:
        print(toMagenta('updating mean and var in batchnorm'))
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Split the variables into four groups: layers other than 'fc8' are
    # fine-tuned with lr1, 'fc8' is retrained with lr2; within each group
    # the biases use twice the learning rate of the weights.
    all_trainable_var = [v for v in tf.global_variables()]
    fine_tune_var = [v for v in all_trainable_var if 'fc8' not in v.name]
    fine_tune_var_weights = [v for v in fine_tune_var if 'weights' in v.name]
    fine_tune_var_bias = [v for v in fine_tune_var if 'bias' in v.name]
    retrain_var = [v for v in all_trainable_var if 'fc8' in v.name]
    retrain_var_weights = [v for v in retrain_var if 'weights' in v.name]
    retrain_var_bias = [v for v in retrain_var if 'bias' in v.name]

    def _distributed_momentum_op(learning_rate, var_list):
        """Builds a Horovod-averaged momentum update for `var_list`."""
        optimizer = hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(learning_rate, args.momentum))
        # compute_gradients (not tf.gradients) so that Horovod allreduces
        # the gradients across workers before they are applied.
        grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
        return optimizer.apply_gradients(grads_and_vars)

    with tf.control_dependencies(update_ops):
        # Learning rates are scaled by the number of workers.
        train_op_1_1 = _distributed_momentum_op(lr1 * hvd.size(),
                                                fine_tune_var_weights)
        train_op_1_2 = _distributed_momentum_op(2 * lr1 * hvd.size(),
                                                fine_tune_var_bias)
        train_op_2_1 = _distributed_momentum_op(lr2 * hvd.size(),
                                                retrain_var_weights)
        train_op_2_2 = _distributed_momentum_op(2 * lr2 * hvd.size(),
                                                retrain_var_bias)
        train_op = tf.group(train_op_1_1, train_op_1_2, train_op_2_1,
                            train_op_2_2)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Horovod: pin this process to the GPU matching its local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    init_local = tf.local_variables_initializer()
    init = tf.global_variables_initializer()

    # construct summary
    summary_.append(tf.summary.scalar('train/lr1', lr1))
    summary_.append(tf.summary.scalar('train/lr2', lr2))
    summary_.append(tf.summary.scalar('train/loss_balance', loss_balance))

    summary_merged = tf.summary.merge(summary_)
    is_chief = hvd.rank() == 0
    if is_chief:
        FinalSummary = tf.summary.FileWriter(args.snapshot_dir, sess.graph)

    # init
    sess.run([init_local, init])
    bcast = hvd.broadcast_global_variables(0)

    # Saver for storing checkpoints of the model.
    var = tf.global_variables()
    skip_var = ['fc8']
    saver = tf.train.Saver(var_list=var, max_to_keep=5)

    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    if ckpt and ckpt.model_checkpoint_path and args.resume:
        loader = tf.train.Saver(var_list=var)
        load(loader, sess, ckpt.model_checkpoint_path)
    elif not args.not_load_pretrained:
        print(toRed('Restore from pre-trained model...' + args.restore_from))
        model.load_initial_weights(sess, args.restore_from, skip_var)

    # Broadcast after the weights are loaded so that all workers start from
    # rank 0's state, whether it was random, pre-trained or resumed.
    sess.run(bcast)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    acc2_history = 0
    for step in range(start_steps, num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step, keep_prob: 0.5}
        summary, total_loss, _ = sess.run([summary_merged, loss, train_op],
                                          feed_dict=feed_dict)

        # The summary writer exists only on rank 0, so reporting,
        # evaluation and checkpointing are all restricted to that worker.
        if not is_chief:
            continue

        FinalSummary.add_summary(summary, step)
        duration = time.time() - start_time
        remain_time = duration * (num_steps - step) / 3600
        print(
            '\r',
            toCyan(
                '{:s}:{:d}-{:d}-{:d} total loss = {:.3f},({:.3f} sec/step, ERT: {:.3f})'
                .format(args.model_name + '-' + args.domain,
                        step % steps_per_epoch, step // steps_per_epoch,
                        args.num_epochs, total_loss, duration, remain_time)),
            end='')

        if step % args.test_every == 0:
            acc1, acc2 = 0, 0
            for jj in range(val_num_steps):
                # Dropout is disabled for evaluation.
                feed_dict = {keep_prob: 1}
                src_acc, tar_acc = sess.run([model.src_acc, model.tar_acc],
                                            feed_dict=feed_dict)
                acc1 += np.sum(src_acc)
                acc2 += np.sum(tar_acc)

            acc1 = acc1 / (val_num_steps * args.batch_size)
            acc2 = acc2 / (val_num_steps * args.batch_size)

            test_summary = tf.Summary()
            test_summary.value.add(tag='test/source_accuracy',
                                   simple_value=acc1)
            test_summary.value.add(tag='test/target_accuracy',
                                   simple_value=acc2)
            FinalSummary.add_summary(test_summary, step)

            # Keep only checkpoints that improve target-domain accuracy.
            if acc2 > acc2_history:
                save(saver, sess, args.snapshot_dir, step)
                acc2_history = acc2

    coord.request_stop()
    coord.join(threads)
    sess.close()
config.gpu_options.allow_growth = False config.gpu_options.visible_device_list = '' if args.eager: tf.enable_eager_execution(config) # Set up standard model. model = getattr(applications, args.model)(weights=None) opt = tf.train.GradientDescentOptimizer(0.01) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. opt = hvd.DistributedOptimizer(opt, compression=compression) init = tf.global_variables_initializer() bcast_op = hvd.broadcast_global_variables(0) data = tf.random_uniform([args.batch_size, 224, 224, 3]) target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) def loss_function(): logits = model(data, training=True) return tf.losses.sparse_softmax_cross_entropy(target, logits) def log(s, nl=True): if hvd.rank() != 0:
def main(_):
    """Trains the MNIST model, with or without Horovod.

    Horovod is used unless ``FLAGS.no_horovod`` is truthy. With Horovod the
    learning rate is multiplied by the number of workers, the optimizer is
    wrapped in ``hvd.DistributedOptimizer``, the number of steps is divided
    by the number of workers, and only rank 0 gets a checkpoint directory.

    Args:
        _: Unused positional argument (presumably argv from the app runner).
    """
    start_time = datetime.now()
    tf.logging.info("Starting at: {}".format(start_time))
    tf.logging.info("Batch size: {} images per step".format(FLAGS.batch_size))

    use_horovod = not FLAGS.no_horovod

    if use_horovod:
        # Initialize Horovod.
        hvd.init()

    # Download MNIST dataset.
    mnist = input_data.read_data_sets(FLAGS.data_path, one_hot=False)

    # Input tensors
    with tf.name_scope("input"):
        image = tf.placeholder(tf.float32, [None, 784], name="image")
        label = tf.placeholder(tf.float32, [None], name="label")

    # Define model
    predict, loss, accuracy = get_model(image, label)

    if use_horovod:
        # Horovod: adjust learning rate based on number workers
        opt = tf.train.RMSPropOptimizer(FLAGS.learning_rate * hvd.size())
        # Wrap optimizer with Horovod Distributed Optimizer. This uses the
        # same condition as every other Horovod branch in this function; an
        # `is None` test on the flag would skip the wrapper whenever the
        # flag is False, and the workers would never average gradients.
        opt = hvd.DistributedOptimizer(opt)
    else:
        opt = tf.train.RMSPropOptimizer(FLAGS.learning_rate)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # Horovod: each worker runs a share of the total number of steps.
    if use_horovod:
        last_step = FLAGS.total_steps // hvd.size()
    else:
        last_step = FLAGS.total_steps

    def formatter_log(tensors):
        """Formats the logged tensors, tagging the worker under Horovod."""
        if FLAGS.no_horovod:
            logstring = "Step {} of {}: " \
                        " training loss = {:.4f}," \
                        " training accuracy = {:.4f}".\
                format(tensors["step"], last_step,
                       tensors["loss"], tensors["accuracy"])
        else:
            logstring = "HOROVOD (Worker #{}), Step {} of {}: " \
                        " training loss = {:.4f}," \
                        " training accuracy = {:.4f}".\
                format(hvd.rank(), tensors["step"], last_step,
                       tensors["loss"], tensors["accuracy"])
        return logstring

    hooks = [
        tf.train.StopAtStepHook(last_step=last_step),

        # Prints the loss and step every log_steps steps
        tf.train.LoggingTensorHook(tensors={
            "step": global_step,
            "loss": loss,
            "accuracy": accuracy
        },
                                   every_n_iter=FLAGS.log_steps,
                                   formatter=formatter_log),
    ]

    if use_horovod:
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

        # Horovod: save checkpoints only on worker 0 to prevent other
        # workers from corrupting them.
        if hvd.rank() == 0:
            checkpoint_dir = "{}/{}-workers/{}".\
                format(FLAGS.output_path, hvd.size(),
                       datetime.now().strftime("%Y%m%d-%H%M%S"))
        else:
            checkpoint_dir = None
    else:
        checkpoint_dir = "{}/no_hvd/{}".\
            format(FLAGS.output_path,
                   datetime.now().strftime("%Y%m%d-%H%M%S"))

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint,
    # and closing when done or an error occurs.
    # NOTE(review): `config` is not defined in this function; presumably it
    # is a module-level session config - confirm.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=checkpoint_dir,
            hooks=hooks,
            save_summaries_steps=FLAGS.log_steps,
            log_step_count_steps=FLAGS.log_steps,
            config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(FLAGS.batch_size)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    stop_time = datetime.now()
    tf.logging.info("Stopping at: {}".format(stop_time))
    tf.logging.info("Elapsed time was: {}".format(stop_time - start_time))
def main(FLAGS):
    """Runs wide & deep training, evaluation, prediction or benchmarking.

    The phase is chosen from ``FLAGS.predict``, ``FLAGS.evaluate`` and
    ``FLAGS.benchmark``; when neither predict nor evaluate is set the model
    is trained. With ``FLAGS.hvd`` the run is distributed with Horovod and
    only processes with local rank 0 produce full logs and write
    ``args.json``.

    Args:
        FLAGS: Parsed command-line arguments; accessed as attributes and
            dumped with ``vars(FLAGS)``.
    """
    if FLAGS.hvd:
        hvd.init()
        num_gpus = hvd.size()
    else:
        num_gpus = 1

    # True for the single process, or for local rank 0 under Horovod.
    is_local_chief = not FLAGS.hvd or hvd.local_rank() == 0

    if is_local_chief:
        tf.logging.set_verbosity(tf.logging.INFO)
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)
        log_backends = [
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ]
    else:
        # Other local ranks stay quiet: errors only, no dllogger backends.
        tf.logging.set_verbosity(tf.logging.ERROR)
        log_backends = []
    dllogger.init(backends=log_backends)

    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    create_batches = FLAGS.batch_size // FLAGS.prebatch_size

    wide_columns, deep_columns = get_feature_columns(
        use_all_columns=FLAGS.use_all_columns)
    tf_transform_output = tft.TFTransformOutput(
        FLAGS.transformed_metadata_path)

    if is_local_chief:
        tf.compat.v1.logging.warn('command line arguments: {}'.format(
            json.dumps(vars(FLAGS))))
        if not os.path.exists(FLAGS.results_dir):
            os.mkdir(FLAGS.results_dir)

        with open('{}/args.json'.format(FLAGS.results_dir), 'w') as args_file:
            json.dump(vars(FLAGS), args_file, indent=4)

    # Session configuration: hide all GPUs unless FLAGS.gpu is set.
    proto_kwargs = {'log_device_placement': FLAGS.log_device_placement}
    if not FLAGS.gpu:
        proto_kwargs['device_count'] = {'GPU': 0}
    session_config = tf.compat.v1.ConfigProto(**proto_kwargs)

    if FLAGS.hvd:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.xla:
        session_config.graph_options.optimizer_options.global_jit_level = \
            tf.OptimizerOptions.ON_1

    # Benchmark runs do not use a model directory.
    model_dir = None if FLAGS.benchmark else FLAGS.model_dir

    # Checkpoint by step count when it is non-zero, otherwise by seconds.
    replace_kwargs = {
        'session_config': session_config,
        'keep_checkpoint_max': 1,
    }
    if FLAGS.save_checkpoints_steps != 0:
        replace_kwargs['save_checkpoints_steps'] = FLAGS.save_checkpoints_steps
    else:
        replace_kwargs['save_checkpoints_secs'] = FLAGS.save_checkpoints_secs
    run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
        **replace_kwargs)

    wide_optimizer = tf.compat.v1.train.FtrlOptimizer(
        learning_rate=FLAGS.linear_learning_rate,
        l1_regularization_strength=FLAGS.linear_l1_regularization,
        l2_regularization_strength=FLAGS.linear_l2_regularization)

    deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer(
        learning_rate=FLAGS.deep_learning_rate,
        initial_accumulator_value=0.1,
        l1_regularization_strength=FLAGS.deep_l1_regularization,
        l2_regularization_strength=FLAGS.deep_l2_regularization,
        use_locking=False)

    if FLAGS.hvd:
        wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
        deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)

    stats_filename = os.path.join(FLAGS.transformed_metadata_path,
                                  'stats.json')
    embed_columns = None

    # input functions to read data from disk
    def train_input_fn():
        """Input function over the training data pattern."""
        return separate_input_fn(
            tf_transform_output,
            FLAGS.train_data_pattern,
            create_batches,
            tf.estimator.ModeKeys.TRAIN,
            reader_num_threads=FLAGS.reader_num_threads,
            parser_num_threads=FLAGS.parser_num_threads,
            shuffle_buffer_size=int(FLAGS.shuffle_percentage *
                                    create_batches),
            prefetch_buffer_size=FLAGS.prefetch_buffer_size,
            print_display_ids=FLAGS.print_display_ids)

    def eval_input_fn():
        """Input function over the evaluation data pattern."""
        return separate_input_fn(
            tf_transform_output,
            FLAGS.eval_data_pattern,
            (FLAGS.eval_batch_size // FLAGS.prebatch_size),
            tf.estimator.ModeKeys.EVAL,
            reader_num_threads=1,
            parser_num_threads=1,
            shuffle_buffer_size=int(FLAGS.shuffle_percentage *
                                    create_batches),
            prefetch_buffer_size=FLAGS.prefetch_buffer_size,
            print_display_ids=FLAGS.print_display_ids)

    estimator = construct_estimator(FLAGS.model_type,
                                    not FLAGS.canned_estimator,
                                    run_config,
                                    wide_columns,
                                    wide_optimizer,
                                    deep_columns,
                                    FLAGS.deep_hidden_units,
                                    FLAGS.deep_dropout,
                                    deep_optimizer,
                                    amp=FLAGS.amp)

    for metric_fn in (map_custom_metric, map_custom_metric_with_leak):
        estimator = tf.estimator.add_metrics(estimator, metric_fn)

    steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size

    print('Steps per epoch: {}'.format(steps_per_epoch))

    max_steps = int(FLAGS.num_epochs * steps_per_epoch)

    hooks = []
    if FLAGS.hvd:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.predict or FLAGS.evaluate:
        # inference
        if FLAGS.benchmark:
            benchmark_hook = BenchmarkLoggingHook(
                global_batch_size=num_gpus * FLAGS.eval_batch_size,
                warmup_steps=FLAGS.benchmark_warmup_steps)
            hooks.append(benchmark_hook)
            eval_steps = FLAGS.benchmark_steps
        else:
            eval_steps = FLAGS.eval_steps

        predict_result_iter = estimator.predict(input_fn=eval_input_fn,
                                                hooks=hooks,
                                                yield_single_examples=False)

        results = []
        for batch_index, batch_result in enumerate(predict_result_iter):
            print('predicting batch: ', batch_index)
            results.append(batch_result)
            # TODO: use eval_steps
            if batch_index >= eval_steps - 1:
                break

        # Benchmark takes precedence over evaluate, which takes precedence
        # over predict.
        if FLAGS.benchmark:
            infer_throughput = benchmark_hook.mean_throughput.value()
            dllogger.log(data={'infer_throughput': infer_throughput},
                         step=tuple())
        elif FLAGS.evaluate:
            print(
                'evaluating using estimator.evaluate with eval_batch_size = ',
                FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps)

            result = estimator.evaluate(eval_input_fn,
                                        hooks=hooks,
                                        steps=FLAGS.eval_steps)
            dllogger.log(step=(),
                         data={
                             'map_infer': float(result['map']),
                             'map_with_leak_infer':
                             float(result['map_with_leak'])
                         })
        elif FLAGS.predict:
            scores = np.hstack([r['probabilities'][:, 1] for r in results])
            scores_path = os.path.join(FLAGS.model_dir, 'scores.txt')
            print('saving the numpy scores array to: ', scores_path)
            np.savetxt(scores_path, scores, fmt="%f", delimiter='\n')
        return

    # training
    if FLAGS.benchmark:
        benchmark_hook = BenchmarkLoggingHook(
            global_batch_size=num_gpus * FLAGS.batch_size,
            warmup_steps=FLAGS.benchmark_warmup_steps)
        hooks.append(benchmark_hook)
        estimator.train(train_input_fn,
                        hooks=hooks,
                        steps=FLAGS.benchmark_steps)
        train_throughput = benchmark_hook.mean_throughput.value()
        dllogger.log(data={'train_throughput': train_throughput},
                     step=tuple())
        return

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=max_steps,
                                        hooks=hooks)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      throttle_secs=FLAGS.eval_throttle_secs,
                                      steps=FLAGS.eval_steps)
    result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if result:
        dllogger.log(step=(),
                     data={
                         'map': float(result[0]['map']),
                         'map_with_leak': float(result[0]['map_with_leak'])
                     })
def _set_train_or_infer(self, res, reverse_target_vocab_table, hparams):
    """Wires up the mode-specific outputs, optimizer and summaries.

    Args:
        res: Graph-building result. ``res[1]`` is used as the loss in TRAIN
            and EVAL modes; in INFER mode it is unpacked into four values
            (logits, an ignored value, final context state, sample ids).
        reverse_target_vocab_table: Lookup table used to turn sample ids
            into words in INFER mode.
        hparams: Hyperparameters; reads ``learning_rate``, ``optimizer``,
            ``colocate_gradients_with_ops`` and ``max_gradient_norm``.

    Raises:
        ValueError: If ``hparams.optimizer`` is neither "sgd" nor "adam".
    """
    mode_keys = tf.contrib.learn.ModeKeys
    is_train = self.mode == mode_keys.TRAIN
    is_infer = self.mode == mode_keys.INFER

    if is_train:
        self.train_loss = res[1]
        source_words = tf.reduce_sum(self.iterator.source_sequence_length)
        target_words = tf.reduce_sum(self.iterator.target_sequence_length)
        self.word_count = source_words + target_words
    elif self.mode == mode_keys.EVAL:
        self.eval_loss = res[1]
    elif is_infer:
        self.infer_logits, _, self.final_context_state, self.sample_id = res
        self.sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(self.sample_id))

    if not is_infer:
        # Number of predicted words, used to compute perplexity.
        self.predict_count = tf.reduce_sum(
            self.iterator.target_sequence_length)

    params = tf.trainable_variables()

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if is_train:
        # Each step below reassigns self.learning_rate in turn:
        # constant -> warm-up -> decay.
        self.learning_rate = tf.constant(hparams.learning_rate)
        self.learning_rate = self._get_learning_rate_warmup(hparams)
        self.learning_rate = self._get_learning_rate_decay(hparams)

        # Optimizer
        optimizer_classes = {
            "sgd": tf.train.GradientDescentOptimizer,
            "adam": tf.train.AdamOptimizer,
        }
        optimizer_cls = optimizer_classes.get(hparams.optimizer)
        if optimizer_cls is None:
            raise ValueError("Unknown optimizer type %s" % hparams.optimizer)

        # Add Horovod Distributed Optimizer
        opt = hvd.DistributedOptimizer(optimizer_cls(self.learning_rate))

        # Horovod compute_gradients: the gradients are allreduced across
        # workers before they are returned.
        grads_and_vars = opt.compute_gradients(
            self.train_loss,
            params,
            colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
        gradients, _ = zip(*grads_and_vars)

        clip_result = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)
        clipped_grads, self.grad_norm_summary, self.grad_norm = clip_result

        self.update = opt.apply_gradients(zip(clipped_grads, params),
                                          global_step=self.global_step)

        # Summary
        self.train_summary = self._get_train_summary()
    elif is_infer:
        self.infer_summary = self._get_infer_summary(hparams)

    # Print trainable variables
    utils.print_out("# Trainable variables")
    utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
    for variable in params:
        utils.print_out(
            "  %s, %s, %s" %
            (variable.name, str(variable.get_shape()), variable.op.device))