def run_mnist_eager():
    """Run the MNIST training and evaluation loop in eager mode.

    Each DDL rank uses its own data directory and its own checkpoint
    directory (both paths embed ``ddl.rank()``).  The training set is sharded
    with ``shard(ddl.size(), ddl.rank())``; the test set is evaluated in full
    by every rank.  A checkpoint is saved after every epoch.

    Reads ``batch_size`` and ``train_epochs`` from the enclosing scope.
    """
    rank = ddl.rank()
    data_dir = '/tmp/tensorflow/mnist/input_data' + str(rank)
    model_dir = '/tmp/tensorflow/mnist/checkpoints/' + str(rank) + '/'

    # Start from a clean checkpoint directory.  The directory is private to
    # this rank, so every rank must remove its own copy: guarding on
    # local_rank() == 0 left the other ranks' stale checkpoints in place, and
    # those would then be picked up by checkpoint.restore() below.
    if os.path.isdir(model_dir):
        shutil.rmtree(model_dir)

    data_format = 'channels_first'

    # Load the datasets; only the training data is split across ranks.
    train_ds, _ = mnist_dataset.train(data_dir, (1, 28, 28), label_int=True)
    train_ds = train_ds.shard(ddl.size(), rank).shuffle(60000).batch(batch_size)

    test_ds, _ = mnist_dataset.test(data_dir, (1, 28, 28), label_int=True)
    test_ds = test_ds.batch(batch_size)

    # Create the model and optimizer
    model = create_model(data_format)
    optimizer = tf.train.MomentumOptimizer(0.01, 0.5)

    # No summary directories are configured, so both writers are created
    # with a logdir of None.
    train_dir = None
    test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(model_dir, 'ckpt-r' + str(rank))
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(model_dir))

    # Train and evaluate for a set number of epochs.
    for _ in range(train_epochs):
        start = time.time()
        with summary_writer.as_default():
            train(model, optimizer, train_ds, step_counter, 10)
        end = time.time()

        # Only rank 0 reports timing, to avoid one line per rank.
        if rank == 0:
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(),
                   end - start))

        with test_summary_writer.as_default():
            test(model, test_ds)
        checkpoint.save(checkpoint_prefix)
warnings.filterwarnings("ignore") K.set_image_data_format('channels_last') # TF dimension ordering in this code gpuOptions = tensorflow.GPUOptions(allow_growth=True) sess = tensorflow.Session(config=tensorflow.ConfigProto( gpu_options=gpuOptions)) K.set_session(sess) img_rows = 512 img_cols = 512 NUM_CLASSES = 4 print('ddl.size():', ddl.size()) NUM_BATCHSIZE = 16 NUM_LRATE = 1e-4 * ddl.size() NUM_MAX_EPOCHS = 50 smooth = 1. ROOT = 'datapath' def load_train_data(): imgs_train = np.load( os.path.join(ROOT, 'img_20191216_aug_' + str(ddl.rank()) + '.npy')) imgs_mask_train = np.load(
def main(_):
    """Train the MNIST network on this rank's shard of the training set.

    Builds a ``tf.data`` pipeline sharded by ``ddl.size()`` / ``ddl.rank()``,
    constructs the model with ``deepnn`` and runs Adam steps while
    ``step * batch_size < FLAGS.num_iterations``, printing the minibatch loss
    and accuracy every ``display_step`` steps.

    Args:
        _: Unused.
    """
    # Parameters
    learning_rate = 0.001
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    ##########################################################################
    # Import MNIST data
    ##########################################################################
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    train_set, _ = dataset.train(data_dir, (28, 28, 1), VARTYPE)
    train_set = train_set.shard(ddl.size(), ddl.rank())
    train_set = (train_set.batch(batch_size)
                 .cache()
                 .shuffle(buffer_size=1000)
                 .repeat())
    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    # NOTE(review): keep_prob is never fed below — presumably deepnn gives it
    # a default value; confirm.
    pred, keep_prob = deepnn(X_train)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_train,
                                                       logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-4)
        objective = optimizer.minimize(cost)

    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1),
                                      tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, VARTYPE)
        accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())
    # Nothing else is written through this writer; close it so the graph
    # event is flushed to disk instead of being left in the writer's queue.
    train_writer.close()

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())

        # NOTE(review): this variable is created and initialised after the
        # model is built and is not used by it — confirm it is still needed.
        my_variable = bias_variable([5, 5, 1, 32])
        sess.run(my_variable.initializer)

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy])
                print("DDL " + str(ddl.rank()) + "] Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(loss) + ", Training Accuracy= " +
                      "{:.5f}".format(acc))
            step += 1

        print("DDL "+str(ddl.rank())+"] Optimization Finished!")

        # `accuracy` is wired to the training iterator, so this reports the
        # accuracy on one more training batch rather than on a test set.
        print("DDL "+str(ddl.rank())+"] Testing Accuracy:",
              sess.run(accuracy))
# Input image dimensions.
img_rows, img_cols = 28, 28

# Data directory; the rank is appended so each rank uses its own path.
data_dir = "/tmp/mnist_convnet_model_data" + str(ddl.rank())

# Put the single channel axis where the Keras backend expects it.
input_shape = ((1, img_rows, img_cols)
               if K.image_data_format() == 'channels_first'
               else (img_rows, img_cols, 1))

# Training data: keep only this rank's shard, then cache, shuffle, batch
# and repeat.
train_set, num_of_train_imgs = dataset.train(data_dir, input_shape)
train_set = (train_set.shard(ddl.size(), ddl.rank())
             .cache()
             .shuffle(buffer_size=1000)
             .batch(batch_size)
             .repeat())

# Evaluation data: remember the unsharded dataset and its size before
# taking this rank's shard.
eval_set, num_of_test_imgs = dataset.test(data_dir, input_shape)
eval_full = eval_set
eval_set = (eval_set.shard(ddl.size(), ddl.rank())
            .batch(batch_size)
            .repeat())

num_of_all_test_imgs = num_of_test_imgs
# Per-rank image counts (true division, as in the in-place `/=` form).
num_of_train_imgs = num_of_train_imgs / ddl.size()
num_of_test_imgs = num_of_test_imgs / ddl.size()

model = Sequential()
model.add(
    Conv2D(32,
           kernel_size=(3, 3),
           activation='relu',
           input_shape=input_shape))
def main():
    """Train the MNIST network with DDL, checkpointing from rank 0.

    The inference part of the graph is built first and exported as a text
    meta graph by the process whose ``OMPI_COMM_WORLD_RANK`` is ``"0"``.
    ``ddl`` is imported only after that export, then the loss, optimizer and
    accuracy ops are added and the training loop runs.  Each step draws
    ``batch_size * ddl.size()`` samples and keeps the slice belonging to
    ``ddl.rank()``.
    """
    ##########################################################################
    # Import MNIST data
    ##########################################################################
    mnist = input_data.read_data_sets(training_data_dir)

    # Parameters
    learning_rate = 0.001
    training_iters = 2500
    batch_size = 100
    display_step = 1

    # Network Parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    def is_mpi_rank_zero():
        # Usable before ddl has been imported.
        return os.getenv("OMPI_COMM_WORLD_RANK") == "0"

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, n_input], name="x")

    # Construct model
    keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keepprob")
    logits = deepnn(x, 1.0)
    named_logits = tf.identity(logits, name="pRes")

    if is_mpi_rank_zero():
        print("writing checkpoint file", chkptpath+"_basegraph.meta")
        tf.train.export_meta_graph(chkptpath+"_basegraph.meta", as_text=True)

    # import the ddl library; this creates objects for distribution so
    # it must be done after exporting meta graph
    import ddl

    y = tf.placeholder(tf.int64, [None], name="y")

    # Define loss and optimizer
    with tf.name_scope('loss'):
        loss_op = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits))

    with tf.name_scope('adam_optimizer'):
        adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = adam.minimize(loss_op)

    predictor = tf.argmax(logits, 1, name="predictor")

    # Evaluate model
    with tf.name_scope('accuracy'):
        hits = tf.cast(tf.equal(predictor, y), tf.float32)
        accuracy = tf.reduce_mean(hits)

    saver = tf.train.Saver()

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            ###################################################
            ### USE ddl.rank() and ddl.size() to load data ###
            ###################################################
            batch_x, batch_y = mnist.train.next_batch(batch_size*ddl.size())

            # select one of partitions
            batch_x = np.split(batch_x, ddl.size())[ddl.rank()]
            batch_y = np.split(batch_y, ddl.size())[ddl.rank()]
            feed = {x: batch_x, y: batch_y}

            # Run optimization op (backprop)
            sess.run(train_op, feed_dict=feed)

            if step % display_step == 0:
                # Calculate batch loss and accuracy
                batch_loss, batch_acc = sess.run([loss_op, accuracy],
                                                 feed_dict=feed)
                print("DDL " + str(ddl.rank()) + "] Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(batch_loss) +
                      ", Training Accuracy= " +
                      "{:.5f}".format(batch_acc))

            step += 1

            # Checkpoint every tenth step, from MPI rank 0 only.
            if is_mpi_rank_zero() and step%10==0 and step!=0:
                saver.save(sess, chkptpath, global_step=step)
                print('[%d] save checkpoint' % step+" path: "+chkptpath)

        print("DDL "+str(ddl.rank())+"] Optimization Finished!")

        # Calculate accuracy for 256 mnist test images
        test_feed = {x: mnist.test.images[:256],
                     y: mnist.test.labels[:256]}
        print("DDL "+str(ddl.rank())+"] Testing Accuracy:",
              sess.run(accuracy, feed_dict=test_feed))
def main(_):
    """Train the MNIST network using explicit DDL calls.

    Instead of relying on ``DDL_OPTIONS``, this entry point calls
    ``ddl.init`` itself and passes the optimizer's gradients through
    ``ddl.grads_reduce(..., average=True)`` before applying them.  Training
    data is sharded with ``shard(ddl.size(), ddl.rank())`` and steps run
    while ``step * batch_size < FLAGS.num_iterations``.

    Args:
        _: Unused.
    """
    # Note: Not using DDL_OPTIONS; doing explicit DDL calls!
    # Explicit initialization call:
    ddl.init(FLAGS.ddl_options)

    # Parameters
    learning_rate = 0.001
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    ##########################################################################
    # Import MNIST data
    ##########################################################################
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    train_set, _ = dataset.train(data_dir, (28, 28, 1))
    train_set = train_set.shard(ddl.size(), ddl.rank())
    train_set = (train_set.batch(batch_size)
                 .cache()
                 .shuffle(buffer_size=1000)
                 .repeat())
    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    # NOTE(review): keep_prob is never fed below — presumably deepnn gives it
    # a default value; confirm.
    pred, keep_prob = deepnn(X_train)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_train,
                                                       logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(cost)

        # Reduce the (gradient, variable) pairs across the cluster with an
        # explicit DDL call, averaging the gradients, then apply the result.
        # Note: all zipping is hidden
        grads_and_vars = ddl.grads_reduce(grads_and_vars, average=True)
        objective = optimizer.apply_gradients(grads_and_vars)

    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1),
                                      tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())
    # Nothing else is written through this writer; close it so the graph
    # event is flushed to disk instead of being left in the writer's queue.
    train_writer.close()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy])
                print("DDL " + str(ddl.rank()) + "] Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(loss) + ", Training Accuracy= " +
                      "{:.5f}".format(acc))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # `accuracy` is wired to the training iterator, so this reports the
        # accuracy on one more training batch rather than on a test set.
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              sess.run(accuracy))
# Register the two DDL Keras callbacks.
callbacks.extend([ddl.DDLCallback(), ddl.DDLGlobalVariablesCallback()])

# Normalize into 0~1 range (in place, each set scaled by its own maximum).
x_train /= x_train.max()
x_test /= x_test.max()

# For categorical crossentropy loss, we need to binarize multi-class labels.
y_train_binary = to_categorical(y_train)
y_test_binary = to_categorical(y_test)

# Split the training data into ddl.size() parts for distributed training
# and keep the part that belongs to this rank.
x_train_dist, y_train_dist, y_train_dist_binary = (
    np.array_split(full_array, ddl.size())[ddl.rank()]
    for full_array in (x_train, y_train, y_train_binary))

''' Training step one. Train for NN '''
if model_type in ('triplet', 'contrastive'):
    hidden_layer_sizes = [568, 256, 100]
    model = build_nn(hidden_layer_sizes,
                     x_train_dist.shape[1],
                     l1_reg=l1_reg,
                     l2_reg=l2_reg,
                     activation_func='tanh')[0]
    # Set initial weights as DAE trained weights (skip dropout and batchnorm layers)
    # for layer,weight in zip(model.layers[1:8:3],pretrain_weights):
    #     layer.set_weights(weight)
def train(hparams, scope=None, target_session=""):
    """Train a translation model, one training replica per DDL rank.

    Every rank builds the train, eval and infer graphs and runs training
    steps.  Checkpoint saving, sample decoding and all internal/external
    evaluation are done by rank 0 only.  Each rank writes its own log file
    and its own summary directory (both names embed ``ddl.rank()``).

    Args:
        hparams: Hyper-parameter object.  Attributes read here include
            ``out_dir``, ``num_train_steps``, ``steps_per_stats``,
            ``steps_per_external_eval``, ``avg_ckpts``, ``attention``,
            ``encoder_type``, ``attention_architecture``, ``dev_prefix``,
            ``src``, ``tgt`` and ``metrics``; ``epoch_step`` is updated in
            place.
        scope: Passed through to the ``model_helper.create_*_model`` calls.
        target_session: ``target`` argument of the three ``tf.Session``
            objects created here.

    Raises:
        ValueError: If attention is enabled and
            ``hparams.attention_architecture`` is not a known architecture.
    """
    log_device_placement = hparams.log_device_placement
    out_dir = hparams.out_dir
    num_train_steps = hparams.num_train_steps
    steps_per_stats = hparams.steps_per_stats
    steps_per_external_eval = hparams.steps_per_external_eval
    steps_per_eval = 10 * steps_per_stats
    avg_ckpts = hparams.avg_ckpts

    if not steps_per_external_eval:
        steps_per_external_eval = 5 * steps_per_eval

    if not hparams.attention:
        model_creator = nmt_model.Model
    else:  # Attention
        if (hparams.encoder_type == "gnmt" or
                hparams.attention_architecture in ["gnmt", "gnmt_v2"]):
            model_creator = gnmt_model.GNMTModel
        elif hparams.attention_architecture == "standard":
            model_creator = attention_model.AttentionModel
        else:
            raise ValueError("Unknown attention architecture %s" %
                             hparams.attention_architecture)

    utils.print_out("Detected %d ranks, the current rank is %d " %
                    (ddl.size(), ddl.rank()))

    train_model = model_helper.create_train_model(model_creator,
                                                  hparams,
                                                  scope,
                                                  num_workers=ddl.size(),
                                                  jobid=ddl.rank())
    # NOTE(review): presumably this stops DDL from broadcasting variables of
    # the eval/infer graphs created next — confirm against the DDL docs.
    ddl.disable_bcast()
    eval_model = model_helper.create_eval_model(model_creator, hparams, scope)
    infer_model = model_helper.create_infer_model(model_creator, hparams,
                                                  scope)

    # Preload data for sample decoding.
    dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src)
    dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt)
    sample_src_data = inference.load_data(dev_src_file)
    sample_tgt_data = inference.load_data(dev_tgt_file)

    summary_name = "train_log_rank_%d" % ddl.rank()
    model_dir = hparams.out_dir

    # Log and output files
    log_file = os.path.join(out_dir,
                            "log_%d_rank_%d" % (time.time(), ddl.rank()))
    log_f = tf.gfile.GFile(log_file, mode="a")
    # The log file used to be left open for the life of the process; make
    # sure it is closed whether training finishes or raises.
    try:
        utils.print_out("# log_file=%s" % log_file, log_f)

        # TensorFlow model
        config_proto = utils.get_config_proto(
            log_device_placement=log_device_placement,
            num_intra_threads=hparams.num_intra_threads,
            num_inter_threads=hparams.num_inter_threads)
        train_sess = tf.Session(
            target=target_session, config=config_proto,
            graph=train_model.graph)
        eval_sess = tf.Session(
            target=target_session, config=config_proto,
            graph=eval_model.graph)
        infer_sess = tf.Session(
            target=target_session, config=config_proto,
            graph=infer_model.graph)

        with train_model.graph.as_default():
            loaded_train_model, global_step = (
                model_helper.create_or_load_model(
                    train_model.model, model_dir, train_sess, "train"))

        # Summary writer
        summary_writer = tf.summary.FileWriter(
            os.path.join(out_dir, summary_name), train_model.graph)

        # GJ18: do all evaluations on a single GPU!
        # First evaluation
        if ddl.rank() == 0:
            run_full_eval(
                model_dir, infer_model, infer_sess,
                eval_model, eval_sess, hparams,
                summary_writer, sample_src_data,
                sample_tgt_data, avg_ckpts)

        last_stats_step = global_step
        last_eval_step = global_step
        last_external_eval_step = global_step

        # This is the training loop.
        stats, info, start_train_time = before_train(
            loaded_train_model, train_model, train_sess, global_step,
            hparams, log_f)
        while global_step < num_train_steps:
            ### Run a step ###
            start_time = time.time()
            try:
                step_result = loaded_train_model.train(train_sess)
                hparams.epoch_step += 1
            except tf.errors.OutOfRangeError:
                # Finished going through the training dataset.  Go to next
                # epoch.
                hparams.epoch_step = 0
                if ddl.rank() == 0:
                    utils.print_out(
                        "# Finished an epoch, step %d. Perform external evaluation" %
                        global_step)
                    run_sample_decode(infer_model, infer_sess, model_dir,
                                      hparams, summary_writer,
                                      sample_src_data, sample_tgt_data)
                    run_external_eval(infer_model, infer_sess, model_dir,
                                      hparams, summary_writer)
                    if avg_ckpts:
                        run_avg_external_eval(infer_model, infer_sess,
                                              model_dir, hparams,
                                              summary_writer, global_step)

                # Every rank must rewind its own input iterator.
                train_sess.run(
                    train_model.iterator.initializer,
                    feed_dict={train_model.skip_count_placeholder: 0})
                continue

            # Process step_result, accumulate stats, and write summary
            global_step, info["learning_rate"], step_summary = update_stats(
                stats, start_time, step_result)
            summary_writer.add_summary(step_summary, global_step)

            # Once in a while, we print statistics.
            if global_step - last_stats_step >= steps_per_stats:
                last_stats_step = global_step
                is_overflow = process_stats(
                    stats, info, global_step, steps_per_stats, log_f)
                print_step_info(" ", global_step, info,
                                _get_best_results(hparams), log_f)
                if is_overflow:
                    break

                # Reset statistics
                stats = init_stats()

            if global_step - last_eval_step >= steps_per_eval:
                last_eval_step = global_step
                utils.print_out("# Save eval, global step %d" % global_step)
                utils.add_summary(summary_writer, global_step, "train_ppl",
                                  info["train_ppl"])

                if ddl.rank() == 0:
                    # Save checkpoint
                    loaded_train_model.saver.save(
                        train_sess,
                        os.path.join(out_dir, "translate.ckpt"),
                        global_step=global_step)

                    # Evaluate on dev/test
                    run_sample_decode(infer_model, infer_sess, model_dir,
                                      hparams, summary_writer,
                                      sample_src_data, sample_tgt_data)
                    run_internal_eval(
                        eval_model, eval_sess, model_dir, hparams,
                        summary_writer)

            if global_step - last_external_eval_step >= steps_per_external_eval:
                last_external_eval_step = global_step

                # Save checkpoint
                if ddl.rank() == 0:
                    loaded_train_model.saver.save(
                        train_sess,
                        os.path.join(out_dir, "translate.ckpt"),
                        global_step=global_step)
                    run_sample_decode(infer_model, infer_sess, model_dir,
                                      hparams, summary_writer,
                                      sample_src_data, sample_tgt_data)
                    run_external_eval(
                        infer_model, infer_sess, model_dir, hparams,
                        summary_writer)
                    if avg_ckpts:
                        run_avg_external_eval(infer_model, infer_sess,
                                              model_dir, hparams,
                                              summary_writer, global_step)

        # Done training
        if ddl.rank() == 0:
            loaded_train_model.saver.save(
                train_sess,
                os.path.join(out_dir, "translate.ckpt"),
                global_step=global_step)

            result_summary, _, _ = run_full_eval(
                model_dir, infer_model, infer_sess,
                eval_model, eval_sess, hparams,
                summary_writer, sample_src_data,
                sample_tgt_data, avg_ckpts)
            # result_summary only exists on rank 0, so the final report has
            # to stay inside this guard.
            print_step_info("# Final, ", global_step, info, result_summary,
                            log_f)
        utils.print_time("# Done training!", start_train_time)

        summary_writer.close()

        if ddl.rank() == 0:
            utils.print_out("# Start evaluating saved best models.")
            for metric in hparams.metrics:
                best_model_dir = getattr(hparams, "best_" + metric + "_dir")
                summary_writer = tf.summary.FileWriter(
                    os.path.join(best_model_dir, summary_name),
                    infer_model.graph)
                result_summary, best_global_step, _ = run_full_eval(
                    best_model_dir, infer_model, infer_sess, eval_model,
                    eval_sess, hparams, summary_writer, sample_src_data,
                    sample_tgt_data)
                print_step_info("# Best %s, " % metric, best_global_step,
                                info, result_summary, log_f)
                summary_writer.close()

                if avg_ckpts:
                    best_model_dir = getattr(hparams,
                                             "avg_best_" + metric + "_dir")
                    summary_writer = tf.summary.FileWriter(
                        os.path.join(best_model_dir, summary_name),
                        infer_model.graph)
                    result_summary, best_global_step, _ = run_full_eval(
                        best_model_dir, infer_model, infer_sess, eval_model,
                        eval_sess, hparams, summary_writer, sample_src_data,
                        sample_tgt_data)
                    print_step_info("# Averaged Best %s, " % metric,
                                    best_global_step, info, result_summary,
                                    log_f)
                    summary_writer.close()
    finally:
        log_f.close()
def main(_):
    """Train the MNIST network with DDL and export a SavedModel from rank 0.

    Each step draws ``batch_size * ddl.size()`` samples and trains on the
    slice belonging to ``ddl.rank()``.  After training, accuracy on the first
    256 test images is printed by every rank, and rank 0 writes a SavedModel
    with a ``predict_images`` classification signature to ``model_path``
    (a name defined outside this function).

    Args:
        _: Unused.
    """
    ##########################################################################
    # Import MNIST data
    ##########################################################################
    mnist = input_data.read_data_sets(training_data_dir)

    # Parameters
    learning_rate = 0.001
    training_iters = 2000
    batch_size = 100
    display_step = 1

    # Network Parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    dropout = 0.75  # Dropout, probability to keep units

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, n_input])
    y = tf.placeholder(tf.int64, [None])

    # Construct model
    pred, keep_prob = deepnn(x)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        objective = optimizer.minimize(cost)

    predictor = tf.argmax(pred, 1, name="predictor")

    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(predictor, y)
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())
    # Nothing else is written through this writer; close it so the graph
    # event is flushed to disk instead of being left in the writer's queue.
    train_writer.close()

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            ###################################################
            ### USE ddl.rank() and ddl.size() to load data ###
            ###################################################
            batch_x, batch_y = mnist.train.next_batch(batch_size * ddl.size())

            # select one of partitions
            batch_x = np.split(batch_x, ddl.size())[ddl.rank()]
            batch_y = np.split(batch_y, ddl.size())[ddl.rank()]

            # Run optimization op (backprop)
            sess.run(objective, feed_dict={
                x: batch_x,
                y: batch_y,
                keep_prob: dropout
            })
            if step % display_step == 0:
                # Calculate batch loss and accuracy, with dropout disabled.
                loss, acc = sess.run([cost, accuracy], feed_dict={
                    x: batch_x,
                    y: batch_y,
                    keep_prob: 1.
                })
                print("DDL " + str(ddl.rank()) + "] Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(loss) + ", Training Accuracy= " +
                      "{:.5f}".format(acc))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # Describe the serving signature: images in, predicted class out.
        classification_inputs = tf.saved_model.utils.build_tensor_info(x)
        classification_outputs_classes = (
            tf.saved_model.utils.build_tensor_info(predictor))

        classification_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                        classification_inputs
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                        classification_outputs_classes
                },
                method_name=tf.saved_model.signature_constants.
                CLASSIFY_METHOD_NAME))

        print("classification_signature content:")
        print(classification_signature)

        # Calculate accuracy for 256 mnist test images
        print("DDL "+str(ddl.rank())+"] Testing Accuracy:",
              sess.run(accuracy, feed_dict={x: mnist.test.images[:256],
                                            y: mnist.test.labels[:256],
                                            keep_prob: 1.}))

        # Only rank 0 exports the model.
        if ddl.rank() == 0:
            builder = tf.saved_model.builder.SavedModelBuilder(model_path)
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    'predict_images': classification_signature,
                },
                legacy_init_op=legacy_init_op)

            save_path = str(builder.save())
            print("Model saved in file: %s" % save_path)