def _main_():
    # ####################
    # Setup export path
    # ####################
    # Assumes `os`, `tf`, `rewriter_config_pb2`, `session` (tensorflow.python.client),
    # `signature_constants`, and `TransformGraph` (tensorflow.tools.graph_transforms)
    # are imported, and that `load_graph_from_pb`, `output_loc`, `model_name`, and
    # `frozen_graph` are defined elsewhere in the module.
    version = 1
    output_dir = os.path.join(output_loc, model_name)
    export_path = os.path.join(output_dir, str(version))

    # ####################
    # Inference pipeline
    # ####################
    input_names = 'image_tensor'
    output_names = ['detection_boxes', 'detection_classes',
                    'detection_scores', 'num_detections']
    with tf.Session() as sess:
        input_tensor = tf.placeholder(dtype=tf.uint8,
                                      shape=(None, None, None, 3),
                                      name=input_names)

        # ####################
        # Load frozen graph
        # ####################
        graph_def = load_graph_from_pb(frozen_graph)
        outputs = tf.import_graph_def(graph_def,
                                      input_map={'image_tensor': input_tensor},
                                      return_elements=output_names,
                                      name='')
        outputs = [sess.graph.get_tensor_by_name(op.name + ':0') for op in outputs]
        outputs = dict(zip(output_names, outputs))

    # ######################
    # Quantize frozen model
    # ######################
    transforms = ["add_default_attributes",
                  "quantize_weights",
                  "round_weights",
                  "fold_batch_norms",
                  "fold_old_batch_norms"]
    quantized_graph = TransformGraph(input_graph_def=graph_def,
                                     inputs=[input_names],
                                     outputs=output_names,
                                     transforms=transforms)

    # ######################
    # Export to TF Serving
    # ######################
    # Reference: https://github.com/tensorflow/models/tree/master/research/object_detection
    with tf.Graph().as_default():
        tf.import_graph_def(quantized_graph, name='')

        # Optimize the graph with Grappler rewrites.
        rewrite_options = rewriter_config_pb2.RewriterConfig()
        rewrite_options.optimizers.append('pruning')
        rewrite_options.optimizers.append('constfold')
        rewrite_options.optimizers.append('layout')
        graph_options = tf.GraphOptions(rewrite_options=rewrite_options,
                                        infer_shapes=True)

        # Build the model for TF Serving.
        config = tf.ConfigProto(graph_options=graph_options)
        # TODO: add XLA for higher performance (AOT for ARM, JIT for x86/GPUs)
        # https://www.tensorflow.org/performance/xla/
        # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        # Reference: https://www.tensorflow.org/guide/saved_model
        with session.Session(config=config) as sess:
            builder = tf.saved_model.builder.SavedModelBuilder(export_path)

            # build_tensor_info only records tensor names/dtypes/shapes, and the
            # names match the nodes in the freshly imported quantized graph.
            tensor_info_inputs = {
                'inputs': tf.saved_model.utils.build_tensor_info(input_tensor)}
            tensor_info_outputs = {}
            for k, v in outputs.items():
                tensor_info_outputs[k] = tf.saved_model.utils.build_tensor_info(v)

            detection_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs=tensor_info_inputs,
                    outputs=tensor_info_outputs,
                    method_name=signature_constants.PREDICT_METHOD_NAME))

            builder.add_meta_graph_and_variables(
                sess,
                # tag_constants.SERVING marks the saved graph as meant for serving.
                [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    'predict_images': detection_signature,
                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                        detection_signature,
                },
            )
            builder.save()

    print("\n\nModel is ready for TF Serving. (saved at {}/saved_model.pb)".format(export_path))
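To sanity-check the export, the SavedModel can be loaded back under the same serving tag. A minimal sketch, assuming a hypothetical export path and a 300x300 dummy input (neither is from the source):

import numpy as np
import tensorflow as tf

export_path = 'exported_models/my_detector/1'  # hypothetical path

with tf.Session(graph=tf.Graph()) as sess:
    # Load the meta graph tagged for serving and look up tensors by name.
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                               export_path)
    image = sess.graph.get_tensor_by_name('image_tensor:0')
    boxes = sess.graph.get_tensor_by_name('detection_boxes:0')
    # Run one dummy prediction to confirm the signature tensors resolve.
    print(sess.run(boxes,
                   feed_dict={image: np.zeros((1, 300, 300, 3), np.uint8)}))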
# (Inferred from context: the first block presumably sits under `with tf.device(dev1):`;
# `dim` and the device strings dev1/dev2/dev3 are defined elsewhere in the script.)
with tf.device(dev1):
    X, Z1, _X = [], [], []
    X.append(tf.random_uniform([dim, dim], 0, 10, name='X' + str(0)))
    _X.append(tf.placeholder(dtype=tf.float32, shape=[dim, dim]))
    Z1.append(tf.matmul(_X[0], _X[0]))

with tf.device(dev2):
    Y, Z2, _Y = [], [], []
    Y.append(tf.random_uniform([dim, dim], 0, 10, name='Y' + str(0)))
    _Y.append(tf.placeholder(dtype=tf.float32, shape=[dim, dim]))
    Z2.append(tf.matmul(_Y[0], _Y[0]))

with tf.device(dev3):
    Z3 = []
    Z3.append(tf.add(Z2[0], Z1[0]))

config_proto = tf.ConfigProto(graph_options=tf.GraphOptions(build_cost_model=1))
config_proto.intra_op_parallelism_threads = 1
config_proto.inter_op_parallelism_threads = 1
config_proto.graph_options.optimizer_options.opt_level = -1
config_proto.graph_options.rewrite_options.constant_folding = (
    rewriter_config_pb2.RewriterConfig.OFF)
config_proto.graph_options.rewrite_options.arithmetic_optimization = (
    rewriter_config_pb2.RewriterConfig.OFF)
config_proto.graph_options.rewrite_options.dependency_optimization = (
    rewriter_config_pb2.RewriterConfig.OFF)
config_proto.graph_options.rewrite_options.layout_optimizer = (
    rewriter_config_pb2.RewriterConfig.OFF)

sess = tf.Session(config=config_proto)
sess.run(tf.global_variables_initializer())
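A minimal continuation sketch (assuming `dim` and the `dev1`/`dev2`/`dev3` strings above): feed both placeholders and fetch `Z3[0]` with full tracing, so the step stats can confirm where each matmul actually ran.

import numpy as np

run_metadata = tf.RunMetadata()
x_val = np.random.uniform(0, 10, size=(dim, dim)).astype(np.float32)
y_val = np.random.uniform(0, 10, size=(dim, dim)).astype(np.float32)
z3_val = sess.run(Z3[0],
                  feed_dict={_X[0]: x_val, _Y[0]: y_val},
                  options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                  run_metadata=run_metadata)
# run_metadata.step_stats now lists, per device, the ops that executed there.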
def _add_infer_shapes(graph_def):
    # Re-import the GraphDef into a session whose GraphOptions request shape
    # inference, then hand back the shape-annotated graph.
    with tf.Graph().as_default():
        with tf.Session(config=tf.ConfigProto(
                graph_options=tf.GraphOptions(infer_shapes=True))) as sess:
            tf.import_graph_def(graph_def, name="")
            return sess.graph_def
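A hedged usage sketch: in TF 1.x, a session created with `infer_shapes=True` serializes its graph with `_output_shapes` annotations, so the returned GraphDef carries statically inferred shapes.

g = tf.Graph()
with g.as_default():
    a = tf.placeholder(tf.float32, shape=[2, 3], name='a')
    tf.identity(a, name='out')

annotated = _add_infer_shapes(g.as_graph_def())
# Each node now carries an `_output_shapes` attribute, e.g. [2, 3] for 'out'.
print(annotated.node[-1].attr['_output_shapes'])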
def create_session():
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    config = tf.ConfigProto(
        operation_timeout_in_ms=150000,
        graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
    # config.graph_options.rewrite_options.constant_folding = rewriter_config_pb2.RewriterConfig.OFF
    config.graph_options.place_pruned_graph = True
    return tf.Session(config=config)
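A hedged sketch of what `place_pruned_graph=True` buys: only the subgraph reachable from the fetches is placed on devices, so an op pinned to a device that may not exist is harmless as long as it is never fetched.

a = tf.constant(1.0)
with tf.device('/device:GPU:7'):  # a device that may not exist on this machine
    b = a * 2.0  # never fetched below

sess = create_session()
print(sess.run(a))  # fine: `b` is pruned away before placement happens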
def main(unused_argv):
    params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                                 FLAGS.hparams_file, FLAGS,
                                                 FLAGS.hparams)
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(2500, params['iterations_per_loop'])
    config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=get_model_dir(params),
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,  # Keep all checkpoints.
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            # copybara:strip_begin
            tpu_job_name=FLAGS.tpu_job_name,
            # copybara:strip_end
            per_host_input_for_training=contrib_tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = contrib_tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=resnet_model_fn,
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)
    # copybara:strip_begin
    if FLAGS.xla_compile:
        resnet_classifier = contrib_tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=xla.estimator_model_fn(resnet_model_fn),
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    # copybara:strip_end

    assert (params['precision'] == 'bfloat16' or
            params['precision'] == 'float32'), (
                'Invalid value for precision parameter; '
                'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train = imagenet_input.ImageNetBigtableInput(
            is_training=True,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_train)
        imagenet_eval = imagenet_input.ImageNetBigtableInput(
            is_training=False,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_eval)
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':
        # Run evaluation whenever there is a new checkpoint.
        for ckpt in evaluation.checkpoints_iterator(
                get_model_dir(params), timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached.
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info('Evaluation finished after training step %d',
                                    current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case, the
                # checkpoint file could have been deleted already.
                tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                                ckpt)

    elif FLAGS.mode == 'eval_igt':
        # IGT evaluation mode. Evaluate metrics for the desired parameters
        # (true or shifted) on the desired dataset (train or eval). Note that
        # train is still with data augmentation.

        # Get checkpoint file names.
        index_files = tf.gfile.Glob(
            os.path.join(get_model_dir(params), 'model.ckpt-*.index'))
        checkpoints = [fn[:-len('.index')] for fn in index_files]
        # Need to sort them to get proper tensorboard plotting (increasing event
        # timestamps correspond to increasing steps).
        checkpoint_steps = []
        for ckpt in checkpoints:
            tf.logging.info(ckpt)
            step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt)
            checkpoint_steps.append(int(step_match.group(1)))
        checkpoints = [
            ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints))
        ]
        tf.logging.info('There are {} checkpoints'.format(len(checkpoints)))
        tf.logging.info(', '.join(checkpoints))

        # Keep track of the last processed checkpoint (fault tolerance).
        analysis_state_path = os.path.join(
            get_model_dir(params),
            'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
        next_analysis_index = 0
        if tf.gfile.Exists(analysis_state_path):
            with tf.gfile.Open(analysis_state_path) as fd:
                next_analysis_index = int(fd.read())

        # Process each checkpoint.
        while next_analysis_index < len(checkpoints):
            tf.logging.info('Next analysis index: {}'.format(next_analysis_index))
            ckpt_path = checkpoints[next_analysis_index]
            tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path))
            start_timestamp = time.time()  # This time will include compilation time

            if FLAGS.igt_eval_set == 'train':
                the_input_fn = imagenet_train.input_fn
                the_steps = steps_per_epoch
            elif FLAGS.igt_eval_set == 'eval':
                the_input_fn = imagenet_eval.input_fn
                the_steps = eval_steps
            else:
                raise ValueError('Unsupported igt_eval_set')

            eval_results = resnet_classifier.evaluate(
                input_fn=the_input_fn,
                steps=the_steps,
                checkpoint_path=ckpt_path,
                name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                            eval_results, elapsed_time)

            next_analysis_index += 1
            file_io.atomic_write_string_to_file(analysis_state_path,
                                                str(next_analysis_index))

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            get_model_dir(params))  # pylint: disable=protected-access
        steps_per_epoch = params['num_train_images'] // params['train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=get_model_dir(params),
                        save_steps=max(2500, params['iterations_per_loop'])))
            resnet_classifier.train(
                input_fn=imagenet_train.input_fn,
                max_steps=params['train_steps'],
                hooks=hooks)
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(
                    input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
                current_step = next_checkpoint
                tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                                next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] // params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                            params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            # https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            unused_export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main():
    # Prepare training and testing data.
    loadpath = "./"
    src_file = loadpath + "Pairs2M.src.num"
    tgt_file = loadpath + "Pairs2M.tgt.num"
    dic_file = loadpath + "Pairs2M.reddit.dic"
    opt = Options()
    opt_t = Options()

    train, val, test, wordtoix, ixtoword = read_pair_data_full(
        src_file, tgt_file, dic_file, max_num=opt.data_size,
        p_f=loadpath + 'demo.p')
    train = [
        x for x in train
        if 2 < len(x[1]) < opt.maxlen - 4 and 2 < len(x[0]) < opt_t.maxlen - 4
    ]
    val = [
        x for x in val
        if 2 < len(x[1]) < opt.maxlen - 4 and 2 < len(x[0]) < opt_t.maxlen - 4
    ]
    if TEST_FLAG:
        test = test + val + train
        opt.test_freq = 1

    opt.n_words = len(ixtoword)
    opt_t.n_words = len(ixtoword)
    print(dict(opt))
    if opt.model == 'cnn_rnn':
        opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
        opt_t.update_params()
        print(dict(opt_t))
    print('Total words: %d' % opt.n_words)

    # Load word2vec embeddings (cached as a pickle after the first run).
    if os.path.exists(opt.embedding_path_lime):
        with open(opt.embedding_path_lime, 'rb') as pfile:
            embedding = cPickle.load(pfile)
    else:
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            opt.embedding_path, binary=True)
        embedding = {
            i: copy.deepcopy(w2v[ixtoword[i]])
            for i in range(opt.n_words) if ixtoword[i] in w2v
        }
        with open(opt.embedding_path_lime, 'wb') as pfile:
            cPickle.dump(embedding, pfile, protocol=cPickle.HIGHEST_PROTOCOL)

    for d in ['/gpu:0']:
        with tf.device(d):
            src_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
            tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len])
            res_, gan_cost_d_, train_op_d, gan_cost_g_, train_op_g = dialog_gan(
                src_, tgt_, opt, opt_t)
            merged = tf.summary.merge_all()

    uidx = 0
    graph_options = tf.GraphOptions(build_cost_model=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True,
                            graph_options=graph_options)
    config.gpu_options.per_process_gpu_memory_fraction = 0.95
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()
    run_metadata = tf.RunMetadata()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES)  # tf.trainable_variables()
                if opt.load_from_ae:
                    save_keys = tensors_key_in_file(opt.load_path)  # t_var g_W:0 key: W
                    ss = [
                        var for var in t_vars
                        if var.name[2:][:-2] in save_keys.keys()
                    ]
                    ss = [
                        var.name[2:] for var in ss
                        if var.get_shape() == save_keys[var.name[2:][:-2]]
                    ]
                    cc = {
                        var.name[2:][:-2]: var
                        for var in t_vars if var.name[2:] in ss
                    }
                    loader = tf.train.Saver(var_list=cc)
                    loader.restore(sess, opt.load_path)
                    print("Loading variables from '%s'." % opt.load_path)
                    print("Loaded variables:" + " ".join(
                        [var.name for var in t_vars if var.name[2:] in ss]))
                else:
                    save_keys = tensors_key_in_file(opt.load_path)
                    ss = [
                        var for var in t_vars
                        if var.name[:-2] in save_keys.keys()
                    ]
                    ss = [
                        var.name for var in ss
                        if var.get_shape() == save_keys[var.name[:-2]]
                    ]
                    loader = tf.train.Saver(
                        var_list=[var for var in t_vars if var.name in ss])
                    loader.restore(sess, opt.load_path)
                    print("Loading variables from '%s'." % opt.load_path)
                    print("Loaded variables:" + str(ss))
                    # Load the reverse model.
                    try:
                        save_keys = tensors_key_in_file('./save/rev_model')
                        ss = [
                            var for var in t_vars
                            if var.name[:-2] in save_keys.keys()
                            and 'g_rev_' in var.name
                        ]
                        ss = [
                            var.name for var in ss
                            if var.get_shape() == save_keys[var.name[:-2]]
                        ]
                        loader = tf.train.Saver(
                            var_list=[var for var in t_vars if var.name in ss])
                        loader.restore(sess, './save/rev_model')
                        print("Loading reverse variables from ./save/rev_model")
                        print("Loaded variables:" + str(ss))
                    except Exception as e:
                        print("No reverse model loaded")

            except Exception as e:
                print('Error: ' + str(e))
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        loss_d, loss_g = 0, 0
        for epoch in range(opt.max_epochs):
            print("Starting epoch %d" % epoch)
            kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
            for _, train_index in kf:
                uidx += 1
                tgt, src = zip(*[train[t] for t in train_index])
                x_batch = prepare_data_for_cnn(src, opt)  # Batch L
                y_batch = prepare_data_for_rnn(
                    tgt, opt_t,
                    is_add_GO=False) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                        tgt, opt_t)
                feed = {src_: x_batch, tgt_: y_batch}

                if uidx % opt.d_freq == 0:
                    if profile:
                        _, loss_d = sess.run(
                            [train_op_d, gan_cost_d_],
                            feed_dict=feed,
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                    else:
                        _, loss_d = sess.run([train_op_d, gan_cost_d_],
                                             feed_dict=feed)

                if uidx % opt.g_freq == 0:
                    if profile:
                        _, loss_g = sess.run(
                            [train_op_g, gan_cost_g_],
                            feed_dict=feed,
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                    else:
                        _, loss_g = sess.run([train_op_g, gan_cost_g_],
                                             feed_dict=feed)

                if profile:
                    tf.contrib.tfprof.model_analyzer.print_model_analysis(
                        tf.get_default_graph(),
                        run_meta=run_metadata,
                        tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
                    exit(0)

                if uidx % opt.valid_freq == 0:
                    VALID_SIZE = 1024
                    valid_multiplier = np.int(np.floor(VALID_SIZE / opt.batch_size))
                    res_all, val_tgt_all, loss_val_d_all, loss_val_g_all = [], [], [], []
                    for val_step in range(valid_multiplier):
                        valid_index = np.random.choice(len(val), opt.batch_size)
                        val_tgt, val_src = zip(*[val[t] for t in valid_index])
                        val_tgt_all.extend(val_tgt)
                        x_val_batch = prepare_data_for_cnn(val_src, opt)  # Batch L
                        y_val_batch = prepare_data_for_rnn(
                            val_tgt, opt_t,
                            is_add_GO=False) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                                val_tgt, opt_t)
                        feed_val = {src_: x_val_batch, tgt_: y_val_batch}
                        loss_val_d, loss_val_g = sess.run(
                            [gan_cost_d_, gan_cost_g_], feed_dict=feed_val)
                        loss_val_d_all.append(loss_val_d)
                        loss_val_g_all.append(loss_val_g)
                        res = sess.run(res_, feed_dict=feed_val)
                        res_all.extend(res['syn_sent'])

                    print("Validation: loss D %f loss G %f " %
                          (np.mean(loss_val_d_all), np.mean(loss_val_g_all)))
                    # print("Val Perm :" + " ".join([ixtoword[x] for x in val_src_permutated[0] if x != 0]))
                    print("Val Source:" + u' '.join([
                        ixtoword[x] for x in val_src[0] if x != 0
                    ]).encode('utf-8').strip())
                    print("Val Target :" + u' '.join([
                        ixtoword[x] for x in val_tgt[0] if x != 0
                    ]).encode('utf-8').strip())
                    print("Val Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip())
                    print("")

                    val_set = [prepare_for_bleu(s) for s in val_tgt_all]
                    gen = [prepare_for_bleu(s) for s in res_all]
                    [bleu1s, bleu2s, bleu3s, bleu4s] = cal_BLEU_4(
                        gen, {0: val_set}, is_corpus=opt.is_corpus)
                    [rouge1, rouge2, rouge3, rouge4, rougeL, rouges] = cal_ROUGE(
                        gen, {0: val_set}, is_corpus=opt.is_corpus)
                    etp_score, dist_score = cal_entropy(gen)
                    bleu_nltk = cal_BLEU_4_nltk(gen, val_set, is_corpus=opt.is_corpus)
                    rel_score = cal_relevance(gen, val_set, embedding)

                    print('Val BLEU: ' + ' '.join([
                        str(round(it, 3))
                        for it in (bleu_nltk, bleu1s, bleu2s, bleu3s, bleu4s)
                    ]))
                    print('Val Rouge: ' + ' '.join([
                        str(round(it, 3))
                        for it in (rouge1, rouge2, rouge3, rouge4)
                    ]))
                    print('Val Entropy: ' + ' '.join([
                        str(round(it, 3))
                        for it in (etp_score[0], etp_score[1], etp_score[2],
                                   etp_score[3])
                    ]))
                    print('Val Diversity: ' + ' '.join([
                        str(round(it, 3))
                        for it in (dist_score[0], dist_score[1], dist_score[2],
                                   dist_score[3])
                    ]))
                    print('Val Relevance(G,A,E): ' + ' '.join([
                        str(round(it, 3))
                        for it in (rel_score[0], rel_score[1], rel_score[2])
                    ]))
                    print('Val Avg. length: ' + str(
                        round(
                            np.mean([len([y for y in x if y != 0])
                                     for x in res_all]), 3)))
                    print("")
                    summary = sess.run(merged, feed_dict=feed_val)
                    summary2 = tf.Summary(value=[
                        tf.Summary.Value(tag="bleu-2", simple_value=bleu2s),
                        tf.Summary.Value(tag="rouge-2", simple_value=rouge2),
                        tf.Summary.Value(tag="etp-4", simple_value=etp_score[3])
                    ])
                    test_writer.add_summary(summary, uidx)
                    test_writer.add_summary(summary2, uidx)

                if uidx % opt.test_freq == 0:
                    iter_num = np.int(np.floor(len(test) / opt.batch_size)) + 1
                    res_all, test_tgt_all = [], []
                    for i in range(iter_num):
                        test_index = range(i * opt.batch_size,
                                           (i + 1) * opt.batch_size)
                        test_tgt, test_src = zip(
                            *[test[t % len(test)] for t in test_index])
                        test_tgt_all.extend(test_tgt)
                        x_batch = prepare_data_for_cnn(test_src, opt)
                        y_batch = prepare_data_for_rnn(
                            test_tgt, opt_t,
                            is_add_GO=False) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                                test_tgt, opt_t)
                        feed = {src_: x_batch, tgt_: y_batch}
                        res = sess.run(res_, feed_dict=feed)
                        res_all.extend(res['syn_sent'])

                    test_set = [prepare_for_bleu(s) for s in test_tgt_all]
                    gen = [prepare_for_bleu(s) for s in res_all]
                    [bleu1s, bleu2s, bleu3s, bleu4s] = cal_BLEU_4(
                        gen, {0: test_set}, is_corpus=opt.is_corpus)
                    [rouge1, rouge2, rouge3, rouge4, rougeL, rouges] = cal_ROUGE(
                        gen, {0: test_set}, is_corpus=opt.is_corpus)
                    etp_score, dist_score = cal_entropy(gen)
                    bleu_nltk = cal_BLEU_4_nltk(gen, test_set, is_corpus=opt.is_corpus)
                    rel_score = cal_relevance(gen, test_set, embedding)

                    print('Test BLEU: ' + ' '.join([
                        str(round(it, 3))
                        for it in (bleu_nltk, bleu1s, bleu2s, bleu3s, bleu4s)
                    ]))
                    print('Test Rouge: ' + ' '.join([
                        str(round(it, 3))
                        for it in (rouge1, rouge2, rouge3, rouge4)
                    ]))
                    print('Test Entropy: ' + ' '.join([
                        str(round(it, 3))
                        for it in (etp_score[0], etp_score[1], etp_score[2],
                                   etp_score[3])
                    ]))
                    print('Test Diversity: ' + ' '.join([
                        str(round(it, 3))
                        for it in (dist_score[0], dist_score[1], dist_score[2],
                                   dist_score[3])
                    ]))
                    print('Test Relevance(G,A,E): ' + ' '.join([
                        str(round(it, 3))
                        for it in (rel_score[0], rel_score[1], rel_score[2])
                    ]))
                    print('Test Avg. length: ' + str(
                        round(
                            np.mean([len([y for y in x if y != 0])
                                     for x in res_all]), 3)))
                    print('')
                    if TEST_FLAG:
                        exit()

                if uidx % opt.print_freq == 0:
                    print("Iteration %d: loss D %f loss G %f" %
                          (uidx, loss_d, loss_g))
                    res = sess.run(res_, feed_dict=feed)
                    if opt.grad_penalty:
                        print("grad_penalty: " + str(res['gp']))
                    print("Source:" + u' '.join([
                        ixtoword[x] for x in x_batch[0] if x != 0
                    ]).encode('utf-8').strip())
                    print("Target:" + u' '.join([
                        ixtoword[x] for x in y_batch[0] if x != 0
                    ]).encode('utf-8').strip())
                    print("Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip())
                    print("")
                    sys.stdout.flush()
                    summary = sess.run(merged, feed_dict=feed)
                    train_writer.add_summary(summary, uidx)

                if uidx % opt.save_freq == 0:
                    saver.save(sess, opt.save_path)
def load(self,
         ckpt_path,
         hparams,
         master='local',
         batch_timeout_micros=80 * 1000,
         buckets=None):
    self.hparams = hparams
    self.buckets = buckets
    self.tpu_graph = tf.Graph()
    tpu_config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)
    # Find the TPU master.
    print('master value set to:', master)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        master, zone=None, project=None)
    master = tpu_cluster_resolver.get_master()
    self.sess = tf.Session(master, graph=self.tpu_graph, config=tpu_config)

    with self.tpu_graph.as_default():
        self.vocab_table = tf.contrib.lookup.index_to_string_table_from_file(
            self.vocab_prefix, default_value=vocab_utils.UNK)

    if self.scenario == 'Offline':
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(
                shape=(hparams.infer_batch_size, hparams.src_max_len_infer),
                dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(
                shape=(hparams.infer_batch_size), dtype=tf.int32)
            inputs = [[self.source, self.source_sequence_length]]
            self.predict_ops.append(self.offline_op(inputs))
    else:
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(
                shape=[None, hparams.src_max_len_infer], dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(
                shape=[None], dtype=tf.int32)
            inputs = [self.source, self.source_sequence_length]
            for _ in buckets:
                self.predict_ops.append(
                    self.server_op(
                        inputs,
                        num_batch_threads=16,
                        max_batch_size=hparams.infer_batch_size,
                        batch_timeout_micros=batch_timeout_micros,
                        allowed_batch_sizes=[hparams.infer_batch_size],
                        max_enqueued_batches=10000))
            # Add the longest-sequence predict op.
            self.predict_ops.append(
                self.server_op(
                    inputs,
                    num_batch_threads=16,
                    max_batch_size=hparams.infer_batch_size,
                    batch_timeout_micros=5000 * 1000,
                    allowed_batch_sizes=[hparams.infer_batch_size],
                    max_enqueued_batches=10000))

    with self.tpu_graph.as_default():
        vs = tf.global_variables()
        assign_ops = []
        var_map = {}
        with tf.variable_scope('f32', dtype=tf.float32):
            for i in vs:
                if 'output_projection' in i.name:
                    new_var = tf.get_variable(
                        i.name[:-2], [i.shape[0], hparams.tgt_vocab_size])
                    assign_ops.append(
                        tf.assign(
                            i,
                            tf.pad(
                                tf.cast(new_var, i.dtype),
                                [[0, 0],
                                 [0, 128 * (hparams.tgt_vocab_size // 128 + 1) -
                                  hparams.tgt_vocab_size]])))
                else:
                    new_var = tf.get_variable(i.name[:-2], i.shape)
                    assign_ops.append(tf.assign(i, tf.cast(new_var, i.dtype)))
                var_map[i.name[:-2]] = new_var.name[:-2]

        self.sess.run(tpu.initialize_system())
        tf.train.init_from_checkpoint(ckpt_path, var_map)
        self.sess.run(tf.initializers.global_variables())
        self.sess.run(tf.tables_initializer())
        self.sess.run(assign_ops)
    return self
def run_model(opt, train, val, ixtoword):
    try:
        params = np.load('./param_g.npz')
        if params['Wemb'].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = params['Wemb']
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params['Wemb'].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
        x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
        # is_train_ = tf.placeholder(tf.bool, name='is_train_')
        res_, g_loss_, d_loss_, gen_op, dis_op = textGAN(x_, x_org_, opt)
        merged = tf.summary.merge_all()
        # opt.is_train = False
        # res_val_, loss_val_, _ = auto_encoder(x_, x_org_, opt)
        # merged_val = tf.summary.merge_all()

    # tensorboard --logdir=run1:/tmp/tensorflow/ --port 6006
    # writer = tf.train.SummaryWriter(opt.log_path, graph=tf.get_default_graph())

    uidx = 0
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True,
                            graph_options=tf.GraphOptions(build_cost_model=1))
    # config = tf.ConfigProto(device_count={'GPU': 0})
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()
    run_metadata = tf.RunMetadata()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                # pdb.set_trace()
                t_vars = tf.trainable_variables()
                # print([var.name[:-2] for var in t_vars])
                loader = restore_from_save(t_vars, sess, opt)
                print('\nload successfully\n')
            except Exception as e:
                print(e)
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        # for i in range(34):
        #     valid_index = np.random.choice(len(val), opt.batch_size)
        #     val_sents = [val[t] for t in valid_index]
        #     val_sents_permutated = add_noise(val_sents, opt)
        #     x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
        #     x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
        #     res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
        #     if i == 0:
        #         valid_text = res['syn_sent']
        #     else:
        #         valid_text = np.concatenate((valid_text, res['syn_sent']), 0)
        # np.savetxt('./text_news/vae_words.txt', valid_text, fmt='%i', delimiter=' ')
        # pdb.set_trace()

        for epoch in range(opt.max_epochs):
            print("Starting epoch %d" % epoch)
            # if epoch >= 10:
            #     print("Relax embedding ")
            #     opt.fix_emb = False
            #     opt.batch_size = 2
            kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
            for _, train_index in kf:
                uidx += 1
                sents = [train[t] for t in train_index]
                sents_permutated = add_noise(sents, opt)
                # sents[0] = np.random.permutation(sents[0])
                x_batch = prepare_data_for_cnn(sents_permutated, opt)  # Batch L
                x_batch_org = prepare_data_for_rnn(sents, opt)
                d_loss = 0
                g_loss = 0
                if profile:
                    if uidx % opt.dis_steps == 0:
                        _, d_loss = sess.run(
                            [dis_op, d_loss_],
                            feed_dict={x_: x_batch, x_org_: x_batch_org},
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                    if uidx % opt.gen_steps == 0:
                        _, g_loss = sess.run(
                            [gen_op, g_loss_],
                            feed_dict={x_: x_batch, x_org_: x_batch_org},
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                else:
                    if uidx % opt.dis_steps == 0:
                        _, d_loss = sess.run(
                            [dis_op, d_loss_],
                            feed_dict={x_: x_batch, x_org_: x_batch_org})
                    if uidx % opt.gen_steps == 0:
                        _, g_loss = sess.run(
                            [gen_op, g_loss_],
                            feed_dict={x_: x_batch, x_org_: x_batch_org})

                ''' validation '''
                if uidx % opt.valid_freq == 0:
                    valid_index = np.random.choice(len(val), opt.batch_size)
                    val_sents = [val[t] for t in valid_index]
                    val_sents_permutated = add_noise(val_sents, opt)
                    x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
                    x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
                    d_loss_val = sess.run(
                        d_loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
                    g_loss_val = sess.run(
                        g_loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
                    res = sess.run(
                        res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
                    print("Validation d_loss %f, g_loss %f mean_dist %f" %
                          (d_loss_val, g_loss_val, res['mean_dist']))
                    print("Sent:" + u' '.join(
                        [ixtoword[x] for x in res['syn_sent'][0] if x != 0]
                    ).encode('utf-8', 'ignore').decode("utf8").strip())
                    print("MMD loss %f, GAN loss %f" % (res['mmd'], res['gan']))
                    # np.savetxt('./text_arxiv/syn_val_words.txt', res['syn_sent'], fmt='%i', delimiter=' ')
                    if opt.discrimination:
                        print("Real Prob %f Fake Prob %f" %
                              (res['prob_r'], res['prob_f']))

                    for i in range(4):
                        valid_index = np.random.choice(len(val), opt.batch_size)
                        val_sents = [val[t] for t in valid_index]
                        val_sents_permutated = add_noise(val_sents, opt)
                        x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
                        x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
                        res = sess.run(
                            res_, feed_dict={x_: x_val_batch,
                                             x_org_: x_val_batch_org})
                        if i == 0:
                            valid_text = res['syn_sent']
                        else:
                            valid_text = np.concatenate(
                                (valid_text, res['syn_sent']), 0)
                    np.savetxt('./text_news/syn_val_words.txt', valid_text,
                               fmt='%i', delimiter=' ')

                    val_set = [prepare_for_bleu(s) for s in val_sents]
                    [bleu2s, bleu3s, bleu4s] = cal_BLEU(
                        [prepare_for_bleu(s) for s in res['syn_sent']],
                        {0: val_set})
                    print('Val BLEU (2,3,4): ' + ' '.join(
                        [str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)]))

                    summary = sess.run(
                        merged, feed_dict={x_: x_val_batch,
                                           x_org_: x_val_batch_org})
                    test_writer.add_summary(summary, uidx)
def main(unused_argv):
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet-edgetpu'):
            _, _, input_image_size, _ = (
                efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
                    FLAGS.model_name))
        elif FLAGS.model_name.startswith('efficientnet-tpu'):
            _, _, input_image_size, _ = (
                efficientnet_tpu_builder.efficientnet_tpu_params(FLAGS.model_name))
        elif FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = (
                efficientnet_builder.efficientnet_params(FLAGS.model_name))
        else:
            raise ValueError('input_image_size must be set except for EfficientNet')

    # For the ImageNet dataset, include the background label if the number of
    # output classes is 1001.
    include_background_label = False  # (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    # Initializes model parameters.
    params = dict(
        steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
        use_bfloat16=FLAGS.use_bfloat16)
    est = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    def build_imagenet_input(is_training):
        """Generate ImageNetInput for training and eval."""
        if FLAGS.bigtable_instance:
            tf.logging.info('Using Bigtable dataset, table %s',
                            FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name,
                mixup_alpha=FLAGS.mixup_alpha)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                tf.logging.info('Using fake dataset.')
            else:
                tf.logging.info('Using dataset: %s', FLAGS.data_dir)
            return imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name,
                mixup_alpha=FLAGS.mixup_alpha,
                num_classes=FLAGS.num_label_classes)

    imagenet_train = build_imagenet_input(is_training=True)
    imagenet_eval = build_imagenet_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation whenever there is a new checkpoint.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached.
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info('Evaluation finished after training step %d',
                                    current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case, the
                # checkpoint file could have been deleted already.
                tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                                ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=imagenet_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=imagenet_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint
                tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                                next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                            FLAGS.train_steps, elapsed_time)

        if FLAGS.export_dir:
            export(est, FLAGS.export_dir, input_image_size)
def cfg():
    optimizer_options = tf.OptimizerOptions(
        opt_level=tf.OptimizerOptions.L0, do_constant_folding=False)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options)
    return tf.ConfigProto(log_device_placement=True,
                          graph_options=graph_options)
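A hedged usage sketch: with `opt_level=L0` and `do_constant_folding=False`, even a graph built entirely from constants executes its ops at run time instead of being folded, and `log_device_placement=True` logs where each op ran.

a = tf.constant(2.0, name='a')
b = tf.constant(3.0, name='b')
c = tf.add(a, b, name='c')  # stays a real Add op; not folded into a constant

with tf.Session(config=cfg()) as sess:
    print(sess.run(c))  # 5.0, with device placements logged to stderr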
def main(unused_argv):
    # Mnas optimize - set the proper image data format.
    tf.keras.backend.set_image_data_format(FLAGS.data_format)

    # Mnas optimize - optimization flags
    # gpu_thread_count = 2
    # os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    # os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    # os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    # Enable mixed precision? -> Not much benefit seen yet.
    # os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    node0 = "172.31.11.9:6060"
    node1 = "172.31.1.33:6060"
    strategy = tf.distribute.MirroredStrategy()
    if FLAGS.total_nodes > 1:
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
            tf.distribute.experimental.CollectiveCommunication.NCCL)
        if not FLAGS.is_evaluator:
            if FLAGS.node_num == 0:
                os.environ['TF_CONFIG'] = json.dumps({
                    'cluster': {
                        'worker': [node0, node1]
                    },
                    'task': {'type': 'worker', 'index': 0}
                })
            else:
                os.environ['TF_CONFIG'] = json.dumps({
                    'cluster': {
                        'worker': [node0, node1]
                    },
                    'task': {'type': 'worker', 'index': 1}
                })
        else:
            os.environ['TF_CONFIG'] = json.dumps({
                'cluster': {
                    'evaluator': ["localhost:6060"]
                },
                'task': {'type': 'evaluator', 'index': 0}
            })

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

    gconfig = tf.ConfigProto(
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)))
    if FLAGS.use_xla:
        # `gconfig` is itself a ConfigProto, so the JIT level hangs directly
        # off its graph_options.
        gconfig.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    # mnasnet opt - check if this is required!
    gconfig.gpu_options.allow_growth = True
    # gconfig.gpu_options.visible_device_list = str(hvd.local_rank())

    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        train_distribute=strategy,
        session_config=gconfig)
    print('mnasnet opt - config cluster spec', config.cluster_spec)

    # Initializes model parameters.
    params = dict(
        steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
        batch_size=FLAGS.train_batch_size,
        dtype=tf.float32,
        use_bfloat16=FLAGS.use_bfloat16,
        quantized_training=FLAGS.quantized_training)
    mnasnet_est = tf.estimator.Estimator(
        model_fn=mnasnet_model_fn,
        model_dir=FLAGS.model_dir,
        config=config,
        params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=False,
                transpose_input=FLAGS.transpose_input,
                selection=selection)
            for (is_training, selection) in [(True, select_train),
                                             (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=FLAGS.input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16) for is_training in [True, False]
        ]

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation whenever there is a new checkpoint.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached.
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info('Evaluation finished after training step %d',
                                    current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case, the
                # checkpoint file could have been deleted already.
                tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                                ckpt)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
            FLAGS.model_dir)
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            mnasnet_est.train(
                input_fn=imagenet_train.input_fn,
                max_steps=FLAGS.train_steps,
                hooks=hooks)
        else:
            assert FLAGS.mode == 'train_and_eval'
            train_spec = tf.estimator.TrainSpec(
                input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
            eval_spec = tf.estimator.EvalSpec(
                input_fn=imagenet_eval.input_fn,
                steps=FLAGS.num_eval_images // FLAGS.eval_batch_size,
                throttle_secs=600)
            tf.estimator.train_and_evaluate(mnasnet_est, train_spec, eval_spec)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        FLAGS.train_steps, elapsed_time)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
def main(unused_argv):
    params = params_dict.ParamsDict(mnasnet_config.MNASNET_CFG,
                                    mnasnet_config.MNASNET_RESTRICTIONS)
    params = params_dict.override_params_dict(params, FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params, FLAGS.params_override,
                                              is_strict=True)
    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    additional_params = {
        'steps_per_epoch': params.num_train_images / params.train_batch_size,
        'quantized_training': FLAGS.quantized_training,
    }
    params = params_dict.override_params_dict(params, additional_params,
                                              is_strict=False)
    params.validate()
    params.lock()

    if FLAGS.tpu or params.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if params.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, params.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    # Validates Flags.
    if params.precision == 'bfloat16' and params.use_keras:
        raise ValueError(
            'Keras layers do not have full support for bfloat16 activation'
            ' training. You have set precision as %s and use_keras as %s' %
            (params.precision, params.use_keras))

    # Initializes model parameters.
    mnasnet_est = tf.contrib.tpu.TPUEstimator(
        use_tpu=params.use_tpu,
        model_fn=build_model_fn,
        config=config,
        train_batch_size=params.train_batch_size,
        eval_batch_size=params.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params.as_dict())

    if FLAGS.mode == 'export_only':
        export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
        return

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=False,
                transpose_input=params.transpose_input,
                selection=selection)
            for (is_training, selection) in [(True, select_train),
                                             (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params.transpose_input,
                cache=params.use_cache and is_training,
                image_size=params.input_image_size,
                num_parallel_calls=params.num_parallel_calls,
                use_bfloat16=(params.precision == 'bfloat16'))
            for is_training in [True, False]
        ]

    if FLAGS.mode == 'eval':
        eval_steps = params.num_eval_images // params.eval_batch_size
        # Run evaluation whenever there is a new checkpoint.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                mnas_utils.archive_ckpt(eval_results,
                                        eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached.
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params.train_steps:
                    tf.logging.info('Evaluation finished after training step %d',
                                    current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case, the
                # checkpoint file could have been deleted already.
                tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                                ckpt)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
            FLAGS.model_dir)
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params.train_steps,
            params.train_steps / params.steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, params.iterations_per_loop)))
            mnasnet_est.train(
                input_fn=imagenet_train.input_fn,
                max_steps=params.train_steps,
                hooks=hooks)
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params.train_steps)
                mnasnet_est.train(
                    input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
                current_step = next_checkpoint
                tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                                next_checkpoint, int(time.time() - start_timestamp))
                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params.num_eval_images // params.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                mnas_utils.archive_ckpt(eval_results,
                                        eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                            params.train_steps, elapsed_time)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
def main(__):
    print(f"Tensorflow Version is {tf.__version__}")
    # mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    if FLAGS.job_name is None or FLAGS.job_name == '':
        raise ValueError('Must specify an explicit job_name !')
    else:
        print('job_name : %s' % FLAGS.job_name)
    if FLAGS.task_index is None or FLAGS.task_index == '':
        raise ValueError('Must specify an explicit task_index!')
    else:
        print('task_index : %d' % FLAGS.task_index)

    ps_spec = FLAGS.ps_hosts.split(',')
    worker_spec = FLAGS.worker_hosts.split(',')

    # Create the cluster.
    num_worker = len(worker_spec)
    print("Cluster num is {}".format(num_worker))
    cluster = tf.train.ClusterSpec({'ps': ps_spec, 'worker': worker_spec})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_index == 0)
    worker_device = '/job:worker/task:%d/cpu:0' % FLAGS.task_index
    with tf.device(
            tf.train.replica_device_setter(cluster=cluster,
                                           worker_device=worker_device)):
        # with tf.device("/cpu:0"):
        # --------[PART 01] build model --------------
        # ----[0] init (1 Variable, 2 placeholders)
        # 0.1 Variable
        # 0.2 placeholder
        # Variable that records the global training step.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        hid_w = tf.Variable(tf.truncated_normal(
            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
            stddev=1.0 / IMAGE_PIXELS), name='hid_w')
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name='hid_b')
        sm_w = tf.Variable(tf.truncated_normal(
            [FLAGS.hidden_units, 10],
            stddev=1.0 / math.sqrt(FLAGS.hidden_units)), name='sm_w')
        sm_b = tf.Variable(tf.zeros([10]), name='sm_b')
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])  # real_y

        # ----[1] Forward
        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)
        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

        # ----[2] BackProp (loss, opt)
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_step = opt.minimize(cross_entropy, global_step=global_step)

    # Create the local variable initialization op init_op.
    init_op = tf.global_variables_initializer()
    # train_dir = tempfile.mkdtemp()
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.log_file,
                             init_op=init_op,
                             recovery_wait_secs=1,
                             global_step=global_step)
    if is_chief:
        print('Worker %d: Initializing session...' % FLAGS.task_index)
    else:
        print('Worker %d: Waiting for session to be initialized...' %
              FLAGS.task_index)

    # --------[PART 02] Train model --------------
    run_metadata = tf.RunMetadata()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    config = tf.ConfigProto(graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)))

    sess = sv.prepare_or_wait_for_session(server.target)
    print('Worker %d: Session initialization complete.' % FLAGS.task_index)
    time_begin = time.time()
    print('Training begins @ %f' % time_begin)

    local_step = 0
    while True:
        batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
        train_feed = {x: batch_xs, y_: batch_ys}
        # Feeding data via feed_dict is the slowest way to get input in.
        _, step = sess.run([train_step, global_step], feed_dict=train_feed)
        local_step += 1
        now = time.time()
        print('%f: Worker %d: training step %d done (global step:%d)' %
              (now, FLAGS.task_index, local_step, step))
        if step >= FLAGS.train_steps:
            break

    time_end = time.time()
    print('Training ends @ %f' % time_end)
    train_time = time_end - time_begin
    print('Training elapsed time:%f s' % train_time)

    val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
    val_xent = sess.run(cross_entropy, feed_dict=val_feed)
    print('After %d training step(s), validation cross entropy = %g' %
          (FLAGS.train_steps, val_xent))
    sess.close()

    # ------------[PART 4] Output ----------------
    # NOTE: `model`, `history`, and `pd` (pandas) below come from a Keras
    # section defined elsewhere in the original script.
    if FLAGS.record_type == "record_parameters":
        par_str = "lr_%g_b1_%g_b2_%g_bsize_%g" % (
            FLAGS.learning_rate, FLAGS.beta_1, FLAGS.beta_2, FLAGS.batch_size)
        FLAGS.model_save_path = os.path.join(FLAGS.current_path, "model",
                                             par_str + ".h5")
        FLAGS.history_output_path = os.path.join(FLAGS.current_path,
                                                 "train_process",
                                                 par_str + ".csv")
        FLAGS.predict_label_output_path = os.path.join(FLAGS.current_path,
                                                       "submit",
                                                       par_str + ".csv")
    # 1. Save the model and training history.
    model.save(FLAGS.model_save_path)
    history_df = pd.DataFrame(history.history)
    history_df.to_csv(FLAGS.history_output_path)

    # 2. Predict.
    predict_data = pd.read_csv(FLAGS.predict_x_path)
    predict_x = predict_data.values.reshape([len(predict_data)] + [28, 28, 1])
    predict_y = model.predict(predict_x)
    # label
    predict_label = np.argmax(predict_y, axis=1)
    df_predict_label = pd.DataFrame({
        "ImageId": range(1, len(predict_label) + 1),
        "Label": predict_label
    })
    # Export.
    df_predict_label.to_csv(FLAGS.predict_label_output_path, index=False)
def export(model_params, checkpoint_file, config=None):
    # Input data.
    batch_size = 1
    im_size = model_params.im_size
    guide_image = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
    gb_image = tf.placeholder(tf.float32,
                              [batch_size, im_size[1], im_size[0], 1])
    input_image = tf.placeholder(tf.float32,
                                 [batch_size, im_size[1], im_size[0], 3])

    # Create model.
    model_func = get_model_func(model_params.base_model)
    # Split the model into the visual modulator and the other parts; the
    # visual modulator only needs to run once per guide image.
    if model_params.use_visual_modulator:
        if model_params.base_model == 'lite':
            v_m_params = visual_modulator_lite(guide_image, model_params,
                                               is_training=False)
        else:
            v_m_params = visual_modulator(guide_image, model_params,
                                          is_training=False)
    else:
        v_m_params = None
    net, end_points = model_func([guide_image, gb_image, input_image],
                                 model_params,
                                 visual_modulator_params=v_m_params,
                                 is_training=False)
    probabilities = tf.nn.sigmoid(net, name='prob')
    global_step = tf.Variable(0, name='global_step', trainable=False)

    rewrite_options = rewriter_config_pb2.RewriterConfig()
    rewrite_options.optimizers.append('pruning')
    rewrite_options.optimizers.append('constfold')
    rewrite_options.optimizers.append('layout')
    graph_options = tf.GraphOptions(rewrite_options=rewrite_options,
                                    infer_shapes=True)
    config = tf.ConfigProto(
        graph_options=graph_options,
        allow_soft_placement=True,
    )

    output_names = ['prob']
    for i, v_m_param in enumerate(v_m_params):
        visual_mod_name = 'visual_mod_params_%d' % (i + 1)
        tf.identity(v_m_param, name=visual_mod_name)
        output_names.append(visual_mod_name)

    # Create a saver to load the network.
    saver = tf.train.Saver([
        v for v in tf.global_variables()
    ])  # if '-up' not in v.name and '-cr' not in v.name])
    save_name = checkpoint_file + '.graph.pb'
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, checkpoint_file)
        if not model_params.base_model == 'lite':
            sess.run(interp_surgery(tf.global_variables()))
        output_graph_def = graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_names)
        with open(save_name, 'wb') as writer:
            writer.write(output_graph_def.SerializeToString())
    model_params.output_names = output_names
    with open(save_name + '.json', 'w') as writer:
        json.dump(vars(model_params), writer)
    print('Model saved in', save_name)
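A minimal companion sketch (the helper name is illustrative, not from the source) for loading the exported `.graph.pb` back into a fresh graph, after which the tensors listed in `output_names` can be fetched by name:

def load_frozen_graph(pb_path):
    # Parse the serialized GraphDef and import it with an empty name scope so
    # tensor names match the exported `output_names` (e.g. 'prob:0').
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(pb_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name='')
    return graph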
# from https://github.com/tensorflow/tensorflow/issues/7251 import os os.environ["CUDA_VISIBLE_DEVICES"] = "0" import tensorflow as tf from tensorflow.python.client.timeline import Timeline with tf.device("/gpu:0"): x = tf.ones(100, name="x") idxs = tf.range(100) for i in range(10): y = tf.identity(x, name="identity-" + str(i)) x = tf.dynamic_stitch([idxs, idxs], [x, y], name="stitch-" + str(i)) config = tf.ConfigProto(graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) sess = tf.InteractiveSession(config=config) metadata = tf.RunMetadata() sess.run(x, options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, output_partition_graphs=True), run_metadata=metadata) timeline = Timeline(metadata.step_stats) with open("dynamic_stitch_gpu_profile.json", "w") as f: f.write(timeline.generate_chrome_trace_format()) with open("dynamic_stitch_gpu_profile.pbtxt", "w") as f: f.write(str(metadata))
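# A small helper to summarize the trace written above. The chrome trace
# format stores complete events (ph == "X") with a "dur" field in
# microseconds; this is a sketch, assuming the default output of
# Timeline.generate_chrome_trace_format():
import json

def top_ops(trace_path, k=10):
    with open(trace_path) as f:
        events = json.load(f)["traceEvents"]
    timed = [e for e in events if e.get("ph") == "X" and "dur" in e]
    timed.sort(key=lambda e: e["dur"], reverse=True)
    for e in timed[:k]:
        print("%8d us  %s" % (e["dur"], e.get("name", "?")))

# e.g. top_ops("dynamic_stitch_gpu_profile.json")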
def main(): args = parse_args() model_dir = args.model[0] if args.model else None if not model_dir: if args.start: model_dir = 'models/' + datetime.now().strftime('%Y%m%d.%H%M') if args.name: model_dir += '-' + args.name[0] + '/' else: model_dir += '/' else: model_dir = most_recent_model() params = { 'steps': args.steps[0] if args.steps else MAX_STEPS, 'batch_size': args.batch_size[0] if args.batch_size else BATCH_SIZE, 'learning_rate': 1e-4 if args.warm_start else 3e-4, 'num_channels': get_num_channels(args, model_dir) or 128, 'num_blocks': get_num_blocks(args, model_dir) or 9 } config = tf.estimator.RunConfig( tf_random_seed=0xfde6885f if args.deterministic else None, session_config=tf.ConfigProto( graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( do_common_subexpression_elimination=not args.debug, do_constant_folding=not args.debug, do_function_inlining=not args.debug)), gpu_options=tf.GPUOptions(allow_growth=True))) if args.warm_start: steps_to_skip = 10000 warm_start_from = tf.estimator.WarmStartSettings( ckpt_to_initialize_from=args.warm_start[0], vars_to_warm_start='[0-9x].*' # only layers ) else: steps_to_skip = 0 warm_start_from = None if args.deterministic: # since 16-bit floating point is not accurate enough for deterministic output, fix # it to `f32` instead. set_compute_type(tf.float32) if args.mask: features_mask = list(map(lambda x: float(x), args.mask[0].split(';'))) else: features_mask = None hooks = [tf_debug.LocalCLIDebugHook()] if args.debug else [] nn = tf.estimator.Estimator(config=config, model_fn=model_fn, model_dir=model_dir, params=params, warm_start_from=warm_start_from) if args.start or args.resume: nn.train(hooks=hooks + [LearningRateScheduler(steps_to_skip)], input_fn=lambda: input_fn(args.files, params[ 'batch_size'], features_mask, True, args.deterministic), steps=params['steps'] // params['batch_size']) elif args.verify: # iterate over the entire dataset and collect the metric, which we will # then pretty-print as a JSON object to standard output results = nn.evaluate( hooks=hooks, input_fn=lambda: input_fn(args.files, params[ 'batch_size'], features_mask, False, args.deterministic), steps=params['steps'] // params['batch_size']) print( json.dumps( results, default=lambda x: float(x) if x != int(x) else int(x), # handle `Decimal` types sort_keys=True, separators=(',', ': '), indent=4)) elif args.dump: predictor = nn.predict(input_fn=lambda: input_fn([], params[ 'batch_size'], None, False, False), hooks=[DumpHook()]) for _ in predictor: pass elif args.features_map > 0: predictor = nn.predict(input_fn=lambda: input_fn( args.files, 1, features_mask, False, args.deterministic)) count = 0 print('(;GM[1]FF[4]SZ[19]') for results in predictor: board_state = to_sgf_heat_map(results['features'], results['tower']) print('(;{})'.format(board_state)) count += 1 if count > 100: break print(')') elif args.print: # tensors are given then print all available tensors with some statistics. if not args.files: out = {} for var in nn.get_variable_names(): var_value = np.asarray(nn.get_variable_value(var)) out[var] = { 'mean': float(np.average(var_value)), 'std': float(np.std(var_value)) } print( json.dumps( out, default=lambda x: float(x) if x != int(x) else int(x), # handle `Decimal` types sort_keys=True, separators=(',', ': '), indent=4)) else: for var in args.files: print(var, nn.get_variable_value(var).tolist())
def main(): np.random.seed(0) tf.set_random_seed(0) dtype = np.float32 train_images = u.get_mnist_images() dsize = 10000 patches = train_images[:, :dsize].astype(dtype) fs = [dsize, 28 * 28, 196, 28 * 28] # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial X0 = patches lambda_ = 3e-3 rho = tf.constant(0.1, dtype=dtype) beta = 3 W0_0 = u.ng_init(fs[2], fs[3]) W1_0 = u.ng_init(fs[3], fs[2]) W0f = u.flatten([W0_0.flatten(), W1_0.flatten()]) def f(i): return fs[i + 1] # W[i] has shape f[i] x f[i-1] dsize = f(-1) n = len(fs) - 2 # helper to create variables with numpy or TF initial value init_dict = {} # {var_placeholder: init_value} vard = {} # {var: u.VarInfo} def init_var(val, name, trainable=False, noinit=False): if isinstance(val, tf.Tensor): collections = [] if noinit else None var = tf.Variable(val, name=name, collections=collections) else: val = np.array(val) assert u.is_numeric, "Unknown type" holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder") var = tf.Variable(holder, name=name, trainable=trainable) init_dict[holder] = val var_p = tf.placeholder(var.dtype, var.shape) var_setter = var.assign(var_p) vard[var] = u.VarInfo(var_setter, var_p) return var lr = init_var(0.2, "lr") Wf = init_var(W0f, "Wf", True) Wf_copy = init_var(W0f, "Wf_copy", True) W = u.unflatten(Wf, fs[1:]) # perftodo: this creates transposes X = init_var(X0, "X") W.insert(0, X) def sigmoid(x): return tf.sigmoid(x) def d_sigmoid(y): return y * (1 - y) def kl(x, y): return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y)) def d_kl(x, y): return (1 - x) / (1 - y) - x / y # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None] * (n + 2) fail_node = tf.Print(0, [0], "fail, this must never run") with tf.control_dependencies([fail_node]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n + 1): A[i + 1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = backprops from sampled labels needed for natural gradient B = [None] * (n + 1) B2 = [None] * (n + 1) B[n] = err * d_sigmoid(A[n + 1]) sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0) sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True) B2[n] = sampled_labels * d_sigmoid(A[n + 1]) for i in range(n - 1, -1, -1): backprop = t(W[i + 1]) @ B[i + 1] backprop2 = t(W[i + 1]) @ B2[i + 1] B[i] = backprop * d_sigmoid(A[i + 1]) B2[i] = backprop2 * d_sigmoid(A[i + 1]) # dW[i] = gradient of W[i] dW = [None] * (n + 1) pre_dW = [None] * (n + 1) # preconditioned dW pre_dW_stable = [None] * (n + 1) # preconditioned stable dW cov_A = [None] * (n + 1) # covariance of activations[i] cov_B2 = [None] * (n + 1) # covariance of synthetic backprops[i] vars_svd_A = [None] * (n + 1) vars_svd_B2 = [None] * (n + 1) for i in range(1, n + 1): cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0]) cov_A[i] = init_var(cov_op, "cov_A%d" % (i, )) cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity( B2[i].shape[0]) cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, )) vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ), do_inverses=True) vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ), do_inverses=True) whitened_A = vars_svd_A[i].inv @ A[i] whitened_B = vars_svd_B2[i].inv @ B[i] pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize dW[i] = (B[i] @ t(A[i])) / dsize # Loss function 
reconstruction = u.L2(err) / (2 * dsize) loss = reconstruction grad_live = u.flatten(dW[1:]) pre_grad_live = u.flatten(pre_dW[1:]) # fisher preconditioned gradient grad = init_var(grad_live, "grad") pre_grad = init_var(pre_grad_live, "pre_grad") update_params_op = Wf.assign(Wf - lr * pre_grad).op save_params_op = Wf_copy.assign(Wf).op pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad) grad_norm = tf.reduce_sum(grad * grad) pre_grad_norm = u.L2(pre_grad) def dump_svd_info(step): """Dump singular values and gradient values in those coordinates.""" for i in range(1, n + 1): svd = vars_svd_A[i] s0, u0, v0 = sess.run([svd.s, svd.u, svd.v]) u.dump(s0, "A_%d_%d" % (i, step)) A0 = A[i].eval() At0 = v0.T @ A0 u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step)) u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step)) u.dump(s0, "As_%d_%d" % (i, step)) for i in range(1, n + 1): svd = vars_svd_B2[i] s0, u0, v0 = sess.run([svd.s, svd.u, svd.v]) u.dump(s0, "B2_%d_%d" % (i, step)) B0 = B[i].eval() Bt0 = v0.T @ B0 u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step)) u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step)) u.dump(s0, "Bs_%d_%d" % (i, step)) def advance_batch(): sess.run(sampled_labels.initializer) # new labels for next call def update_covariances(): ops_A = [cov_A[i].initializer for i in range(1, n + 1)] ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)] sess.run(ops_A + ops_B2) def update_svds(): vars_svd_A[2].update() vars_svd_B2[2].update() vars_svd_B2[1].update() def init_svds(): """Initialize our SVD to identity matrices.""" ops = [] for i in range(1, n + 1): ops.extend(vars_svd_A[i].init_ops) ops.extend(vars_svd_B2[i].init_ops) sess = tf.get_default_session() sess.run(ops) init_op = tf.global_variables_initializer() from tensorflow.core.protobuf import rewriter_config_pb2 rewrite_options = rewriter_config_pb2.RewriterConfig( disable_model_pruning=True, constant_folding=rewriter_config_pb2.RewriterConfig.OFF, memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL) optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0) graph_options = tf.GraphOptions(optimizer_options=optimizer_options, rewrite_options=rewrite_options) config = tf.ConfigProto(graph_options=graph_options) sess = tf.InteractiveSession(config=config) sess.run(Wf.initializer, feed_dict=init_dict) sess.run(X.initializer, feed_dict=init_dict) advance_batch() update_covariances() init_svds() sess.run(init_op, feed_dict=init_dict) # initialize everything else print("Running training.") u.reset_time() step_lengths = [] # keep track of learning rates losses = [] # adaptive line search parameters alpha = 0.3 # acceptable fraction of predicted decrease beta = 0.8 # how much to shrink when violation growth_rate = 1.05 # how much to grow when too conservative def update_cov_A(i): sess.run(cov_A[i].initializer) def update_cov_B2(i): sess.run(cov_B2[i].initializer) # only update whitening matrix of input activations in the beginning vars_svd_A[1].update() for step in range(40): update_covariances() update_svds() sess.run(grad.initializer) sess.run(pre_grad.initializer) lr0, loss0 = sess.run([lr, loss]) update_params_op.run() advance_batch() losses.append(loss0) step_lengths.append(lr0) print("Step %d loss %.2f" % (step, loss0)) u.record_time() assert losses[-1] < 0.59 assert losses[-1] > 0.57 assert 20e-3 < min( u.global_time_list) < 50e-3, "Time should be 40ms on 1080" u.summarize_time() print("Test passed")
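# Note: alpha, beta and growth_rate above are declared for an adaptive line
# search but are never used inside the 40-step loop. A sketch of the
# Armijo-style update their comments describe (not part of the original run;
# expected_decrease would be lr * pre_grad_dot_grad from the quantities
# defined earlier):
def adjust_lr(lr0, loss0, loss1, expected_decrease,
              alpha=0.3, beta=0.8, growth_rate=1.05):
    if loss0 - loss1 < alpha * expected_decrease:
        return lr0 * beta        # violated the predicted decrease: shrink
    return lr0 * growth_rate     # too conservative: grow slowly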
def main(_):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if FLAGS.job_name == "ps":
        ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.00001))
        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            # protocol = "grpc_rdma",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index,
            config=ps_config)
        server.join()
    elif FLAGS.job_name == "worker":
        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            # protocol = "grpc_rdma",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index)
        local_worker_device = "/job:worker/task:%d" % FLAGS.task_index
        with tf.device(
                tf.train.replica_device_setter(
                    ps_device='/job:ps/cpu:0',
                    worker_device=local_worker_device,
                    cluster=cluster)):
            if FLAGS.network == 'lstm':
                from models.lstm import KitModel
            elif FLAGS.network == 'gru':
                from models.gru import KitModel
            elif FLAGS.network == 'fc':
                from models.fullyconnect import KitModel
            elif FLAGS.network == 'alexnet':
                from models.alexnet import KitModel
            elif FLAGS.network == 'vgg16':
                from models.vgg16 import KitModel
            elif FLAGS.network == 'vgg19' or FLAGS.network == 'vgg_e':
                from models.vgg19 import KitModel
            elif FLAGS.network == 'inception_v3':
                from models.inception_v3 import KitModel
            elif FLAGS.network == 'resnet':
                from models.resnet import KitModel
            elif FLAGS.network == 'seq2seq':
                import models.translate.translate
                from models.translate.translate import dist_train
                dist_train(FLAGS, server, cluster)
                sys.exit()
            else:
                sys.exit("Invalid network [%s]" % FLAGS.network)

            this_model = KitModel(FLAGS)
            this_model.build_model()

        train_dir = tempfile.mkdtemp()
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=[
                "/job:ps", "/job:worker/task:%d" % FLAGS.task_index
            ],
            graph_options=tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(
                    opt_level=tf.OptimizerOptions.L1)),
            gpu_options=tf.GPUOptions(visible_device_list=""))
        if FLAGS.infer_shapes:
            sess_config.graph_options.infer_shapes = FLAGS.infer_shapes

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir=train_dir,
                                 init_op=tf.global_variables_initializer(),
                                 global_step=this_model.global_step,
                                 summary_writer=None,
                                 saver=None)

        if FLAGS.task_index == 0:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config,
                                              start_standard_services=True)

        print_model()

        print("Start warmup %d epoch." % FLAGS.warmup)
        for _ in range(FLAGS.warmup):
            this_model.get_data()
            sess.run(this_model.train_op,
                     feed_dict=this_model.get_feed_dict())

        current_step = 0
        duration = 0
        while current_step < FLAGS.epoch:
            current_step += 1
            this_model.get_data()
            print("Start step %d" % current_step)
            start_time = time.time()
            _, step_loss = sess.run([this_model.train_op, this_model.cost],
                                    feed_dict=this_model.get_feed_dict())
            end_time = time.time()
            print(
                "Finish step %d, loss = %f, speed = %f samples/s, duration = %f seconds"
                % (current_step, step_loss, FLAGS.batch_size /
                   (end_time - start_time), end_time - start_time))
            duration += end_time - start_time

        print("Total Time = %f s." % duration)
        # writer.close()
    else:
        sys.exit("Invalid job role name [%s]!" % FLAGS.job_name)
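# The if/elif import ladder above can be expressed as a lookup table; a
# sketch using importlib with the module paths from the imports above
# ('seq2seq' is excluded because it takes a different code path):
import importlib
import sys

_NETWORKS = {
    'lstm': 'models.lstm',
    'gru': 'models.gru',
    'fc': 'models.fullyconnect',
    'alexnet': 'models.alexnet',
    'vgg16': 'models.vgg16',
    'vgg19': 'models.vgg19',
    'vgg_e': 'models.vgg19',
    'inception_v3': 'models.inception_v3',
    'resnet': 'models.resnet',
}

def get_kit_model(name):
    if name not in _NETWORKS:
        sys.exit("Invalid network [%s]" % name)
    return importlib.import_module(_NETWORKS[name]).KitModel

# Usage: this_model = get_kit_model(FLAGS.network)(FLAGS)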
def main(_): ps_hosts = FLAGS.ps_hosts.split(",") worker_hosts = FLAGS.worker_hosts.split(",") # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) if FLAGS.job_name == "ps": ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.01)) # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index, config=ps_config) server.join() elif FLAGS.job_name == "worker": # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc+verbs", job_name=FLAGS.job_name, task_index=FLAGS.task_index) ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset(FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir) ##################################### # Select the preprocessing function # ##################################### image_preprocessing_fn = preprocessing_factory.get_preprocessing( FLAGS.network, is_training=True) ###################### # Select the network # ###################### network_fn = nets_factory.get_network_fn(FLAGS.network, dataset.num_classes, is_training=True) if FLAGS.dataset_name != "synthetic": provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=FLAGS.num_readers, common_queue_capacity=20 * FLAGS.batch_size, common_queue_min=10 * FLAGS.batch_size) [image, label] = provider.get(['image', 'label']) train_image_size = FLAGS.train_image_size or network_fn.default_image_size image = image_preprocessing_fn(image, train_image_size, train_image_size) images, labels = tf.train.batch([image, label], batch_size=FLAGS.batch_size, num_threads=4, capacity=5 * FLAGS.batch_size) else: images = random_ops.random_uniform( (FLAGS.batch_size, network_fn.default_image_size, network_fn.default_image_size, 3), maxval=1) labels = random_ops.random_uniform((FLAGS.batch_size, ), maxval=FLAGS.num_classes - 1, dtype=tf.int32) with tf.device( tf.train.replica_device_setter( ps_device='/job:ps/cpu:0', worker_device=("/job:worker/task:%d" % FLAGS.task_index), cluster=cluster)): global_step = tf.contrib.framework.get_or_create_global_step() #images, labels = cifar.distorted_inputs(FLAGS) logits, end_points = network_fn(images) loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=labels) cost = tf.reduce_mean(loss) train_op = tf.train.AdagradOptimizer(0.01).minimize( cost, global_step=global_step) saver = tf.train.Saver() print_model() train_dir = tempfile.mkdtemp() sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=[ "/job:ps", "/job:worker/task:%d" % FLAGS.task_index ], graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( opt_level=tf.OptimizerOptions.L1)), gpu_options=tf.GPUOptions(visible_device_list="")) if FLAGS.infer_shapes == True: sess_config.graph_options.infer_shapes = FLAGS.infer_shapes sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), logdir=train_dir, init_op=tf.global_variables_initializer(), global_step=global_step, summary_writer=None, saver=saver) if FLAGS.task_index == 0: print("Worker %d: Initializing session..." % FLAGS.task_index) else: print("Worker %d: Waiting for session to be initialized..." 
% FLAGS.task_index)

        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config,
                                              start_standard_services=True)
        writer = tf.summary.FileWriter('./graphs', sess.graph)
        writer.close()
        tf.train.export_meta_graph(filename='kit_meta_graph.txt',
                                   graph=sess.graph,
                                   as_text=True)
        print("Graph Saved.")

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        print("Start warmup %d epoch." % FLAGS.warmup)
        for _ in range(FLAGS.warmup):
            sess.run(train_op)

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        current_step = 0
        duration = 0
        while current_step < FLAGS.epoch:
            current_step += 1
            start_time = time.time()
            _, step_loss = sess.run([train_op, cost],
                                    options=options,
                                    run_metadata=run_metadata)
            end_time = time.time()
            print(
                "Finish step %d, loss = %f, speed = %f samples/s, duration = %f seconds"
                % (current_step, step_loss, FLAGS.batch_size /
                   (end_time - start_time), end_time - start_time))
            duration += end_time - start_time
            if current_step == 3:
                fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                chrome_trace = fetched_timeline.generate_chrome_trace_format()
                with open('timeline.json', 'w') as f:
                    f.write(chrome_trace)

        print("Total Time = %f s." % duration)
        saver.save(sess, "kit_alexnet")
    else:
        sys.exit("Invalid job role name [%s]!" % FLAGS.job_name)
def tf_run_frozen_graph(self, file, xla, parallel, warmup, num_iter): print("run frozen graph----------------------------") graph_def, graph = self.import_graph(file) if (self.debug): print() print('Operations:') assert graph is not None ops = graph.get_operations() # type: Iterable[tf.Operation] input_nodes = [] variables_nodes = [] last_nodes = [] for op in ops: if (self.debug): print('- {0:20s} "{1}" ({2} outputs)'.format( op.type, op.name, len(op.outputs))) last_nodes = op.outputs if op.type == 'Placeholder': for node in op.outputs: input_nodes.append(node) if "Variable" in op.type: variables_nodes.append(op) if (self.debug): print() print('Sources (operations without inputs):') for op in ops: if len(op.inputs) > 0: continue print('- {0}'.format(op.name)) print() print('Operation inputs:') for op in ops: if len(op.inputs) == 0: continue print('- {0:20}'.format(op.name)) print(' {0}'.format(', '.join(i.name for i in op.inputs))) print() print('Tensors:') for op in ops: for out in op.outputs: print('- {0:20} {1:10} "{2}"'.format( str(out.shape), out.dtype.name, out.name)) with tf.Session(graph=graph) as sess: var_inits = [] g_def = graph.as_graph_def() for var in variables_nodes: vt = graph.get_tensor_by_name(var.outputs[0].name) # v = tf.get_variable(name = var.name, shape = vt.shape, initializer = tf.ones_initializer) # v = tf.get_variable(name = var.name, shape = vt.shape, initializer = tf.ones_initializer) # Ones initializer dt = tf.as_dtype(vt.dtype.base_dtype).as_datatype_enum dt_int32 = tf.as_dtype(tf.int32).as_datatype_enum init = tf.NodeDef( name=var.name + "/ones", op="Fill", input=[var.name + "/ones/shape", var.name + "/ones/const"], attr={ 'T': tf.AttrValue(type=dt), 'index_type': tf.AttrValue(type=dt_int32) }) shape = tf.NodeDef( name=var.name + "/ones/shape", op="Const", attr={ "dtype": tf.AttrValue(type=dt_int32), "value": tf.AttrValue(tensor=tf.make_tensor_proto( vt.get_shape().as_list())) }) const = tf.NodeDef( name=var.name + "/ones/const", op="Const", #dtype =tf.AttrValue(type=dt), attr={ "dtype": tf.AttrValue(type=dt), "value": tf.AttrValue(tensor=tf.make_tensor_proto(1.0, dt)) }) node = tf.NodeDef(name=var.name + "/assign", op='Assign', input=[var.name, var.name + "/ones"], attr={ 'use_locking': tf.AttrValue(b=False), 'validate_shape': tf.AttrValue(b=True), 'T': tf.AttrValue(type=dt) }) g_def.node.extend([shape, const, init, node]) var_inits.append("^" + var.name + "/assign") noop_assign = tf.NodeDef(name="init_all_var", op="NoOp", input=var_inits) g_def.node.extend([noop_assign]) tf.reset_default_graph() tf.import_graph_def(g_def) session_conf = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, graph_options=tf.GraphOptions(infer_shapes=True), inter_op_parallelism_threads=parallel) if xla: session_conf.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 with tf.Session(config=session_conf) as sess: init = tf.get_default_graph().get_operation_by_name( "import/init_all_var") input_nodes = [] varlist = [] feed_dict = {} aps = [] ops = tf.get_default_graph().get_operations() for op in ops: if op.type == 'Placeholder': for node in op.outputs: feed_dict[node] = np.ones( node.shape, dtype=node.dtype.as_numpy_dtype()) # Get result of applygradient for op in ops: if "ApplyGradient" in str(op.type): aps.append(op) varlist.append(op.inputs[0]) last_outputs = [] num_nodes = len(ops) name2nodeIdx_map = {} for i in range(num_nodes): name2nodeIdx_map[ops[i].name] = i node_outputs_ = [[] for i in range(num_nodes)] for n in 
range(num_nodes): op = ops[n] pending_count = len(op.inputs) for i in range(pending_count): input_name_id = op.inputs[i].name.split(':') node_outputs_[name2nodeIdx_map[input_name_id[0]]].append(n) for n in range(num_nodes): if len(node_outputs_[n]) == 0 and ops[n].type != 'NoOp': print('- {0:20s} {1}'.format(ops[n].type, ops[n].name)) for m in range(len(ops[n].inputs)): print('<-in-- {0:20s}'.format(ops[n].inputs[m].name)) last_outputs.append(ops[n].inputs[m]) # Init as Ones sess.run(init) # Get vals before apply_gradients for i in range(warmup): ret = sess.run(last_outputs + varlist, feed_dict) for i in range(0, len(last_outputs)): out_flat = ret[i].flat if (len(out_flat) > 0): max_len = min(10, len(out_flat)) print(last_outputs[i].name) print(out_flat[:max_len], "...(size=", len(out_flat), "end with", out_flat[-1], ")") # Do the apply_gradient sess.run(init) ret1 = sess.run(varlist + aps, feed_dict) print("Updated:") for i in range(0, len(varlist)): print(varlist[i].name, ret1[i]) iter_times = [] for i in range(num_iter): start_time = time.time() ret = sess.run(last_outputs + varlist, feed_dict) ret1 = sess.run(varlist + aps, feed_dict) iter_time = (time.time() - start_time) * 1000 iter_times.append(iter_time) print("Iteration time %f ms" % (iter_time)) print("Summary: [min, max, mean] = [%f, %f, %f] ms" % (min(iter_times), max(iter_times), sum(iter_times) / len(iter_times)))
def main(unused_argv): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '', zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) if FLAGS.use_async_checkpointing: save_checkpoints_steps = None else: save_checkpoints_steps = max(100, FLAGS.iterations_per_loop) config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, save_checkpoints_steps=save_checkpoints_steps, log_step_count_steps=FLAGS.log_step_count_steps, session_config=tf.ConfigProto( graph_options=tf.GraphOptions( rewrite_options=rewriter_config_pb2.RewriterConfig( disable_meta_optimizer=True))), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_cores, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig .PER_HOST_V2)) # pylint: disable=line-too-long resnet_classifier = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=resnet_model_fn, config=config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, export_to_tpu=FLAGS.export_to_tpu) assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', ( 'Invalid value for --precision flag; must be bfloat16 or float32.') tf.logging.info('Precision: %s', FLAGS.precision) use_bfloat16 = FLAGS.precision == 'bfloat16' # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. if FLAGS.bigtable_instance: tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table) select_train, select_eval = _select_tables_from_flags() imagenet_train, imagenet_eval = [ imagenet_input.ImageNetBigtableInput( is_training=is_training, use_bfloat16=use_bfloat16, transpose_input=FLAGS.transpose_input, selection=selection) for (is_training, selection) in [(True, select_train), (False, select_eval)] ] else: if FLAGS.data_dir == FAKE_DATA_DIR: tf.logging.info('Using fake dataset.') else: tf.logging.info('Using dataset: %s', FLAGS.data_dir) imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=FLAGS.transpose_input, cache=FLAGS.use_cache and is_training, num_parallel_calls=FLAGS.num_parallel_calls, use_bfloat16=use_bfloat16) for is_training in [True, False] ] steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size if FLAGS.mode == 'eval': # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time( ) # This time will include compilation time eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= FLAGS.train_steps: tf.logging.info( 'Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. 
tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=FLAGS.train_steps,
                                    hooks=hooks)
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s',
                                next_checkpoint, eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            # https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
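# A sketch of loading the SavedModel exported above for local inference,
# assuming TF 1.x. export_saved_model() writes a timestamped subdirectory
# under FLAGS.export_dir; export_path below stands for that subdirectory.
# The input key depends on the serving_input_receiver_fn, so inspect
# predict_fn.feed_tensors rather than assuming a name:
from tensorflow.contrib import predictor

def predict_once(export_path, image_batch):
    predict_fn = predictor.from_saved_model(export_path)
    input_key = list(predict_fn.feed_tensors.keys())[0]
    return predict_fn({input_key: image_batch})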
def facs_model(learning_rate, scale_class_weight, use_two_fc, use_three_fc, use_four_fc, use_five_fc, use_six_fc, use_seven_fc, hparam): config = tf.ConfigProto(graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( opt_level=tf.OptimizerOptions.L0))) tf.reset_default_graph() sess = tf.Session("", config=config) # Setup placeholders, and reshape the data x = tf.placeholder(tf.float32, [None, n_input], name="x") y = tf.placeholder(tf.float32, [None, n_output], name="labels") sw = tf.placeholder(tf.float32, [None, n_output], name='intensity_weights') # Main function compares the number of FCs for performance. if use_two_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc1/relu", relu) logits = fc_layer(relu, n_hidden_1, n_output, "fc2") elif use_three_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc3/relu", relu) fc2 = fc_layer(relu, n_hidden_1, n_hidden_2, "fc2") relu_2 = tf.nn.relu(fc2) tf.summary.histogram("fc3/relu", relu_2) logits = fc_layer(relu_2, n_hidden_2, n_output, "fc3") elif use_four_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc4/relu", relu) fc2 = fc_layer(relu, n_hidden_1, n_hidden_2, "fc2") relu_2 = tf.nn.relu(fc2) tf.summary.histogram("fc4/relu", relu_2) fc3 = fc_layer(relu_2, n_hidden_2, n_hidden_3, "fc3") relu_3 = tf.nn.relu(fc3) tf.summary.histogram("fc4/relu", relu_3) logits = fc_layer(relu_3, n_hidden_3, n_output, "fc4") elif use_five_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc5/relu", relu) fc2 = fc_layer(relu, n_hidden_1, n_hidden_2, "fc2") relu_2 = tf.nn.relu(fc2) tf.summary.histogram("fc5/relu", relu_2) fc3 = fc_layer(relu_2, n_hidden_2, n_hidden_3, "fc3") relu_3 = tf.nn.relu(fc3) tf.summary.histogram("fc5/relu", relu_3) fc4 = fc_layer(relu_3, n_hidden_3, n_hidden_4, "fc4") relu_4 = tf.nn.relu(fc4) tf.summary.histogram("fc5/relu", relu_4) logits = fc_layer(relu_4, n_hidden_4, n_output, "fc5") elif use_six_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc6/relu", relu) fc2 = fc_layer(relu, n_hidden_1, n_hidden_2, "fc2") relu_2 = tf.nn.relu(fc2) tf.summary.histogram("fc6/relu", relu_2) fc3 = fc_layer(relu_2, n_hidden_2, n_hidden_3, "fc3") relu_3 = tf.nn.relu(fc3) tf.summary.histogram("fc6/relu", relu_3) fc4 = fc_layer(relu_3, n_hidden_3, n_hidden_4, "fc4") relu_4 = tf.nn.relu(fc4) tf.summary.histogram("fc6/relu", relu_4) fc5 = fc_layer(relu_4, n_hidden_4, n_hidden_5, "fc5") relu_5 = tf.nn.relu(fc5) tf.summary.histogram("fc6/relu", relu_5) logits = fc_layer(relu_5, n_hidden_5, n_output, "fc6") elif use_seven_fc: fc1 = fc_layer(x, n_input, n_hidden_1, "fc1") relu = tf.nn.relu(fc1) tf.summary.histogram("fc7/relu", relu) fc2 = fc_layer(relu, n_hidden_1, n_hidden_2, "fc2") relu_2 = tf.nn.relu(fc2) tf.summary.histogram("fc7/relu", relu_2) fc3 = fc_layer(relu_2, n_hidden_2, n_hidden_3, "fc3") relu_3 = tf.nn.relu(fc3) tf.summary.histogram("fc7/relu", relu_3) fc4 = fc_layer(relu_3, n_hidden_3, n_hidden_4, "fc4") relu_4 = tf.nn.relu(fc4) tf.summary.histogram("fc7/relu", relu_4) fc5 = fc_layer(relu_4, n_hidden_4, n_hidden_5, "fc5") relu_5 = tf.nn.relu(fc5) tf.summary.histogram("fc7/relu", relu_5) fc6 = fc_layer(relu_5, n_hidden_5, n_hidden_6, "fc6") relu_6 = tf.nn.relu(fc6) tf.summary.histogram("fc7/relu", relu_6) logits = fc_layer(relu_6, n_hidden_6, n_output, "fc7") else: logits = fc_layer(x, n_input, n_output, "fc") 
    # Loss function
    with tf.name_scope("xent"):
        # The positive and negative samples in the data are unbalanced.
        # To push the algorithm to focus on fitting positives, I weighted the
        # positive values more than the negative.
        maxY = tf.reduce_sum(y, 1) * scale_class_weight
        class_weights = (maxY + 1) / 6
        # Some expressions are more intense than others in the CK+ database,
        # and that is weighted in the loss function by sample weights, sw.
        # However, I got better results with just weighting all AUs
        # with equal intensity.
        # mult_w = tf.multiply(y, sw)
        # sum_w = tf.reduce_sum(mult_w,1)
        # class_weights = ( sum_w + 1) / 6
        print(class_weights.get_shape())
        # Reduce over the AU axis to get a per-sample loss, so that the
        # per-sample class weights apply before the final mean.
        xent = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                    labels=y,
                                                    name="xent"), 1)
        xent = tf.reduce_mean(xent * class_weights)
        tf.summary.scalar("xent", xent)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)

    with tf.name_scope("accuracy"):
        zero = tf.constant(0, dtype=tf.float32)
        onesMat = tf.ones_like(logits)
        zerosMat = tf.zeros_like(logits)
        onesY = tf.ones_like(y, dtype=tf.float32)
        yFloat = tf.cast(y, dtype=tf.float32)
        yFlipped = onesY - yFloat

        # PREDICTION - If logits >= 0, logits = 1, else logits = 0.
        logitsBin = tf.cast(tf.where(logits >= zero, onesMat, zerosMat),
                            dtype=tf.float32,
                            name="op_to_restore")
        tf.add_to_collection("coll", logitsBin)
        tf.add_to_collection("coll", x)
        print('logitsBin', logitsBin.get_shape())
        print('y', y.get_shape())
        print('where_logitsBin', tf.where(logitsBin)[:, 1].get_shape())
        print('where_y', tf.where(y)[:, 1].get_shape())

        time_steps = tf.cast(tf.shape(y)[0], dtype='int32')
        print(time_steps.get_shape())
        nFacs = tf.count_nonzero(y, 1, dtype=tf.float32)
        onesFacs = tf.ones_like(nFacs)
        nFacs_Zeros = onesFacs * numFacs - nFacs
        nFacs = tf.where(tf.equal(nFacs, zero), onesFacs, nFacs)
        nFacs_Zeros = tf.where(tf.equal(nFacs_Zeros, zero), onesFacs,
                               nFacs_Zeros)

        # Find TPR, TNR, FPR, FNR.
        matrix_positive = tf.cast(
            tf.equal(logitsBin, y)
            & tf.equal(yFloat, tf.constant(1, dtype=tf.float32)),
            dtype=tf.float32)
        correct_pos = tf.reduce_sum(matrix_positive) / tf.reduce_sum(yFloat)
        tf.summary.scalar("TruePosRate", correct_pos)

        matrix_negative = tf.cast(tf.equal(logitsBin, y)
                                  & tf.equal(yFloat, zero),
                                  dtype=tf.float32)
        correct_neg = tf.reduce_sum(matrix_negative) / tf.reduce_sum(yFlipped)
        tf.summary.scalar("TrueNegRate", correct_neg)

        matrix_falsePos = tf.cast(tf.not_equal(logitsBin, y)
                                  & tf.equal(y, zero),
                                  dtype=tf.float32)  # or yFlipped = 1
        falsePos = tf.reduce_sum(matrix_falsePos) / tf.reduce_sum(yFlipped)
        tf.summary.scalar("falsePosRate", falsePos)

        matrix_falseNeg = tf.cast(
            tf.not_equal(logitsBin, y)
            & tf.equal(yFloat, tf.constant(1, dtype=tf.float32)),
            dtype=tf.float32)
        falseNeg = tf.reduce_sum(matrix_falseNeg) / tf.reduce_sum(yFloat)
        tf.summary.scalar("falseNegRate", falseNeg)

        tp_sum = tf.reduce_sum(matrix_positive, 0)
        tp_sum_append = tf.concat([tf.constant([0], dtype=tf.float32),
                                   tp_sum], 0)
        tf_sum = tf.reduce_sum(matrix_negative, 0)
        fp_sum = tf.reduce_sum(matrix_falsePos, 0)
        fn_sum = tf.reduce_sum(matrix_falseNeg, 0)

        # Get Matrix of Confusion for multiclass binary classifier.
confusion = tf.Variable(initial_value=tf.zeros( [n_output + 1, n_output + 1]), name='confusion') confusion1 = tf.Variable(initial_value=tf.cast(tf.diag( np.repeat(1, n_output + 1)), dtype=tf.float32), name='confusion1') confusion2 = tf.Variable(initial_value=tf.zeros( [n_output + 1, n_output + 1]), name='confusion2') confusion3 = tf.Variable(initial_value=tf.zeros( [n_output + 1, n_output + 1]), name='confusion3') confusion4 = tf.Variable(initial_value=tf.zeros( [n_output + 1, n_output + 1]), name='confusion4') confusion1 = confusion1[0, 0].assign(5) confusion1 = confusion1 * tp_sum_append confusion2 = confusion2[0, 0].assign(tf.reduce_sum(tf_sum)) confusion3 = tf.assign(confusion3[0, 1:n_output + 1], fp_sum) confusion4 = confusion4[1:n_output + 1, 0].assign(fn_sum) confusion = confusion1 + confusion2 + confusion3 + confusion4 txtConfusion = tf.as_string(confusion, precision=0, name='txtConfusion') tf.summary.text('txtConfusion', txtConfusion) correct_prediction = tf.cast(tf.equal(logitsBin, y), dtype=tf.float32, name="correct_prediction") accuracy = tf.reduce_mean(correct_prediction, name="accuracy") tf.summary.scalar("accuracy", accuracy) # Summary for tensorboard summ = tf.summary.merge_all() saver = tf.train.Saver() init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init) writer = tf.summary.FileWriter(LOGDIR + hparam + '/train') test_writer = tf.summary.FileWriter(LOGDIR + hparam + '/test') writer.add_graph(sess.graph) for i in range(3001): if i % 5 == 0: [train_accuracy, s] = sess.run([accuracy, summ], feed_dict={ x: train_x, y: train_y, sw: sw_train }) sess.run([confusion], feed_dict={ x: test_x, y: test_y, sw: sw_test }) writer.add_summary(s, i) if i % 50 == 0: [acc, s] = sess.run([accuracy, summ], feed_dict={ x: test_x, y: test_y, sw: sw_test }) sess.run([confusion], feed_dict={ x: test_x, y: test_y, sw: sw_test }) test_writer.add_summary(s, i) saver.save(sess, os.path.join(savepath, hparam, "model"), i) sess.run(train_step, feed_dict={x: train_x, y: train_y, sw: sw_train})
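# The seven use_*_fc branches in facs_model above differ only in depth; a
# sketch of the same stack built in a loop (equivalent up to the histogram
# tags, which the original reuses inconsistently; fc_layer is the helper
# defined elsewhere in this script, and `sizes` is illustrative):
import tensorflow as tf

def fc_stack(x, sizes, n_output):
    """sizes = [n_input, n_hidden_1, ...]; returns logits."""
    h = x
    for i in range(len(sizes) - 1):
        fc = fc_layer(h, sizes[i], sizes[i + 1], "fc%d" % (i + 1))
        h = tf.nn.relu(fc)
        tf.summary.histogram("fc%d/relu" % (i + 1), h)
    return fc_layer(h, sizes[-1], n_output, "fc%d" % len(sizes))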
print(f"Rank {hvd.rank()}:{hvd.local_rank()} reporting!") else: np.random.seed(args.seed) tf.random.set_random_seed(args.seed) random.seed(args.seed) if args.architecture in ('stylegan2'): assert args.starting_phase == args.ending_phase if 'OMP_NUM_THREADS' not in os.environ: print("Warning: OMP_NUM_THREADS not set. Setting it to 1.") os.environ['OMP_NUM_THREADS'] = str(1) gopts = tf.GraphOptions(place_pruned_graph=True) config = tf.ConfigProto(graph_options=gopts, allow_soft_placement=True) # config = tf.ConfigProto() if args.gpu: config.gpu_options.allow_growth = True # config.inter_op_parallelism_threads = 1 #config.gpu_options.per_process_gpu_memory_fraction = 0.96 if args.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) else: config = tf.ConfigProto( graph_options=gopts, intra_op_parallelism_threads=int(os.environ['OMP_NUM_THREADS']), inter_op_parallelism_threads=args.num_inter_ops,
def _testDecoderFPropFloatHelper(self, func_inline=False, num_decoder_layers=1, target_seq_len=5, residual_start=0): """Computes decoder from params and computes loss with random inputs.""" cluster = cluster_factory.ForTestingWorker(add_summary=True) config = tf.ConfigProto(graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( do_function_inlining=func_inline))) with cluster, self.session(graph=tf.Graph(), use_gpu=False, config=config) as sess: tf.set_random_seed(8372749040) vn_config = py_utils.VariationalNoiseParams(None, False, False) p = self._DecoderParams(vn_config) p.rnn_layers = num_decoder_layers p.residual_start = residual_start p.target_seq_len = target_seq_len dec = p.Instantiate() src_seq_len = 5 src_enc = tf.random_normal([src_seq_len, 2, 8], seed=9283748) src_enc_padding = tf.constant( [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=tf.float32) encoder_outputs = py_utils.NestedMap(encoded=src_enc, padding=src_enc_padding) target_ids = tf.transpose( tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15], [5, 6, 7, 8], [10, 5, 2, 5]], dtype=tf.int32)) target_labels = tf.transpose( tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13], [5, 7, 8, 10], [10, 5, 2, 4]], dtype=tf.int32)) target_paddings = tf.transpose( tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [1, 1, 1, 1]], dtype=tf.float32)) target_transcripts = tf.constant( ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf']) target_weights = 1.0 - target_paddings targets = py_utils.NestedMap({ 'ids': target_ids, 'labels': target_labels, 'weights': target_weights, 'paddings': target_paddings, 'transcripts': target_transcripts, }) metrics = dec.FPropDefaultTheta(encoder_outputs, targets) loss = metrics['loss'][0] correct_predicts = metrics['fraction_of_correct_next_step_preds'][ 0] summaries = tf.summary.merge( tf.get_collection(tf.GraphKeys.SUMMARIES)) tf.global_variables_initializer().run() loss_v, _ = sess.run([loss, correct_predicts]) summaries.eval() return loss_v
lms_obj.run(graph=tf.get_default_graph()) from tensorflow.examples.tutorials.mnist import input_data mnist = input_data.read_data_sets("/tmp/MNIST_data/data/") from tensorflow.core.protobuf import rewriter_config_pb2 rewrite_options = rewriter_config_pb2.RewriterConfig(disable_model_pruning=True, #constant_folding=rewriter_config_pb2.RewriterConfig.OFF, #dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF, #layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF, #arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF, #min_graph_nodes=-1, memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)#SCHEDULING_HEURISTICS) graph_options = tf.GraphOptions(rewrite_options=rewrite_options)#, infer_shapes=True) config = tf.ConfigProto(graph_options=graph_options, allow_soft_placement=True, log_device_placement=True) config.gpu_options.allow_growth=True #run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #run_metadata = tf.RunMetadata() #graph = tf.get_default_graph() #writer = tf.summary.FileWriter("./rewriter_graph1") #writer.add_graph(graph=graph) import numpy as np picture = np.ones([batch_size, 200 * 200], dtype=np.float32) picture_label = np.ones([batch_size], dtype=np.float32) with tf.Session(config=config) as sess:
def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
    config = tf.ConfigProto(graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(
            do_function_inlining=func_inline)))
    with self.session(graph=tf.Graph(), use_gpu=False,
                      config=config) as sess:
        tf.set_random_seed(8372749040)
        np.random.seed(274854)
        vn_config = py_utils.VariationalNoiseParams(None, False, False)
        p = self._DecoderParams(vn_config)
        p.dtype = tf.float64

        dec = p.Instantiate()
        src_seq_len = 5
        src_enc = tf.constant(np.random.uniform(size=(src_seq_len, 2, 8)),
                              tf.float64)
        src_enc_padding = tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=tf.float64)
        encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                             padding=src_enc_padding)
        target_ids = tf.transpose(
            tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                         [5, 6, 7, 8], [10, 5, 2, 5]],
                        dtype=tf.int32))
        target_labels = tf.transpose(
            tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                         [5, 7, 8, 10], [10, 5, 2, 4]],
                        dtype=tf.int32))
        target_paddings = tf.transpose(
            tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                         [0, 1, 0, 0], [1, 1, 1, 1]],
                        dtype=tf.float64))
        target_transcripts = tf.constant(
            ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
        target_weights = 1.0 - target_paddings

        targets = py_utils.NestedMap({
            'ids': target_ids,
            'labels': target_labels,
            'weights': target_weights,
            'paddings': target_paddings,
            'transcripts': target_transcripts,
        })
        metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
        loss = metrics['loss'][0]
        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)

        def DenseGrad(var, grad):
            if isinstance(grad, tf.Tensor):
                return grad
            elif isinstance(grad, tf.IndexedSlices):
                return tf.unsorted_segment_sum(grad.values, grad.indices,
                                               tf.shape(var)[0])

        dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

        tf.global_variables_initializer().run()

        test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
        # Second run to make sure the function is deterministic.
        test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

        symbolic_grads = [x.eval() for x in dense_grads if x is not None]
        numerical_grads = []
        for v in all_vars:
            numerical_grads.append(
                test_utils.ComputeNumericGradient(sess, loss, v))

        for x, y in zip(symbolic_grads, numerical_grads):
            self.assertAllClose(x, y)
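# For reference, a sketch of the central-difference check that a helper like
# test_utils.ComputeNumericGradient presumably performs; this is illustrative,
# not the library's implementation:
import numpy as np
import tensorflow as tf

def numeric_grad(sess, loss, var, delta=1e-5):
    """Central-difference d(loss)/d(var), element by element."""
    base = sess.run(var)
    ph = tf.placeholder(var.dtype.base_dtype, base.shape)
    setter = var.assign(ph)
    grad = np.zeros_like(base)
    it = np.nditer(base, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        plus, minus = base.copy(), base.copy()
        plus[idx] += delta
        minus[idx] -= delta
        sess.run(setter, feed_dict={ph: plus})
        lp = sess.run(loss)
        sess.run(setter, feed_dict={ph: minus})
        lm = sess.run(loss)
        grad[idx] = (lp - lm) / (2 * delta)
        it.iternext()
    sess.run(setter, feed_dict={ph: base})  # restore the original value
    return grad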
def freeze_graph_with_def_protos( input_graph_def, input_saver_def, input_checkpoint, output_node_names, restore_op_name, filename_tensor_name, clear_devices, initializer_nodes, optimize_graph=True, variable_names_blacklist=''): """Converts all variables in a graph and checkpoint into constants.""" del restore_op_name, filename_tensor_name # Unused by updated loading code. # 'input_checkpoint' may be a prefix if we're using Saver V2 format if not saver_lib.checkpoint_exists(input_checkpoint): raise ValueError( 'Input checkpoint "' + input_checkpoint + '" does not exist!') if not output_node_names: raise ValueError( 'You must supply the name of a node to --output_node_names.') # Remove all the explicit device specifications for this node. This helps to # make the graph more portable. if clear_devices: for node in input_graph_def.node: node.device = '' with tf.Graph().as_default(): tf.import_graph_def(input_graph_def, name='') if optimize_graph: logging.info('Graph Rewriter optimizations enabled') rewrite_options = rewriter_config_pb2.RewriterConfig( optimize_tensor_layout=True) rewrite_options.optimizers.append('pruning') rewrite_options.optimizers.append('constfold') rewrite_options.optimizers.append('layout') graph_options = tf.GraphOptions( rewrite_options=rewrite_options, infer_shapes=True) else: logging.info('Graph Rewriter optimizations disabled') graph_options = tf.GraphOptions() config = tf.ConfigProto(graph_options=graph_options) with session.Session(config=config) as sess: if input_saver_def: saver = saver_lib.Saver(saver_def=input_saver_def) saver.restore(sess, input_checkpoint) else: var_list = {} reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint) var_to_shape_map = reader.get_variable_to_shape_map() for key in var_to_shape_map: try: tensor = sess.graph.get_tensor_by_name(key + ':0') except KeyError: # This tensor doesn't exist in the graph (for example it's # 'global_step' or a similar housekeeping element) so skip it. continue var_list[key] = tensor saver = saver_lib.Saver(var_list=var_list) saver.restore(sess, input_checkpoint) if initializer_nodes: sess.run(initializer_nodes) variable_names_blacklist = (variable_names_blacklist.split(',') if variable_names_blacklist else None) output_graph_def = graph_util.convert_variables_to_constants( sess, input_graph_def, output_node_names.split(','), variable_names_blacklist=variable_names_blacklist) return output_graph_def
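# A usage sketch for freeze_graph_with_def_protos() above. The file names and
# the output node names are illustrative; restore_op_name and
# filename_tensor_name are ignored by the function, so empty strings suffice.
from google.protobuf import text_format
import tensorflow as tf

def freeze_from_files(graph_pbtxt, checkpoint_prefix, output_nodes, out_path):
    graph_def = tf.GraphDef()
    with open(graph_pbtxt) as f:
        text_format.Merge(f.read(), graph_def)
    frozen = freeze_graph_with_def_protos(
        input_graph_def=graph_def,
        input_saver_def=None,
        input_checkpoint=checkpoint_prefix,
        output_node_names=output_nodes,  # e.g. 'detection_boxes,num_detections'
        restore_op_name='',
        filename_tensor_name='',
        clear_devices=True,
        initializer_nodes='')
    with open(out_path, 'wb') as f:
        f.write(frozen.SerializeToString())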
a1 = tf.ones((n, n)) a2 = tf.ones((n, n)) with tf.device("cpu:1"): a3 = tf.matmul(a1, a2) with tf.device("cpu:2"): a4 = tf.matmul(a1, a2) with tf.device("cpu:3"): a5 = tf.matmul(a3, a4) # Turn off graph optimizations no_opt = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0, do_common_subexpression_elimination=False, do_function_inlining=False, do_constant_folding=False) config = tf.ConfigProto( graph_options=tf.GraphOptions(optimizer_options=no_opt), log_device_placement=True, allow_soft_placement=False, device_count={"CPU": 8}, inter_op_parallelism_threads=3, intra_op_parallelism_threads=1) sess = tf.Session(config=config) run_metadata = tf.RunMetadata() run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, output_partition_graphs=True) # Run session. sess.run(a5.op, options=run_options, run_metadata=run_metadata) trace = timeline.Timeline(step_stats=run_metadata.step_stats)
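# The snippet above builds the Timeline but never writes it out; completing
# the pattern used elsewhere in this collection (the file name is illustrative):
with open("matmul_cpu_profile.json", "w") as f:
    f.write(trace.generate_chrome_trace_format())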
def main(unused_argv): # Mnas optimize - set the proper image data format tf.keras.backend.set_image_data_format(FLAGS.data_format) # Mnas optimize - optimization flags # gpu_thread_count = 2 # os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' # os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) # os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # enable mixed precision? -> Not much benefits seen yet # os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" # Horovod: initialize Horovod. if FLAGS.use_horovod: hvd.init() tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '', zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) if FLAGS.use_async_checkpointing: save_checkpoints_steps = None else: if not FLAGS.use_horovod: save_checkpoints_steps = max(100, FLAGS.iterations_per_loop) else: save_checkpoints_steps = max( 100, FLAGS.iterations_per_loop) if hvd.rank() == 0 else None config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, save_checkpoints_steps=save_checkpoints_steps, log_step_count_steps=FLAGS.log_step_count_steps, session_config=tf.ConfigProto( graph_options=tf.GraphOptions( rewrite_options=rewriter_config_pb2.RewriterConfig( disable_meta_optimizer=True))), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig .PER_HOST_V2)) # pylint: disable=line-too-long if FLAGS.use_xla: config.session_config.graph_options.optimizer_options.global_jit_level = ( tf.OptimizerOptions.ON_1) # Horovod: pin GPU to be used to process local rank (one GPU per process) if FLAGS.use_horovod: config.session_config.gpu_options.allow_growth = True config.session_config.gpu_options.visible_device_list = str( hvd.local_rank()) # Validates Flags. if FLAGS.use_bfloat16 and FLAGS.use_keras: raise ValueError( 'Keras layers do not have full support to bfloat16 activation training.' ' You have set use_bfloat as %s and use_keras as %s' % (FLAGS.use_bfloat16, FLAGS.use_keras)) # Initializes model parameters. steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size steps_per_epoch = steps_per_epoch // hvd.size( ) if FLAGS.use_horovod else steps_per_epoch params = dict(steps_per_epoch=steps_per_epoch, use_bfloat16=FLAGS.use_bfloat16, quantized_training=FLAGS.quantized_training) if FLAGS.use_horovod: params['hvd'] = True params['hvd_curr_host'] = hvd.rank() params['hvd_num_hosts'] = hvd.size() mnasnet_est = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=mnasnet_model_fn, config=config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, export_to_tpu=FLAGS.export_to_tpu, params=params) # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from # rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights or # restored from a checkpoint. if FLAGS.use_horovod: bcast_hook = hvd.BroadcastGlobalVariablesHook(0) # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. 
if FLAGS.bigtable_instance: tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table) select_train, select_eval = _select_tables_from_flags() imagenet_train, imagenet_eval = [ imagenet_input.ImageNetBigtableInput( is_training=is_training, use_bfloat16=False, transpose_input=FLAGS.transpose_input, selection=selection) for (is_training, selection) in [(True, select_train), (False, select_eval)] ] else: if FLAGS.data_dir == FAKE_DATA_DIR: tf.logging.info('Using fake dataset.') else: tf.logging.info('Using dataset: %s', FLAGS.data_dir) imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=FLAGS.transpose_input, cache=FLAGS.use_cache and is_training, image_size=FLAGS.input_image_size, num_parallel_calls=FLAGS.num_parallel_calls, use_bfloat16=FLAGS.use_bfloat16) for is_training in [True, False] ] if FLAGS.mode == 'eval': eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time( ) # This time will include compilation time eval_results = mnasnet_est.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= FLAGS.train_steps: tf.logging.info( 'Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', ckpt) if FLAGS.export_dir: export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( # pylint: disable=protected-access FLAGS.model_dir) tf.logging.info( 'Training for %d steps (%.2f epochs in total). Current' ' step %d.', FLAGS.train_steps, FLAGS.train_steps / params['steps_per_epoch'], current_step) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.mode == 'train': hooks = [] if FLAGS.use_async_checkpointing: hooks.append( async_checkpoint.AsyncCheckpointSaverHook( checkpoint_dir=FLAGS.model_dir, save_steps=max(100, FLAGS.iterations_per_loop))) mnasnet_est.train(input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps, hooks=hooks) else: assert FLAGS.mode == 'train_and_eval' curr_rank = 0 if FLAGS.use_horovod: curr_rank = hvd.rank() while current_step < FLAGS.train_steps: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. 
next_checkpoint = min(current_step + FLAGS.steps_per_eval, FLAGS.train_steps) if FLAGS.use_horovod: # try dali pipeline mnasnet_est.train(input_fn=imagenet_train.train_data_fn, max_steps=next_checkpoint, hooks=[bcast_hook]) # this uses the old tf data pipeline # mnasnet_est.train( # input_fn=imagenet_train.input_fn, max_steps=next_checkpoint, hooks=[bcast_hook]) else: mnasnet_est.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d. Hvd rank %d', next_checkpoint, int(time.time() - start_timestamp), curr_rank) # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be excluded modulo the batch size. As long as the batch size is # consistent, the evaluated images are also consistent. eval_on_single_gpu = FLAGS.eval_on_single_gpu tf.logging.info('Starting to evaluate.') if eval_on_single_gpu: if curr_rank == 0: eval_results = mnasnet_est.evaluate( input_fn=imagenet_eval.train_data_fn, #input_fn steps=FLAGS.num_eval_images // FLAGS.eval_batch_size) tf.logging.info( 'Eval results at step %d: %s. Hvd rank %d', next_checkpoint, eval_results, curr_rank) else: eval_results = mnasnet_est.evaluate( input_fn=imagenet_eval.train_data_fn, #input_fn steps=FLAGS.num_eval_images // FLAGS.eval_batch_size) tf.logging.info('Eval results at step %d: %s. Hvd rank %d', next_checkpoint, eval_results, curr_rank) elapsed_time = int(time.time() - start_timestamp) tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', FLAGS.train_steps, elapsed_time) if FLAGS.export_dir: export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
def test_avgpool2d(): ''' Run tests on the Wave custom avgpool2d operator. ''' tf.reset_default_graph() # Turn off graph-rewriting optimizations config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) iterations = 100 for i in range(iterations): tf.reset_default_graph() # NCHW t_n = 1 t_h = 64 t_w = 64 t_c = 2 # window w_n = 1 w_h = 2 w_w = 2 w_c = 1 #strides s_n = 1 s_h = 2 s_w = 2 s_c = 1 # N H W C max_in = tf.get_variable("a", [t_n, t_h, t_w, t_c], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.1)) t_init = tf.global_variables_initializer() # SAME variant with tf.Session('', config=config) as sess: t_init.run() # print("Wave Kernel:\n-------------------------------------------------") z_op = waveflow.wavecomp_ops_module.wave_avg_pool_dfx( max_in, ksize=[w_n, w_h, w_w, w_c], strides=[s_n, s_h, s_w, s_c], padding='SAME', data_format='NHWC') # Base tensorflow. Only supports NHWC. z2_op = nn_ops.avg_pool( max_in, ksize=[w_n, w_h, w_w, w_c], strides=[s_n, s_h, s_w, s_c], padding='SAME', data_format='NHWC') # z = z_op.eval() # z2 = z2_op.eval() z, z2 = sess.run([z_op, z2_op]) # print("\nTF:\n-------------------------------------------------") assert_str = "Failure on i: %d, mode: SAME" % (i) if not compare_tensor(z, z2, assert_str): print("z: shape: %s, %s" % (z.shape, z)) print("z (np): shape: %s, %s" % (z2.shape, z2)) print("\n\n") assert False # Valid variant with tf.Session('', config=config) as sess: t_init.run() # print("Wave Kernel:\n-------------------------------------------------") z_op = waveflow.wavecomp_ops_module.wave_avg_pool_dfx( max_in, ksize=[w_n, w_h, w_w, w_c], strides=[s_n, s_h, s_w, s_c], padding='VALID', data_format='NHWC') # Base tensorflow. Only supports NHWC. z2_op = nn_ops.avg_pool( max_in, ksize= [w_n, w_h, w_w, w_c], strides=[s_n, s_h, s_w, s_c], padding='VALID', data_format='NHWC') z, z2 = sess.run([z_op, z2_op]) # print("\nTF:\n-------------------------------------------------") assert_str = "Failure on i: %d, mode: VALID" % (i) if not compare_tensor(z, z2, assert_str): print("z: shape: %s, %s" % (z.shape, z)) print("z (np): shape: %s, %s" % (z2.shape, z2)) print("\n\n") assert False return True