def main(_):
    """Build, train, and evaluate the spindle-detection model.

    Builds Train/Valid/Test graphs that share one set of "Model" variables,
    exports the metagraph (optionally auto-parallelized across GPUs), then in
    a fresh graph either trains from scratch (config.test_mode == 0) or
    restores a checkpoint from FLAGS.model_path, and finally reports test loss.
    """
    # Remember stdout so it can be restored at the end (logging used to be
    # redirected to a file here).
    stdout_backup = sys.stdout
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    target_f = FLAGS.target_f
    config = get_config()
    # 250 ms of signal expressed as a step count at target_f Hz —
    # assumes target_f is the sampling frequency; TODO confirm.
    config.num_steps = num_steps = int((250 * 0.001) / (1 / target_f))
    eval_config = get_config()
    eval_config.num_steps = num_steps
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            # Input pipeline pinned to CPU so the GPU is free for the model.
            with tf.device('/cpu:0'):
                train_input = SpindleInputDatasetAPI(config=config, name="train")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = SpindleModel(is_training=True, config=config,
                                 input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = SpindleInputDatasetAPI(config=config, name="valid")
            # reuse=True: share the variables created by the training model.
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = SpindleModel(is_training=False, config=config,
                                      input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = SpindleInputDatasetAPI(config=eval_config, name="test")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = SpindleModel(is_training=False, config=eval_config,
                                     input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    # Re-import the (possibly rewritten) metagraph into a fresh graph and run.
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        # Supervisor checkpoints every 600 s into FLAGS.save_path.
        sv = tf.train.Supervisor(logdir=FLAGS.save_path, save_model_secs=600)
        gpu_options = tf.GPUOptions(allow_growth=True)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement,
                                      gpu_options=gpu_options)
        print(
            "=============================================New Test !!!!!=================================================="
        )
        print(" Learning rate = %.6f Keep Probability = %.2f " %
              (config.learning_rate, config.keep_prob))
        with sv.managed_session(config=config_proto) as session:
            gpus = [
                x.name for x in device_lib.list_local_devices()
                if x.device_type == "GPU"
            ]
            if FLAGS.num_gpus > len(gpus):
                raise ValueError(
                    "Your machine has only %d gpus "
                    "which is less than the requested --num_gpus=%d."
                    % (len(gpus), FLAGS.num_gpus))
            if config.test_mode == 0:
                # Training mode: full schedule with learning-rate decay after
                # max_epoch warm epochs.
                training_loss = []
                for i in range(config.max_max_epoch):
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.6f"
                          % (i + 1, session.run(m.lr)))
                    train_loss = run_epoch(session, m, eval_op=m.train_op,
                                           verbose=True)
                    print("Epoch: %d Train Loss: %.3f" % (i + 1, train_loss))
                    valid_loss = run_epoch(session, mvalid, verbose=False)
                    print("Epoch: %d" % (i + 1))
                    cprint("Valid Loss: %.3f" % (valid_loss), 'white', 'on_yellow')
                    training_loss.append(train_loss)
                if FLAGS.save_path:
                    print("Saving model to %s." % FLAGS.save_path)
                    sv.saver.save(session, FLAGS.save_path,
                                  global_step=sv.global_step)
            else:
                # Evaluation-only mode: restore the latest checkpoint.
                sv.saver.restore(session,
                                 tf.train.latest_checkpoint(FLAGS.model_path))
            test_loss = run_epoch(session, mtest, verbose=False)
            print("Test Loss: %.3f" % test_loss)
    sys.stdout = stdout_backup
def main(_):
    """Train the language model and export an inference graph to TFLite.

    Builds Train/Valid/Test/Infer graphs sharing one set of "Model"
    variables, round-trips the graph through a metagraph (for optional
    multi-GPU auto-parallelism), trains for config.max_max_epoch epochs,
    reports test perplexity, optionally checkpoints to FLAGS.save_path,
    and finally converts the inference model to converted_model.tflite.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to LM data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.lm_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, vocabulary = raw_data
    config = get_config()
    config.vocab_size = vocabulary
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.vocab_size = vocabulary
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    # Inference config: batch of 1, num_steps left at its configured default.
    infer_config = get_config()
    infer_config.vocab_size = vocabulary
    infer_config.batch_size = 1
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = LMInput(config=config, data=train_data,
                                  name="TrainInput")
            with tf.compat.v1.variable_scope("Model", reuse=None,
                                             initializer=initializer):
                m = LMModel(is_training=True, config=config,
                            input_=train_input, is_inference=False)
            tf.compat.v1.summary.scalar("Training_Loss", m.cost)
            tf.compat.v1.summary.scalar("Learning_Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = LMInput(config=config, data=valid_data,
                                  name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.compat.v1.variable_scope("Model", reuse=True,
                                             initializer=initializer):
                mvalid = LMModel(is_training=False, config=config,
                                 input_=valid_input, is_inference=False)
            tf.compat.v1.summary.scalar("Validation_Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = LMInput(config=eval_config, data=test_data,
                                 name="TestInput")
            with tf.compat.v1.variable_scope("Model", reuse=True,
                                             initializer=initializer):
                mtest = LMModel(is_training=False, config=eval_config,
                                input_=test_input, is_inference=False)
        with tf.name_scope("Infer"):
            # Inference model feeds its own placeholders (input_=None).
            with tf.compat.v1.variable_scope("Model", reuse=True,
                                             initializer=initializer):
                minfer = LMModel(is_training=False, config=infer_config,
                                 input_=None, is_inference=True)
        models = {"Train": m, "Valid": mvalid, "Test": mtest, "Infer": minfer}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.compat.v1.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.compat.v1.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.compat.v1.ConfigProto(
            allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay the learning rate after max_epoch warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
            # Export the current inference model to tflite.
            converter = tf.lite.TFLiteConverter.from_session(
                session, [minfer.input_data], [minfer.logits])
            tflite_model = converter.convert()
            # FIX: use a context manager so the file handle is flushed and
            # closed deterministically (was open(...).write(...)).
            with open("converted_model.tflite", "wb") as tflite_out:
                tflite_out.write(tflite_model)
def run(args):
    """Train or evaluate the PTB model for the gender-bias experiment.

    When args.train is set, trains for config.max_max_epoch epochs and saves
    a checkpoint; otherwise restores FLAGS.save_path and evaluates the
    male/female test corpora, collecting per-sentence perplexities and
    per-profession costs.

    Returns:
        (sentence_perps_m, sentence_perps_f, profession_costs_m,
        profession_costs_f), or (None, None, None, None) in training mode.
    """
    # The module-level FLAGS is replaced by the caller-supplied args namespace;
    # reader is reloaded so its module state matches the new flags.
    global FLAGS
    importlib.reload(reader)
    FLAGS = args
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    config = get_config()
    raw_data = reader.ptb_raw_data_bias(FLAGS.data_path, config.vocab_size,
                                        professions=set(FLAGS.professions))
    train_data, valid_data, test_data_m, test_data_f, vocab_size, \
        sentence_ends_m, sentence_ends_f, professions_pos = raw_data
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("TestMale"):
            test_input_m = PTBInput(config=eval_config, data=test_data_m,
                                    name="TestInputMale")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest_m = PTBModel(is_training=False, config=eval_config,
                                   input_=test_input_m)
        with tf.name_scope("TestFemale"):
            test_input_f = PTBInput(config=eval_config, data=test_data_f,
                                    name="TestInputFemale")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest_f = PTBModel(is_training=False, config=eval_config,
                                   input_=test_input_f)
        models = {
            "Train": m,
            "Valid": mvalid,
            "TestMale": mtest_m,
            "TestFemale": mtest_f
        }
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        if args.train:
            tf.train.import_meta_graph(metagraph)
        else:
            # Evaluation restores a previously exported metagraph from disk.
            tf.train.import_meta_graph(args.meta_file)
        for model in models.values():
            model.import_ops()
        saver = tf.train.Saver()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            if args.train:
                for i in range(config.max_max_epoch):
                    # Decay the learning rate after max_epoch warm epochs.
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch,
                                                    0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f"
                          % (i + 1, session.run(m.lr)))
                    start_time = time.time()
                    train_perplexity, _, _ = run_epoch(session, m,
                                                       eval_op=m.train_op)
                    print("Time for Epoch = %.2f s"
                          % (time.time() - start_time))
                    print("Epoch: %d Train Perplexity: %.3f"
                          % (i + 1, train_perplexity))
                    valid_perplexity, _, _ = run_epoch(session, mvalid)
                    print("Epoch: %d Valid Perplexity: %.3f"
                          % (i + 1, valid_perplexity))
            else:
                saver.restore(session, FLAGS.save_path)
                doc_perp_m, sentence_perps_m, profession_costs_m = \
                    run_epoch(session, mtest_m,
                              sentence_ends=sentence_ends_m,
                              professions_pos=professions_pos)
                doc_perp_f, sentence_perps_f, profession_costs_f = \
                    run_epoch(session, mtest_f,
                              sentence_ends=sentence_ends_f,
                              professions_pos=professions_pos)
            if FLAGS.save_path and args.train:
                print("Saving model to %s." % FLAGS.save_path)
                saver.save(session, FLAGS.save_path)
            # NOTE(review): this tests FLAGS.train while the branches above
            # test args.train — they appear to be the same object after the
            # `FLAGS = args` assignment; confirm they never diverge.
            if FLAGS.train:
                return None, None, None, None
            else:
                return sentence_perps_m, sentence_perps_f, \
                    profession_costs_m, profession_costs_f
def main(model_select="small", dat_path="../data", sav_path="./saved_model/",
         mixing_pi=0.25, prior_log_sigma1=-1.0, prior_log_sigma2=-7.0):
    """Train and evaluate the Bayesian PTB model.

    Args:
        model_select: config size selector consumed via the module-level
            `model_type` global.
        dat_path: directory containing the PTB data.
        sav_path: checkpoint/log directory.
        mixing_pi: mixture weight of the scale-mixture weight prior.
        prior_log_sigma1: log sigma of the first prior mixture component.
        prior_log_sigma2: log sigma of the second prior mixture component.
    """
    # Configuration is passed to the model classes via module-level globals.
    global model_type
    global data_path
    global save_path
    global global_prior_pi
    global global_log_sigma1
    global global_log_sigma2
    global global_num_gpus
    model_type = model_select
    data_path = dat_path
    save_path = sav_path
    global_prior_pi = mixing_pi
    global_log_sigma1 = prior_log_sigma1
    global_log_sigma2 = prior_log_sigma2
    gpus = [x.name for x in device_lib.list_local_devices()
            if x.device_type == "GPU"]
    # With no GPU present we still report 1 so downstream division/looping
    # over devices works (CPU-only run).
    if len(gpus) == 0:
        global_num_gpus = 1
    else:
        global_num_gpus = len(gpus)
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _, _ = raw_data
    config = get_config()
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    # NOTE(review): this child process is never terminated or waited on;
    # TensorBoard keeps running after training ends.
    subprocess.Popen(["tensorboard", "--logdir=tensorboard"])
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training_Loss", m.cost)
            tf.summary.scalar("Learning_Rate", m.lr)
            # Bayesian-specific diagnostics: KL term and total objective.
            tf.summary.scalar("KL_Loss", m.kl_loss)
            tf.summary.scalar("Total_Loss", m.total_loss)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation_Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        soft_placement = False
        if global_num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay the learning rate after max_epoch warm epochs.
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if save_path:
                print("Saving model to %s." % save_path)
                sv.saver.save(session, save_path, global_step=sv.global_step)
def main(_):
    """Benchmark training throughput (words per second) on PTB.

    Trains until either config.max_max_epoch epochs complete or
    FLAGS.max_duration minutes elapse, then prints the aggregate
    words-per-second rate. Per-epoch perplexity reporting and the test
    pass are disabled (commented out) — this variant measures speed only.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    # GPU-count validation deliberately disabled for this benchmark variant.
    """
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError(
            "Your machine has only %d gpus "
            "which is less than the requested --num_gpus=%d."
            % (len(gpus), FLAGS.num_gpus))
    """
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        config_proto.gpu_options.allow_growth = True
        max_duration = FLAGS.max_duration
        with sv.managed_session(config=config_proto) as session:
            start_time_mb = datetime.datetime.now()
            total_words = 0
            print("start:", start_time_mb)
            for i in range(config.max_max_epoch):
                # Stop the benchmark once the wall-clock budget is exhausted.
                seconds = (datetime.datetime.now()
                           - start_time_mb).total_seconds()
                minutes = seconds / 60.0
                if max_duration is not None and minutes >= max_duration:
                    break
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                train_perplexity, num_words = run_epoch(
                    session, m, eval_op=m.train_op, verbose=True,
                    start_time_mb=start_time_mb)
                total_words += num_words
            end_time_mb = datetime.datetime.now()
            print("end:", str(end_time_mb))
            total_time = (end_time_mb - start_time_mb).total_seconds()
            print('Total wps: %.f' % (total_words / float(total_time)))
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def main(_):
    """Train and evaluate the PTB model with wall-clock instrumentation.

    Prints the configuration up front, reports graph-construction time,
    optionally evaluates the test set every epoch (FLAGS.test_when_training),
    and at the end reports parameter count, peak GPU memory, and total
    running time. All prints flush immediately for live log tailing.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    global_begin_time = time.time()
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    print('The training configuration is as follows:', flush=True)
    print_config(config)
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
            tf.summary.scalar("Test Loss", mtest.cost)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    # "Gragh" (sic) — typo preserved; this is a runtime log string.
    print("Gragh construction time: %.3f"
          % (time.time() - global_begin_time), flush=True)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        if FLAGS.save_path:
            if not os.path.isdir(FLAGS.save_path):
                os.mkdir(FLAGS.save_path)
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay the learning rate after max_epoch warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.8f"
                      % (i + 1, session.run(m.lr)), flush=True)
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             parallel=True, verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity), flush=True)
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity), flush=True)
                if FLAGS.test_when_training:
                    test_perplexity = run_epoch(session, mtest)
                    print("Epoch: %d Test Perplexity: %.3f"
                          % (i + 1, test_perplexity), flush=True)
                print("Current running time: %.3f"
                      % ((time.time() - global_begin_time) / 3600), flush=True)
            if not FLAGS.test_when_training:
                test_perplexity = run_epoch(session, mtest)
                print("Test Perplexity: %.3f" % (test_perplexity), flush=True)
            # CUDNN cells keep their parameters in an opaque buffer, so the
            # generic parameter count is only printed for other RNN modes.
            if config.rnn_mode != CUDNN:
                print("The number of parameters: {:.3f}".format(
                    get_num_params() / 1000000), flush=True)
            if FLAGS.num_gpus:
                print("Peak memory usage of GPUs: {}".format(
                    session.run(m.memory_use) / (1024**3)), flush=True)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path, flush=True)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
            print("Global duration time: %.3f"
                  % ((time.time() - global_begin_time) / 3600), flush=True)
def _main(_):
    """Evaluate a PTB model whose weights are loaded from an HDF5 file.

    Loads pretrained variables from `FLAGS.save_path + '.hdf5'` into the
    shared "Model" scope, builds Train/Valid/Test graphs, and reports the
    validation perplexity (no training is performed in this variant).
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    hdf5_file = FLAGS.save_path + '.hdf5'
    raw_data = reader.ptb_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, test_data, _ = raw_data
    print('data load finished!')
    config = get_config()
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    def custom_getter(getter, name, *args, **kwargs):
        # Freeze pretrained variables and initialize them from the HDF5 file.
        # NOTE(review): defined but not passed to any variable_scope below —
        # confirm whether it should be wired in via custom_getter=.
        kwargs['trainable'] = False
        kwargs['initializer'] = _pretrained_initializer(name, hdf5_file)
        return getter(name, *args, **kwargs)

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        # Copy the pretrained weights from HDF5 into the shared variables.
        with tf.variable_scope("Model", reuse=tf.AUTO_REUSE), \
                h5py.File(hdf5_file, 'r') as fin:
            data_dict = {}
            data_dict['embedding'] = fin['Model/embedding:0']
            data_dict['RNN/multi_rnn_cell/cell_0/basic_lstm_cell/kernel'] = fin[
                'Model/RNN/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0']
            data_dict['RNN/multi_rnn_cell/cell_0/basic_lstm_cell/bias'] = fin[
                'Model/RNN/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0']
            data_dict['RNN/multi_rnn_cell/cell_1/basic_lstm_cell/kernel'] = fin[
                'Model/RNN/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0']
            data_dict['RNN/multi_rnn_cell/cell_1/basic_lstm_cell/bias'] = fin[
                'Model/RNN/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0']
            data_dict['softmax_w'] = fin['Model/softmax_w:0']
            data_dict['softmax_b'] = fin['Model/softmax_b:0']
            # FIX: dict.iteritems() is Python-2-only and raises
            # AttributeError on Python 3 (the rest of this file uses
            # .items()); changed to .items().
            for param_name, data in data_dict.items():
                try:
                    var = tf.get_variable(param_name)
                    # NOTE(review): this assign op is built but never run in
                    # a session here — confirm the weights actually load.
                    var.assign(tf.convert_to_tensor(data[...]))
                except ValueError:
                    raise
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            valid_perplexity, _ = run_epoch(session, mvalid)
            print("Valid Perplexity: %.3f" % valid_perplexity)
def main(_):
    """Train, evaluate, and export the PTB model for TensorFlow Serving.

    Trains for config.max_max_epoch epochs, reports test perplexity,
    optionally checkpoints to FLAGS.save_path, then writes a SavedModel
    with a prediction signature to FLAGS.model_path/FLAGS.model_version.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    # Evaluation runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # Parameters start from a uniform distribution over
        # [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the training model's variables.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        # FIX: dict.iteritems() is Python-2-only and raises AttributeError
        # on Python 3 (every other main() in this file uses .items()).
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        # NOTE(review): soft placement is hard-coded on here, so the
        # soft_placement flag computed above is unused — confirm intent.
        config_proto = tf.ConfigProto(allow_soft_placement=True)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay the learning rate after max_epoch warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
            # The Supervisor finalizes the graph; unfinalize so the
            # SavedModel builder can add its ops, then re-finalize below.
            session.graph._unsafe_unfinalize()
            # Export for TensorFlow Serving.
            export_path = os.path.join(
                tf.compat.as_bytes(FLAGS.model_path),
                tf.compat.as_bytes(str(FLAGS.model_version)))
            builder = saved_model_builder.SavedModelBuilder(export_path)
            prediction_inputs = {
                'input': tf.saved_model.utils.build_tensor_info(mtest.input_data)
            }
            prediction_outputs = {
                'output':
                    tf.saved_model.utils.build_tensor_info(mtest.predict),
                'cell_state':
                    tf.saved_model.utils.build_tensor_info(
                        mtest.final_state[-1].c),
                'embed_lookup':
                    tf.saved_model.utils.build_tensor_info(mtest.embed_lookup)
            }
            prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
                inputs=prediction_inputs,
                outputs=prediction_outputs,
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME)
            builder.add_meta_graph_and_variables(
                session, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    'predict_signature': prediction_signature,
                })
            session.graph.finalize()
            builder.save()
            print("Done export!")
def main(_):
    # "========================================================================================================"
    """Train the PTB model, save a checkpoint, then re-load it and export a SavedModel.

    Three phases:
      1. Build Train/Valid/Test replicas in one graph and export a metagraph.
      2. Re-import the metagraph, train with a Supervisor session, and save a
         checkpoint to a hard-coded local path.
      3. Open a fresh session, restore from a (different, hard-coded) checkpoint
         path, and export a serving SavedModel with a versioned directory name.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError(
            "Your machine has only %d gpus "
            "which is less than the requested --num_gpus=%d."
            % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # Evaluation runs one token at a time.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError("num_gpus > 1 is not supported for TensorFlow versions "
                             "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    # NOTE(review): checkpoint is saved under './data/model/mode/' but the
    # restore below reads './data/model/model.ckpt-0' — directory ('mode' vs
    # 'model') and step suffix ('-0') do not match; verify these paths.
    save_path = './data/model/mode/model.ckpt'
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay the LR only after `max_epoch` warm epochs.
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                # Saves to the hard-coded `save_path`, not FLAGS.save_path.
                sv.saver.save(session, save_path)
            # Earlier in-session export attempt, kept for reference:
            # sess = session
            # model_path = './data/model/'
            # path = './data/model/'
            # dir_list = os.listdir(path)
            # if len(dir_list) == 0:
            #     version = 1
            # else:
            #     last_version = len(dir_list)
            #     version = last_version + 1
            # path = path + "{}".format(str(version))
            # prediction_signature = (
            #     tf.saved_model.signature_def_utils.build_signature_def(
            #         inputs={'input_images': tf.saved_model.utils.build_tensor_info(train_input.input_data)},
            #         outputs={'output': tf.saved_model.utils.build_tensor_info(m.logits)},
            #         method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
            #     )
            # )
            # builder = tf.saved_model.builder.SavedModelBuilder(path)
            # builder.add_meta_graph_and_variables(
            #     sess, [tf.saved_model.tag_constants.SERVING],
            #     signature_def_map={
            #         'generate_images': prediction_signature
            #     },
            #     legacy_init_op=tf.group(tf.tables_initializer(), name='legacy_init_op'))
            # builder.save(as_text=False)
    # Phase 3: restore the checkpoint in a fresh graph and export a SavedModel.
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph("./data/model/model.ckpt-0.meta")
        saver.restore(sess, "./data/model/model.ckpt-0")
        print("Model restore")
        path = './data/model/'
        # Version the export directory by counting existing entries.
        # NOTE(review): counting directory entries assumes the folder only ever
        # holds previous export versions — confirm.
        dir_list = os.listdir(path)
        if len(dir_list) == 0:
            version = 1
        else:
            last_version = len(dir_list)
            version = last_version + 1
        path = path + "{}".format(str(version))
        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'input_images': tf.saved_model.utils.build_tensor_info(train_input.input_data)},
                outputs={'output': tf.saved_model.utils.build_tensor_info(m.logits)},
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
            )
        )
        builder = tf.saved_model.builder.SavedModelBuilder(path)
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'generate_images': prediction_signature
            },
            legacy_init_op=tf.group(tf.tables_initializer(), name='legacy_init_op'))
        builder.save(as_text=False)
def main(_):
    """Train the PTB model and dump the parameters returned by `run_epoch` to HDF5.

    Selects the visible GPU via FLAGS.gpu, trains for `max_max_epoch` epochs with
    exponential learning-rate decay, saves a Supervisor checkpoint, then writes
    the `paras` dict (name -> value) from the last training epoch to
    `<save_path>.hdf5` and prints the stored keys back as a sanity check.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    # Pin CUDA device selection to PCI bus order, then restrict visibility.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    raw_data = reader.ptb_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, test_data, _ = raw_data
    print('data load finished!')
    config = get_config()
    eval_config = get_config()
    # Evaluation config: one token at a time.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError("num_gpus > 1 is not supported for TensorFlow versions "
                             "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # LR decay only after `max_epoch` warm epochs.
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                # This run_epoch variant also returns a parameter dict.
                train_perplexity, paras = run_epoch(session, m,
                                                    eval_op=m.train_op,
                                                    verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity, _ = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
            # Dump last-epoch parameters to HDF5.
            # NOTE(review): `paras` values are assumed to be array-like so
            # h5py can store them directly — confirm against run_epoch.
            hdf5_file = FLAGS.save_path + '.hdf5'
            with h5py.File(hdf5_file, 'w') as fin:
                for k, v in paras.items():
                    fin[k] = v
            # Read-back sanity check: print every stored key.
            with h5py.File(hdf5_file, 'r') as fout:
                for k in paras.keys():
                    print(k)
def main(_):
    """Train an RNN language model with best-perplexity-triggered LR decay.

    The learning rate is multiplied by `lr_decay` whenever the previous epoch
    failed to improve either the best validation or best test perplexity
    (floored at 1e-4). Best-so-far test/valid perplexities are written to
    `ppl_hidden_<hidden_size>.txt` each time the test best improves.

    Fixes: file writes now use a `with` block (the handle previously relied on
    explicit close and would leak on a write error), and the no-op
    `config.learning_rate = config.learning_rate` branches were removed
    (behavior unchanged).
    """
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # NOTE(review): eval_config is configured here but the Test model below is
    # built with `config`, not `eval_config` — confirm that is intentional.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = RNNModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = RNNModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = RNNModel(is_training=False, config=config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError("num_gpus > 1 is not supported for TensorFlow versions "
                             "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        # Grow GPU memory on demand instead of grabbing it all up front.
        config_proto.gpu_options.allow_growth = True
        with sv.managed_session(config=config_proto) as session:
            best_valid_perplexity = 10000
            valid_perplexity = 0
            best_test_perplexity = 10000
            test_perplexity = 0
            for i in range(config.max_max_epoch):
                # Decay the LR when the previous epoch didn't improve either
                # best perplexity, with a floor of 1e-4.
                if (valid_perplexity > best_valid_perplexity
                        or test_perplexity > best_test_perplexity):
                    if config.learning_rate > 0.0001:
                        config.learning_rate = (config.learning_rate
                                                * config.lr_decay)
                m.assign_lr(session, config.learning_rate)
                print("Epoch: %d Learning rate: %.4f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                if valid_perplexity < best_valid_perplexity:
                    best_valid_perplexity = valid_perplexity
                print("Epoch: %d Valid Perplexity: %.3f best valid: %.3f"
                      % (i + 1, valid_perplexity, best_valid_perplexity))
                test_perplexity = run_epoch(session, mtest)
                if test_perplexity < best_test_perplexity:
                    best_test_perplexity = test_perplexity
                    # Record new bests; `with` guarantees the file is closed
                    # even if a write fails.
                    with open('ppl_hidden_' + str(config.hidden_size)
                              + '.txt', 'w') as f:
                        f.write('best_test_perplexity:'
                                + str(best_test_perplexity) + '\n')
                        f.write('best_valid_perplexity:'
                                + str(best_valid_perplexity) + '\n')
                print("Epoch: %d Test Perplexity: %.3f best test: %.3f"
                      % (i + 1, test_perplexity, best_test_perplexity))
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
from __future__ import absolute_import
def main(_):
    """Run word prediction on the test set, optionally warm-starting from and
    saving to `<save_path>/lmodel.ckpt`.

    Unlike the training variants in this file, the managed session only calls
    `word_predict` on the test model; `eval_config.num_steps` is sized to the
    full test sequence.

    Fix: the best-effort restore previously did `saver.restore(tf.Session(),
    filename)`, leaking an unclosed session on every run; it now uses a
    context-managed session. The unused `as e` binding was dropped.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _, dict_id_word = raw_data
    # Invert id->word into word->id to look up the end-of-sentence id.
    dict_word_id = dict(zip(dict_id_word.values(), dict_id_word.keys()))
    eos_id = dict_word_id['<eos>']
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    # One step per test token (minus one for the shifted target).
    eval_config.num_steps = np.shape(test_data)[0] - 1
    saver = None
    filename = None
    if FLAGS.save_path:
        filename = FLAGS.save_path + '/lmodel.ckpt'
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
            tf.summary.scalar("Test Loss", mtest.cost)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
        if FLAGS.save_path:
            saver = tf.train.Saver()
            # Best-effort warm start: a missing/incompatible checkpoint is
            # deliberately ignored. BUGFIX: the session is now closed instead
            # of leaking (was `saver.restore(tf.Session(), filename)`).
            try:
                with tf.Session() as restore_sess:
                    saver.restore(restore_sess, filename)
            except Exception:
                pass
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            predicted_word_output = word_predict(session, mtest,
                                                 predict_op=mtest._output,
                                                 log_output=True,
                                                 dict_ids=dict_id_word)
            if FLAGS.save_path:
                print("Saving model to %s." % filename)
                # NOTE(review): `saver` was built in the first graph but
                # `session` runs the re-imported graph — confirm this save
                # actually targets the right variables.
                save_path = saver.save(session, filename)
def main(_):
    """Standard PTB training loop: build graph, train, validate, test, save."""
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # Evaluation: one token per step.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    # Feed the input data and build the compute graph (construct the models).
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(
                config=config, data=train_data,
                name="TrainInput")  # x, y from reader are input_data / targets
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)  # log to TensorBoard
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()  # TODO: what exactly is this meta_graph?
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    # Import the three models and start training.
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        # Supervisor bundles several conveniences:
        # 1) automatically restores from a checkpoint or initializes
        #    variables, so no manual init/restore is needed;
        # 2) carries its own Saver for writing checkpoints, so no separate
        #    Saver has to be created;
        # 3) provides summary_computed for saving summaries, so no
        #    summary_writer has to be created.
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay kicks in after `max_epoch` warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(
                session, mtest
            )  # TODO: why do mvalid/mtest look independent of m yet "inherit"
            #    its trained weights?
            print("Test Perplexity: %.3f" % test_perplexity
                  )  # tf.variable_scope("Model", reuse=True, ..) shares the parameters
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def main(_):
    """Distributed (ps/worker) variant of the PTB training loop.

    Builds a tf.train.ClusterSpec from --ps_hosts/--worker_hosts, starts a
    server for the local task, blocks forever on parameter-server tasks, and
    otherwise trains through `sv.prepare_or_wait_for_session(server.target)`
    with the task-0 worker acting as chief.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # Evaluation: one token per step.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    #=========================== added by yctung start =============================
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    print("job name = " + FLAGS.job_name)
    if FLAGS.job_name == "ps":
        print("--- I am ps ---")
        # Parameter servers only serve variables; block here forever.
        server.join()
    elif FLAGS.job_name == "worker":
        print("--- I am worker ---")
    #=========================== added by yctung end =============================
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(
            -config.init_scale, config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        # sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        # config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        # with sv.managed_session(config=config_proto) as session:
        # yctung: add the distributed session setting
        is_chief = (FLAGS.task_index == 0)  # checks if this is the chief node
        sv = tf.train.Supervisor(logdir=FLAGS.save_path, is_chief=is_chief)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        # Attach to the in-cluster server target instead of a local session.
        with sv.prepare_or_wait_for_session(server.target) as session:
            for i in range(config.max_max_epoch):
                # Decay kicks in after `max_epoch` warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def train_and_validate(batch_size_20, batch_size_40, batch_size_60,
                       batch_size_80, learning_rate):
    """Hyperparameter-tuning objective: train PTB and return -valid perplexity.

    The batch size is passed as four one-hot-ish scores (batch_size_20..80);
    the largest score selects the corresponding batch size (20/40/60/80).
    `learning_rate` overrides the config default. Returns the NEGATED final
    validation perplexity so an external maximizer can optimize it.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d."
                         % (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    # Modify default model configuration with input hyperparameters
    # Proposed strategy will input discretized/integer value as float so casting to int is needed
    # Basic strategy will input a float value
    # config.batch_size = int(round(batch_size))
    config.learning_rate = learning_rate
    # For categorical parameters, use one-hot encoding to set config:
    # argmax over the four scores picks the batch size bucket.
    batch_size_list = [
        batch_size_20,
        batch_size_40,
        batch_size_60,
        batch_size_80,
    ]
    batch_size_idx = batch_size_list.index(max(batch_size_list))
    config.batch_size = 20 * (batch_size_idx + 1)
    # if batch_size_20 == 1:
    #     config.batch_size = 20
    # elif batch_size_40 == 1:
    #     config.batch_size = 40
    # elif batch_size_60 == 1:
    #     config.batch_size = 60
    # elif batch_size_80 == 1:
    #     config.batch_size = 80
    # else:
    #     raise Exception("Categorical parameter is not properly one-hot encoded")
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    print("lr=%s bs=%s" % (config.learning_rate, config.batch_size))
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True shares weights with the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        # StrictVersion gives a correct semantic comparison (plain string
        # comparison would mis-order e.g. "1.10.0" vs "1.9.0").
        if (StrictVersion(tf.__version__) < StrictVersion("1.1.0")
                and FLAGS.num_gpus > 1):
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Decay kicks in after `max_epoch` warm epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                if FLAGS.verbose:
                    print("Epoch: %d Learning rate: %.3f"
                          % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=False)
                if FLAGS.verbose:
                    print("Epoch: %d Train Perplexity: %.3f"
                          % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                if FLAGS.verbose:
                    print("Epoch: %d Valid Perplexity: %.3f"
                          % (i + 1, valid_perplexity))
            # test_perplexity = run_epoch(session, mtest)
            # print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                if FLAGS.verbose:
                    print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
    # Returning negative value since target is to be maximized
    return -valid_perplexity
train_input = MSI_Input(config=config, data=x_train, name="TrainInput") with tf.variable_scope("Model", reuse=None, initializer=initializer): m = MSI_Model(is_training=True, config=config, input_=train_input) tf.summary.scalar("Training Loss", m.cost) tf.summary.scalar("Learning Rate", m.lr) with tf.name_scope("Valid"): valid_input = MSI_Input(config=config, data=x_val, name="ValidInput") with tf.variable_scope("Model", reuse=True, initializer=initializer): mvalid = MSI_Model(is_training=False, config=config, input_=valid_input) tf.summary.scalar("Validation Loss", mvalid.cost) with tf.name_scope("Test"): test_input = MSI_Input(config=eval_config, data=x_test, name="TestInput") with tf.variable_scope("Model", reuse=True, initializer=initializer): mtest = MSI_Model(is_training=False, config=eval_config, input_=test_input) models = {"Train": m, "Valid": mvalid, "Test": mtest} for name, model in models.items(): model.export_ops(name) metagraph = tf.train.export_meta_graph() soft_placement = False if n_GPUS > 1: soft_placement = True util.auto_parallel(metagraph, m) if __name__ == "__main__": tf.app.run()
def main(_):
    """Standard PTB training loop with per-epoch wall-clock timing.

    Fixes three defects in the print statements:
      * `'One loop used %d s' % time.time() - time1` subtracted a float from a
        *string* (``%`` binds tighter than ``-``) and raised TypeError — the
        subtraction is now parenthesized;
      * the post-training test print used the two-placeholder format
        `'Epoch: %d Valid Perplexity: %.3f'` with a single scalar (TypeError)
        and a wrong "Valid" label — now `'Test Perplexity: %.3f'`;
      * the training-loss summary tag had a stray trailing comma
        ("Training Loss,").
    """
    if not FLAGS.data_path:
        raise ValueError('Must set --data_path to PTB data directory')
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == 'GPU'
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError('Your machine has only %d gpus '
                         'which is less than the requested --num_gpus=%d.'
                         % (len(gpus), FLAGS.num_gpus))
    # Generate the word-to-id dictionary and convert the corpora to ids.
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    # Get hyperparameters.
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # Uniform init in [-init_scale, +init_scale].
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            # BUGFIX: tag was "Training Loss," (stray comma).
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope('Valid'):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name='ValidInput')
            # reuse=True shares weights with the training model.
            with tf.variable_scope('Model', reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar('Validation Loss', mvalid.cost)
        with tf.name_scope('Test'):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name='TestInput')
            with tf.variable_scope('Model', reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        # Add ops to collections (tf.add_to_collection) so they survive the
        # metagraph round trip below.
        models = {'Train': m, 'Valid': mvalid, 'Test': mtest}
        for name, model in models.items():
            model.export_ops(name)
        # Export the graph; it can be stored on disk.
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < '1.1.0' and FLAGS.num_gpus > 1:
            raise ValueError(
                'num_gpus > 1 is not supported for TensorFlow versions '
                'below 1.1.0')
        # Parallel config.
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        # Import ops and graph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        # Use the Supervisor to save/load checkpoints and pre-trained
        # variables.
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(
            allow_soft_placement=soft_placement)  # session config
        with sv.managed_session(config=config_proto) as session:
            # Loop over the corpus `max_max_epoch` times.
            for i in range(config.max_max_epoch):
                time1 = time.time()
                # Calculate learning-rate decay.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print('Epoch: %d Learning rate: %.3f'
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print('Epoch: %d Train Perplexity: %.3f'
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print('Epoch: %d Valid Perplexity: %.3f'
                      % (i + 1, valid_perplexity))
                # BUGFIX: parenthesize the subtraction; `%` binds tighter
                # than `-`, so the original raised TypeError.
                print('One loop used %d s' % (time.time() - time1))
            test_perplexity = run_epoch(session, mtest)
            # BUGFIX: was a two-placeholder "Valid" format applied to one
            # scalar (TypeError at runtime).
            print('Test Perplexity: %.3f' % test_perplexity)
            if FLAGS.save_path:
                print('Saving model to %s.' % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def train(configs, data):
    """Train a PTB model and return the best validation perplexity seen.

    Args:
        configs: pair ``(config, eval_config)`` of hyperparameter objects.
        data: triple ``(train_data, valid_data, test_data)`` of token-id
            sequences.

    Returns:
        ``(best_pp, best_epoch)``: the lowest validation perplexity observed
        and the 1-based epoch at which it occurred (``(-1, 0)`` only if the
        epoch loop never runs).
    """
    config, eval_config = configs
    train_data, valid_data, test_data = data
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        # Three graphs share one "Model" variable scope: Train creates the
        # variables (reuse=None); Valid/Test reuse them (reuse=True).
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = PTBModel(is_training=True, config=config, input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
            # NOTE(review): mtest is constructed and summarized but never
            # evaluated in this function — only train/valid epochs run below.
            tf.summary.scalar("Test Loss", mtest.cost)
        # Export ops to named collections so they survive the metagraph
        # export/import round-trip below.
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        # Re-import ops and graph from the (possibly parallelized) metagraph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        if FLAGS.save_path:
            # NOTE(review): os.mkdir does not create intermediate directories;
            # fails if the parent of save_path is missing.
            if not os.path.isdir(FLAGS.save_path):
                os.mkdir(FLAGS.save_path)
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        # best_pp == -1 is the "not yet measured" sentinel.
        best_pp = -1
        best_epoch = 0
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Learning rate starts decaying after max_epoch epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                valid_perplexity = run_epoch(session, mvalid)
                # Track the best (lowest) validation perplexity and its epoch.
                if best_pp == -1 or valid_perplexity < best_pp:
                    best_pp = valid_perplexity
                    best_epoch = i + 1
    return best_pp, best_epoch
def main(_):
    """Build, train and evaluate the PTB language model end to end."""
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    gpus = [
        dev.name for dev in device_lib.list_local_devices()
        if dev.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError(
            "Your machine has only %d gpus "
            "which is less than the requested --num_gpus=%d." %
            (len(gpus), FLAGS.num_gpus))

    # Tokenized corpora (ids); the vocabulary size is unused here.
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(FLAGS.data_path)

    config = get_config()
    eval_config = get_config()
    # Test-time evaluation scores one token at a time.
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        # One shared "Model" variable scope: Train creates the variables,
        # Valid/Test reuse them.
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                mtrain = PTBModel(is_training=True, config=config,
                                  input_=train_input)
            tf.summary.scalar("Training Loss", mtrain.cost)
            tf.summary.scalar("Learning Rate", mtrain.lr)

        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)

        # Stash ops in collections so they survive the metagraph round-trip.
        models = {"Train": mtrain, "Valid": mvalid, "Test": mtest}
        for scope_name, model in models.items():
            model.export_ops(scope_name)
        metagraph = tf.train.export_meta_graph()

        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        # Soft placement is only needed when the graph is auto-parallelized.
        soft_placement = FLAGS.num_gpus > 1
        if soft_placement:
            util.auto_parallel(metagraph, mtrain)

    with tf.Graph().as_default():
        # Rebuild the graph from the (possibly parallelized) metagraph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for epoch in range(config.max_max_epoch):
                # Learning rate starts decaying after max_epoch epochs.
                decay = config.lr_decay ** max(epoch + 1 - config.max_epoch, 0.0)
                mtrain.assign_lr(session, config.learning_rate * decay)
                print("Epoch: %d Learning rate: %.3f" %
                      (epoch + 1, session.run(mtrain.lr)))

                train_perplexity = run_epoch(session, mtrain,
                                             eval_op=mtrain.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (epoch + 1, train_perplexity))

                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (epoch + 1, valid_perplexity))

            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def main(_):
    """Train/evaluate the PTB model while dumping graph structure to disk.

    In addition to the standard training loop, this variant writes the
    metagraph (as JSON) and two op-adjacency-list JSON files under
    ``kernelLogs/``, and runs the session single-threaded with all grappler
    rewrites disabled so the executed graph matches the dumped one.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d." %
                         (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # Evaluation scores one token at a time.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        # Train creates the shared "Model" variables; Valid/Test reuse them.
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = PTBModel(is_training=True, config=config, input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        # Dump the graph_def as JSON for offline inspection.
        # NOTE(review): MessageToJson already returns a JSON string, so
        # json.dump writes it double-encoded (a quoted string) — confirm the
        # downstream consumer expects that. Also, 'kernelLogs/' must already
        # exist or open() raises.
        temp_meta = MessageToJson(metagraph.graph_def)
        with open('kernelLogs/metagraph.json', 'w') as outfile:
            json.dump(temp_meta, outfile)
        #sys.exit()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        # soft_placement = True
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
        # --- graph-structure dump (added by ubaid) ---
        # Build two op-name adjacency lists: one keyed by full tensor names
        # (e.g. "op:0"), one with the ":N" output index stripped.
        all_ops = tf.get_default_graph().get_operations()
        adj_list_graph = {}
        for op in all_ops:
            adj_list_graph[op.name] = set([inp.name for inp in op.inputs])
        adj_list_graph_notensors = {}
        for op in all_ops:
            adj_list_graph_notensors[op.name] = set(
                [inp.name.split(":")[0] for inp in op.inputs])
        # Sets are not JSON-serializable; convert dependency sets to lists.
        adj_list_graph_notensors = {
            op_name: list(op_deps)
            for op_name, op_deps in adj_list_graph_notensors.items()
        }
        adj_list_graph = {
            op_name: list(op_deps)
            for op_name, op_deps in adj_list_graph.items()
        }
        with open('kernelLogs/org_graph_rnnlm_ptb_%s.json' % (FLAGS.model),
                  'w') as outfile:
            json.dump(adj_list_graph, outfile)
        with open(
                'kernelLogs/org_graph_notensors_rnnlm_ptb_%s.json' %
                (FLAGS.model), 'w') as outfile:
            json.dump(adj_list_graph_notensors, outfile)
        #sys.exit()
        #####
    with tf.Graph().as_default():
        # Re-import ops and graph from the (possibly parallelized) metagraph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        #config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        # Session config (added by xilenteyex): single-threaded execution with
        # cost-model building on and every grappler rewrite disabled, so the
        # executed graph stays identical to the dumped adjacency lists.
        config_proto = tf.ConfigProto(
            allow_soft_placement=soft_placement,
            graph_options=tf.GraphOptions(build_cost_model=1))
        config_proto.intra_op_parallelism_threads = 1
        config_proto.inter_op_parallelism_threads = 1
        config_proto.graph_options.optimizer_options.opt_level = -1
        config_proto.graph_options.rewrite_options.constant_folding = (
            rewriter_config_pb2.RewriterConfig.OFF)
        config_proto.graph_options.rewrite_options.arithmetic_optimization = (
            rewriter_config_pb2.RewriterConfig.OFF)
        config_proto.graph_options.rewrite_options.dependency_optimization = (
            rewriter_config_pb2.RewriterConfig.OFF)
        config_proto.graph_options.rewrite_options.layout_optimizer = (
            rewriter_config_pb2.RewriterConfig.OFF)
        ######
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Learning rate starts decaying after max_epoch epochs.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True, epoch_no=i)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def main(_):
    """Train and evaluate the PTB model (variant with batched evaluation).

    Differs from the canonical driver in that evaluation uses batch_size=35 /
    num_steps=43 (instead of 1/1) and a "Training probs" scalar summary is
    recorded alongside loss and learning rate.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError(
            "Your machine has only %d gpus "
            "which is less than the requested --num_gpus=%d." %
            (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, vocabulary = raw_data
    print(len(train_data))  # debug: number of training tokens
    config = get_config()
    eval_config = get_config()
    # NOTE(review): the canonical PTB driver evaluates with batch 1 / 1 step;
    # 35/43 truncates the test set to whole batches — confirm intentional.
    eval_config.batch_size = 35
    eval_config.num_steps = 43
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        # Train creates the shared "Model" variables; Valid/Test reuse them.
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            print(train_input)  # debug
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = PTBModel(is_training=True, config=config, input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
            # NOTE(review): tf.summary.scalar requires a scalar tensor —
            # confirm m.probabilities is scalar, otherwise this raises.
            tf.summary.scalar("Training probs", m.probabilities)
        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(
                config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        # Export ops to collections so they survive the metagraph round-trip.
        models = {"Train": m, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        # Re-import ops and graph from the (possibly parallelized) metagraph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            for i in range(config.max_max_epoch):
                # Learning rate starts decaying after max_epoch epochs.
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def main(_):
    """Entry point with two modes selected by --load_path.

    If --load_path is set: rebuild the graph, restore that checkpoint into an
    InteractiveSession, score every test sequence into "HPL2.out", and exit
    the process.  Otherwise: train (resuming from a hard-coded checkpoint)
    and save the model to --save_path.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    gpus = [
        x.name for x in device_lib.list_local_devices()
        if x.device_type == "GPU"
    ]
    if FLAGS.num_gpus > len(gpus):
        raise ValueError("Your machine has only %d gpus "
                         "which is less than the requested --num_gpus=%d." %
                         (len(gpus), FLAGS.num_gpus))
    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    #train_data, valid_data, test_data, _ = raw_data
    # NOTE(review): this variant unpacks three items (no valid split), so the
    # reader here presumably returns (train, test, vocab) — confirm against
    # the reader module.
    train_data, test_data, _ = raw_data
    config = get_config()
    eval_config = get_config()
    # Evaluation scores one token at a time.
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    with tf.Graph().as_default():
        # If we are testing an existing model ...
        if FLAGS.load_path:
            # NOTE: there are two ways to restore an existing model: rebuild
            # the graph from scratch and call saver.restore for those objects,
            # or import the old metagraph, call saver.restore, and fetch the
            # ops/tensors via methods like get_tensor_by_name.  What follows
            # is the first method.
            with tf.name_scope("Train"):
                train_input = PTBInput(config=config, data=train_data,
                                       name="TrainInput")
                with tf.variable_scope("Model", reuse=None):
                    m = PTBModel(is_training=True, config=config,
                                 input_=train_input, name="Train")
                tf.summary.scalar("Training Loss", m.cost)
                tf.summary.scalar("Learning Rate", m.lr)
            with tf.name_scope("Test"):
                test_input = PTBInput(config=eval_config, data=test_data,
                                      name="TestInput")  #,iter=0)
                with tf.variable_scope("Model", reuse=True):
                    mtest = PTBModel(is_training=False, config=eval_config,
                                     input_=test_input, name="Test")
            session = tf.InteractiveSession()
            saver = tf.train.Saver(
            )  #tf.train.import_meta_graph(FLAGS.load_path + ".meta")
            saver.restore(session, FLAGS.load_path)
            #mtest.import_ops()
            print("Model restored from %s." % FLAGS.load_path)
            # NOTE(review): the handle leaks if run_epoch raises — consider a
            # `with open(...)` block.
            of = open("HPL2.out", 'w')
            # Score the first test sequence, then the remaining ones.
            run_epoch(session, mtest, input=test_data[0],
                      ep_size=len(test_data[0]) - 1, of=of)
            #run_epoch(session, mtest, input=test_input)#, ep_size=len(test_data[0]), )
            iter = 1  # NOTE(review): shadows the `iter` builtin
            for i in range(len(test_data) - 1):
                run_epoch(session, mtest, input=test_data[iter],
                          ep_size=len(test_data[iter]) - 1, of=of)
                #run_epoch(session,mtest, input=test_input)#test_data[iter], ep_size = len(test_data[iter]))
                iter += 1
            of.close()
            # Terminates the whole process; nothing below runs in this mode.
            quit()
        # If we are training a model ....
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = PTBModel(is_training=True, config=config, input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)
        #with tf.name_scope("Valid"):
        #    valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
        #    with tf.variable_scope("Model", reuse=True, initializer=initializer):
        #        mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
        #    tf.summary.scalar("Validation Loss", mvalid.cost)
        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)
        # Export ops to collections so they survive the metagraph round-trip.
        models = {
            "Train": m,
            "Test": mtest
        }  #, "Valid": mvalid, "Test": mtest}
        for name, model in models.items():
            model.export_ops(name)
        metagraph = tf.train.export_meta_graph()
        if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
            raise ValueError(
                "num_gpus > 1 is not supported for TensorFlow versions "
                "below 1.1.0")
        soft_placement = False
        if FLAGS.num_gpus > 1:
            soft_placement = True
            util.auto_parallel(metagraph, m)
    with tf.Graph().as_default():
        # Re-import ops and graph from the (possibly parallelized) metagraph.
        tf.train.import_meta_graph(metagraph)
        for model in models.values():
            model.import_ops()
        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
        with sv.managed_session(config=config_proto) as session:
            if not FLAGS.load_path:
                # NOTE(review): resumes from a hard-coded checkpoint step
                # ("-13450"); restore fails if that checkpoint is absent —
                # confirm this is still the intended resume point.
                sv.saver.restore(session, FLAGS.save_path + "-13450")
                for i in range(config.max_max_epoch):
                    # Learning rate starts decaying after max_epoch epochs.
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch,
                                                    0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" %
                          (i + 1, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m,
                                                 eval_op=m.train_op,
                                                 verbose=True)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i + 1, train_perplexity))
                    #valid_perplexity = run_epoch(session, mvalid)
                    #print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
                    #test_perplexity = run_epoch(session, mtest)
                if FLAGS.save_path:
                    print("Saving model to %s." % FLAGS.save_path)
                    print("SAVED TO: %s." %
                          sv.saver.save(session, FLAGS.save_path,
                                        global_step=sv.global_step))
                    sv.saver.export_meta_graph(FLAGS.save_path + ".meta")
            else:
                test_perplexity = run_epoch(session, mtest)