import os
import time

import tensorflow as tf

import data_helper
import HAN_model

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print('All related parameters:')
for attr, value in sorted(FLAGS.__flags.items()):
    print('{}={}'.format(attr.upper(), value))
print('Finished printing parameters.')

# Load the data.
train_x, train_y, dev_x, dev_y = data_helper.load_dataset(FLAGS.raw_file)
print('load data finished!')

with tf.Session() as sess:
    han = HAN_model.HAN(FLAGS.vocab_size, FLAGS.num_classes,
                        FLAGS.embedding_size, FLAGS.hidden_size)

    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=han.input_y,
                                                    logits=han.out,
                                                    name='loss'))

    with tf.name_scope('accuracy'):
        predict = tf.argmax(han.out, axis=1, name='predict')
        label = tf.argmax(han.input_y, axis=1, name='label')
        acc = tf.reduce_mean(tf.cast(tf.equal(predict, label), tf.float32))

    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
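# FLAGS._parse_flags() assumes the flags were already defined before the snippet above
# runs. A minimal, hypothetical set of definitions matching the attributes used above;
# the default values are placeholders, not taken from the original code:
import tensorflow as tf

tf.flags.DEFINE_string('raw_file', 'data/raw.txt', 'path to the raw data file')
tf.flags.DEFINE_integer('vocab_size', 50000, 'vocabulary size')
tf.flags.DEFINE_integer('num_classes', 10, 'number of output classes')
tf.flags.DEFINE_integer('embedding_size', 200, 'word embedding dimension')
tf.flags.DEFINE_integer('hidden_size', 50, 'hidden size of the HAN encoders')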
def evaluate(evalDataSet, ckpt_value, eval_value, time_list):
    with tf.Graph().as_default() as g, tf.device('/gpu:0'):
        # Placeholders for input, output and dropout
        feature_size = OPTION.SENT_HIDDEN_SIZE * 2
        input_x = tf.placeholder(tf.int32, [None, OPTION.SEQUENCE_LEN, OPTION.SENT_LEN], name="input_x")
        input_y = tf.placeholder(tf.int32, [None, OPTION.NUM_CLASSES], name="input_y")
        feature_size = feature_size * min(time_list[-1] - 0, OPTION.DP_DEPTH)
        if feature_size > 0:
            features_before = tf.placeholder(tf.float32, [None, feature_size], name="features_before")
        else:
            features_before = None

        han = HAN_model.Model(sequence_length=OPTION.SEQUENCE_LEN,
                              sent_length=OPTION.SENT_LEN,
                              num_classes=OPTION.NUM_CLASSES,
                              vocab_size=None,
                              embedding_size=OPTION.EMEBEDDING_DIMENSION,
                              Word2vec=True,
                              Trainable=False)

        # Inference model.
        logits, _ = han.inference(input_x, features_before, eval_data=True)

        # Calculate loss.
        loss = myTF.calculate_cross_entropy_loss(logits, input_y)
        logits = tf.nn.softmax(logits)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(OPTION.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(OPTION.EVAL_DIR, g)

        last_eval_ckpt = ckpt_value
        best_eval_value = eval_value

        config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = 0.5  # use at most 50% of the GPU memory
        config.gpu_options.allow_growth = True  # allocate GPU memory on demand

        while True:
            # Start running operations on the Graph. allow_soft_placement must be set to
            # True to build towers on GPU, as some of the ops do not have GPU implementations.
            with tf.Session(config=config) as sess:
                ckpt = tf.train.get_checkpoint_state(OPTION.CHECKPOINT_DIR)
                if ckpt and ckpt.model_checkpoint_path:
                    # Extract global_step from the checkpoint path.
                    global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                    if global_step_for_restore > last_eval_ckpt:
                        # Restore from checkpoint.
                        saver.restore(sess, ckpt.model_checkpoint_path)
                    else:
                        if tf.gfile.Exists("TRAIN_SUCCEED"):
                            print("Train terminated, eval terminating...")
                            return
                else:
                    print('No checkpoint file found')
                    time.sleep(FLAGS.eval_interval_secs)
                    continue

                if global_step_for_restore > last_eval_ckpt:
                    max_steps_per_epoch = int(math.ceil(
                        evalDataSet.get_dataset_size() / float(OPTION.EVAL_BATCH_SIZE)))
                    start_time = time.time()
                    total_predicted_value = []
                    total_true_value = []
                    total_loss = []
                    for step in range(max_steps_per_epoch):
                        test_data, test_label, test_features = evalDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
                        if feature_size > 0:
                            feed_dict = {input_x: test_data,
                                         input_y: test_label,
                                         features_before: test_features}
                        else:
                            feed_dict = {input_x: test_data, input_y: test_label}
                        predicted_value, true_value, loss_value = sess.run(
                            [logits, input_y, loss], feed_dict=feed_dict)
                        total_predicted_value.append(predicted_value)
                        total_true_value.append(true_value)
                        total_loss.append(loss_value)
                    duration = time.time() - start_time

                    # test_data, test_label = evalDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
                    summary = tf.Summary()
                    # summary.ParseFromString(sess.run(summary_op, feed_dict={input_x: test_data, input_y: test_label}))

                    # Concatenate batch results and trim any extra samples from the final batch.
                    total_predicted_value = np.concatenate(total_predicted_value, axis=0)
                    total_true_value = np.concatenate(total_true_value, axis=0)
                    total_loss = np.concatenate(total_loss, axis=0)
                    total_predicted_value = total_predicted_value[0:evalDataSet.get_dataset_size()]
                    total_true_value = total_true_value[0:evalDataSet.get_dataset_size()]
                    total_loss = total_loss[0:evalDataSet.get_dataset_size()]

                    assert evalDataSet.get_dataset_size() == total_predicted_value.shape[0], 'sample_count error!'

                    best_eval_value = evaluation_result(OPTION.EVAL_DIR, total_predicted_value,
                                                        total_true_value, total_loss,
                                                        global_step_for_restore, best_eval_value,
                                                        summary)
                    summary_writer.add_summary(summary, global_step_for_restore)
                    last_eval_ckpt = global_step_for_restore

            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
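# The loop above polls OPTION.CHECKPOINT_DIR for new checkpoints and stops once a
# "TRAIN_SUCCEED" marker file exists. A hypothetical way the training side could drop
# that marker when it finishes (not part of the original snippet):
import tensorflow as tf

with tf.gfile.Open("TRAIN_SUCCEED", "w") as f:
    f.write("train finished\n")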
def train(newTrain, checkpoint, trainDataSet):
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Placeholders for input, output and dropout
        input_x = tf.placeholder(tf.int32, [None, OPTION.SEQUENCE_LEN, OPTION.SENT_LEN], name="input_x")
        input_y = tf.placeholder(tf.int32, [None, OPTION.NUM_CLASSES], name="input_y")

        han = HAN_model.Model(sequence_length=OPTION.SEQUENCE_LEN,
                              sent_length=OPTION.SENT_LEN,
                              num_classes=OPTION.NUM_CLASSES,
                              vocab_size=None,
                              embedding_size=OPTION.EMEBEDDING_DIMENSION,
                              Word2vec=True,
                              Trainable=False)

        # Inference model.
        logits, _ = han.inference(input_x)

        # Calculate loss.
        loss = HAN_model.calculate_loss(logits, input_y)

        # Calculate accuracy.
        accuracy = myTF.calculate_accuracy(logits, input_y)

        # Update the model parameters.
        train_op = myTF.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=5)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                log_device_placement=FLAGS.log_device_placement)
        # config.gpu_options.per_process_gpu_memory_fraction = 0.5  # use at most 50% of the GPU memory
        config.gpu_options.allow_growth = OPTION.MEMORY_ALLOW_GROWTH  # allocate GPU memory on demand
        sess = tf.Session(config=config)

        first_step = 0
        if not newTrain:
            print('restoring...')
            if checkpoint == '0':  # choose the latest checkpoint
                ckpt = tf.train.get_checkpoint_state(OPTION.TRAIN_DIR)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restore from checkpoint.
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    first_step = int(global_step_for_restore) + 1
                else:
                    print('No checkpoint file found')
                    return
            else:
                if os.path.exists(os.path.join(OPTION.TRAIN_DIR, 'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(OPTION.TRAIN_DIR, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(sess, os.path.join(OPTION.TRAIN_DIR, 'model.ckpt-' + checkpoint))
                    first_step = int(checkpoint) + 1
                else:
                    print('No checkpoint file found')
                    return
        else:
            sess.run(init)
            if os.path.exists(os.path.join(OPTION.PRE_TRAIN_MODEL, 'model.ckpt-pretrain.index')):
                # saver_load = tf.train.Saver(var_list=tf.get_collection('pretrained_variables'))
                saver_load = tf.train.Saver(var_list=tf.trainable_variables())
                print('load pretrained variables...')
                saver_load.restore(sess, os.path.join(OPTION.PRE_TRAIN_MODEL, 'model.ckpt-pretrain'))

        summary_writer = tf.summary.FileWriter(OPTION.TRAIN_DIR, sess.graph)

        filename_train_log = os.path.join(OPTION.TRAIN_DIR, 'log_train')
        if os.path.exists(filename_train_log):
            file_train_log = open(filename_train_log, 'a')
        else:
            file_train_log = open(filename_train_log, 'w')

        max_steps_per_epoch = int(math.ceil(trainDataSet.get_dataset_size() / float(OPTION.BATCH_SIZE)))
        max_steps = max_steps_per_epoch * OPTION.NUM_EPOCHS

        # ckpt_period = max_steps_per_epoch // OPTION.MIN_CKPTS
        # if ckpt_period > OPTION.MAX_CKPT_PERIOD:
        #     ckpt_period = OPTION.MAX_CKPT_PERIOD
        ckpt_period = OPTION.MAX_CKPT_PERIOD

        for step in range(first_step, max_steps):
            train_data, train_label = trainDataSet.next_batch(OPTION.BATCH_SIZE)

            start_time = time.time()
            _, loss_value, accuracy_value, current_global_step = sess.run(
                [train_op, loss, accuracy, global_step],
                feed_dict={input_x: train_data, input_y: train_label})
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            assert step + 1 == current_global_step, 'step:%d, current_global_step:%d' % (
                step, current_global_step)

            current_epoch = int(current_global_step / float(max_steps_per_epoch)) + 1
            current_step = current_global_step % max_steps_per_epoch

            if current_global_step % 10 == 0:
                sec_per_batch = float(duration)
                format_str = '%s: step=%d(%d/%d), loss=%.4f, acc=%.4f; %.3f sec/batch' % (
                    datetime.now(), current_global_step, current_step, current_epoch,
                    loss_value, accuracy_value, sec_per_batch)
                print(format_str, file=file_train_log)
                print(format_str)

            if current_global_step % OPTION.SUMMARY_PERIOD == 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={input_x: train_data, input_y: train_label})
                summary_writer.add_summary(summary_str, current_global_step)

            # Save the model checkpoint periodically (named 'model.ckpt-<global_step>').
            if current_global_step % ckpt_period == 0 or (current_global_step + 1) == max_steps:
                checkpoint_path = os.path.join(OPTION.TRAIN_DIR, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=current_global_step)

        file_train_log.close()
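# train() only assumes a dataset object exposing get_dataset_size() and
# next_batch(batch_size) returning (data, labels); the evaluate() variants additionally
# expect next_batch() to return pre-computed features. The class below is a hypothetical
# in-memory stand-in (not part of the original code) that illustrates the interface used
# by train(); the real data pipeline is defined elsewhere in the project.
import numpy as np

class ToyDataSet(object):
    """Random word-id tensors with one-hot labels, matching the placeholder shapes above."""

    def __init__(self, num_samples, sequence_len, sent_len, num_classes, vocab_size=1000):
        self._x = np.random.randint(0, vocab_size, size=(num_samples, sequence_len, sent_len))
        labels = np.random.randint(0, num_classes, size=num_samples)
        self._y = np.eye(num_classes, dtype=np.int32)[labels]
        self._pos = 0

    def get_dataset_size(self):
        return self._x.shape[0]

    def next_batch(self, batch_size):
        # Wrap around at the end of the dataset so every batch has batch_size samples.
        idx = np.arange(self._pos, self._pos + batch_size) % self._x.shape[0]
        self._pos = (self._pos + batch_size) % self._x.shape[0]
        return self._x[idx], self._y[idx]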
def evaluate(trainDataSet, time, model_time):
    # Note: the `time` argument is an integer tag used in the output file name; it shadows
    # the standard `time` module inside this function.
    with tf.Graph().as_default() as g, tf.device('/cpu:0'):
        # Placeholders for input, output and dropout
        input_x = tf.placeholder(tf.int32, [None, OPTION.SEQUENCE_LEN, OPTION.SENT_LEN], name="input_x")
        input_y = tf.placeholder(tf.int32, [None, OPTION.NUM_CLASSES], name="input_y")

        han = MODEL.Model(sequence_length=OPTION.SEQUENCE_LEN,
                          sent_length=OPTION.SENT_LEN,
                          num_classes=OPTION.NUM_CLASSES,
                          vocab_size=None,
                          embedding_size=OPTION.EMEBEDDING_DIMENSION,
                          Word2vec=True,
                          Trainable=False)

        # Inference model.
        logits, _ = han.inference(input_x, eval_data=True)

        # Calculate loss.
        loss = myTF.calculate_cross_entropy_loss(logits, input_y)

        # Get model parameters.
        # paramaters_list_reshape = han.get_paramaters_list_reshape()

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(OPTION.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(OPTION.EVAL_DIR, g)

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)) as sess:
            if os.path.exists(os.path.join(OPTION.EVAL_DIR, 'model.ckpt-best.index')):
                # new_saver = tf.train.import_meta_graph(
                #     os.path.join(OPTION.TRAIN_DIR, 'model.ckpt-' + checkpoint + '.meta'))
                saver.restore(sess, os.path.join(OPTION.EVAL_DIR, 'model.ckpt-best'))
            else:
                print('No checkpoint file found')
                return

            max_steps_per_epoch = int(math.ceil(
                trainDataSet.get_dataset_size() / float(OPTION.EVAL_BATCH_SIZE)))

            total_predicted_value = []
            for step in range(max_steps_per_epoch):
                train_data, train_label = trainDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
                # Run the loss op on this batch and collect the result.
                predicted_value = sess.run(loss, feed_dict={input_x: train_data, input_y: train_label})
                total_predicted_value.append(predicted_value)

            # test_data, test_label = evalDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
            summary = tf.Summary()
            # summary.ParseFromString(sess.run(summary_op, feed_dict={input_x: test_data, input_y: test_label}))

            total_predicted_value = np.concatenate(total_predicted_value, axis=0)
            assert trainDataSet.get_dataset_size() == total_predicted_value.shape[0], 'sample_count error!'

            detail_filename = os.path.join(OPTION.MODELPARA_DIR, 'loss_%d_%d' % (time, model_time))
            if os.path.exists(detail_filename):
                os.remove(detail_filename)
            np.savetxt(detail_filename, total_predicted_value, fmt='%f')
def evaluate(evalDataSet, time, model_time, name='train'):
    with tf.Graph().as_default() as g, tf.device('/gpu:0'):
        feature_size = OPTION.SENT_HIDDEN_SIZE * 2

        # Placeholders for input, output and dropout
        input_x = tf.placeholder(tf.int32, [None, OPTION.SEQUENCE_LEN, OPTION.SENT_LEN], name="input_x")
        feature_size = feature_size * min(model_time - 0, OPTION.DP_DEPTH)
        if feature_size > 0:
            features_before = tf.placeholder(tf.float32, [None, feature_size], name="features_before")
        else:
            features_before = None

        han = HAN_model.Model(sequence_length=OPTION.SEQUENCE_LEN,
                              sent_length=OPTION.SENT_LEN,
                              num_classes=OPTION.NUM_CLASSES,
                              vocab_size=None,
                              embedding_size=OPTION.EMEBEDDING_DIMENSION,
                              Word2vec=True,
                              Trainable=False)

        # Inference model; only the extracted features are needed here.
        _, features = han.inference(input_x, features_before, eval_data=True)

        # Get model parameters.
        # paramaters_list_reshape = han.get_paramaters_list_reshape()

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(OPTION.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(OPTION.EVAL_DIR, g)

        config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                log_device_placement=FLAGS.log_device_placement)
        # config.gpu_options.per_process_gpu_memory_fraction = 0.5  # use at most 50% of the GPU memory
        config.gpu_options.allow_growth = OPTION.MEMORY_ALLOW_GROWTH  # allocate GPU memory on demand

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        with tf.Session(config=config) as sess:
            if os.path.exists(os.path.join(OPTION.EVAL_DIR, 'model.ckpt-best.index')):
                # new_saver = tf.train.import_meta_graph(
                #     os.path.join(OPTION.TRAIN_DIR, 'model.ckpt-' + checkpoint + '.meta'))
                saver.restore(sess, os.path.join(OPTION.EVAL_DIR, 'model.ckpt-best'))
            else:
                print('No checkpoint file found')
                return

            max_steps_per_epoch = int(math.ceil(
                evalDataSet.get_dataset_size() / float(OPTION.EVAL_BATCH_SIZE)))

            total_predicted_value = []
            for step in range(max_steps_per_epoch):
                test_data, test_features = evalDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
                if feature_size > 0:
                    feed_dict = {input_x: test_data, features_before: test_features}
                else:
                    feed_dict = {input_x: test_data}
                predicted_value = sess.run(features, feed_dict=feed_dict)
                total_predicted_value.append(predicted_value)

            # test_data, test_label = evalDataSet.next_batch(OPTION.EVAL_BATCH_SIZE)
            summary = tf.Summary()
            # summary.ParseFromString(sess.run(summary_op, feed_dict={input_x: test_data, input_y: test_label}))

            # Concatenate batch outputs and trim any extra samples from the final batch.
            total_predicted_value = np.concatenate(total_predicted_value, axis=0)
            total_predicted_value = total_predicted_value[0:evalDataSet.get_dataset_size()]
            assert evalDataSet.get_dataset_size() == total_predicted_value.shape[0], 'sample_count error!'

            detail_filename = os.path.join(OPTION.MODELPARA_DIR,
                                           'features_%s_%d_%d' % (name, time, model_time))
            if os.path.exists(detail_filename):
                os.remove(detail_filename)
            np.savetxt(detail_filename, total_predicted_value, fmt='%.4f')
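# The feature files written above can be read back with np.loadtxt, e.g. to build the
# features_before input of the next model in the stack. The concrete arguments below
# (name='train', time=1, model_time=1) are only illustrative assumptions.
import os
import numpy as np

detail_filename = os.path.join(OPTION.MODELPARA_DIR, 'features_%s_%d_%d' % ('train', 1, 1))
saved_features = np.loadtxt(detail_filename, dtype=np.float32)  # shape: [num_samples, feature_size]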