[[w[0] for w in s] for s in test_sentences]) ) ) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg ) dev_data = prepare_dataset( dev_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg ) test_data = prepare_dataset( test_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg ) print("%i / %i / %i sentences in train / dev / test." % ( len(train_data), len(dev_data), len(test_data))) #长度不足补0 train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100)
def train():
    """Train the NER model with per-epoch dev/test evaluation and early stopping.

    Loads the train/dev/test sentence files named in FLAGS, converts their tags
    to ``FLAGS.tag_schema``, builds the char/tag mappings (or loads them from
    the pickle at ``FLAGS.map_file`` when it already exists), and trains a
    ``BasicModel`` for at most ``FLAGS.max_epoch`` epochs.

    After every epoch the dev and test sets are evaluated.  The model is saved
    whenever the dev F1 improves; training stops once the dev F1 has failed to
    improve for 3 consecutive epochs.
    """
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.zeros)

    # appoint tagging scheme (IOB/IOBES)
    train_sentences = update_tag_scheme(train_sentences, FLAGS.tag_schema)
    dev_sentences = update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    test_sentences = update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id, _ = char_mapping(train_sentences)
            # NOTE(review): the vocabulary is augmented from a hard-coded file
            # while the embeddings below are loaded from FLAGS.pre_emb_file --
            # confirm both are meant to point at the same embedding file.
            char_to_id, id_to_char = augment_with_pretrained(char_to_id, 'wiki_100.utf8')
        else:
            char_to_id, id_to_char = char_mapping(train_sentences)
        tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            cPickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f,
                         cPickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = cPickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, True)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, True)
    # Parenthesised single-argument print: same output as the Python 2 print
    # statement, and also valid syntax on Python 3.
    print("%i %i %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    if not FLAGS.pre_emb:
        pre_emb = None
    else:
        pre_emb = load_word2vec(FLAGS.pre_emb_file, char_to_id, FLAGS.char_dim)
        print("init embedding shape: (%d,%d)" % (pre_emb.shape[0], pre_emb.shape[1]))

    train_manager = BatchManager(train_data, FLAGS.batch_size, True)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size, False)
    test_manager = BatchManager(test_data, FLAGS.batch_size, False)

    config = BasicModelConfig(FLAGS, len(char_to_id), len(tag_to_id), 4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print("Train started!")
        model = BasicModel(config, pre_emb)
        saver = tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            # makedirs also creates missing parent directories
            os.makedirs(FLAGS.summaries_dir)
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "train"), sess.graph)
        test_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "test"), sess.graph)

        # load previous trained model or create a new model
        if not os.path.exists(FLAGS.checkpoints):
            os.makedirs(FLAGS.checkpoints)
        model_name = os.path.join(FLAGS.checkpoints, FLAGS.model_name)
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print("restore from previous traied model: %s" % FLAGS.model_name)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess, model, manager):
            """Run ``model.evaluate_step`` on every batch and concatenate the results."""
            strings = []
            predicts = []
            goldens = []
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string, batch_predict, batch_golden = model.evaluate_step(sess, batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings, predicts, goldens

        best_eval_f1 = 0
        noimpro_num = 0  # consecutive epochs without a dev F1 improvement
        for i in range(FLAGS.max_epoch):
            # train
            train_loss = []
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step, batch in bar(enumerate(train_manager.iter_batch())):
                # the merged summary op is passed to train_step as the last batch item
                batch.append(merged)
                summary, global_step, batch_loss = model.train_step(
                    sess, batch, FLAGS.dropout_keep)
                # add summary to tensorboard
                train_writer.add_summary(summary, global_step)
                train_loss.append(batch_loss)
            print("Epoch %d Train loss is %.4f" % (i + 1, np.mean(train_loss)))

            # dev
            strings, predicts, goldens = evaluate(sess, model, dev_manager)
            eval_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag,
                                     'outputs/dev')
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                noimpro_num = 0
                saver.save(sess, model_name)
            else:
                noimpro_num += 1
            print("Epoch %d Best eval f1:%.6f" % (i + 1, best_eval_f1))

            # test
            strings, predicts, goldens = evaluate(sess, model, test_manager)
            test_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag,
                                     'outputs/test', True)

            # early_stop
            if noimpro_num >= 3:
                print("Early stop! Final F1 scores on test data is :%.6f" % test_f1)
                break
            # blank line between epochs (was a bare Python 2 ``print``)
            print("")
def do_train(config):
    """Train the NER model described by ``config``.

    ``config`` is a dict.  This function reads ``config_file``, ``lower``,
    ``batch_size``, ``log_file``, ``ckpt_path``, ``pre_emb``, ``emb_file``,
    ``char_dim``, ``max_epoch`` and ``steps_check`` from it, and writes
    ``num_chars`` and ``num_tags`` into it before dumping it as JSON to
    ``config["config_file"]``.

    After every epoch the dev set is evaluated; when ``evaluate`` returns a
    truthy value the model is saved, and the test set is evaluated afterwards.
    """
    # Load the data splits and create (or read) the id mappings.
    train_sents, dev_sents, test_sents = load_data(config)
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train_sents, config)

    # Record vocabulary / tag-set sizes and persist the configuration.
    config["num_chars"] = len(word_to_id)
    config["num_tags"] = len(tag_to_id)
    with open(config["config_file"], "w") as config_fh:
        json.dump(config, config_fh, ensure_ascii=False, indent=4)

    # Convert every split to its numeric form (train, then dev, then test).
    train_data, dev_data, test_data = [
        prepare_dataset(split, word_to_id, tag_to_id, config["lower"])
        for split in (train_sents, dev_sents, test_sents)
    ]
    print("train/dev/test 句子数:{} / {} / {}".format(
        len(train_data), len(dev_data), len(test_data)))

    # Batching.
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    steps_per_epoch = train_manager.len_data  # used as the number of steps per epoch

    # Create the required paths, then the logger.
    make_path(config)
    logger = get_logger(config["log_file"])

    # Let the session grow its GPU memory allocation on demand.
    session_conf = tf.ConfigProto()
    session_conf.gpu_options.allow_growth = True

    with tf.Session(config=session_conf) as sess:
        model = Model(config)
        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])

        if not (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
            # No usable checkpoint: start from freshly initialised variables.
            logger.info("新建模型...")
            sess.run(tf.global_variables_initializer())
            if config["pre_emb"]:
                # Overwrite the initial embedding table with pre-trained vectors.
                initial_weights = sess.run(model.char_lookup.read_value())
                pretrained_weights = load_word2vec(
                    config["emb_file"], id_to_word, config["char_dim"], initial_weights)
                sess.run(model.char_lookup.assign(pretrained_weights))
                logger.info("Load pre-trained embedding.")
        else:
            # Resume from the existing checkpoint.
            logger.info("读取现有模型...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)

        logger.info("开始训练...")
        recent_losses = []
        for _epoch in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                recent_losses.append(batch_loss)
                if step % config["steps_check"] != 0:
                    continue
                # Report the mean loss accumulated since the previous report.
                iteration = step // steps_per_epoch + 1
                logger.info(
                    "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch,
                        np.mean(recent_losses)))
                recent_losses = []

            improved = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if improved:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
def train():
    """Train the NER model configured through FLAGS.

    Loads the train/dev/test sentences, converts their tags to
    ``FLAGS.tag_schema``, builds the char/tag mappings (or loads them from the
    pickle at ``FLAGS.map_file`` when that file exists), and runs 100 training
    epochs.  After each epoch the dev set is evaluated; when ``evaluate``
    returns a truthy value the model is saved to ``FLAGS.ckpt_path``, and the
    test set is evaluated afterwards.
    """
    # Load the data sets.
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Select the tagging scheme (IOB / IOBES).  update_tag_scheme is called for
    # its effect on the sentence lists; its return value is not used here.
    # The dev split must be converted as well: tag_to_id below is built from
    # the converted training tags, so leaving dev in its original scheme would
    # make dev evaluation inconsistent with train/test.
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # Augment the training dictionary with pre-trained embedding
            # entries, passing along the characters seen in the test set.
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # Example tag_to_id content:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            #  'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Convert the sentences to their numeric form.
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), len(dev_data), len(test_data)))

    # Original comment: sequences shorter than the batch length are padded with 0.
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    # NOTE(review): second make_path call kept from the original; it looks
    # redundant with the call above -- confirm before removing.
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings: grow the memory allocation on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config,
                             id_to_char, logger)
        logger.info("start training")
        loss = []
        for _ in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    # Report the mean loss accumulated since the last report.
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def main(unused_argv):
    """Build the vocabulary and hyperparameters, then dispatch on ``FLAGS.mode``.

    Supported modes are ``seq2seq_train``, ``rl_train`` and
    ``beam_search_decode``.

    Args:
        unused_argv: positional command-line arguments; must contain exactly
            one element, otherwise the flags are considered malformed.

    Raises:
        Exception: if ``unused_argv`` does not contain exactly one element.
        ValueError: if ``FLAGS.mode`` is not one of the supported modes, or if
            ``rl_train`` is requested but no pre-trained seq2seq checkpoint
            state can be found.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    if FLAGS.mode == 'rl_train':
        tf.logging.info('Starting model in %s mode...',
                        FLAGS.mode + '_' + FLAGS.reward_type)
    else:
        tf.logging.info('Starting model in %s mode...', FLAGS.mode)

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need
    # to make a batch of these hypotheses.
    if FLAGS.mode == 'beam_search_decode':
        FLAGS.batch_size = FLAGS.beam_size

    train_data, valid_data, test_data = prepare_dataset(FLAGS.data_path)
    print('TrainData Size:', len(train_data))
    print('ValidData Size:', len(valid_data))
    print('TestData Size:', len(test_data))

    print("Building vocabulary ..... ")
    word2id, id2word, _, max_ending_len, min_ending_len = creat_vocab(
        train_data, FLAGS.word_vocab_size)
    print("Finished building vocabulary!")
    word_vocab_size = len(word2id)

    # Make a namedtuple hps, containing the values of the hyperparameters that
    # the model needs
    hparam_list = [
        'mode', 'loss_rate_of_sem', 'loss_rate_of_mle', 'word_vocab_size',
        'use_mixed_loss', 'lr', 'train_keep_prob', 'rl_loss_scale_factor',
        'rand_unif_init_mag', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'coverage', 'cov_loss_wt', 'pointer_gen'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_dict['max_dec_steps'] = max_ending_len
    hps_dict['min_ending_len'] = min_ending_len
    if FLAGS.word_vocab_size is None:
        # No explicit vocabulary size requested: use the size of the built vocab.
        hps_dict['word_vocab_size'] = word_vocab_size
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # create minibatches of data
    train_batches = get_batches(len(train_data), FLAGS.batch_size)
    valid_batches = get_batches(len(valid_data), FLAGS.batch_size)

    # NOTE(review): this seeds the graph that is the default at this point,
    # while the models below are built inside fresh tf.Graph() instances --
    # confirm the seed has the intended effect.
    tf.set_random_seed(111)  # a seed value for randomness

    if hps.mode == 'seq2seq_train':
        train_dir = os.path.join(FLAGS.exp_name, "train_seq2seq")
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            # The validation model shares the training model's variables.
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m_train = SCST_RLModel(is_training=True, hps=hps)
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                m_valid = SCST_RLModel(is_training=False, hps=hps)

            if FLAGS.convert_to_coverage_model:
                assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
                convert_to_coverage_model()

            sv = tf.train.Supervisor(logdir=train_dir,
                                     save_model_secs=FLAGS.save_model_secs)
            sess_context_manager = sv.managed_session(config=util.get_config())
            tf.logging.info("Created session.")
            try:
                # this is an infinite loop until interrupted
                run_seq2seq_training(
                    m_train, m_valid, train_data, train_batches, valid_data,
                    valid_batches, word2id, max_ending_len, sv,
                    sess_context_manager)
            except KeyboardInterrupt:
                tf.logging.info(
                    "Caught keyboard interrupt on worker. Stopping supervisor..."
                )
                sv.stop()

    elif hps.mode == 'rl_train':
        train_dir = os.path.join(
            FLAGS.exp_name,
            "train_rl" + '_' + FLAGS.reward_type + 'mu_' +
            str(FLAGS.rl_loss_scale_factor))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m_train = SCST_RLModel(is_training=True, hps=hps)
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                m_valid = SCST_RLModel(is_training=False, hps=hps)

            # define load_pretrain function for restoring best seq2seq model
            # from eval_dir
            ckpt_dir = 'eval_seq2seq'
            latest_filename = "checkpoint_best" if ckpt_dir == "eval_seq2seq" else None
            ckpt_dir = os.path.join(FLAGS.exp_name, ckpt_dir)
            ckpt_state = tf.train.get_checkpoint_state(
                ckpt_dir, latest_filename=latest_filename)
            if ckpt_state is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError(
                    "No pre-trained seq2seq checkpoint found in %s" % ckpt_dir)
            # Format the path into the message; the original passed it as a
            # second print argument, leaving a literal "%s" in the output.
            print("loading pre_trained seq2seq model from %s" %
                  ckpt_state.model_checkpoint_path)
            saver = tf.train.Saver()

            def load_pretrain(sess):
                """Restore the pre-trained seq2seq weights into ``sess``."""
                return saver.restore(sess, ckpt_state.model_checkpoint_path)

            sv = tf.train.Supervisor(logdir=train_dir,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_model_secs,
                                     init_fn=load_pretrain)
            sess_context_manager = sv.managed_session(config=util.get_config())
            tf.logging.info("Created session.")
            try:
                # this is an infinite loop until interrupted
                run_rl_training(m_train, m_valid, train_data, train_batches,
                                valid_data, valid_batches, word2id,
                                max_ending_len, sv, sess_context_manager)
            except KeyboardInterrupt:
                tf.logging.info(
                    "Caught keyboard interrupt on worker. Stopping supervisor..."
                )
                sv.stop()

    elif hps.mode == 'beam_search_decode':
        # This will be the hyperparameters for the decoder model.
        # The model is configured with max_dec_steps=1 because we only ever run
        # one step of the decoder at a time (to do beam search). Note that the
        # batcher is initialized with max_dec_steps equal to e.g. 100 because
        # the batches need to contain the full summaries
        decode_model_hps = hps._replace(max_dec_steps=1)
        # NOTE(review): max_plot_len is not assigned anywhere in this function;
        # it must come from module scope (possibly it was meant to be the
        # third value returned by creat_vocab, discarded above) -- confirm.
        test_examples_list = prepare_data_for_beam_seach_decode(
            test_data, FLAGS.batch_size, word2id, max_plot_len, max_ending_len,
            FLAGS.pointer_gen)

        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                model_test = SCST_RLModel(is_training=False, hps=decode_model_hps)
            run_beam_search_decode(model_test,
                                   test_examples_list,
                                   id2word,
                                   data='test_data',
                                   ckpt_dir=FLAGS.decode_ckpt_dir)
    else:
        raise ValueError(
            "The 'mode' flag must be one of seq2seq_train/rl_train/beam_search_decode"
        )