import argparse

parser = argparse.ArgumentParser()

# for word segmentation
parser.add_argument("--source_input", default='corpus/source_input', dest="src_inp",
                    help="source input file for word segmentation")
parser.add_argument("--target_input", default='corpus/target_input', dest="trg_inp",
                    help="target input file for word segmentation")
parser.add_argument("--source_mode", default='word', choices=['char', 'word'], dest="src_mode",
                    help="char or word")
parser.add_argument("--target_mode", default='char', choices=['char', 'word'], dest="trg_mode",
                    help="char or word")

# for dataset splitting (train & val)
parser.add_argument('--source', default=FLAGS.source_data, dest='src', help='source file')
parser.add_argument('--target', default=FLAGS.target_data, dest='trg', help='target file')

args = parser.parse_args()

# step 0. apply word segmentation
## source
data_utils.word_seg(args.src_inp, args.src, args.src_mode.lower())
## target
data_utils.word_seg(args.trg_inp, args.trg, args.trg_mode.lower())

# step 1. get mapping of whole data
data_utils.prepare_whole_data(FLAGS.source_data, FLAGS.target_data,
                              FLAGS.src_vocab_size, FLAGS.trg_vocab_size,
                              skip_to_token=True)

# step 2. split data into train & val
data_utils.split_train_val(args.src, args.trg)

# step 3. generate tokens of train & val
data_utils.prepare_whole_data(FLAGS.source_data, FLAGS.target_data,
                              FLAGS.src_vocab_size, FLAGS.trg_vocab_size,
                              mode=word_seg_strategy)

# step 4. pretrain fasttext
data_utils.train_fasttext(fasttext_model, source_mapping, fasttext_hkl)
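# Sketch (assumption): data_utils.word_seg is not shown in this file. For Chinese text,
# 'char' mode would split a line into characters and 'word' mode into words. jieba is
# used here only as an example segmenter; the helper name word_seg_sketch and its
# signature are illustrative, not the repo's actual API.
import jieba

def word_seg_sketch(input_path, output_path, mode='word'):
    with open(input_path, encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if mode == 'char':
                tokens = list(line.replace(' ', ''))  # one token per character
            else:
                tokens = list(jieba.cut(line))        # word-level segmentation
            fout.write(' '.join(tokens) + '\n')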
def train_MLE():
    data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test,
                                  FLAGS.source_data, FLAGS.target_data,
                                  FLAGS.src_vocab_size, FLAGS.trg_vocab_size)

    _, trg_vocab_list = data_utils.read_map(
        FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')

    d_train = data_utils.read_data(FLAGS.source_data + '_train.token',
                                   FLAGS.target_data + '_train.token', buckets)
    d_valid = data_utils.read_data(FLAGS.source_data + '_val.token',
                                   FLAGS.target_data + '_val.token', buckets)
    print('Total document size of training data: %s' % sum(len(l) for l in d_train))
    print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))

    train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
    train_total_size = float(sum(train_bucket_sizes))
    # cumulative share of each bucket, used to sample a bucket per training step
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]
    print('train_bucket_sizes: ', train_bucket_sizes)
    print('train_total_size: ', train_total_size)
    print('train_buckets_scale: ', train_buckets_scale)

    valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
    valid_total_size = float(sum(valid_bucket_sizes))
    valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                           for i in range(len(valid_bucket_sizes))]
    print('valid_bucket_sizes: ', valid_bucket_sizes)
    print('valid_total_size: ', valid_total_size)
    print('valid_buckets_scale: ', valid_buckets_scale)

    with tf.Session() as sess:
        model = create_seq2seq(sess, 'MLE')

        if FLAGS.reset_sampling_prob:
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                sess.run(tf.assign(model.sampling_probability, reset_prob))
        if FLAGS.schedule_sampling:
            print('model.sampling_probability: ', model.sampling_probability_clip)

        step = 0
        loss = 0
        loss_list = []

        if FLAGS.schedule_sampling:
            print('sampling_decay_steps: ', FLAGS.sampling_decay_steps)
            print('sampling_probability: ', sess.run(model.sampling_probability_clip))
            print('-----')

        while step < FLAGS.max_step:
            step += 1
            # train_buckets_scale is a cumulative distribution: pick the first bucket
            # whose cumulative share exceeds a uniform random number
            random_number = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number])

            # time-major batch: each of encoder_input / decoder_input / weight is a list
            # of bucket-length steps, each step holding batch_size (e.g. 64) entries
            encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(
                d_train, bucket_id, sen=True)

            output, loss_train, _ = model.run(sess, encoder_input, decoder_input,
                                              weight, bucket_id)
            loss += loss_train / FLAGS.check_step

            if step % FLAGS.print_step == 0:
                print('Input :')
                print(en_s[0].strip())
                print('Output:')
                print(_output(output[0], trg_vocab_list))
                print('\n{} steps trained ...\n\n'.format(step))

            if step % FLAGS.check_step == 0:
                print('\nStep %s, Training perplexity: %s, Learning rate: %s'
                      % (step, math.exp(loss), sess.run(model.learning_rate)))
                for i in range(len(d_valid)):
                    encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
                    _, loss_valid = model.run(sess, encoder_input, decoder_input,
                                              weight, i, forward_only=True)
                    print('  Validation perplexity in bucket %s: %s'
                          % (i, math.exp(loss_valid)))

                # decay the learning rate if the loss has not improved over the last
                # three checks; otherwise decay the scheduled-sampling probability
                if len(loss_list) > 2 and loss > max(loss_list[-3:]):
                    sess.run(model.learning_rate_decay)
                elif step != 0 and FLAGS.schedule_sampling:
                    sess.run(model.sampling_probability_decay)
                    print('sampling_probability: ',
                          sess.run(model.sampling_probability_clip))

                loss_list.append(loss)
                loss = 0

                checkpoint_path = os.path.join(FLAGS.model_pre_dir, "MLE.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=step)
                print('Saving model at step %s\n' % step)

            if step == FLAGS.sampling_global_step:
                break
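# Sketch (assumption): _output, used for the printouts above, is not defined in this
# file. A minimal version would map the decoder's per-step logits for one sentence to
# words via the target vocabulary list and truncate at EOS. The per-step-logits layout
# and EOS id of 2 (the usual PAD/GO/EOS/UNK convention) are assumptions.
def _output(sentence_outputs, trg_vocab_list, EOS_ID=2):
    token_ids = [int(np.argmax(step_logits)) for step_logits in sentence_outputs]
    if EOS_ID in token_ids:
        token_ids = token_ids[:token_ids.index(EOS_ID)]
    return ' '.join(trg_vocab_list[i] for i in token_ids)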
def train_MLE():
    data_utils.prepare_whole_data(FLAGS.source_data_dir, FLAGS.target_data_dir,
                                  FLAGS.vocab_size)

    # read the dataset and split it into training and validation sets
    d = data_utils.read_data(FLAGS.source_data_dir + '.token',
                             FLAGS.target_data_dir + '.token', buckets)
    np.random.seed(SEED)
    # shuffles the order of the buckets; examples within each bucket keep their order
    np.random.shuffle(d)
    print('Total document size: %s' % sum(len(l) for l in d))
    print('len(d): ', len(d))

    d_train = [[] for _ in range(len(d))]
    d_valid = [[] for _ in range(len(d))]
    for i in range(len(d)):
        d_train[i] = d[i][:int(0.9 * len(d[i]))]
        d_valid[i] = d[i][int(-0.1 * len(d[i])):]

    train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]
    print('train_bucket_sizes: ', train_bucket_sizes)
    print('train_total_size: ', train_total_size)
    print('train_buckets_scale: ', train_buckets_scale)

    valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
    valid_total_size = float(sum(valid_bucket_sizes))
    valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                           for i in range(len(valid_bucket_sizes))]
    print('valid_bucket_sizes: ', valid_bucket_sizes)
    print('valid_total_size: ', valid_total_size)
    print('valid_buckets_scale: ', valid_buckets_scale)

    with tf.Session() as sess:
        model = create_seq2seq(sess, 'MLE')
        step = 0
        loss = 0
        loss_list = []
        print('sampling_decay_steps: ', FLAGS.sampling_decay_steps)
        print('sampling_probability: ', sess.run(model.sampling_probability))
        print('-----')

        while True:
            step += 1
            # buckets_scale is a cumulative percentage
            random_number = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number])

            # time-major batch: each of encoder_input / decoder_input / weight is a
            # list of bucket-length steps, each holding batch_size (e.g. 64) entries
            encoder_input, decoder_input, weight = model.get_batch(d_train, bucket_id)
            loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
            loss += loss_train / FLAGS.check_step

            if step != 0 and step % FLAGS.sampling_decay_steps == 0:
                sess.run(model.sampling_probability_decay)
                print('sampling_probability: ', sess.run(model.sampling_probability))
                if_feed_prev = bernoulli_sampling(model.sampling_probability)
                if_feed_prev = sess.run(if_feed_prev)
                print('if_feed_prev: ', not if_feed_prev)

            if step % FLAGS.check_step == 0:
                print('Step %s, Training perplexity: %s, Learning rate: %s'
                      % (step, math.exp(loss), sess.run(model.learning_rate)))
                for i in range(len(d)):
                    encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
                    loss_valid, _ = model.run(sess, encoder_input, decoder_input,
                                              weight, i, forward_only=True)
                    print('  Validation perplexity in bucket %s: %s'
                          % (i, math.exp(loss_valid)))

                if len(loss_list) > 2 and loss > max(loss_list[-3:]):
                    sess.run(model.learning_rate_decay)
                loss_list.append(loss)
                loss = 0

                checkpoint_path = os.path.join(FLAGS.model_dir, "MLE.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=step)
                print('Saving model at step %s' % step)
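# Sketch (assumption): bernoulli_sampling is called above but not defined in this file.
# One TF1-style implementation returns a boolean tensor that evaluates to True with the
# given probability, so a single sess.run yields one Bernoulli draw.
def bernoulli_sampling(prob):
    return tf.less(tf.random_uniform([], minval=0.0, maxval=1.0), prob)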
def train_RL():
    data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test,
                                  FLAGS.source_data, FLAGS.target_data,
                                  FLAGS.src_vocab_size, FLAGS.trg_vocab_size)
    d_train = data_utils.read_data(FLAGS.source_data + '_train.token',
                                   FLAGS.target_data + '_train.token', buckets)

    # three separate graphs/sessions: the RL policy, the language-model reward,
    # and the sentiment-analysis reward
    g1 = tf.Graph()
    g2 = tf.Graph()
    g3 = tf.Graph()
    sess1 = tf.Session(graph=g1)
    sess2 = tf.Session(graph=g2)
    sess3 = tf.Session(graph=g3)

    # model is the seq2seq trained with reinforcement learning
    with g1.as_default():
        model = create_seq2seq(sess1, 'RL')
        # sample size: the same input is sampled batch_size times
        model.batch_size = 5

    # model_LM is a reward function (language model)
    with g2.as_default():
        model_LM = create_seq2seq(sess2, 'MLE')
        model_LM.beam_search = False
        # calculate the probability of one sentence at a time
        model_LM.batch_size = 1

        def LM(encoder_input, decoder_input, weight, bucket_id):
            return model_LM.run(sess2, encoder_input, decoder_input, weight,
                                bucket_id, forward_only=True)[0]

    # model_SA is a second reward function: sentiment score
    with g3.as_default():
        model_SA = main.create_model(sess3)
        model_SA.batch_size = 1

        def SA(sentence, encoder_length):
            sentence = ' '.join(sentence)
            token_ids = utils.convert_to_token(sentence, model_SA.vocab_map)
            encoder_input, encoder_length, _, _ = model_SA.get_batch([(0, token_ids, sentence)])
            return model_SA.step(sess3, encoder_input, encoder_length)[0][0]

    train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]

    # let the RL model read the vocabulary mappings (dict and list)
    model.RL_readmap(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping',
                     FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')

    step = 0
    while step < FLAGS.max_step:
        step += 1
        random_number = np.random.random_sample()
        bucket_id = min([i for i in range(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number])

        # the same encoder input is used for sampling batch_size times
        encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(
            d_train, bucket_id, sen=True)
        output, loss, _ = model.run(sess1, encoder_input, decoder_input, weight,
                                    bucket_id, X=LM, Y=SA)

        if step % FLAGS.print_step == 0:
            print('Input :')
            print(en_s[0].strip())
            print('Output:')
            print(_output(output[0], model.trg_vocab_list))
            print('\n{} steps trained ...'.format(step))

        if step % FLAGS.check_step == 0:
            print('Loss at step %s: %s' % (step, loss))
            checkpoint_path = os.path.join(FLAGS.model_rl_dir, "RL.ckpt")
            model.saver.save(sess1, checkpoint_path, global_step=step)
            print('Saving model at step %s' % step)
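# Sketch (assumption): the exact reward computed inside model.run(..., X=LM, Y=SA) is
# not visible in this file. A generic way to fold a language-model score and a
# sentiment score into one per-sentence reward is a weighted sum; combined_reward,
# lambda_lm and lambda_sa are hypothetical names, not the repo's actual formula.
def combined_reward(lm_loss, sentiment_score, lambda_lm=0.5, lambda_sa=0.5):
    # lower LM loss (average negative log-likelihood) means a more fluent sentence,
    # so its contribution is negated; sentiment_score is assumed higher-is-better
    return lambda_lm * (-lm_loss) + lambda_sa * sentiment_score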
def train_MLE():
    data_utils.prepare_whole_data(FLAGS.source_data_dir, FLAGS.target_data_dir,
                                  FLAGS.vocab_size)

    # read the dataset and split it into training and validation sets
    d = data_utils.read_data(FLAGS.source_data_dir + '.token',
                             FLAGS.target_data_dir + '.token', buckets)
    print('Total document size: %s' % sum(len(l) for l in d))

    d_train = [[] for _ in range(len(d))]
    d_valid = [[] for _ in range(len(d))]
    for i in range(len(d)):
        d_train[i] = d[i][:int(0.9 * len(d[i]))]
        d_valid[i] = d[i][int(-0.1 * len(d[i])):]

    train_bucket_sizes = [len(d[b]) for b in range(len(d))]
    train_total_size = float(sum(train_bucket_sizes))
    # cumulative share of each bucket, used to sample a bucket per training step
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]

    sess = tf.Session()
    model = create_seq2seq(sess, 'MLE')
    step = 0
    loss = 0
    loss_list = []

    while True:
        step += 1
        random_number = np.random.random_sample()
        bucket_id = min([i for i in range(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number])

        encoder_input, decoder_input, weight = model.get_batch(d_train, bucket_id)
        loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        loss += loss_train / FLAGS.check_step

        if step % FLAGS.check_step == 0:
            print('Step %s, Training perplexity: %s, Learning rate: %s'
                  % (step, math.exp(loss), sess.run(model.learning_rate)))
            for i in range(len(d)):
                encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
                loss_valid, _ = model.run(sess, encoder_input, decoder_input,
                                          weight, i, forward_only=True)
                print('  Validation perplexity in bucket %s: %s'
                      % (i, math.exp(loss_valid)))

            # decay the learning rate if the loss has not improved over the last three checks
            if len(loss_list) > 2 and loss > max(loss_list[-3:]):
                sess.run(model.learning_rate_decay)
            loss_list.append(loss)
            loss = 0

            checkpoint_path = os.path.join(FLAGS.model_dir, "MLE.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=step)
            print('Saving model at step %s' % step)
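# Sketch (assumption): the model's learning_rate and learning_rate_decay ops used in
# the check-step logic above are defined inside create_seq2seq, not here. They are
# presumably wired roughly like this; the initial value and 0.99 factor are illustrative.
def _make_learning_rate_ops(initial_lr=0.5, decay_factor=0.99):
    learning_rate = tf.Variable(float(initial_lr), trainable=False, name='learning_rate')
    # each sess.run(learning_rate_decay) multiplies the current rate by decay_factor
    learning_rate_decay = learning_rate.assign(learning_rate * decay_factor)
    return learning_rate, learning_rate_decay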