Example 1
import argparse

import data_utils  # project-specific preprocessing helpers used throughout these examples
parser = argparse.ArgumentParser()
# for word segmentation
parser.add_argument("--source_input", default='corpus/source_input', dest="src_inp", help="source input file for word segmentation")
parser.add_argument("--target_input", default='corpus/target_input', dest="trg_inp", help="target input file for word segmentation")
parser.add_argument("--source_mode", default='word', choices=['char', 'word'], dest="src_mode", help="char or word")
parser.add_argument("--target_mode", default='char', choices=['char', 'word'], dest="trg_mode", help="char or word")
# for dataset splitting (train & val)
# NOTE: FLAGS (e.g. FLAGS.source_data) is assumed to be defined elsewhere in the project, presumably via tf.app.flags
parser.add_argument('--source', default=FLAGS.source_data, dest='src', help='source file')
parser.add_argument('--target', default=FLAGS.target_data, dest='trg', help='target file')
args = parser.parse_args()

#step 0. apply word segmentation (the 'char' vs 'word' modes are sketched after this example)
## source
data_utils.word_seg(args.src_inp,args.src,args.src_mode.lower())
## target
data_utils.word_seg(args.trg_inp,args.trg,args.trg_mode.lower())

#step 1. get mapping of whole data
data_utils.prepare_whole_data(FLAGS.source_data, FLAGS.target_data, FLAGS.src_vocab_size, FLAGS.trg_vocab_size, skip_to_token=True)

#step 2. split data into train & val
data_utils.split_train_val(args.src,args.trg)

#step 3. generate tokens of train & val (word_seg_strategy is defined elsewhere in the original project)
data_utils.prepare_whole_data(FLAGS.source_data, FLAGS.target_data, FLAGS.src_vocab_size, FLAGS.trg_vocab_size, mode=word_seg_strategy)

#step 4. pretrain fasttext (fasttext_model, source_mapping and fasttext_hkl are defined elsewhere in the original project)
data_utils.train_fasttext(fasttext_model,source_mapping,fasttext_hkl)
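
The --source_mode / --target_mode flags above control whether each line is tokenized into characters or into whitespace-separated words before the rest of the pipeline runs. The project's own data_utils.word_seg is not shown on this page; the following is only a minimal sketch of what such a helper could look like, assuming one sentence per line and space-separated output (word_seg_sketch and its signature are illustrative, not the project's API).

# Hypothetical illustration only -- the real data_utils.word_seg may differ.
def word_seg_sketch(input_path, output_path, mode='word'):
    """Write a tokenized copy of input_path, one sentence per line."""
    with open(input_path, encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if mode == 'char':
                # character-level tokens: every non-space character becomes a token
                tokens = list(line.replace(' ', ''))
            else:
                # word-level tokens: assume the text is already whitespace-separated
                tokens = line.split()
            fout.write(' '.join(tokens) + '\n')
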
Example 2
def train_MLE(): 

  data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test, FLAGS.source_data, FLAGS.target_data, FLAGS.src_vocab_size, FLAGS.trg_vocab_size)
  _ , trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')

  d_train = data_utils.read_data(FLAGS.source_data + '_train.token',FLAGS.target_data + '_train.token',buckets)
  d_valid = data_utils.read_data(FLAGS.source_data + '_val.token',FLAGS.target_data + '_val.token',buckets)
  
  print('Total document size of training data: %s' % sum(len(l) for l in d_train))
  print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))

  train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
  train_total_size = float(sum(train_bucket_sizes))
  train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                         for i in range(len(train_bucket_sizes))]
  print('train_bucket_sizes: ',train_bucket_sizes)
  print('train_total_size: ',train_total_size)
  print('train_buckets_scale: ',train_buckets_scale)
  valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
  valid_total_size = float(sum(valid_bucket_sizes))
  valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                         for i in range(len(valid_bucket_sizes))]
  print('valid_bucket_sizes: ',valid_bucket_sizes)
  print('valid_total_size: ',valid_total_size)
  print('valid_buckets_scale: ',valid_buckets_scale)

  with tf.Session() as sess:

    model = create_seq2seq(sess, 'MLE')
    if FLAGS.reset_sampling_prob: 
      with tf.variable_scope('sampling_prob',reuse=tf.AUTO_REUSE):
        sess.run(tf.assign(model.sampling_probability,reset_prob))
    if FLAGS.schedule_sampling:
      print('model.sampling_probability: ',model.sampling_probability_clip)
    #sess.run(tf.assign(model.sampling_probability,1.0))
    step = 0
    loss = 0
    loss_list = []
 
    if FLAGS.schedule_sampling:
      print('sampling_decay_steps: ',FLAGS.sampling_decay_steps)
      print('sampling_probability: ',sess.run(model.sampling_probability_clip))
      print('-----')

    while step < FLAGS.max_step:
      step += 1

      random_number = np.random.random_sample()
      # buckets_scale holds cumulative size percentages (a standalone sketch follows this example)
      bucket_id = min([i for i in range(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number])
      encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(d_train, bucket_id, sen=True)
      #print('batch_size: ',model.batch_size)      ==> 64
      #print('batch_size: ',len(encoder_input[0])) ==> 64
      #print('batch_size: ',len(encoder_input))    ==> 15,50,...
      #print('batch_size: ',len(decoder_input))    ==> 15,50,... 
      #print('batch_size: ',len(weight))           ==> 15,50,...
      output, loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
      loss += loss_train / FLAGS.check_step

      #if step!=0 and step % FLAGS.sampling_decay_steps == 0:
      #  sess.run(model.sampling_probability_decay)
      #  print('sampling_probability: ',sess.run(model.sampling_probability))
        
      if step % FLAGS.print_step == 0:
        print('Input :')
        print(en_s[0].strip())
        print('Output:')
        print(_output(output[0], trg_vocab_list))
        print('\n{} steps trained ...\n\n'.format(step))

      if step % FLAGS.check_step == 0:
        print('\nStep %s, Training perplexity: %s, Learning rate: %s' % (step, math.exp(loss),
                                  sess.run(model.learning_rate))) 
        for i in range(len(d_train)):
          encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
          _, loss_valid = model.run(sess, encoder_input, decoder_input, weight, i, forward_only = True)
          print('  Validation perplexity in bucket %s: %s' % (i, math.exp(loss_valid)))
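        # decay the learning rate when the current loss is worse than all of the last three checkpoint losses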
        if len(loss_list) > 2 and loss > max(loss_list[-3:]):
          sess.run(model.learning_rate_decay)
        else:
          if step!=0:
            if FLAGS.schedule_sampling:
              sess.run(model.sampling_probability_decay)
              print('sampling_probability: ',sess.run(model.sampling_probability_clip))
        loss_list.append(loss)  
        loss = 0

        checkpoint_path = os.path.join(FLAGS.model_pre_dir, "MLE.ckpt")
        model.saver.save(sess, checkpoint_path, global_step = step)
        print('Saving model at step %s\n' % step)
      if step == FLAGS.sampling_global_step: break
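
train_buckets_scale above is a list of cumulative size fractions, so drawing a uniform random number and taking the first index whose cumulative value exceeds it picks a bucket with probability proportional to its size. A standalone sketch of that sampling mechanism, using made-up bucket sizes, is shown below.

# Standalone illustration of bucket sampling via cumulative size fractions.
import numpy as np

bucket_sizes = [120, 300, 80]   # hypothetical bucket sizes
total = float(sum(bucket_sizes))
buckets_scale = [sum(bucket_sizes[:i + 1]) / total   # cumulative fractions, last entry is 1.0
                 for i in range(len(bucket_sizes))]

counts = [0] * len(bucket_sizes)
for _ in range(10000):
    r = np.random.random_sample()                    # uniform in [0, 1)
    bucket_id = min([i for i in range(len(buckets_scale))
                     if buckets_scale[i] > r])       # first cumulative value above r
    counts[bucket_id] += 1

print(counts)   # counts come out roughly proportional to bucket_sizes
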
Example 3
def train_MLE(): 
  data_utils.prepare_whole_data(FLAGS.source_data_dir, FLAGS.target_data_dir, FLAGS.vocab_size)

  # read dataset and split to training set and validation set
  d = data_utils.read_data(FLAGS.source_data_dir + '.token', FLAGS.target_data_dir + '.token', buckets)
  np.random.seed(SEED)
  np.random.shuffle(d)
  print('Total document size: %s' % sum(len(l) for l in d))
  print('len(d): ', len(d))
  d_train = [[] for _ in range(len(d))]
  d_valid = [[] for _ in range(len(d))]
  for i in range(len(d)):
    d_train[i] = d[i][:int(0.9 * len(d[i]))]
    d_valid[i] = d[i][int(-0.1 * len(d[i])):]

  train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
  train_total_size = float(sum(train_bucket_sizes))
  train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                         for i in range(len(train_bucket_sizes))]
  print('train_bucket_sizes: ',train_bucket_sizes)
  print('train_total_size: ',train_total_size)
  print('train_buckets_scale: ',train_buckets_scale)
  valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
  valid_total_size = float(sum(valid_bucket_sizes))
  valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                         for i in range(len(valid_bucket_sizes))]
  print('valid_bucket_sizes: ',valid_bucket_sizes)
  print('valid_total_size: ',valid_total_size)
  print('valid_buckets_scale: ',valid_buckets_scale)

  with tf.Session() as sess:

    model = create_seq2seq(sess, 'MLE')
    step = 0
    loss = 0
    loss_list = []
 
    print('sampling_decay_steps: ',FLAGS.sampling_decay_steps)
    print('sampling_probability: ',sess.run(model.sampling_probability))
    print('-----')
    while True:
      step += 1

      random_number = np.random.random_sample()
      # buckets_scale is a list of cumulative percentages
      bucket_id = min([i for i in range(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number])
      encoder_input, decoder_input, weight = model.get_batch(d_train, bucket_id)
      #print('batch_size: ',model.batch_size)      ==> 64
      #print('batch_size: ',len(encoder_input[0])) ==> 64
      #print('batch_size: ',len(encoder_input))    ==> 15,50,...
      #print('batch_size: ',len(decoder_input))    ==> 15,50,... 
      #print('batch_size: ',len(weight))           ==> 15,50,...
      loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
      loss += loss_train / FLAGS.check_step
      #print(model.token2word(sen)[0])
      if step!=0 and step % FLAGS.sampling_decay_steps == 0:
        sess.run(model.sampling_probability_decay)
        print('sampling_probability: ',sess.run(model.sampling_probability))
        if_feed_prev = bernoulli_sampling(model.sampling_probability)
        if_feed_prev = sess.run(if_feed_prev)
        print('if_feed_prev: ',not if_feed_prev)
        
      if step % FLAGS.check_step == 0:
        print('Step %s, Training perplexity: %s, Learning rate: %s' % (step, math.exp(loss),
                                  sess.run(model.learning_rate))) 
        for i in range(len(d)):
          encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
          loss_valid, _ = model.run(sess, encoder_input, decoder_input, weight, i, forward_only = True)
          print('  Validation perplexity in bucket %s: %s' % (i, math.exp(loss_valid)))
        if len(loss_list) > 2 and loss > max(loss_list[-3:]):
          sess.run(model.learning_rate_decay)
        loss_list.append(loss)  
        loss = 0

        checkpoint_path = os.path.join(FLAGS.model_dir, "MLE.ckpt")
        model.saver.save(sess, checkpoint_path, global_step = step)
        print('Saving model at step %s' % step)
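
Examples 2 and 3 decay model.sampling_probability during training and use a Bernoulli draw (bernoulli_sampling) to decide whether the decoder is fed the ground-truth token or its own previous prediction (scheduled sampling). The decay op itself lives in the model code, which is not shown here; the snippet below is only a NumPy sketch of the idea, assuming a simple multiplicative decay factor.

# Scheduled-sampling sketch; the decay schedule here is an assumption, not the project's.
import numpy as np

sampling_probability = 1.0      # probability of feeding the ground-truth token
sampling_decay_steps = 500      # assumed counterpart of FLAGS.sampling_decay_steps
decay_factor = 0.99             # assumed multiplicative decay

flips = []
for step in range(1, 5001):
    if step % sampling_decay_steps == 0:
        sampling_probability = max(0.0, sampling_probability * decay_factor)
    # per decoder step: True -> feed the ground-truth token, False -> feed the model's own output
    flips.append(np.random.random_sample() < sampling_probability)

print('final sampling_probability:', sampling_probability)
print('fraction of ground-truth feeds:', np.mean(flips))
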
Example 4
def train_RL():

  data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test, FLAGS.source_data, FLAGS.target_data, FLAGS.src_vocab_size, FLAGS.trg_vocab_size)
  d_train = data_utils.read_data(FLAGS.source_data + '_train.token',FLAGS.target_data + '_train.token',buckets)
  #print(d_train[0][0])

  g1 = tf.Graph()
  g2 = tf.Graph()
  g3 = tf.Graph()
  sess1 = tf.Session(graph = g1)
  sess2 = tf.Session(graph = g2)
  sess3 = tf.Session(graph = g3)
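  # the three models below live in separate graphs and sessions, so their variables and checkpoints stay independent (see the sketch after this example)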
  # model is for training seq2seq with Reinforcement Learning
  with g1.as_default():
    model = create_seq2seq(sess1, 'RL')
    # we set sample size = ?
    model.batch_size = 5
  # model_LM is for a reward function (language model)
  with g2.as_default():
    model_LM = create_seq2seq(sess2, 'MLE')
    model_LM.beam_search = False
    # calculate the probability of only one sentence
    model_LM.batch_size = 1

  def LM(encoder_input, decoder_input, weight, bucket_id):
    return model_LM.run(sess2, encoder_input, decoder_input, weight, bucket_id, forward_only = True)[0]
  # new reward function: sentiment score
  with g3.as_default():
    model_SA = main.create_model(sess3) 
    model_SA.batch_size = 1
 
  def SA(sentence, encoder_length):
    sentence = ' '.join(sentence)
    token_ids = utils.convert_to_token(sentence, model_SA.vocab_map)
    encoder_input, encoder_length, _, _ = model_SA.get_batch([(0, token_ids, sentence)])
    return model_SA.step(sess3, encoder_input, encoder_length)[0][0]


  train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
  train_total_size = float(sum(train_bucket_sizes))
  train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                         for i in range(len(train_bucket_sizes))]

  # have the RL model load the vocabulary mappings (dict and list)
  model.RL_readmap(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping', FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
  step = 0
  

  while step < FLAGS.max_step:
    step += 1

    random_number = np.random.random_sample()
    bucket_id = min([i for i in range(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number])
    
    # the same encoder_input for sampling batch_size times
    #encoder_input, decoder_input, weight = model.get_batch(d, bucket_id, rand = False)    

    encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(d_train, bucket_id, sen=True)
    output, loss, _ = model.run(sess1, encoder_input, decoder_input, weight, bucket_id, X = LM, Y = SA)
   
    # debug 
    #encoder_input = np.reshape(np.transpose(encoder_input, (1, 0, 2)), (-1, FLAGS.vocab_size))
    #encoder_input = np.split(encoder_input, FLAGS.max_length)

    #print(model.token2word(encoder_input)[0])
    #print(model.token2word(sen)[0])
    
    if step % FLAGS.print_step == 0:
      print('Input :')
      print(en_s[0].strip())
      print('Output:')
      print(_output(output[0], model.trg_vocab_list))
      print('\n{} steps trained ...'.format(step))

    if step % FLAGS.check_step == 0:
      print('Loss at step %s: %s' % (step, loss))
      checkpoint_path = os.path.join(FLAGS.model_rl_dir, "RL.ckpt")
      model.saver.save(sess1, checkpoint_path, global_step = step)
      print('Saving model at step %s' % step)
def train_MLE():
    data_utils.prepare_whole_data(FLAGS.source_data_dir, FLAGS.target_data_dir,
                                  FLAGS.vocab_size)

    # read dataset and split to training set and validation set
    d = data_utils.read_data(FLAGS.source_data_dir + '.token',
                             FLAGS.target_data_dir + '.token', buckets)
    print('Total document size: %s' % sum(len(l) for l in d))

    d_train = [[] for _ in range(len(d))]
    d_valid = [[] for _ in range(len(d))]
    for i in range(len(d)):
        d_train[i] = d[i][:int(0.9 * len(d[i]))]
        d_valid[i] = d[i][int(-0.1 * len(d[i])):]

    train_bucket_sizes = [len(d[b]) for b in range(len(d))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in range(len(train_bucket_sizes))
    ]

    sess = tf.Session()

    model = create_seq2seq(sess, 'MLE')
    step = 0
    loss = 0
    loss_list = []

    while True:
        step += 1

        random_number = np.random.random_sample()
        bucket_id = min([
            i for i in range(len(train_buckets_scale))
            if train_buckets_scale[i] > random_number
        ])
        encoder_input, decoder_input, weight = model.get_batch(
            d_train, bucket_id)
        loss_train, _ = model.run(sess, encoder_input, decoder_input, weight,
                                  bucket_id)
        loss += loss_train / FLAGS.check_step
        #print(model.token2word(sen)[0])
        if step % FLAGS.check_step == 0:
            print('Step %s, Training perplexity: %s, Learning rate: %s' %
                  (step, math.exp(loss), sess.run(model.learning_rate)))
            for i in range(len(d)):
                encoder_input, decoder_input, weight = model.get_batch(
                    d_valid, i)
                loss_valid, _ = model.run(sess,
                                          encoder_input,
                                          decoder_input,
                                          weight,
                                          i,
                                          forward_only=True)
                print('  Validation perplexity in bucket %s: %s' %
                      (i, math.exp(loss_valid)))
            if len(loss_list) > 2 and loss > max(loss_list[-3:]):
                sess.run(model.learning_rate_decay)
            loss_list.append(loss)
            loss = 0

            checkpoint_path = os.path.join(FLAGS.model_dir, "MLE.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=step)
            print('Saving model at step %s' % step)
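
train_RL in Example 4 keeps the RL seq2seq model and the two reward models (a language model and a sentiment classifier) in separate tf.Graphs, each with its own tf.Session, so their variables, savers and checkpoints do not collide. Below is a stripped-down sketch of that pattern with placeholder model-building code (build_toy_model is illustrative; the originals are create_seq2seq and main.create_model).

# Sketch of hosting independent models in separate graphs/sessions (TF1-style API).
import tensorflow as tf

def build_toy_model(name):
    # stand-in for create_seq2seq / main.create_model in the original code
    with tf.variable_scope(name):
        x = tf.placeholder(tf.float32, [None, 4], name='x')
        w = tf.get_variable('w', [4, 1])
        return x, tf.matmul(x, w)

g1, g2 = tf.Graph(), tf.Graph()
sess1, sess2 = tf.Session(graph=g1), tf.Session(graph=g2)

with g1.as_default():
    x1, y1 = build_toy_model('rl_model')            # the model being trained
    sess1.run(tf.global_variables_initializer())

with g2.as_default():
    x2, y2 = build_toy_model('reward_model')        # a frozen reward model
    sess2.run(tf.global_variables_initializer())

# each session only ever sees the variables of its own graph
def reward(batch):
    return sess2.run(y2, feed_dict={x2: batch})

print(reward([[1.0, 2.0, 3.0, 4.0]]))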