# Imports consolidated from the later examples in this listing; `inference`
# is assumed to be defined elsewhere in the repository.
import sys
import tensorflow as tf
import data_utils
from run import create_seq2seq
from flags import FLAGS, buckets, source_mapping, target_mapping

def test():
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.load_userdict('dict_fasttext.txt')
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
        sentence = ' '.join(jieba.lcut(sentence))
        print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
        sentence = ' '.join(sentence)
    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence),
                                                src_vocab_dict, False)
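        # Pick the smallest bucket whose source length fits the sentence;
        # the initial value below falls back to the largest bucket.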
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.

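        # `inference` (defined elsewhere in the repository) is assumed to
        # argmax each step's logits, trim at EOS, and print the decoded reply.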
        inference(model, output, src_vocab_dict, trg_vocab_dict)
        # Print out the system reply corresponding to the outputs.
        #print("System reply: " + "".join([tf.compat.as_str(trg_vocab_dict[output]) for output in outputs]))
        print("User input  : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            sentence = ' '.join(jieba.lcut(sentence))
            print('sentence: ', sentence)
        elif FLAGS.src_word_seg == 'char':
            sentence = ' '.join(sentence)
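
# For reference, a minimal sketch (an assumption, not shown in this example)
# of what the `inference` helper above presumably does -- the same greedy
# argmax and EOS trimming that appears verbatim in a later example:
#
#   outputs = [int(np.argmax(logit, axis=1)) for logit in output]
#   if data_utils.EOS_ID in outputs:
#       outputs = outputs[:outputs.index(data_utils.EOS_ID)]
#   print("System reply: " + "".join(tf.compat.as_str(trg_vocab_dict[o]) for o in outputs))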
Example No. 2
def test(filename):
    if FLAGS.src_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    #model.decoder_max_len = None

    #sources = ["你是誰","你是誰"]
    #targets = ["你是不是想人家","我是說你是我老婆"]
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
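    # For each (context, utterance) pair: segment both sides, map them to
    # token ids, and score the reference target under the model.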
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = ' '.join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = ' '.join(source)
        if FLAGS.trg_word_seg == 'word':
            target = ' '.join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = ' '.join(target)
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source),
                                                    src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target),
                                                    trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        bucket_id = len(buckets) - 1  # fall back to the largest bucket
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)[:trg_len]
        # Score each reference target token under the model's per-step output
        # distribution, then average over the target sequence.
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        output = np.mean(output)
        scores.append(output)
    scores = np.mean(scores)
    return scores
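
# Example usage (hypothetical file name; the CSV must provide "context" and
# "utterance" columns, as read above):
#
#   mean_score = test('valid_pairs.csv')
#   print('mean per-token target score: %.4f' % mean_score)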
Example No. 3
def val(mo):

  d_valid = data_utils.read_val_data(FLAGS.source_data + '_val.token', FLAGS.target_data + '_val.token', buckets)

  _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
  
  print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))

  with tf.Session() as sess:
    
    model = create_seq2seq(sess, 'TEST')
    loss_list = []
    
    cf = csv.writer(open(FLAGS.output, 'w'), delimiter='|')
    cf.writerow(['context', 'utterance'])    

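    # Decode every validation example one at a time and write a
    # (context, generated utterance) row per example.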
    for i in range(len(d_valid)):
      encoder_input, decoder_input, weight, en_s, de_s = model.get_one(d_valid, i, sen=True)
      output = model.run(sess, encoder_input, decoder_input, weight, d_valid[i][0])
      cf.writerow([''.join(en_s.strip().split()), _output(output[0], trg_vocab_list)])
      if i % 1000 == 0:
        print('Generate {} ...'.format(i))
def test():
    sess = tf.Session()
    vocab_dict, vocab_list = data_utils.read_map(FLAGS.source_data_dir + '.' +
                                                 str(FLAGS.vocab_size) +
                                                 '.mapping')
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()

    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence),
                                                vocab_dict, False)
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out the system reply corresponding to the outputs.
        print("System reply: " + " ".join(
            [tf.compat.as_str(vocab_list[output]) for output in outputs]))
        print("User input  : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
Example No. 5
import numpy as np
import os
import sys
sys.path.append('sentiment_analysis/')
import math
import requests
import tensorflow as tf
from flask import Flask
import data_utils
import seq2seq_model
from sentiment_analysis import run
from sentiment_analysis import dataset
from flags import FLAGS
from run import replace_words, SEED, buckets, sub_words, create_seq2seq
from util import *

sess = tf.Session()
vocab_dict, vocab_list = data_utils.read_map(FLAGS.source_data_dir + '.' +
                                             str(FLAGS.vocab_size) +
                                             '.mapping')
model = create_seq2seq(sess, 'TEST')
model.batch_size = 1

# create a Flask app instance
app = Flask(__name__)


# method to reply to a message from the sender
def reply(user_id, msg):
    data = {"recipient": {"id": user_id}, "message": {"text": msg}}
    # Post request using the Facebook Graph API v3.1
    resp = requests.post(
        "https://graph.facebook.com/v3.1/me/messages?access_token=" +
        ACCESS_TOKEN,
        json=data)
Example No. 6
    def RL_readmap(self, src_map_path, trg_map_path):
        self.src_vocab_dict, self.src_vocab_list = data_utils.read_map(
            src_map_path)
        self.trg_vocab_dict, self.trg_vocab_list = data_utils.read_map(
            trg_map_path)
def get_output_dfs_per_model(model_dict, params, use_current_model=False, export_each=True, pretrain_vec='fasttext'):
    if use_current_model:
        source_mapping = '%s.%s.mapping'%(FLAGS.source_data,FLAGS.src_vocab_size) 
        target_mapping = '%s.%s.mapping'%(FLAGS.target_data,FLAGS.trg_vocab_size)
        source_data = FLAGS.source_data
        target_data = FLAGS.target_data
        #model_name = FLAGS.model_dir 
        #model_name = re.sub('/','',model_name)
        #model_name = re.sub('model','',model_name)
    else:
        source_data = 'corpus/%s/source'%model_dict['corpus']
        target_data = 'corpus/%s/target'%model_dict['corpus']
        source_mapping = '%s.%s.mapping'%(source_data,model_dict['source_token']) 
        target_mapping = '%s.%s.mapping'%(target_data,model_dict['target_token'])
        model_name = model_dict['name']
        FLAGS.source_data = source_data
        FLAGS.target_data = target_data
        FLAGS.src_vocab_size = model_dict['source_token'] 
        FLAGS.trg_vocab_size = model_dict['target_token'] 
        if FLAGS.mode == "MLE":
            FLAGS.model_dir = os.path.join('model/',model_name)
        elif FLAGS.mode == "RL":
            FLAGS.model_rl_dir = os.path.join('model_RL/',model_name)
        if pretrain_vec == 'fasttext':
            fasttext_npy = 'corpus/%s/fasttext.npy'%model_dict['corpus']
            FLAGS.pretrain_vec = np.load(fasttext_npy)
    print('########################################################################')
    if FLAGS.mode == "MLE":
        print('model_dir: ',FLAGS.model_dir)
    elif FLAGS.mode == "RL":
        print('model_rl_dir: ',FLAGS.model_rl_dir)
    if not use_current_model and pretrain_vec == 'fasttext':
        print('fasttext_npy: ', fasttext_npy)
    print('########################################################################')

    d_valid = data_utils.read_data(source_data + '_val.token',target_data + '_val.token',buckets)  
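    # d_valid is a list of buckets; each bucket holds
    # (source_token_ids, target_token_ids) pairs, as unpacked below.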

    dfs = []
    DF = namedtuple("DF",("id","source","target")) 
    for bucket_id, d_val in enumerate(d_valid):
        for i, d in enumerate(d_val):
            source, target = None, None
            try:
                source = ''.join(data_utils.token_to_text(d[0],source_mapping))
            except IndexError:
                pass
            try:
                target = ''.join(data_utils.token_to_text(d[1],target_mapping))
                target = re.sub('EOS','',target)
            except IndexError:
                pass
            dfs.append(DF("%s_%s"%(bucket_id,i),source,target))
    dfs = pd.DataFrame(dfs)
    dfs = [dfs]
    
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    params = get_all_products(params)
    tf.reset_default_graph()
    with tf.Session() as sess:
        model = create_seq2seq(sess, 'TEST')
        model.batch_size = batch_size 
        for param in params:
            print('param: ', param)
            df = get_output_df(sess,param,d_valid,model,src_vocab_dict,trg_vocab_dict)
            dfs.append(df)
    dfs = map(lambda x: x.set_index('id'), dfs)
    dfs = reduce(lambda x, y: x.join(y), dfs)  # requires functools.reduce under Python 3
    dfs = dfs.reset_index()
    if export_each:
        o_file = '%s.csv'%model_dict['name']
        o_file = re.sub("/","_",o_file)
        dfs.to_csv(os.path.join(outputs_dir,o_file),index=False)
    return dfs
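
# Example call (all values hypothetical; the model_dict keys follow the
# lookups above, and params is expanded by get_all_products):
#
#   model_dict = {'corpus': 'chat', 'name': 'baseline',
#                 'source_token': 50000, 'target_token': 50000}
#   params = {'beam_size': [1, 5]}
#   dfs = get_output_dfs_per_model(model_dict, params)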
    def RL_readmap(self, map_path):
        self.vocab_dict, self.vocab_list = data_utils.read_map(map_path)
Example No. 9
def train_MLE(): 

  data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test, FLAGS.source_data, FLAGS.target_data, FLAGS.src_vocab_size, FLAGS.trg_vocab_size)
  _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')

  d_train = data_utils.read_data(FLAGS.source_data + '_train.token',FLAGS.target_data + '_train.token',buckets)
  d_valid = data_utils.read_data(FLAGS.source_data + '_val.token',FLAGS.target_data + '_val.token',buckets)
  
  print('Total document size of training data: %s' % sum(len(l) for l in d_train))
  print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))

  train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
  train_total_size = float(sum(train_bucket_sizes))
  train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                         for i in range(len(train_bucket_sizes))]
  print('train_bucket_sizes: ',train_bucket_sizes)
  print('train_total_size: ',train_total_size)
  print('train_buckets_scale: ',train_buckets_scale)
  valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
  valid_total_size = float(sum(valid_bucket_sizes))
  valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                         for i in range(len(valid_bucket_sizes))]
  print('valid_bucket_sizes: ',valid_bucket_sizes)
  print('valid_total_size: ',valid_total_size)
  print('valid_buckets_scale: ',valid_buckets_scale)

  with tf.Session() as sess:

    model = create_seq2seq(sess, 'MLE')
    if FLAGS.reset_sampling_prob: 
      with tf.variable_scope('sampling_prob',reuse=tf.AUTO_REUSE):
        sess.run(tf.assign(model.sampling_probability,reset_prob))
    if FLAGS.schedule_sampling:
      print('model.sampling_probability: ',model.sampling_probability_clip)
    #sess.run(tf.assign(model.sampling_probability,1.0))
    step = 0
    loss = 0
    loss_list = []
 
    if FLAGS.schedule_sampling:
      print('sampling_decay_steps: ',FLAGS.sampling_decay_steps)
      print('sampling_probability: ',sess.run(model.sampling_probability_clip))
      print('-----')

    while step < FLAGS.max_step:
      step += 1

      random_number = np.random.random_sample()
      # train_buckets_scale holds the cumulative fraction of examples per
      # bucket, so this samples a bucket with probability proportional to its size.
      bucket_id = min([i for i in range(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number])
      encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(d_train, bucket_id, sen=True)
      #print('batch_size: ',model.batch_size)      ==> 64
      #print('batch_size: ',len(encoder_input[0])) ==> 64
      #print('batch_size: ',len(encoder_input))    ==> 15,50,...
      #print('batch_size: ',len(decoder_input))    ==> 15,50,... 
      #print('batch_size: ',len(weight))           ==> 15,50,...
      output, loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
      loss += loss_train / FLAGS.check_step

      #if step!=0 and step % FLAGS.sampling_decay_steps == 0:
      #  sess.run(model.sampling_probability_decay)
      #  print('sampling_probability: ',sess.run(model.sampling_probability))
        
      if step % FLAGS.print_step == 0:
        print('Input :')
        print(en_s[0].strip())
        print('Output:')
        print(_output(output[0], trg_vocab_list))
        print('\n{} steps trained ...\n\n'.format(step))

      if step % FLAGS.check_step == 0:
        print('\nStep %s, Training perplexity: %s, Learning rate: %s' % (step, math.exp(loss),
                                  sess.run(model.learning_rate))) 
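        # Note: math.exp(loss) raises OverflowError for large losses; a
        # guarded variant (an assumption, not in the original) is:
        #   perplexity = math.exp(loss) if loss < 300 else float('inf')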
        for i in range(len(d_train)):
          encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
          _, loss_valid = model.run(sess, encoder_input, decoder_input, weight, i, forward_only = True)
          print('  Validation perplexity in bucket %s: %s' % (i, math.exp(loss_valid)))
        if len(loss_list) > 2 and loss > max(loss_list[-3:]):
          sess.run(model.learning_rate_decay)
        else:
          if step!=0:
            if FLAGS.schedule_sampling:
              sess.run(model.sampling_probability_decay)
              print('sampling_probability: ',sess.run(model.sampling_probability_clip))
        loss_list.append(loss)  
        loss = 0

        checkpoint_path = os.path.join(FLAGS.model_pre_dir, "MLE.ckpt")
        model.saver.save(sess, checkpoint_path, global_step = step)
        print('Saving model at step %s\n' % step)
      if step == FLAGS.sampling_global_step: break
Example No. 10
def test():
  if FLAGS.src_word_seg == 'word':
    import jieba
    jieba.initialize()
  sess = tf.Session()
  src_vocab_dict, _ = data_utils.read_map(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping')
  _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
  model = create_seq2seq(sess, 'TEST')
  model.batch_size = 1
  
  sys.stdout.write("Input sentence: ")
  sys.stdout.flush()
  sentence = sys.stdin.readline()
  if FLAGS.src_word_seg == 'word':
    sentence = ' '.join(jieba.lcut(sentence))
    print('sentence: ', sentence)
  elif FLAGS.src_word_seg == 'char':
    sentence = ' '.join(sentence)
  while sentence:
    token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False)
    bucket_id = len(buckets) - 1
    for i, bucket in enumerate(buckets):
      if bucket[0] >= len(token_ids):
        bucket_id = i
        break
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, weight = model.get_batch({bucket_id: [(token_ids, [], "", "")]}, bucket_id)
    # Get output logits for the sentence.
    output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    
    # beam search all
    if model.beam_search:
        if FLAGS.debug:
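            # Regroup the time-major beam logits (one entry per timestep,
            # beam_size candidates each) into one logit sequence per candidate.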
            outs = []
            for _ in range(model.beam_size):
                outs.append([])
   
            for out in output:
                for i,o in enumerate(out):
                    outs[i].append(o)
            outs = np.array(outs)
            #print('outs: ',outs.shape)
            outputss = []
            for out in outs:
                #print('out: ',out.shape)
                outputs = [int(np.argmax(logit)) for logit in out]
                outputss.append(outputs)
    
            for i,outputs in enumerate(outputss):
                sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
                sys_reply = data_utils.sub_words(sys_reply)
                sys_reply = qulify_sentence(sys_reply)
                if i == 0:
                    print(colored("System reply (bs best): " + sys_reply, "red"))
                else:
                    print("System reply (bs all): " + sys_reply)
        else:
            output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output]
            if data_utils.EOS_ID in outputs:
              outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
            sys_reply = data_utils.sub_words(sys_reply)
            sys_reply = qulify_sentence(sys_reply)
            print("Syetem reply(bs best): " + sys_reply)
            

    # MLE
    else:
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        print(output)
        print('output: ', len(output), output.shape, output[0].shape)
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
        sys_reply = data_utils.sub_words(sys_reply)
        sys_reply = qulify_sentence(sys_reply)
        print("Syetem reply(MLE): " + sys_reply)


    # Print out the system reply corresponding to the outputs.
    #print("System reply: " + "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]))
    print("User input  : ", end="")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
      sentence = ' '.join(jieba.lcut(sentence))
      print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
      sentence = ' '.join(sentence)
Example No. 11
import os
import sys
sys.path.append('sentiment_analysis/')
import math
import requests
import tensorflow as tf
from flask import Flask
import data_utils
import seq2seq_model
from sentiment_analysis import run
from sentiment_analysis import dataset
from run import create_seq2seq
from flags import FLAGS, SEED, buckets, replace_words, source_mapping, target_mapping
from utils import qulify_sentence
import jieba
jieba.initialize()

sess = tf.Session()
src_vocab_dict, _ = data_utils.read_map(source_mapping)
_, trg_vocab_dict = data_utils.read_map(target_mapping)
model = create_seq2seq(sess, 'TEST')
model.batch_size = 1

# create a Flask app instance
app = Flask(__name__)

# method to reply to a message from the sender
def reply(user_id, msg):
    data = {
        "recipient": {"id": user_id},
        "message": {"text": msg}
    }
    # Post request using the Facebook Graph API v3.1
    resp = requests.post("https://graph.facebook.com/v3.1/me/messages?access_token=" + ACCESS_TOKEN, json=data)