def handle_incoming_messages():
    """Answer one incoming webhook event with a model-generated reply.

    Reads the JSON body of the current request, takes the sender id and the
    text of the first messaging event of the first entry, decodes a reply
    greedily with the module-level model, post-processes it and sends it
    back through ``reply``.

    Events that carry no ``message.text`` (or a request without a JSON body)
    are acknowledged without replying instead of raising.

    Returns:
        The literal string "ok" in every handled case.
    """
    data = request.json
    try:
        event = data['entry'][0]['messaging'][0]
        sender = event['sender']['id']
        message = event['message']['text']
    except (KeyError, IndexError, TypeError):
        # Not a plain text message (or no JSON payload at all): there is
        # nothing to feed the model, so just acknowledge the event.
        return "ok"

    # Character-level segmentation: one space between every character.
    sentence = ' '.join(message)
    token_ids = data_utils.convert_to_token(
        tf.compat.as_bytes(sentence), vocab_dict, False)

    # Use the first bucket whose encoder length fits the input; fall back
    # to the last bucket when the input is longer than all of them.
    bucket_id = len(buckets) - 1
    for i, bucket in enumerate(buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break

    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, weight = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)

    # Greedy decoding: take the argmax at every step.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]

    message = "".join(
        [tf.compat.as_str(vocab_list[token]) for token in outputs])
    message = sub_words(message)
    message = qulify_sentence(message)
    reply(sender, message)
    return "ok"
def test(filename):
    """Score the model against reference replies stored in a CSV file.

    Every row's "context" is segmented, tokenised and fed to the model.  For
    each decoding step covered by the reference "utterance", the value the
    model output holds at the reference token id is collected; these values
    are averaged per row and then over all rows.

    Args:
        filename: Path of a CSV file with "context" and "utterance" columns.
            Missing cells are treated as empty strings.

    Returns:
        The mean over all rows of the per-row mean score.
    """
    # jieba is needed as soon as either side uses word segmentation, not
    # only the source side.
    if 'word' in (FLAGS.src_word_seg, FLAGS.trg_word_seg):
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")

    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])

    scores = []
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = ' '.join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = ' '.join(source)
        if FLAGS.trg_word_seg == 'word':
            target = ' '.join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = ' '.join(target)

        src_token_ids = data_utils.convert_to_token(
            tf.compat.as_bytes(source), src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(
            tf.compat.as_bytes(target), trg_vocab_dict, False)
        trg_len = len(trg_token_ids)

        # Default to the largest bucket so that an over-long source neither
        # leaves bucket_id unbound nor reuses the previous row's bucket.
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break

        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        # Keep only the steps that have a reference token to compare with.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)[:trg_len]
        # Value assigned to the reference token at each step (batch item 0).
        step_scores = [o[0][t] for t, o in zip(trg_token_ids, output)]
        # NOTE(review): a row whose target yields no tokens would contribute
        # nan here - confirm whether such rows should be skipped instead.
        scores.append(np.mean(step_scores))

    return np.mean(scores)
def test():
    """Run an interactive console session against the trained model.

    Sentences are read from stdin one line at a time, segmented according
    to ``FLAGS.src_word_seg``, encoded, run through the model and handed to
    ``inference`` for decoding.  The session ends on an empty read (EOF).
    """
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.load_userdict('dict_fasttext.txt')

    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    def next_sentence():
        # Flush the pending prompt, read one line and segment it.
        sys.stdout.flush()
        line = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            line = ' '.join(jieba.lcut(line))
            print('sentence: ', line)
        elif FLAGS.src_word_seg == 'char':
            line = ' '.join(line)
        return line

    sys.stdout.write("Input sentence: ")
    sentence = next_sentence()
    while sentence:
        token_ids = data_utils.convert_to_token(
            tf.compat.as_bytes(sentence), src_vocab_dict, False)

        # First bucket large enough for the input, else the last bucket.
        bucket_id = next(
            (idx for idx, bucket in enumerate(buckets)
             if bucket[0] >= len(token_ids)),
            len(buckets) - 1)

        # Build a single-example batch and fetch the model output for it.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)

        # Decoding and printing of the reply is delegated to inference().
        inference(model, output, src_vocab_dict, trg_vocab_dict)

        print("User input : ", end="")
        sentence = next_sentence()
def test():
    """Run an interactive console session using a single shared vocabulary.

    Each line read from stdin is tokenised, run through the model, decoded
    greedily (argmax per step, truncated at the first EOS) and printed.
    The session ends on an empty read (EOF).
    """
    sess = tf.Session()
    mapping_path = (FLAGS.source_data_dir + '.' + str(FLAGS.vocab_size) +
                    '.mapping')
    vocab_dict, vocab_list = data_utils.read_map(mapping_path)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    # readline() returns '' only at EOF, which terminates the iteration.
    for sentence in iter(sys.stdin.readline, ''):
        token_ids = data_utils.convert_to_token(
            tf.compat.as_bytes(sentence), vocab_dict, False)

        # First bucket large enough for the input, else the last bucket.
        bucket_id = next(
            (idx for idx, bucket in enumerate(buckets)
             if bucket[0] >= len(token_ids)),
            len(buckets) - 1)

        # Build a single-example batch and fetch the output logits.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)

        # Greedy decoding, cut at the first EOS symbol if there is one.
        decoded = [int(np.argmax(logit, axis=1)) for logit in output]
        if data_utils.EOS_ID in decoded:
            decoded = decoded[:decoded.index(data_utils.EOS_ID)]

        words = [tf.compat.as_str(vocab_list[token]) for token in decoded]
        print("Syetem reply: " + " ".join(words))

        print("User input : ", end="")
        sys.stdout.flush()
def run(self, sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False, X=None, Y=None):
    """Run one step of the model in the mode stored in ``self.mode``.

    Args:
        sess: Session used to execute the graph.
        encoder_inputs: Per-step encoder inputs, indexed by time step.
        decoder_inputs: Per-step decoder inputs, indexed by time step.
        target_weights: Per-step target weights, indexed by time step.
        bucket_id: Index into ``self.buckets`` selecting the graph to run.
        forward_only: In 'MLE' mode, skip the update op when true.
        X: Passed through to ``self.prob`` when computing RL rewards.
        Y: Callable used for the third RL reward; called with the decoded
            words and an int32 array holding the sequence length.

    Returns:
        The list returned by ``sess.run`` for the mode-specific fetches:
        outputs and loss (plus the update op result when training) for
        'MLE' and 'RL', outputs only for 'TEST'.  Any other mode falls
        through and returns None.
    """
    if self.mode == 'TEST':
        encoder_size = self.buckets[bucket_id][0]
        # Decoding always uses the decoder length of the last bucket; the
        # first decoder input / weight is repeated for every step.
        decoder_size = self.buckets[-1][-1]
        decoder_inputs = np.reshape(
            np.repeat(decoder_inputs[0], decoder_size), (-1, 1))
        target_weights = np.reshape(
            np.repeat(target_weights[0], decoder_size), (-1, 1))
    else:
        encoder_size, decoder_size = self.buckets[bucket_id]

    # Map every placeholder name to the value for its time step.
    input_feed = {}
    for step in range(encoder_size):
        input_feed[self.encoder_inputs[step].name] = encoder_inputs[step]
    for step in range(decoder_size):
        input_feed[self.decoder_inputs[step].name] = decoder_inputs[step]
        input_feed[self.target_weights[step].name] = target_weights[step]

    # One extra decoder input past the bucket length is fed with zeros.
    last_target = self.decoder_inputs[decoder_size].name
    input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)

    if self.mode == 'MLE':
        output_feed = [self.outputs[bucket_id], self.losses[bucket_id]]
        if not forward_only:
            output_feed.append(self.update[bucket_id])
        return sess.run(output_feed, input_feed)

    elif self.mode == 'TEST':
        return sess.run([self.outputs[bucket_id]], input_feed)

    elif self.mode == 'RL':
        # loop_or_not: True to sample, False to read from the decoder input.
        input_feed[self.loop_or_not] = True

        # Step 1: get the sampled output of the seq2seq model.
        sampled = sess.run([self.RL_index[bucket_id]], input_feed)

        # Step 2: compute a reward for every sampled sequence.
        reward = np.ones((self.batch_size), dtype=np.float32)
        new_data = []
        for i in range(self.batch_size):
            # token_ids are what tf.multinomial sampled.
            token_ids = list(sampled[0][i])
            if data_utils.EOS_ID in token_ids:
                token_ids = token_ids[:token_ids.index(data_utils.EOS_ID)]
            new_data.append(([], token_ids + [data_utils.EOS_ID], "", ""))

            # In this case, X is the language model score.
            # Reward 1: ease of answering.
            dull_scores = []
            for sen in self.dummy_reply:
                dull_ids = data_utils.convert_to_token(
                    tf.compat.as_bytes(sen), self.trg_vocab_dict, False)
                dull_ids = dull_ids + [data_utils.EOS_ID]
                dull_scores.append(
                    self.prob(token_ids, dull_ids, X, bucket_id) /
                    float(len(sen)))
            r1 = -np.mean(dull_scores)

            # Reward 2: semantic coherence.
            r_input = list(reversed([o[i] for o in encoder_inputs]))
            if data_utils.EOS_ID in r_input:
                r_input = r_input[:r_input.index(data_utils.EOS_ID)]
            if r_input == []:
                r_input = [data_utils.EOS_ID]
            if len(token_ids) != 0:
                r2 = self.prob(r_input, token_ids, X, bucket_id) / float(
                    len(token_ids))
            else:
                r2 = 0

            # Reward 3: sentiment analysis score.
            word_token = [
                self.trg_vocab_list[token].decode('utf-8')
                for token in token_ids
            ]
            r3 = Y(word_token, np.array([len(token_ids)], dtype=np.int32))[0]

            reward[i] = r1 * self.r1 + r2 * self.r2 + r3 * self.r3

        # Advantage: subtract the batch mean.
        reward = reward - np.mean(reward)

        _, decoder_inputs, target_weights = self.get_batch(
            {bucket_id: new_data}, bucket_id, rand=False)

        # Step 3: update the seq2seq model using the sampled sequences as
        # decoder inputs.
        for step in range(decoder_size):
            input_feed[self.decoder_inputs[step].name] = decoder_inputs[step]
            input_feed[self.target_weights[step].name] = target_weights[step]
        input_feed[self.reward] = reward
        input_feed[self.loop_or_not] = False

        output_feed = [
            self.outputs[bucket_id], self.losses[bucket_id],
            self.update[bucket_id]
        ]
        return sess.run(output_feed, input_feed)
def test():
    """Run an interactive console session with optional beam-search output.

    Each line read from stdin is segmented according to
    ``FLAGS.src_word_seg``, encoded and run through the model once.  The
    output is then rendered in one of three ways:

    * beam search with ``FLAGS.debug``: every beam is decoded and printed,
      the first one highlighted in red;
    * beam search without debug: a single greedy reply cut at EOS;
    * otherwise (MLE): the raw output is dumped for debugging, followed by
      a single greedy reply cut at EOS.

    The session ends on an empty read (EOF).
    """
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.initialize()

    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(
        FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping')
    _, trg_vocab_list = data_utils.read_map(
        FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    def read_sentence():
        # Flush the pending prompt, read one line and segment it.
        sys.stdout.flush()
        line = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            line = ' '.join(jieba.lcut(line))
            print('sentence: ', line)
        elif FLAGS.src_word_seg == 'char':
            line = ' '.join(line)
        return line

    def greedy_ids(logits):
        # Argmax token per step, truncated at the first EOS symbol.
        ids = [int(np.argmax(logit, axis=1)) for logit in logits]
        if data_utils.EOS_ID in ids:
            ids = ids[:ids.index(data_utils.EOS_ID)]
        return ids

    def to_reply(ids):
        # Token ids -> post-processed reply string.
        text = "".join([tf.compat.as_str(trg_vocab_list[i]) for i in ids])
        text = data_utils.sub_words(text)
        return qulify_sentence(text)

    sys.stdout.write("Input sentence: ")
    sentence = read_sentence()
    while sentence:
        token_ids = data_utils.convert_to_token(
            tf.compat.as_bytes(sentence), src_vocab_dict, False)

        # First bucket large enough for the input, else the last bucket.
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break

        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [], "", "")]}, bucket_id)
        # Run the model once; every branch below reuses this result rather
        # than calling model.run again with the same arguments.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)

        if model.beam_search:
            if FLAGS.debug:
                # Regroup the per-step output into one sequence per beam.
                outs = [[] for _ in range(model.beam_size)]
                for step_out in output:
                    for beam, logit in enumerate(step_out):
                        outs[beam].append(logit)
                outs = np.array(outs)

                # No EOS truncation here: the full beams are shown.
                for beam, beam_logits in enumerate(outs):
                    ids = [int(np.argmax(logit)) for logit in beam_logits]
                    sys_reply = to_reply(ids)
                    if beam == 0:
                        print(colored("Syetem reply(bs best): " + sys_reply,
                                      "red"))
                    else:
                        print("Syetem reply(bs all): " + sys_reply)
            else:
                sys_reply = to_reply(greedy_ids(output))
                print("Syetem reply(bs best): " + sys_reply)
        else:
            # MLE: dump the raw output for debugging.  np.shape works for
            # both arrays and (nested) lists, unlike the .shape attribute.
            print(output)
            print('output: ', len(output), np.shape(output),
                  np.shape(output[0]))
            sys_reply = to_reply(greedy_ids(output))
            print("Syetem reply(MLE): " + sys_reply)

        print("User input : ")
        sentence = read_sentence()