def test():
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.load_userdict('dict_fasttext.txt')
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
        sentence = ' '.join(jieba.lcut(sentence))
        print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
        sentence = ' '.join([s for s in sentence])
    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False)
        # Pick the smallest bucket that can hold the input; fall back to the largest.
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        # Greedy decoding and printing of the reply happen inside inference().
        inference(model, output, src_vocab_dict, trg_vocab_dict)
        print("User input : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            sentence = ' '.join(jieba.lcut(sentence))
            print('sentence: ', sentence)
        elif FLAGS.src_word_seg == 'char':
            sentence = ' '.join([s for s in sentence])
def test(filename):
    if FLAGS.src_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    # Read the (context, utterance) pairs to score.
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
    for source, target in zip(sources, targets):
        # Segment both sides according to the configured granularity.
        if FLAGS.src_word_seg == 'word':
            source = ' '.join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = ' '.join([s for s in source])
        if FLAGS.trg_word_seg == 'word':
            target = ' '.join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = ' '.join([t for t in target])
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source), src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target), trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        # Pick the smallest bucket that can hold the input; fall back to the
        # largest (the original left bucket_id unbound when no bucket fit).
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        # Score the reference utterance: average the model's output value for
        # each reference token at the corresponding decoding step.
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)[:trg_len]
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        scores.append(np.mean(output))
    return np.mean(scores)
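# A minimal usage sketch (an assumption, not repo code): this test() expects a
# CSV with "context" and "utterance" columns and returns the mean per-token
# score the model assigns to the reference utterances. "val_pairs.csv" is a
# hypothetical filename.
#
#     score = test('val_pairs.csv')
#     print('mean utterance score: %.4f' % score)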
def val(mo):
    d_valid = data_utils.read_val_data(FLAGS.source_data + '_val.token',
                                       FLAGS.target_data + '_val.token', buckets)
    _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
    print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))
    with tf.Session() as sess:
        model = create_seq2seq(sess, 'TEST')
        cf = csv.writer(open(FLAGS.output, 'w'), delimiter='|')
        cf.writerow(['context', 'utterance'])
        for i in range(len(d_valid)):
            encoder_input, decoder_input, weight, en_s, de_s = model.get_one(d_valid, i, sen=True)
            output = model.run(sess, encoder_input, decoder_input, weight, d_valid[i][0])
            cf.writerow([''.join(en_s.strip().split()), _output(output[0], trg_vocab_list)])
            if i % 1000 == 0:
                print('Generated {} ...'.format(i))
def test():
    sess = tf.Session()
    vocab_dict, vocab_list = data_utils.read_map(FLAGS.source_data_dir + '.' + str(FLAGS.vocab_size) + '.mapping')
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), vocab_dict, False)
        # Pick the smallest bucket that can hold the input; fall back to the largest.
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        # This is a greedy decoder: outputs are just argmaxes of the logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        print("System reply: " + " ".join(
            [tf.compat.as_str(vocab_list[output]) for output in outputs]))
        print("User input : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
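# A minimal, self-contained sketch (assumption, not repo code) of the bucket
# selection used above: pick the smallest bucket whose encoder length can hold
# the tokenized input. The bucket sizes and token ids here are hypothetical.
example_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
token_ids_demo = [4, 8, 15, 16, 23, 42, 7]  # 7 tokens

bucket_id_demo = len(example_buckets) - 1   # default: largest bucket
for i, bucket in enumerate(example_buckets):
    if bucket[0] >= len(token_ids_demo):
        bucket_id_demo = i
        break
print(bucket_id_demo)  # -> 1: (10, 15) is the smallest bucket with encoder size >= 7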
import math
import os
import sys
sys.path.append('sentiment_analysis/')

import numpy as np
import requests
import tensorflow as tf
from flask import Flask

import data_utils
import seq2seq_model
from sentiment_analysis import run
from sentiment_analysis import dataset
from flags import FLAGS
from run import replace_words, SEED, buckets, sub_words, create_seq2seq
from util import *

sess = tf.Session()
vocab_dict, vocab_list = data_utils.read_map(FLAGS.source_data_dir + '.' + str(FLAGS.vocab_size) + '.mapping')
model = create_seq2seq(sess, 'TEST')
model.batch_size = 1

# create a Flask app instance
app = Flask(__name__)

# method to reply to a message from the sender
def reply(user_id, msg):
    data = {"recipient": {"id": user_id}, "message": {"text": msg}}
    # Post request using the Facebook Graph API v3.1.
    # ACCESS_TOKEN is expected to be defined elsewhere (e.g. loaded from config).
    resp = requests.post(
        "https://graph.facebook.com/v3.1/me/messages?access_token=" + ACCESS_TOKEN,
        json=data)
def RL_readmap(self, src_map_path, trg_map_path):
    self.src_vocab_dict, self.src_vocab_list = data_utils.read_map(src_map_path)
    self.trg_vocab_dict, self.trg_vocab_list = data_utils.read_map(trg_map_path)
def get_output_dfs_per_model(model_dict, params, use_current_model=False,
                             export_each=True, pretrain_vec='fasttext'):
    if use_current_model:
        source_mapping = '%s.%s.mapping' % (FLAGS.source_data, FLAGS.src_vocab_size)
        target_mapping = '%s.%s.mapping' % (FLAGS.target_data, FLAGS.trg_vocab_size)
        source_data = FLAGS.source_data
        target_data = FLAGS.target_data
    else:
        source_data = 'corpus/%s/source' % model_dict['corpus']
        target_data = 'corpus/%s/target' % model_dict['corpus']
        source_mapping = '%s.%s.mapping' % (source_data, model_dict['source_token'])
        target_mapping = '%s.%s.mapping' % (target_data, model_dict['target_token'])
    model_name = model_dict['name']
    FLAGS.source_data = source_data
    FLAGS.target_data = target_data
    FLAGS.src_vocab_size = model_dict['source_token']
    FLAGS.trg_vocab_size = model_dict['target_token']
    if FLAGS.mode == "MLE":
        FLAGS.model_dir = os.path.join('model/', model_name)
    elif FLAGS.mode == "RL":
        FLAGS.model_rl_dir = os.path.join('model_RL/', model_name)
    if pretrain_vec == 'fasttext':
        fasttext_npy = 'corpus/%s/fasttext.npy' % model_dict['corpus']
        FLAGS.pretrain_vec = np.load(fasttext_npy)
    print('#' * 72)
    if FLAGS.mode == "MLE":
        print('model_dir: ', FLAGS.model_dir)
    elif FLAGS.mode == "RL":
        print('model_rl_dir: ', FLAGS.model_rl_dir)
    print('fasttext_npy: ', fasttext_npy)
    print('#' * 72)
    d_valid = data_utils.read_data(source_data + '_val.token', target_data + '_val.token', buckets)
    # Build a reference frame of (id, source, target) rows, one per example.
    dfs = []
    DF = namedtuple("DF", ("id", "source", "target"))
    for bucket_id, d_val in enumerate(d_valid):
        for i, d in enumerate(d_val):
            source, target = (None, None)
            try:
                source = ''.join(data_utils.token_to_text(d[0], source_mapping))
            except IndexError:
                pass
            try:
                target = ''.join(data_utils.token_to_text(d[1], target_mapping))
                target = re.sub('EOS', '', target)
            except IndexError:
                pass
            dfs.append(DF("%s_%s" % (bucket_id, i), source, target))
    dfs = pd.DataFrame(dfs)
    dfs = [dfs]
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    params = get_all_products(params)
    tf.reset_default_graph()
    with tf.Session() as sess:
        model = create_seq2seq(sess, 'TEST')
        model.batch_size = batch_size
        for param in params:
            print('param: ', param)
            df = get_output_df(sess, param, d_valid, model, src_vocab_dict, trg_vocab_dict)
            dfs.append(df)
    # Join the reference frame with one output frame per parameter setting
    # (reduce is functools.reduce on Python 3).
    dfs = map(lambda x: x.set_index('id'), dfs)
    dfs = reduce(lambda x, y: x.join(y), dfs)
    dfs = dfs.reset_index()
    if export_each:
        o_file = '%s.csv' % model_dict['name']
        o_file = re.sub("/", "_", o_file)
        dfs.to_csv(os.path.join(outputs_dir, o_file), index=False)
    return dfs
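# The shape of model_dict, inferred from the keys the function reads above.
# The values here are hypothetical examples, not from the repo:
example_model_dict = {
    "corpus": "gossiping",      # -> corpus/gossiping/{source,target}
    "name": "gossiping_40000",  # checkpoint directory under model/ or model_RL/
    "source_token": 40000,      # source vocabulary size
    "target_token": 40000,      # target vocabulary size
}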
def RL_readmap(self, map_path):
    self.vocab_dict, self.vocab_list = data_utils.read_map(map_path)
def train_MLE():
    data_utils.prepare_whole_data(FLAGS.data, FLAGS.data_test,
                                  FLAGS.source_data, FLAGS.target_data,
                                  FLAGS.src_vocab_size, FLAGS.trg_vocab_size)
    _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
    d_train = data_utils.read_data(FLAGS.source_data + '_train.token', FLAGS.target_data + '_train.token', buckets)
    d_valid = data_utils.read_data(FLAGS.source_data + '_val.token', FLAGS.target_data + '_val.token', buckets)
    print('Total document size of training data: %s' % sum(len(l) for l in d_train))
    print('Total document size of validation data: %s' % sum(len(l) for l in d_valid))

    # Cumulative bucket-size distribution, used to sample buckets in
    # proportion to how much data they hold.
    train_bucket_sizes = [len(d_train[b]) for b in range(len(d_train))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]
    print('train_bucket_sizes: ', train_bucket_sizes)
    print('train_total_size: ', train_total_size)
    print('train_buckets_scale: ', train_buckets_scale)

    valid_bucket_sizes = [len(d_valid[b]) for b in range(len(d_valid))]
    valid_total_size = float(sum(valid_bucket_sizes))
    valid_buckets_scale = [sum(valid_bucket_sizes[:i + 1]) / valid_total_size
                           for i in range(len(valid_bucket_sizes))]
    print('valid_bucket_sizes: ', valid_bucket_sizes)
    print('valid_total_size: ', valid_total_size)
    print('valid_buckets_scale: ', valid_buckets_scale)

    with tf.Session() as sess:
        model = create_seq2seq(sess, 'MLE')
        if FLAGS.reset_sampling_prob:
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                sess.run(tf.assign(model.sampling_probability, reset_prob))
        if FLAGS.schedule_sampling:
            print('model.sampling_probability: ', model.sampling_probability_clip)

        step = 0
        loss = 0
        loss_list = []
        if FLAGS.schedule_sampling:
            print('sampling_decay_steps: ', FLAGS.sampling_decay_steps)
            print('sampling_probability: ', sess.run(model.sampling_probability_clip))
            print('-----')
        while step < FLAGS.max_step:
            step += 1
            # Sample a bucket with probability proportional to its size, using
            # the cumulative distribution built above.
            random_number = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number])
            encoder_input, decoder_input, weight, en_s, de_s = model.get_batch(d_train, bucket_id, sen=True)
            output, loss_train, _ = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
            loss += loss_train / FLAGS.check_step

            if step % FLAGS.print_step == 0:
                print('Input :')
                print(en_s[0].strip())
                print('Output:')
                print(_output(output[0], trg_vocab_list))
                print('\n{} steps trained ...\n\n'.format(step))

            if step % FLAGS.check_step == 0:
                print('\nStep %s, Training perplexity: %s, Learning rate: %s'
                      % (step, math.exp(loss), sess.run(model.learning_rate)))
                for i in range(len(d_train)):
                    encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
                    _, loss_valid = model.run(sess, encoder_input, decoder_input, weight, i,
                                              forward_only=True)
                    print('  Validation perplexity in bucket %s: %s' % (i, math.exp(loss_valid)))
                # Decay the learning rate if the loss stopped improving;
                # otherwise advance the scheduled-sampling decay.
                if len(loss_list) > 2 and loss > max(loss_list[-3:]):
                    sess.run(model.learning_rate_decay)
                elif step != 0 and FLAGS.schedule_sampling:
                    sess.run(model.sampling_probability_decay)
                    print('sampling_probability: ', sess.run(model.sampling_probability_clip))
                loss_list.append(loss)
                loss = 0
                checkpoint_path = os.path.join(FLAGS.model_pre_dir, "MLE.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=step)
                print('Saving model at step %s\n' % step)
            if step == FLAGS.sampling_global_step:
                break
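# A minimal, self-contained sketch (assumption, not repo code) of the
# cumulative-distribution trick train_MLE() uses to pick a bucket in
# proportion to its size. The bucket sizes below are made up.
import numpy as np

bucket_sizes = [300, 500, 200]                        # examples per bucket
total = float(sum(bucket_sizes))
buckets_scale = [sum(bucket_sizes[:i + 1]) / total    # cumulative shares
                 for i in range(len(bucket_sizes))]   # -> [0.3, 0.8, 1.0]

r = np.random.random_sample()                         # uniform in [0, 1)
bucket_id = min(i for i in range(len(buckets_scale)) if buckets_scale[i] > r)
# bucket_id == 0 with p = 0.3, 1 with p = 0.5, 2 with p = 0.2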
def test():
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.initialize()
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping')
    _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
        sentence = ' '.join(jieba.lcut(sentence))
        print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
        sentence = ' '.join([s for s in sentence])
    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False)
        # Pick the smallest bucket that can hold the input; fall back to the largest.
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [], "", "")]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        if model.beam_search:
            if FLAGS.debug:
                # Regroup the per-step logits into one sequence per beam,
                # then greedily decode and print every beam.
                outs = [[] for _ in range(model.beam_size)]
                for out in output:
                    for i, o in enumerate(out):
                        outs[i].append(o)
                outs = np.array(outs)
                outputss = []
                for out in outs:
                    outputs = [int(np.argmax(logit)) for logit in out]
                    outputss.append(outputs)
                for i, outputs in enumerate(outputss):
                    sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
                    sys_reply = data_utils.sub_words(sys_reply)
                    sys_reply = qulify_sentence(sys_reply)
                    if i == 0:
                        print(colored("System reply (beam best): " + sys_reply, "red"))
                    else:
                        print("System reply (beam all): " + sys_reply)
            else:
                # Only the best beam: greedy argmax, cut at the first EOS.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
                sys_reply = data_utils.sub_words(sys_reply)
                sys_reply = qulify_sentence(sys_reply)
                print("System reply (beam best): " + sys_reply)
        else:
            # MLE model: greedy decoding, argmax of the logits at every step.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
            sys_reply = data_utils.sub_words(sys_reply)
            sys_reply = qulify_sentence(sys_reply)
            print("System reply (MLE): " + sys_reply)
        print("User input : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            sentence = ' '.join(jieba.lcut(sentence))
            print('sentence: ', sentence)
        elif FLAGS.src_word_seg == 'char':
            sentence = ' '.join([s for s in sentence])
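# The three decode branches above repeat the same argmax -> EOS-truncate ->
# join sequence. A helper along these lines (an assumed refactoring, not repo
# code; np and tf are the module's existing imports) would remove the
# duplication:
def greedy_decode(logits, vocab_list, eos_id):
    """Map per-step logit arrays to a string, cutting at the first EOS."""
    ids = [int(np.argmax(logit, axis=1)) for logit in logits]
    if eos_id in ids:
        ids = ids[:ids.index(eos_id)]
    return "".join(tf.compat.as_str(vocab_list[i]) for i in ids)

# usage inside the loop:
#     sys_reply = greedy_decode(output, trg_vocab_list, data_utils.EOS_ID)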
import math
import os
import sys
sys.path.append('sentiment_analysis/')

import requests
import tensorflow as tf
from flask import Flask

import data_utils
import seq2seq_model
from sentiment_analysis import run
from sentiment_analysis import dataset
from run import create_seq2seq
from flags import FLAGS, SEED, buckets, replace_words, source_mapping, target_mapping
from utils import qulify_sentence

import jieba
jieba.initialize()

sess = tf.Session()
src_vocab_dict, _ = data_utils.read_map(source_mapping)
_, trg_vocab_dict = data_utils.read_map(target_mapping)
model = create_seq2seq(sess, 'TEST')
model.batch_size = 1

# create a Flask app instance
app = Flask(__name__)

# method to reply to a message from the sender
def reply(user_id, msg):
    data = {
        "recipient": {"id": user_id},
        "message": {"text": msg}
    }
    # Post request using the Facebook Graph API v3.1.
    # ACCESS_TOKEN is expected to be defined elsewhere (e.g. loaded from config).
    resp = requests.post(
        "https://graph.facebook.com/v3.1/me/messages?access_token=" + ACCESS_TOKEN,
        json=data)
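# A minimal webhook sketch (an assumption; the repo's actual route is not
# shown in this snippet) for wiring incoming Messenger events to reply().
# The /webhook path is hypothetical; the payload shape follows the standard
# Messenger Platform webhook format.
from flask import request

@app.route("/webhook", methods=["POST"])
def webhook():
    payload = request.get_json()
    for entry in payload.get("entry", []):
        for event in entry.get("messaging", []):
            if "message" in event and "text" in event["message"]:
                reply(event["sender"]["id"], event["message"]["text"])
    return "ok"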