def initial_setup(data_corpus):
    metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format(data_corpus))
    (trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
    trainX = tl.prepro.remove_pad_sequences(trainX.tolist())
    trainY = tl.prepro.remove_pad_sequences(trainY.tolist())
    testX = tl.prepro.remove_pad_sequences(testX.tolist())
    testY = tl.prepro.remove_pad_sequences(testY.tolist())
    validX = tl.prepro.remove_pad_sequences(validX.tolist())
    validY = tl.prepro.remove_pad_sequences(validY.tolist())
    return metadata, trainX, trainY, testX, testY, validX, validY
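# Minimal usage sketch for initial_setup() above. It assumes `from data.twitter import data`
# and `import tensorlayer as tl` as in the surrounding snippets, and that the pre-processed
# Twitter corpus sits under data/twitter/; the printed counts are illustrative only.
metadata, trainX, trainY, testX, testY, validX, validY = initial_setup("twitter")
print(len(trainX), "training pairs,", len(metadata['idx2w']), "vocabulary entries")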
def __init__(self):
    data_corpus = "twitter"
    metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format(data_corpus))
    src_vocab_size = len(metadata['idx2w'])  # 8002 (0~8001)
    emb_dim = 1024
    word2idx = metadata['w2idx']  # dict word 2 index
    idx2word = metadata['idx2w']  # list index 2 word
    unk_id = word2idx['unk']  # 1
    pad_id = word2idx['_']  # 0
    start_id = src_vocab_size  # 8002
    end_id = src_vocab_size + 1  # 8003
    word2idx.update({'start_id': start_id})
    word2idx.update({'end_id': end_id})
    idx2word = idx2word + ['start_id', 'end_id']
    src_vocab_size = tgt_vocab_size = src_vocab_size + 2
    # num_epochs = 5
    vocabulary_size = src_vocab_size
    decoder_seq_length = 20
    self.unk_id = unk_id
    self.pad_id = pad_id
    self.start_id = start_id
    self.end_id = end_id
    self.word2idx = word2idx
    self.idx2word = idx2word
    self.model_ = Seq2seq(
        decoder_seq_length=decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=256,
        embedding_layer=tl.layers.Embedding(
            vocabulary_size=vocabulary_size, embedding_size=emb_dim),
    )
    load_weights = tl.files.load_npz(name='model.npz')
    tl.files.assign_weights(load_weights, self.model_)
def load_data(path):
    metadata, idx_q, idx_a = data.load_data(path)
    (trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
    trainX = trainX.tolist()
    trainY = trainY.tolist()
    testX = testX.tolist()
    testY = testY.tolist()
    validX = validX.tolist()
    validY = validY.tolist()
    trainX = tl.prepro.remove_pad_sequences(trainX)
    trainY = tl.prepro.remove_pad_sequences(trainY)
    testX = tl.prepro.remove_pad_sequences(testX)
    testY = tl.prepro.remove_pad_sequences(testY)
    validX = tl.prepro.remove_pad_sequences(validX)
    validY = tl.prepro.remove_pad_sequences(validY)
    return trainX, trainY, testX, testY, validX, validY, metadata
#! /usr/bin/python
# -*- coding: utf8 -*-
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
import numpy as np
import time

###============= prepare data
from data.twitter import data
metadata, idx_q, idx_a = data.load_data('data/twitter/')  # Twitter
# from data.cornell_corpus import data
# metadata, idx_q, idx_a = data.load_data(PATH='data/cornell_corpus/')  # Cornell Movie
(trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
trainX = trainX.tolist()
trainY = trainY.tolist()
testX = testX.tolist()
testY = testY.tolist()
validX = validX.tolist()
validY = validY.tolist()
trainX = tl.prepro.remove_pad_sequences(trainX)
trainY = tl.prepro.remove_pad_sequences(trainY)
testX = tl.prepro.remove_pad_sequences(testX)
testY = tl.prepro.remove_pad_sequences(testY)
validX = tl.prepro.remove_pad_sequences(validX)
validY = tl.prepro.remove_pad_sequences(validY)
References
----------
http://suriyadeepan.github.io/2016-12-31-practical-seq2seq/
"""
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
import numpy as np
import time

###============= prepare data
from data.twitter import data
metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')  # Twitter
# from data.cornell_corpus import data
# metadata, idx_q, idx_a = data.load_data(PATH='data/cornell_corpus/')  # Cornell Movie
(trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
trainX = trainX.tolist()
trainY = trainY.tolist()
testX = testX.tolist()
testY = testY.tolist()
validX = validX.tolist()
validY = validY.tolist()
trainX = tl.prepro.remove_pad_sequences(trainX)
trainY = tl.prepro.remove_pad_sequences(trainY)
testX = tl.prepro.remove_pad_sequences(testX)
testY = tl.prepro.remove_pad_sequences(testY)
def main():
    metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')  # Twitter
    # from data.cornell_corpus import data
    # metadata, idx_q, idx_a = data.load_data(PATH='data/cornell_corpus/')  # Cornell Movie
    (trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
    trainX = trainX.tolist()
    trainY = trainY.tolist()
    testX = testX.tolist()
    testY = testY.tolist()
    validX = validX.tolist()
    validY = validY.tolist()
    trainX = tl.prepro.remove_pad_sequences(trainX)
    trainY = tl.prepro.remove_pad_sequences(trainY)
    testX = tl.prepro.remove_pad_sequences(testX)
    testY = tl.prepro.remove_pad_sequences(testY)
    validX = tl.prepro.remove_pad_sequences(validX)
    validY = tl.prepro.remove_pad_sequences(validY)

    ###============= parameters
    xseq_len = len(trainX)  # .shape[-1]
    yseq_len = len(trainY)  # .shape[-1]
    assert xseq_len == yseq_len
    batch_size = 32
    n_step = int(xseq_len / batch_size)
    xvocab_size = len(metadata['idx2w'])  # 8002 (0~8001)
    emb_dim = 1024

    global w2idx
    global idx2w
    global encode_seqs2
    global decode_seqs2
    global start_id
    global end_id

    w2idx = metadata['w2idx']  # dict word 2 index
    idx2w = metadata['idx2w']  # list index 2 word
    unk_id = w2idx['unk']  # 1
    pad_id = w2idx['_']  # 0
    start_id = xvocab_size  # 8002
    end_id = xvocab_size + 1  # 8003
    w2idx.update({'start_id': start_id})
    w2idx.update({'end_id': end_id})
    idx2w = idx2w + ['start_id', 'end_id']
    xvocab_size = yvocab_size = xvocab_size + 2

    """
    Data for Seq2Seq should look like this:
    input_seqs  : ['how', 'are', 'you', '<PAD_ID>']
    decode_seqs : ['<START_ID>', 'I', 'am', 'fine', '<PAD_ID>']
    target_seqs : ['I', 'am', 'fine', '<END_ID>', '<PAD_ID>']
    target_mask : [1, 1, 1, 1, 0]
    """
    print("encode_seqs", [idx2w[id] for id in trainX[10]])
    target_seqs = tl.prepro.sequences_add_end_id([trainY[10]], end_id=end_id)[0]
    # target_seqs = tl.prepro.remove_pad_sequences([target_seqs], pad_id=pad_id)[0]
    print("target_seqs", [idx2w[id] for id in target_seqs])
    decode_seqs = tl.prepro.sequences_add_start_id([trainY[10]], start_id=start_id, remove_last=False)[0]
    # decode_seqs = tl.prepro.remove_pad_sequences([decode_seqs], pad_id=pad_id)[0]
    print("decode_seqs", [idx2w[id] for id in decode_seqs])
    target_mask = tl.prepro.sequences_get_mask([target_seqs])[0]
    print("target_mask", target_mask)
    print(len(target_seqs), len(decode_seqs), len(target_mask))

    ###============= model
    global net_rnn

    def model(encode_seqs, decode_seqs, is_train=True, reuse=False):
        with tf.variable_scope("model", reuse=reuse):
            # For a chatbot you can share one embedding layer;
            # for translation you may want two separate embedding layers.
            with tf.variable_scope("embedding") as vs:
                net_encode = EmbeddingInputlayer(
                    inputs=encode_seqs,
                    vocabulary_size=xvocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')
                vs.reuse_variables()
                tl.layers.set_name_reuse(True)
                net_decode = EmbeddingInputlayer(
                    inputs=decode_seqs,
                    vocabulary_size=xvocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')
            net_rnn = Seq2Seq(
                net_encode, net_decode,
                cell_fn=tf.contrib.rnn.BasicLSTMCell,
                n_hidden=emb_dim,
                initializer=tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length=retrieve_seq_length_op2(encode_seqs),
                decode_sequence_length=retrieve_seq_length_op2(decode_seqs),
                initial_state_encode=None,
                dropout=(0.5 if is_train else None),
                n_layer=3,
                return_seq_2d=True,
                name='seq2seq')
            net_out = DenseLayer(net_rnn, n_units=xvocab_size, act=tf.identity, name='output')
        return net_out, net_rnn

    # model for training
    encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
    decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
    target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
    target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask")  # tl.prepro.sequences_get_mask()
    net_out, _ = model(encode_seqs, decode_seqs, is_train=True, reuse=False)

    # model for inferencing
    encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
    decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")
    net, net_rnn = model(encode_seqs2, decode_seqs2, is_train=False, reuse=True)
    global y
    y = tf.nn.softmax(net.outputs)

    # loss for training
    # print(net_out.outputs)  # (?, 8004)
    # print(target_seqs)      # (32, ?)
    # loss_weights = tf.ones_like(target_seqs, dtype=tf.float32)
    # loss = tf.contrib.legacy_seq2seq.sequence_loss(net_out.outputs, target_seqs, loss_weights, yvocab_size)
    loss = tl.cost.cross_entropy_seq_with_mask(
        logits=net_out.outputs, target_seqs=target_seqs, input_mask=target_mask,
        return_details=False, name='cost')

    net_out.print_params(False)

    lr = 0.0001
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
    # Truncated Backpropagation for training (option)
    # max_grad_norm = 30
    # grads, _ = tf.clip_by_global_norm(tf.gradients(loss, net_out.all_params), max_grad_norm)
    # optimizer = tf.train.GradientDescentOptimizer(lr)
    # train_op = optimizer.apply_gradients(zip(grads, net_out.all_params))

    # sess = tf.InteractiveSession()
    global sess
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
    tl.layers.initialize_global_variables(sess)
    tl.files.load_and_assign_npz(sess=sess, name='n.npz', network=net)

    run(host='0.0.0.0', port=40026)
class AI:
    metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format("twitter"))
    src_vocab_size = len(metadata['idx2w'])
    emb_dim = 1024
    word2idx = metadata['w2idx']
    idx2word = metadata['idx2w']
    unk_id = word2idx['unk']
    pad_id = word2idx['_']
    start_id = src_vocab_size
    end_id = src_vocab_size + 1
    word2idx.update({'start_id': start_id})
    word2idx.update({'end_id': end_id})
    idx2word = idx2word + ['start_id', 'end_id']
    src_vocab_size = src_vocab_size + 2
    vocabulary_size = src_vocab_size
    decoder_seq_length = 20

    # Creates an instance of the AI with its name being passed from a name-generating function
    def __init__(self):
        # preProcessor is only utilised here to avoid duplicating string2array
        self.preProcessor = DataPreProcessor("")
        self.model_ = Seq2seq(
            decoder_seq_length=self.decoder_seq_length,
            cell_enc=tf.keras.layers.GRUCell,
            cell_dec=tf.keras.layers.GRUCell,
            n_layer=3,
            n_units=256,
            embedding_layer=tl.layers.Embedding(
                vocabulary_size=self.vocabulary_size, embedding_size=self.emb_dim),
        )
        load_weights = tl.files.load_npz(name="data/model.npz")
        tl.files.assign_weights(load_weights, self.model_)

    # Take a name, create a mood
    def initialise(self, name):
        self.name = name
        feelings = [
            "good", "well", "great", "grand", "excellent", "ecstatic", "happy",
            "sad", "annoyed", "frustrated", "angry", "tired", "okay", "alright"
        ]
        self.feel = feelings[randint(0, len(feelings) - 1)]

    # Handle the creation of a response from the given input
    def respond(self, seed, number):
        simpleStart = self.simpleResponse(seed)
        self.model_.eval()
        seed_id = [self.word2idx.get(w, self.unk_id) for w in seed.split(" ")]
        sentence_id = self.model_(
            inputs=[[seed_id]], seq_length=20, start_token=self.start_id, top_n=number)
        sentence = []
        for w_id in sentence_id[0]:
            w = self.idx2word[w_id]
            if w == 'end_id':
                break
            sentence = sentence + [w]
        # A catch-all just in case there are no responses, but we have yet to find an input to trigger this
        if sentence == []:
            sentence = [
                "I'm", "sorry,", "I", "just", "don't", "quite", "understand",
                "what", "you're", "asking..."
            ]
        return simpleStart + sentence

    # Handle simple questions that the AI is less than optimal at answering
    def simpleResponse(self, input):
        sentence = []
        input = self.preProcessor.string2Array(input)
        tally = [0, 0, 0]
        greetings = ["hello", "hi", "greetings", "salutations", "hey", "yo", "howdy"]
        names = [["what", "who"], ["is", "are"], ["you", "your"], ["name"]]
        wellbeing = [["how"], ["do", "are"], ["you"], ["doing", "feeling", "feel"]]
        # Tallying key words in the user query to determine if certain questions were being asked
        for x in input:
            for y in range(len(greetings)):
                if x == greetings[y]:
                    tally[0] = 1
                    break
            for y in range(len(names)):
                for z in names[y]:
                    if x == z:  # compare values, not identity
                        tally[1] = tally[1] + 1
                        break
            for y in range(len(wellbeing)):
                for z in wellbeing[y]:
                    if x == z:
                        tally[2] = tally[2] + 1
                        break
        # Handle a return greeting, and maybe ask how the user is
        if tally[0] > 0:
            sentence.append(greetings[randint(0, 6)])
            if randint(0, 1) == 1:
                sentence.append("how")
                sentence.append("are")
                sentence.append("you")
                value = randint(0, 2)
                if value == 0:
                    sentence.append("doing")
                elif value == 1:
                    sentence.append("feeling")
        # Handle questions about its name with a simple answer
        if tally[1] > 2 and len(input) < 5:
            if randint(0, 1) == 1:
                sentence.append("I")
                sentence.append("am")
            else:
                sentence.append("my")
                sentence.append("name")
                sentence.append("is")
            sentence.append(self.name)
        # Handle a 'how are you' type question with a pre-determined emotional state
        if tally[2] > 2 and len(input) < 5:
            sentence.append("I")
            sentence.append("am")
            if randint(0, 1) == 1:
                sentence.append("feeling")
            sentence.append(self.feel)
        return sentence
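# Hedged usage sketch for the AI class above. It assumes `from random import randint`,
# the DataPreProcessor helper used in __init__, and a trained "data/model.npz" are all
# available, and that a name-generating function supplies the name; "Ada" is a placeholder.
bot = AI()
bot.initialise("Ada")
reply = bot.respond("how are you feeling", 3)  # `number` is passed through as top_n sampling
print(" ".join(reply))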
import time

import tensorflow as tf
import tensorlayer as tl
from sklearn.utils import shuffle
from tensorlayer.layers import EmbeddingInputlayer, Seq2Seq, DenseLayer, retrieve_seq_length_op2

from data.twitter import data

metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')
(trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
trainX = trainX.tolist()
trainY = trainY.tolist()
testX = testX.tolist()
testY = testY.tolist()
validX = validX.tolist()
validY = validY.tolist()
trainX = tl.prepro.remove_pad_sequences(trainX)
trainY = tl.prepro.remove_pad_sequences(trainY)
testX = tl.prepro.remove_pad_sequences(testX)
testY = tl.prepro.remove_pad_sequences(testY)
validX = tl.prepro.remove_pad_sequences(validX)
validY = tl.prepro.remove_pad_sequences(validY)

# Hyperparameters
batch_size = 32
embedding_dimension = 1024
learning_rate = 0.0001
number_epochs = 1000
""" import time import tensorflow as tf import tensorlayer as tl import numpy as np from tensorlayer.layers import DenseLayer, EmbeddingInputlayer, Seq2Seq, retrieve_seq_length_op2 from sklearn.utils import shuffle from data.twitter import data # Data Preparation data_corpus = 'twitter' metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format(data_corpus)) (trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a) trainX = trainX.tolist() trainY = trainY.tolist() testX = testX.tolist() testY = testY.tolist() validX = validX.tolist() validY = validY.tolist() trainX = tl.prepro.remove_pad_sequences(trainX) trainY = tl.prepro.remove_pad_sequences(trainY) testX = tl.prepro.remove_pad_sequences(testX) testY = tl.prepro.remove_pad_sequences(testY) validX = tl.prepro.remove_pad_sequences(validX)
References
----------
http://suriyadeepan.github.io/2016-12-31-practical-seq2seq/
"""
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
import numpy as np
import time

###============= prepare data
from data.twitter import data
metadata, idx_q, idx_a = data.load_data(PATH='seq2seq-chatbot/data/fb_chat/')  # Twitter
# from data.cornell_corpus import data
# metadata, idx_q, idx_a = data.load_data(PATH='data/cornell_corpus/')  # Cornell Movie
(trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
trainX = trainX.tolist()
trainY = trainY.tolist()
testX = testX.tolist()
testY = testY.tolist()
validX = validX.tolist()
validY = validY.tolist()
trainX = tl.prepro.remove_pad_sequences(trainX)
trainY = tl.prepro.remove_pad_sequences(trainY)
testX = tl.prepro.remove_pad_sequences(testX)
testY = tl.prepro.remove_pad_sequences(testY)
def initial_setup():
    metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')
    return metadata
        decode_ans = tl.prepro.pad_sequences(decode_ans)
        mask = tl.prepro.sequences_get_mask(label_ans)
        _, error = session.run(
            [self.optimized_rnn, self.loss], {
                encode_q: ques,
                decode_a: decode_ans,
                label_a: label_ans,
                label_mask: mask
            })
        self.total_error += error
        tl.files.save_npz(test_rnn.all_params, 'ChatbotRNN.npz', session)


if __name__ == "__main__":
    frequentWords, questions, answers = data.load_data(PATH='data/twitter/')
    word2id = frequentWords['w2idx']
    id2word = frequentWords['idx2w']
    word2id.update({'<GO>': len(id2word)})
    word2id.update({'<EOS>': len(id2word) + 1})
    size_of_dict = len(id2word) + 2
    QuesTrain, AnsTrain, QuesTest, AnsTest = data.split_dataset(questions, answers)
    removeTrail = removeZeros()
    assert removeTrail == True
    # Shuffle the training split (the original referenced undefined QTrain/ATrain names)
    QuesTrain, AnsTrain = shuffle(QuesTrain, AnsTrain, random_state=0)
    encode_q = tf.placeholder(tf.int64, [cluster_size, None])
    decode_a = tf.placeholder(tf.int64, [cluster_size, None])
def load_data(self):
    metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')
    return metadata
def main():
    metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')
    trainX, trainY, testX, testY, validX, validY = getDataset(idx_q, idx_a)
    xseq_len = len(trainX)
    yseq_len = len(trainY)
    assert xseq_len == yseq_len
    batch_size = 32
    n_step = int(xseq_len / batch_size)
    xvocab_size = len(metadata['idx2w'])
    emb_dim = 1024
    w2idx = metadata['w2idx']
    idx2w = metadata['idx2w']
    unk_id = w2idx['unk']
    pad_id = w2idx['_']
    start_id = xvocab_size
    end_id = xvocab_size + 1
    w2idx.update({'start_id': start_id})
    w2idx.update({'end_id': end_id})
    idx2w = idx2w + ['start_id', 'end_id']
    xvocab_size = yvocab_size = xvocab_size + 2

    target_seqs = tl.prepro.sequences_add_end_id([trainY[10]], end_id=end_id)[0]
    decode_seqs = tl.prepro.sequences_add_start_id([trainY[10]], start_id=start_id, remove_last=False)[0]
    target_mask = tl.prepro.sequences_get_mask([target_seqs])[0]

    encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
    decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
    target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
    target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask")
    net_out, _ = model(encode_seqs, decode_seqs, xvocab_size, is_train=True, reuse=False)

    encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
    decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")
    net, net_rnn = model(encode_seqs2, decode_seqs2, xvocab_size, is_train=False, reuse=True)
    y = tf.nn.softmax(net.outputs)

    loss = tl.cost.cross_entropy_seq_with_mask(
        logits=net_out.outputs, target_seqs=target_seqs, input_mask=target_mask,
        return_details=False, name='cost')
    net_out.print_params(False)

    lr = 0.0001
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    gpu_option = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_option))
    tl.layers.initialize_global_variables(sess)

    load_parameter = tl.files.load_and_assign_npz(sess=sess, name='twitter.npz', network=net)
    if not load_parameter:
        print("Loading npz fail, starting to train.")
        n_epoch = 50
        for epoch in range(n_epoch):
            epoch_time = time.time()
            from sklearn.utils import shuffle
            trainX, trainY = shuffle(trainX, trainY, random_state=0)
            total_err, n_iter = 0, 0
            for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY,
                                               batch_size=batch_size, shuffle=False):
                step_time = time.time()
                X = tl.prepro.pad_sequences(X)
                _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
                _target_seqs = tl.prepro.pad_sequences(_target_seqs)
                _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
                _decode_seqs = tl.prepro.pad_sequences(_decode_seqs)
                _target_mask = tl.prepro.sequences_get_mask(_target_seqs)
                _, err = sess.run(
                    [train_op, loss], {
                        encode_seqs: X,
                        decode_seqs: _decode_seqs,
                        target_seqs: _target_seqs,
                        target_mask: _target_mask
                    })
                if n_iter % 200 == 0:
                    print("Epoch[%d/%d] step:[%d/%d] loss:%f took:%.5fs" %
                          (epoch, n_epoch, n_iter, n_step, err, time.time() - step_time))
                total_err += err
                n_iter += 1
                if n_iter % 1000 == 0:
                    print("Query> happy birthday to you")
                    getReplying(y, w2idx, idx2w, decode_seqs2, encode_seqs2, start_id, end_id,
                                sess, net_rnn, "happy birthday to you")
                    print("Query> help me to do the exam")
                    getReplying(y, w2idx, idx2w, decode_seqs2, encode_seqs2, start_id, end_id,
                                sess, net_rnn, "help me to do the exam")
                    print("Query> ny is so cold now")
                    getReplying(y, w2idx, idx2w, decode_seqs2, encode_seqs2, start_id, end_id,
                                sess, net_rnn, "ny is so cold now")
            print("Epoch[%d/%d] averaged loss:%f took:%.5fs" %
                  (epoch, n_epoch, total_err / n_iter, time.time() - epoch_time))
            tl.files.save_npz(net.all_params, name='n.npz', sess=sess)

    while (True):
        getReplying(y, w2idx, idx2w, decode_seqs2, encode_seqs2, start_id, end_id,
                    sess, net_rnn, input("You>"))
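# getReplying() is called above but not defined in this snippet. Below is a minimal sketch of
# such a top-k decoding helper, assuming the TensorLayer 1.x inference graph built above
# (y, encode_seqs2, decode_seqs2, net_rnn) and that the Seq2Seq layer exposes
# final_state_encode / initial_state_decode / final_state_decode as in the upstream
# TensorLayer chatbot example; treat the exact attribute names and top_k values as assumptions.
def getReplying(y, w2idx, idx2w, decode_seqs2, encode_seqs2, start_id, end_id,
                sess, net_rnn, seed, max_len=30):
    # Map the query to word ids, falling back to the unknown token.
    seed_id = [w2idx.get(w, w2idx['unk']) for w in seed.split(" ")]
    # Encode the query and keep the encoder's final state.
    state = sess.run(net_rnn.final_state_encode, {encode_seqs2: [seed_id]})
    # Feed <start_id> to produce the first word, then feed each sampled word back in.
    o, state = sess.run([y, net_rnn.final_state_decode],
                        {net_rnn.initial_state_decode: state,
                         decode_seqs2: [[start_id]]})
    w_id = tl.nlp.sample_top(o[0], top_k=3)
    sentence = [idx2w[w_id]]
    for _ in range(max_len):
        o, state = sess.run([y, net_rnn.final_state_decode],
                            {net_rnn.initial_state_decode: state,
                             decode_seqs2: [[w_id]]})
        w_id = tl.nlp.sample_top(o[0], top_k=2)
        if w_id == end_id:
            break
        sentence = sentence + [idx2w[w_id]]
    print(" >", " ".join(sentence))
    return sentence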