def get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    """Yield ``(X, Y, Y_ids)`` arrays for every non-empty batch of sentences.

    Batches come from ``_batch(tokenized_dialog, SAMPLES_BATCH_SIZE)``.
    For a filled row ``r``, sentence ``r`` of the batch is the input and
    sentence ``r + 1`` is the target.

    Yields:
        X: float32 array ``(len(batch), INPUT_SEQUENCE_LENGTH,
            TOKEN_REPRESENTATION_SIZE)`` holding the input token vectors,
            reversed along the token axis.
        Y: float32 array ``(len(batch), ANSWER_MAX_TOKEN_LENGTH,
            TOKEN_REPRESENTATION_SIZE)`` holding the target token vectors.
        Y_ids: int32 array ``(len(batch), ANSWER_MAX_TOKEN_LENGTH)`` holding
            ``token_to_index[token]`` for every target token.

    Raises:
        KeyError: if a target token is missing from ``token_to_index``.
    """
    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        batch_size = len(sents_batch)
        X = np.zeros(
            (batch_size, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE),
            dtype=np.float32)
        Y = np.zeros(
            (batch_size, ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE),
            dtype=np.float32)
        Y_ids = np.zeros((batch_size, ANSWER_MAX_TOKEN_LENGTH), dtype=np.int32)

        # One row per even-positioned sentence of the batch.
        # NOTE(review): rows are limited to half the batch while the pairs are
        # consecutive sentences (r, r + 1), so the remaining rows stay all
        # zeros -- confirm this is intended.
        filled_rows = (batch_size + 1) // 2
        for row in range(filled_rows):
            if row == batch_size - 1:
                # A single-sentence batch has no following sentence to pair.
                break

            for pos, token in enumerate(sents_batch[row][:INPUT_SEQUENCE_LENGTH]):
                X[row, pos] = get_token_vector(token, w2v_model)

            for pos, token in enumerate(sents_batch[row + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[row, pos] = get_token_vector(token, w2v_model)
                Y_ids[row, pos] = token_to_index[token]

        # fliplr reverses axis 1, i.e. the token order of every input row.
        yield np.fliplr(X), Y, Y_ids
def get_training_batch(w2v_model_en, w2v_model_de, tokenized_dialog_en, tokenized_dialog_de, token_to_index_de):
    """Yield ``(X, Y)`` embedding arrays for each batch of sentence pairs.

    Batches come from ``_batch(tokenized_dialog_en, tokenized_dialog_de,
    SAMPLES_BATCH_SIZE)``. Within a batch, the sentence at an even position is
    the input (embedded with ``w2v_model_en``) and the sentence right after it
    is the target (embedded with ``w2v_model_de``). A trailing sentence
    without a partner is ignored.

    Args:
        w2v_model_en: model passed to ``get_token_vector`` for input tokens.
        w2v_model_de: model passed to ``get_token_vector`` for target tokens.
        tokenized_dialog_en: forwarded to ``_batch``.
        tokenized_dialog_de: forwarded to ``_batch``.
        token_to_index_de: not used by this variant; kept so existing callers
            do not have to change.

    Yields:
        X: float64 array ``(pairs, INPUT_SEQUENCE_LENGTH,
            TOKEN_REPRESENTATION_SIZE)``.
        Y: float64 array ``(pairs, ANSWER_MAX_TOKEN_LENGTH,
            TOKEN_REPRESENTATION_SIZE)``.
    """
    for sents_batch in _batch(tokenized_dialog_en, tokenized_dialog_de, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        # Diagnostic output. The batch length is reported instead of
        # np.shape(), which raises on ragged token lists with current NumPy.
        print("sents_batch: %d" % len(sents_batch))

        # Floor division keeps the pair count an int on Python 2 and 3 alike;
        # a float here would break both the array shape and the row indexing.
        n_pairs = len(sents_batch) // 2

        # Builtin float is the dtype the removed np.float alias pointed to.
        X = np.zeros((n_pairs, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=float)
        Y = np.zeros((n_pairs, ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=float)

        for pair_index in range(n_pairs):
            source_tokens = sents_batch[2 * pair_index][:INPUT_SEQUENCE_LENGTH]
            target_tokens = sents_batch[2 * pair_index + 1][:ANSWER_MAX_TOKEN_LENGTH]

            for t_index, token in enumerate(source_tokens):
                X[pair_index, t_index] = get_token_vector(token, w2v_model_en)

            for t_index, token in enumerate(target_tokens):
                Y[pair_index, t_index] = get_token_vector(token, w2v_model_de)

        yield X, Y
def get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    """Yield ``(X, Y)`` training arrays for every non-empty batch of sentences.

    Batches come from ``_batch(tokenized_dialog, SAMPLES_BATCH_SIZE)``. Each
    sentence is paired with the sentence that follows it in the batch: the
    former is the input, the latter the target. The last sentence of a batch
    has no successor, so its row is left all zeros.

    Yields:
        X: float64 array ``(len(batch), INPUT_SEQUENCE_LENGTH,
            TOKEN_REPRESENTATION_SIZE)`` of input token vectors.
        Y: boolean one-hot array ``(len(batch), ANSWER_MAX_TOKEN_LENGTH,
            len(token_to_index))`` of target tokens.

    Raises:
        KeyError: if a target token is missing from ``token_to_index``.
    """
    token_voc_size = len(token_to_index)

    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        batch_len = len(sents_batch)
        # Builtin float/bool are the dtypes the removed np.float/np.bool
        # aliases pointed to.
        X = np.zeros((batch_len, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=float)
        Y = np.zeros((batch_len, ANSWER_MAX_TOKEN_LENGTH, token_voc_size), dtype=bool)

        for s_index in range(batch_len - 1):
            for t_index, token in enumerate(sents_batch[s_index][:INPUT_SEQUENCE_LENGTH]):
                X[s_index, t_index] = get_token_vector(token, w2v_model)

            # Targets are cut to the answer length, which is the size of Y's
            # token axis. Cutting them to the input length overran Y when the
            # input length was larger and dropped tokens when it was smaller.
            for t_index, token in enumerate(sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[s_index, t_index, token_to_index[token]] = 1

        yield X, Y
def compute_similarities(prediction_vector, w2v_model, index_to_token):
    """Return the cosine similarity of every vocabulary token to a vector.

    ``index_to_token`` is indexed with the integers ``0 .. len - 1``. Entry
    ``i`` of the result is ``1 - cosine_distance`` between the vector of
    ``index_to_token[i]`` (from ``get_token_vector``) and
    ``prediction_vector``.
    """
    return [
        1 - spatial.distance.cosine(
            get_token_vector(index_to_token[token_id], w2v_model),
            prediction_vector)
        for token_id in range(len(index_to_token))
    ]
def transform_w2v_model_to_matrix(w2v_model, index_to_token):
    """Build a matrix whose row ``i`` is the vector of ``index_to_token[i]``.

    Args:
        w2v_model: model passed to ``get_token_vector``.
        index_to_token: mapping from row index to token; the indices are used
            directly as row numbers of the result.

    Returns:
        Array of shape ``(len(index_to_token), TOKEN_REPRESENTATION_SIZE)``.
    """
    output = np.zeros((len(index_to_token), TOKEN_REPRESENTATION_SIZE))

    # Walk the (index, token) pairs directly. Going through an inverted
    # token -> index dictionary is redundant and, when two indices share a
    # token, leaves one of their rows unfilled.
    for idx, word in index_to_token.items():
        output[idx] = get_token_vector(word, w2v_model)

    return output
def _sequence_to_vector(sentence, w2v_model):
    """Embed one tokenized sentence into a full-size prediction batch.

    Only row 0 of the returned array is filled; all other rows stay zero.
    Sentences longer than ``INPUT_SEQUENCE_LENGTH`` are truncated to their
    first ``INPUT_SEQUENCE_LENGTH`` tokens.

    Returns:
        Array of shape ``(TRAIN_BATCH_SIZE, INPUT_SEQUENCE_LENGTH,
        TOKEN_REPRESENTATION_SIZE)``.
    """
    # We need predicted vectors for a single sequence only, not for a whole
    # batch. However, StatefulRNN works in such a way that predict() has to be
    # fed the same number of examples as a training batch, so the caller uses
    # the first predicted sequence and disregards the rest.
    # For more questions, see https://github.com/farizrahman4u/seq2seq
    X = np.zeros((TRAIN_BATCH_SIZE, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE))

    for t, token in enumerate(sentence):
        if t >= INPUT_SEQUENCE_LENGTH:
            # Extra tokens would index past the token axis of X.
            break
        X[0, t] = get_token_vector(token, w2v_model)

    return X
def get_training_batch_new(w2v_model_en, w2v_model_de, tokenized_dialog_en, tokenized_dialog_de, token_to_index_de):
    """Yield ``(X, Y, nn_model)`` for each batch, sized by a length bucket.

    Batches come from ``_batch(tokenized_dialog_en, tokenized_dialog_de,
    SAMPLES_BATCH_SIZE)``. Within a batch, the sentence at an even position is
    the input and the sentence right after it is the target; a trailing
    sentence without a partner is ignored. Batches with fewer than two
    sentences contain no pair and are skipped.

    The sequence lengths default to the lengths of the first input and first
    target of the batch. If ``BUCKETS`` holds an ``(input_len, output_len)``
    entry whose ``input_len`` exceeds the first input's length, the first such
    entry replaces both lengths. Longer sentences are truncated.

    Args:
        w2v_model_en: model passed to ``get_token_vector`` for input tokens.
        w2v_model_de: not used by this variant; kept so existing callers do
            not have to change.
        tokenized_dialog_en: forwarded to ``_batch``.
        tokenized_dialog_de: forwarded to ``_batch``.
        token_to_index_de: maps target tokens to one-hot positions.

    Yields:
        X: float64 array ``(pairs, input_seq_length,
            TOKEN_REPRESENTATION_SIZE)`` of input token vectors.
        Y: boolean one-hot array ``(pairs, output_seq_length,
            len(token_to_index_de))`` of target tokens.
        nn_model: result of ``get_nn_model_new(vocabulary size,
            input_seq_length, output_seq_length)``, created anew per batch.

    Raises:
        KeyError: if a target token is missing from ``token_to_index_de``.
    """
    token_voc_size = len(token_to_index_de)

    for sents_batch in _batch(tokenized_dialog_en, tokenized_dialog_de, SAMPLES_BATCH_SIZE):
        # Without at least one (input, target) pair there is nothing to
        # bucket or encode, and sents_batch[1] below would not exist.
        if not sents_batch or len(sents_batch) < 2:
            continue

        # Diagnostic output. The batch length is reported instead of
        # np.shape(), which raises on ragged token lists with current NumPy.
        print("sents_batch: %d" % len(sents_batch))

        first_input_len = len(sents_batch[0])
        input_seq_length = first_input_len
        output_seq_length = len(sents_batch[1])
        for bucket_input_len, bucket_output_len in BUCKETS:
            if bucket_input_len > first_input_len:
                input_seq_length = bucket_input_len
                output_seq_length = bucket_output_len
                break

        # Diagnostic output, written so it is valid on Python 2 and 3.
        print("isl %s" % (input_seq_length,))
        print("osl %s" % (output_seq_length,))
        print("len1 %s" % (sents_batch[0],))
        print("len2 %s" % (sents_batch[1],))

        # Floor division keeps the pair count an int on Python 2 and 3 alike.
        n_pairs = len(sents_batch) // 2

        # Builtin float/bool are the dtypes the removed np.float/np.bool
        # aliases pointed to.
        X = np.zeros((n_pairs, input_seq_length, TOKEN_REPRESENTATION_SIZE), dtype=float)
        Y = np.zeros((n_pairs, output_seq_length, token_voc_size), dtype=bool)
        nn_model = get_nn_model_new(token_voc_size, input_seq_length, output_seq_length)

        for pair_index in range(n_pairs):
            source_tokens = sents_batch[2 * pair_index][:input_seq_length]
            target_tokens = sents_batch[2 * pair_index + 1][:output_seq_length]

            for t_index, token in enumerate(source_tokens):
                X[pair_index, t_index] = get_token_vector(token, w2v_model_en)

            for t_index, token in enumerate(target_tokens):
                Y[pair_index, t_index, token_to_index_de[token]] = 1

        yield X, Y, nn_model
def get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    """Generate ``(X, Y)`` arrays from consecutive sentences of each batch.

    For every non-empty batch produced by ``_batch(tokenized_dialog,
    SAMPLES_BATCH_SIZE)``, sentence ``i`` becomes the input of row ``i`` and
    sentence ``i + 1`` its target. The final sentence has no follower, so the
    last row of both arrays remains zero.

    Yields:
        X: float64 array ``(len(batch), INPUT_SEQUENCE_LENGTH,
            TOKEN_REPRESENTATION_SIZE)`` with one token vector per position.
        Y: boolean array ``(len(batch), ANSWER_MAX_TOKEN_LENGTH,
            len(token_to_index))`` with a single True per target position, at
            the index ``token_to_index`` gives for that token.

    Raises:
        KeyError: if a target token is missing from ``token_to_index``.
    """
    token_voc_size = len(token_to_index)

    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        n_sentences = len(sents_batch)
        x_shape = (n_sentences, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE)
        y_shape = (n_sentences, ANSWER_MAX_TOKEN_LENGTH, token_voc_size)
        # np.float / np.bool no longer exist in current NumPy; the builtins
        # select the same dtypes.
        X = np.zeros(x_shape, dtype=float)
        Y = np.zeros(y_shape, dtype=bool)

        for s_index in range(n_sentences - 1):
            question = sents_batch[s_index][:INPUT_SEQUENCE_LENGTH]
            # The answer must be limited by ANSWER_MAX_TOKEN_LENGTH, the size
            # of Y's token axis. Limiting it by the input length raised
            # IndexError or lost tokens whenever the two lengths differed.
            answer = sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]

            for t_index, token in enumerate(question):
                X[s_index, t_index] = get_token_vector(token, w2v_model)

            for t_index, token in enumerate(answer):
                Y[s_index, t_index, token_to_index[token]] = 1

        yield X, Y
def _predict_sequence(input_sequence, nn_model, w2v_model, index_to_token, temperature):
    """Decode a response to ``input_sequence`` one sampled token at a time.

    The last ``INPUT_SEQUENCE_LENGTH`` tokens of the input are embedded,
    reversed along the token axis, and fed to ``nn_model.predict`` together
    with the tokens decoded so far. Decoding starts from ``START_TOKEN`` and
    stops once ``EOS_SYMBOL`` is sampled or the response holds
    ``ANSWER_MAX_TOKEN_LENGTH - 1`` tokens.

    Args:
        input_sequence: iterable of input tokens.
        nn_model: object whose ``predict(x_batch, y_batch)`` returns one
            probability distribution per answer position.
        w2v_model: model passed to ``get_token_vector``.
        index_to_token: maps a sampled token id to its token.
        temperature: forwarded to ``_sample``.

    Returns:
        ``(response, response_perplexity)``: the list of sampled tokens and
        the value of ``get_sequence_perplexity`` for their probabilities.
    """
    response = []
    tokens_probs = []

    input_vectors = [get_token_vector(token, w2v_model) for token in input_sequence]

    x_batch = np.zeros((1, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
    for position, token_vector in enumerate(input_vectors[-INPUT_SEQUENCE_LENGTH:]):
        x_batch[0][position] = token_vector
    x_batch = np.fliplr(x_batch)  # reverse inputs along the token axis

    curr_y_batch = np.zeros((1, ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
    curr_y_batch[0][0] = get_token_vector(START_TOKEN, w2v_model)

    next_token = START_TOKEN
    probs_batch = None  # stays None if the decoding loop never runs
    step = 0

    while next_token != EOS_SYMBOL and len(response) < ANSWER_MAX_TOKEN_LENGTH - 1:
        probs_batch = nn_model.predict(x_batch, curr_y_batch)

        # probs_batch has shape (batch_size * seq_len, vocab_size), and here
        # batch_size == 1, so row `step` is the prediction for this position.
        next_token_id, next_token_prob = _sample(probs_batch[step], temperature)
        next_token = index_to_token[next_token_id]

        response.append(next_token)
        tokens_probs.append(next_token_prob)

        # Feed the sampled token back in as the next decoder input.
        step += 1
        curr_y_batch[0][step] = get_token_vector(next_token, w2v_model)

    # Debug dump: sample one token per position of the last prediction and
    # print them on one line followed by a blank line. Separate local names
    # keep this diagnostic sampling from touching the decoding state.
    if probs_batch is not None:
        debug_tokens = []
        for prob_dist in probs_batch:
            debug_token_id, _ = _sample(prob_dist, temperature)
            debug_tokens.append(index_to_token[debug_token_id])
        print(" ".join("%s" % (token,) for token in debug_tokens))
        print("")

    response_perplexity = get_sequence_perplexity(tokens_probs)

    return response, response_perplexity
def compute_similarities(prediction_vector, w2v_model, index_to_token):
    """Score every vocabulary token against ``prediction_vector``.

    Tokens are looked up as ``index_to_token[0]`` through
    ``index_to_token[len - 1]``. Each score is one minus the cosine distance
    between that token's vector (from ``get_token_vector``) and
    ``prediction_vector``. Scores are returned as a list in index order.
    """
    vocabulary_size = len(index_to_token)
    scores = []

    for position in range(vocabulary_size):
        token_vector = get_token_vector(index_to_token[position], w2v_model)
        distance = spatial.distance.cosine(token_vector, prediction_vector)
        scores.append(1 - distance)

    return scores