Example #1
def get_training_batch(w2v_model, tokenized_dialog, token_to_index):

    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        X = np.zeros((len(sents_batch), INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
        Y = np.zeros((len(sents_batch), ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
        Y_ids = np.zeros((len(sents_batch), ANSWER_MAX_TOKEN_LENGTH), dtype=np.int32)

        # Each sentence is the input and the sentence that follows it is the
        # expected answer, so the last sentence has no target and is skipped.
        for s_index, sentence in enumerate(sents_batch):
            if s_index == len(sents_batch) - 1:
                break

            for t_index, token in enumerate(sentence[:INPUT_SEQUENCE_LENGTH]):
                X[s_index, t_index] = get_token_vector(token, w2v_model)

            for t_index, token in enumerate(sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[s_index, t_index] = get_token_vector(token, w2v_model)
                Y_ids[s_index, t_index] = token_to_index[token]

        X = np.fliplr(X)  # reverse inputs


        yield X, Y, Y_ids
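A sketch of how this generator might be consumed. The model object and its training call are assumptions for illustration, not code from the source repository; any Keras-style model exposing train_on_batch would fit this shape.

for X, Y, Y_ids in get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    # X: reversed input embeddings, Y: answer embeddings (teacher forcing),
    # Y_ids: integer targets for the loss. nn_model is hypothetical here.
    loss = nn_model.train_on_batch([X, Y], Y_ids)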
Example #2
def get_training_batch(w2v_model_en, w2v_model_de, tokenized_dialog_en, tokenized_dialog_de, token_to_index_de):
    token_voc_size = len(token_to_index_de)
    for sents_batch in _batch(tokenized_dialog_en, tokenized_dialog_de, SAMPLES_BATCH_SIZE):
        print("sents_batch:", np.shape(sents_batch))
        if not sents_batch:
            continue

        # One training pair per two sentences: even indices hold the English
        # input, odd indices hold the German answer.
        X = np.zeros((len(sents_batch) // 2, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
        Y = np.zeros((len(sents_batch) // 2, ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)

        for s_index in range(0, len(sents_batch), 2):
            if s_index == len(sents_batch) - 1:
                break

            for t_index, token in enumerate(sents_batch[s_index][:INPUT_SEQUENCE_LENGTH]):
                X[s_index // 2, t_index] = get_token_vector(token, w2v_model_en)

            for t_index, token in enumerate(sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[s_index // 2, t_index] = get_token_vector(token, w2v_model_de)

        yield X, Y
Example #3
def get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    token_voc_size = len(token_to_index)

    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        X = np.zeros((len(sents_batch), INPUT_SEQUENCE_LENGTH,
                      TOKEN_REPRESENTATION_SIZE),
                     dtype=np.float32)
        Y = np.zeros(
            (len(sents_batch), ANSWER_MAX_TOKEN_LENGTH, token_voc_size),
            dtype=bool)
        for s_index, sentence in enumerate(sents_batch):
            if s_index == len(sents_batch) - 1:
                break

            for t_index, token in enumerate(
                    sents_batch[s_index][:INPUT_SEQUENCE_LENGTH]):
                X[s_index, t_index] = get_token_vector(token, w2v_model)

            # Answers become one-hot targets over the vocabulary; slice by the
            # answer length that Y was allocated with.
            for t_index, token in enumerate(
                    sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[s_index, t_index, token_to_index[token]] = 1

        yield X, Y
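With one-hot targets of shape (batch, time, vocab_size), a categorical cross-entropy loss is the natural pairing. A minimal sketch, assuming a Keras-style nn_model built elsewhere in the repository:

nn_model.compile(optimizer='adam', loss='categorical_crossentropy')
for X, Y in get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    nn_model.train_on_batch(X, Y)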
Example #4
def compute_similarities(prediction_vector, w2v_model, index_to_token):
    similarities = []
    for i in range(len(index_to_token)):
        similarity = 1 - spatial.distance.cosine(
            get_token_vector(index_to_token[i], w2v_model), prediction_vector)
        similarities.append(similarity)
    return similarities
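A typical consumer picks the token whose embedding is most similar to the predicted vector. The decode_prediction helper below is hypothetical, built only on the functions shown in these snippets:

import numpy as np

def decode_prediction(prediction_vector, w2v_model, index_to_token):
    # The index of the highest cosine similarity is the decoded token id.
    similarities = compute_similarities(prediction_vector, w2v_model, index_to_token)
    return index_to_token[int(np.argmax(similarities))]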
Example #5
def transform_w2v_model_to_matrix(w2v_model, index_to_token):
    # Stack every token's embedding into one (n_words, dim) matrix, with
    # row i holding the vector for index_to_token[i].
    n_words = len(index_to_token)
    output = np.zeros((n_words, TOKEN_REPRESENTATION_SIZE))
    for idx, word in index_to_token.items():
        output[idx] = get_token_vector(word, w2v_model)
    return output
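Once the embeddings sit in a single matrix, the per-token loop of compute_similarities can be replaced by one vectorized cosine similarity. A sketch; the epsilon guard against all-zero rows is an added assumption:

import numpy as np

def compute_similarities_fast(prediction_vector, embedding_matrix):
    # Cosine similarity of one vector against every row of the matrix.
    norms = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(prediction_vector)
    return embedding_matrix.dot(prediction_vector) / np.maximum(norms, 1e-8)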
Example #6
def _sequence_to_vector(sentence, w2v_model):
    # Here we need predicted vectors for only one sequence, not for a whole
    # batch. However, StatefulRNN works in such a way that we have to feed the
    # predict() function the same number of examples as in our train batch.
    # We can then use only the first predicted sequence and disregard the rest.
    # If you have more questions, feel free to address them to https://github.com/farizrahman4u/seq2seq
    X = np.zeros((TRAIN_BATCH_SIZE, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE))

    for t, token in enumerate(sentence):
        X[0, t] = get_token_vector(token, w2v_model)

    return X
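Usage then looks like the following sketch; tokenized_sentence and nn_model are assumed to come from the surrounding code, not from these snippets:

X = _sequence_to_vector(tokenized_sentence, w2v_model)
predictions = nn_model.predict(X, batch_size=TRAIN_BATCH_SIZE)
first_sequence = predictions[0]  # only row 0 is meaningful; the rest is padding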
Example #7
def get_training_batch_new(w2v_model_en, w2v_model_de, tokenized_dialog_en, tokenized_dialog_de, token_to_index_de):
    token_voc_size = len(token_to_index_de)
    for sents_batch in _batch(tokenized_dialog_en, tokenized_dialog_de, SAMPLES_BATCH_SIZE):
        print("sents_batch:", np.shape(sents_batch))
        if not sents_batch:
            continue

        # Pick the first bucket larger than the input sentence; fall back to
        # the raw sentence lengths if no bucket is large enough.
        input_seq_length = len(sents_batch[0])
        output_seq_length = len(sents_batch[1])
        for (a, b) in BUCKETS:
            if a > len(sents_batch[0]):
                input_seq_length = a
                output_seq_length = b
                break

        print("input_seq_length:", input_seq_length)
        print("output_seq_length:", output_seq_length)

        X = np.zeros((len(sents_batch) // 2, input_seq_length, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
        Y = np.zeros((len(sents_batch) // 2, output_seq_length, token_voc_size), dtype=bool)

        # Build a model whose input/output lengths match the chosen bucket.
        nn_model = get_nn_model_new(token_voc_size, input_seq_length, output_seq_length)

        for s_index in range(0, len(sents_batch), 2):
            if s_index == len(sents_batch) - 1:
                break

            for t_index, token in enumerate(sents_batch[s_index][:input_seq_length]):
                X[s_index // 2, t_index] = get_token_vector(token, w2v_model_en)

            for t_index, token in enumerate(sents_batch[s_index + 1][:output_seq_length]):
                Y[s_index // 2, t_index, token_to_index_de[token]] = 1

        yield X, Y, nn_model
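BUCKETS is not defined in these snippets. It is assumed to be a list of (input_length, output_length) pairs ordered from smallest to largest, along the lines of the bucketing in the classic TensorFlow seq2seq tutorial, e.g.:

BUCKETS = [(5, 10), (10, 15), (20, 25), (40, 50)]  # hypothetical values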
Example #8
def get_training_batch(w2v_model, tokenized_dialog, token_to_index):
    token_voc_size = len(token_to_index)

    for sents_batch in _batch(tokenized_dialog, SAMPLES_BATCH_SIZE):
        if not sents_batch:
            continue

        X = np.zeros((len(sents_batch), INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
        Y = np.zeros((len(sents_batch), ANSWER_MAX_TOKEN_LENGTH, token_voc_size), dtype=bool)
        for s_index, sentence in enumerate(sents_batch):
            if s_index == len(sents_batch) - 1:
                break

            for t_index, token in enumerate(sents_batch[s_index][:INPUT_SEQUENCE_LENGTH]):
                X[s_index, t_index] = get_token_vector(token, w2v_model)

            for t_index, token in enumerate(sents_batch[s_index + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[s_index, t_index, token_to_index[token]] = 1

        yield X, Y
Example #9
def _predict_sequence(input_sequence, nn_model, w2v_model, index_to_token, temperature):
    token_to_index = {v: k for k, v in index_to_token.items()}
    response = []
    tokens_probs = []

    # Map input tokens to ids, falling back to EMPTY_TOKEN for OOV words.
    input_ids = [token_to_index.get(token, token_to_index[EMPTY_TOKEN])
                 for token in input_sequence]

    input_vectors = []
    for token in input_sequence:
        token_vector = get_token_vector(token, w2v_model)
        input_vectors.append(token_vector)

    x_batch = np.zeros((1, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)

    for i, token_vector in enumerate(input_vectors[-INPUT_SEQUENCE_LENGTH:]):
        x_batch[0][i] = token_vector

    x_batch = np.fliplr(x_batch)  # reverse inputs

    curr_y_batch = np.zeros((1, ANSWER_MAX_TOKEN_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype=np.float32)
    curr_y_batch[0][0] = get_token_vector(START_TOKEN, w2v_model)

    next_token = START_TOKEN
    i = 0

    while next_token != EOS_SYMBOL and len(response) < ANSWER_MAX_TOKEN_LENGTH-1:
        probs_batch = nn_model.predict(x_batch, curr_y_batch)

        # probs_batch has shape (batch_size * seq_len, vocab_size)
        # but here batch_size == 1
        # we only need the i-th prediction, so take it
        curr_token_prob_dist = probs_batch[i]
        next_token_id, next_token_prob = _sample(curr_token_prob_dist, temperature)

        next_token = index_to_token[next_token_id]
        response.append(next_token)
        tokens_probs.append(next_token_prob)

        i += 1
        curr_y_batch[0][i] = get_token_vector(next_token, w2v_model)

        # Print the token sampled at every position, for debugging. Distinct
        # variable names keep this loop from clobbering next_token, which the
        # while condition above depends on.
        for position_probs in probs_batch:
            dbg_token_id, _ = _sample(position_probs, temperature)
            print(index_to_token[dbg_token_id], end=' ')

        print()
        print()

    response_perplexity = get_sequence_perplexity(tokens_probs)

    return response, response_perplexity
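The _sample helper is referenced but not shown. Below is a minimal sketch of temperature sampling consistent with the call sites above; the actual implementation in the source repository may differ:

import numpy as np

def _sample(probs, temperature=1.0):
    # Temperature < 1 sharpens the distribution, > 1 flattens it.
    probs = np.asarray(probs, dtype=np.float64)
    scaled = np.exp(np.log(probs + 1e-10) / temperature)
    scaled /= scaled.sum()
    token_id = np.random.choice(len(scaled), p=scaled)
    return token_id, scaled[token_id]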