Example 1
def predict(tweet_word):
    vocab = load_vocab()
    input_data_keras = formatK_tweet(tweet_word, vocab)
    # Right-pad the encoded tweet with zeros up to the fixed model input length of 128.
    if len(input_data_keras) < 128:
        input_data_keras += [0] * (128 - len(input_data_keras))
    input_data_sk = word_transform(tweet_word)

    cnn = load_model_keras('./variables/cnn_model.tf')
    lstm = load_model_keras('./variables/lstm_model.tf')
    lstm_improved = load_model_keras_custom('./variables/lstm+_model.tf',
                                            {"peephole_lstm_cells": tfa.rnn.PeepholeLSTMCell(32),
                                             "root_mean_squared_logarithmic_error": root_mean_squared_logarithmic_error})
    svm = load_model_sk('./variables/svm_model.sav')
    sgd = load_model_sk('./variables/sgd_model.sav')

    # print("Prediction:")
    # print("CNN: ", cnn.predict([input_data_keras[:128]]))
    # print("LSTM: ", lstm.predict([input_data_keras[:128]]))
    # print("LSTM+: ", lstm_improved.predict([input_data_keras[:128]]))
    # print("SVM: ", svm.predict(input_data_sk))
    # print("SGD: ", sgd.predict(input_data_sk))
    return {"CNN": cnn.predict([input_data_keras[:128]])[0][0], "LSTM": lstm.predict([input_data_keras])[0][0],
            "LSTM+": lstm_improved.predict([input_data_keras])[0][0], "SVM": svm.predict(input_data_sk)[0],
            "SGD": sgd.predict(input_data_sk)[0]}
Example 2
def get_embedding_matrix(word_dim, mode, vocab_size):
    if mode == modekeys.TRAIN:
        vocab, vocab_dict = helper.load_vocab('twitter_data/rg_vocab.txt')
        glove_vectors, glove_dict = helper.load_glove_vectors('twitter_data/my_vector.txt', vocab)
        initial_value = helper.build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, word_dim)
        embedding_w = tf.get_variable(name='embedding_W', initializer=initial_value, trainable=True)
    else:
        embedding_w = tf.get_variable(name='embedding_W', shape=[vocab_size, word_dim],
                                      dtype=tf.float32, trainable=True)
    return embedding_w
def load_word_embedding(vocab_path, word_embed_path):
    vocabulary, vocab_dict = helper.load_vocab(vocab_path)
    glove_vectors, glove_dict = helper.load_glove_vectors(
        word_embed_path, vocabulary)
    vocab_size = len(vocabulary)
    word_dim = glove_vectors.shape[1]
    embedding_matrix = helper.build_initial_embedding_matrix(
        vocab_dict=vocab_dict,
        glove_vectors=glove_vectors,
        glove_dict=glove_dict,
        embedding_dim=word_dim)
    embedding_W = tf.get_variable('word_embedding_W',
                                  dtype=tf.float32,
                                  initializer=embedding_matrix,
                                  trainable=False)
    return embedding_W
def get_embeddings(hparams):
    if hparams.glove_path and hparams.vocab_path:
        tf.logging.info("Loading Glove embeddings...")
        vocab_array, vocab_dict = helper.load_vocab(hparams.vocab_path)
        glove_vectors, glove_dict = helper.load_glove_vectors(
            hparams.glove_path, vocab=set(vocab_array))
        initializer = helper.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim)
    else:
        tf.logging.info(
            "No glove/vocab path specificed, starting with random embeddings.")
        initializer = tf.random_uniform_initializer(-0.25, 0.25)

    return tf.get_variable("word_embeddings",
                           initializer=initializer,
                           trainable=False)
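None of the snippets on this page include helper.load_vocab itself. A minimal sketch that is consistent with how it is called here (a token list plus a token-to-index dict), assuming a plain one-token-per-line vocabulary file:

def load_vocab(vocab_path):
    # One token per line; the line number becomes the token id.
    with open(vocab_path, encoding='utf8') as f:
        vocab = [line.strip() for line in f if line.strip()]
    vocab_dict = {token: idx for idx, token in enumerate(vocab)}
    return vocab, vocab_dict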
Example 5
def get_embedding_matrix(word_dim, mode, vocab_size, random_seed,
                         word_embed_path, vocab_path):
    if mode == modekeys.TRAIN:
        vocab, vocab_dict = helper.load_vocab(vocab_path)
        glove_vectors, glove_dict = helper.load_glove_vectors(
            word_embed_path, vocab)
        initial_value = helper.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, word_dim, random_seed)
        embedding_w = tf.get_variable(name='embedding_W',
                                      initializer=initial_value,
                                      trainable=True)
    else:
        embedding_w = tf.get_variable(name='embedding_W',
                                      shape=[vocab_size, word_dim],
                                      dtype=tf.float32,
                                      trainable=False)
    return embedding_w
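Both branches create a variable named embedding_W via tf.get_variable, so sharing it between a training and an evaluation graph relies on TF1 variable scoping. A hypothetical usage sketch; the sizes and paths are placeholders, and modekeys.EVAL is assumed to exist alongside modekeys.TRAIN:

with tf.variable_scope('model'):
    train_w = get_embedding_matrix(word_dim=300, mode=modekeys.TRAIN,
                                   vocab_size=50000, random_seed=42,
                                   word_embed_path='glove.txt',
                                   vocab_path='vocab.txt')

# reuse=True makes tf.get_variable return the existing embedding_W
# instead of creating a second variable.
with tf.variable_scope('model', reuse=True):
    eval_w = get_embedding_matrix(word_dim=300, mode=modekeys.EVAL,
                                  vocab_size=50000, random_seed=42,
                                  word_embed_path='glove.txt',
                                  vocab_path='vocab.txt')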
Example 6
    for p in ps:
        source_seq = [w2i_source[w] for w in doc_source[p].split()] + [w2i_source["<PAD>"]] * (
                max_source_len - len(doc_source[p].split()))
        target_seq = [w2i_target[w] for w in doc_target[p].split()] + [w2i_target["<PAD>"]] * (
                max_target_len - 1 - len(doc_target[p].split())) + [w2i_target["<EOS>"]]
        source_batch.append(source_seq)
        target_batch.append(target_seq)
    return source_batch, source_lens, target_batch, target_lens


if __name__ == '__main__':
    print('loading data ...')
    doc_source = helper.load_file('./data/small_vocab_en.txt')
    doc_target = helper.load_file('./data/small_vocab_fr.txt')
    s_token2idx, s_idx2token = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    t_token2idx, t_idx2token = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)
    print('building model...')
    config = config()
    config.source_vocab_size = len(s_token2idx)
    config.target_vocab_size = len(t_token2idx)
    model = Seq2seq(config, t_token2idx, useTeacherForcing=True)
    batches = 10000
    print_every = 100
    print('run model...')
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        losses = []
        total_loss = 0
        for batch in range(batches):
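            # The loop body is truncated in this snippet; what follows is a
            # hypothetical continuation in the same style. The get_batch
            # argument order and the model.* tensor names are assumptions,
            # not taken from the source.
            source_batch, source_lens, target_batch, target_lens = get_batch(
                doc_source, s_token2idx, doc_target, t_token2idx, config.batch_size)
            loss, _ = sess.run([model.loss, model.train_op],
                               feed_dict={model.seq_inputs: source_batch,
                                          model.seq_inputs_length: source_lens,
                                          model.seq_targets: target_batch,
                                          model.seq_targets_length: target_lens})
            total_loss += loss
            if batch > 0 and batch % print_every == 0:
                print('batch {}, mean loss {:.4f}'.format(batch, total_loss / print_every))
                losses.append(total_loss / print_every)
                total_loss = 0
                saver.save(sess, 'checkpoint/model.ckpt')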
Example 7
def construct_training_data_batches(config):
    # train_src = 'data/iwslt15/train.en'
    # train_tgt = 'data/iwslt15/train.en'
    # # train_src = 'data/iwslt15/mytrain3.en'
    # # train_tgt = 'data/iwslt15/mytrain3.vi'
    # vocab_src = 'data/iwslt15/vocab.en'
    # vocab_tgt = 'data/iwslt15/vocab.en'

    train_src = config['train_src']
    train_tgt = config['train_tgt']
    vocab_src = config['vocab_src']
    vocab_tgt = config['vocab_tgt']

    batch_size = config['batch_size']
    max_sentence_length = config['max_sentence_length']

    vocab_paths = {'vocab_src': vocab_src, 'vocab_tgt': vocab_tgt}
    data_paths = {'train_src': train_src, 'train_tgt': train_tgt}

    src_word2id, tgt_word2id = load_vocab(vocab_paths)
    train_src_sentences, train_tgt_sentences = load_data(data_paths)

    vocab_size = {'src': len(src_word2id), 'tgt': len(tgt_word2id)}
    print("num_vocab_src: ", vocab_size['src'])
    print("num_vocab_tgt: ", vocab_size['tgt'])

    train_src_word_ids = []  # num_sentences x max_sentence_length
    train_tgt_word_ids = []  # num_sentences x max_sentence_length
    train_src_sentence_lengths = []
    train_tgt_sentence_lengths = []

    # EOS id
    src_eos_id = src_word2id['</s>']
    tgt_eos_id = tgt_word2id['</s>']

    # Source and Target sentences
    for src_sentence, tgt_sentence in zip(train_src_sentences,
                                          train_tgt_sentences):
        src_words = src_sentence.split()
        tgt_words = tgt_sentence.split()

        if (len(src_words) > max_sentence_length
                or len(tgt_words) > max_sentence_length):
            continue

        # source
        src_ids = [src_eos_id] * max_sentence_length
        for i, word in enumerate(src_words):
            if word in src_word2id:
                src_ids[i] = src_word2id[word]
            else:
                src_ids[i] = src_word2id['<unk>']
        train_src_word_ids.append(src_ids)
        train_src_sentence_lengths.append(len(src_words) + 1)  # include one EOS

        # target
        tgt_ids = [tgt_eos_id] * max_sentence_length
        for i, word in enumerate(tgt_words):
            if word in tgt_word2id:
                tgt_ids[i] = tgt_word2id[word]
            else:
                tgt_ids[i] = tgt_word2id['<unk>']
        train_tgt_word_ids.append(tgt_ids)
        train_tgt_sentence_lengths.append(len(tgt_words) + 1)  # include one EOS

    assert len(train_src_word_ids) == len(train_tgt_word_ids), \
        "train_src_word_ids != train_tgt_word_ids"
    num_training_sentences = len(train_src_word_ids)
    print("num_training_sentences: ",
          num_training_sentences)  # only those that are not too long

    # shuffle
    _x = list(
        zip(train_src_word_ids, train_tgt_word_ids, train_src_sentence_lengths,
            train_tgt_sentence_lengths))
    random.shuffle(_x)
    train_src_word_ids, train_tgt_word_ids, train_src_sentence_lengths, train_tgt_sentence_lengths = zip(
        *_x)

    batches = []

    for i in range(num_training_sentences // batch_size):
        i_start = i * batch_size
        i_end = i_start + batch_size
        batch = {
            'src_word_ids': train_src_word_ids[i_start:i_end],
            'tgt_word_ids': train_tgt_word_ids[i_start:i_end],
            'src_sentence_lengths': train_src_sentence_lengths[i_start:i_end],
            'tgt_sentence_lengths': train_tgt_sentence_lengths[i_start:i_end]
        }

        batches.append(batch)

    return batches, vocab_size, src_word2id, tgt_word2id
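A hypothetical call site for the function above, showing the config keys it actually reads; all file paths are placeholders:

config = {'train_src': 'data/train.src', 'train_tgt': 'data/train.tgt',
          'vocab_src': 'data/vocab.src', 'vocab_tgt': 'data/vocab.tgt',
          'batch_size': 64, 'max_sentence_length': 32}

batches, vocab_size, src_word2id, tgt_word2id = construct_training_data_batches(config)
print(len(batches), 'batches of', len(batches[0]['src_word_ids']), 'sentences each')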
Example 8
import random
import time
import tensorflow as tf
from model import Seq2seq
import helper
from train import config, get_batch

tf_config = tf.ConfigProto(allow_soft_placement=True)
tf_config.gpu_options.allow_growth = True

model_path = "checkpoint/model.ckpt"

if __name__ == "__main__":
    print("(1)load data......")
    docs_source = ['new jersey is usually hot during autumn , and it is never quiet in winter .\n']
    docs_target = ["new jersey est généralement chaud pendant l' automne , et il est jamais calme en hiver .\n"]
    w2i_source, i2w_source = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    w2i_target, i2w_target = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)

    print("(2) build model......")
    config = config()
    config.source_vocab_size = len(w2i_source)
    config.target_vocab_size = len(w2i_target)
    model = Seq2seq(config, w2i_target, useTeacherForcing=False)

    print("(3) run model......")
    print_every = 100
    max_target_len = 20

    with tf.Session(config=tf_config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
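        # The snippet is truncated here; a hypothetical decoding step follows.
        # The get_batch argument order and the model.* tensor names are
        # assumptions, not taken from the source.
        source_batch, source_lens, _, _ = get_batch(
            docs_source, w2i_source, docs_target, w2i_target, len(docs_source))
        outputs = sess.run(model.out,
                           feed_dict={model.seq_inputs: source_batch,
                                      model.seq_inputs_length: source_lens})
        for output in outputs:
            print(' '.join(i2w_target[idx] for idx in output))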
Example 9
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    else:  # development only e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # choose the device (GPU) here

    sess_config = tf.ConfigProto()

    vocab_paths = {
        'vocab_src': config['vocab_src'],
        'vocab_tgt': config['vocab_tgt']
    }
    src_word2id, tgt_word2id = load_vocab(vocab_paths)

    tgt_id2word = list(tgt_word2id.keys())

    params = {
        'vocab_src_size': len(src_word2id),
        'vocab_tgt_size': len(tgt_word2id),
        'go_id': tgt_word2id['<go>'],
        'eos_id': tgt_word2id['</s>']
    }

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)
        # print("Model restored")

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                              config['max_sentence_length'])

        num_sentences = len(src_sent_ids)
        batch_size = 1000
        # ceil division: include a final partial batch without adding an empty one
        num_batches = (num_sentences + batch_size - 1) // batch_size

        print('num_batches =', num_batches)

        beam_width = config['beam_width']

        outputs = []

        for i in range(num_batches):

            i_start = batch_size * i
            i_end = min(i_start + batch_size, num_sentences)
            translate_dict = {
                model.src_word_ids: src_sent_ids[i_start:i_end],
                model.src_sentence_lengths: src_sent_len[i_start:i_end],
                model.dropout: 0.0
            }

            predicted_ids = sess.run(model.predicted_ids,
                                     feed_dict=translate_dict)

            for sentence in predicted_ids:
                beam = []
                for k in range(beam_width):
                    translation = sentence[:, k]
                    words = []
                    for token_id in translation:
                        if token_id == params['eos_id']:
                            break

                        words.append(tgt_id2word[token_id])

                    beam.append(words)

                outputs.append(beam)

            print('#', end='')
            sys.stdout.flush()

        print("num outputs: ", len(outputs))

        with open(config['tgtfile'], 'w', encoding="utf8") as file:
            for output in outputs:
                for beam in output:
                    x = "<s> " + " ".join(beam[:-1]).upper() + " </s>\n"
                    file.write(x)
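predicted_ids from a TF1 beam-search decoder is shaped [batch, time, beam_width], which is why sentence[:, k] above selects hypothesis k. Beams are typically ordered best-first, so if only the top hypothesis per sentence is needed:

best_translations = [' '.join(beam[0]) for beam in outputs]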
Example 10
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    else: # development only e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = ''  # empty string hides all GPUs, i.e. run on CPU

    sess_config = tf.ConfigProto()

    vocab_paths = {'vocab_src': config['vocab_src'], 'vocab_tgt': config['vocab_tgt']}
    src_word2id, tgt_word2id = load_vocab(vocab_paths)

    tgt_id2word = list(tgt_word2id.keys())

    params = {'vocab_src_size': len(src_word2id),
            'vocab_tgt_size': len(tgt_word2id),
            'go_id':  tgt_word2id['<go>'],
            'eos_id':  tgt_word2id['</s>']}

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)
        # print("Model restored")

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                              config['max_sentence_length'], config['spellcheck'])

        num_sentences = len(src_sent_ids)
        # batch_size = config['batch_size']  # maybe too small (inefficient), but should not be too large
        batch_size = 100  # fine for inference, which needs much less memory than training
        # ceil division: include a final partial batch without adding an empty one
        num_batches = (num_sentences + batch_size - 1) // batch_size

        tgt_lines = []
        print('num_batches =', num_batches)

        for i in range(num_batches):

            i_start = batch_size * i
            i_end = min(i_start + batch_size, num_sentences)
            translate_dict = {model.src_word_ids: src_sent_ids[i_start:i_end],
                              model.src_sentence_lengths: src_sent_len[i_start:i_end],
                              model.dropout: 0.0}

            [translations] = sess.run([model.translations], feed_dict=translate_dict)

            for translation in translations:
                words = []
                for id in translation:
                    if id == params['eos_id']:
                        break
                    words.append(tgt_id2word[id])

                # print(' '.join(words))
                tgt_lines.append(' '.join(words))

            print('#', end='')
            sys.stdout.flush()

        with open(config['tgtfile'], 'w') as file:
            for line in tgt_lines:
                file.write(line + '\n')
        print('translation done!')
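For reference, a hypothetical config covering every key this translate reads directly (the EncoderDecoder constructor may require more); all paths are placeholders:

config = {
    'vocab_src': 'data/vocab.src', 'vocab_tgt': 'data/vocab.tgt',
    'load': 'checkpoints/run1', 'model_number': None, 'num_epochs': 10,
    'srcfile': 'data/test.src', 'tgtfile': 'out/test.tgt',
    'max_sentence_length': 32, 'spellcheck': False,
}
translate(config)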