def predict(tweet_word):
    vocab = load_vocab()

    # The Keras models expect a fixed-length sequence: truncate to 128 tokens,
    # then pad with zeros up to length 128, so every model sees the same input.
    input_data_keras = formatK_tweet(tweet_word, vocab)[:128]
    input_data_keras += [0] * (128 - len(input_data_keras))

    # The scikit-learn models use their own feature transform.
    input_data_sk = word_transform(tweet_word)

    cnn = load_model_keras('./variables/cnn_model.tf')
    lstm = load_model_keras('./variables/lstm_model.tf')
    lstm_improved = load_model_keras_custom(
        './variables/lstm+_model.tf',
        {"peephole_lstm_cells": tfa.rnn.PeepholeLSTMCell(32),
         "root_mean_squared_logarithmic_error": root_mean_squared_logarithmic_error})
    svm = load_model_sk('./variables/svm_model.sav')
    sgd = load_model_sk('./variables/sgd_model.sav')

    return {"CNN": cnn.predict([input_data_keras])[0][0],
            "LSTM": lstm.predict([input_data_keras])[0][0],
            "LSTM+": lstm_improved.predict([input_data_keras])[0][0],
            "SVM": svm.predict(input_data_sk)[0],
            "SGD": sgd.predict(input_data_sk)[0]}
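
# Usage sketch (not from the original source): turn the per-model scores into a
# simple majority vote. This assumes the Keras models return a probability in
# [0, 1] and the scikit-learn models a hard 0/1 label; the 0.5 threshold is an
# assumption, not taken from the code above.
if __name__ == '__main__':
    scores = predict("example tweet text")
    print(scores)
    labels = [int(v > 0.5) for v in scores.values()]
    print("ensemble label:", int(sum(labels) > len(labels) / 2))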
def get_embedding_matrix(word_dim, mode, vocab_size):
    if mode == modekeys.TRAIN:
        vocab, vocab_dict = helper.load_vocab('twitter_data/rg_vocab.txt')
        glove_vectors, glove_dict = helper.load_glove_vectors('twitter_data/my_vector.txt', vocab)
        initial_value = helper.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, word_dim)
        embedding_w = tf.get_variable(name='embedding_W',
                                      initializer=initial_value,
                                      trainable=True)
    else:
        embedding_w = tf.get_variable(name='embedding_W',
                                      shape=[vocab_size, word_dim],
                                      dtype=tf.float32,
                                      trainable=True)
    return embedding_w
def load_word_embedding(vocab_path, word_embed_path):
    vocabulary, vocab_dict = helper.load_vocab(vocab_path)
    glove_vectors, glove_dict = helper.load_glove_vectors(word_embed_path, vocabulary)
    vocab_size = len(vocabulary)
    word_dim = glove_vectors.shape[1]
    embedding_matrix = helper.build_initial_embedding_matrix(
        vocab_dict=vocab_dict,
        glove_vectors=glove_vectors,
        glove_dict=glove_dict,
        embedding_dim=word_dim)
    embedding_W = tf.get_variable('word_embedding_W',
                                  dtype=tf.float32,
                                  initializer=embedding_matrix,
                                  trainable=False)
    return embedding_W
def get_embeddings(hparams):
    if hparams.glove_path and hparams.vocab_path:
        tf.logging.info("Loading GloVe embeddings...")
        vocab_array, vocab_dict = helper.load_vocab(hparams.vocab_path)
        glove_vectors, glove_dict = helper.load_glove_vectors(
            hparams.glove_path, vocab=set(vocab_array))
        initializer = helper.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim)
    else:
        tf.logging.info("No GloVe/vocab path specified, starting with random embeddings.")
        initializer = tf.random_uniform_initializer(-0.25, 0.25)
    return tf.get_variable("word_embeddings", initializer=initializer, trainable=False)
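
# A sketch (not the project's actual helper) of what build_initial_embedding_matrix
# plausibly does; the signature mirrors the calls above, the internals are an
# assumption: copy the GloVe row for every in-vocabulary word and initialise the
# rest uniformly at random, matching the tf.random_uniform_initializer(-0.25, 0.25)
# fallback used in get_embeddings.
import numpy as np

def build_initial_embedding_matrix_sketch(vocab_dict, glove_dict, glove_vectors, embedding_dim):
    matrix = np.random.uniform(-0.25, 0.25,
                               size=(len(vocab_dict), embedding_dim)).astype(np.float32)
    for word, row in glove_dict.items():
        if word in vocab_dict:
            matrix[vocab_dict[word]] = glove_vectors[row]
    return matrix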
def get_embedding_matrix(word_dim, mode, vocab_size, random_seed, word_embed_path, vocab_path):
    if mode == modekeys.TRAIN:
        vocab, vocab_dict = helper.load_vocab(vocab_path)
        glove_vectors, glove_dict = helper.load_glove_vectors(word_embed_path, vocab)
        initial_value = helper.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, word_dim, random_seed)
        embedding_w = tf.get_variable(name='embedding_W',
                                      initializer=initial_value,
                                      trainable=True)
    else:
        embedding_w = tf.get_variable(name='embedding_W',
                                      shape=[vocab_size, word_dim],
                                      dtype=tf.float32,
                                      trainable=False)
    return embedding_w
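
# Hypothetical call site for the loader above (paths and hyper-parameters are
# placeholders, not from the original source): the returned variable is
# typically consumed with an embedding lookup over a batch of word ids.
# embedding_w = get_embedding_matrix(word_dim=300, mode=modekeys.TRAIN,
#                                    vocab_size=20000, random_seed=42,
#                                    word_embed_path='twitter_data/my_vector.txt',
#                                    vocab_path='twitter_data/rg_vocab.txt')
# embedded_inputs = tf.nn.embedding_lookup(embedding_w, src_word_ids)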
for p in ps:
    source_seq = [w2i_source[w] for w in doc_source[p].split()] + \
        [w2i_source["<PAD>"]] * (max_source_len - len(doc_source[p].split()))
    target_seq = [w2i_target[w] for w in doc_target[p].split()] + \
        [w2i_target["<PAD>"]] * (max_target_len - 1 - len(doc_target[p].split())) + \
        [w2i_target["<EOS>"]]
    source_batch.append(source_seq)
    target_batch.append(target_seq)
return source_batch, source_lens, target_batch, target_lens


if __name__ == '__main__':
    print('loading data ...')
    doc_source = helper.load_file('./data/small_vocab_en.txt')
    doc_target = helper.load_file('./data/small_vocab_fr.txt')
    s_token2idx, s_idx2token = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    t_token2idx, t_idx2token = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)

    print('building model...')
    config = config()
    config.source_vocab_size = len(s_token2idx)
    config.target_vocab_size = len(t_token2idx)
    model = Seq2seq(config, t_token2idx, useTeacherForcing=True)

    batches = 10000
    print_every = 100

    print('run model...')
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        losses = []
        total_loss = 0
        for batch in range(batches):
def construct_training_data_batches(config):
    train_src = config['train_src']
    train_tgt = config['train_tgt']
    vocab_src = config['vocab_src']
    vocab_tgt = config['vocab_tgt']
    batch_size = config['batch_size']
    max_sentence_length = config['max_sentence_length']

    vocab_paths = {'vocab_src': vocab_src, 'vocab_tgt': vocab_tgt}
    data_paths = {'train_src': train_src, 'train_tgt': train_tgt}

    src_word2id, tgt_word2id = load_vocab(vocab_paths)
    train_src_sentences, train_tgt_sentences = load_data(data_paths)
    vocab_size = {'src': len(src_word2id), 'tgt': len(tgt_word2id)}
    print("num_vocab_src: ", vocab_size['src'])
    print("num_vocab_tgt: ", vocab_size['tgt'])

    train_src_word_ids = []  # num_sentences x max_sentence_length
    train_tgt_word_ids = []  # num_sentences x max_sentence_length
    train_src_sentence_lengths = []
    train_tgt_sentence_lengths = []

    # EOS id
    src_eos_id = src_word2id['</s>']
    tgt_eos_id = tgt_word2id['</s>']

    # Source and target sentences: skip pairs that are too long, map OOV words
    # to <unk>, and pad with </s> up to max_sentence_length.
    for src_sentence, tgt_sentence in zip(train_src_sentences, train_tgt_sentences):
        src_words = src_sentence.split()
        tgt_words = tgt_sentence.split()
        if len(src_words) > max_sentence_length or len(tgt_words) > max_sentence_length:
            continue

        # source
        src_ids = [src_eos_id] * max_sentence_length
        for i, word in enumerate(src_words):
            src_ids[i] = src_word2id[word] if word in src_word2id else src_word2id['<unk>']
        train_src_word_ids.append(src_ids)
        train_src_sentence_lengths.append(len(src_words) + 1)  # include one EOS

        # target
        tgt_ids = [tgt_eos_id] * max_sentence_length
        for i, word in enumerate(tgt_words):
            tgt_ids[i] = tgt_word2id[word] if word in tgt_word2id else tgt_word2id['<unk>']
        train_tgt_word_ids.append(tgt_ids)
        train_tgt_sentence_lengths.append(len(tgt_words) + 1)  # include one EOS

    assert len(train_src_word_ids) == len(train_tgt_word_ids), \
        "len(train_src_word_ids) != len(train_tgt_word_ids)"
    num_training_sentences = len(train_src_word_ids)  # only those that are not too long
    print("num_training_sentences: ", num_training_sentences)

    # shuffle
    _x = list(zip(train_src_word_ids, train_tgt_word_ids,
                  train_src_sentence_lengths, train_tgt_sentence_lengths))
    random.shuffle(_x)
    train_src_word_ids, train_tgt_word_ids, \
        train_src_sentence_lengths, train_tgt_sentence_lengths = zip(*_x)

    batches = []
    for i in range(int(num_training_sentences / batch_size)):
        i_start = i * batch_size
        i_end = i_start + batch_size
        batch = {
            'src_word_ids': train_src_word_ids[i_start:i_end],
            'tgt_word_ids': train_tgt_word_ids[i_start:i_end],
            'src_sentence_lengths': train_src_sentence_lengths[i_start:i_end],
            'tgt_sentence_lengths': train_tgt_sentence_lengths[i_start:i_end],
        }
        batches.append(batch)

    return batches, vocab_size, src_word2id, tgt_word2id
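
# Usage sketch (not from the original source): one training epoch over the
# batches returned above. `model` and its placeholder/op names (src_word_ids,
# tgt_word_ids, train_op, loss, ...) are assumptions modelled on the
# translate() functions later in this section.
def run_one_epoch(sess, model, batches):
    total_loss = 0.0
    for batch in batches:
        feed = {model.src_word_ids: batch['src_word_ids'],
                model.tgt_word_ids: batch['tgt_word_ids'],
                model.src_sentence_lengths: batch['src_sentence_lengths'],
                model.tgt_sentence_lengths: batch['tgt_sentence_lengths']}
        _, loss = sess.run([model.train_op, model.loss], feed_dict=feed)
        total_loss += loss
    return total_loss / len(batches)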
import random
import time

import tensorflow as tf

from model import Seq2seq
import helper
from train import config, get_batch

tf_config = tf.ConfigProto(allow_soft_placement=True)
tf_config.gpu_options.allow_growth = True
model_path = "checkpoint/model.ckpt"

if __name__ == "__main__":
    print("(1) load data......")
    docs_source = ['new jersey is usually hot during autumn , and it is never quiet in winter .\n']
    docs_target = ["new jersey est généralement chaud pendant l' automne , et il est jamais calme en hiver .\n"]
    w2i_source, i2w_source = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    w2i_target, i2w_target = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)

    print("(2) build model......")
    config = config()
    config.source_vocab_size = len(w2i_source)
    config.target_vocab_size = len(w2i_target)
    model = Seq2seq(config, w2i_target, useTeacherForcing=False)

    print("(3) run model......")
    print_every = 100
    max_target_len = 20

    with tf.Session(config=tf_config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device
    else:  # development only, e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # choose the device (GPU) here

    sess_config = tf.ConfigProto()

    vocab_paths = {'vocab_src': config['vocab_src'], 'vocab_tgt': config['vocab_tgt']}
    src_word2id, tgt_word2id = load_vocab(vocab_paths)
    tgt_id2word = list(tgt_word2id.keys())

    params = {'vocab_src_size': len(src_word2id),
              'vocab_tgt_size': len(tgt_word2id),
              'go_id': tgt_word2id['<go>'],
              'eos_id': tgt_word2id['</s>']}

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                              config['max_sentence_length'])
        num_sentences = len(src_sent_ids)
        batch_size = 1000
        num_batches = int(num_sentences / batch_size) + 1
        print('num_batches =', num_batches)

        beam_width = config['beam_width']
        outputs = []

        for i in range(num_batches):
            i_start = batch_size * i
            i_end = i_start + batch_size if i_start + batch_size <= num_sentences else num_sentences
            translate_dict = {model.src_word_ids: src_sent_ids[i_start:i_end],
                              model.src_sentence_lengths: src_sent_len[i_start:i_end],
                              model.dropout: 0.0}
            predicted_ids = sess.run(model.predicted_ids, feed_dict=translate_dict)

            # Collect the top-k beam hypotheses for each sentence, cutting each
            # translation at the first EOS token.
            for sentence in predicted_ids:
                beam = []
                for k in range(beam_width):
                    translation = sentence[:, k]
                    words = []
                    for id in translation:
                        if id == params['eos_id']:
                            break
                        words.append(tgt_id2word[id])
                    beam.append(words)
                outputs.append(beam)
            print('#', end='')
            sys.stdout.flush()

        print("num outputs: ", len(outputs))

    # Write one line per beam hypothesis, uppercased and wrapped in <s> ... </s>.
    with open(config['tgtfile'], 'w', encoding="utf8") as file:
        for output in outputs:
            for beam in output:
                x = "<s> " + " ".join(beam[:-1]).upper() + " </s>\n"
                file.write(x)
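
# Sketch (not in the original source) of reading the n-best file written above
# back in: each source sentence contributes beam_width consecutive lines, so the
# hypotheses can be regrouped by slicing. beam_width=10 is an assumption here;
# the actual value comes from config['beam_width'].
def read_nbest(path, beam_width=10):
    with open(path, encoding='utf8') as f:
        lines = [line.strip() for line in f]
    return [lines[i:i + beam_width] for i in range(0, len(lines), beam_width)]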
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device
    else:  # development only, e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = ''  # run on CPU; set a GPU id here if available

    sess_config = tf.ConfigProto()

    vocab_paths = {'vocab_src': config['vocab_src'], 'vocab_tgt': config['vocab_tgt']}
    src_word2id, tgt_word2id = load_vocab(vocab_paths)
    tgt_id2word = list(tgt_word2id.keys())

    params = {'vocab_src_size': len(src_word2id),
              'vocab_tgt_size': len(tgt_word2id),
              'go_id': tgt_word2id['<go>'],
              'eos_id': tgt_word2id['</s>']}

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                              config['max_sentence_length'],
                                              config['spellcheck'])
        num_sentences = len(src_sent_ids)
        # config['batch_size'] may be too small (inefficient); decoding needs far
        # less memory than training, so a fixed batch size of 100 is used instead.
        batch_size = 100
        num_batches = int(num_sentences / batch_size) + 1
        print('num_batches =', num_batches)

        tgt_lines = []
        for i in range(num_batches):
            i_start = batch_size * i
            i_end = i_start + batch_size if i_start + batch_size <= num_sentences else num_sentences
            translate_dict = {model.src_word_ids: src_sent_ids[i_start:i_end],
                              model.src_sentence_lengths: src_sent_len[i_start:i_end],
                              model.dropout: 0.0}
            [translations] = sess.run([model.translations], feed_dict=translate_dict)

            # Cut each greedy translation at the first EOS token.
            for translation in translations:
                words = []
                for id in translation:
                    if id == params['eos_id']:
                        break
                    words.append(tgt_id2word[id])
                tgt_lines.append(' '.join(words))
            print('#')
            sys.stdout.flush()

    with open(config['tgtfile'], 'w') as file:
        for line in tgt_lines:
            file.write(line + '\n')
    print('translation done!')
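
# Example config for translate() above (not from the original source: all paths
# are placeholders; the keys match the lookups the function performs):
example_config = {
    'vocab_src': 'data/vocab.src',
    'vocab_tgt': 'data/vocab.tgt',
    'load': 'checkpoints/run1',
    'model_number': None,          # falls back to num_epochs - 1
    'num_epochs': 10,
    'srcfile': 'data/test.src',
    'tgtfile': 'output/test.hyp',
    'max_sentence_length': 32,
    'spellcheck': False,
}
# translate(example_config)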