def do_eval(test_data_path, shuffle=False):
    if FLAGS.load_model is None:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load testing data
    question_1, question_2, labels = get_input_from_csv(test_data_path)
    if shuffle:
        question_1, question_2, labels = shuffle_data(question_1, question_2, labels)

    # Load pre-trained GloVe vectors via spaCy
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for GloVe
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    labels = to_categorical(np.asarray(labels, dtype='int32'))

    scores = model.evaluate([q1_test, q2_test], labels, batch_size=FLAGS.batch_size, verbose=1)
    print("=================== RESULTS =====================")
    print("[*] LOSS OF TEST DATA: %.4f" % scores[0])
    print("[*] ACCURACY OF TEST DATA: %.4f" % scores[1])
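This snippet, and the other ESIM snippets below, build their embedding matrix with load_glove_embeddings(nlp.vocab, n_unknown=...). As a rough illustration only, a stand-in for that helper could copy the GloVe vectors bundled with the spaCy vocab into one matrix and reserve a few leading rows for padding and hashed unknown tokens; the function below is an assumption, not the project's actual implementation.

import numpy as np

def load_glove_embeddings(vocab, n_unknown=100):
    """Hypothetical stand-in: copy spaCy's bundled GloVe vectors into one matrix.

    Rows 0..n_unknown are reserved for padding and hashed OOV tokens; row layout
    and sizes are assumptions for illustration only.
    """
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    matrix = np.zeros((max_rank + n_unknown + 2, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            matrix[lex.rank + n_unknown + 1] = lex.vector
    return matrix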
def decode():
    embed_path = FLAGS.embed_path or pjoin(
        "data", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = utils.load_glove_embeddings(embed_path)

    with tf.Session() as sess:
        # Create model and load parameters.
        # model = create_model(sess, embeddings, True)
        # model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
        en_vocab, rev_fr_vocab = preprocess_data.initialize_vocabulary(vocab_path)
        FLAGS.vocab_size = len(en_vocab)

        print("embeddings.shape[0]: " + str(embeddings.shape[0]))
        print("len(en_vocab): " + str(len(en_vocab)))
        assert embeddings.shape[0] == len(en_vocab)

        model = create_model(sess, embeddings, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = preprocess_data.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if preprocess_data.EOS_ID in outputs:
                outputs = outputs[:outputs.index(preprocess_data.EOS_ID)]
            # Print out the French sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
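decode(), like the train() function further down, expects utils.load_glove_embeddings(embed_path) to return a dense matrix whose row count matches the vocabulary size (that is what the assert checks). A minimal sketch of such a loader, assuming the trimmed embeddings were saved with np.savez under a 'glove' key (the key name is an assumption and may differ in the actual project):

import numpy as np

def load_glove_embeddings(embed_path):
    """Hypothetical loader for a pre-trimmed GloVe matrix stored in an .npz archive."""
    # 'glove' is an assumed key; use whatever name the preprocessing step saved under.
    return np.load(embed_path)['glove']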
def train(train_data, val_data, batch_size, n_epochs, save_dir=None):
    # Stage 1: Read training data (csv) && preprocess it
    tf.logging.info('Loading training and validation data ...')
    train_question_1, train_question_2, train_labels = get_input_from_csv(train_data)
    # val_question_1, val_question_2, val_labels = get_input_from_csv(val_data)

    # Stage 2: Load pre-trained embedding matrix (using GloVe here)
    tf.logging.info('Loading pre-trained embedding matrix ...')
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for GloVe
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    # Stage 3: Build model
    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        model = esim.build_model()

    # Stage 4: Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    tf.logging.info('Converting questions into ids ...')
    q1_train, q2_train = convert_questions_to_word_ids(train_question_1, train_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    train_labels = to_categorical(np.asarray(train_labels, dtype='int32'))
    # q1_val, q2_val = convert_questions_to_word_ids(val_question_1, val_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    # val_labels = to_categorical(np.asarray(val_labels, dtype='int32'))

    # Stage 5: Training
    tf.logging.info('Start training ...')
    callbacks = []

    save_dir = save_dir if save_dir is not None else 'checkpoints'
    filepath = os.path.join(save_dir, "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks.append(checkpoint)

    if FLAGS.tensorboard:
        graph_dir = os.path.join('.', 'GRAPHs')
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        tb = TensorBoard(log_dir=graph_dir, histogram_freq=0, write_graph=True, write_images=True)
        callbacks.append(tb)

    model.fit(
        x=[q1_train, q2_train],
        y=train_labels,
        batch_size=batch_size,
        epochs=n_epochs,
        # validation_data=([q1_val, q2_val], val_labels),
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
        verbose=FLAGS.verbose
    )
def forward(self, char_encoded, C_lengths, raw_sentences):
    """
    Pass the input sentences through the GRU layers.

    :param char_encoded: character-encoded batch of sentences
    :param C_lengths: lengths of the character sequences
    :param raw_sentences: batch of raw sentences
    :return: shared BiRNN representations for the batch
    """
    batch_size = len(raw_sentences)

    # Build the word representations from ELMo, GloVe, char-RNN and one-hot embeddings.
    elmo_embeddings = load_elmo_embeddings(raw_sentences).to(self.device)
    glove_embeddings = load_glove_embeddings(raw_sentences).to(self.device)
    char_embeddings = self.charRNN(char_encoded, C_lengths).to(self.device)
    one_hot_embeddings = load_onehot_embeddings(raw_sentences).to(self.device)

    # Reshape the flat character embeddings to (batch, words, char_dim).
    num_words, char_dim = char_embeddings.size()
    char_embeddings = char_embeddings.view(batch_size, num_words // batch_size, char_dim)

    # Concatenate all embeddings along the feature dimension.
    final_embeddings = torch.cat([elmo_embeddings, glove_embeddings, char_embeddings, one_hot_embeddings], dim=2)

    # Dropout before the BiRNN
    final_embeddings = self.dropout(final_embeddings)

    # Get the shared layer representations.
    shared_output, _ = self.wordRNN(final_embeddings)
    return shared_output
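The concatenation in forward() only works if all four embedding tensors agree on the batch and word dimensions; the feature dimension is whatever each embedder produces and simply adds up. A small shape check with illustrative (assumed) dimensions:

import torch

# Illustrative dims only: batch=2 sentences, 5 words each.
elmo = torch.zeros(2, 5, 1024)    # e.g. ELMo
glove = torch.zeros(2, 5, 300)    # e.g. GloVe
char = torch.zeros(2, 5, 100)     # char-RNN output after the view()
onehot = torch.zeros(2, 5, 50)    # one-hot features
final = torch.cat([elmo, glove, char, onehot], dim=2)
print(final.shape)                # torch.Size([2, 5, 1474])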
def do_pred(test_data_path):
    if FLAGS.load_model is None:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load testing data
    question_1, question_2 = get_test_from_csv(test_data_path)

    # Load pre-trained GloVe vectors via spaCy
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for GloVe
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Convert the raw questions to word-id sequences
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)

    predictions = model.predict([q1_test, q2_test])
    print("[*] Prediction results: \n", predictions[0])

    for i in range(len(q1_test)):
        print("=============== %d Prediction ===============" % i)
        print("Q1: %s" % question_1[i])
        print("Q2: %s" % question_2[i])
        if np.argmax(predictions[i]) == 1:
            print("IS_DUPLICATE: YES score: %.6f" % predictions[i][1])
        else:
            print("IS_DUPLICATE: NO score: %.6f" % predictions[i][0])
def loadwordmodel(self, wordembfile, destfile, wordembsize, log, device):
    if not os.path.exists(destfile):
        log.info('loading pre-trained word embeddings from ' + wordembfile + '... (takes several minutes)')
        if os.path.exists(wordembfile) and 'fasttext' in wordembfile:
            from gensim.models import FastText
            wordvectors = FastText.load_fasttext_format(wordembfile)
        elif os.path.exists(wordembfile) and 'glove' in wordembfile:
            wordvectors = utils.load_glove_embeddings(wordembfile)
        else:
            log.error('word embedding model ' + wordembfile + ' cannot be found!')
            sys.exit()

        word_embs = []
        c = 0
        for i, idx in enumerate(self.idx2word):
            try:
                word_embs.append(torch.from_numpy(wordvectors[idx]).float())
            except KeyError:
                c += 1
                word_embs.append(torch.zeros(wordembsize))
        log.info('number of words without a pretrained word embedding: ' + str(c) + '/' + str(len(self.idx2word)))

        self.word_embs = torch.stack(word_embs)
        self.word_embs[0].fill_(0)  # fill embedding for <PAD> with 0s
        torch.save(self.word_embs, destfile)
    else:
        log.info('loading pre-trained word embeddings from ' + wordembfile + '...')
        self.word_embs = torch.load(destfile)

    log.info('loaded pre-trained word vectors successfully!')
    if device >= 0:
        self.word_embs = self.word_embs.to('cuda:' + str(device))
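loadwordmodel() indexes wordvectors like a dictionary and relies on a KeyError for words without a pretrained vector. A minimal sketch of a utils.load_glove_embeddings that satisfies that contract, assuming a plain-text GloVe file with one word followed by its floats per line (an assumption, not the project's actual code):

import numpy as np

def load_glove_embeddings(path):
    """Hypothetical loader: parse a plain-text GloVe file into a {word: vector} dict.

    Missing words are handled by the caller via KeyError, matching how
    loadwordmodel() above consumes the result.
    """
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return vectors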
word_index = imdb.get_word_index(os.path.join(project_path, 'data/imdb_word_index.json'))
word_inverted_index = {v: k for k, v in word_index.items()}

# The first indexes in the map are reserved to represent things other than tokens
index_offset = 3
word_inverted_index[-1 - index_offset] = '_'  # Padding at the end
word_inverted_index[ 1 - index_offset] = '>'  # Start of the sentence
word_inverted_index[ 2 - index_offset] = '?'  # OOV
word_inverted_index[ 3 - index_offset] = ''   # Un-used

x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])

embedding_matrix = load_glove_embeddings('data/glove.6B.50d.txt', word_index, vocab_size, embedding_size)
params = {'embedding_initializer': embedding_matrix}

lstm_classifier = tf.estimator.Estimator(model_fn=lstm_model_fn,
                                         model_dir=os.path.join(model_dir, 'cnn_pretrained'),
                                         params=params)

# Save a reference to the classifier to run predictions later
lstm_classifier.train(input_fn=train_input_fn(x_train, x_len_train, y_train, x_train_variable), steps=500)
eval_results = lstm_classifier.evaluate(input_fn=eval_input_fn(x_test, x_len_test, y_test))
predictions = np.array([p['logistic'][0]
                        for p in lstm_classifier.predict(input_fn=eval_input_fn(x_test, x_len_test, y_test))])

tf.reset_default_graph()
pr = summary_lib.pr_curve('precision_recall',
                          predictions=predictions,
                          labels=y_test.astype(bool),
                          num_thresholds=21)
with tf.Session() as sess:
    writer = tf.summary.FileWriter(os.path.join(lstm_classifier.model_dir, 'eval'), sess.graph)
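This snippet passes load_glove_embeddings('data/glove.6B.50d.txt', word_index, vocab_size, embedding_size) straight into the Estimator params as an embedding initializer, so the helper has to return a (vocab_size, embedding_size) matrix whose rows follow the Keras word_index ids. A hedged sketch of such a builder (an assumption, not the original implementation):

import numpy as np

def load_glove_embeddings(path, word_index, vocab_size, embedding_size):
    """Hypothetical matrix builder matching the call above.

    Rows follow word_index ids; words without a GloVe vector keep a zero row.
    """
    # Read the plain-text GloVe file into a {word: vector} dict.
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    # Fill the embedding matrix for the vocabulary actually used.
    matrix = np.zeros((vocab_size, embedding_size), dtype=np.float32)
    for word, idx in word_index.items():
        if idx < vocab_size and word in vectors:
            matrix[idx] = vectors[word]
    return matrix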
def train():
    """Train a en->fr translation model using WMT data."""
    data_config = DataConfig(FLAGS.data_dir)
    logFile = open('data/log.txt', 'w')

    embed_path = FLAGS.embed_path or pjoin(
        "data", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = utils.load_glove_embeddings(embed_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = preprocess_data.initialize_vocabulary(vocab_path)
    FLAGS.vocab_size = len(vocab)

    print(embeddings.shape[0], len(vocab))
    assert embeddings.shape[0] == len(vocab)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True)) as sess:
        # Create model.
        with tf.device('/gpu:1'):
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, embeddings, False)

        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        logging.info("Number of params: %d (retrieval took %f secs)" % (num_params, toc - tic))

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." % FLAGS.max_train_data_size)
        dev_set = read_data(data_config.val_from, data_config.val_to)
        train_set = read_data(data_config.train_from, data_config.train_to, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        breakCount = 0
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            if current_step % FLAGS.steps_per_print == 0:
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step_loss %.2f perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_loss, perplexity))

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                print("checkpoint here")
                print("====== global step %d learning rate %.4f step-time %.2f perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                logFile.write("====== global step %d learning rate %.4f step-time %.2f perplexity %.2f"
                              % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                logFile.write("\n")

                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                if perplexity < 2:
                    breakCount += 1
                    print("breakCount ", breakCount)
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                    eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                    print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                sys.stdout.flush()

                if breakCount > 20:
                    print("successfully breakdown")
                    logFile.close()
                    break
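The bucket-selection logic in train() builds a cumulative distribution over bucket sizes and samples from it, so larger buckets are chosen proportionally more often. A short worked example with made-up bucket sizes shows the arithmetic:

import numpy as np

# Illustrative bucket sizes only (not from the actual dataset).
train_bucket_sizes = [100, 300, 600]
train_total_size = float(sum(train_bucket_sizes))           # 1000.0
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
print(train_buckets_scale)                                   # [0.1, 0.4, 1.0]

random_number_01 = np.random.random_sample()                 # uniform in [0, 1)
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
# Bucket i is picked with probability proportional to its size
# (0.1, 0.3 and 0.6 here), which is what the training loop relies on.
print(bucket_id)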