def skipthoughts_articles(articles, max_title_sentences=None, max_article_sentences=None):
    """
    Filter articles so that we have at most `max_title_sentences` sentences in the
    title and `max_article_sentences` sentences in the body of the article. Then, add
    the skipthought vectors for all sentences in the titles and bodies of the articles
    under the `headline_vectors` and `article_vectors` keys.
    """
    article_vectors = []
    st_model = st.load_model(data_path=SKIPTHOUGHTS_DATA)
    for article in tqdm(articles, 'skipthoughts encoding articles'):
        title_sentences = nltk.sent_tokenize(article['Headline'])
        if max_title_sentences is not None and \
                len(title_sentences) > max_title_sentences:
            continue
        article_sentences = nltk.sent_tokenize(article['articleBody'])
        if max_article_sentences is not None and \
                len(article_sentences) > max_article_sentences:
            continue
        vectors = st.encode(st_model, title_sentences + article_sentences,
                            verbose=False, batch_size=128).astype('float16')
        N = len(title_sentences)
        article['headline_vectors'] = vectors[:N]
        article['article_vectors'] = vectors[N:]
        article_vectors.append(article)
    return article_vectors
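# Usage sketch (assumption, not part of the original source): `articles` is taken to be
# a list of dicts with 'Headline' and 'articleBody' string fields, and SKIPTHOUGHTS_DATA
# is assumed to point at the downloaded skip-thoughts model files. The helper name below
# is hypothetical.
def _example_skipthoughts_articles_usage():
    articles = [{'Headline': 'Example headline.',
                 'articleBody': 'First sentence. Second sentence.'}]
    encoded = skipthoughts_articles(articles, max_title_sentences=5, max_article_sentences=40)
    # Each entry now carries float16 matrices with one row per sentence
    # (4800-d with the default combine-skip model).
    return encoded[0]['headline_vectors'].shape, encoded[0]['article_vectors'].shape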
def __init__(self):
    self.trmodel = tools.load_model("data/trainer.npz", "data/dictionary_fry.pkl")
    print("===== Loaded Trained Model =====")
    self.stmodel = skipthoughts.load_model()
    print("===== Loaded Skipthoughts Model =====")
def skipthought_encode(answers):
    """
    Obtains sentence embeddings for each sentence in the answers.
    """
    num_answers = len(answers)
    enc_answers = [None] * num_answers
    cum_sum_sentences = [0]
    sent_count = 0
    for answer in answers:
        sent_count += len(answer)
        cum_sum_sentences.append(sent_count)
    all_sentences = [sent for answer in answers for sent in answer]
    print('Loading pre-trained models...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)
    for i in range(num_answers):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i + 1]
        enc_answers[i] = enc_sentences[begin:end]
    return enc_answers
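# Usage sketch (assumption, not part of the original source): `answers` is a list of
# answers, each already split into a list of sentences; the result keeps one embedding
# matrix per answer, aligned with that grouping. The helper name is hypothetical.
def _example_skipthought_encode_usage():
    answers = [["First sentence.", "Second sentence."], ["Only sentence."]]
    enc_answers = skipthought_encode(answers)
    # enc_answers[0] has one row per sentence (4800-d with the default combine-skip model).
    return enc_answers[0].shape, enc_answers[1].shape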
def get_test_sent(test_file):
    with open(test_file, "r") as f:
        test_sent = []
        for row in f.read().splitlines():
            test_sent.append(row.split(",")[1])
    model = skipthoughts.load_model()
    vecs = skipthoughts.encode(model, test_sent)
    return vecs
def _init_skip_thoughts():
    global skip_thoughts_model
    if skip_thoughts_model:
        return
    global skipthoughts
    from skipthoughts import skipthoughts
    skip_thoughts_model = skipthoughts.load_model()
def build_imgs():
    with open('./data/tags_clean.csv', 'r') as tag_file:
        tag_reader = csv.reader(tag_file, delimiter='\t')
        img_objs = []
        colors = ["red", "orange", "yellow", "green", "blue", "purple",
                  "blonde", "pink", "black", "white", "brown"]
        num = 0
        print("generate captions from training tags.....")
        for row in tag_reader:
            img_id = row[0].split(',')[0]
            tag_row = [row[0].split(',')[1]] + row[1:]
            img = skimage.io.imread('./data/faces/{}.jpg'.format(int(img_id)))
            img = skimage.transform.resize(img, (64, 64))
            match_sent = []
            mismatch_sent = []
            tag_hair = []
            tag_eyes = []
            for tag in tag_row:
                tag = tag.split(':')[0]
                for color in colors:
                    if "{} hair".format(color) in tag:
                        tag_hair.append(tag)
                    if "{} eyes".format(color) in tag:
                        tag_eyes.append(tag)
            for t_h in tag_hair:
                for t_e in tag_eyes:
                    r = random.random()
                    if r > 0.5:
                        match_sent.append('{} {}'.format(t_h, t_e))
                    else:
                        match_sent.append('{} {}'.format(t_e, t_h))
            if match_sent:
                # print(match_sent)
                img_objs.append(realimg(img, match_sent))
                num += 1
                # if num >= 64: break

    model = skipthoughts.load_model()
    k = 0
    for idx, img_obj1 in enumerate(img_objs):
        find = 0
        for img_obj2 in img_objs[1:]:
            for sent in img_obj2.match_sent:
                if sent not in img_obj1.match_sent:
                    img_objs[idx].wimg.append(img_obj2.img)
                    img_obj1.mismatch_sent.append(sent)
                    find += 1
                    if find >= 1:
                        break
            if find >= 1:
                break
        img_obj1.sent2embed(model)
        print("{}/{}".format(k, len(img_objs)))
        k += 1

    with open("./train_data/img_objs_new.pk", "w") as f:
        pk.dump(img_objs, f)
def init(): """ Initialise the Sent2Vec encoder. This includes loading the model, which may take several minutes! This function must be called before any other function in this module. """ global _encoder model = skipthoughts.load_model() _encoder = skipthoughts.Encoder(model)
def build_model(self):
    self.build_memory()
    # embed()
    self.skip_model = skip.load_model()
    # Use keyword arguments so the labels/logits order is unambiguous across TF versions.
    self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.target, logits=self.logits)
    self.opt = tf.train.GradientDescentOptimizer(self.lr)
    grads = self.opt.compute_gradients(self.loss)
    inc_op = self.global_step.assign_add(1)
    with tf.control_dependencies([inc_op]):
        self.apply_grad_op = self.opt.apply_gradients(grads)
    tf.initialize_all_variables().run()
def load_data():
    ret_data = []
    mapping = pd.ExcelFile("transcriptVoiceMap.xlsx").parse(0)
    # load skipthoughts encoder
    model = st.load_model()
    encoder = st.Encoder(model)
    all_fst_transcript_vectors = encoder.encode([str(x) for x in mapping["fst transcript"]])
    all_snd_transcript_vectors = encoder.encode([str(x) for x in mapping["snd transcript"]])
    # load voice emotions model
    emotions_model = pickle.load(open("emotions_model.sav", 'rb'))
    all_fst_wav_vectors = get_activations(
        emotions_model,
        np.array([get_emotions_vec(x) for x in mapping["fst wav name"]]))
    all_snd_wav_vectors = get_activations(
        emotions_model,
        np.array([get_emotions_vec(x) for x in mapping["snd wav name"]]))
    for idx in range(len(mapping)):
        transcript_vec = all_fst_transcript_vectors[idx]
        snd_transcript_vec = all_snd_transcript_vectors[idx]
        transcript_vec = np.concatenate(
            (np.abs(transcript_vec - snd_transcript_vec),
             transcript_vec * snd_transcript_vec))
        # load agent execution flag (0=executed, 1=not executed):
        output = mapping["output"][idx]
        emotions_vec = all_fst_wav_vectors[idx]
        emotions_vec = np.concatenate((emotions_vec, all_snd_wav_vectors[idx]))
        fst_wav = mapping["fst wav name"][idx]
        snd_wav = mapping["snd wav name"][idx]
        frames_with_speech = count_frames_with_speech(fst_wav)
        frames_with_speech = count_frames_with_speech(snd_wav) - frames_with_speech
        label_idx = mapping["label"][idx]
        label = np.zeros(categories)
        label[label_idx] = 1
        ret_data.append((transcript_vec, [output], emotions_vec,
                         [frames_with_speech], label))
    return ret_data
def create_embedding_npy(json_file=''):
    model = st.load_model()
    eyes_color_list = ['gray', 'aqua', 'orange', 'red', 'blue', 'black',
                       'pink', 'green', 'brown', 'purple', 'yellow']
    hair_color_list = ['gray', 'aqua', 'pink', 'white', 'red', 'purple',
                       'blue', 'black', 'green', 'brown', 'orange']
    fidx2arridx_dict = {}
    jobj = json.load(open(json_file, 'r'))
    tag_strs = []
    count = 0
    for fidx, color_d in jobj.items():
        if len(color_d['eyes']) == 1 and len(color_d['hair']) == 1:
            eyes_color = eyes_color_list[color_d['eyes'][0]]
            hair_color = hair_color_list[color_d['hair'][0]]
            tag_str = ' '.join([hair_color, 'hair', eyes_color, 'eyes'])
            tag_strs.append(tag_str)
            fidx2arridx_dict[fidx] = count
            count += 1
    tag_embeddings = st.encode(model, tag_strs)
    print(tag_embeddings.shape)
    print(len(fidx2arridx_dict))
    with open('fidx2arridx.json', 'w') as f:
        json.dump(fidx2arridx_dict, f)
    np.save('tags_embedding.npy', tag_embeddings)
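# Read-back sketch (assumption, not part of the original source): how the files written
# above could be consumed. `some_fidx` is a hypothetical key from the tag json, and the
# helper name is invented for illustration.
def _example_load_tag_embedding(some_fidx):
    tag_embeddings = np.load('tags_embedding.npy')
    with open('fidx2arridx.json', 'r') as f:
        fidx2arridx = json.load(f)
    # Row of the embedding matrix for the image whose file index is `some_fidx`.
    return tag_embeddings[fidx2arridx[some_fidx]]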
def main():
    caption_file = "captions.txt"
    training_image_file = "train_images4.txt"
    captions = []
    with open(caption_file) as f:
        line_list = f.read().split("\n")
    line_list = line_list[7500:9000]
    f1 = open(training_image_file, "w")
    for i in range(len(line_list)):
        img = line_list[i].split("\t")[0]
        cap = line_list[i].split("\t")[1]
        if len(cap) > 0:
            captions.append(cap)
            f1.write(img + "\n")
    f1.close()
    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)
    h = h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "w")
    h.create_dataset("vectors", data=caption_vectors)
    h.close()
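# Read-back sketch (assumption, not part of the original source): loading the caption
# vectors written by main() above from the same HDF5 path. The helper name is hypothetical.
def _example_load_caption_vectors():
    with h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "r") as h:
        caption_vectors = h["vectors"][:]
    return caption_vectors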
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate a model on the quora question-pair dataset.')
    parser.add_argument('--quora-data-dir', required=True,
                        help='path to the directory containing the quora data')
    parser.add_argument('--st-model-dir', required=True,
                        help='path to the directory containing the skipthoughts model')
    parser.add_argument('--output-dir', default='.',
                        help='path to the directory to write to')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    output_file = os.path.join(args.output_dir, 'oov_stats.txt')
    with log.FileWriterStdoutPrinter(output_file) as writer:
        print("Loading skipthoughts model...")
        model = st.load_model(args.st_model_dir)
        print("Initializing skipthoughts word dict...")
        word_dict = st.init_word_dict(model)
        print("Analyzing word dict...")
        analyze_dict(word_dict, writer)
        print("Loading training set...")
        train = du.load_csv(os.path.join(args.quora_data_dir, 'train.csv'))
        writer.emit_line("Analyzing word counts in train.csv...")
        analyze_oov(word_dict, train, writer, args.output_dir, 'train')
        # Be sure to write data to disk for train before moving on to test,
        # which is much bigger.
        print("Loading test set...")
        test = du.load_csv(os.path.join(args.quora_data_dir, 'test.csv'))
        writer.emit_line("Analyzing word counts in test.csv...")
        analyze_oov(word_dict, test, writer, args.output_dir, 'test')
def load_encoder(model_dir):
    model = st.load_model(model_dir)
    en = st.Encoder(model)
    return en
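# Usage sketch (assumption, not part of the original source): the path below is
# hypothetical and should point at a downloaded skip-thoughts model directory; the
# helper name is invented for illustration.
def _example_encode_with_loaded_encoder():
    en = load_encoder('/path/to/skipthoughts/data')
    # Encoder.encode returns one embedding per input sentence.
    return en.encode(['a sentence to embed', 'another sentence'])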
def gen_model(): """ Get the Skipthoughts model to be used in encoding """ model = skipthoughts.load_model() return model
from __future__ import print_function

from gutenburg import Bookshelf
from skipthoughts import skipthoughts
import numpy as np
from unidecode import unidecode
import progressbar as PB
import os
import time
import string
from nltk import sent_tokenize

skipthoughts_model = skipthoughts.load_model(
    data_path="/home/micha/work/tldr/skipthoughts/data/"
)
EOP = np.ones(4800)
EOC = -1 * np.ones(4800)


def null(*args, **kwargs):
    pass


print = null


def skipthoughts_encode(sentences, model=skipthoughts_model):
    vectors = None
    print(time.time(), len(sentences), "encoding")
    vectors = skipthoughts.encode(
        model, sentences, preprocess=lambda x: x, use_norm=False, verbose=False
    )
    print(time.time(), vectors.shape, "done")
    for i, sent in enumerate(sentences):
import skipthoughts.skipthoughts as skipthoughts

model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)

import skipthoughts.eval_sick2 as eval_sick2
from keras.models import load_model
import gensim
import os
import collections
import smart_open
import random
import time
import numpy as np
import sys

# limit gpu usage
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

prefix = sys.argv[1]
q_file = prefix + '_q.txt.flat.filter'
a_file = prefix + '_a.txt.flat.filter'
# q_speaker = prefix + '_q_speaker.txt'
# a_speaker = prefix + '_a_speaker.txt'
# speaker = sys.argv[2]
top = int(sys.argv[2])
# q_para_file = prefix + '_q_paragraph.txt'