def sent2embed(self, model):
    # Encode the matched and mismatched sentence lists with skip-thoughts,
    # skipping either list if it is empty.
    match_sent = self.match_sent
    if match_sent:
        self.match_embed = skipthoughts.encode(model, match_sent)
    mismatch_sent = self.mismatch_sent
    if mismatch_sent:
        self.mismatch_embed = skipthoughts.encode(model, mismatch_sent)
def encode(self, inputs):
    # Skip-thought encode story, query, and answer sentences, then reshape
    # each into a [batch, n_sentences, embed_dim] array for the model.
    story = skip.encode(self.skip_model, inputs.story)
    story = np.asarray(story, dtype=np.float32).reshape(
        [self.batch_size, self.nstory, -1])
    query = skip.encode(self.skip_model, inputs.query)
    query = np.asarray(query, dtype=np.float32).reshape(
        [self.batch_size, 1, -1])
    answer = skip.encode(self.skip_model, inputs.answer)
    answer = np.asarray(answer, dtype=np.float32).reshape(
        [self.batch_size, self.nanswer, -1])
    target = np.asarray(inputs.target, dtype=np.int64).reshape(
        [self.batch_size])
    return story, query, answer, target
def skipthoughts_articles(articles, max_title_sentences=None,
                          max_article_sentences=None):
    """
    Filter articles so that we have at most `max_title_sentences` sentences
    in the title and `max_article_sentences` sentences in the body of the
    article. Then add the skip-thought vectors for all sentences in the
    titles and bodies of the articles under the `headline_vectors` and
    `article_vectors` keys.
    """
    article_vectors = []
    st_model = st.load_model(data_path=SKIPTHOUGHTS_DATA)
    for article in tqdm(articles, 'skipthoughts encoding articles'):
        title_sentences = nltk.sent_tokenize(article['Headline'])
        if max_title_sentences is not None and \
                len(title_sentences) > max_title_sentences:
            continue
        article_sentences = nltk.sent_tokenize(article['articleBody'])
        if max_article_sentences is not None and \
                len(article_sentences) > max_article_sentences:
            continue
        # Encode title and body sentences in one batch, then split the
        # result by the number of title sentences.
        vectors = st.encode(st_model, title_sentences + article_sentences,
                            verbose=False, batch_size=128).astype('float16')
        N = len(title_sentences)
        article['headline_vectors'] = vectors[:N]
        article['article_vectors'] = vectors[N:]
        article_vectors.append(article)
    return article_vectors
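# A minimal usage sketch for skipthoughts_articles. The sample article dict
# below is hypothetical; only the 'Headline' and 'articleBody' keys are
# assumed, matching the fields the function reads above, and SKIPTHOUGHTS_DATA
# must already point at the downloaded skip-thoughts model files.
sample_articles = [{
    'Headline': 'Model encodes sentences. Results look reasonable.',
    'articleBody': 'Skip-thought vectors are computed per sentence. '
                   'They are stored on the article dict.',
}]
encoded = skipthoughts_articles(sample_articles, max_title_sentences=3,
                                max_article_sentences=50)
print(encoded[0]['headline_vectors'].shape)  # (n_title_sentences, dim)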
def get_test_sent(test_file):
    # Read the second comma-separated field of each line as a test sentence,
    # then encode all sentences with a freshly loaded skip-thoughts model.
    with open(test_file, "r") as f:
        test_sent = []
        for row in f.read().splitlines():
            test_sent.append(row.split(",")[1])
    model = skipthoughts.load_model()
    vecs = skipthoughts.encode(model, test_sent)
    return vecs
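# Hypothetical usage: a two-column CSV where the sentence sits in the second
# field, mirroring the row.split(",")[1] parsing above. The filename is an
# assumption for illustration.
#
#   1,the cat sat on the mat
#   2,a dog ran across the yard
vecs = get_test_sent("test_sentences.csv")
print(vecs.shape)  # (n_sentences, 4800) for the combined skip-thought model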
def gen_encodings(model, sources, category):
    """
    Generate encodings in advance. This can save computation time when
    training multiple times. Not used in this demo.
    """
    result = skipthoughts.encode(model, sources, use_norm=True, verbose=True)
    np.save("data/source_" + category + "_encodings.npy", result)
    return result
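# A small sketch of precomputing encodings with gen_encodings; the sentence
# list and category name are made up for illustration.
model = skipthoughts.load_model()
encodings = gen_encodings(model, ["first source sentence.",
                                  "second source sentence."], "train")
# Later runs can reload the cached vectors instead of re-encoding:
cached = np.load("data/source_train_encodings.npy")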
def skipthoughts_encode(sentences, model=skipthoughts_model):
    print(time.time(), len(sentences), "encoding")
    vectors = skipthoughts.encode(
        model, sentences, preprocess=lambda x: x, use_norm=False,
        verbose=False
    )
    print(time.time(), vectors.shape, "done")
    # Replace the vectors of end-of-paragraph / end-of-chapter marker
    # sentences with the fixed EOP / EOC embeddings.
    for i, sent in enumerate(sentences):
        if sent == "EOP":
            vectors[i, :] = EOP
        elif sent == "EOC":
            vectors[i, :] = EOC
    return vectors
def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000):
    """
    Put data into a format usable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append([worddict[w] if worddict[w] < n_words else 1
                     for w in cc.split()])
        feat_list.append(features[i])
    lengths = [len(s) for s in seqs]

    # Drop examples that are at least `maxlen` words long.
    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs
        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    feat_list = skipthoughts.encode(model, feat_list, use_eos=False,
                                    verbose=False)
    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx, :] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths) + 1
    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx] + 1, idx] = 1.
    return x, x_mask, y
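# Hedged usage sketch for prepare_data: the captions, feature sentences, and
# word indices below are toy values; a real worddict would come from a built
# vocabulary, and `features` holds the sentences to be skip-thought encoded
# as regression targets.
worddict = {'a': 2, 'dog': 3, 'runs': 4}
caps = ['a dog runs']
features = ['a dog runs']
model = skipthoughts.load_model()
x, x_mask, y = prepare_data(caps, features, worddict, model, maxlen=30)
# x: (maxlen_batch, n_samples) int64 word indices; x_mask marks valid steps;
# y: (n_samples, 4800) skip-thought vectors for the combined model.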
def skip_thoughts_vecs(sentences):
    _init_skip_thoughts()
    print('---------------- skip_thoughts -------------------')
    vecs = skipthoughts.encode(skip_thoughts_model, sentences,
                               batch_size=len(sentences))
    return vecs


def skip_thoughts_vecs_parallel(sentences, labels):
    # Hypothetical name for the batched, multi-process variant: split the
    # (sentence, label) pairs into batches and encode them in parallel
    # worker processes via to_skip_thoughts_vec.
    batch_size = 512
    data = list()  # batches
    n = len(sentences)
    for i in xrange(0, n, batch_size):
        print('i = %d ---> START = %d, END = %d' % (i, i, i + batch_size))
        data.append((sentences[i:i + batch_size], labels[i:i + batch_size]))
    pool = Pool(20)
    try:
        pdata = pool.map(to_skip_thoughts_vec, data)
        pool.close()
        pool.join()
    except (KeyboardInterrupt, SystemExit):
        pool.close()
        pool.terminate()
        import pdb
        pdb.set_trace()
    # Reassemble the per-batch results into contiguous arrays.
    vecs = np.zeros((n, SKIPTHOUGHTS_DIM))
    _labels = np.zeros((n))
    for i, batch in enumerate(pdata):
        start = i * batch_size
        end = start + batch_size
        print('i = %d ---> START = %d, END = %d' % (i, start, end))
        vecs[start:end] = batch[0]
        _labels[start:end] = batch[1]
    return vecs, _labels
def create_embedding_npy(json_file=''):
    model = st.load_model()
    eyes_color_list = [
        'gray', 'aqua', 'orange', 'red', 'blue', 'black', 'pink', 'green',
        'brown', 'purple', 'yellow'
    ]
    hair_color_list = [
        'gray', 'aqua', 'pink', 'white', 'red', 'purple', 'blue', 'black',
        'green', 'brown', 'orange'
    ]
    fidx2arridx_dict = {}
    jobj = json.load(open(json_file, 'r'))
    tag_strs = []
    count = 0
    # Keep only images tagged with exactly one eye color and one hair color,
    # and build a "<hair> hair <eyes> eyes" tag string for each.
    for fidx, color_d in jobj.items():
        if len(color_d['eyes']) == 1 and len(color_d['hair']) == 1:
            eyes_color = eyes_color_list[color_d['eyes'][0]]
            hair_color = hair_color_list[color_d['hair'][0]]
            tag_str = ' '.join([hair_color, 'hair', eyes_color, 'eyes'])
            tag_strs.append(tag_str)
            fidx2arridx_dict[fidx] = count
            count += 1
    tag_embeddings = st.encode(model, tag_strs)
    print(tag_embeddings.shape)
    print(len(fidx2arridx_dict))
    with open('fidx2arridx.json', 'w') as f:
        json.dump(fidx2arridx_dict, f)
    np.save('tags_embedding.npy', tag_embeddings)
def main():
    caption_file = "captions.txt"
    training_image_file = "train_images4.txt"
    captions = []
    with open(caption_file) as f:
        line_list = f.read().split("\n")
    line_list = line_list[7500:9000]
    f1 = open(training_image_file, "w")
    # Keep image filename / caption pairs aligned: only write an image when
    # its caption is non-empty.
    for i in range(len(line_list)):
        img = line_list[i].split("\t")[0]
        cap = line_list[i].split("\t")[1]
        if len(cap) > 0:
            captions.append(cap)
            f1.write(img + "\n")
    f1.close()
    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)
    h = h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "w")
    h.create_dataset("vectors", data=caption_vectors)
    h.close()
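# Reading the saved caption vectors back, assuming the same HDF5 path and
# dataset name ("vectors") that main() writes above:
import h5py
h = h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "r")
caption_vectors = h["vectors"][:]
h.close()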
def gen_response(self, message):
    return tools.run_sampler(
        self.trmodel,
        skipthoughts.encode(self.stmodel, [message], use_norm=True,
                            verbose=False))[0]
def to_skip_thoughts_vec(data):
    # Pool worker: data is a (sentences, labels) pair; encode the sentences
    # and pass the labels through unchanged.
    vecs = skipthoughts.encode(skip_thoughts_model, data[0], batch_size=512)
    return (vecs, data[1])