Example #1
    def sent2embed(self, model):
        # Encode the matched and mismatched sentence lists into
        # skip-thought embeddings, skipping empty lists.
        match_sent = self.match_sent
        if match_sent:
            self.match_embed = skipthoughts.encode(model, match_sent)

        mismatch_sent = self.mismatch_sent
        if mismatch_sent:
            self.mismatch_embed = skipthoughts.encode(model, mismatch_sent)
Example #2
    def encode(self, inputs):
        # Encode story, query and answer sentences with the skip-thoughts
        # model, then reshape each into a fixed-size batch tensor.
        story = skip.encode(self.skip_model, inputs.story)
        story = np.asarray(story, dtype=np.float32).reshape(
            [self.batch_size, self.nstory, -1])
        query = skip.encode(self.skip_model, inputs.query)
        query = np.asarray(query, dtype=np.float32).reshape(
            [self.batch_size, 1, -1])
        answer = skip.encode(self.skip_model, inputs.answer)
        answer = np.asarray(answer, dtype=np.float32).reshape(
            [self.batch_size, self.nanswer, -1])
        target = np.asarray(inputs.target, dtype=np.int64).reshape(
            [self.batch_size])
        return story, query, answer, target
Example #4
def skipthoughts_articles(articles, max_title_sentences=None,
                          max_article_sentences=None):
    """
    Filter out articles with more than `max_title_sentences` sentences in the
    title or more than `max_article_sentences` sentences in the body.

    Then add the skip-thought vectors for all sentences in the titles and
    bodies of the surviving articles under the `headline_vectors` and
    `article_vectors` keys.
    """
    article_vectors = []
    st_model = st.load_model(data_path=SKIPTHOUGHTS_DATA)
    for article in tqdm(articles, 'skipthoughts encoding articles'):
        title_sentences = nltk.sent_tokenize(article['Headline'])
        if max_title_sentences is not None and \
                len(title_sentences) > max_title_sentences:
            continue
        article_sentences = nltk.sent_tokenize(article['articleBody'])
        if max_article_sentences is not None and \
                len(article_sentences) > max_article_sentences:
            continue
        vectors = st.encode(st_model, title_sentences + article_sentences,
                            verbose=False, batch_size=128).astype('float16')
        N = len(title_sentences)
        article['headline_vectors'] = vectors[:N]
        article['article_vectors'] = vectors[N:]
        article_vectors.append(article)
    return article_vectors
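
A minimal usage sketch for skipthoughts_articles above; the dict keys ('Headline', 'articleBody') come from the function body, while the sample values, the limits, and the 4800-dimensional combined-model vector shape are assumptions:

articles = [{
    'Headline': 'Example headline. It has two sentences.',
    'articleBody': 'First body sentence. Second body sentence.',
}]
encoded = skipthoughts_articles(articles, max_title_sentences=3,
                                max_article_sentences=40)
print(encoded[0]['headline_vectors'].shape)  # e.g. (2, 4800) for the combined model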
Example #5
def get_test_sent(test_file):
    # Read the second CSV column of each line as a test sentence.
    with open(test_file, "r") as f:
        test_sent = [row.split(",")[1] for row in f.read().splitlines()]
    # Encode all test sentences into skip-thought vectors.
    model = skipthoughts.load_model()
    vecs = skipthoughts.encode(model, test_sent)
    return vecs
Example #6
def gen_encodings(model, sources, category):
    """Generate encodings in advance. This can save computation time when
    training multiple times. Not used in this demo.
    """
    result = skipthoughts.encode(model, sources, use_norm=True, verbose=True)
    np.save("data/source_" + category + "_encodings.npy", result)

    return result
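
A sketch of how gen_encodings might be called; it assumes skipthoughts.load_model() is available and a data/ directory exists, and the sentences and category name are illustrative:

model = skipthoughts.load_model()
sources = ["a first training sentence .", "a second training sentence ."]
# Writes data/source_train_encodings.npy and returns the encoded matrix.
encodings = gen_encodings(model, sources, category="train")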
Example #8
def skipthoughts_encode(sentences, model=skipthoughts_model):
    print(time.time(), len(sentences), "encoding")
    vectors = skipthoughts.encode(
        model, sentences, preprocess=lambda x: x,
        use_norm=False, verbose=False
    )
    print(time.time(), vectors.shape, "done")
    # Replace the rows for the sentinel sentences "EOP" and "EOC" with the
    # corresponding fixed marker vectors defined elsewhere in the module.
    for i, sent in enumerate(sentences):
        if sent == "EOP":
            vectors[i, :] = EOP
        elif sent == "EOC":
            vectors[i, :] = EOC
    return vectors
Example #9
def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000):
    """
    Put data into format useable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append([worddict[w] if w in worddict and worddict[w] < n_words
                     else 1 for w in cc.split()])
        feat_list.append(features[i])

    lengths = [len(s) for s in seqs]

    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    feat_list = skipthoughts.encode(model, feat_list, use_eos=False, verbose=False)

    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx,:] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)+1

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx],idx] = s
        x_mask[:lengths[idx]+1,idx] = 1.

    return x, x_mask, y
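
A sketch of calling prepare_data; the caption, the feature sentence, and worddict are illustrative, and model is assumed to be a loaded skip-thoughts model:

caps = ['a man rides a horse']
features = ['a man rides a horse']  # raw sentences, encoded into skip-thought vectors
worddict = {'a': 2, 'man': 3, 'rides': 4, 'horse': 5}
x, x_mask, y = prepare_data(caps, features, worddict, model, maxlen=50)
# x: (max_len, n_samples) int64 token matrix; x_mask: float32 mask with one
# extra step for the end of sequence; y: (n_samples, dim) skip-thought features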
Example #10
def skip_thoughts_vecs(sentences):
    _init_skip_thoughts()

    print('---------------- skip_thoughts -------------------')

    # Encode the whole sentence list in a single batch.
    vecs = skipthoughts.encode(skip_thoughts_model,
                               sentences,
                               batch_size=len(sentences))
    return vecs

Example #11
def create_embedding_npy(json_file=''):

    model = st.load_model()

    eyes_color_list = [
        'gray', 'aqua', 'orange', 'red', 'blue', 'black', 'pink', 'green',
        'brown', 'purple', 'yellow'
    ]
    hair_color_list = [
        'gray', 'aqua', 'pink', 'white', 'red', 'purple', 'blue', 'black',
        'green', 'brown', 'orange'
    ]

    fidx2arridx_dict = {}

    with open(json_file, 'r') as f:
        jobj = json.load(f)

    tag_strs = []
    count = 0
    for fidx, color_d in jobj.items():
        if len(color_d['eyes']) == 1 and len(color_d['hair']) == 1:
            eyes_color = eyes_color_list[color_d['eyes'][0]]
            hair_color = hair_color_list[color_d['hair'][0]]
            tag_str = ' '.join([hair_color, 'hair', eyes_color, 'eyes'])
            tag_strs.append(tag_str)

            fidx2arridx_dict[fidx] = count
            count += 1

    tag_embeddings = st.encode(model, tag_strs)

    print(tag_embeddings.shape)
    print(len(fidx2arridx_dict))

    with open('fidx2arridx.json', 'w') as f:
        json.dump(fidx2arridx_dict, f)

    np.save('tags_embedding.npy', tag_embeddings)
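
A hedged sketch of consuming the two files written above, assuming the json and np imports from the example; the lookup key is hypothetical:

tag_embeddings = np.load('tags_embedding.npy')
with open('fidx2arridx.json') as f:
    fidx2arridx = json.load(f)
# Row of the embedding matrix for a given face index (illustrative key).
row = tag_embeddings[fidx2arridx['1']]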
Example #12
def main():
    caption_file = "captions.txt"
    training_image_file = "train_images4.txt"

    captions = []
    with open(caption_file) as f, open(training_image_file, "w") as f1:
        # Each line is "<image>\t<caption>"; use lines 7500-9000 only.
        line_list = f.read().split("\n")[7500:9000]
        for line in line_list:
            parts = line.split("\t")
            # Keep only rows that actually carry a non-empty caption.
            if len(parts) > 1 and len(parts[1]) > 0:
                captions.append(parts[1])
                f1.write(parts[0] + "\n")

    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)

    with h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "w") as h:
        h.create_dataset("vectors", data=caption_vectors)
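
For completeness, a sketch of reading the saved vectors back, using the same path and dataset name as above:

with h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5", "r") as h:
    caption_vectors = h["vectors"][:]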
Example #13
    def gen_response(self, message):
        # Encode the incoming message with the skip-thoughts model, decode a
        # reply with the trained sampler, and return the first sample.
        return tools.run_sampler(
            self.trmodel,
            skipthoughts.encode(self.stmodel, [message],
                                use_norm=True,
                                verbose=False))[0]
Example #15
def to_skip_thoughts_vec(data):
    # data is a (sentences, labels) pair; encode the sentences and pass the
    # labels through unchanged.
    vecs = skipthoughts.encode(skip_thoughts_model, data[0], batch_size=512)
    return (vecs, data[1])