def skipthoughts_articles(articles, max_title_sentences=None,
                          max_article_sentences=None):
    """
    Encode the headline and body sentences of each article with skipthoughts.

    Articles whose headline splits into more than `max_title_sentences`
    sentences, or whose body splits into more than `max_article_sentences`
    sentences, are dropped; a limit of `None` disables that check.

    Every surviving article dict is modified in place: the float16 vectors of
    its headline sentences are stored under `headline_vectors` and those of
    its body sentences under `article_vectors`.  The surviving articles are
    returned as a list, in their original order.
    """
    def exceeds(sentences, limit):
        # A limit of None means "no limit".
        return limit is not None and len(sentences) > limit

    encoder_model = st.load_model(data_path=SKIPTHOUGHTS_DATA)
    kept = []
    for article in tqdm(articles, 'skipthoughts encoding articles'):
        headline = nltk.sent_tokenize(article['Headline'])
        if exceeds(headline, max_title_sentences):
            continue
        body = nltk.sent_tokenize(article['articleBody'])
        if exceeds(body, max_article_sentences):
            continue
        # Encode headline and body in one call, then split the result back
        # apart at the headline/body boundary.
        encoded = st.encode(encoder_model, headline + body,
                            verbose=False, batch_size=128)
        encoded = encoded.astype('float16')
        boundary = len(headline)
        article['headline_vectors'] = encoded[:boundary]
        article['article_vectors'] = encoded[boundary:]
        kept.append(article)
    return kept
Exemple #2
0
    def __init__(self):
        """
        Load the two models this object holds.

        `self.trmodel` is loaded by `tools.load_model` from the hard-coded
        relative paths "data/trainer.npz" and "data/dictionary_fry.pkl";
        `self.stmodel` is loaded by `skipthoughts.load_model()`.  A status
        line is printed after each load.
        """
        self.trmodel = tools.load_model("data/trainer.npz",
                                        "data/dictionary_fry.pkl")
        # Parenthesised single-argument print behaves the same under
        # Python 2 and Python 3, unlike the `print "..."` statement form.
        print("===== Loaded Trained Model =====")
        self.stmodel = skipthoughts.load_model()
        print("===== Loaded Skipthoughts Model =====")
Exemple #3
0
def skipthought_encode(answers):
    """
    Encode every sentence of every answer with skipthoughts.

    `answers` is a sequence of sentence sequences.  All sentences are
    flattened into one list, encoded with a single `encoder.encode` call, and
    the result is sliced back into one entry per answer, in input order.
    """
    answer_count = len(answers)

    # offsets[i] is the position of the first sentence of answers[i] in the
    # flattened list; the final entry is the total number of sentences.
    offsets = [0]
    for answer in answers:
        offsets.append(offsets[-1] + len(answer))

    flat_sentences = []
    for answer in answers:
        flat_sentences.extend(answer)

    print('Loading pre-trained models...')
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    print('Encoding sentences...')
    encoded = encoder.encode(flat_sentences, verbose=False)

    return [encoded[offsets[i]:offsets[i + 1]] for i in range(answer_count)]
Exemple #4
0
def get_test_sent(test_file):
    """
    Return skipthoughts vectors for the sentences listed in `test_file`.

    The file is read as a whole and split into lines; each line is split on
    "," and the field at index 1 is taken as the sentence to encode.
    """
    with open(test_file, "r") as handle:
        lines = handle.read().splitlines()
    sentences = [line.split(",")[1] for line in lines]
    st_model = skipthoughts.load_model()
    return skipthoughts.encode(st_model, sentences)
Exemple #5
0
def _init_skip_thoughts():
    """
    Import the skipthoughts module and load its model on first use.

    Does nothing when the module-level `skip_thoughts_model` is already
    truthy; otherwise binds the `skipthoughts` module and the freshly loaded
    model to module-level globals.
    """
    global skip_thoughts_model, skipthoughts

    if not skip_thoughts_model:
        # The import happens here rather than at module import time.
        from skipthoughts import skipthoughts
        skip_thoughts_model = skipthoughts.load_model()
Exemple #6
0
def _attach_first_mismatch(target, candidates):
    """Give `target` the first candidate sentence it does not already match."""
    for other in candidates:
        for sent in other.match_sent:
            if sent not in target.match_sent:
                target.wimg.append(other.img)
                target.mismatch_sent.append(sent)
                return


def build_imgs():
    """
    Build the image/caption training objects and pickle them to disk.

    Reads './data/tags_clean.csv' (tab-delimited), loads each referenced
    image from './data/faces/<id>.jpg' and resizes it to 64x64.  Caption
    sentences are built from every pairing of a "<color> hair" tag with a
    "<color> eyes" tag, in random word order.  Rows that yield at least one
    caption become `realimg` objects.

    Each object then receives at most one mismatching image/sentence taken
    from the other objects, has `sent2embed` called with the skipthoughts
    model, and the full list is pickled to './train_data/img_objs_new.pk'.
    """
    colors = [
        "red", "orange", "yellow", "green", "blue", "purple", "blonde",
        "pink", "black", "white", "brown"
    ]
    img_objs = []
    with open('./data/tags_clean.csv', 'r') as tag_file:
        tag_reader = csv.reader(tag_file, delimiter='\t')
        print("generate captions from training tags.....")
        for row in tag_reader:
            # The first tab-separated cell holds "<img_id>,<first tag>".
            img_id = row[0].split(',')[0]
            tag_row = [row[0].split(',')[1]] + row[1:]
            img = skimage.io.imread('./data/faces/{}.jpg'.format(int(img_id)))
            img = skimage.transform.resize(img, (64, 64))
            tag_hair = []
            tag_eyes = []
            for tag in tag_row:
                # Keep only the text before the first ':' of each tag.
                tag = tag.split(':')[0]
                for color in colors:
                    if "{} hair".format(color) in tag:
                        tag_hair.append(tag)
                    if "{} eyes".format(color) in tag:
                        tag_eyes.append(tag)
            match_sent = []
            for t_h in tag_hair:
                for t_e in tag_eyes:
                    # Randomise which of the two tags comes first.
                    if random.random() > 0.5:
                        match_sent.append('{} {}'.format(t_h, t_e))
                    else:
                        match_sent.append('{} {}'.format(t_e, t_h))
            if match_sent:
                img_objs.append(realimg(img, match_sent))

    model = skipthoughts.load_model()
    # The candidate pool is identical for every object (everything except
    # the first entry), so slice it once instead of once per iteration.
    candidates = img_objs[1:]
    total = len(img_objs)
    for idx, img_obj in enumerate(img_objs):
        _attach_first_mismatch(img_obj, candidates)
        img_obj.sent2embed(model)
        print("{}/{}".format(idx, total))

    # Pickle output is binary data: "wb" is required on Python 3 and is
    # also safe on Python 2.
    with open("./train_data/img_objs_new.pk", "wb") as f:
        pk.dump(img_objs, f)
def init():
    """
    Load the skipthoughts model and build the module-level encoder.

    The resulting `skipthoughts.Encoder` is stored in the global `_encoder`.
    Loading the model may take several minutes.  Call this before any other
    function in this module.
    """
    global _encoder
    _encoder = skipthoughts.Encoder(skipthoughts.load_model())
Exemple #8
0
  def build_model(self):
    """
    Build the graph pieces needed for training and initialise variables.

    Calls `self.build_memory()`, loads the skip-thoughts model into
    `self.skip_model`, defines `self.loss` and `self.opt`, creates
    `self.apply_grad_op`, and runs the variable initializer.

    NOTE(review): `self.logits`, `self.target`, `self.lr` and
    `self.global_step` are read here but not set in this method; presumably
    they are created by `build_memory()` or the constructor - confirm.
    """
    self.build_memory()
    self.skip_model = skip.load_model()

    # Pass logits/labels by keyword: TensorFlow >= 1.0 rejects positional
    # arguments for this op, while the keyword names are accepted by both
    # the older (logits, labels) and newer (labels, logits) signatures.
    self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.logits, labels=self.target)
    self.opt = tf.train.GradientDescentOptimizer(self.lr)

    grads = self.opt.compute_gradients(self.loss)
    inc_op = self.global_step.assign_add(1)
    # The apply-gradients op is created under a control dependency on the
    # global-step increment.
    with tf.control_dependencies([inc_op]):
      self.apply_grad_op = self.opt.apply_gradients(grads)

    tf.initialize_all_variables().run()
Exemple #9
0
    def build_model(self):
        """
        Build the graph pieces needed for training and initialise variables.

        Calls `self.build_memory()`, loads the skip-thoughts model into
        `self.skip_model`, defines `self.loss` and `self.opt`, creates
        `self.apply_grad_op`, and runs the variable initializer.

        NOTE(review): `self.logits`, `self.target`, `self.lr` and
        `self.global_step` are read here but not set in this method;
        presumably they are created by `build_memory()` or the constructor -
        confirm.
        """
        self.build_memory()
        self.skip_model = skip.load_model()

        # Pass logits/labels by keyword: TensorFlow >= 1.0 rejects positional
        # arguments for this op, while the keyword names are accepted by both
        # the older (logits, labels) and newer (labels, logits) signatures.
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.target)
        self.opt = tf.train.GradientDescentOptimizer(self.lr)

        grads = self.opt.compute_gradients(self.loss)
        inc_op = self.global_step.assign_add(1)
        # The apply-gradients op is created under a control dependency on
        # the global-step increment.
        with tf.control_dependencies([inc_op]):
            self.apply_grad_op = self.opt.apply_gradients(grads)

        tf.initialize_all_variables().run()
def load_data():
    """
    Build the list of samples described by "transcriptVoiceMap.xlsx".

    The first sheet of the workbook is read into `mapping`.  For every row:

    * the "fst transcript" and "snd transcript" cells are encoded with the
      skipthoughts encoder and combined as
      ``concat(|fst - snd|, fst * snd)``;
    * the "fst wav name" and "snd wav name" cells are turned into emotion
      activations (via `get_emotions_vec` and `get_activations` with the
      pickled model from "emotions_model.sav") and concatenated;
    * the speech-frame feature is
      ``count_frames_with_speech(snd) - count_frames_with_speech(fst)``;
    * the "label" cell is one-hot encoded over `categories` classes.

    Returns:
        A list with one tuple per row:
        ``(transcript_vec, [output], emotions_vec, [frames_with_speech],
        label)``.
    """
    mapping = pd.ExcelFile("transcriptVoiceMap.xlsx").parse(0)

    # load skipthoughts encoder
    encoder = st.Encoder(st.load_model())
    all_fst_transcript_vectors = encoder.encode(
        [str(x) for x in mapping["fst transcript"]])
    all_snd_transcript_vectors = encoder.encode(
        [str(x) for x in mapping["snd transcript"]])

    # load voice emotions model
    # NOTE(review): unpickling can execute arbitrary code; only load
    # "emotions_model.sav" from a trusted source.
    with open("emotions_model.sav", 'rb') as model_file:
        emotions_model = pickle.load(model_file)
    all_fst_wav_vectors = get_activations(
        emotions_model,
        np.array([get_emotions_vec(x) for x in mapping["fst wav name"]]))
    all_snd_wav_vectors = get_activations(
        emotions_model,
        np.array([get_emotions_vec(x) for x in mapping["snd wav name"]]))

    ret_data = []
    for idx in range(len(mapping)):
        fst_transcript_vec = all_fst_transcript_vectors[idx]
        snd_transcript_vec = all_snd_transcript_vectors[idx]
        transcript_vec = np.concatenate(
            (np.abs(fst_transcript_vec - snd_transcript_vec),
             fst_transcript_vec * snd_transcript_vec))

        # load agent execution flag (0=executed, 1=not executed):
        output = mapping["output"][idx]

        emotions_vec = np.concatenate(
            (all_fst_wav_vectors[idx], all_snd_wav_vectors[idx]))

        # Difference in speech frames between the second and first
        # recording; the first recording is counted first.
        fst_frames = count_frames_with_speech(mapping["fst wav name"][idx])
        snd_frames = count_frames_with_speech(mapping["snd wav name"][idx])
        frames_with_speech = snd_frames - fst_frames

        # One-hot encode the label.
        label = np.zeros(categories)
        label[mapping["label"][idx]] = 1

        ret_data.append((transcript_vec, [output], emotions_vec,
                         [frames_with_speech], label))

    return ret_data
Exemple #11
0
def create_embedding_npy(json_file=''):
    """
    Encode hair/eye colour tags with skipthoughts and save them to disk.

    `json_file` is parsed as a JSON object mapping a file index to a dict
    with 'eyes' and 'hair' lists of colour indices.  Only entries with
    exactly one eye colour and exactly one hair colour are used; each becomes
    the sentence "<hair> hair <eyes> eyes".

    Writes two files into the current directory:

    * 'fidx2arridx.json' - maps each used file index to the position of its
      sentence in the encoded list;
    * 'tags_embedding.npy' - the array returned by `st.encode`.
    """
    model = st.load_model()

    eyes_color_list = [
        'gray', 'aqua', 'orange', 'red', 'blue', 'black', 'pink', 'green',
        'brown', 'purple', 'yellow'
    ]
    hair_color_list = [
        'gray', 'aqua', 'pink', 'white', 'red', 'purple', 'blue', 'black',
        'green', 'brown', 'orange'
    ]

    # Use a context manager so the JSON file handle is closed promptly.
    with open(json_file, 'r') as f:
        jobj = json.load(f)

    fidx2arridx_dict = {}
    tag_strs = []
    for fidx, color_d in jobj.items():
        if len(color_d['eyes']) == 1 and len(color_d['hair']) == 1:
            eyes_color = eyes_color_list[color_d['eyes'][0]]
            hair_color = hair_color_list[color_d['hair'][0]]
            # Record where this sentence will sit in tag_strs before
            # appending it.
            fidx2arridx_dict[fidx] = len(tag_strs)
            tag_strs.append(
                ' '.join([hair_color, 'hair', eyes_color, 'eyes']))

    tag_embeddings = st.encode(model, tag_strs)

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(tag_embeddings.shape)
    print(len(fidx2arridx_dict))

    with open('fidx2arridx.json', 'w') as f:
        json.dump(fidx2arridx_dict, f)

    np.save('tags_embedding.npy', tag_embeddings)
Exemple #12
0
def main():
    """
    Encode a slice of the captions file and store the vectors as HDF5.

    Reads "captions.txt", keeps lines 7500-8999 (0-based), and treats each
    line as "<image>\\t<caption>".  For every line with a non-empty caption
    the image name is written to "train_images4.txt" and the caption is
    collected.  The collected captions are encoded with skipthoughts and
    saved as the "vectors" dataset of
    "/content/drive/MyDrive/train_caption_vectors4.hdf5".
    """
    caption_file = "captions.txt"
    training_image_file = "train_images4.txt"

    with open(caption_file) as f:
        line_list = f.read().split("\n")[7500:9000]

    captions = []
    # Context manager so the image list is flushed and closed even if a
    # line fails to process.
    with open(training_image_file, "w") as image_list:
        for line in line_list:
            fields = line.split("\t")
            if len(fields) < 2:
                # No tab on this line (e.g. a blank line), so there is no
                # caption to record; skip it instead of raising IndexError.
                continue
            img, cap = fields[0], fields[1]
            if cap:
                captions.append(cap)
                image_list.write(img + "\n")

    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)

    with h5py.File("/content/drive/MyDrive/train_caption_vectors4.hdf5",
                   "w") as h:
        h.create_dataset("vectors", data=caption_vectors)
Exemple #13
0
def main():
    """
    Report word-dictionary and out-of-vocabulary statistics for quora data.

    Command-line arguments:
        --quora-data-dir  directory containing 'train.csv' and 'test.csv'
                          (required)
        --st-model-dir    directory containing the skipthoughts model
                          (required)
        --output-dir      directory to write to (default '.')
        -v / --verbose    parsed, but not read anywhere in this function

    Output goes through a `log.FileWriterStdoutPrinter` opened on
    '<output-dir>/oov_stats.txt'.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on the quora question-pair dataset.')
    parser.add_argument('--quora-data-dir',
                        required=True,
                        help='path to the directory containing the quora data')
    parser.add_argument(
        '--st-model-dir',
        required=True,
        help='path to the directory containing the skipthoughts model')
    parser.add_argument('--output-dir',
                        default='.',
                        help='path to the directory to write to')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    output_file = os.path.join(args.output_dir, 'oov_stats.txt')

    # Parenthesised single-argument print works on Python 2 and Python 3,
    # unlike the `print "..."` statement form.
    with log.FileWriterStdoutPrinter(output_file) as writer:
        print("Loading skipthoughts model...")
        model = st.load_model(args.st_model_dir)
        print("Initializing skipthoughts word dict...")
        word_dict = st.init_word_dict(model)
        print("Analyzing word dict...")
        analyze_dict(word_dict, writer)

        print("Loading training set...")
        train = du.load_csv(os.path.join(args.quora_data_dir, 'train.csv'))
        writer.emit_line("Analyzing word counts in train.csv...")
        analyze_oov(word_dict, train, writer, args.output_dir, 'train')

        # Be sure to write data to disk for train before moving on to test,
        # which is much bigger

        print("Loading test set...")
        test = du.load_csv(os.path.join(args.quora_data_dir, 'test.csv'))
        writer.emit_line("Analyzing word counts in test.csv...")
        analyze_oov(word_dict, test, writer, args.output_dir, 'test')
Exemple #14
0
def load_encoder(model_dir):
    """Load the skipthoughts model from `model_dir` and return an Encoder for it."""
    return st.Encoder(st.load_model(model_dir))
Exemple #15
0
def gen_model():
    """Return the model produced by `skipthoughts.load_model()`, for use in encoding."""
    return skipthoughts.load_model()
Exemple #16
0
def gen_model():
	"""Return the model produced by `skipthoughts.load_model()`, for use in encoding."""
	return skipthoughts.load_model()
Exemple #17
0
from __future__ import print_function
from gutenburg import Bookshelf
from skipthoughts import skipthoughts
import numpy as np
from unidecode import unidecode
import progressbar as PB
import os
import time
import string
from nltk import sent_tokenize

# Load the skipthoughts model once, at import time; it is the default value
# of the `model` parameter of skipthoughts_encode().
# NOTE(review): the data path is hard-coded to an absolute directory under
# one user's home; this will fail on any machine without that path.
skipthoughts_model = skipthoughts.load_model(
    data_path="/home/micha/work/tldr/skipthoughts/data/"
)

# Constant vectors of length 4800: all +1 (EOP) and all -1 (EOC).
# NOTE(review): presumably end-of-paragraph / end-of-chapter markers sized to
# match the skipthoughts vector width - TODO confirm.
EOP = np.ones(4800)
EOC = -1 * np.ones(4800)

def null(*args, **kwargs):
    """Accept any arguments, do nothing, and return None."""
    return None


# Rebinding the module-level name `print` to the no-op makes every later
# print(...) call in this module silent.
print = null

def skipthoughts_encode(sentences, model=skipthoughts_model):
    vectors = None
    print(time.time(), len(sentences), "encoding")
    vectors = skipthoughts.encode(
        model, sentences, preprocess=lambda x: x,
        use_norm=False, verbose=False
    )
    print(time.time(), vectors.shape, "done")
    for i, sent in enumerate(sentences):
Exemple #18
0
	def __init__(self):
		"""
		Load the two models this object holds.

		`self.trmodel` is loaded by `tools.load_model` from the hard-coded
		relative paths "data/trainer.npz" and "data/dictionary_fry.pkl";
		`self.stmodel` is loaded by `skipthoughts.load_model()`.  A status
		line is printed after each load.
		"""
		self.trmodel = tools.load_model("data/trainer.npz", "data/dictionary_fry.pkl")
		# Parenthesised single-argument print behaves the same under
		# Python 2 and Python 3, unlike the `print "..."` statement form.
		print("===== Loaded Trained Model =====")
		self.stmodel = skipthoughts.load_model()
		print("===== Loaded Skipthoughts Model =====")
Exemple #19
0
import skipthoughts.skipthoughts as skipthoughts
# The skipthoughts model is loaded and wrapped in an Encoder here, at import
# time and ahead of the remaining imports (including keras and tensorflow).
# NOTE(review): the ordering may be deliberate - confirm before moving these
# two lines below the imports.
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
import skipthoughts.eval_sick2 as eval_sick2
from keras.models import load_model
import gensim
import os
import collections
import smart_open
import random
import time
import numpy as np

import sys

# limit gpu usage
# The session handed to Keras is configured with
# per_process_gpu_memory_fraction = 0.3.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

# Command-line arguments: argv[1] is a path prefix for the input files and
# argv[2] is parsed as an integer into `top`.
prefix = sys.argv[1]

# Input file names derived from the prefix (presumably question and answer
# files, judging by the `_q` / `_a` suffixes - confirm).
q_file = prefix + '_q.txt.flat.filter'
a_file = prefix + '_a.txt.flat.filter'
#q_speaker = prefix+'_q_speaker.txt'
#a_speaker = prefix+'_a_speaker.txt'
#speaker = sys.argv[2]
top = int(sys.argv[2])
#q_para_file = prefix+'_q_paragraph.txt'