Example 1
def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False):
    """
    Run experiment
    k: number of CV folds
    evalcv: whether to run cross-validation
    evaltest: whether to evaluate on the test set
    """
    print('Preparing data...')
    traintext, testtext = load_data()
    train, train_labels = prepare_data(traintext)
    test, test_labels = prepare_data(testtext)
    train_labels = prepare_labels(train_labels)
    test_labels = prepare_labels(test_labels)
    train, train_labels = shuffle(train, train_labels, random_state=seed)

    print('Computing training skipthoughts...')
    trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False)

    if evalcv:
        print('Running cross-validation...')
        interval = [2**t for t in range(0, 9, 1)]  # coarse-grained
        C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed)

    if evaltest:
        if not evalcv:
            C = 128  # Best parameter found from CV

        print('Computing testing skipthoughts...')
        testF = skipthoughts.encode(model, test, verbose=False, use_eos=False)

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(trainF, train_labels)
        yhat = clf.predict(testF)
        print('Test accuracy: ' + str(clf.score(testF, test_labels)))
Example 3
def embd_sent(self, sent):
    import skipthoughts
    if self.type == 'uni':
        return skipthoughts.encode(self.model, [sent])[0][0:2400]
    elif self.type == 'bi':
        return skipthoughts.encode(self.model, [sent])[0][2400:]
    else:
        return skipthoughts.encode(self.model, [sent])[0]
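The slicing above assumes the combine-skip encoder, whose 4800-d vectors concatenate a 2400-d uni-skip half and a 2400-d bi-skip half. A minimal sketch of that layout (loading and encoding as in the other examples):

import skipthoughts

model = skipthoughts.load_model()
vecs = skipthoughts.encode(model, ['A sentence.', 'Another one.'])
assert vecs.shape == (2, 4800)   # combine-skip dimensionality
uni = vecs[:, :2400]             # uni-skip half
bi = vecs[:, 2400:]              # bi-skip half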
Example 4
def embd_multiple_sents(self, sents):
    import skipthoughts
    if self.type == 'combined':
        return skipthoughts.encode(self.model, sents)
    elif self.type == 'uni':
        ix_from = 0
        ix_to = 2400
    else:
        ix_from = 2400
        ix_to = 4800
    return np.asarray([v[ix_from:ix_to] for v in skipthoughts.encode(self.model, sents)])
Example 5
def evaluate(model,
             k=10,
             seed=1234,
             evalcv=True,
             evaltest=False,
             use_feats=True):
    """
    Run experiment
    k: number of CV folds
    evalcv: whether to run cross-validation
    evaltest: whether to evaluate on the test set
    """
    print('Preparing data...')
    traintext, testtext, labels = load_data()

    print('Computing training skipthoughts...')
    trainA = skipthoughts.encode(model, traintext[0], verbose=False)
    trainB = skipthoughts.encode(model, traintext[1], verbose=False)

    if evalcv:
        print('Running cross-validation...')
        C = eval_kfold(trainA,
                       trainB,
                       traintext,
                       labels[0],
                       shuffle=True,
                       k=10,
                       seed=1234,
                       use_feats=use_feats)

    if evaltest:
        if not evalcv:
            C = 4  # Best parameter found from CV (combine-skip with use_feats=True)

        print('Computing testing skipthoughts...')
        testA = skipthoughts.encode(model, testtext[0], verbose=False)
        testB = skipthoughts.encode(model, testtext[1], verbose=False)

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                                   feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB,
                                  feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        print('Test accuracy: ' + str(clf.score(test_features, labels[1])))
        print('Test F1: ' + str(f1(labels[1], yhat)))
Example 6
def generate_story_loss(z, image_loc, k=100, bw=50, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None, :])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]

    print('NEAREST-CAPTIONS: ')
    for s in sentences[:5]:
        print(s)
    print('')

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    # shift = svecs.mean(0) - z['bneg'] + z['bpos']

    return svecs.mean(0), z['bneg'], z['bpos']
Example 7
def predict():
    queries = request.get_json(silent=True, force=True)['input']
    # query = "This is a red flower with yellow stamen."
    encoded = Variable(torch.Tensor(skipthoughts.encode(model, queries)))
    if torch.cuda.is_available():
        encoded = encoded.cuda()
    image_paths = []

    for batch_i in range(BATCH_SIZE):
        noise_vec = Variable(torch.randn(len(queries), 100, 1, 1))
        if torch.cuda.is_available():
            noise_vec = noise_vec.cuda()

        gen_images = generator.forward(encoded, noise_vec)
        gen_images = gen_images.cpu()

        for i, img in enumerate(gen_images):
            curr = img.data.numpy()
            curr = np.swapaxes(curr, 0, 1)
            curr = np.swapaxes(curr, 1, 2)
            path = 'Data/samples/' + str(batch_i) + '_' + str(i) + '.png'
            scipy.misc.imsave(path, curr)
            image_paths.append(path)

    return jsonify({'images': image_paths})
Example 8
def encode_and_save(image_captions, image_classes, data_dir: str,
                    dataset: str):
    model = skipthoughts.load_model()
    encoded_captions = {}
    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        if i % 20 == 0:
            print(i, len(image_captions), img)
            print("Seconds", time.time() - st)

    img_ids = list(image_captions.keys())

    random.shuffle(img_ids)
    n_train_instances = int(len(img_ids) * 0.9)
    tr_image_ids = img_ids[0:n_train_instances]
    val_image_ids = img_ids[n_train_instances:]  # remaining ~10% of ids for validation

    pickle.dump(
        image_captions,
        open(os.path.join(data_dir, dataset, dataset + '_caps.pkl'), "wb"))

    pickle.dump(tr_image_ids,
                open(os.path.join(data_dir, dataset, 'train_ids.pkl'), "wb"))
    pickle.dump(val_image_ids,
                open(os.path.join(data_dir, dataset, 'val_ids.pkl'), "wb"))

    ec_pkl_path = join(data_dir, dataset, dataset + '_tv.pkl')
    pickle.dump(encoded_captions, open(ec_pkl_path, "wb"))

    fc_pkl_path = join(data_dir, dataset, dataset + '_tc.pkl')
    pickle.dump(image_classes, open(fc_pkl_path, "wb"))
Example 9
def extract_sentence_vectors(model):

    sentences = []
    selected_jsons = []
    ifile = open('op.json', 'r')

    for idx, line in enumerate(ifile):
        jj = json.loads(line)

        s1 = jj[0]
        s2 = jj[1]

        sentences.append(s1['sent'])
        sentences.append(s2['sent'])

    vectors = skipthoughts.encode(model, sentences)
    # integer division so the reshape target is an int (one row per sentence pair)
    sent_pair_vectors = numpy.reshape(vectors, (len(vectors) // 2, len(vectors[0]) * 2))

    print(sent_pair_vectors.shape)
    numpy.save("toy_pair_vectors.npy", sent_pair_vectors)
Example 10
	def encode(self, sentences, verbose=False):
		self.sentences = sentences
		if self.loaded_custom_model:
			self.vectors = penseur_utils.encode(self.model, sentences, verbose)
		else:
			self.vectors = skipthoughts.encode(self.model, sentences, verbose)
		return self.vectors
Example 11
def save_caption_vectors_shapes(data_dir):
    import time

    img_dir = join(data_dir, 'shapes/images')
    image_files = [f for f in os.listdir(img_dir) if 'png' in f]
    print(image_files[300:400])
    print(len(image_files))
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'shapes/texts')
    caption_files = [f for f in os.listdir(caption_dir) if 'txt' in f]
    for cap_file in caption_files:
        with open(join(caption_dir, cap_file)) as f:
            captions = f.read().split('\n')
        img_file = cap_file[0:5] + ".png"
        # 5 captions per image
        image_captions[img_file] += [cap for cap in captions if len(cap) > 0][0:5]

    print(len(image_captions))

    model = skipthoughts.load_model()
    encoded_captions = {}

    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print(i, len(image_captions), img)
        print("Seconds", time.time() - st)

    h = h5py.File(join(data_dir, 'shapes_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
Example 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file',
                        type=str,
                        default='Data/sample_captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data Directory')

    args = parser.parse_args()
    with open(args.caption_file) as f:
        captions = f.read().split('\n')

    captions = [cap for cap in captions if len(cap) > 0]
    print(captions)
    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)

    if os.path.isfile(join(args.data_dir, 'sample_caption_vectors.hdf5')):
        os.remove(join(args.data_dir, 'sample_caption_vectors.hdf5'))
    h = h5py.File(join(args.data_dir, 'sample_caption_vectors.hdf5'))
    h.create_dataset('vectors', data=caption_vectors)
    h.close()
Example 13
def extract_sentence_vectors(model):

    sentences = []
    selected_jsons = []
    ifile = open('../../snli_1.0/snli_1.0_train.jsonl', 'r')

    for idx, line in enumerate(ifile):
        #	print(idx)
        dropout_chance = random.random()
        if dropout_chance < 0.1:
            jj = json.loads(line)
            sentences.append(jj['sentence1'])
            sentences.append(jj['sentence2'])
            selected_jsons.append(jj)

    print(len(sentences), len(selected_jsons))

    #sys.exit()

    vectors = skipthoughts.encode(model, sentences)
    sent_pair_vectors = numpy.reshape(vectors,
                                      (len(vectors) // 2, len(vectors[0]) * 2))

    print(sent_pair_vectors.shape)
    numpy.save("sentence_pair_vectors.npy", sent_pair_vectors)
    with open('selected_sentence_pairs.json', 'w') as f:
        for jsons in selected_jsons:
            f.write(json.dumps(jsons))
            f.write('\n')
Example 14
def batcher(params, batch):
    batch = [' '.join(sent) if sent != [] else '.' for sent in batch]
    embeddings = skipthoughts.encode(params['encoder'],
                                     batch,
                                     verbose=False,
                                     use_eos=True)
    return embeddings
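This batcher resembles a SentEval-style callback: params carries the loaded encoder and batch is a list of tokenized sentences. A hedged driver sketch (the 'encoder' key is what the function above reads):

import skipthoughts

params = {'encoder': skipthoughts.load_model()}
emb = batcher(params, [['a', 'red', 'flower'], []])  # an empty sentence falls back to '.'
print(emb.shape)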
Example 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file',
                        type=str,
                        default='Data/captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data Directory')

    args = parser.parse_args()
    with open(args.caption_file) as f:
        captions = f.read().split('\n')

    # captions : Text description of pictures stored in file sample_captions.txt
    captions = [cap for cap in captions if len(cap) > 0]
    print(captions)

    # create skipthoughts vectors
    model = skipthoughts.load_model()
    print('Creation of skipthought vectors : loading ....')
    caption_vectors = skipthoughts.encode(model, captions)
    print('Creation of skipthought vectors : DONE !')
    #print(caption_vectors)
    #print(np.shape(caption_vectors))

    # create tensor vectors with skipthought vectors as input
    print('Save skipthought vector : loading ....')
    np.save('skipvectors_2000.npy', caption_vectors)
    print('Save skipthought vector : DONE !')
Example 16
def prepare_data(query, imdb_key):
  query_representation = skip.encode(skip_model, [query])
  candidate_qa = [QAInfo for QAInfo in qa if QAInfo.imdb_key == imdb_key]
  skip_encode = list()
  for QAInfo in candidate_qa:
    try: skip_encode.append(qa_representation[QAInfo.qid])
    except KeyError: pass  # skip QAs without a precomputed encoding
  similarity = [(np.inner(query_representation, rep)[0][0], i) for i, rep in enumerate(skip_encode)]
  similarity.sort(reverse=True)
  most_similar = [candidate_qa[i] for score, i in similarity[:1]]

  retrieved_question = most_similar[0].question
  retrieved_answer = most_similar[0].answers
  retrieved_story = story[imdb_key]

  q_embed = np.array(gensim_w2v.encode_w2v_gensim(retrieved_question))
  a_embed = np.array([gensim_w2v.encode_w2v_gensim(a) for a in retrieved_answer])
  s_embed = np.zeros((1, 60, 300))
  s_embed[:,:len(retrieved_story)] = \
      np.reshape(np.array([gensim_w2v.encode_w2v_gensim(s) for s in retrieved_story]),\
                 (1,len(retrieved_story), 300))

  s_embed = np.reshape(s_embed, (1, 60, 300))
  q_embed = np.reshape(q_embed, (1, 1, 300))
  a_embed = np.reshape(a_embed, (1, 5, 300))
  return most_similar[0], s_embed, q_embed, a_embed
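The retrieval step above ranks stored QA encodings by inner product with the (1, 4800) query encoding, hence the [0][0] indexing. An equivalent view with the encodings stacked into one matrix (a sketch using random stand-ins):

import numpy as np

query = np.random.randn(1, 4800)
bank = np.random.randn(10, 4800)   # stacked candidate encodings
scores = bank @ query.ravel()      # one inner product per candidate
best = int(np.argmax(scores))      # index of the most similar candidate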
Example 17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file',
                        type=str,
                        default='Data/text.txt',
                        help='caption file')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data Directory')

    args = parser.parse_args()

    model = skipthoughts.load_model()
    encoded_captions = {}
    file_path = os.path.join(args.caption_file)
    dump_path = os.path.join(args.data_dir, 'enc_text.pkl')
    with open(file_path) as f:
        str_captions = f.read()
        captions = str_captions.split('\n')
        print(captions)
        encoded_captions['features'] = skipthoughts.encode(model, captions)

    pickle.dump(encoded_captions, open(dump_path, "wb"))
    print('Finished extracting Skip-Thought vectors of the given text '
          'descriptions')
Example 18
def save_caption_vectors(all_captions, target_dir, split, experiment):
    h = h5py.File(
        os.path.join(target_dir, experiment, '{}_captions.hdf5'.format(split)))
    model = skipthoughts.load_model()

    for class_name, image_captions in all_captions.items():
        print("number of images: ", len(image_captions))

        img_batches = [[] for i in range(NUM_BATCHES)]
        caption_batches = [[] for i in range(NUM_BATCHES)]
        counter = 0
        for img, captions in image_captions.items():
            counter = counter % NUM_BATCHES
            img_batches[counter].append(img)
            caption_batches[counter] += captions
            counter += 1
        print("batched for {}".format(class_name))

        group = h.create_group(class_name)
        for i in range(NUM_BATCHES):
            imgs = img_batches[i]
            captions = caption_batches[i]
            encoded_captions = skipthoughts.encode(model, captions)

            cstart = 0
            for img in imgs:
                num_caps = len(image_captions[img])
                print(cstart, num_caps, len(encoded_captions))
                group.create_dataset(img,
                                     data=encoded_captions[cstart:cstart +
                                                           num_caps])
                cstart += num_caps

            print("Batch {} of {} Done".format(i + 1, NUM_BATCHES))
    h.close()
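Because skipthoughts.encode takes one flat list, the function above concatenates captions from many images into a batch and then slices the encoded matrix back out per image with a running offset. The slicing invariant in isolation (random stand-ins for the encodings):

import numpy as np

counts = [5, 5, 3]                       # captions per image in one batch
enc = np.random.randn(sum(counts), 4800)
cstart = 0
for n in counts:
    per_image = enc[cstart:cstart + n]   # the rows belonging to one image
    cstart += n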
Example 19
def prepare_data(caps,
                 features,
                 worddict,
                 model,
                 d,
                 maxlen=None,
                 n_words=10000):
    """
    Put data into format useable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append(
            [worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
        feat_list.append(features[i])

    lengths = [len(s) for s in seqs]
    print('building seqs')
    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    print('encoding skipthoughts')
    feat_list = skipthoughts.encode(model,
                                    feat_list,
                                    d,
                                    use_eos=False,
                                    verbose=False)
    print('finished skipthoughts encoding')
    print('feature list size %d, %d' % (len(feat_list), len(feat_list[0])))
    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx, :] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths) + 1

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    print('building mask')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx] + 1, idx] = 1.

    return x, x_mask, y
Example 20
def read_snli_from_csv(model):
    train_saved_path = './snli/processed-train.pkl'
    dev_saved_path = './snli/processed-dev.pkl'
    test_saved_path = './snli/processed-test.pkl'
    if os.path.isfile(train_saved_path) and os.path.isfile(test_saved_path):
        X_train, train_labels = joblib.load(train_saved_path)
        X_test, test_labels = joblib.load(test_saved_path)
        return X_train, X_test, train_labels, test_labels

    if model is None:
        raise ValueError("model is None")

    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    train_df = train_df[:(len(train_df) // 3)]  # integer division: slicing needs an int

    train_ts, train_hs, train_labels = get_sentence_sample(train_df)
    logger.info('encoding train samples ...')
    logger.info('encoding ts ...')
    vectorized_train_ts = skipthoughts.encode(model, train_ts)
    logger.info('encoding hs ...')
    vectorized_train_hs = skipthoughts.encode(model, train_hs)
    del train_df, train_ts, train_hs
    X_train = np.concatenate((vectorized_train_ts, vectorized_train_hs), axis=1)
    logger.info('dump to file ...')
    joblib.dump((X_train, train_labels), train_saved_path)

    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']

    test_ts, test_hs, test_labels = get_sentence_sample(test_df)
    logger.info('encoding test samples ...')
    logger.info('encoding ts ...')
    vectorized_test_ts = skipthoughts.encode(model, test_ts)
    logger.info('encoding hs ...')
    vectorized_test_hs = skipthoughts.encode(model, test_hs)
    del test_df, test_ts, test_hs
    X_test = np.concatenate((vectorized_test_ts, vectorized_test_hs), axis=1)

    logger.info('dump to file ...')
    joblib.dump((X_test, test_labels), test_saved_path)
    logger.info('done')

    return X_train, X_test, train_labels, test_labels
Example 21
def batcher(params, batch):
    # join token lists back into sentences; empty sentences fall back to '.'
    embeddings = skipthoughts.encode(params.encoder,
                                     [' '.join(sent) if sent != [] else '.'
                                      for sent in batch],
                                     verbose=False,
                                     use_eos=True)
    return embeddings
Example 22
def encode_sentences(desc, sentence_list, model, imdb_key=None, is_qa=False):
    """Encode a list of sentences given the model.
    """

    if desc == 'skipthought':
        # encode a sentence list directly
        features = skipthoughts.encode(model, sentence_list, verbose=False)

    elif desc == 'vis-text-embed':
        # normalize sentence lists
        norm_sentence_list = [
            utils.normalize_alphanumeric(sentence.lower())
            for sentence in sentence_list
        ]
        # allows to encode a sentence list directly
        features = model.encode(norm_sentence_list)

    elif desc.startswith('tfidf'):
        desc_dim = len(model.vocab)
        midx = model.doc_names.index(imdb_key)
        # use scipy sparse matrix when encoding stories, otherwise too huge!
        if is_qa:
            features = np.zeros((len(sentence_list), desc_dim),
                                dtype='float32')
        else:
            features = sps.dok_matrix((len(sentence_list), desc_dim),
                                      dtype='float32')

        for s, sentence in enumerate(sentence_list):
            # NOTE: use both alphanumeric and stemming normalization
            sentence = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower())).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab: continue
                widx = model.vocab.index(word)
                features[s, widx] = model.tfidf[widx][midx]

            if is_qa:  # if not sparse, use numpy.linalg.norm
                features[s] /= (np.linalg.norm(features[s]) + 1e-6)
            else:  # if sparse, use scipy.sparse.linalg.norm
                features[s] /= (sps.linalg.norm(features[s]) + 1e-6)

    elif desc == 'word2vec':
        desc_dim = model.get_vector(model.vocab[-1]).shape[0]
        features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use only alphanumeric normalization, no stemming
            sentence = utils.normalize_alphanumeric(
                sentence.lower()).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab: continue
                features[s] += model.get_vector(word)

            features[s] /= (np.linalg.norm(features[s]) + 1e-6)

    return features
Example 23
def compute_answer_vector2(answers, model):
    """Takes a dictionary of answers and returns a dictionary of vectors representing each answer"""
    answer_vector = {}
    for answer_option in answers.keys():
        a_list = answers[answer_option].split(".")
        # encode() expects a list of sentences, so wrap each one before summing
        b = np.array(sum(skipthoughts.encode(model, [sentence])[0] for sentence in a_list))
        avg_factor = 1.0 / len(answers.keys())
        answer_vector[answer_option] = np.multiply(b, avg_factor)
    return answer_vector
Example 24
def story(z, image_loc, k=20, bw=5, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None, :])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]
    #pipeline broken here and one good caption whose image name is same is added
    f = open('/Users/shreyajain/Downloads/output.txt').read()
    #f2 = open('/Users/shreyajain/Downloads/input_story.txt','w')
    image_name = image_loc.split('/')[-1]
    text = f.split('\n')
    for t in range(4, len(text)):
        l = text[t].split()

        if l != []:
            name = l[0].split('/')[-1]
            if name == image_name:
                caption = l[1:]
                #caption = ' '.join(caption)
                sentences = caption + sentences
                break
    # print 'NEAREST-CAPTIONS: '
    # for s in sentences[:5]:
    #     print s
    # print ''

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    shift = svecs.mean(0) - z['bneg'] + z['bpos']

    # Generate story conditioned on shift
    passage = decoder.run_sampler(z['dec'], shift, beam_width=bw)
    #print 'OUTPUT: '
    if lyric:
        for line in passage.split(','):
            if line[0] != ' ':
                print(line)
            else:
                print(line[1:])
    else:
        return passage
Example 25
def encoding(input):

    encoded_vector_dir = './encoded_vector'
    try:
        caption_vectors = skipthoughts.encode(model, input)
        print("Sentence encoding successful")
    except Exception:
        print("Failed sentence encoding")
        return  # nothing to save if encoding failed
    files_ = 'test_vector.pkl'
    with open(join(encoded_vector_dir, files_), mode='wb') as myfile:
        pickle.dump(caption_vectors, myfile)
Example 26
def main():

    beta1 = 0.5
    lr = 2e-4
    z_dim = 100
    t_dim = 256
    batch_size = 64
    image_size = 64
    gfc_dim = 1024
    caption_vector_length = 4800
    epochs = 600
    path = sys.argv[1]

    test_data, test_id = load_test_data(path)
    embed_model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(embed_model, test_data)

    #np.save("test_embedd.npy", caption_vectors)
    #exit()
    #caption_vectors = np.load("test_embedd.npy")
    caption_vectors = np.tile(caption_vectors, (5, 1))

    model_options = {
        'z_dim': 100,
        't_dim': 256,
        'batch_size': len(test_data) * 5,
        'image_size': 64,
        'gf_dim': 64,
        'df_dim': 64,
        'gfc_dim': 1024,
        'caption_vector_length': 4800
    }

    gan = model.GAN(model_options)
    input_tensors, variables, loss, outputs, checks = gan.build_model()

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()

        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state("Data/Models/")
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored.")

        z_noise = np.random.uniform(-1, 1, [5 * len(test_data), z_dim])

        gen = sess.run(outputs['generator'],
                       feed_dict={
                           input_tensors['t_real_caption']: caption_vectors,
                           input_tensors['t_z']: z_noise
                       })
        print "Saving Images, Model"

        save_image(gen, test_id)
Example 27
def main():
    ann_root = '/home/fl302/Projects/VQA-tensorflow/data/annotations'
    questions = _read_json(ann_root,
                           'MultipleChoice_mscoco_val2014_questions.json',
                           'questions')

    # create model
    model = skipthoughts.load_model()

    # create buffers
    quest_ids, quest_coding = [], []
    quest_buffer = []
    # now, do the job
    for i, info in enumerate(questions):
        print('Skip thought: extracted %d/%d' % (i, len(questions)))
        if i > 100:
            break
        quest_id = info['question_id']
        quest = info['question'].lower()
        quest_buffer.append(quest)
        quest_ids.append(quest_id)
        if i % 100 == 0 and i > 0:
            quest_vectors = skipthoughts.encode(model, quest_buffer)
            # append to the main buffer
            quest_coding.append(quest_vectors.copy())
            # clear question buffer
            quest_buffer = []
    # process last batch
    if quest_buffer:
        quest_vectors = skipthoughts.encode(model, quest_buffer)
        quest_coding.append(quest_vectors.copy())

    # concatenate
    quest_coding = np.concatenate(quest_coding, axis=0).astype(np.float32)
    quest_ids = np.array(quest_ids, dtype=np.int32)

    # save to file
    save_hdf5('vqa_val_skipthought.h5', {
        'quest_id': quest_ids,
        'quest_coding': quest_coding
    })
Example 28
def generate(sentences, stv, bpos, bneg, dec):
    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(stv, sentences, verbose=False)
    console.log("Encoded skipthought vector")
    # Style shifting
    shift = svecs.mean(0) - bneg + bpos
    console.log("Shifted style")
    # TODO: clean up here
    # Generate story conditioned on shift
    passage = decoder.run_sampler(dec, shift, beam_width=500)
    console.log("Sampled passage")
    return passage
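The style shift is plain vector arithmetic: subtract the mean of one style's skip-thought vectors and add the other's, then decode from the shifted point. A sketch with random stand-ins (if this follows neural-storyteller's style shifting, bneg and bpos would be mean skip-thought vectors of source-style and target-style text):

import numpy as np

svecs = np.random.randn(20, 4800).astype('float32')  # encoded input sentences
bneg = np.random.randn(4800).astype('float32')       # stand-in: source-style mean
bpos = np.random.randn(4800).astype('float32')       # stand-in: target-style mean
shift = svecs.mean(0) - bneg + bpos                  # remove source style, add target style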
Example 29
def transform_ques_weak(x, sqa, word_to_id):

    #model = skipthoughts.load_model()
    
    quest = x
    
    indices = []
    for i in range(0, sqa):
        # allocate a fresh zero-padded buffer per question, so rows from a
        # longer earlier encoding cannot leak into this one
        z = np.zeros((110, 4800))
        qi2 = skipthoughts.encode(model, x[i][2])
        s = qi2.shape[0]
        z[:s] = qi2
        quest[i][2] = z.tolist()
        

        '''
        four = encoded_already.s1()
        quest[i][2] = four[0]
        '''

        quest[i][3] = skipthoughts.encode(model, x[i][3])

        q3l = quest[i][3].tolist()
        quest[i][3] = q3l[0]    # skipthoughts returns a 2-D array, so a single-sentence encoding needs unwrapping
        

        '''
        four2 = encoded_already.s2()
        quest[i][3] = four2[0]
        '''

        
        quest[i][4] = word_to_id[x[i][4][0]]


    return quest
Example 30
def get_similarity(text1, text2):
    x = [text1, text2]
    vectors = skipthoughts.encode(model, x)

    # need reshaping to prevent warning from scikit-learn
    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)

    result_sim = float(cosine_similarity(a, b))

    print(result_sim, text1, text2)

    return result_sim
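The reshape-and-float() dance is there because scikit-learn's cosine_similarity expects 2-D inputs and returns a matrix, 1x1 in this case. The same pattern without skipthoughts:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

a = np.random.randn(1, 4800)
b = np.random.randn(1, 4800)
sim = float(cosine_similarity(a, b))  # 1x1 matrix -> scalar
print(sim)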
Example 31
def get_image_tag_pair(tag_dict_in_use, img_path="data/faces/"):
    print("start loading skipthoughts model")
    # get text vector
    model = skipthoughts.load_model()
    list_text = skipthoughts.encode(model, list(tag_dict_in_use.values()))

    # get image
    list_image = []
    for key, item in tag_dict_in_use.items():
        img = skimage.io.imread(os.path.join(img_path, str(key) + ".jpg"))
        img = skimage.transform.resize(img, (64, 64))
        list_image.append(img)
    return list_image, list_text
Example 32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file',
                        type=str,
                        default='Data/sample_captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data Directory')
    parser.add_argument('--data_set',
                        type=str,
                        default='flowers',
                        help="Define the name of data sets")

    args = parser.parse_args()
    _n_labels = 4096
    if args.data_set == "ImageNet":
        with open("./Data/sample_caption_ImageNet.txt") as f:
            captions = f.read().split('\n')
        captions = [cap for cap in captions if len(cap) > 0]
        caption_vector_list = []
        for cap in captions:
            _n = cap.split(',')
            _zeros0 = np.zeros(_n_labels)
            _zeros1 = np.zeros(_n_labels)
            _zeros0[int(_n[0])] = 1
            _zeros1[int(_n[1])] = 1  # second label fills the second one-hot block
            _onehot = np.concatenate([_zeros0, _zeros1], axis=0)
            caption_vector_list.append(_onehot)
        print(len(caption_vector_list), len(caption_vector_list[0]))
        h = h5py.File(join(args.data_dir, 'sample_caption_ImageNet.hdf5'))
        h.create_dataset('vectors', data=caption_vector_list)
        h.close()

    if args.data_set == "flowers":
        with open(args.caption_file) as f:
            captions = f.read().split('\n')

        captions = [cap for cap in captions if len(cap) > 0]
        print(captions)
        model = skipthoughts.load_model()
        caption_vectors = skipthoughts.encode(model, captions)
        print(caption_vectors)
        print(caption_vectors.shape, len(caption_vectors[0]))

        if os.path.isfile(join(args.data_dir, 'sample_caption_vectors.hdf5')):
            os.remove(join(args.data_dir, 'sample_caption_vectors.hdf5'))
        h = h5py.File(join(args.data_dir, 'sample_caption_vectors.hdf5'))
        h.create_dataset('vectors', data=caption_vectors)
        h.close()
Example 34
def evaluate(model, seed=1234, evaltest=False):
    """
    Run experiment
    """
    print('Preparing data...')
    train, dev, test, scores = load_data()
    train[0], train[1], scores[0] = shuffle(train[0],
                                            train[1],
                                            scores[0],
                                            random_state=seed)

    print('Computing training skipthoughts...')
    trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True)
    trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True)

    print('Computing development skipthoughts...')
    devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True)
    devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True)

    print('Computing feature combinations...')
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    devF = np.c_[np.abs(devA - devB), devA * devB]

    print('Encoding labels...')
    trainY = encode_labels(scores[0])
    devY = encode_labels(scores[1])

    print('Compiling model...')
    lrmodel = prepare_model(ninputs=trainF.shape[1])

    print('Training...')
    bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1])

    if evaltest:
        print('Computing test skipthoughts...')
        testA = skipthoughts.encode(model,
                                    test[0],
                                    verbose=False,
                                    use_eos=True)
        testB = skipthoughts.encode(model,
                                    test[1],
                                    verbose=False,
                                    use_eos=True)

        print('Computing feature combinations...')
        testF = np.c_[np.abs(testA - testB), testA * testB]

        print('Evaluating...')
        r = np.arange(1, 6)
        yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r)
        pr = pearsonr(yhat, scores[2])[0]
        sr = spearmanr(yhat, scores[2])[0]
        se = mse(yhat, scores[2])
        print('Test Pearson: ' + str(pr))
        print('Test Spearman: ' + str(sr))
        print('Test MSE: ' + str(se))

        return yhat
Example 36
def save_caption_vectors_faces(data_dir):
    import time

    data_dir = 'Data/'

    img_dir = join(data_dir, 'faces/jpg')
    image_files = [f for f in os.listdir(img_dir) if 'jpg' in f]
    print(image_files[1:20])
    print(len(image_files))
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'faces/tags/tags_clean.csv')
    with open(caption_dir, 'r') as fin:
        for line in fin:
            l = line.split(',')
            img_id = str(l[0]) + '.jpg'
            tags = l[1].split('\t')
            captions = []
            for t in tags:
                t2 = t.split(':')
                if len(t2) > 1:
                    captions.append([str(t2[0]), int(t2[1])])
            captions.sort(key=lambda tup: tup[1], reverse=True)
            for ind, c in enumerate(captions):
                # ==================================
                # TODO: top 10 tags
                # if ind >= 10:
                # 	break
                # image_captions[img_id].append(c[0])
                # ===================================
                # TODO: eyes and hair tags only
                if 'eye' in c[0] or 'hair' in c[0]:
                    image_captions[img_id].append(c[0])

    print(len(image_captions))

    model = skipthoughts.load_model()
    encoded_captions = {}

    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print(i, len(image_captions), img)
        print("Seconds", time.time() - st)

    h = h5py.File(join(data_dir, 'faces_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
Example 37
def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000):
    """
    Put data into format useable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
        feat_list.append(features[i])

    lengths = [len(s) for s in seqs]

    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    feat_list = skipthoughts.encode(model, feat_list, use_eos=False, verbose=False)

    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx,:] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)+1

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx],idx] = s
        x_mask[:lengths[idx]+1,idx] = 1.

    return x, x_mask, y
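The mask built at the end marks lengths[idx] + 1 timesteps per column: every token plus one trailing position (which is why maxlen is numpy.max(lengths) + 1). A toy view of the layout:

import numpy as np

lengths = [2, 3]
maxlen = max(lengths) + 1
x_mask = np.zeros((maxlen, len(lengths)), dtype='float32')
for idx, l in enumerate(lengths):
    x_mask[:l + 1, idx] = 1.
print(x_mask)   # column idx has l+1 leading ones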
Example 38
def story(z, image_loc, k=100, bw=50, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None,:])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]

    print('NEAREST-CAPTIONS: ')
    for s in sentences[:5]:
        print(s)
    print('')

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    shift = svecs.mean(0) - z['bneg'] + z['bpos']

    # Generate story conditioned on shift
    passage = decoder.run_sampler(z['dec'], shift, beam_width=bw)
    print('OUTPUT: ')
    if lyric:
        for line in passage.split(','):
            if line[0] != ' ':
                print(line)
            else:
                print(line[1:])
    else:
        print(passage)
        return passage
Example 40
def save_caption_vectors_flowers(data_dir):
    import time

    img_dir = join(data_dir, 'flowers/jpg')
    image_files = [f for f in os.listdir(img_dir) if 'jpg' in f]
    print(image_files[300:400])
    print(len(image_files))
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'flowers/text_c10')
    class_dirs = []
    for i in range(1, 103):
        class_dir_name = 'class_%.5d' % (i)
        class_dirs.append(join(caption_dir, class_dir_name))

    for class_dir in class_dirs:
        caption_files = [f for f in os.listdir(class_dir) if 'txt' in f]
        for cap_file in caption_files:
            with open(join(class_dir, cap_file)) as f:
                captions = f.read().split('\n')
            img_file = cap_file[0:11] + ".jpg"
            # 5 captions per image
            image_captions[img_file] += [cap for cap in captions if len(cap) > 0][0:5]

    print(len(image_captions))

    model = skipthoughts.load_model()
    encoded_captions = {}

    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print(i, len(image_captions), img)
        print("Seconds", time.time() - st)

    h = h5py.File(join(data_dir, 'flower_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
Example 41
def load_data(model, name, loc='./data/', seed=1234):
    """
    Load one of MR, CR, SUBJ or MPQA
    """
    z = {}
    if name == 'MR':
        pos, neg = load_rt(loc=loc)
    elif name == 'SUBJ':
        pos, neg = load_subj(loc=loc)
    elif name == 'CR':
        pos, neg = load_cr(loc=loc)
    elif name == 'MPQA':
        pos, neg = load_mpqa(loc=loc)

    labels = compute_labels(pos, neg)
    text, labels = shuffle_data(pos+neg, labels, seed=seed)
    z['text'] = text
    z['labels'] = labels
    print('Computing skip-thought vectors...')
    features = skipthoughts.encode(model, text, verbose=False)
    return z, features
Example 42
def save_caption_vectors_ms_coco(data_dir, split, batch_size):
    meta_data = {}
    ic_file = join(data_dir, 'annotations/captions_{}2014.json'.format(split))
    with open(ic_file) as f:
        ic_data = json.loads(f.read())

    meta_data['data_length'] = len(ic_data['annotations'])
    with open(join(data_dir, 'meta_{}.pkl'.format(split)), 'wb') as f:
        pickle.dump(meta_data, f)

    model = skipthoughts.load_model()
    batch_no = 0
    print("Total Batches", len(ic_data['annotations']) // batch_size)

    while batch_no * batch_size < len(ic_data['annotations']):
        captions = []
        image_ids = []
        idx = batch_no
        for i in range(batch_no * batch_size, (batch_no + 1) * batch_size):
            idx = i % len(ic_data['annotations'])
            captions.append(ic_data['annotations'][idx]['caption'])
            image_ids.append(ic_data['annotations'][idx]['image_id'])

        print(captions)
        print(image_ids)
        # Thought Vectors
        tv_batch = skipthoughts.encode(model, captions)
        h5f_tv_batch = h5py.File(join(data_dir, 'tvs/' + split + '_tvs_' + str(batch_no)), 'w')
        h5f_tv_batch.create_dataset('tv', data=tv_batch)
        h5f_tv_batch.close()

        h5f_tv_batch_image_ids = h5py.File(join(data_dir, 'tvs/' + split + '_tv_image_id_' + str(batch_no)), 'w')
        h5f_tv_batch_image_ids.create_dataset('tv', data=image_ids)
        h5f_tv_batch_image_ids.close()

        print("Batches Done", batch_no, len(ic_data['annotations']) // batch_size)
        batch_no += 1
Example 43
            def embedding_thread(x, y, output):
                imdb_key_check = {}
                last_stories = []
                for i in tqdm(range(x, y)):
                    error = False

                    qa_info = self.qa[i]
                    question = str(qa_info.question)
                    answers = qa_info.answers
                    correct_index = qa_info.correct_index
                    imdb_key = str(qa_info.imdb_key)
                    validation_flag = str(qa_info.qid)

                    for answer in answers:
                        if len(answer) == 0: error = True
                    if error: continue

                    question_embedding = skipthoughts.encode(model, [question])
                    words_in_question = word_tokenize(question)
                    assert question_embedding.shape == (1,4800)

                    local_answers = skipthoughts.encode(model, answers)


                    stories = self.story[imdb_key]

                    local_stories = []
                    if imdb_key in imdb_key_check: local_stories = last_stories
                    else:
                        imdb_key_check[imdb_key] = 1
                        local_stories = skipthoughts.encode(model, stories)
                        last_stories = local_stories

                    skip_dim = 4800
                    self.zq.append(question_embedding)
                    self.zaj.append(np.array(local_answers).reshape(5,4800))
                    zsl_row = np.array(local_stories).shape[0]
                    print "zsl shape >> ",
                    print np.array(local_stories).shape
                    self.zsl.append(np.array(local_stories).reshape(zsl_row,4800))
                    self.qid.append(qa_info.qid)

                    print "==========================="
                    print "each QAInfo status >> "
                    print "question embedding shape >> ",
                    print np.array(self.zq).shape
                    print np.array(self.zq_val).shape
                    print "answer embedding shape >> ",
                    print np.array(self.zaj).shape
                    print np.array(self.zaj_val).shape
                    print "stories embedding shape >> ",
                    try:
                        print np.array(self.zsl).shape
                        print np.array(self.zsl_val).shape
                    except:
                        print "warning : dimension error."

                    print "ground truth shape >> ",
                    print np.array(self.ground_truth).shape
                    print np.array(self.ground_truth_val).shape
                    print "=========================="

                output.put(self.zq)
                output.put(self.zaj)
                output.put(self.zsl)
                output.put(self.qid)
Example 44
def compute_question_vector(question, model):
    """Takes a question and computes a thought vector representing that question"""
    q_vec = skipthoughts.encode(model, [question])  # encode() expects a list of sentences
    return q_vec
Example 45
vectors_of_papers_for_diff_authors = []
for i in abstract.keys():
	t = time()
	outfile.write("<author>" + "\n")
	outfile.write(i + "\n")
	#print i
	outfile.write("</author>" + "\n")
	outfile.write("<paper_id>" + "\n")
	for j in identifier[i]:
		outfile.write(j + "\n")
	outfile.write("</paper_id>" + "\n")
	outfile.write("<paper_vector>" + "\n")
	# for k in abstract[i]:
	# 	print k
	# 	abs_in_list = [k]
	vecs = skipthoughts.encode(model,abstract[i])
	vectors_of_papers_for_diff_authors.append(vecs)

	for vec in vecs:
		outfile.write(" ".join(str(vec)[1:-1])+'\n')

	outfile.write("</paper_vector>" + "\n\n")
	#print "Time taken for author: ", i, time()-t

dict_temp = {'author_names':authors, 'paper_vectors':vectors_of_papers_for_diff_authors}
#pkl.dump({'author_names':authors, 'paper_vectors':vectors_of_papers_for_diff_authors}, vec_pickle)
io.savemat('vectors_three_authors.mat',dict_temp)
checkfile.close()
outfile.close()
#vec_pickle.close()
Example 46
test_sentences = preprocess.get_chain_sentences(test_data)


def get_chunk(iterable, size):
    return [iterable[x:x + size] for x in range(0, len(iterable), size)]


def save_vectors(dataset, chain_vectors, save_file_name, train=True):
    vectors_dict = {}
    size = 5 if train else 6
    for key, vectors in zip(dataset.keys(), get_chunk(chain_vectors, size)):
        vectors_dict[key] = vectors
    print("save", save_file_name)
    shape = vectors.shape
    shape = (1, shape[0], shape[1])
    vectors_matrix = np.concatenate([v.reshape(shape) for k, v in sorted(
        vectors_dict.items(), key=lambda x:x[0])], axis=0)
    np.save(save_file_name, vectors_matrix)

print("encode valid vectors")
valid_vectors = skipthoughts.encode(model, valid_sentences)
save_vectors(valid_data, valid_vectors, "valid_vectors", train=False)

print("encode test vectors")
test_vectors = skipthoughts.encode(model, test_sentences)
save_vectors(test_data, test_vectors, "test_vectors", train=False)

print("encode train vectors")
train_vectors = skipthoughts.encode(model, train_sentences)
save_vectors(train_data, train_vectors, "train_vectors", train=True)
Example 47
		istitle = 1
	elif line == '<author>':
		isauth = 1
	elif line == '<abstract>':
		isabs = 1
	elif isend == 1:
		author2=author1[0]
		author2+='$'
		author2 = re.sub('[\s]','_',author2)
		identifier = author2+re.sub('[\s]','_',title1)
		identifier = re.sub(r'\.', '', identifier)
		abstract1 = title1+'$'+abstract1
		abstract1=re.sub(r'\"','\'',abstract1)
		abstract1=abstract1.decode('utf8')
		a=[abstract1]
		vector=skipthoughts.encode(model,a)
		for i in author1:
			if i in unique_authors_set:
				if identifier not in author_papers[i]:
					author_papers[i].append(identifier)
					paper_vectors[i].append(vector)
		author2,title1,abstract1='','',''
		author1=[]
		isend = 0

if isend == 1:
		author2=author1[0]
		author2+='$'
		author2 = re.sub('[\s]','_',author2)
		identifier = author2+re.sub('[\s]','_',title1)
		identifier = re.sub(r'\.', '', identifier)
Example 48
import skipthoughts

model = skipthoughts.load_model()
vectors = skipthoughts.encode(model, ['This morning, I decided to take a  walk. I wonder how these things come into play'])
print(vectors)
Example 49
import skipthoughts
import play
import os

path_to_file = "/Users/wheatwaves/deeplearning/skip-thoughts/data/data/sentencesOfPureText/"
path_to_save_file = "/Users/wheatwaves/deeplearning/skip-thoughts/data/data/sentencesOfPureText/"
g = os.walk(path_to_file)
names = []
for root, dirs, files in g:
    names = files
model = skipthoughts.load_model()
for name in names[1:]:
    try:
        f = open(path_to_file + name)
        s = [line.strip().decode('utf-8') for line in f.readlines()]
        f.close()
        M = skipthoughts.encode(model, s)
        scores = [0] * len(s)
    except Exception:
        f = open(path_to_file + name)
        s = [line.strip() for line in f.readlines()]
        f.close()
        M = skipthoughts.encode(model, s)
        scores = [0] * len(s)
    for i in range(len(M)):
        for j in range(len(M)):
            if i != j:
                scores[i] += play.cos_similarity(M[i], M[j])
    f = open(path_to_save_file + name + '.scores', 'w')
    for num in scores:
        f.write(str(num) + '\n')
    f.close()
Example n. 50
0
# -*- coding: utf-8 -*-

# Created by junfeng on 3/28/16.

# logging config
import logging

logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)

import skipthoughts
model = skipthoughts.load_model()

X = [
    'Hello, skip thoughts',
]
vectors = skipthoughts.encode(model, X)
print(vectors)
            value = pair.get('value')
            if value == 'TRUE':
                labels[i] = 1
            t = pair.find('t')
            h = pair.find('h')
            t = t.string.strip()
            h = h.string.strip()
            ts.append(t)
            hs.append(h)
            samples.append(u'{0} {1}'.format(t, h))
            if i % 1000 == 0:
                logger.info('processed sample {0}'.format(i))
        logger.info('unique ts: {0}, unique hs: {1}'.format(len(set(ts)), len(set(hs))))
        logger.info('unique sample: {0}'.format(len(set(samples))))
        logger.info('TRUE labels: {0}'.format(np.sum(labels)))
        return ts, hs, labels


if __name__ == '__main__':
    logger.info('read rte dataset xml file ...')
    ts, hs, labels = read_rte_xml()
    logger.info('read model ...')
    model = read_model()
    logger.info('encoding ts ...')
    vectorized_ts = skipthoughts.encode(model, ts)
    logger.info('encoding hs ...')
    vectorized_hs = skipthoughts.encode(model, hs)
    logger.info('dump to file ...')
    joblib.dump((vectorized_ts, vectorized_hs, labels), './data/processed-rte-dataset.pkl')
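    # the dump can be restored later with joblib.load('./data/processed-rte-dataset.pkl')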
    logger.info('done')
Example n. 52
0
                (row * dimension) : ((row + 1) * dimension), (column * dimension) : ((column + 1) * dimension), :
            ] = c[:][:][:]

        # Note that this architecture is hardcoded for now.
        # After generating blurry images, sharpen them.
        K = 32
        factors = [1, 1, 1]
        kernel_sizes = [7, 7, 5]
        num_filts = [128, 128, 3]
        pathToGANWeights = args.gan_path

        sys.path.append(args.skipthought_path)
        import skipthoughts

        model = skipthoughts.load_model()
        y_skipthought = skipthoughts.encode(model, [sentence])
        y_skipthought = np.float32(np.repeat(y_skipthought, 100, axis=0))
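        # tile the single caption encoding, presumably one copy per generated sample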

        batch_size = generated_imgs.shape[0]
        print generated_imgs.shape, y_skipthought.shape

        generate_edges_func = gan(K, batch_size, factors, kernel_sizes, num_filts, pathToGANWeights)
        edges = generate_edges_func(generated_imgs, y_skipthought)

        generated_imgs = generated_imgs.reshape([generated_imgs.shape[0], 3, K, K])
        sharp_imgs = generated_imgs + edges
        sharp_imgs[sharp_imgs > 1] = 1
        sharp_imgs[sharp_imgs < 0] = 0

        total_image_sharp = np.zeros(
            (dimension * int(math.sqrt(num_samples)), dimension * int(math.sqrt(num_samples)), 3)
Example n. 53
0
lyrics_table=lyrics_table[['lyrics']].applymap(lyrics_clean)

for i in lyrics_table.index:
    lyrics_table.loc[i]['lyrics'].index=[i]*len(lyrics_table.loc[i]['lyrics'])

lyrics_table.iloc[1]['lyrics'][['sentence']].values.transpose().tolist()[0]

all_lyrics_table = pd.concat(lyrics_table['lyrics'].values)
# Calculate sentence similarity
from analysis_package import *
import skipthoughts
model=skipthoughts.load_model()

all_lyrics = all_lyrics_table[['sentence']].values.transpose().tolist()[0]
len(all_lyrics)
vectors = skipthoughts.encode(model,all_lyrics[:10000],use_eos=True)
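# use_eos=True appends the end-of-sentence token to each sentence before encoding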

lyrics_embedding_table = pd.DataFrame(vectors,index = all_lyrics[:10000])

neighbor_table = k_nearest_neighbor(lyrics_embedding_table,5)

def range_filter(df, keys, lower, upper):
    # keep rows whose values lie strictly between lower and upper for every key
    true_false_table = np.array(df[[keys[0]]] < upper) & np.array(df[[keys[0]]] > lower)
    for key in keys[1:]:
        true_false_table = true_false_table & (np.array(df[[key]] < upper) & np.array(df[[key]] > lower))
    return df[true_false_table]
range_filter(neighbor_table,[3,5,7,9],0.,0.5)
range_filter(neighbor_table,[3,5,7],0.,0.5)
range_filter(neighbor_table,[3,5],0.,0.5)
Example n. 54
0
            def embedding_thread(x, y, output):
                imdb_key_check = {}
                last_stories = []
                for i in tqdm(xrange(x,y)):
                    error = False

                    qa_info = self.qa[i]
                    question = str(qa_info.question)
                    answers = qa_info.answers
                    correct_index = qa_info.correct_index
                    imdb_key = str(qa_info.imdb_key)
                    validation_flag = str(qa_info.qid)

                    # skip QA pairs that contain an empty answer
                    for answer in answers:
                        if len(answer) == 0:
                            error = True
                    if error:
                        continue

                    question_embedding = skipthoughts.encode(model, [question])
                    words_in_question = word_tokenize(question)
                    assert question_embedding.shape == (1,4800)

                    local_answers = skipthoughts.encode(model, answers)


                    stories = self.story[imdb_key]

                    local_stories = []
                    if imdb_key in imdb_key_check: local_stories = last_stories
                    else:
                        imdb_key_check[imdb_key] = 1
                        local_stories = skipthoughts.encode(model, stories)
                        # alternative (commented out in the source): encode only
                        # story sentences that share words with the question
                        print local_stories.shape
                        last_stories = local_stories

                    skip_dim = 4800
                    if validation_flag.find('train') != -1:
                        self.zq.append(question_embedding)
                        self.zaj.append(np.array(local_answers).reshape(5,4800))
                        self.ground_truth.append(correct_index)
                        zsl_row = np.array(local_stories).shape[0]
                        print "zsl shape >> ",
                        print np.array(local_stories).shape
                        self.zsl.append(np.array(local_stories).reshape(zsl_row,4800))

                    elif validation_flag.find('val') != -1:
                        self.zq_val.append(question_embedding)
                        self.zaj_val.append(np.array(local_answers).reshape(5,4800))
                        self.ground_truth_val.append(correct_index)
                        zsl_row = np.array(local_stories).shape[0]
                        self.zsl_val.append(np.array(local_stories).reshape(zsl_row,4800))



                    print "==========================="
                    print "each QAInfo status >> "
                    print "question embedding shape >> ",
                    print np.array(self.zq).shape
                    print np.array(self.zq_val).shape
                    print "answer embedding shape >> ",
                    print np.array(self.zaj).shape
                    print np.array(self.zaj_val).shape
                    print "stories embedding shape >> ",
                    try:
                        print np.array(self.zsl).shape
                        print np.array(self.zsl_val).shape
                    except:
                        print "warning : dimension error."

                    print "ground truth shape >> ",
                    print np.array(self.ground_truth).shape
                    print np.array(self.ground_truth_val).shape
                    print "=========================="

                output.put(self.zq)
                output.put(self.zq_val)
                output.put(self.zaj)
                output.put(self.zaj_val)
                output.put(self.zsl)
                output.put(self.zsl_val)
                output.put(self.ground_truth)
                output.put(self.ground_truth_val)
Example n. 55
0
def transform_ques_weak(x, sqa, word_to_id):
    """Encode each question's statement list (x[i][2]) and question sentence
    (x[i][3]) with skip-thoughts, and map the answer word (x[i][4]) to its
    vocabulary id.
    """
    # d1, d2 and e below are hardcoded token-id samples; they are not used
    # in the active code path
    d1 = [[14, 11, 21, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [1, 11, 21, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 2, 3, 4, 12, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 28, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [31, 2, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [14, 39, 4, 9, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 2, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [14, 13, 4, 9, 3, 20, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 13, 4, 9, 3, 14, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [14, 35, 4, 9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

    d2 = [[14, 11, 21, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [1, 11, 21, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 2, 3, 4, 12, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 28, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [31, 2, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [14, 39, 4, 9, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
        [20, 2, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]



    e = [14, 36, 4, 9, 3, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    quest = x
    # `model` is expected to be a skip-thoughts model loaded at module level
    for i in range(0, sqa):
        qi2 = skipthoughts.encode(model, x[i][2])

        # zero-pad the statement encodings to a fixed 110 x 4800 block; the
        # buffer is re-created each iteration so rows left over from a longer
        # previous example cannot leak into a shorter one
        z = np.zeros((110, 4800))
        s = qi2.shape[0]
        z[:s] = qi2
        quest[i][2] = z.tolist()

        quest[i][3] = skipthoughts.encode(model, x[i][3])

        # skipthoughts returns a 2-D (1 x 4800) array for a single sentence,
        # so keep only the first row
        quest[i][3] = quest[i][3].tolist()[0]

        # the answer is stored as a vocabulary id rather than an encoding
        quest[i][4] = word_to_id[x[i][4][0]]

    return quest
def compute_question_vector2(question, model):
    """Takes a question and computes a thought vector representing that question"""
    q_list = [s for s in question.split(".") if s.strip()]
    # encode() expects a list of sentences, so encode them in one batch
    # and sum the per-sentence rows into a single vector
    q_vec = skipthoughts.encode(model, q_list).sum(axis=0)
    return q_vec
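# usage sketch (hypothetical input): q_vec = compute_question_vector2('Where is the ball. Who threw it.', model)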
Example n. 57
0
            def embedding_thread(a, b):
                imdb_key_check = {}
                last_stories = []
                for i in tqdm(xrange(a,b)):
                    error = False
                    #if i == 100 : break

                    qa_info = self.qa[i]
                    question = str(qa_info.question)
                    answers = qa_info.answers
                    correct_index = qa_info.correct_index
                    imdb_key = str(qa_info.imdb_key)
                    validation_flag = str(qa_info.qid)

                    question_embedding = skipthoughts.encode(model, [question])
                    assert question_embedding.shape == (1,4800)

                    for answer in answers:
                        if len(answer) == 0:
                            error = True
                    if error:
                        continue
                    local_answers = [skipthoughts.encode(model, [str(answer)]) for answer in answers]

                    gt = [0.0] * 5
                    gt[correct_index] = 1.0

                    stories = self.story[imdb_key]


                    local_stories = []
                    #for s in stories : print [str(s)]
                    if imdb_key in imdb_key_check: local_stories = last_stories
                    else:
                        imdb_key_check[imdb_key] = 1
                        local_stories = [skipthoughts.encode(model, [str(s)])  for s in stories]
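                        # note: one encode() call per sentence is slow; passing the
                        # whole story list in a single call would batch the work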
                        last_stories = local_stories

                    skip_dim = 4800
                    if validation_flag.find('train') != -1:
                        self.zq.append(question_embedding.reshape((skip_dim)))
                        self.zaj.append(np.transpose(np.array(local_answers).reshape(5,skip_dim)))
                        self.ground_truth.append(np.array(gt))
                        zsl_row = np.array(local_stories).shape[0]
                        print "zsl shape >> ",
                        print np.array(local_stories).shape
                        self.zsl.append(np.transpose(np.array(local_stories).reshape(zsl_row, skip_dim)))

                    elif validation_flag.find('val') != -1:
                        self.zq_val.append(question_embedding.reshape((skip_dim)))
                        self.zaj_val.append(np.transpose(np.array(local_answers).reshape(5,skip_dim)))
                        self.ground_truth_val.append(np.array(gt))
                        zsl_row = np.array(local_stories).shape[0]
                        self.zsl_val.append(np.transpose(np.array(local_stories).reshape(zsl_row,skip_dim)))




                    print "==========================="
                    print "each QAInfo status >> "
                    print "question embedding shape >> ",
                    print np.array(self.zq_val).shape
                    print "answer embedding shape >> ",
                    print np.array(self.zaj_val).shape
                    print "stories embedding shape >> ",
                    try:
                        print np.array(self.zsl_val).shape
                    except:
                        print "warning : dimension error."

                    print "ground truth shape >> ",
                    print np.array(self.ground_truth_val).shape
                    print "=========================="
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--caption_file', type=str, default='Data/sample_captions.txt',
					   help='caption file')
	parser.add_argument('--data_dir', type=str, default='Data',
					   help='Data Directory')

	args = parser.parse_args()
	with open( args.caption_file ) as f:
		captions = f.read().split('\n')

	captions = [cap for cap in captions if len(cap) > 0]
	print captions
	model = skipthoughts.load_model()
	caption_vectors = skipthoughts.encode(model, captions)
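	# caption_vectors: one 4800-d combined skip-thought vector per caption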

	if os.path.isfile(join(args.data_dir, 'sample_caption_vectors.hdf5')):
		os.remove(join(args.data_dir, 'sample_caption_vectors.hdf5'))
	h = h5py.File(join(args.data_dir, 'sample_caption_vectors.hdf5'), 'w')
	h.create_dataset('vectors', data=caption_vectors)
	h.close()