def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print('Preparing data...')
    traintext, testtext = load_data()
    train, train_labels = prepare_data(traintext)
    test, test_labels = prepare_data(testtext)
    train_labels = prepare_labels(train_labels)
    test_labels = prepare_labels(test_labels)
    train, train_labels = shuffle(train, train_labels, random_state=seed)

    print('Computing training skipthoughts...')
    trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False)

    if evalcv:
        print('Running cross-validation...')
        interval = [2**t for t in range(0, 9)]   # coarse-grained
        C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed)

    if evaltest:
        if not evalcv:
            C = 128   # best parameter found from CV

        print('Computing testing skipthoughts...')
        testF = skipthoughts.encode(model, test, verbose=False, use_eos=False)

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(trainF, train_labels)
        yhat = clf.predict(testF)
        print('Test accuracy: ' + str(clf.score(testF, test_labels)))
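# eval_kfold is called above but not shown; below is a minimal sketch of what it
# is assumed to do (pick the LogisticRegression C with the best mean k-fold CV
# accuracy). The name and behavior here are assumptions, not the original helper.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def eval_kfold(features, labels, k=10, scan=(1, 2, 4, 8), seed=1234):
    labels = np.asarray(labels)
    kf = KFold(n_splits=k, shuffle=True, random_state=seed)
    best_c, best_acc = scan[0], -1.0
    for c in scan:
        accs = []
        for tr, te in kf.split(features):
            clf = LogisticRegression(C=c)
            clf.fit(features[tr], labels[tr])
            accs.append(clf.score(features[te], labels[te]))
        if np.mean(accs) > best_acc:
            best_c, best_acc = c, float(np.mean(accs))
    return best_c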
def embd_sent(self, sent):
    import skipthoughts
    # combine-skip vectors are 4800-d: the first 2400 dims are the uni-skip
    # encoding, the last 2400 the bi-skip encoding
    if self.type == 'uni':
        return skipthoughts.encode(self.model, [sent])[0][0:2400]
    elif self.type == 'bi':
        return skipthoughts.encode(self.model, [sent])[0][2400:]
    else:
        return skipthoughts.encode(self.model, [sent])[0]
def embd_multiple_sents(self, sents):
    import skipthoughts
    if self.type == 'combined':
        return skipthoughts.encode(self.model, sents)
    elif self.type == 'uni':
        ix_from = 0
        ix_to = 2400
    else:
        ix_from = 2400
        ix_to = 4800
    return np.asarray([v[ix_from:ix_to] for v in skipthoughts.encode(self.model, sents)])
def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print 'Preparing data...'
    traintext, testtext, labels = load_data()

    print 'Computing training skipthoughts...'
    trainA = skipthoughts.encode(model, traintext[0], verbose=False)
    trainB = skipthoughts.encode(model, traintext[1], verbose=False)

    if evalcv:
        print 'Running cross-validation...'
        C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10,
                       seed=1234, use_feats=use_feats)

    if evaltest:
        if not evalcv:
            C = 4   # best parameter found from CV (combine-skip with use_feats=True)

        print 'Computing testing skipthoughts...'
        testA = skipthoughts.encode(model, testtext[0], verbose=False)
        testB = skipthoughts.encode(model, testtext[1], verbose=False)

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                                   feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB,
                                  feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluating...'
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        print 'Test accuracy: ' + str(clf.score(test_features, labels[1]))
        print 'Test F1: ' + str(f1(labels[1], yhat))
def generate_story_loss(z, image_loc, k=100, bw=50, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None, :])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]
    print 'NEAREST-CAPTIONS: '
    for s in sentences[:5]:
        print s
    print ''

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    # shift = svecs.mean(0) - z['bneg'] + z['bpos']
    return svecs.mean(0), z['bneg'], z['bpos']
def predict():
    queries = request.get_json(silent=True, force=True)['input']
    # query = "This is a red flower with yellow stamen."
    encoded = Variable(torch.Tensor(skipthoughts.encode(model, queries)))
    if torch.cuda.is_available():
        encoded = encoded.cuda()
    image_paths = []
    for batch_i in range(BATCH_SIZE):
        noise_vec = Variable(torch.randn(len(queries), 100, 1, 1))
        if torch.cuda.is_available():
            noise_vec = noise_vec.cuda()
        gen_images = generator.forward(encoded, noise_vec)
        gen_images = gen_images.cpu()
        for i, img in enumerate(gen_images):
            # CHW -> HWC for image saving
            curr = img.data.numpy()
            curr = np.swapaxes(curr, 0, 1)
            curr = np.swapaxes(curr, 1, 2)
            path = 'Data/samples/' + str(batch_i) + '_' + str(i) + '.png'
            scipy.misc.imsave(path, curr)
            image_paths.append(path)
    return jsonify({'images': image_paths})
def encode_and_save(image_captions, image_classes, data_dir: str, dataset: str):
    model = skipthoughts.load_model()
    encoded_captions = {}
    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        if i % 20 == 0:
            print(i, len(image_captions), img)
            print("Seconds", time.time() - st)

    img_ids = list(image_captions.keys())
    random.shuffle(img_ids)
    n_train_instances = int(len(img_ids) * 0.9)
    tr_image_ids = img_ids[0:n_train_instances]
    # take the full tail for validation; [n_train_instances:-1] would silently
    # drop the last image
    val_image_ids = img_ids[n_train_instances:]

    pickle.dump(image_captions,
                open(os.path.join(data_dir, dataset, dataset + '_caps.pkl'), "wb"))
    pickle.dump(tr_image_ids,
                open(os.path.join(data_dir, dataset, 'train_ids.pkl'), "wb"))
    pickle.dump(val_image_ids,
                open(os.path.join(data_dir, dataset, 'val_ids.pkl'), "wb"))

    ec_pkl_path = join(data_dir, dataset, dataset + '_tv.pkl')
    pickle.dump(encoded_captions, open(ec_pkl_path, "wb"))
    fc_pkl_path = join(data_dir, dataset, dataset + '_tc.pkl')
    pickle.dump(image_classes, open(fc_pkl_path, "wb"))
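# A quick sanity check for the pickled encodings above (assumed usage, not part
# of the original script; the path below is hypothetical):
import pickle

with open('Data/flowers/flowers_tv.pkl', 'rb') as f:
    encoded_captions = pickle.load(f)
first_img = next(iter(encoded_captions))
print(encoded_captions[first_img].shape)  # (num_captions_for_image, 4800)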
def extract_sentence_vectors(model):
    sentences = []
    selected_jsons = []
    ifile = open('op.json', 'r')
    for idx, line in enumerate(ifile):
        jj = json.loads(line)
        s1 = jj[0]
        s2 = jj[1]
        sentences.append(s1['sent'])
        sentences.append(s2['sent'])
    vectors = skipthoughts.encode(model, sentences)
    # integer division so the target shape stays integral under Python 3
    sent_pair_vectors = numpy.reshape(vectors, (len(vectors) // 2, len(vectors[0]) * 2))
    print(sent_pair_vectors.shape)
    numpy.save("toy_pair_vectors.npy", sent_pair_vectors)
def encode(self, sentences, verbose=False):
    self.sentences = sentences
    if self.loaded_custom_model:
        self.vectors = penseur_utils.encode(self.model, sentences, verbose)
    else:
        # pass verbose by keyword: the third positional argument of
        # skipthoughts.encode is use_norm, not verbose
        self.vectors = skipthoughts.encode(self.model, sentences, verbose=verbose)
    return self.vectors
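# A minimal reference call. The signature below is from the original
# ryankiros/skip-thoughts release as I recall it, so treat it as an assumption:
#     encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False)
import skipthoughts

model = skipthoughts.load_model()
vectors = skipthoughts.encode(model, ['A sentence to embed.'], verbose=False)
print(vectors.shape)  # (1, 4800) for the combine-skip model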
def save_caption_vectors_shapes(data_dir):
    import time

    img_dir = join(data_dir, 'shapes/images')
    image_files = [f for f in os.listdir(img_dir) if 'png' in f]
    print image_files[300:400]
    print len(image_files)
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'shapes/texts')
    caption_files = [f for f in os.listdir(caption_dir) if 'txt' in f]
    for cap_file in caption_files:
        with open(join(caption_dir, cap_file)) as f:
            captions = f.read().split('\n')
        img_file = cap_file[0:5] + ".png"
        # 5 captions per image
        image_captions[img_file] += [cap for cap in captions if len(cap) > 0][0:5]
    print len(image_captions)

    model = skipthoughts.load_model()
    encoded_captions = {}
    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print i, len(image_captions), img
        print "Seconds", time.time() - st

    h = h5py.File(join(data_dir, 'shapes_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file', type=str, default='Data/sample_captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data Directory')
    args = parser.parse_args()

    with open(args.caption_file) as f:
        captions = f.read().split('\n')
    captions = [cap for cap in captions if len(cap) > 0]
    print(captions)

    model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(model, captions)
    if os.path.isfile(join(args.data_dir, 'sample_caption_vectors.hdf5')):
        os.remove(join(args.data_dir, 'sample_caption_vectors.hdf5'))
    h = h5py.File(join(args.data_dir, 'sample_caption_vectors.hdf5'))
    h.create_dataset('vectors', data=caption_vectors)
    h.close()
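# Reading the saved vectors back (an assumed usage example, not part of the
# original script); combine-skip vectors are 4800-dimensional:
import h5py

with h5py.File('Data/sample_caption_vectors.hdf5', 'r') as h:
    vectors = h['vectors'][:]
print(vectors.shape)  # expected: (num_captions, 4800)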
def extract_sentence_vectors(model):
    sentences = []
    selected_jsons = []
    ifile = open('../../snli_1.0/snli_1.0_train.jsonl', 'r')
    for idx, line in enumerate(ifile):
        # keep roughly 10% of the pairs
        dropout_chance = random.random()
        if dropout_chance < 0.1:
            jj = json.loads(line)
            sentences.append(jj['sentence1'])
            sentences.append(jj['sentence2'])
            selected_jsons.append(jj)
    print(len(sentences), len(selected_jsons))

    vectors = skipthoughts.encode(model, sentences)
    # integer division so the target shape stays integral under Python 3
    sent_pair_vectors = numpy.reshape(vectors, (len(vectors) // 2, len(vectors[0]) * 2))
    print(sent_pair_vectors.shape)
    numpy.save("sentence_pair_vectors.npy", sent_pair_vectors)

    with open('selected_sentence_pairs.json', 'w') as f:
        for jsons in selected_jsons:
            f.write(json.dumps(jsons))
            f.write('\n')
def batcher(params, batch):
    batch = [' '.join(sent) if sent != [] else '.' for sent in batch]
    embeddings = skipthoughts.encode(params['encoder'], batch, verbose=False, use_eos=True)
    return embeddings
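# The batcher above follows the SentEval callback convention. A sketch of how it
# might be wired up (the senteval API names here are recalled from the
# facebookresearch/SentEval toolkit and should be treated as assumptions):
import senteval
import skipthoughts

def prepare(params, samples):
    pass  # nothing to precompute for a fixed pretrained encoder

params = {'task_path': 'data', 'usepytorch': False, 'kfold': 10}
params['encoder'] = skipthoughts.load_model()
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(['MR', 'SICKEntailment'])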
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file', type=str, default='Data/captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data Directory')
    args = parser.parse_args()

    # captions: text descriptions of pictures, one per line
    with open(args.caption_file) as f:
        captions = f.read().split('\n')
    captions = [cap for cap in captions if len(cap) > 0]
    print(captions)

    # create skip-thought vectors
    model = skipthoughts.load_model()
    print('Creation of skipthought vectors: loading ....')
    caption_vectors = skipthoughts.encode(model, captions)
    print('Creation of skipthought vectors: DONE!')

    # save the skip-thought vectors
    print('Save skipthought vector: loading ....')
    np.save('skipvectors_2000.npy', caption_vectors)
    print('Save skipthought vector: DONE!')
def prepare_data(query, imdb_key):
    query_representation = skip.encode(skip_model, [query])
    candidate_qa = [QAInfo for QAInfo in qa if QAInfo.imdb_key == imdb_key]
    skip_encode = list()
    for QAInfo in candidate_qa:
        try:
            skip_encode.append(qa_representation[QAInfo.qid])
        except KeyError:
            pass
    # rank candidate QA pairs by inner product with the query encoding
    similarity = [(np.inner(query_representation, rep)[0][0], i)
                  for i, rep in enumerate(skip_encode)]
    similarity.sort(reverse=True)
    most_similar = [candidate_qa[i] for score, i in similarity[:1]]
    retrieved_question = most_similar[0].question
    retrieved_answer = most_similar[0].answers
    retrieved_story = story[imdb_key]

    q_embed = np.array(gensim_w2v.encode_w2v_gensim(retrieved_question))
    a_embed = np.array([gensim_w2v.encode_w2v_gensim(a) for a in retrieved_answer])
    s_embed = np.zeros((1, 60, 300))
    s_embed[:, :len(retrieved_story)] = \
        np.reshape(np.array([gensim_w2v.encode_w2v_gensim(s) for s in retrieved_story]),
                   (1, len(retrieved_story), 300))
    s_embed = np.reshape(s_embed, (1, 60, 300))
    q_embed = np.reshape(q_embed, (1, 1, 300))
    a_embed = np.reshape(a_embed, (1, 5, 300))
    return most_similar[0], s_embed, q_embed, a_embed
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file', type=str, default='Data/text.txt',
                        help='caption file')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data Directory')
    args = parser.parse_args()

    model = skipthoughts.load_model()
    encoded_captions = {}
    file_path = os.path.join(args.caption_file)
    dump_path = os.path.join(args.data_dir, 'enc_text.pkl')
    with open(file_path) as f:
        str_captions = f.read()
        captions = str_captions.split('\n')
        print(captions)
        encoded_captions['features'] = skipthoughts.encode(model, captions)
    pickle.dump(encoded_captions, open(dump_path, "wb"))
    print('Finished extracting Skip-Thought vectors of the given text '
          'descriptions')
def save_caption_vectors(all_captions, target_dir, split, experiment):
    h = h5py.File(os.path.join(target_dir, experiment, '{}_captions.hdf5'.format(split)))
    model = skipthoughts.load_model()
    for class_name, image_captions in all_captions.items():
        print("number of images: ", len(image_captions))
        # round-robin the images into NUM_BATCHES batches
        img_batches = [[] for i in range(NUM_BATCHES)]
        caption_batches = [[] for i in range(NUM_BATCHES)]
        counter = 0
        for img, captions in image_captions.items():
            counter = counter % NUM_BATCHES
            img_batches[counter].append(img)
            caption_batches[counter] += captions
            counter += 1
        print("batched for {}".format(class_name))

        group = h.create_group(class_name)
        for i in range(NUM_BATCHES):
            imgs = img_batches[i]
            captions = caption_batches[i]
            encoded_captions = skipthoughts.encode(model, captions)
            # slice each image's captions back out of the batch encoding
            cstart = 0
            for img in imgs:
                num_caps = len(image_captions[img])
                print(cstart, num_caps, len(encoded_captions))
                group.create_dataset(img, data=encoded_captions[cstart:cstart + num_caps])
                cstart += num_caps
            print("Batch {} of {} Done".format(i + 1, NUM_BATCHES))
    h.close()
def prepare_data(caps, features, worddict, model, d, maxlen=None, n_words=10000):
    """
    Put data into format useable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
        feat_list.append(features[i])
    lengths = [len(s) for s in seqs]
    print 'building seqs'

    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs
        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    print 'encoding skipthoughts'
    feat_list = skipthoughts.encode(model, feat_list, d, use_eos=False, verbose=False)
    print 'finished skipthoughts encoding'
    print 'feature list size %d, %d' % (len(feat_list), len(feat_list[0]))
    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx, :] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths) + 1

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    print 'building mask'
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx] + 1, idx] = 1.

    return x, x_mask, y
def read_snli_from_csv(model):
    train_saved_path = './snli/processed-train.pkl'
    dev_saved_path = './snli/processed-dev.pkl'
    test_saved_path = './snli/processed-test.pkl'
    if os.path.isfile(train_saved_path) and os.path.isfile(test_saved_path):
        X_train, train_labels = joblib.load(train_saved_path)
        X_test, test_labels = joblib.load(test_saved_path)
        return X_train, X_test, train_labels, test_labels
    if model is None:
        raise ValueError("model is None")

    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    # integer division: pandas rejects float slice bounds under Python 3
    train_df = train_df[:(len(train_df) // 3)]
    train_ts, train_hs, train_labels = get_sentence_sample(train_df)
    logger.info('encoding train samples ...')
    logger.info('encoding ts ...')
    vectorized_train_ts = skipthoughts.encode(model, train_ts)
    logger.info('encoding hs ...')
    vectorized_train_hs = skipthoughts.encode(model, train_hs)
    del train_df, train_ts, train_hs
    X_train = np.concatenate((vectorized_train_ts, vectorized_train_hs), axis=1)
    logger.info('dump to file ...')
    joblib.dump((X_train, train_labels), train_saved_path)

    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_ts, test_hs, test_labels = get_sentence_sample(test_df)
    logger.info('encoding test samples ...')
    logger.info('encoding ts ...')
    vectorized_test_ts = skipthoughts.encode(model, test_ts)
    logger.info('encoding hs ...')
    vectorized_test_hs = skipthoughts.encode(model, test_hs)
    del test_df, test_ts, test_hs
    X_test = np.concatenate((vectorized_test_ts, vectorized_test_hs), axis=1)
    logger.info('dump to file ...')
    joblib.dump((X_test, test_labels), test_saved_path)
    logger.info('done')
    return X_train, X_test, train_labels, test_labels
def batcher(params, batch):
    # the original str(' '.join(sent), errors="ignore") raises a TypeError on
    # text input; the intent is presumably a lossy decode of the joined tokens
    batch = [unicode(' '.join(sent), errors='ignore') if sent != [] else '.'
             for sent in batch]
    embeddings = skipthoughts.encode(params.encoder, batch, verbose=False, use_eos=True)
    return embeddings
def encode_sentences(desc, sentence_list, model, imdb_key=None, is_qa=False):
    """Encode a list of sentences given the model.
    """
    if desc == 'skipthought':
        # encode a sentence list directly
        features = skipthoughts.encode(model, sentence_list, verbose=False)

    elif desc == 'vis-text-embed':
        # normalize sentence lists
        norm_sentence_list = [utils.normalize_alphanumeric(sentence.lower())
                              for sentence in sentence_list]
        # allows to encode a sentence list directly
        features = model.encode(norm_sentence_list)

    elif desc.startswith('tfidf'):
        desc_dim = len(model.vocab)
        midx = model.doc_names.index(imdb_key)
        # use scipy sparse matrix when encoding stories, otherwise too huge!
        if is_qa:
            features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        else:
            features = sps.dok_matrix((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use both alphanumeric and stemming normalization
            sentence = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower())).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab:
                    continue
                widx = model.vocab.index(word)
                features[s, widx] = model.tfidf[widx][midx]
            if is_qa:
                # if not sparse, use numpy.linalg.norm
                features[s] /= (np.linalg.norm(features[s]) + 1e-6)
            else:
                # if sparse, use scipy.sparse.linalg.norm
                features[s] /= (sps.linalg.norm(features[s]) + 1e-6)

    elif desc == 'word2vec':
        desc_dim = model.get_vector(model.vocab[-1]).shape[0]
        features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use only alphanumeric normalization, no stemming
            sentence = utils.normalize_alphanumeric(sentence.lower()).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab:
                    continue
                features[s] += model.get_vector(word)
            features[s] /= (np.linalg.norm(features[s]) + 1e-6)

    return features
def compute_answer_vector2(answers, model):
    """Takes a dictionary of answers and returns a dictionary of vectors
    representing each answer"""
    answer_vector = {}
    for answer_option in answers.keys():
        a_list = answers[answer_option].split(".")
        # encode expects a list of sentences; wrap each sentence so it is not
        # iterated character by character
        b = np.array(sum(skipthoughts.encode(model, [sentence]) for sentence in a_list))
        avg_factor = 1.0 / len(answers.keys())
        answer_vector[answer_option] = np.multiply(b, avg_factor)
    return answer_vector
def story(z, image_loc, k=20, bw=5, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None, :])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]

    # pipeline patched here: prepend the caption whose image name matches
    f = open('/Users/shreyajain/Downloads/output.txt').read()
    image_name = image_loc.split('/')[-1]
    text = f.split('\n')
    for t in range(4, len(text)):
        l = text[t].split()
        if l != []:
            name = l[0].split('/')[-1]
            if name == image_name:
                caption = l[1:]
                sentences = caption + sentences
                break

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    shift = svecs.mean(0) - z['bneg'] + z['bpos']

    # Generate story conditioned on shift
    passage = decoder.run_sampler(z['dec'], shift, beam_width=bw)
    if lyric:
        for line in passage.split(','):
            if line[0] != ' ':
                print line
            else:
                print line[1:]
    else:
        return passage
def encoding(input):
    encoded_vector_dir = './encoded_vector'
    try:
        caption_vectors = skipthoughts.encode(model, input)
        print("Sentence encoding successful")
    except Exception:
        # bail out here: caption_vectors is undefined on failure
        print("Failed sentence encoding")
        return
    files_ = 'test_vector.pkl'
    with open(join(encoded_vector_dir, files_), mode='wb') as myfile:
        pickle.dump(caption_vectors, myfile)
def main():
    beta1 = 0.5
    lr = 2e-4
    z_dim = 100
    t_dim = 256
    batch_size = 64
    image_size = 64
    gfc_dim = 1024
    caption_vector_length = 4800
    epochs = 600

    path = sys.argv[1]
    test_data, test_id = load_test_data(path)
    embed_model = skipthoughts.load_model()
    caption_vectors = skipthoughts.encode(embed_model, test_data)
    # np.save("test_embedd.npy", caption_vectors)
    # caption_vectors = np.load("test_embedd.npy")
    # generate 5 samples per caption
    caption_vectors = np.tile(caption_vectors, (5, 1))

    model_options = {
        'z_dim': 100,
        't_dim': 256,
        'batch_size': len(test_data) * 5,
        'image_size': 64,
        'gf_dim': 64,
        'df_dim': 64,
        'gfc_dim': 1024,
        'caption_vector_length': 4800
    }

    gan = model.GAN(model_options)
    input_tensors, variables, loss, outputs, checks = gan.build_model()

    # gpu_options is assumed to be defined at module level
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state("Data/Models/")
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored.")

        z_noise = np.random.uniform(-1, 1, [5 * len(test_data), z_dim])
        gen = sess.run(outputs['generator'],
                       feed_dict={
                           input_tensors['t_real_caption']: caption_vectors,
                           input_tensors['t_z']: z_noise
                       })
        print "Saving Images, Model"
        save_image(gen, test_id)
def main():
    ann_root = '/home/fl302/Projects/VQA-tensorflow/data/annotations'
    questions = _read_json(ann_root, 'MultipleChoice_mscoco_val2014_questions.json',
                           'questions')
    # create model
    model = skipthoughts.load_model()
    # create buffers
    quest_ids, quest_coding = [], []
    quest_buffer = []
    # now, do the job
    for i, info in enumerate(questions):
        print('Skip thought: extracted %d/%d' % (i, len(questions)))
        if i > 100:   # debug cap: stop after ~100 questions
            break
        quest_id = info['question_id']
        quest = info['question'].lower()
        quest_buffer.append(quest)
        quest_ids.append(quest_id)
        if i % 100 == 0 and i > 0:
            quest_vectors = skipthoughts.encode(model, quest_buffer)
            # append to the main buffer
            quest_coding.append(quest_vectors.copy())
            # clear question buffer
            quest_buffer = []
    # process last batch
    if quest_buffer:
        quest_vectors = skipthoughts.encode(model, quest_buffer)
        quest_coding.append(quest_vectors.copy())
    # concatenate
    quest_coding = np.concatenate(quest_coding, axis=0).astype(np.float32)
    quest_ids = np.array(quest_ids, dtype=np.int32)
    # save to file
    save_hdf5('vqa_val_skipthought.h5', {'quest_id': quest_ids,
                                         'quest_coding': quest_coding})
def generate(sentences, stv, bpos, bneg, dec):
    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(stv, sentences, verbose=False)
    console.log("Encoded skipthought vector")

    # Style shifting
    shift = svecs.mean(0) - bneg + bpos
    console.log("Shifted style")

    # TODO: clean up here
    # Generate story conditioned on shift
    passage = decoder.run_sampler(dec, shift, beam_width=500)
    console.log("Sampled passage")
    return passage
def transform_ques_weak(x, sqa, word_to_id):
    z = np.zeros((110, 4800))
    # model = skipthoughts.load_model()
    quest = x
    indices = []
    for i in range(0, sqa):
        # encode the supporting statements and pad into a fixed 110 x 4800 block
        qi2 = skipthoughts.encode(model, x[i][2])
        s = qi2.shape[0]
        z[:s] = qi2
        quest[i][2] = z.tolist()
        # skipthoughts wraps a single-sentence encoding in an extra dimension,
        # hence taking element [0]
        quest[i][3] = skipthoughts.encode(model, x[i][3])
        q3l = quest[i][3].tolist()
        quest[i][3] = q3l[0]
        quest[i][4] = word_to_id[x[i][4][0]]
    return quest
def get_similarity(text1, text2):
    x = [text1, text2]
    vectors = skipthoughts.encode(model, x)
    # need reshaping to prevent warning from scikit-learn
    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)
    result_sim = float(cosine_similarity(a, b))
    print(result_sim, text1, text2)
    return result_sim
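# The same similarity without scikit-learn (a minimal sketch; the encodings are
# not assumed unit-norm here, so normalize explicitly):
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))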
def get_image_tag_pair(tag_dict_in_use, img_path="data/faces/"):
    print("start loading skipthoughts model")
    # get text vector
    model = skipthoughts.load_model()
    list_text = skipthoughts.encode(model, list(tag_dict_in_use.values()))
    # get image
    list_image = []
    for key, item in tag_dict_in_use.items():
        img = skimage.io.imread(os.path.join(img_path, str(key) + ".jpg"))
        img = skimage.transform.resize(img, (64, 64))
        list_image.append(img)
    return list_image, list_text
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_file', type=str, default='Data/sample_captions.txt',
                        help='caption file')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data Directory')
    parser.add_argument('--data_set', type=str, default='flowers',
                        help="Define the name of data sets")
    args = parser.parse_args()

    _n_labels = 4096
    if args.data_set == "ImageNet":
        with open("./Data/sample_caption_ImageNet.txt") as f:
            captions = f.read().split('\n')
        captions = [cap for cap in captions if len(cap) > 0]
        caption_vector_list = []
        for cap in captions:
            _n = cap.split(',')
            # NOTE: both one-hot halves are indexed with _n[0]; _n[1] is never used
            _zeros0 = np.zeros(_n_labels)
            _zeros1 = np.zeros(_n_labels)
            _zeros0[int(_n[0])] = 1
            _zeros1[int(_n[0])] = 1
            _onehot = np.concatenate([_zeros0, _zeros1], axis=0)
            caption_vector_list.append(_onehot)
        print(len(caption_vector_list), len(caption_vector_list[0]))
        h = h5py.File(join(args.data_dir, 'sample_caption_ImageNet.hdf5'))
        h.create_dataset('vectors', data=caption_vector_list)
        h.close()

    if args.data_set == "flowers":
        with open(args.caption_file) as f:
            captions = f.read().split('\n')
        captions = [cap for cap in captions if len(cap) > 0]
        print(captions)
        model = skipthoughts.load_model()
        caption_vectors = skipthoughts.encode(model, captions)
        print(caption_vectors)
        print(caption_vectors.shape, len(caption_vectors[0]))
        if os.path.isfile(join(args.data_dir, 'sample_caption_vectors.hdf5')):
            os.remove(join(args.data_dir, 'sample_caption_vectors.hdf5'))
        h = h5py.File(join(args.data_dir, 'sample_caption_vectors.hdf5'))
        h.create_dataset('vectors', data=caption_vectors)
        h.close()
def evaluate(model, seed=1234, evaltest=False):
    """
    Run experiment
    """
    print 'Preparing data...'
    train, dev, test, scores = load_data()
    train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0],
                                            random_state=seed)

    print 'Computing training skipthoughts...'
    trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True)
    trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True)

    print 'Computing development skipthoughts...'
    devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True)
    devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True)

    print 'Computing feature combinations...'
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    devF = np.c_[np.abs(devA - devB), devA * devB]

    print 'Encoding labels...'
    trainY = encode_labels(scores[0])
    devY = encode_labels(scores[1])

    print 'Compiling model...'
    lrmodel = prepare_model(ninputs=trainF.shape[1])

    print 'Training...'
    bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1])

    if evaltest:
        print 'Computing test skipthoughts...'
        testA = skipthoughts.encode(model, test[0], verbose=False, use_eos=True)
        testB = skipthoughts.encode(model, test[1], verbose=False, use_eos=True)

        print 'Computing feature combinations...'
        testF = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluating...'
        r = np.arange(1, 6)
        yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r)
        pr = pearsonr(yhat, scores[2])[0]
        sr = spearmanr(yhat, scores[2])[0]
        se = mse(yhat, scores[2])
        print 'Test Pearson: ' + str(pr)
        print 'Test Spearman: ' + str(sr)
        print 'Test MSE: ' + str(se)
        return yhat
def save_caption_vectors_faces(data_dir):
    import time

    data_dir = 'Data/'
    img_dir = join(data_dir, 'faces/jpg')
    image_files = [f for f in os.listdir(img_dir) if 'jpg' in f]
    print image_files[1:20]
    print len(image_files)
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'faces/tags/tags_clean.csv')
    with open(caption_dir, 'r') as fin:
        for line in fin:
            l = line.split(',')
            img_id = str(l[0]) + '.jpg'
            tags = l[1].split('\t')
            captions = []
            for t in tags:
                t2 = t.split(':')
                if len(t2) > 1:
                    captions.append([str(t2[0]), int(t2[1])])
            captions.sort(key=lambda tup: tup[1], reverse=True)
            for ind, c in enumerate(captions):
                # ==================================
                # TODO: top 10 tags
                # if ind >= 10:
                #     break
                # image_captions[img_id].append(c[0])
                # ==================================
                # TODO: eyes and hair tags only
                if 'eye' in c[0] or 'hair' in c[0]:
                    image_captions[img_id].append(c[0])
    print len(image_captions)

    model = skipthoughts.load_model()
    encoded_captions = {}
    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print i, len(image_captions), img
        print "Seconds", time.time() - st

    h = h5py.File(join(data_dir, 'faces_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000):
    """
    Put data into format useable by the model
    """
    seqs = []
    feat_list = []
    for i, cc in enumerate(caps):
        seqs.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
        feat_list.append(features[i])
    lengths = [len(s) for s in seqs]

    if maxlen is not None and numpy.max(lengths) >= maxlen:
        new_seqs = []
        new_feat_list = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, feat_list):
            if l < maxlen:
                new_seqs.append(s)
                new_feat_list.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        feat_list = new_feat_list
        seqs = new_seqs
        if len(lengths) < 1:
            return None, None, None

    # Compute skip-thought vectors for this mini-batch
    feat_list = skipthoughts.encode(model, feat_list, use_eos=False, verbose=False)
    y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
    for idx, ff in enumerate(feat_list):
        y[idx, :] = ff

    n_samples = len(seqs)
    maxlen = numpy.max(lengths) + 1

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx] + 1, idx] = 1.

    return x, x_mask, y
def story(z, image_loc, k=100, bw=50, lyric=False):
    """
    Generate a story for an image at location image_loc
    """
    # Load the image
    rawim, im = load_image(image_loc)

    # Run image through convnet
    feats = compute_features(z['net'], im).flatten()
    feats /= norm(feats)

    # Embed image into joint space
    feats = embedding.encode_images(z['vse'], feats[None, :])

    # Compute the nearest neighbours
    scores = numpy.dot(feats, z['cvec'].T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [z['cap'][a] for a in sorted_args[:k]]
    print 'NEAREST-CAPTIONS: '
    for s in sentences[:5]:
        print s
    print ''

    # Compute skip-thought vectors for sentences
    svecs = skipthoughts.encode(z['stv'], sentences, verbose=False)

    # Style shifting
    shift = svecs.mean(0) - z['bneg'] + z['bpos']

    # Generate story conditioned on shift
    passage = decoder.run_sampler(z['dec'], shift, beam_width=bw)
    print 'OUTPUT: '
    if lyric:
        for line in passage.split(','):
            if line[0] != ' ':
                print line
            else:
                print line[1:]
    else:
        print passage
    return passage
def save_caption_vectors_flowers(data_dir):
    import time

    img_dir = join(data_dir, 'flowers/jpg')
    image_files = [f for f in os.listdir(img_dir) if 'jpg' in f]
    print image_files[300:400]
    print len(image_files)
    image_captions = {img_file: [] for img_file in image_files}

    caption_dir = join(data_dir, 'flowers/text_c10')
    class_dirs = []
    for i in range(1, 103):
        class_dir_name = 'class_%.5d' % (i)
        class_dirs.append(join(caption_dir, class_dir_name))

    for class_dir in class_dirs:
        caption_files = [f for f in os.listdir(class_dir) if 'txt' in f]
        for cap_file in caption_files:
            with open(join(class_dir, cap_file)) as f:
                captions = f.read().split('\n')
            img_file = cap_file[0:11] + ".jpg"
            # 5 captions per image
            image_captions[img_file] += [cap for cap in captions if len(cap) > 0][0:5]
    print len(image_captions)

    model = skipthoughts.load_model()
    encoded_captions = {}
    for i, img in enumerate(image_captions):
        st = time.time()
        encoded_captions[img] = skipthoughts.encode(model, image_captions[img])
        print i, len(image_captions), img
        print "Seconds", time.time() - st

    h = h5py.File(join(data_dir, 'flower_tv.hdf5'))
    for key in encoded_captions:
        h.create_dataset(key, data=encoded_captions[key])
    h.close()
def load_data(model, name, loc='./data/', seed=1234):
    """
    Load one of MR, CR, SUBJ or MPQA
    """
    z = {}
    if name == 'MR':
        pos, neg = load_rt(loc=loc)
    elif name == 'SUBJ':
        pos, neg = load_subj(loc=loc)
    elif name == 'CR':
        pos, neg = load_cr(loc=loc)
    elif name == 'MPQA':
        pos, neg = load_mpqa(loc=loc)

    labels = compute_labels(pos, neg)
    text, labels = shuffle_data(pos + neg, labels, seed=seed)
    z['text'] = text
    z['labels'] = labels
    print 'Computing skip-thought vectors...'
    features = skipthoughts.encode(model, text, verbose=False)
    return z, features
def save_caption_vectors_ms_coco(data_dir, split, batch_size):
    meta_data = {}
    ic_file = join(data_dir, 'annotations/captions_{}2014.json'.format(split))
    with open(ic_file) as f:
        ic_data = json.loads(f.read())

    meta_data['data_length'] = len(ic_data['annotations'])
    with open(join(data_dir, 'meta_{}.pkl'.format(split)), 'wb') as f:
        pickle.dump(meta_data, f)

    model = skipthoughts.load_model()
    batch_no = 0
    print "Total Batches", len(ic_data['annotations']) / batch_size

    while batch_no * batch_size < len(ic_data['annotations']):
        captions = []
        image_ids = []
        idx = batch_no
        for i in range(batch_no * batch_size, (batch_no + 1) * batch_size):
            idx = i % len(ic_data['annotations'])
            captions.append(ic_data['annotations'][idx]['caption'])
            image_ids.append(ic_data['annotations'][idx]['image_id'])
        print captions
        print image_ids

        # Thought Vectors
        tv_batch = skipthoughts.encode(model, captions)
        h5f_tv_batch = h5py.File(
            join(data_dir, 'tvs/' + split + '_tvs_' + str(batch_no)), 'w')
        h5f_tv_batch.create_dataset('tv', data=tv_batch)
        h5f_tv_batch.close()

        h5f_tv_batch_image_ids = h5py.File(
            join(data_dir, 'tvs/' + split + '_tv_image_id_' + str(batch_no)), 'w')
        h5f_tv_batch_image_ids.create_dataset('tv', data=image_ids)
        h5f_tv_batch_image_ids.close()

        print "Batches Done", batch_no, len(ic_data['annotations']) / batch_size
        batch_no += 1
def embedding_thread(x, y, output):
    imdb_key_check = {}
    last_stories = []
    for i in tqdm(xrange(x, y)):
        error = False
        qa_info = self.qa[i]
        question = str(qa_info.question)
        answers = qa_info.answers
        correct_index = qa_info.correct_index
        imdb_key = str(qa_info.imdb_key)
        validation_flag = str(qa_info.qid)

        # skip QA entries that contain an empty answer
        for answer in answers:
            if len(answer) == 0:
                error = True
        if error:
            continue

        question_embedding = skipthoughts.encode(model, [question])
        words_in_question = word_tokenize(question)
        assert question_embedding.shape == (1, 4800)
        local_answers = skipthoughts.encode(model, answers)

        # encode each movie's story only once; consecutive questions on the
        # same imdb_key reuse the previous encoding
        stories = self.story[imdb_key]
        local_stories = []
        if imdb_key in imdb_key_check:
            local_stories = last_stories
        else:
            imdb_key_check[imdb_key] = 1
            local_stories = skipthoughts.encode(model, stories)
        last_stories = local_stories

        skip_dim = 4800
        self.zq.append(question_embedding)
        self.zaj.append(np.array(local_answers).reshape(5, 4800))
        zsl_row = np.array(local_stories).shape[0]
        print "zsl shape >> ",
        print np.array(local_stories).shape
        self.zsl.append(np.array(local_stories).reshape(zsl_row, 4800))
        self.qid.append(qa_info.qid)

        print "==========================="
        print "each QAInfo status >> "
        print "question embedding shape >> ",
        print np.array(self.zq).shape
        print np.array(self.zq_val).shape
        print "answer embedding shape >> ",
        print np.array(self.zaj).shape
        print np.array(self.zaj_val).shape
        print "stories embedding shape >> ",
        try:
            print np.array(self.zsl).shape
            print np.array(self.zsl_val).shape
        except:
            print "warning : dimension error."
        print "ground truth shape >> ",
        print np.array(self.ground_truth).shape
        print np.array(self.ground_truth_val).shape
        print "=========================="

    output.put(self.zq)
    output.put(self.zaj)
    output.put(self.zsl)
    output.put(self.qid)
def compute_question_vector(question, model):
    """Takes a question and computes a thought vector representing that question"""
    # wrap the question string in a list: encode expects a list of sentences
    q_vec = skipthoughts.encode(model, [question])
    return q_vec
vectors_of_papers_for_diff_authors = []
for i in abstract.keys():
    t = time()
    outfile.write("<author>" + "\n")
    outfile.write(i + "\n")
    outfile.write("</author>" + "\n")
    outfile.write("<paper_id>" + "\n")
    for j in identifier[i]:
        outfile.write(j + "\n")
    outfile.write("</paper_id>" + "\n")
    outfile.write("<paper_vector>" + "\n")
    vecs = skipthoughts.encode(model, abstract[i])
    vectors_of_papers_for_diff_authors.append(vecs)
    for vec in vecs:
        outfile.write(" ".join(str(vec)[1:-1]) + '\n')
    outfile.write("</paper_vector>" + "\n\n")
    # print "Time taken for author: ", i, time() - t

dict_temp = {'author_names': authors,
             'paper_vectors': vectors_of_papers_for_diff_authors}
io.savemat('vectors_three_authors.mat', dict_temp)
checkfile.close()
outfile.close()
test_sentences = preprocess.get_chain_sentences(test_data)


def get_chunk(iterable, size):
    return [iterable[x:x + size] for x in xrange(0, len(iterable), size)]


def save_vectors(dataset, chain_vectors, save_file_name, train=True):
    vectors_dict = {}
    size = 5 if train else 6
    for key, vectors in zip(dataset.keys(), get_chunk(chain_vectors, size)):
        vectors_dict[key] = vectors
    print("save", save_file_name)
    shape = vectors.shape
    shape = (1, shape[0], shape[1])
    vectors_matrix = np.concatenate(
        [v.reshape(shape) for k, v in sorted(vectors_dict.items(), key=lambda x: x[0])],
        axis=0)
    np.save(save_file_name, vectors_matrix)


print("encode valid vectors")
valid_vectors = skipthoughts.encode(model, valid_sentences)
save_vectors(valid_data, valid_vectors, "valid_vectors", train=False)
print("encode test vectors")
test_vectors = skipthoughts.encode(model, test_sentences)
save_vectors(test_data, test_vectors, "test_vectors", train=False)
print("encode train vectors")
train_vectors = skipthoughts.encode(model, train_sentences)
save_vectors(train_data, train_vectors, "train_vectors", train=True)
        istitle = 1
    elif line == '<author>':
        isauth = 1
    elif line == '<abstract>':
        isabs = 1
    elif isend == 1:
        author2 = author1[0]
        author2 += '$'
        author2 = re.sub('[\s]', '_', author2)
        identifier = author2 + re.sub('[\s]', '_', title1)
        identifier = re.sub(r'\.', '', identifier)
        abstract1 = title1 + '$' + abstract1
        abstract1 = re.sub(r'\"', '\'', abstract1)
        abstract1 = abstract1.decode('utf8')
        a = [abstract1]
        vector = skipthoughts.encode(model, a)
        for i in author1:
            if i in unique_authors_set:
                if identifier not in author_papers[i]:
                    author_papers[i].append(identifier)
                    paper_vectors[i].append(vector)
        author2, title1, abstract1 = '', '', ''
        author1 = []
        isend = 0

    if isend == 1:
        author2 = author1[0]
        author2 += '$'
        author2 = re.sub('[\s]', '_', author2)
        identifier = author2 + re.sub('[\s]', '_', title1)
        identifier = re.sub(r'\.', '', identifier)
import skipthoughts

model = skipthoughts.load_model()
vectors = skipthoughts.encode(
    model,
    ['This morning, I decided to take a walk. I wonder how these things come into play'])
print vectors
import skipthoughts
import play
import os

path_to_file = "/Users/wheatwaves/deeplearning/skip-thoughts/data/data/sentencesOfPureText/"
path_to_save_file = "/Users/wheatwaves/deeplearning/skip-thoughts/data/data/sentencesOfPureText/"

g = os.walk(path_to_file)
names = []
for root, dirs, files in g:
    names = files

model = skipthoughts.load_model()
for name in names[1:]:
    try:
        f = open(path_to_file + name)
        s = [line.strip().decode('utf-8') for line in f.readlines()]
        f.close()
        M = skipthoughts.encode(model, s)
        scores = [0] * len(s)
    except:
        f = open(path_to_file + name)
        s = [line.strip() for line in f.readlines()]
        f.close()
        M = skipthoughts.encode(model, s)
        scores = [0] * len(s)
    # score each sentence by its total cosine similarity to all other sentences
    for i in xrange(len(M)):
        for j in xrange(len(M)):
            if i != j:
                scores[i] += play.cos_similarity(M[i], M[j])
    f = open(path_to_save_file + name + '.scores', 'w')
    for num in scores:
        f.write(str(num) + '\n')
    f.close()
# -*- coding: utf-8 -*-
# Created by junfeng on 3/28/16.

# logging config
import logging

logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)

import skipthoughts

model = skipthoughts.load_model()
X = [
    'Hello, skip thoughts',
]
vectors = skipthoughts.encode(model, X)
print(vectors)
        value = pair.get('value')
        if value == 'TRUE':
            labels[i] = 1
        t = pair.find('t')
        h = pair.find('h')
        t = t.string.strip()
        h = h.string.strip()
        ts.append(t)
        hs.append(h)
        samples.append(u'{0} {1}'.format(t, h))
        if i % 1000 == 0:
            logger.info('processed sample {0}'.format(i))
    logger.info('unique ts: {0}, unique hs: {1}'.format(len(set(ts)), len(set(hs))))
    logger.info('unique sample: {0}'.format(len(set(samples))))
    logger.info('TRUE labels: {0}'.format(np.sum(labels)))
    return ts, hs, labels


if __name__ == '__main__':
    logger.info('read rte dataset xml file ...')
    ts, hs, labels = read_rte_xml()
    logger.info('read model ...')
    model = read_model()
    logger.info('encoding ts ...')
    vectorized_ts = skipthoughts.encode(model, ts)
    logger.info('encoding hs ...')
    vectorized_hs = skipthoughts.encode(model, hs)
    logger.info('dump to file ...')
    joblib.dump((vectorized_ts, vectorized_hs, labels),
                './data/processed-rte-dataset.pkl')
    logger.info('done')
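# Loading the dump back (an assumed usage example, not part of the original
# script):
import joblib

vectorized_ts, vectorized_hs, labels = joblib.load('./data/processed-rte-dataset.pkl')
print(vectorized_ts.shape, vectorized_hs.shape)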
            (row * dimension):((row + 1) * dimension),
            (column * dimension):((column + 1) * dimension),
            :
        ] = c[:][:][:]

# Note that this architecture is hardcoded for now
# After generating blurry images, sharpen them
K = 32
factors = [1, 1, 1]
kernel_sizes = [7, 7, 5]
num_filts = [128, 128, 3]
pathToGANWeights = args.gan_path

sys.path.append(args.skipthought_path)
import skipthoughts

model = skipthoughts.load_model()
y_skipthought = skipthoughts.encode(model, [sentence])
y_skipthought = np.float32(np.repeat(y_skipthought, 100, axis=0))

batch_size = generated_imgs.shape[0]
print generated_imgs.shape, y_skipthought.shape
generate_edges_func = gan(K, batch_size, factors, kernel_sizes, num_filts,
                          pathToGANWeights)
edges = generate_edges_func(generated_imgs, y_skipthought)
generated_imgs = generated_imgs.reshape([generated_imgs.shape[0], 3, K, K])
sharp_imgs = generated_imgs + edges
sharp_imgs[sharp_imgs > 1] = 1
sharp_imgs[sharp_imgs < 0] = 0
total_image_sharp = np.zeros(
    (dimension * int(math.sqrt(num_samples)),
     dimension * int(math.sqrt(num_samples)),
     3)
lyrics_table = lyrics_table[['lyrics']].applymap(lyrics_clean)
for i in lyrics_table.index:
    lyrics_table.loc[i]['lyrics'].index = [i] * len(lyrics_table.loc[i]['lyrics'])
lyrics_table.iloc[1]['lyrics'][['sentence']].values.transpose().tolist()[0]
all_lyrics_table = pd.concat(lyrics_table['lyrics'].values)

# Calculate sentence similarity
from analysis_package import *
import skipthoughts

model = skipthoughts.load_model()
all_lyrics = all_lyrics_table[['sentence']].values.transpose().tolist()[0]
len(all_lyrics)
vectors = skipthoughts.encode(model, all_lyrics[:10000], use_eos=True)
lyrics_embedding_table = pd.DataFrame(vectors, index=all_lyrics[:10000])
neighbor_table = k_nearest_neighbor(lyrics_embedding_table, 5)


def range_filter(df, keys, lower, upper):
    key = keys[0]
    true_false_table = np.array(df[[key]] < upper) & np.array(df[[key]] > lower)
    for key in keys:
        true_false_table = true_false_table & (np.array(df[[key]] < upper) &
                                               np.array(df[[key]] > lower))
    return df[true_false_table]


range_filter(neighbor_table, [3, 5, 7, 9], 0., 0.5)
range_filter(neighbor_table, [3, 5, 7], 0., 0.5)
range_filter(neighbor_table, [3, 5], 0., 0.5)
def embedding_thread(x, y, output):
    imdb_key_check = {}
    last_stories = []
    for i in tqdm(xrange(x, y)):
        error = False
        qa_info = self.qa[i]
        question = str(qa_info.question)
        answers = qa_info.answers
        correct_index = qa_info.correct_index
        imdb_key = str(qa_info.imdb_key)
        validation_flag = str(qa_info.qid)

        # skip QA entries that contain an empty answer
        for answer in answers:
            if len(answer) == 0:
                error = True
        if error:
            continue

        question_embedding = skipthoughts.encode(model, [question])
        words_in_question = word_tokenize(question)
        assert question_embedding.shape == (1, 4800)
        local_answers = skipthoughts.encode(model, answers)

        # encode each movie's story only once; consecutive questions on the
        # same imdb_key reuse the previous encoding
        stories = self.story[imdb_key]
        local_stories = []
        if imdb_key in imdb_key_check:
            local_stories = last_stories
        else:
            imdb_key_check[imdb_key] = 1
            local_stories = skipthoughts.encode(model, stories)
        '''
        for sentence in stories:
            # local_stories.append(skipthoughts.encode(model, [sentence]))
            paragraph_tokenize = sent_tokenize(paragraph)
            for sentences in paragraph_tokenize:
                words_detected = 0
                for w in words_in_question:
                    if sentences.find(w) != -1:
                        words_detected += 1
                if words_detected >= 1:
                    local_stories.append(skipthoughts.encode(model, [sentences]))
        # skip embedding : story
        '''
        print local_stories.shape
        last_stories = local_stories

        skip_dim = 4800
        if validation_flag.find('train') != -1:
            self.zq.append(question_embedding)
            self.zaj.append(np.array(local_answers).reshape(5, 4800))
            self.ground_truth.append(correct_index)
            zsl_row = np.array(local_stories).shape[0]
            print "zsl shape >> ",
            print np.array(local_stories).shape
            self.zsl.append(np.array(local_stories).reshape(zsl_row, 4800))
        elif validation_flag.find('val') != -1:
            self.zq_val.append(question_embedding)
            self.zaj_val.append(np.array(local_answers).reshape(5, 4800))
            self.ground_truth_val.append(correct_index)
            zsl_row = np.array(local_stories).shape[0]
            self.zsl_val.append(np.array(local_stories).reshape(zsl_row, 4800))

        print "==========================="
        print "each QAInfo status >> "
        print "question embedding shape >> ",
        print np.array(self.zq).shape
        print np.array(self.zq_val).shape
        print "answer embedding shape >> ",
        print np.array(self.zaj).shape
        print np.array(self.zaj_val).shape
        print "stories embedding shape >> ",
        try:
            print np.array(self.zsl).shape
            print np.array(self.zsl_val).shape
        except:
            print "warning : dimension error."
        print "ground truth shape >> ",
        print np.array(self.ground_truth).shape
        print np.array(self.ground_truth_val).shape
        print "=========================="

    output.put(self.zq)
    output.put(self.zq_val)
    output.put(self.zaj)
    output.put(self.zaj_val)
    output.put(self.zsl)
    output.put(self.zsl_val)
    output.put(self.ground_truth)
    output.put(self.ground_truth_val)
def transform_ques_weak(x, sqa, word_to_id):
    # d1, d2 and e are hard-coded index sequences left over from debugging;
    # they are not used in the live code path below
    d1 = [[14, 11, 21, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 11, 21, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 2, 3, 4, 12, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 28, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [31, 2, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [14, 39, 4, 9, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 2, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [14, 13, 4, 9, 3, 20, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 13, 4, 9, 3, 14, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [14, 35, 4, 9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    d2 = [[14, 11, 21, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 11, 21, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 2, 3, 4, 12, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 28, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [31, 2, 3, 4, 26, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [14, 39, 4, 9, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [20, 2, 3, 4, 27, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    e = [14, 36, 4, 9, 3, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    z = np.zeros((110, 4800))
    # model = skipthoughts.load_model()
    quest = x
    indices = []
    for i in range(0, sqa):
        # encode the supporting statements and pad into a fixed 110 x 4800 block
        qi2 = skipthoughts.encode(model, x[i][2])
        s = qi2.shape[0]
        z[:s] = qi2
        quest[i][2] = z.tolist()
        # skipthoughts wraps a single-sentence encoding in an extra dimension,
        # hence taking element [0]
        quest[i][3] = skipthoughts.encode(model, x[i][3])
        q3l = quest[i][3].tolist()
        quest[i][3] = q3l[0]
        quest[i][4] = word_to_id[x[i][4][0]]
    return quest
def compute_question_vector2(question, model):
    """Takes a question and computes a thought vector representing that question"""
    q_list = question.split(".")
    # encode expects a list of sentences; wrap each one so it is not iterated
    # character by character
    q_vec = sum(skipthoughts.encode(model, [sentence]) for sentence in q_list)
    return q_vec
def embedding_thread(a, b):
    imdb_key_check = {}
    last_stories = []
    for i in tqdm(xrange(a, b)):
        error = False
        # if i == 100: break
        qa_info = self.qa[i]
        question = str(qa_info.question)
        answers = qa_info.answers
        correct_index = qa_info.correct_index
        imdb_key = str(qa_info.imdb_key)
        validation_flag = str(qa_info.qid)

        question_embedding = skipthoughts.encode(model, [question])
        assert question_embedding.shape == (1, 4800)

        # skip QA entries that contain an empty answer
        for answer in answers:
            if len(answer) == 0:
                error = True
        if error:
            continue

        local_answers = [skipthoughts.encode(model, [str(answer)]) for answer in answers]
        gt = [0.0] * 5
        gt[correct_index] = 1.0

        # encode each movie's story only once; consecutive questions on the
        # same imdb_key reuse the previous encoding
        stories = self.story[imdb_key]
        local_stories = []
        if imdb_key in imdb_key_check:
            local_stories = last_stories
        else:
            imdb_key_check[imdb_key] = 1
            local_stories = [skipthoughts.encode(model, [str(s)]) for s in stories]
        last_stories = local_stories

        skip_dim = 4800
        if validation_flag.find('train') != -1:
            self.zq.append(question_embedding.reshape((skip_dim)))
            self.zaj.append(np.transpose(np.array(local_answers).reshape(5, skip_dim)))
            self.ground_truth.append(np.array(gt))
            zsl_row = np.array(local_stories).shape[0]
            print "zsl shape >> ",
            print np.array(local_stories).shape
            self.zsl.append(np.transpose(np.array(local_stories).reshape(zsl_row, skip_dim)))
        elif validation_flag.find('val') != -1:
            self.zq_val.append(question_embedding.reshape((skip_dim)))
            self.zaj_val.append(np.transpose(np.array(local_answers).reshape(5, skip_dim)))
            self.ground_truth_val.append(np.array(gt))
            zsl_row = np.array(local_stories).shape[0]
            self.zsl_val.append(np.transpose(np.array(local_stories).reshape(zsl_row, skip_dim)))

        print "==========================="
        print "each QAInfo status >> "
        print "question embedding shape >> ",
        print np.array(self.zq_val).shape
        print "answer embedding shape >> ",
        print np.array(self.zaj_val).shape
        print "stories embedding shape >> ",
        try:
            print np.array(self.zsl_val).shape
        except:
            print "warning : dimension error."
        print "ground truth shape >> ",
        print np.array(self.ground_truth_val).shape
        print "=========================="