def prepare_train_data(config):
    """Prepare the data for training the model.

    Loads the saved vocabulary, reads the cached caption annotations (CSV)
    and the cached encoded captions (NumPy file), and wraps them in a
    ``DataSet``.

    Args:
        config: Configuration object. This function reads
            ``vocabulary_size``, ``vocabulary_file``,
            ``temp_annotation_file``, ``temp_data_file`` and ``batch_size``.

    Returns:
        The ``DataSet`` built from the cached annotations and encodings.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values

    # `.item()` unwraps a 0-d object array, i.e. a pickled Python object.
    # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # produced by this pipeline.
    data = np.load(config.temp_data_file, allow_pickle=True).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the VQA data for training the model.

    Reads the questions and answers, builds (or loads) the vocabulary,
    encodes the questions and answers (or loads the cached encoding) and
    wraps everything in a ``DataSet``.

    Args:
        config: Configuration object. This function reads the answer and
            question files, ``max_question_length``, ``train_image_dir``,
            ``temp_train_annotation_file``, ``vocabulary_file``,
            ``temp_train_data_file`` and ``batch_size``; it also sets
            ``config.vocabulary_size`` to the size of the vocabulary.

    Returns:
        A ``(dataset, config)`` tuple.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa, 'COCO_train2014',
                              config.train_image_dir,
                              config.temp_train_annotation_file)
    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs,
        }
        np.save(config.temp_train_data_file, data)
    else:
        # The cache is a dict written by np.save above, i.e. a pickled
        # object array. NumPy >= 1.16.3 needs allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_train_data_file,
                       allow_pickle=True).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens,
                      question_ids, config.batch_size, answer_idxs,
                      True, True)
    print("Dataset built.")
    return dataset, config
def build_vocabulary(config, captions, oracle_file):
    """Build the vocabulary from ``captions`` and encode the captions.

    The vocabulary is always rebuilt and saved to ``config.vocabulary_file``.
    The encoded captions are cached in ``config.temp_data_file`` and reused
    when that file already exists.

    Args:
        config: Configuration object. This function reads
            ``vocabulary_size``, ``ctrl_symbols``, ``vocabulary_file``,
            ``temp_data_file``, ``max_caption_length`` and the control
            indices ``_START_``, ``_END_`` and ``_PAD_``.
        captions: Sequence of captions to build the vocabulary from.
        oracle_file: Path that receives the encoded captions as text, one
            caption per line, or ``None`` to skip writing it.

    Returns:
        The built ``Vocabulary``.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    # NOTE: the vocabulary is rebuilt on every call; the original had the
    # `os.path.exists(config.vocabulary_file)` check disabled. Re-enable a
    # `vocabulary.load(config.vocabulary_file)` branch here to cache it.
    vocabulary.build(captions)
    vocabulary.save(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)
            # Negative for over-long captions, which yields no padding.
            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = (
                [config._START_]
                + current_word_idxs[:current_num_words]
                + [config._END_]
                + [config._PAD_] * pad_length
            )
            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 needs
        # allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # Every index is followed by a space, every caption by a
            # newline; join() avoids quadratic string concatenation.
            outfile.writelines(
                ''.join(str(word) + ' ' for word in line) + '\n'
                for line in word_idxs)
    return vocabulary
def main():
    """Convert the tokenized files named on stdin into ``.npy`` arrays.

    Each stdin line is a file name. The non-empty lines of that file are
    converted with ``line2arr_with_soseos`` or ``line2arr_no_soseos``
    (chosen by ``args.soseos``) and saved under ``args.output_dir`` with
    the ``.tok`` part of the base name replaced by ``.npy``.
    """
    args = get_args()
    vocab = Vocabulary.load(args.vocab_prefix.strip())
    output_dir = path.realpath(args.output_dir.strip())
    if args.soseos:
        line2arr = partial(line2arr_with_soseos, vocab)
        print("sos-eos!!!")
    else:
        line2arr = partial(line2arr_no_soseos, vocab)

    for counter, line in enumerate(sys.stdin):
        # Progress report every 100 input lines.
        if counter % 100 == 0:
            print(counter)
            sys.stdout.flush()
        fname = line.strip()
        if not fname:
            # A blank stdin line names no file; skip it instead of
            # crashing in open('').
            continue
        with open(fname, 'r') as in_f:
            stripped = (raw.strip() for raw in in_f)
            file_arr = [line2arr(text) for text in stripped if text != ""]
        np_arr = np.array(file_arr)
        try:
            new_fname = path.join(
                output_dir, path.split(fname)[1].replace(".tok", ".npy"))
            np.save(new_fname, np_arr)
        except Exception:
            # Best effort: report the failing file and carry on, while
            # still letting KeyboardInterrupt/SystemExit propagate.
            print("errored out on: {0}".format(fname))
def get_huffman_tree(params):
    """Load or build the Huffman tree for the hierarchical softmax.

    Args:
        params: Mapping of settings. If it contains ``"huff_tree_loc"`` the
            tree is unpickled from that path. Otherwise ``"n_vocab"`` is
            read, the tree is built from vocabulary counts, and
            ``params["vocab_counts"]`` is set to the count list used.

    Returns:
        The Huffman tree.
    """
    if "huff_tree_loc" in params:
        # NOTE: pickle.load can execute arbitrary code; only point
        # "huff_tree_loc" at files this project produced.
        with open(params["huff_tree_loc"], 'rb') as f:
            huff_tree = pickle.load(f)
    else:
        vocab_size = params["n_vocab"]
        # Estimated counts for the two sentence-boundary symbols.
        soseos_counts_estim = [40114695] * 2
        vocab = Vocabulary.load(
            "/hdd/data/nlp/raw/unzipped/ff15_book/vocabs/final_vocab")
        # Only the counts matter here, sorted from most to least frequent.
        sorted_counts = sorted(
            (item[1] for item in vocab.items()), reverse=True)
        cutoff_counts = sorted_counts[0:vocab_size]
        # Everything past the cutoff is lumped into one OOV entry.
        oov_counts = [sum(sorted_counts[vocab_size:])]
        all_counts = soseos_counts_estim + cutoff_counts + oov_counts
        params["vocab_counts"] = all_counts
        as_hash = dict(enumerate(all_counts))
        huff_tree = (
            chainer.links.BinaryHierarchicalSoftmax.create_huffman_tree(
                as_hash))
    print("loaded huffman tree")
    return huff_tree
def main2():
    """Merge the eight partial vocabularies and save the result.

    Loads ``./vocabs/v0`` through ``./vocabs/v7``, merges them with
    ``Vocabulary.merge_vocabularies``, saves the merged vocabulary to
    ``./vocabs/final_vocab`` and then drops into the debugger.
    """
    shard_vocabs = []
    for shard in range(8):
        shard_vocabs.append(Vocabulary.load("./vocabs/v{0}".format(shard)))
    print("loaded vocabs!")

    merged = Vocabulary.merge_vocabularies(shard_vocabs)
    merged.save("./vocabs/final_vocab")

    # Interactive inspection point.
    import pdb
    pdb.set_trace()
def main3():
    """Pickle the offsets of the merged vocabulary.

    Loads ``./vocabs/final_vocab``, writes the result of ``get_offsets()``
    to ``./vocabs/offsets.pkl`` and then drops into the debugger.
    """
    vocab = Vocabulary.load("./vocabs/final_vocab")
    # Computed before the output file is opened.
    vocab_offsets = vocab.get_offsets()
    with open("./vocabs/offsets.pkl", 'wb') as sink:
        pickle.dump(vocab_offsets, sink)

    # Interactive inspection point.
    import pdb
    pdb.set_trace()
    print(32)
def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions,
                   threshold=1):
    """Return the vocabulary stored at ``vocab_path``, creating it if absent.

    When ``vocab_path`` does not exist, the vocabulary is built with
    ``cls.build_vocab(captions_path, tokenized_captions, threshold)`` and
    saved to ``vocab_path`` before being returned.
    """
    if not os.path.exists(vocab_path):
        built = cls.build_vocab(captions_path, tokenized_captions, threshold)
        # TODO: check if saving is safe
        Vocabulary.save(built, vocab_path)
        print("Saved the vocabulary to '%s'" % vocab_path)
        return built
    return Vocabulary.load(vocab_path)
def prepare_train_data(config):
    """Prepare the data for training the model.

    Creates the annotation directory if needed, builds or loads the
    vocabulary, and builds or loads the caption annotations (caption,
    image id, image file) for the combined train/val caption files.
    """
    annotation_dir = config.prepare_annotation_dir
    if not os.path.exists(annotation_dir):
        os.mkdir(annotation_dir)

    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if os.path.exists(config.vocabulary_file):
        vocabulary.load(config.vocabulary_file)
    else:
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if os.path.exists(config.train_csv_file):
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    else:
        coco.filter_by_words(set(vocabulary.words))
        captions = []
        image_ids = []
        for ann_id in coco.anns:
            annotation = coco.anns[ann_id]
            captions.append(annotation['caption'])
            image_ids.append(annotation['image_id'])

        image_files = []
        for image_id in image_ids:
            file_name = coco.imgs[image_id]['file_name']
            # Files whose name mentions 'train2014' live under 'train',
            # all others under 'val'.
            if file_name.find('train2014') >= 0:
                split_dir = 'train'
            else:
                split_dir = 'val'
            image_files.append(
                os.path.join(config.dataset_image_dir, split_dir, file_name))

        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    # NOTE(review): the visible definition ends here without building or
    # returning a dataset; it looks truncated - confirm against the full
    # source.
def test(self):
    """Generate hypotheses with the saved model and write them to a file.

    Loads the target vocabulary, model spec and weights from ``model/``,
    runs the image and word forward passes, prints every hypothesis and
    writes it (space-joined, one per line) to ``self.target``.
    """
    trace('loading model ...')
    self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
    # The batch used below is the attribute `self.trg_batch`; the bare name
    # `trg_batch` read here originally is not defined in this method.
    # NOTE(review): assumes self.trg_batch is set before test() runs.
    self.batch_size = len(self.trg_batch)
    encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
    serializers.load_hdf5("model/" + self.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0
    with open(self.target, 'w') as fp:
        self.__forward_img()
        trace('sample %8d ...' % (generated + 1))
        hyp_batch = self.__forward_word(self.trg_batch, encdec, False,
                                        self.generation_limit)
        for hyp in hyp_batch:
            # Append a sentinel so index() always succeeds, then cut the
            # hypothesis at the first end-of-sentence marker.
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' +''.join(hyp))
            print(' '.join(hyp), file=fp)
    trace('finished.')
def test(self):
    """Generate hypotheses with the saved model and write them to a file.

    Loads the target vocabulary, model spec and weights, runs
    ``self.forward``, prints every hypothesis and writes it (space-joined,
    one per line) to ``self.target``.
    """
    trace('loading model ...')
    trg_vocab = Vocabulary.load(self.model + '.trgvocab')
    self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
    serializers.load_hdf5(self.model + '.weights', self.encdec)

    trace('generating translation ...')
    generated = 0
    trace('sample %8d - %8d ...' % (generated + 1, generated))
    hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

    with open(self.target, 'w') as fp:
        for hyp in hyp_batch:
            # Append a sentinel so index() always succeeds, then cut the
            # hypothesis at the first end-of-sentence marker.
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' + ''.join(hyp))
            # One hypothesis per line; without the newline consecutive
            # hypotheses would run together in the output file.
            fp.write(' '.join(hyp) + '\n')
    trace('finished.')
def test(self):
    """Generate hypotheses with the saved model and write them to a file.

    Loads the target vocabulary, model spec and weights, runs
    ``self.forward``, prints every hypothesis and writes it (space-joined,
    one per line) to ``self.target``.
    """
    trace('loading model ...')
    trg_vocab = Vocabulary.load(self.model + '.trgvocab')
    self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
    serializers.load_hdf5(self.model + '.weights', self.encdec)

    trace('generating translation ...')
    generated = 0
    trace('sample %8d - %8d ...' % (generated + 1, generated))
    hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

    with open(self.target, 'w') as fp:
        for hyp in hyp_batch:
            # Append a sentinel so index() always succeeds, then cut the
            # hypothesis at the first end-of-sentence marker.
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' + ''.join(hyp))
            # One hypothesis per line; without the newline consecutive
            # hypotheses would run together in the output file.
            fp.write(' '.join(hyp) + '\n')
    trace('finished.')
def test(self):
    """Generate hypotheses with the saved model and write them to a file.

    Loads the target vocabulary, model spec and weights from ``model/``,
    runs the image and word forward passes, prints every hypothesis and
    writes it (space-joined, one per line) to ``self.target``.
    """
    trace('loading model ...')
    self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
    # The batch used below is the attribute `self.trg_batch`; the bare name
    # `trg_batch` read here originally is not defined in this method.
    # NOTE(review): assumes self.trg_batch is set before test() runs.
    self.batch_size = len(self.trg_batch)
    encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
    serializers.load_hdf5("model/" + self.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0
    with open(self.target, 'w') as fp:
        self.__forward_img()
        trace('sample %8d ...' % (generated + 1))
        hyp_batch = self.__forward_word(self.trg_batch, encdec, False,
                                        self.generation_limit)
        for hyp in hyp_batch:
            # Append a sentinel so index() always succeeds, then cut the
            # hypothesis at the first end-of-sentence marker.
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' + ''.join(hyp))
            print(' '.join(hyp), file=fp)
    trace('finished.')
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the caption annotations and the encoded
    captions (each cached on disk), optionally restricts the data to the
    first ``config.train_data_count_limit`` entries, and wraps the result
    in a ``DataSet``.

    Args:
        config: Configuration object. This function reads the caption and
            image locations, the cache file paths, ``vocabulary_size``,
            ``max_caption_length``, ``train_data_count_limit`` and
            ``batch_size``.

    Returns:
        The ``DataSet`` for training.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 needs
        # allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if (config.train_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]

        # Dump the image paths to a file. Iterate the already-sliced
        # sequence so a limit larger than the data does not raise
        # IndexError.
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def main(argv):
    """Caption the test images, save the results and evaluate them.

    Restores the model from ``config.caption_checkpoint_dir``, runs beam
    search on every test batch, writes the captions as JSON to
    ``config.test_result_file`` (and optionally one annotated image per
    test image) and scores them with ``COCOEvalCap``.

    Args:
        argv: Command-line arguments; not used by this function.
    """
    print("Testing the model ...")
    config = Config()
    config.beam_size = FLAGS.beam_size
    config.phase = 'test'

    if not os.path.exists(config.test_result_dir):
        os.mkdir(config.test_result_dir)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    test_data = DataProvider(config)
    test_gt_coco = test_data.returncoco()
    model = ShowAttendTell(config)
    model.build()

    with tf.Session() as sess:
        model.setup_graph_from_checkpoint(sess,
                                          config.caption_checkpoint_dir)
        tf.get_default_graph().finalize()
        captiongen = CaptionGenerator(model, vocabulary, config.beam_size,
                                      config.max_caption_length,
                                      config.batch_size)

        # Generate the captions for the images
        results = []
        idx = 0
        for k in tqdm(range(test_data.num_batches), desc='batch'):
            batch, images = test_data.next_batch_and_images()
            caption_data = captiongen.beam_search(sess, images, vocabulary)

            # Only the last batch skips `fake_count` trailing entries.
            fake_cnt = 0 if k < test_data.num_batches - 1 \
                else test_data.fake_count
            for item in range(test_data.batch_size - fake_cnt):
                # Entry 0 of each beam result is the one that is kept.
                word_idxs = caption_data[item][0].sentence
                caption = vocabulary.get_sentence(word_idxs)
                results.append({
                    'image_id': test_data.image_ids[idx],
                    'caption': caption
                })
                idx += 1

                # Save the result in an image file, if requested
                if config.save_test_result_as_image:
                    image_file = batch[item]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = plt.imread(image_file)
                    plt.switch_backend('agg')
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(config.test_result_dir,
                                     image_name + '_result.png'))

        # json.dump writes text, so the file must be opened in text mode
        # ('wb' raises TypeError on Python 3); `with` closes it on error.
        with open(config.test_result_file, 'w') as fp:
            json.dump(results, fp)

        # Evaluate these captions
        test_result_coco = test_gt_coco.loadRes(config.test_result_file)
        scorer = COCOEvalCap(test_gt_coco, test_result_coco)
        scorer.evaluate()
    print("Evaluation complete.")
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the caption annotations (restricted to a fixed number
    of annotations whose image file exists on disk), the vocabulary and the
    encoded captions, then wraps the result in a ``DataSet``.

    Args:
        config: Configuration object. This function reads
            ``vocabulary_size``, ``train_image_dir``, the cache file paths,
            ``max_caption_length`` and ``batch_size``.

    Returns:
        The ``DataSet`` for training.
    """
    vocabulary = Vocabulary(config.vocabulary_size)
    print("Vocabulary complete.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        # NOTE(review): `coco` is not defined in this function, so it must
        # come from an enclosing/module scope - confirm. The vocabulary has
        # also not been built or loaded yet at this point - verify that
        # filtering by `vocabulary.words` here is intended.
        coco.filter_by_words(set(vocabulary.words))
        print("Filtering the captions to those that exist")
        captions = []
        image_ids = []
        # Only the first 999 annotations are considered; stop as soon as
        # that many have been examined instead of walking the rest.
        max_annotations = 999
        for position, ann_id in enumerate(coco.anns):
            if position >= max_annotations:
                break
            entry = coco.anns[ann_id]
            should_add = entry['image_id'] in coco.imgs and os.path.exists(
                os.path.join(config.train_image_dir,
                             coco.imgs[entry['image_id']]['file_name']))
            if should_add:
                captions.append(entry['caption'])
                image_ids.append(entry['image_id'])
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        print("Loading the captions from ", config.temp_annotation_file)
        annotations = pd.read_csv(config.temp_annotation_file,
                                  encoding='latin-1')
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.vocabulary_file):
        print("Building the vocabulary...")
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        print("Loading the vocabulary from ", config.vocabulary_file)
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # NOTE: allow_pickle unpickles the cached dict, which can execute
        # arbitrary code; only load cache files produced by this pipeline.
        loaded = np.load(config.temp_data_file, allow_pickle=True)
        data = loaded.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the caption annotations and the encoded
    captions (each cached on disk) and wraps the result in a ``DataSet``.

    Args:
        config: Configuration object. This function reads the caption and
            image locations, the cache file paths, ``vocabulary_size``,
            ``max_caption_length`` and ``batch_size``.

    Returns:
        The ``DataSet`` for training.
    """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            # Captions are not length-filtered in this function, so clip
            # the encoding to the row width; an over-long caption would
            # otherwise make the slice assignment below raise ValueError.
            encoded = np.array(vocabulary.process_sentence(caption))
            encoded = encoded[:config.max_caption_length]
            current_num_words = len(encoded)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = encoded
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is passed explicitly rather than by patching
        # np.load globally.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the caption annotations and the encoded
    captions (start symbol + words + end symbol, padded to
    ``config.max_caption_length``), then wraps the result in a ``DataSet``.

    Args:
        config: Configuration object. This function reads the caption,
            ignore and image locations, the cache file paths,
            ``vocabulary_size``, ``ctrl_symbols``, ``max_caption_length``,
            ``batch_size`` and the control indices ``_START_``, ``_END_``
            and ``_PAD_``.

    Returns:
        The ``DataSet`` for training.
    """
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        # NOTE(review): rows are unpacked by position (1st, 3rd and 4th
        # columns are used). Confirm this matches the column layout of the
        # CSV actually read here - it depends on how the file was written.
        for row_id, _unused, feat, cap in annotations.values:
            image_ids.append(row_id)
            image_files.append(feat)
            captions.append(cap)

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_])
            pad_length = config.max_caption_length - current_num_words - 2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * (pad_length)
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks,
                'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 needs
        # allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        # The cached masks are deliberately not restored on this path.
        masks = None
        sent_lens = data['sentence_len']

    dataset = DataSet(coco, vocabulary, image_ids, image_files,
                      config.batch_size, word_idxs, masks, sent_lens,
                      True, True)
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the caption annotations and the encoded
    captions (each cached on disk) and wraps the result in a ``DataSet``.

    Args:
        config: Configuration object. This function reads the caption and
            image locations, the cache file paths, ``vocabulary_size``,
            ``max_caption_length`` and ``batch_size``.

    Returns:
        The ``DataSet`` for training.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 needs
        # allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the caption annotations and the encoded
    captions, then wraps the result in a ``DataSet``. When
    ``config.distributed`` is set, the captions are restricted to the
    images found in ``config.train_image_dir`` and nothing is written to
    the annotation or data cache files.

    Args:
        config: Configuration object. This function reads the caption and
            image locations, the cache file paths, ``vocabulary_size``,
            ``max_caption_length``, ``distributed`` and ``batch_size``.

    Returns:
        The ``DataSet`` for training.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    if config.distributed:
        images = os.listdir(config.train_image_dir)
        # Characters 15..26 of each file name are parsed as the image id.
        ids = [int(x[15:27]) for x in images]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(int(len(ids))))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        # The DataFrame is only needed to write the cache, which happens
        # only when not distributed. The old unconditional
        # `annotations.loc[ids]` used `ids`, which exists only in
        # distributed mode, and its result was discarded there.
        if not config.distributed:
            annotations = pd.DataFrame({
                'image_id': image_ids,
                'image_file': image_files,
                'caption': captions,
            })
            annotations.set_index('image_id', inplace=True)
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 needs
        # allow_pickle=True to read it.
        # NOTE: unpickling can execute arbitrary code; only load cache
        # files produced by this pipeline.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset