import os
import csv

import numpy as np
import pandas as pd
from tqdm import tqdm

# COCO, Vocabulary, and DataSet are project-local helpers and are assumed to
# be importable from elsewhere in this package.


def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    # Note: max_ann_num is currently unused; the annotation limit is read
    # from config.max_train_ann_num instead.
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)
    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build(coco.all_captions()[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary

def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary

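
# A minimal usage sketch for build_vocabulary. The attribute names on the
# config object mirror those referenced above; the file paths are placeholders
# and must point at real COCO-format caption data, so the call itself is left
# commented out.
from types import SimpleNamespace

example_config = SimpleNamespace(
    train_caption_file='data/captions_train2014.json',  # placeholder path
    max_caption_length=20,
    vocabulary_size=5000,
    vocabulary_file='data/vocabulary.csv',               # placeholder path
)
# vocabulary = build_vocabulary(example_config)
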
def process_train_data(config, data_loc, oracle_file=None, has_image=False):
    if data_loc is None:
        data_loc = 'data/caption.txt'
    if not has_image:
        return process_text_only(config, data_loc, oracle_file)
    coco = COCO(config.train_caption_file, config.ignore_file)
    vocabulary = build_vocabulary(config, coco.all_captions(), oracle_file)
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        feature_files = [
            os.path.join(
                config.train_feature_dir,
                os.path.basename(
                    coco.imgs[image_id]['file_name'].replace('.jpg', '.npy')))
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'feature_file': feature_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        feature_files = []
        # Rows are (index, image_id, image_file, feature_file, caption).
        for _, id, file, feature, cap in annotations.values:
            image_ids.append(id)
            image_files.append(file)
            feature_files.append(feature)
            captions.append(cap)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    with open(config.temp_image_file, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file + "\n")
    with open(config.temp_feature_file, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature + "\n")
    return (config.max_caption_length,
            vocabulary.size + len(config.ctrl_symbols), vocabulary)

def process_test_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)
    vocabulary = build_vocabulary(config, coco.all_captions())
    if not os.path.exists(config.test_temp_file):
        image_files = [config.test_image_dir + f
                       for f in os.listdir(config.test_image_dir)]
        feature_files = [config.test_image_vgg_dir + f
                         for f in os.listdir(config.test_image_vgg_dir)]
        data = pd.DataFrame({'image_file': image_files,
                             'feature_file': feature_files})
        data.to_csv(config.test_temp_file)
    return (config.max_caption_length,
            vocabulary.size + len(config.ctrl_symbols), vocabulary)

def process_val_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)
    vocabulary = build_vocabulary(config, coco.all_captions())
    coco = COCO(config.eval_caption_file, config.ignore_file_eval)
    all_captions = coco.all_captions()
    if not os.path.exists(config.eval_temp_file):
        df = pd.read_csv(config.ignore_file_eval).values
        ignore_ids = [int(idx) for seqno, idx in df]
        captions = []
        image_ids = []
        for ann_id in coco.anns:
            if int(ann_id) not in ignore_ids:
                #print(ann_id)
                captions.append(coco.anns[ann_id]['caption'])
                image_ids.append(coco.anns[ann_id]['image_id'])
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        feature_files = [os.path.join(
                             config.train_feature_dir,
                             os.path.basename(coco.imgs[image_id]['file_name']
                                              .replace('.jpg', '.npy')))
                         for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'feature_file': feature_files,
                                    'caption': captions})
        annotations.to_csv(config.eval_temp_file)
        # Note: this second write overwrites the annotations just saved to
        # config.eval_temp_file, keeping only the file columns.
        data = pd.DataFrame({'image_file': image_files,
                             'feature_file': feature_files})
        data.to_csv(config.eval_temp_file)
    return (config.max_caption_length,
            vocabulary.size + len(config.ctrl_symbols), vocabulary)

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    coco.filter_by_words(set(vocabulary.words))
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is needed to load the saved dict with newer NumPy.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))
    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    coco.filter_by_words(set(vocabulary.words))
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is needed to load the saved dict with newer NumPy.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))
    print("Building the dataset...")
    if config.train_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]
        # Dump the image paths to a file.
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.train_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        #print(image_files)
        print("-----------------------------------------------")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

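
# A small, self-contained sketch of the fixed-length caption encoding used in
# the prepare_train_data variants above: each caption is mapped to word
# indices, right-padded with zeros to max_caption_length, and a 0/1 mask marks
# the real tokens. The toy word_to_idx lookup is hypothetical and stands in
# for vocabulary.process_sentence.
import numpy as np

max_caption_length = 8
word_to_idx = {'a': 1, 'dog': 2, 'runs': 3, 'on': 4, 'grass': 5}  # toy vocab

caption = 'a dog runs on grass'
token_idxs = [word_to_idx[w] for w in caption.split()]
num_words = len(token_idxs)

padded_idxs = np.zeros(max_caption_length, dtype=np.int32)
mask = np.zeros(max_caption_length)
padded_idxs[:num_words] = np.array(token_idxs)
mask[:num_words] = 1.0

print(padded_idxs)  # [1 2 3 4 5 0 0 0]
print(mask)         # [1. 1. 1. 1. 1. 0. 0. 0.]
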
def process_val_data_old(config, oracle_file=None, has_image=False,
                         all_captions=True):
    print("Processing the captions...")
    if not os.path.exists(config.eval_temp_file):
        print("Creating temp annotation file....")
        if all_captions:
            print("ALL CAPTIONS")
            captions = []
            image_ids = []
            image_files = []
            feature_files = []
            ignore_ids = []
            #if ignore_file:
            df = pd.read_csv(config.ignore_file_eval).values
            ignore_ids = [idx for seqno, idx in df]
            ann_file = pd.read_csv(config.eval_caption_file)
            with open(config.eval_caption_file, 'r') as f:
                reader = csv.reader(f)
                for id, file_name, caption in reader:
                    if int(id) not in ignore_ids:
                        image_ids.append(id)
                        image_files.append(file_name)
                        feature_files.append(
                            os.path.join(
                                config.train_feature_dir,
                                os.path.basename(
                                    file_name.replace('.jpg', '.npy'))))
                        captions.append(caption)
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)
            all_captions = captions
        else:
            print("NOT ALL CAPTIONS")
            coco = COCO(config.eval_caption_file, config.ignore_file_eval)
            all_captions = coco.all_captions()
            captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
            image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
            image_files = [os.path.join(config.train_image_dir,
                                        coco.imgs[image_id]['file_name'])
                           for image_id in image_ids]
            feature_files = [os.path.join(
                                 config.train_feature_dir,
                                 os.path.basename(
                                     coco.imgs[image_id]['file_name']
                                     .replace('.jpg', '.npy')))
                             for image_id in image_ids]
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    else:
        annotations = pd.read_csv(config.eval_temp_file)
        captions = []
        image_ids = []
        image_files = []
        feature_files = []
        for _, id, file, feature, cap in annotations.values:
            image_ids.append(id)
            image_files.append(file)
            feature_files.append(feature)
            captions.append(cap)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
        all_captions = captions
    with open(config.temp_image_file_eval, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file + "\n")
    with open(config.temp_feature_file_eval, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature + "\n")
    coco = COCO(config.train_caption_file, config.ignore_file)
    vocabulary = build_vocabulary(config, coco.all_captions())
    return (config.max_caption_length,
            vocabulary.size + len(config.ctrl_symbols), vocabulary)

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file, config.ignore_file)
    #coco.filter_by_cap_len(config.max_caption_length)
    #print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    #print("Number of words = %d" % (vocabulary.size))
    #coco.filter_by_words(set(vocabulary.words))
    #print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        # Rows are (index, image_id, image_file, caption), matching the
        # DataFrame written above.
        for _, image_id, image_file, cap in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            captions.append(cap)
    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)
            current_word_idxs = ([config._START_] +
                                 current_word_idxs[:current_num_words] +
                                 [config._END_])
            pad_length = config.max_caption_length - current_num_words - 2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * pad_length
            #print("sent length:" + str(len(current_word_idxs)) +
            #      ", real len:" + str(current_length))
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks,
                'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        masks = None  #data['masks']
        sent_lens = data['sentence_len']
    #print("Captions processed.")
    #print("Number of captions = %d" % (len(captions)))
    #print("Number of word_idxs = %d" % (len(word_idxs)))
    #print("Number of sent_lens = %d" % (len(sent_lens)))
    dataset = DataSet(coco, vocabulary, image_ids, image_files,
                      config.batch_size, word_idxs, masks, sent_lens, True,
                      True)
    return dataset

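
# A self-contained sketch of the bracketed encoding used in the variant above:
# the word indices are wrapped with start/end markers, padded out to
# max_caption_length, and the stored sentence length counts the two markers.
# The control-symbol ids below (_START_, _END_, _PAD_) are illustrative values,
# not the project's actual configuration.
import numpy as np

max_caption_length = 10
_START_, _END_, _PAD_ = 1, 2, 0   # hypothetical control-symbol ids

token_idxs = [7, 12, 9, 33]       # e.g. indices for "a dog runs fast"
real_len = len(token_idxs)

num_words = min(max_caption_length - 2, real_len)
encoded = [_START_] + token_idxs[:num_words] + [_END_]
pad_length = max_caption_length - num_words - 2
if pad_length > 0:
    encoded += [_PAD_] * pad_length

mask = np.zeros(max_caption_length)
mask[:num_words] = 1.0
sent_len = num_words + 2

print(encoded)   # [1, 7, 12, 9, 33, 2, 0, 0, 0, 0]
print(sent_len)  # 6
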
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    if config.distributed:
        images = os.listdir(config.train_image_dir)
        ids = [int(x[15:27]) for x in images]
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(int(len(ids))))
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    coco.filter_by_words(set(vocabulary.words))
    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)
        #print(coco.getImgIds(ids))
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.set_index('image_id', inplace=True)
        if config.distributed:
            # ids is only defined when config.distributed is set (see above).
            annotations = annotations.loc[ids]
        if not config.distributed:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))
    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    #print("Input Path: " + config.train_image_dir +
    #      " Number of files after data preparation: " + str(len(image_files)))
    #print("Images IDs to be used on this server: " + str(image_ids))
    return dataset

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)
    coco.filter_by_words(set(vocabulary.words))
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # RV: allow_pickle=True is required so np.load can unpickle the dict.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))
    # RV
    # Optionally select only the first num_examples captions:
    # num_examples = 5000
    # word_idxs = word_idxs[:num_examples]
    # image_files = image_files[:num_examples]
    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

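
# A hedged usage sketch for prepare_train_data. The attribute names mirror
# those referenced above; every path and numeric value is a placeholder, and
# the call is left commented out because it needs the actual COCO images and
# captions on disk.
from types import SimpleNamespace

example_train_config = SimpleNamespace(
    train_caption_file='data/captions_train2014.json',  # placeholder
    train_image_dir='data/train2014/',                   # placeholder
    vocabulary_file='data/vocabulary.csv',               # placeholder
    temp_annotation_file='data/anns.csv',                # placeholder
    temp_data_file='data/data.npy',                      # placeholder
    vocabulary_size=5000,
    max_caption_length=20,
    batch_size=32,
)
# dataset = prepare_train_data(example_train_config)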