Example #1
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary
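The Vocabulary class itself is not shown in these examples. As a rough, self-contained sketch of what a frequency-capped build amounts to (the real class may tokenize, index, and serialize differently), assuming plain whitespace tokenization:

from collections import Counter

def build_capped_vocabulary(captions, max_size):
    """Map the max_size most frequent tokens to indices; 0 is reserved for OOV."""
    counts = Counter()
    for caption in captions:
        counts.update(caption.lower().split())
    words = [w for w, _ in counts.most_common(max_size)]
    return {w: i + 1 for i, w in enumerate(words)}

word_to_idx = build_capped_vocabulary(
    ['a dog on a beach', 'a cat on a mat'], max_size=5)
print(word_to_idx)   # 'a' and 'on' rank first by frequency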
Example #2
def process_train_data(config, data_loc, oracle_file=None, has_image=False):
    if data_loc is None:
        data_loc = 'data/caption.txt'
    if not has_image:
        return process_text_only(config, data_loc, oracle_file)

    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions(), oracle_file)
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        feature_files = [
            os.path.join(
                config.train_feature_dir,
                os.path.basename(coco.imgs[image_id]['file_name'].replace(
                    '.jpg', '.npy'))) for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'feature_file': feature_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        feature_files = []
        for _, image_id, image_file, feature_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            feature_files.append(feature_file)
            captions.append(caption)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    with open(config.temp_image_file, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file + "\n")
    with open(config.temp_feature_file, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature + "\n")

    return config.max_caption_length, vocabulary.size + len(
        config.ctrl_symbols), vocabulary
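The read-back branch above unpacks annotations.values positionally, which breaks silently if the cached CSV's column order ever changes. Accessing columns by name, as later examples do, is sturdier; a minimal sketch with a hypothetical cache file:

import pandas as pd

annotations = pd.DataFrame({
    'image_id': [1, 2],
    'image_file': ['img/1.jpg', 'img/2.jpg'],
    'feature_file': ['feat/1.npy', 'feat/2.npy'],
    'caption': ['a dog', 'a cat'],
})
annotations.to_csv('annotations.csv', index=False)

cached = pd.read_csv('annotations.csv')
image_ids = cached['image_id'].values      # order-independent access
captions = cached['caption'].values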
Example #3
def process_test_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())
    if not os.path.exists(config.test_temp_file):
        # Sort both listings so image and feature rows stay aligned.
        image_files = [os.path.join(config.test_image_dir, f)
                       for f in sorted(os.listdir(config.test_image_dir))]
        feature_files = [os.path.join(config.test_image_vgg_dir, f)
                         for f in sorted(os.listdir(config.test_image_vgg_dir))]
        data = pd.DataFrame({'image_file': image_files, 'feature_file': feature_files})
        data.to_csv(config.test_temp_file)

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
Example #4
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    if (config.eval_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.eval_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.eval_data_count_limit]
        image_files = image_files[0:config.eval_data_count_limit]
        """ Dump the image paths to a file """
        filepath = 'eval_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.eval_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        #print(image_files)
        print("-----------------------------------------------")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #5
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)

    if config.is_person_model == 'Y':
        file_data = pd.read_csv(config.person_eval_caption_file)
        image_ids = file_data['image_id'].values
        image_files = file_data['image_file'].values
    else:
        image_ids = list(coco.imgs.keys())
        image_files = [os.path.join(config.eval_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")

    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #6
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.max_eval_ann_num)
    image_ids = []
    image_files = []
    if not config.max_eval_ann_num:
        print('No config.max_eval_ann_num')
        image_ids = list(coco.imgs.keys())
        image_files = [
            os.path.join(config.eval_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
    else:
        print('config.max_eval_ann_num=', config.max_eval_ann_num)
        image_ids = [
            coco.anns[ann_id]['image_id']
            for ann_id in islice(coco.anns, 0, config.max_eval_ann_num)
        ]
        image_files = [
            os.path.join(config.eval_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in islice(image_ids, 0, config.max_eval_ann_num)
        ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print('Download Images')
    coco.download(config.eval_image_dir, image_ids)
    print('Finished download images')

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
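itertools.islice lets this variant cap the number of annotations without materializing the full key list first. The same idea in isolation:

from itertools import islice

anns = {101: 'cat', 102: 'dog', 103: 'car', 104: 'bus'}
first_two = list(islice(anns, 0, 2))   # iterates over the dict's keys
print(first_two)                       # [101, 102]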
Example #7
def process_val_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())


    coco = COCO(config.eval_caption_file, config.ignore_file_eval)

    all_captions = coco.all_captions()
    if not os.path.exists(config.eval_temp_file):
        df = pd.read_csv(config.ignore_file_eval).values
        ignore_ids = [int(idx) for seqno, idx in df]
        captions = []
        image_ids = []
        for ann_id in coco.anns: 
            if int(ann_id) not in ignore_ids:
                #print(ann_id)
                captions.append(coco.anns[ann_id]['caption'])
                image_ids.append(coco.anns[ann_id]['image_id'])
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        feature_files = [os.path.join(config.train_feature_dir,
                                    os.path.basename(coco.imgs[image_id]['file_name'].replace('.jpg', '.npy')))
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'feature_file': feature_files,
                                    'caption': captions})
        annotations.to_csv(config.eval_temp_file)

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
Example #8
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build(coco.all_captions()[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #9
def prepare_eval_new_data(caption_file, image_dir, config):
    """ Prepare the data for evaluating the model with new dataset. """
    coco = COCO(caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #10
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    coco = COCO(config.eval_caption_file)
    
    files = os.listdir(config.test_image_dir)
    image_files = [os.path.join(config.test_image_dir, f) for f in files
        if f.lower().endswith('.jpg') or f.lower().endswith('.jpeg')]
    image_ids = list(range(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(coco, vocabulary, image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset
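The two endswith checks above can be condensed, since str.endswith also accepts a tuple of suffixes. A small sketch of the same filter (the directory name is hypothetical):

import os

def list_jpegs(directory):
    """Full paths of .jpg/.jpeg files, matched case-insensitively."""
    return [os.path.join(directory, f)
            for f in sorted(os.listdir(directory))
            if f.lower().endswith(('.jpg', '.jpeg'))]

image_files = list_jpegs('test/images')
image_ids = list(range(len(image_files)))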
Example #11
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.ignore_file_eval)
    image_ids = list(coco.imgs.keys())
    image_files = [os.path.join(config.eval_image_dir,
                                coco.imgs[image_id]['file_name'])
                                for image_id in image_ids]
    print("IMAGE FILES SHAPE PREP DATA " + str(len(image_files)))
    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.ctrl_symbols,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(coco, vocabulary, image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset
Example #12
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if (config.train_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]
        """ Dump the image paths to a file """
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.train_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        #print(image_files)
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
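The per-caption loop above pads every index sequence to max_caption_length and records a 0/1 mask over the real tokens. The same encoding in isolation, with a hypothetical five-word limit:

import numpy as np

max_caption_length = 5
token_idxs = [7, 42, 3]                  # e.g. the output of process_sentence

word_idxs = np.zeros(max_caption_length, dtype=np.int32)
mask = np.zeros(max_caption_length)
word_idxs[:len(token_idxs)] = token_idxs
mask[:len(token_idxs)] = 1.0

print(word_idxs)   # [ 7 42  3  0  0]
print(mask)        # [1. 1. 1. 0. 0.]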
Example #13
def create_data_subset(folder_name, category):
    """
    Assuming that the dataset has already been downloaded. Here is an example of getting all images and the
    corresponding captions from the training dataset for the category horse. The images and the csv file containing
    the image captions are saved into a new folder. You can easily generalize this approach to separate all
    categories into new folders
    """
    # note this only refers to  the training set and not the validation set
    # coco = COCO('train/instances_train2014.json')

    coco = COCO('val/instances_val2014.json')

    # note this only refers to the captions of the training set and not the validation set
    #caps = COCO('train/captions_train2014.json')

    caps = COCO('val/captions_val2014.json')

    categories = coco.loadCats(coco.getCatIds())
    names = [cat['name'] for cat in categories]

    print("Available categories: ")
    for index, n in enumerate(names):
        print(index, n)

    category_ids = coco.getCatIds(catNms=[category])
    image_ids = coco.getImgIds(catIds=category_ids)
    images = coco.loadImgs(image_ids)
    annIds = caps.getAnnIds(imgIds=image_ids)
    annotations = caps.loadAnns(annIds)

    # Split the annotations every 5 captions since there are 5 captions for each image
    annotations = [annotations[x:x + 5] for x in range(0, len(annotations), 5)]

    # Create empty dataframe with two columns for the image file name and the corresponding captions
    df = pd.DataFrame(columns=['image_id', 'image_file', 'caption'])

    # Create a folder for the images and captions of the selected category
    os.makedirs(folder_name, exist_ok=True)

    print('Folder created to store images....')

    # Map each image id to its group of captions
    captions_dict = {}
    for group in annotations:
        captions_dict[group[0]['image_id']] = group

    print('Retrieving images....')

    person_file_names = []
    for img in tqdm(images):
        person_file_names.append(img['file_name'])
        for entry in captions_dict[img['id']]:
            df.loc[len(df)] = [
                entry['image_id'], img['file_name'], entry['caption']
            ]

    print('Caption csv creation in progress....')

    # Convert dataframe to csv file and save to folder
    df.to_csv(folder_name + "/val_person_captions.csv", index=False)

    print('Csv created....')

    print('Storing images....')

    # # Copy all images of given category to new folder
    # for filename in tqdm(os.listdir('train/images')):
    #     if filename in person_file_names:
    #         shutil.copy(os.path.join('train/images', filename), folder_name)

    print('Done creating data subset with images....')
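A typical call, assuming pycocotools is installed and the val2014 annotation files sit at the paths hard-coded above:

create_data_subset('val_horse_subset', 'horse')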
Example #14
def process_val_data_old(config, oracle_file=None, has_image=False, all_captions=True):

    print("Processing the captions...")
    if not os.path.exists(config.eval_temp_file):
        print("Creating temp annotation file....")
        if all_captions:
            print("ALL CAPTIONS")
            captions = []
            image_ids = []
            image_files = []
            feature_files = []
            ignore_ids = []
            #if ignore_file:

            df = pd.read_csv(config.ignore_file_eval).values
            ignore_ids = [idx for seqno, idx in df]

            with open(config.eval_caption_file, 'r') as f:
                reader = csv.reader(f)
                for id, file_name, caption in reader:
                    if int(id) not in ignore_ids:
                        image_ids.append(id)
                        image_files.append(file_name)
                        feature_files.append(os.path.join(config.train_feature_dir,
                                        os.path.basename(file_name.replace('.jpg', '.npy'))))
                        captions.append(caption)
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)

            all_captions = captions
        else:   
            print("NOT ALL CAPTIONS")   
            coco = COCO(config.eval_caption_file, config.ignore_file_eval)
            all_captions = coco.all_captions()
            captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
            image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
            image_files = [os.path.join(config.train_image_dir,
                                        coco.imgs[image_id]['file_name'])
                                        for image_id in image_ids]
            feature_files = [os.path.join(config.train_feature_dir,
                                        os.path.basename(coco.imgs[image_id]['file_name'].replace('.jpg', '.npy')))
                                        for image_id in image_ids]
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)

        print(len(image_ids), len(image_files), len(feature_files), len(captions))
    else:
        annotations = pd.read_csv(config.eval_temp_file)
        captions = [] 
        image_ids = [] 
        image_files = [] 
        feature_files = []
        for _, image_id, image_file, feature_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            feature_files.append(feature_file)
            captions.append(caption)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files), len(captions))

        all_captions = captions
    with open(config.temp_image_file_eval, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file+"\n")
    with open(config.temp_feature_file_eval, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature+"\n")

    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
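One fragility in the csv.reader branch above: if the caption file carries a header row, int(id) raises on the first record. A sniffing guard (file name hypothetical) sidesteps that:

import csv

with open('captions.csv', newline='') as f:
    sample = f.read(1024)
    f.seek(0)
    reader = csv.reader(f)
    if csv.Sniffer().has_header(sample):
        next(reader)                  # drop the header row
    for ann_id, file_name, caption in reader:
        print(ann_id, file_name, caption)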
Example #15
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    if config.distributed:
        images = os.listdir(config.train_image_dir)
        # File names look like 'COCO_train2014_000000123456.jpg';
        # characters 15:27 hold the zero-padded image id.
        ids = [int(x[15:27]) for x in images]
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(len(ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)
        #print(coco.getImgIds(ids))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.set_index('image_id', inplace=True)
        if config.distributed:
            # Keep only the annotations whose images exist on this worker.
            annotations = annotations.loc[ids]
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    #print "Input Path: " + config.train_image_dir + " Number of files after data preparation: " + str(len(image_files))
    #print "Images IDs to be used on this server: " + str(image_ids)
    return dataset
Example #16
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # np.save pickled the dict above, so NumPy >= 1.16.3 needs
        # allow_pickle=True to read it back.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    # Optionally restrict training to the first num_examples captions:
    # num_examples = 5000
    # word_idxs = word_idxs[:num_examples]
    # image_files = image_files[:num_examples]

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
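np.save pickles the {'word_idxs': ..., 'masks': ...} dict rather than storing a plain array, which is why NumPy 1.16.3+ requires allow_pickle=True on load. A minimal round trip:

import numpy as np

data = {'word_idxs': np.arange(6).reshape(2, 3),
        'masks': np.ones((2, 3))}
np.save('cache.npy', data)             # the dict is pickled inside the .npy

loaded = np.load('cache.npy', allow_pickle=True).item()
assert (loaded['word_idxs'] == data['word_idxs']).all()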
Example #17
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
Example #18
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file, config.ignore_file)
    #coco.filter_by_cap_len(config.max_caption_length)

    #print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    #print("Number of words = %d" %(vocabulary.size))

    #coco.filter_by_words(set(vocabulary.words))

    #print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        
        captions = [] 
        image_ids = [] 
        image_files = [] 

        for _, image_id, image_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            captions.append(caption)
        
    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            current_num_words = min(config.max_caption_length-2, current_length)

            current_word_idxs = [config._START_] + current_word_idxs[:current_num_words] + [config._END_]
            pad_length = config.max_caption_length - current_num_words -2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * (pad_length)
            #print("sent length:"+str(len(current_word_idxs))+", real len:"+str(current_length))
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0

            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words+2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = None #data['masks']
        sent_lens = data['sentence_len']
    #print("Captions processed.")
    #print("Number of captions = %d" %(len(captions)))
    #print("Number of word_idxs = %d" %(len(word_idxs)))
    #print("Number of sent_lens = %d" %(len(sent_lens)))
    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset
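Unlike the earlier variants, this one frames each caption with control symbols before padding. The framing step in isolation, with hypothetical symbol ids (_START_ = 1, _END_ = 2, _PAD_ = 0):

max_caption_length = 7
_START_, _END_, _PAD_ = 1, 2, 0

token_idxs = [7, 42, 3]                # indices for the real words
num_words = min(max_caption_length - 2, len(token_idxs))

framed = [_START_] + token_idxs[:num_words] + [_END_]
framed += [_PAD_] * (max_caption_length - num_words - 2)

print(framed)          # [1, 7, 42, 3, 2, 0, 0]
print(num_words + 2)   # sentence length including the control symbols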