Example #1
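Note: the snippets below are shown without their import headers. A plausible shared prologue, assuming the project keeps its COCO wrapper, Vocabulary, and DataSet helpers in local modules (the module paths here are hypothetical), would be:

import os
import csv

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.coco import COCO              # project-local COCO wrapper (path assumed)
from utils.vocabulary import Vocabulary  # project-local (path assumed)
from dataset import DataSet              # project-local (path assumed)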
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    # Fall back to the config-wide limit when no explicit cap is passed in;
    # the original ignored the max_ann_num parameter entirely.
    if max_ann_num is None:
        max_ann_num = config.max_train_ann_num
    coco = COCO(config.train_caption_file, max_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if not max_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build(coco.all_captions()[:max_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary
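A minimal usage sketch, assuming a config namespace that exposes the attributes referenced above (all paths and limits below are illustrative, not from the original project):

from types import SimpleNamespace

config = SimpleNamespace(
    train_caption_file='data/captions_train2014.json',  # illustrative path
    max_train_ann_num=None,
    max_caption_length=20,
    vocabulary_size=5000,
    vocabulary_file='data/vocabulary.csv',
)
vocabulary = build_vocabulary(config)
print(vocabulary.size)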
Example #2
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #3
def process_train_data(config, data_loc, oracle_file=None, has_image=False):
    if data_loc is None:
        data_loc = 'data/caption.txt'
    if not has_image:
        return process_text_only(config, data_loc, oracle_file)

    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions(), oracle_file)
    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        feature_files = [
            os.path.join(
                config.train_feature_dir,
                os.path.basename(coco.imgs[image_id]['file_name'].replace(
                    '.jpg', '.npy'))) for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'feature_file': feature_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        feature_files = []
        for _, image_id, image_file, feature_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            feature_files.append(feature_file)
            captions.append(caption)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files),
              len(captions))
    with open(config.temp_image_file, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file + "\n")
    with open(config.temp_feature_file, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature + "\n")

    return config.max_caption_length, vocabulary.size + len(
        config.ctrl_symbols), vocabulary
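The positional unpacking in the else branch above depends on the exact column order DataFrame.to_csv writes (index first). A sketch of the column-name-based equivalent, which several of the later examples already use:

annotations = pd.read_csv(config.temp_annotation_file)
image_ids = annotations['image_id'].tolist()
image_files = annotations['image_file'].tolist()
feature_files = annotations['feature_file'].tolist()
captions = annotations['caption'].tolist()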
Example #4
def process_test_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())
    if not os.path.exists(config.test_temp_file):
        image_files = [os.path.join(config.test_image_dir, f)
                       for f in os.listdir(config.test_image_dir)]
        feature_files = [os.path.join(config.test_image_vgg_dir, f)
                         for f in os.listdir(config.test_image_vgg_dir)]
        data = pd.DataFrame({'image_file': image_files, 'feature_file': feature_files})
        data.to_csv(config.test_temp_file)

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
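os.listdir returns entries in arbitrary order, so building image_files and feature_files from two independent listings only pairs them correctly by luck. A sketch that derives each feature path from its image filename instead, assuming the .jpg-to-.npy convention used throughout these examples:

image_names = sorted(os.listdir(config.test_image_dir))
image_files = [os.path.join(config.test_image_dir, f) for f in image_names]
feature_files = [os.path.join(config.test_image_vgg_dir, f.replace('.jpg', '.npy'))
                 for f in image_names]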
Example #5
def process_val_data(config):
    #vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())

    coco = COCO(config.eval_caption_file, config.ignore_file_eval)

    all_captions = coco.all_captions()
    if not os.path.exists(config.eval_temp_file):
        df = pd.read_csv(config.ignore_file_eval).values
        ignore_ids = [int(idx) for seqno, idx in df]
        captions = []
        image_ids = []
        for ann_id in coco.anns: 
            if int(ann_id) not in ignore_ids:
                #print(ann_id)
                captions.append(coco.anns[ann_id]['caption'])
                image_ids.append(coco.anns[ann_id]['image_id'])
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        feature_files = [os.path.join(config.train_feature_dir,
                                    os.path.basename(coco.imgs[image_id]['file_name'].replace('.jpg', '.npy')))
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'feature_file': feature_files,
                                    'caption': captions})
        annotations.to_csv(config.eval_temp_file)

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
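Membership tests against a Python list are O(n), so the filtering loop above is quadratic in the number of annotations. A sketch of the same filter with a set, under the same two-column ignore-file layout:

ignore_ids = {int(idx) for _, idx in pd.read_csv(config.ignore_file_eval).values}
captions = [coco.anns[a]['caption'] for a in coco.anns if int(a) not in ignore_ids]
image_ids = [coco.anns[a]['image_id'] for a in coco.anns if int(a) not in ignore_ids]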
Example #6
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, encoding="latin1",
                       allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
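A worked illustration of the padding and masking step above, assuming max_caption_length = 8 and a caption that maps to five word indices:

current_word_idxs_ = [12, 7, 95, 4, 3]
current_word_idxs = np.zeros(8, dtype=np.int32)
current_masks = np.zeros(8)
current_word_idxs[:5] = np.array(current_word_idxs_)
current_masks[:5] = 1.0
# current_word_idxs -> [12  7 95  4  3  0  0  0]
# current_masks     -> [1. 1. 1. 1. 1. 0. 0. 0.]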
Example #7
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, encoding='latin1',
                       allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if config.train_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[:config.train_data_count_limit]
        image_files = image_files[:config.train_data_count_limit]
        word_idxs = word_idxs[:config.train_data_count_limit]
        masks = masks[:config.train_data_count_limit]
        # Dump the image paths to a file
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
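The temp_data_file round-trip used in these examples stores a Python dict inside a zero-dimensional NumPy object array, which is why loading it on NumPy >= 1.16.3 needs allow_pickle=True and .item() to unwrap the dict. A minimal self-contained sketch:

data = {'word_idxs': np.zeros((2, 8), dtype=np.int32), 'masks': np.ones((2, 8))}
np.save('temp_data.npy', data)
loaded = np.load('temp_data.npy', allow_pickle=True).item()
assert loaded['word_idxs'].shape == (2, 8)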
Example #8
def process_val_data_old(config, oracle_file=None, has_image=False, all_captions=True):

    print("Processing the captions...")
    if not os.path.exists(config.eval_temp_file):
        print("Creating temp annotation file....")
        if all_captions:
            print("ALL CAPTIONS")
            captions = []
            image_ids = []
            image_files = []
            feature_files = []
            df = pd.read_csv(config.ignore_file_eval).values
            ignore_ids = [int(idx) for seqno, idx in df]

            with open(config.eval_caption_file, 'r') as f:
                reader = csv.reader(f)
                for ann_id, file_name, caption in reader:
                    if int(ann_id) not in ignore_ids:
                        image_ids.append(ann_id)
                        image_files.append(file_name)
                        feature_files.append(os.path.join(config.train_feature_dir,
                                        os.path.basename(file_name.replace('.jpg', '.npy'))))
                        captions.append(caption)
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)

            all_captions = captions
        else:
            print("NOT ALL CAPTIONS")
            coco = COCO(config.eval_caption_file, config.ignore_file_eval)
            all_captions = coco.all_captions()
            captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
            image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
            image_files = [os.path.join(config.train_image_dir,
                                        coco.imgs[image_id]['file_name'])
                                        for image_id in image_ids]
            feature_files = [os.path.join(config.train_feature_dir,
                                        os.path.basename(coco.imgs[image_id]['file_name'].replace('.jpg', '.npy')))
                                        for image_id in image_ids]
            annotations = pd.DataFrame({'image_id': image_ids,
                                        'image_file': image_files,
                                        'feature_file': feature_files,
                                        'caption': captions})
            annotations.to_csv(config.eval_temp_file)

        print(len(image_ids), len(image_files), len(feature_files), len(captions))
    else:
        annotations = pd.read_csv(config.eval_temp_file)
        captions = []
        image_ids = []
        image_files = []
        feature_files = []
        for _, image_id, image_file, feature_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            feature_files.append(feature_file)
            captions.append(caption)
        print("load data...")
        print(len(image_ids), len(image_files), len(feature_files), len(captions))

        all_captions = captions
    with open(config.temp_image_file_eval, 'w') as outfile:
        for img_file in image_files:
            outfile.write(img_file+"\n")
    with open(config.temp_feature_file_eval, 'w') as outfile:
        for feature in feature_files:
            outfile.write(feature+"\n")

    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = build_vocabulary(config, coco.all_captions())

    return config.max_caption_length, vocabulary.size + len(config.ctrl_symbols), vocabulary
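One caveat in the ALL CAPTIONS branch above: if eval_caption_file starts with a header row, int(ann_id) raises ValueError on the very first line. A sketch that skips an assumed header before the loop:

with open(config.eval_caption_file, 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row, if the file has one
    for ann_id, file_name, caption in reader:
        ...  # same filtering as above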
Example #9
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file, config.ignore_file)
    #coco.filter_by_cap_len(config.max_caption_length)

    #print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    #print("Number of words = %d" %(vocabulary.size))

    #coco.filter_by_words(set(vocabulary.words))

    #print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)

        captions = []
        image_ids = []
        image_files = []

        # Row layout: (csv index, image_id, image_file, caption). The original
        # appended the csv index as the image id by mistake.
        for _, image_id, image_file, caption in annotations.values:
            image_ids.append(image_id)
            image_files.append(image_file)
            captions.append(caption)

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            current_num_words = min(config.max_caption_length-2, current_length)

            current_word_idxs = [config._START_] + current_word_idxs[:current_num_words] + [config._END_]
            pad_length = config.max_caption_length - current_num_words -2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * (pad_length)
            #print("sent length:"+str(len(current_word_idxs))+", real len:"+str(current_length))
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0

            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words+2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = None #data['masks']
        sent_lens = data['sentence_len']
    #print("Captions processed.")
    #print("Number of captions = %d" %(len(captions)))
    #print("Number of word_idxs = %d" %(len(word_idxs)))
    #print("Number of sent_lens = %d" %(len(sent_lens)))
    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset
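A worked illustration of the bracketing and padding in this variant, assuming max_caption_length = 8, _START_ = 1, _END_ = 2, _PAD_ = 0, and a caption that maps to four word indices:

current_word_idxs = [1] + [12, 7, 95, 4] + [2]  # _START_ + tokens + _END_, length 6
current_word_idxs += [0] * (8 - 4 - 2)          # pad with _PAD_ up to max length
# -> [1, 12, 7, 95, 4, 2, 0, 0]; sentence_len = 4 + 2 = 6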
Example #10
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    if config.distributed:
        images = os.listdir(config.train_image_dir)
        # Slice the zero-padded image id out of COCO 2014 file names.
        ids = [int(x[15:27]) for x in images]
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(len(ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)
        #print(coco.getImgIds(ids))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        if config.distributed:
            # 'ids' only exists in distributed mode (built from the image dir above).
            annotations.set_index('image_id', inplace=True)
            annotations = annotations.loc[ids]
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    #print "Input Path: " + config.train_image_dir + " Number of files after data preparation: " + str(len(image_files))
    #print "Images IDs to be used on this server: " + str(image_ids)
    return dataset
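The distributed branch's x[15:27] slice relies on the fixed COCO 2014 naming scheme: the prefix 'COCO_train2014_' is 15 characters, followed by a 12-digit zero-padded id.

int('COCO_train2014_000000123456.jpg'[15:27])  # -> 123456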
Example #11
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # NumPy >= 1.16.3 requires allow_pickle=True to load pickled object arrays.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    # Optionally restrict training to the first num_examples captions:
    # num_examples = 5000
    # word_idxs = word_idxs[:num_examples]
    # image_files = image_files[:num_examples]

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset