def __init__(self, experiment_paths, experiment_config, dataset, vocab=None, include_all_boxes=False):
    """Configure the generator from an experiment config and a dataset.

    Args:
        experiment_paths: object whose ``precomputed_image_features``
            attribute names the directory holding COCO_region_features.h5.
        experiment_config: object providing ``exp_name``,
            ``train.batch_size`` and ``train.max_words``. The optional
            attributes ``pad``, ``truncate``, ``aligned`` and ``shuffle``
            are treated as True when they are absent.
        dataset: object providing ``image_refexp_pairs``,
            ``image_features`` and ``extract_image_object_features``.
        vocab: optional sequence of words; a word's position becomes its
            index. When None, ``self.init_vocabulary()`` is called instead.
        include_all_boxes: forwarded to the feature extraction call.

    Note: ``self.image_refexp_pairs`` is bound to the object returned by
    ``dataset.image_refexp_pairs``, so the padding and shuffling below are
    applied to that object in place.
    """
    def option(name):
        # Optional settings fall back to True when the config lacks them.
        if hasattr(experiment_config, name):
            return getattr(experiment_config, name)
        return True

    SequenceGenerator.__init__(self)

    self.exp_name = experiment_config.exp_name
    train_settings = experiment_config.train
    self.batch_num_streams = train_settings.batch_size
    self.max_words = train_settings.max_words
    self.pad = option('pad')
    self.truncate = option('truncate')
    self.swap_axis_streams = frozenset(['timestep_input', 'timestep_cont', 'timestep_target'])

    # Position and bookkeeping counters all start at zero.
    self.index = self.num_resets = self.num_truncates = 0
    self.num_pads = self.num_outs = 0

    self.dataset = dataset
    self.image_refexp_pairs = self.dataset.image_refexp_pairs

    # Image features are only extracted when the dataset has none yet.
    if self.dataset.image_features is None:
        features_filename = "%s/COCO_region_features.h5" % experiment_paths.precomputed_image_features
        self.dataset.extract_image_object_features(
            features_filename, feature_layer='fc7', include_all_boxes=include_all_boxes)

    # Vocabulary: build it ourselves, or invert the supplied word list.
    if vocab is None:
        self.init_vocabulary()
    else:
        self.vocabulary_inverted = vocab
        self.vocabulary = dict(
            (word, position) for position, word in enumerate(self.vocabulary_inverted))

    # Pad the pair list with randomly chosen existing pairs until its length
    # is a multiple of the number of streams, so every batch slot is used.
    if option('aligned'):
        pairs = self.image_refexp_pairs
        stream_count = self.batch_num_streams
        original_count = len(pairs)
        leftover = original_count % stream_count
        if leftover > 0:
            fillers = [pairs[random.randint(0, original_count - 1)]
                       for _ in range(stream_count - leftover)]
            pairs.extend(fillers)
        assert len(self.image_refexp_pairs) % self.batch_num_streams == 0

    if option('shuffle'):
        random.shuffle(self.image_refexp_pairs)
Beispiel #2
0
 def __init__(self, fsg_lines, vocab_filename, batch_num_streams=8, max_words=MAX_WORDS,
              pad=True, truncate=True):
     """Set up a sequence generator over ``fsg_lines``.

     Args:
         fsg_lines: the lines to generate sequences from; stored as
             ``self.lines``.
         vocab_filename: passed to ``self.init_vocabulary`` to initialize
             the vocabulary.
         batch_num_streams: stored as ``self.batch_num_streams``.
         max_words: stored as ``self.max_words``.
         pad: stored as ``self.pad``.
         truncate: stored as ``self.truncate``.
     """
     self.max_words = max_words
     self.lines = fsg_lines
     self.line_index = 0
     self.num_resets = 0
     self.num_truncates = 0
     self.num_pads = 0
     self.num_outs = 0
     self.vocabulary = {}
     self.vocabulary_inverted = []
     self.vocab_counts = []
     # initialize vocabulary
     self.init_vocabulary(vocab_filename)
     SequenceGenerator.__init__(self)
     self.batch_num_streams = batch_num_streams
     self.pad = pad
     self.truncate = truncate
     # The trailing comma makes this a one-element tuple. Without it the
     # parentheses are plain grouping and frozenset() iterates the string,
     # producing a set of single characters that never contains the
     # stream name 'target_sentence'.
     self.negative_one_padded_streams = frozenset(('target_sentence',))
Beispiel #3
0
    def __init__(self,
                 coco,
                 split_name,
                 batch_num_streams,
                 image_root,
                 vocab=None,
                 max_words=MAX_WORDS,
                 align=True,
                 shuffle=True,
                 gt_captions=True,
                 pad=True,
                 truncate=True,
                 split_ids=None):

        #split_ids: image list e.g. '2801146217_03a0b59ccb.jpg\n', '1321723162_9d4c78b8af.jpg\n'
        #
        #
        self.max_words = max_words
        num_empty_lines = 0
        self.images = []
        num_total = 0
        num_missing = 0
        num_captions = 0
        known_images = {}
        #self.coco = coco

        self.image_path_to_id = {}
        self.image_sentence_pairs = []

        ###
        ##  generate image_id list, which will be used in retrieval experiments
        ###
        if split_ids is None:
            split_ids = coco.imgs.keys()
        self.image_path_to_id = {}
        # pdb.set_trace()
        # print split_ids
        for image_id in split_ids:
            image_path = '%s/%s.jpg' % (image_root, image_id)

            # print 'image_info ',image_info
            #print 'image_path ',image_path
            self.image_path_to_id[image_path] = image_id
            #print self.image_path_to_id
            ###'./data/flickr8K/images/train/143688895_e837c3bc76.jpg': '143688895_e837c3bc76',

        self.image_sentence_pairs = split_image_captions(
            split_name, image_root)
        #print image_sentence_pairs

        #generate word vocabulary based on image caption sentences
        if vocab is None:
            self.init_vocabulary(self.image_sentence_pairs)
        else:
            self.vocabulary_inverted = vocab
            self.vocabulary = {}
            for index, word in enumerate(self.vocabulary_inverted):
                self.vocabulary[word] = index

        self.index = 0
        self.num_resets = 0
        self.num_truncates = 0
        self.num_pads = 0
        self.num_outs = 0
        self.image_list = []
        SequenceGenerator.__init__(self)
        self.batch_num_streams = batch_num_streams
        # make the number of image/sentence pairs a multiple of the buffer size
        # so each timestep of each batch is useful and we can align the images
        if align:
            num_pairs = len(self.image_sentence_pairs)
            #pdb.set_trace()
            print 'number of pairs: ', num_pairs
            remainder = num_pairs % batch_num_streams
            if remainder > 0:
                num_needed = batch_num_streams - remainder
                for i in range(num_needed):
                    choice = random.randint(0, num_pairs - 1)
                    self.image_sentence_pairs.append(
                        self.image_sentence_pairs[choice])
            assert len(self.image_sentence_pairs) % batch_num_streams == 0
        if shuffle:
            random.shuffle(self.image_sentence_pairs)
        self.pad = pad
        self.truncate = truncate
        self.negative_one_padded_streams = frozenset(
            ('input_sentence', 'target_sentence'))
Beispiel #4
0
    def __init__(self,
                 experiment_paths,
                 experiment_config,
                 dataset,
                 vocab=None,
                 include_all_boxes=False):
        """Build the generator state from experiment settings and a dataset.

        ``experiment_config`` must provide ``exp_name``, ``train.batch_size``
        and ``train.max_words``; its optional ``pad``, ``truncate``,
        ``aligned`` and ``shuffle`` attributes count as True when missing.
        ``experiment_paths.precomputed_image_features`` is the directory that
        holds COCO_region_features.h5, read only if the dataset has no image
        features yet. ``vocab``, when given, is a sequence of words whose
        positions become the word indices; otherwise
        ``self.init_vocabulary()`` is called. ``include_all_boxes`` is
        forwarded to the dataset's feature extraction.

        ``self.image_refexp_pairs`` is bound to the object returned by
        ``dataset.image_refexp_pairs``; padding and shuffling act on it in
        place.
        """
        def setting_or_true(attr_name):
            # Missing optional settings are treated as enabled.
            if not hasattr(experiment_config, attr_name):
                return True
            return getattr(experiment_config, attr_name)

        SequenceGenerator.__init__(self)

        self.exp_name = experiment_config.exp_name
        training = experiment_config.train
        self.batch_num_streams = training.batch_size
        self.max_words = training.max_words
        self.pad = setting_or_true('pad')
        self.truncate = setting_or_true('truncate')
        self.swap_axis_streams = frozenset(
            ['timestep_input', 'timestep_cont', 'timestep_target'])

        # Iteration position and statistics counters.
        self.index = 0
        self.num_resets = self.num_truncates = 0
        self.num_pads = self.num_outs = 0

        self.dataset = dataset
        self.image_refexp_pairs = self.dataset.image_refexp_pairs

        # Extract image features only when the dataset has none loaded.
        if self.dataset.image_features is None:
            features_filename = "%s/COCO_region_features.h5" % experiment_paths.precomputed_image_features
            self.dataset.extract_image_object_features(
                features_filename,
                feature_layer='fc7',
                include_all_boxes=include_all_boxes)

        # Use the given vocabulary if there is one, else build our own.
        if vocab is None:
            self.init_vocabulary()
        else:
            self.vocabulary_inverted = vocab
            self.vocabulary = dict(
                (token, token_id)
                for token_id, token in enumerate(self.vocabulary_inverted))

        # Grow the pair list to a multiple of the stream count by repeating
        # randomly chosen original pairs, so every batch timestep is used.
        if setting_or_true('aligned'):
            refexp_pairs = self.image_refexp_pairs
            streams = self.batch_num_streams
            pair_total = len(refexp_pairs)
            overhang = pair_total % streams
            if overhang > 0:
                picks = [random.randint(0, pair_total - 1)
                         for _ in range(streams - overhang)]
                refexp_pairs.extend([refexp_pairs[pick] for pick in picks])
            assert len(self.image_refexp_pairs) % self.batch_num_streams == 0

        if setting_or_true('shuffle'):
            random.shuffle(self.image_refexp_pairs)
 def __init__(self, coco, batch_num_streams, vocab=None,
              max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True,
              pad=True, truncate=True, split_ids=None):
   self.max_words = max_words
   num_empty_lines = 0
   self.images = []
   num_total = 0
   num_missing = 0
   num_captions = 0
   known_images = {}
   image_root = '%s/%s' % (COCO_PATH, coco.image_folder)
   if split_ids is None:
     split_ids = coco.images.keys()
   for image_id in split_ids:
     image_info = coco.images[image_id]
     image_path = '%s/%s/%s' % \
         (image_root, image_info['file_path'], image_info['file_name'])
     if os.path.isfile(image_path):
       assert image_id not in known_images  # no duplicates allowed
       known_images[image_id] = {}
       known_images[image_id]['path'] = image_path
       if gt_captions:
         known_images[image_id]['sentences'] = [split_sentence(anno['sentence'])
             for anno in coco.image_to_annotations[image_id]]
         num_captions += len(known_images[image_id]['sentences'])
       else:
         known_images[image_id]['sentences'] = []
     else:
       num_missing += 1
       print 'Warning (#%d): image not found: %s' % (num_missing, image_path)
     num_total += 1
   print '%d/%d images missing' % (num_missing, num_total)
   if vocab is None:
     self.init_vocabulary(known_images)
   else:
     self.vocabulary_inverted = vocab
     self.vocabulary = {}
     for index, word in enumerate(self.vocabulary_inverted):
       self.vocabulary[word] = index
   self.image_sentence_pairs = []
   num_no_sentences = 0
   for image_filename, metadata in known_images.iteritems():
     if not metadata['sentences']:
       num_no_sentences += 1
       print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename)
     for sentence in metadata['sentences']:
       self.image_sentence_pairs.append((metadata['path'], sentence))
   self.index = 0
   self.num_resets = 0
   self.num_truncates = 0
   self.num_pads = 0
   self.num_outs = 0
   self.image_list = []
   SequenceGenerator.__init__(self)
   self.batch_num_streams = batch_num_streams
   # make the number of image/sentence pairs a multiple of the buffer size
   # so each timestep of each batch is useful and we can align the images
   if align:
     num_pairs = len(self.image_sentence_pairs)
     remainder = num_pairs % batch_num_streams
     if remainder > 0:
       num_needed = batch_num_streams - remainder
       for i in range(num_needed):
         choice = random.randint(0, num_pairs - 1)
         self.image_sentence_pairs.append(self.image_sentence_pairs[choice])
     assert len(self.image_sentence_pairs) % batch_num_streams == 0
   if shuffle:
     random.shuffle(self.image_sentence_pairs)
   self.pad = pad
   self.truncate = truncate
   self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence'))
 def __init__(self,
              filenames,
              dset,
              batch_num_streams=1,
              max_frames=MAX_FRAMES,
              align=True,
              shuffle=True,
              pad=True,
              truncate=True):
     self.max_frames = max_frames
     self.lines = []
     num_empty_lines = 0
     self.vid_poolfeats = {}  # listofdict [{}]
     for poolfeatfile, sentfile in filenames:
         print 'Reading pooled features from file: %s' % poolfeatfile
         if dset == 'train':
             vid_num = 1
         elif dset == 'val':
             vid_num = 1201
         elif dset == 'test':
             vid_num = 1301
         else:
             raise Exception('Unknown video data split name: %s' % dset)
         with open(poolfeatfile, 'rb') as poolfd:
             # each line has the fc7 mean of 1 video
             for line in poolfd:
                 line = line.strip()
                 video_id = 'vid%d' % vid_num
                 if video_id not in self.vid_poolfeats:
                     self.vid_poolfeats[video_id] = []
                 self.vid_poolfeats[video_id].append(line)
                 vid_num += 1
         # reset max_words based on maximum frames in the video
         print 'Reading sentences in: %s' % sentfile
         with open(sentfile, 'r') as sentfd:
             for line in sentfd:
                 line = line.strip()
                 id_sent = line.split('\t')
                 if len(id_sent) < 2:
                     num_empty_lines += 1
                     continue
                 self.lines.append((id_sent[0], id_sent[1]))
         if num_empty_lines > 0:
             print 'Warning: ignoring %d empty lines.' % num_empty_lines
     self.line_index = 0
     self.num_resets = 0
     self.num_truncates = 0
     self.num_pads = 0
     self.num_outs = 0
     self.frame_list = []
     SequenceGenerator.__init__(self)
     self.batch_num_streams = batch_num_streams  # needed in hdf5 to seq
     # make the number of image/sentence pairs a multiple of the buffer size
     # so each timestep of each batch is useful and we can align the images
     if align:
         num_pairs = len(self.lines)
         remainder = num_pairs % BUFFER_SIZE
         if remainder > 0:
             num_needed = BUFFER_SIZE - remainder
             for i in range(num_needed):
                 choice = random.randint(0, num_pairs - 1)
                 self.lines.append(self.lines[choice])
         assert len(self.lines) % BUFFER_SIZE == 0
     if shuffle:
         random.shuffle(self.lines)
     self.pad = pad
     self.truncate = truncate
 def __init__(self, coco,split_name,batch_num_streams, image_root, vocab=None,
              max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True,
              pad=True, truncate=True, split_ids=None):
           
   #split_ids: image list e.g. '2801146217_03a0b59ccb.jpg\n', '1321723162_9d4c78b8af.jpg\n'
   #
   #
   self.max_words = max_words
   num_empty_lines = 0
   self.images = []
   num_total = 0
   num_missing = 0
   num_captions = 0
   known_images = {}
   #self.coco = coco
 
   self.image_path_to_id = {}
   self.image_sentence_pairs = []
   
   ###
   ##  generate image_id list, which will be used in retrieval experiments
   ###
   if split_ids is None:
     split_ids = coco.imgs.keys()
   self.image_path_to_id = {}
  # pdb.set_trace()
  # print split_ids
   for image_id in split_ids:
     image_path = '%s/%s.jpg' % (image_root, image_id)
     
    # print 'image_info ',image_info
     #print 'image_path ',image_path
     self.image_path_to_id[image_path] = image_id
     #print self.image_path_to_id
     ###'./data/flickr8K/images/train/143688895_e837c3bc76.jpg': '143688895_e837c3bc76', 
   
   self.image_sentence_pairs=split_image_captions(split_name,image_root)    
   #print image_sentence_pairs
 
   #generate word vocabulary based on image caption sentences
   if vocab is None:
     self.init_vocabulary(self.image_sentence_pairs)
   else:
     self.vocabulary_inverted = vocab
     self.vocabulary = {}
     for index, word in enumerate(self.vocabulary_inverted):
       self.vocabulary[word] = index
        
   self.index = 0
   self.num_resets = 0
   self.num_truncates = 0
   self.num_pads = 0
   self.num_outs = 0
   self.image_list = []
   SequenceGenerator.__init__(self)
   self.batch_num_streams = batch_num_streams
   # make the number of image/sentence pairs a multiple of the buffer size
   # so each timestep of each batch is useful and we can align the images
   if align:
     num_pairs = len(self.image_sentence_pairs) 
     #pdb.set_trace()
     print 'number of pairs: ', num_pairs
     remainder = num_pairs % batch_num_streams
     if remainder > 0:
       num_needed = batch_num_streams - remainder
       for i in range(num_needed):
         choice = random.randint(0, num_pairs - 1)
         self.image_sentence_pairs.append(self.image_sentence_pairs[choice])
     assert len(self.image_sentence_pairs) % batch_num_streams == 0
   if shuffle:
     random.shuffle(self.image_sentence_pairs)
   self.pad = pad
   self.truncate = truncate
   self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence'))