Example #1
  def get_streams(self):
    ((image_filename, image_id), object_id_list, line) = self.image_refexp_pairs[self.index]
    if image_id in self.dataset.imgs_with_errors:
      line = EOS_IDENTIFIER

    stream = get_encoded_line(line, self.vocabulary)
    # Assumes stream has EOS word at the end
    assert (stream[-1] == self.vocabulary[EOS_IDENTIFIER])
    stream = stream[:-1]
    filtered_stream = []
    for word in stream:
      if word != self.vocabulary[UNK_IDENTIFIER]:
        filtered_stream.append(word)
    stream = filtered_stream
    if self.truncate and len(stream) >= self.max_words:
      stream = stream[:self.max_words-1]
      self.num_truncates += 1
    pad = self.max_words - (len(stream) + 1) if self.pad else 0
    if pad > 0:
      self.num_pads += 1

    out = {}
    out['timestep_input']  = np.asarray([self.vocabulary[EOS_IDENTIFIER]] + stream + [-1] * pad, float)
    out['timestep_cont']   = np.asarray([0] + [1] * len(stream) + [0] * pad, float)
    out['timestep_target'] = np.asarray(stream + [self.vocabulary[EOS_IDENTIFIER]] + [-1] * pad, float)

    # Write image features to batch
    img_info = self.dataset.loadImgs(image_id)[0]
    img_wd = float(img_info['width'])
    img_ht = float(img_info['height'])

    out['fc7_img'] = self.dataset.image_features[str((image_id, [0, 0, int(img_wd - 1), int(img_ht - 1)]))][0]

    assert(object_id_list[0]==-1)
    object_id = object_id_list[1]
    bbox = self.dataset.loadAnns(object_id)[0]['bbox']
    out['fc7_obj'] = self.dataset.image_features[str((image_id, bbox))][0]
    bbox_area_ratio = (bbox[2] * bbox[3]) / (img_wd * img_ht)
    bbox_x1y1x2y2 = [bbox[0] / img_wd, bbox[1] / img_ht, (bbox[0] + bbox[2]) / img_wd, (bbox[1] + bbox[3]) / img_ht]
    bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
    out['bbox_features'] = bbox_features

    self.num_outs += 1
    self.next_line()
    return out
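Every example on this page builds the same 5-element spatial feature for a COCO-style [x, y, w, h] box: the two corners normalized by image width and height, plus the box-to-image area ratio. A minimal standalone sketch of that computation (the helper name bbox_spatial_features is made up for illustration):

def bbox_spatial_features(bbox, img_wd, img_ht):
    # bbox is [x, y, w, h]; returns [x1/W, y1/H, x2/W, y2/H, area_ratio].
    x, y, w, h = bbox
    img_wd, img_ht = float(img_wd), float(img_ht)
    area_ratio = (w * h) / (img_wd * img_ht)
    return [x / img_wd, y / img_ht,
            (x + w) / img_wd, (y + h) / img_ht,
            area_ratio]

# A 100x50 box at (10, 20) in a 640x480 image:
print(bbox_spatial_features([10, 20, 100, 50], 640, 480))
# [0.015625, 0.0416..., 0.171875, 0.1458..., 0.0162...]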
Example #3
  def comprehension_experiment(self, experiment_paths, proposal_source='gt', visualize=False, eval_method=None):
    output_h5_file = '%s/COCO_region_features.h5' % experiment_paths.precomputed_image_features
    self.extract_image_features(experiment_paths, proposal_source, output_h5_file)
    h5file = h5py.File(output_h5_file, 'r')

    num_images = len(self.images)
    random.seed()
    random.shuffle(self.images)
    results = []
    for (i,image_id) in enumerate(self.images):
      image = self.dataset.loadImgs(image_id)[0]
      if proposal_source != 'gt':
        bboxes = [cand['bounding_box'] for cand in image['region_candidates']]
      else:
        obj_anns = self.dataset.coco.imgToAnns[image_id]
        bboxes = [ann['bbox'] for ann in obj_anns]

      if len(bboxes) == 0:
        print("No region candidates for %d" % image_id)
        anns = self.dataset.img_to_refexps[image_id]
        for ann in anns:
          gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
          result = {'annotation_id':gt_obj, 'predicted_bounding_boxes':[], 'refexp':ann['refexp'][0]}
          results.append(result)
        continue

      # Object region features
      for obj_i in range(len(bboxes)):
        feats = h5file[str((image_id,bboxes[obj_i]))][:]
        if obj_i == 0:
          obj_feats = feats
        else:
          obj_feats = np.vstack((obj_feats, feats))
      # Image region features
      img_wd = int(image['width'])
      img_ht = int(image['height'])
      img_feats = h5file[str((image_id,[0,0,img_wd-1,img_ht-1]))][:]
      img_feats = np.tile(img_feats,(len(obj_feats),1))
      # Bounding box features
      bbox_features = []
      for bbox in bboxes:
        img_wd = float(img_wd)
        img_ht = float(img_ht)
        bbox_area_ratio = (bbox[2]*bbox[3])/(img_wd*img_ht)
        bbox_x1y1x2y2 = [bbox[0]/img_wd, bbox[1]/img_ht,
                         min(1., (bbox[0]+bbox[2])/img_wd), min(1., (bbox[1]+bbox[3])/img_ht)]
        bbox_features.append(bbox_x1y1x2y2 + [bbox_area_ratio])

      anns = self.dataset.img_to_refexps[image_id]
      for ann in anns:
        prefix_words_unfiltered = get_encoded_line(ann['refexp'], self.lang_model.vocab)
        prefix_words = []
        for word in prefix_words_unfiltered:
          if word != self.lang_model.vocab[UNK_IDENTIFIER]:
            prefix_words.append(word)
        prefix_words = [prefix_words] * len(bboxes)
        output_captions, output_probs = self.lang_model.sample_captions(obj_feats, img_feats, bbox_features,
                                                                        prefix_words=prefix_words)
        stats = [gen_stats(output_prob) for output_prob in output_probs]
        stats = [stat['log_p_word'] for stat in stats]
        (sort_keys, sorted_stats) = zip(*sorted(enumerate(stats), key=lambda x:-x[1]))
        top_k = 10 if len(sort_keys) > 10 else len(sort_keys)
        top_bboxes = [bboxes[k] for k in sort_keys[:top_k]]

        gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
        result = {'annotation_id':gt_obj, 'predicted_bounding_boxes':top_bboxes, 'refexp':ann['refexp']}

        if visualize:
          img_filename = '%s/%s' % (self.dataset.image_root, self.dataset.loadImgs(image_id)[0]['file_name'])
          im = mpimg.imread(img_filename)
          plt.cla()
          plt.imshow(im)
          plt.axis('off')
          plt.title(ann['refexp'])

          if gt_obj != -1:
            gt_box = self.dataset.coco.loadAnns(gt_obj)[0]['bbox']
            plt.gca().add_patch(plt.Rectangle((gt_box[0], gt_box[1]),gt_box[2], gt_box[3],
                                              fill=False, edgecolor='g', linewidth=3))

          top_box = top_bboxes[0]
          plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]),top_box[2], top_box[3],
                                            fill=False, edgecolor='r', linewidth=3))
          #top_box_score = stats[bboxes.index(top_box)]
          #plt.text(top_box[0], top_box[1], str(top_box_score), fontsize=12, bbox=dict(facecolor='red', alpha=1))

          ipdb.set_trace()

        results.append(result)

      sys.stdout.write("\rDone with %d/%d images" % (i+1,num_images))
      sys.stdout.flush()

    sys.stdout.write("\n")
    h5file.close()
    return results
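The loop above scores every candidate box with the language model and keeps the ten highest-scoring ones. Separated from the model, the ranking step is just a sort by score; a small illustrative sketch (names and scores are made up):

def rank_bboxes(bboxes, scores, top_k=10):
    # Sort candidate boxes by score, highest first, and keep at most top_k.
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    return [bboxes[i] for i in order[:top_k]]

boxes = [[0, 0, 10, 10], [5, 5, 20, 20], [1, 1, 8, 8]]
scores = [-2.3, -0.7, -5.1]          # e.g. per-box log p(word) statistics
print(rank_bboxes(boxes, scores))    # [[5, 5, 20, 20], [0, 0, 10, 10], [1, 1, 8, 8]]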
Example #4
  def comprehension_experiment(self, experiment_paths, proposal_source='gt', visualize=False, eval_method=None):
    output_h5_file = '%s/COCO_region_features.h5' % experiment_paths.precomputed_image_features
    self.extract_image_features(experiment_paths, proposal_source, output_h5_file)
    h5file = h5py.File(output_h5_file, 'r')

    if eval_method is None:
      eval_methods = ['noisy_or', 'max', 'image_context_only']
    else:
      eval_methods = [eval_method]

    results = defaultdict(list)
    num_images = len(self.images)
    random.seed()
    random.shuffle(self.images)
    for (i, image_id) in enumerate(self.images):
      image = self.dataset.loadImgs(image_id)[0]
      if proposal_source != 'gt':
        bboxes = [cand['bounding_box'] for cand in image['region_candidates']]
      else:
        anns = self.dataset.coco.imgToAnns[image_id]
        bboxes = [ann['bbox'] for ann in anns]

      if len(bboxes) == 0:
        print("No region candidates for %d" % image_id)
        anns = self.dataset.img_to_refexps[image_id]
        for ann in anns:
          gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
          result = {'annotation_id':gt_obj, 'predicted_bounding_boxes':[], 'refexp':ann['refexp']}
          for method in eval_methods:
            results[method].append(result)
        continue

      # Image region features
      batch_size = len(bboxes)
      img_wd = int(image['width'])
      img_ht = int(image['height'])
      fc7_img = h5file[str((image_id,[0,0,img_wd-1,img_ht-1]))][:]

      img_wd = float(img_wd)
      img_ht = float(img_ht)
      image_feature_length = len(fc7_img[0])
      # Any change to context_length value will also require a change in the deploy prototxt
      context_length = 10
      fc7_obj = np.zeros((batch_size,context_length,image_feature_length))
      context_fc7 = np.tile(fc7_img,(batch_size,context_length,1))
      bbox_features = np.zeros((batch_size,context_length,5))
      context_bbox_features = np.zeros((batch_size,context_length, 5),np.float16)

      context_bboxes = []
      for (bbox_idx, bbox) in enumerate(bboxes):
        # Object region features
        fc7_obj[bbox_idx,:] = h5file[str((image_id,bbox))][:]

        # Bounding box features
        bbox_area_ratio = (bbox[2]*bbox[3])/(img_wd*img_ht)
        bbox_x1y1x2y2 = [bbox[0]/img_wd, bbox[1]/img_ht,
                         min(1., (bbox[0]+bbox[2])/img_wd), min(1., (bbox[1]+bbox[3])/img_ht)]
        obj_bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
        bbox_features[bbox_idx,:] = obj_bbox_features
        context_bbox_features[bbox_idx,:] = [0,0,1,1,1]

        # Context features
        other_bboxes = list(bboxes)  # make a copy
        other_bboxes.remove(bbox)

        if len(other_bboxes) > context_length-1:
          rand_sample = sorted(random.sample(range(len(other_bboxes)),context_length-1))
          other_bboxes = [other_bboxes[idx] for idx in rand_sample]

        context_bboxes.append(other_bboxes)

        for (other_bbox_idx, other_bbox) in enumerate(other_bboxes):
          other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
          other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                                 (other_bbox[0] + other_bbox[2]) / img_wd, (other_bbox[1] + other_bbox[3]) / img_ht]
          other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
          feats = h5file[str((image_id,other_bbox))][:]
          context_fc7[bbox_idx,other_bbox_idx,:] = feats
          context_bbox_features[bbox_idx,other_bbox_idx,:] = other_bbox_features

      for elem in context_bboxes:
        elem.append([0,0,img_wd-1,img_ht-1])

      anns = self.dataset.img_to_refexps[image_id]
      for ann in anns:
        prefix_words_unfiltered = get_encoded_line(ann['refexp'], self.lang_model.vocab)
        prefix_words = []
        for word in prefix_words_unfiltered:
          if word != self.lang_model.vocab[UNK_IDENTIFIER]:
            prefix_words.append(word)
        prefix_words = [prefix_words] * batch_size
        output_captions, output_probs, \
        output_all_probs = self.lang_model.sample_captions_with_context(fc7_obj, bbox_features,
                                                                        context_fc7, context_bbox_features,
                                                                        prefix_words=prefix_words)
        all_stats = [gen_stats(output_prob) for output_prob in output_all_probs]
        all_stats_p_word = [stat['p_word'] for stat in all_stats]
        all_stats_p_word = np.reshape(all_stats_p_word, (batch_size, context_length))

        # Initialize here so the visualization block below does not hit a
        # NameError when 'noisy_or' or 'image_context_only' is not among the
        # selected eval methods.
        noisy_or_top_box = None
        image_top_bbox = None
        for method in eval_methods:
          if method == 'noisy_or':
            num_context_objs = min(context_length-1,len(bboxes)-1)
            sort_all_stats_p_word = -np.sort(-all_stats_p_word[:,0:num_context_objs])
            top_all_stats_p_word = np.hstack((sort_all_stats_p_word,all_stats_p_word[:,-1:]))
            stats = (1 - np.prod(1 - top_all_stats_p_word, axis=1))
          elif method == 'image_context_only':
            stats = all_stats_p_word[:,-1]
          elif method == 'max':
            stats = np.max(all_stats_p_word,axis=1)
          else:
            raise ValueError("Unknown eval method %s" % method)

          (sort_keys, sorted_stats) = zip(*sorted(enumerate(stats), key=lambda x:-x[1]))
          top_k = 10 if len(sort_keys) > 10 else len(sort_keys)
          top_bboxes = [bboxes[k] for k in sort_keys[:top_k]]
          gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
          result = {'annotation_id':gt_obj, 'predicted_bounding_boxes':top_bboxes, 'refexp':ann['refexp']}
          results[method].append(result)

          gt_box = self.dataset.coco.loadAnns(gt_obj)[0]['bbox']
          if method == 'noisy_or':
            noisy_or_top_box = top_bboxes[0]
          elif method == "image_context_only":
            image_top_bbox = top_bboxes[0]

        if visualize:
          print "Image id: %d" % image_id
          img_filename = '%s/%s' % (self.dataset.image_root, self.dataset.loadImgs(image_id)[0]['file_name'])
          im = mpimg.imread(img_filename)

          if noisy_or_top_box:
            plt.figure(1)
            plt.cla()
            plt.imshow(im)
            plt.title(ann['refexp'])
            top_box = noisy_or_top_box
            top_box_ind = bboxes.index(top_box)
            plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]),top_box[2], top_box[3],
                                              fill=False, edgecolor='b', linewidth=6))
            top_context_box_ind = np.argmax(all_stats_p_word[top_box_ind])
            top_context_box = context_bboxes[top_box_ind][top_context_box_ind]
            plt.gca().add_patch(plt.Rectangle((top_context_box[0], top_context_box[1]),top_context_box[2],
                                              top_context_box[3], fill=False, edgecolor='b', linewidth=6,
                                              linestyle='dashed'))
            plt.axis('off')

          if image_top_bbox:
            plt.figure(2)
            plt.cla()
            plt.imshow(im)
            plt.title(ann['refexp'])
            top_box = image_top_bbox
            plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]),top_box[2], top_box[3],
                                              fill=False, edgecolor='b', linewidth=6))
            plt.axis('off')

          plt.figure(3)
          plt.cla()
          plt.imshow(im)
          plt.title(ann['refexp'])
          plt.gca().add_patch(plt.Rectangle((gt_box[0], gt_box[1]),gt_box[2], gt_box[3],
                                            fill=False, edgecolor='g', linewidth=6))
          plt.axis('off')

          while True:
            sys.stdout.write('Do you want to save? (y/n): ')
            choice = raw_input().lower()
            if choice.startswith('y'):
              plt.figure(1)
              plt.savefig('%s/%d_nor.png' % (experiment_paths.coco_path, image_id),bbox_inches='tight')
              plt.figure(2)
              plt.savefig('%s/%d_image_context.png' % (experiment_paths.coco_path, image_id),bbox_inches='tight')
              plt.figure(3)
              plt.savefig('%s/%d_gt.png' % (experiment_paths.coco_path, image_id),bbox_inches='tight')
              break
            elif choice.startswith('n'):
              break

          ipdb.set_trace()

      sys.stdout.write("\rDone with %d/%d images" % (i+1,num_images))
      sys.stdout.flush()

    sys.stdout.write("\n")
    h5file.close()
    return results
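The 'noisy_or' branch above folds the per-context word probabilities of each candidate box into a single score via 1 - prod(1 - p), always keeping the whole-image context in the last column. A hedged NumPy sketch of just that combination (array contents and names are illustrative):

import numpy as np

def noisy_or_scores(p_word, num_context_objs):
    # p_word: one row per candidate box, one column per context region;
    # the last column is the whole-image context and is always kept.
    ctx = -np.sort(-p_word[:, :num_context_objs], axis=1)   # strongest contexts first
    kept = np.hstack((ctx, p_word[:, -1:]))                 # append image context
    return 1.0 - np.prod(1.0 - kept, axis=1)

p = np.array([[0.20, 0.10, 0.60],
              [0.05, 0.30, 0.40]])
print(noisy_or_scores(p, num_context_objs=2))   # approx. [0.712, 0.601]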
Example #5
  def get_streams(self):
    ((image_filename, image_id), object_id_list, line) = self.image_refexp_pairs[self.index]
    if image_id in self.dataset.imgs_with_errors:
      line = EOS_IDENTIFIER

    stream = get_encoded_line(line, self.vocabulary)
    # Assumes stream has EOS word at the end
    assert (stream[-1] == self.vocabulary[EOS_IDENTIFIER])
    stream = stream[:-1]
    filtered_stream = []
    for word in stream:
      if word != self.vocabulary[UNK_IDENTIFIER]:
        filtered_stream.append(word)
    stream = filtered_stream
    if self.truncate and len(stream) >= self.max_words:
      stream = stream[:self.max_words-1]
      self.num_truncates += 1

    object_id = object_id_list[1]
    object_ann = self.dataset.loadAnns(object_id)[0]
    object_category = self.dataset.loadCats(object_ann['category_id'])[0]['name']
    object_bbox = self.dataset.loadAnns(object_id)[0]['bbox']
    context_anns_of_same_category = []
    context_anns_of_diff_category = []
    if hasattr(self.dataset, 'coco'):
      all_anns = self.dataset.coco.imgToAnns[image_id]
    else:
      all_anns = self.dataset.imgToAnns[image_id]
    for ann in all_anns:
      if ann['id'] != object_id:
        if ann['category_id'] == object_ann['category_id']:
          context_anns_of_same_category.append(ann)
        else:
          context_anns_of_diff_category.append(ann)

    neg_anns_of_same_category = []
    neg_anns_of_diff_category = []
    if self.neg_proposal_source != 'gt':
      image_info = self.dataset.loadImgs(image_id)[0]
      all_anns = image_info['region_candidates']
      for ann in all_anns:
        ann['bbox'] = ann['bounding_box']
        ann_box = ann['bbox']
        iou = iou_bboxes(ann_box, object_bbox)
        if iou < 0.5 and ann['predicted_object_name'] == object_category:
          neg_anns_of_same_category.append(ann)
        elif ann['predicted_object_name'] != object_category:
          neg_anns_of_diff_category.append(ann)
    else:
      neg_anns_of_same_category = context_anns_of_same_category
      neg_anns_of_diff_category = context_anns_of_diff_category

    # subtract one because image is reserved as one context region
    if len(context_anns_of_same_category) > self.max_num_context-1:
      rand_sample = sorted(random.sample(range(len(context_anns_of_same_category)), self.max_num_context - 1))
      context_anns_of_same_category = [context_anns_of_same_category[idx] for idx in rand_sample]
    elif len(context_anns_of_same_category) < self.max_num_context-1:
      rand_sample = sorted(random.sample(range(len(context_anns_of_diff_category)),
                                         min(self.max_num_context - 1 - len(context_anns_of_same_category),
                                             len(context_anns_of_diff_category))))
      context_anns_of_same_category += [context_anns_of_diff_category[idx] for idx in rand_sample]

    if len(neg_anns_of_same_category) > self.max_num_negatives:
      rand_sample = sorted(random.sample(range(len(neg_anns_of_same_category)),self.max_num_negatives))
      neg_anns_of_same_category = [neg_anns_of_same_category[idx] for idx in rand_sample]
    elif len(neg_anns_of_same_category) < self.max_num_negatives:
      rand_sample = sorted(random.sample(range(len(neg_anns_of_diff_category)),
                                         min(self.max_num_negatives-len(neg_anns_of_same_category),
                                             len(neg_anns_of_diff_category))))
      neg_anns_of_same_category += [neg_anns_of_diff_category[idx] for idx in rand_sample]

      # If we are running short of proposal negatives, sample from gt negatives
      if len(neg_anns_of_same_category) < self.max_num_negatives and self.neg_proposal_source != 'gt':
        rand_sample = sorted(random.sample(range(len(context_anns_of_diff_category)),
                                           min(self.max_num_negatives-len(neg_anns_of_same_category),
                                               len(context_anns_of_diff_category))))
        neg_anns_of_same_category += [context_anns_of_diff_category[idx] for idx in rand_sample]

    pad = self.max_words - (len(stream) + 1) if self.pad else 0
    if pad > 0:
      self.num_pads += 1

    out = {}
    timestep_input = np.asarray([[self.vocabulary[EOS_IDENTIFIER]] + stream + [-1] * pad], np.float16)
    out['timestep_input'] = np.tile(timestep_input.T, (1,self.max_num_context))
    timestep_cont = np.asarray([[0] + [1] * len(stream) + [0] * pad], np.float16)
    out['timestep_cont'] = np.tile(timestep_cont.T, (1,self.max_num_context))
    timestep_target = np.asarray(stream + [self.vocabulary[EOS_IDENTIFIER]] + [-1] * pad, np.float16)
    out['timestep_target'] = timestep_target
    self.swap_axis_streams.add('timestep_input')
    self.swap_axis_streams.add('timestep_target')
    self.swap_axis_streams.add('timestep_cont')

    # Write image features to batch
    img_info = self.dataset.loadImgs(image_id)[0]
    img_wd = float(img_info['width'])
    img_ht = float(img_info['height'])
    assert(len(object_id_list) <= 2)
    fc7_img = self.dataset.image_features[str((image_id, [0, 0, int(img_wd - 1), int(img_ht - 1)]))][0]
    out['fc7_img'] = np.tile(fc7_img, (self.max_num_context, 1))
    img_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
    img_bbox_features[:] = [0,0,1,1,1]
    out['img_bbox_features'] = img_bbox_features

    # Write object region features to batch
    object_bbox = self.dataset.loadAnns(object_id)[0]['bbox']
    fc7_obj = self.dataset.image_features[str((image_id, object_bbox))][0]
    out['fc7_obj'] = np.tile(fc7_obj, (self.max_num_context, 1))

    bbox_area_ratio = (object_bbox[2] * object_bbox[3]) / (img_wd * img_ht)
    bbox_x1y1x2y2 = [object_bbox[0] / img_wd, object_bbox[1] / img_ht,
                     (object_bbox[0] + object_bbox[2]) / img_wd, (object_bbox[1] + object_bbox[3]) / img_ht]
    bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
    out['bbox_features'] = np.tile(bbox_features, (self.max_num_context, 1))

    # Write context features to batch
    context_fc7 = np.tile(fc7_img, (self.max_num_context, 1))
    context_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
    context_bbox_features[:] = [0,0,1,1,1]
    if len(context_anns_of_same_category) > 0:
      other_bboxes = [ann['bbox'] for ann in context_anns_of_same_category]
      for idx, other_bbox in enumerate(other_bboxes):
        other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
        other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                               (other_bbox[0] + other_bbox[2]) / img_wd, (other_bbox[1] + other_bbox[3]) / img_ht]
        other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
        context_fc7[idx,:] = self.dataset.image_features[str((image_id, other_bbox))][0]
        context_bbox_features[idx,:] = other_bbox_features
    out['context_fc7'] = context_fc7
    out['context_bbox_features'] = context_bbox_features

    # Write negative features to batch
    negative_fc7 = np.zeros((self.max_num_negatives, self.dataset.image_feature_length),np.float16)
    negative_bbox_features = np.zeros((self.max_num_negatives, 5),np.float16)
    if len(neg_anns_of_same_category) > 0:
      other_bboxes = [ann['bbox'] for ann in neg_anns_of_same_category]
      for idx, other_bbox in enumerate(other_bboxes):
        other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
        other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                               (other_bbox[0] + other_bbox[2]) / img_wd, (other_bbox[1] + other_bbox[3]) / img_ht]
        other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
        negative_fc7[idx,:] = self.dataset.image_features[str((image_id, other_bbox))][0]
        negative_bbox_features[idx,:] = other_bbox_features
    out['negative_fc7'] = negative_fc7
    out['negative_bbox_features'] = negative_bbox_features

    pairwise_similarity = np.asarray([[0] * self.max_num_negatives], np.float16)
    out['pairwise_similarity'] = np.tile(pairwise_similarity, (self.max_words,1))
    self.swap_axis_streams.add('pairwise_similarity')

    self.num_outs += 1
    self.next_line()
    return out
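Example #5 filters proposal negatives with iou_bboxes, which is not shown on this page. For reference, a generic IoU for [x, y, w, h] boxes could look like the sketch below; it is not necessarily the helper used in the original project:

def iou_xywh(a, b):
    # Intersection-over-union of two [x, y, w, h] boxes.
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0

print(iou_xywh([0, 0, 10, 10], [5, 5, 10, 10]))   # 25 / 175 = 0.1428...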