Example #1
def preprocess_dataset(split_name, data_split_name, batch_stream_length, aligned=False):
    filenames = [(POOLFEAT_FILE_PATTERN.format(data_split_name),
                  SENTS_FILE_PATTERN.format(data_split_name))]
    vocab_filename = VOCAB
    output_basis_path = OUTPUT_BASIS_DIR_PATTERN % split_name
    aligned = True  # NOTE: unconditionally overrides the aligned argument from the signature
    fsg = fc7SequenceGenerator(filenames, data_split_name, VIDEO_STREAMS,
                               max_frames=MAX_FRAMES, align=aligned, shuffle=True, pad=aligned,
                               truncate=aligned)
    fsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(fsg, output_dir=output_basis_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    output_text_path = OUTPUT_TEXT_DIR_PATTERN % split_name
    fsg_lines = fsg.lines
    tsg = TextSequenceGenerator(fsg_lines, vocab_filename, BUFFER_SIZE,
                                max_words=MAX_WORDS, pad=aligned, truncate=aligned)
    tsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(tsg, output_dir=output_text_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # if not os.path.isfile(vocab_filename):
    #     fsg.dump_vocabulary(vocab_filename)
    out_path = OUT_CORPUS_PATH.format(data_split_name)
    vid_id_order_outpath = '%s/yt_pool_%s_vidid_order_%d_%d.txt' % \
                           (out_path, data_split_name, BUFFER_SIZE, MAX_WORDS)
    frame_sequence_outpath = '%s/yt_prepool_%s_sequence_%d_%d_recurrent.txt' % \
                             (out_path, data_split_name, BUFFER_SIZE, MAX_WORDS)
    fsg.dump_video_file(vid_id_order_outpath, frame_sequence_outpath)
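A minimal driver sketch for the example above, assuming the module-level constants (POOLFEAT_FILE_PATTERN, SENTS_FILE_PATTERN, VOCAB, VIDEO_STREAMS, and the output-dir patterns) are defined elsewhere in the script; the split names and stream length below are placeholders, not from the source.

# Hypothetical usage; BATCH_STREAM_LENGTH is an assumed constant.
BATCH_STREAM_LENGTH = 100000
for split_name, data_split_name in [('train', 'train'), ('valid', 'val')]:
    preprocess_dataset(split_name, data_split_name, BATCH_STREAM_LENGTH)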
Example #2
def preprocess_dataset(split_name, coco_split_name, batch_stream_length,
                       vocab=None, aligned=True):
  with open(SPLITS_PATTERN % split_name, 'r') as split_file:
    split_image_ids = [int(line) for line in split_file.readlines()]
  output_dataset_name = split_name
  if aligned:
    output_dataset_name += '_aligned_%d' % MAX_WORDS
  else:
    output_dataset_name += '_unaligned'
  output_path = OUTPUT_DIR_PATTERN % output_dataset_name
  coco = COCO(COCO_ANNO_PATH % coco_split_name)
  sg = CocoSequenceGenerator(coco, BUFFER_SIZE, split_ids=split_image_ids,
      vocab=vocab, align=aligned, pad=aligned, truncate=aligned)
  sg.batch_stream_length = batch_stream_length
  writer = HDF5SequenceWriter(sg, output_dir=output_path)
  writer.write_to_exhaustion()
  writer.write_filelists()
  if vocab is None:
    vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR
    sg.dump_vocabulary(vocab_out_path)
  image_out_path = '%s/image_list.txt' % output_path
  image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path
  sg.dump_image_file(image_out_path, image_dummy_labels_out_path)
  num_outs = sg.num_outs
  num_pads = sg.num_pads
  num_truncates = sg.num_truncates
  print 'Padded %d/%d sequences; truncated %d/%d sequences' % \
      (num_pads, num_outs, num_truncates, num_outs)
  return sg.vocabulary_inverted
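This variant builds the vocabulary on the first (train) pass and returns vocabulary_inverted so later splits can reuse it. A hedged driver sketch; the split and COCO split names are assumptions:

BATCH_STREAM_LENGTH = 100000  # assumed value
vocab = preprocess_dataset('train', 'train2014', BATCH_STREAM_LENGTH)
preprocess_dataset('val', 'val2014', BATCH_STREAM_LENGTH, vocab=vocab)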
Example #3
def process_dataset(experiment_paths,
                    experiment_config,
                    pre_existing_file=None):
    vocab = None
    split_names = ['train', 'val']
    if hasattr(experiment_config, 'debug') and experiment_config.debug:
        split_names = ['train_debug']
    for split_name in split_names:
        if split_name == 'val' and vocab is None:
            raise StandardError(
                'Need a vocabulary constructed from train split')

        #TODO: Does saving and reading the dataset from a pickle file save time?
        if experiment_config.dataset == 'Google_RefExp':
            dataset = GoogleRefExp(split_name, experiment_paths)
        elif experiment_config.dataset == 'UNC_RefExp':
            dataset = UNCRefExp(split_name, experiment_paths)
        else:
            raise Exception('Unknown dataset: %s' % experiment_config.dataset)

        print "Processing dataset %s" % dataset.dataset_name
        if experiment_config.exp_name == 'baseline':
            sg = BaselineSequenceGenerator(experiment_paths,
                                           experiment_config,
                                           dataset,
                                           vocab=vocab,
                                           pre_existing_file=pre_existing_file)
        elif experiment_config.exp_name.startswith('max_margin'):
            sg = MaxMarginSequenceGenerator(
                experiment_paths,
                experiment_config,
                dataset,
                vocab=vocab,
                pre_existing_file=pre_existing_file)
        elif experiment_config.exp_name.startswith('mil_context'):
            sg = MILContextSequenceGenerator(
                experiment_paths,
                experiment_config,
                dataset,
                vocab=vocab,
                pre_existing_file=pre_existing_file)
        else:
            raise StandardError("Unknown experiment name %s" %
                                experiment_config.exp_name)
        output_dir = "%s/buffer_%d/%s_%s" % (
            experiment_paths.h5_data, experiment_config.train.batch_size,
            dataset.dataset_name, experiment_config.train.tag)
        writer = HDF5SequenceWriter(sg, output_dir=output_dir)
        writer.write_to_exhaustion()
        writer.write_filelists()
        sg.dataset.image_features.close()

        if vocab is None:
            sg.dump_vocabulary(experiment_config.vocab_file)
            vocab = sg.vocabulary_inverted

        print 'Padded %d/%d sequences; truncated %d/%d sequences' % \
              (sg.num_pads, sg.num_outs, sg.num_truncates, sg.num_outs)
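process_dataset only touches a handful of attributes on its two config arguments. The sketch below shows plausible stand-ins with exactly the attributes the function body reads; every name and value is invented for illustration, and the dataset and sequence-generator classes are assumed to be importable from the surrounding module.

class ExperimentPaths(object):
    h5_data = './h5_data'  # root for the HDF5 buffers (assumed layout)

class TrainConfig(object):
    batch_size = 16        # used in the buffer_%d output path
    tag = 'run0'           # used in the output dir name

class ExperimentConfig(object):
    dataset = 'Google_RefExp'            # or 'UNC_RefExp'
    exp_name = 'baseline'                # or 'max_margin*' / 'mil_context*'
    vocab_file = './data/vocabulary.txt'
    debug = False
    train = TrainConfig()

process_dataset(ExperimentPaths(), ExperimentConfig())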
Example #4
def process_dataset(split_name,
                    coco_split_name,
                    batch_stream_length,
                    vocab=None,
                    aligned=True):
    with open(SPLITS_PATTERN % split_name, 'r') as split_file:
        split_image_ids = [
            line.strip().replace('.jpg', '')
            for line in split_file.readlines()
        ]  # strip the trailing newline and drop the '.jpg' extension
    output_dataset_name = split_name
    if aligned:
        output_dataset_name += '_aligned_%d' % MAX_WORDS
    else:
        output_dataset_name += '_unaligned'
    output_path = OUTPUT_DIR_PATTERN % output_dataset_name
    coco = COCO()  # no annotation file given: constructs an empty COCO instance
    image_root = COCO_IMAGE_PATTERN % coco_split_name

    sg = CocoSequenceGenerator(coco,
                               split_name,
                               BUFFER_SIZE,
                               image_root,
                               split_ids=split_image_ids,
                               vocab=vocab,
                               align=aligned,
                               pad=aligned,
                               truncate=aligned)
    sg.batch_stream_length = batch_stream_length  # e.g. 100000

    writer = HDF5SequenceWriter(sg, output_dir=output_path)
    writer.write_to_exhaustion()  # drains the sequence generator into HDF5 batches
    writer.write_filelists()
    if vocab is None:
        vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR
        sg.dump_vocabulary(vocab_out_path)
    image_out_path = '%s/image_list.txt' % output_path
    image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path
    sg.dump_image_file(image_out_path, image_dummy_labels_out_path)
    num_outs = sg.num_outs
    num_pads = sg.num_pads
    num_truncates = sg.num_truncates
    print 'Padded %d/%d sequences; truncated %d/%d sequences' % \
        (num_pads, num_outs, num_truncates, num_outs)
    return sg.vocabulary_inverted
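Note that this variant expects the split file to list image file names rather than integer ids (the comprehension strips a '.jpg' suffix). A hypothetical split file could be produced as below; the file names follow the standard COCO naming scheme but are otherwise invented:

# Hypothetical: one image file name per line; '.jpg' is stripped on read.
with open('train_split.txt', 'w') as f:
    f.write('COCO_train2014_000000000009.jpg\n')
    f.write('COCO_train2014_000000000025.jpg\n')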
Example #5
def process_dataset(split_name,
                    coco_split_name,
                    batch_stream_length,
                    vocab=None,
                    aligned=True,
                    vocab_tag=''):
    with open(SPLITS_PATTERN % split_name, 'r') as split_file:
        split_image_ids = [int(line) for line in split_file.readlines()]
    output_dataset_name = split_name
    if aligned:
        output_dataset_name += '_aligned_%d' % MAX_WORDS
    else:
        output_dataset_name += '_unaligned'
    output_path = OUTPUT_DIR_PATTERN % output_dataset_name
    # COCO bundles annotations by train/test split; this variant keys the
    # annotation path by split_name (not coco_split_name) so a custom split
    # (e.g. trainval) can supply its own images. A caption JSON file must
    # therefore exist for that split.
    coco = COCO(COCO_ANNO_PATH % split_name)
    image_root = COCO_IMAGE_PATTERN % coco_split_name
    sg = CocoSequenceGenerator(coco,
                               BUFFER_SIZE,
                               image_root,
                               split_ids=split_image_ids,
                               vocab=vocab,
                               align=aligned,
                               pad=aligned,
                               truncate=aligned)
    if vocab is None:
        vocab_out_path = '%s/%svocabulary.txt' % (OUTPUT_DIR, vocab_tag)
        sg.dump_vocabulary(vocab_out_path)
    sg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(sg, output_dir=output_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    image_out_path = '%s/image_list.txt' % output_path
    image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path
    sg.dump_image_file(image_out_path, image_dummy_labels_out_path)
    num_outs = sg.num_outs
    num_pads = sg.num_pads
    num_truncates = sg.num_truncates
    print 'Padded %d/%d sequences; truncated %d/%d sequences' % \
        (num_pads, num_outs, num_truncates, num_outs)
    return sg.vocabulary_inverted
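The vocab_tag parameter lets two datasets share OUTPUT_DIR without overwriting each other's vocabulary files. A hedged usage sketch; the split names and tag are placeholders:

BATCH_STREAM_LENGTH = 100000  # assumed value
vocab = process_dataset('my_train', 'train2014', BATCH_STREAM_LENGTH,
                        vocab_tag='my_')  # dumps my_vocabulary.txt
process_dataset('my_val', 'val2014', BATCH_STREAM_LENGTH, vocab=vocab)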