コード例 #1
0
def preprocess_dataset(split_name,
                       data_split_name,
                       batch_stream_length,
                       aligned=False):
    """Write feature and text sequence batches for one dataset split.

    Builds an ``fc7SequenceGenerator`` over the split's image-id, pooled
    feature, sentence and label files and drains it through an
    ``HDF5SequenceWriter``. The generator's ``lines`` are then fed to a
    ``TextSequenceGenerator`` which is written out the same way into a
    second output directory.

    Args:
        split_name: Either 'train' or 'valid'. Selects the input files and
            is substituted into both output directory patterns.
        data_split_name: Not used by this function; kept so existing
            callers do not need to change.
        batch_stream_length: Assigned to ``batch_stream_length`` on both
            generators before they are written.
        aligned: Ignored. The value is overwritten with True before the
            generators are built.

    Returns:
        None. For an unrecognised ``split_name`` an error is printed and the
        function returns before anything is written.
    """
    if split_name == 'train':
        imgid_file = IMAGEID_FILE_PATTERN.format('train2014')
        feat_files = [POOLFEAT_FILE_PATTERN.format('train2014')]
        sent_files = [SENTS_FILE_PATTERN.format('train2014')]
        label_files = [LABEL_FILE_PATTERN.format('train2014')]
    elif split_name == 'valid':
        # NOTE(review): this branch pairs the val2014 image-id file with
        # train2014 pooled features and 'trainvallstm' sentences -- confirm
        # that mixing train and val inputs here is intended.
        imgid_file = IMAGEID_FILE_PATTERN.format('val2014')
        feat_files = [
            POOLFEAT_FILE_PATTERN.format('train2014'),
            POOLFEAT_FILE_PATTERN.format('mytest')
        ]
        sent_files = [
            SENTS_FILE_PATTERN.format('trainvallstm'),
            SENTS_FILE_PATTERN.format('mytest')
        ]
        label_files = [
            LABEL_FILE_PATTERN.format('vallstm2014'),
            LABEL_FILE_PATTERN.format('mytest')
        ]
    else:
        # Report the argument that was actually tested above; the previous
        # message named and printed data_split_name instead.
        print('Error. Invalid split_name: %s' % split_name)
        return
    filenames = [(imgid_file, feat_files, sent_files, label_files)]

    vocab_filename = VOCAB
    output_basis_path = OUTPUT_BASIS_DIR_PATTERN % split_name
    # The caller-supplied value is deliberately discarded: alignment,
    # padding and truncation are always enabled.
    aligned = True
    fsg = fc7SequenceGenerator(filenames,
                               vocab_filename,
                               VIDEO_STREAMS,
                               max_frames=MAX_FRAMES,
                               align=aligned,
                               shuffle=True,
                               pad=aligned,
                               truncate=aligned)
    fsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(fsg, output_dir=output_basis_path)
    writer.write_to_exhaustion()
    writer.write_filelists()

    # Second pass: the text generator consumes the lines exposed by the
    # feature generator and is written to its own directory.
    output_text_path = OUTPUT_TEXT_DIR_PATTERN % split_name
    fsg_lines = fsg.lines
    tsg = TextSequenceGenerator(fsg_lines,
                                vocab_filename,
                                BUFFER_SIZE,
                                max_words=MAX_WORDS,
                                pad=aligned,
                                truncate=aligned)
    tsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(tsg, output_dir=output_text_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Checked only after both passes; a missing file is reported, nothing
    # else is done about it here.
    if not os.path.isfile(vocab_filename):
        print("Vocabulary not found")
コード例 #2
0
def preprocess_dataset(split_name,
                       data_split_name,
                       batch_stream_length,
                       aligned=False,
                       reverse=False):
    """Write frame-feature sequence batches and video listings for a split.

    Builds an ``fc7FrameSequenceGenerator`` over the frame-feature and
    sentence files named after ``data_split_name``, drains it through an
    ``HDF5SequenceWriter``, and finally asks the generator to dump its video
    id order and frame sequence files.

    Args:
        split_name: Substituted into ``OUTPUT_DIR_PATTERN`` to form the
            writer's output directory.
        data_split_name: Substituted into the input file patterns, the
            corpus output path and the dumped file names.
        batch_stream_length: Assigned to the generator's
            ``batch_stream_length`` before writing.
        aligned: Ignored. The value is overwritten with True before the
            generator is built.
        reverse: Forwarded unchanged to the generator.

    Returns:
        None.
    """
    feature_path = FRAMEFEAT_FILE_PATTERN.format(data_split_name)
    sentence_path = SENTS_FILE_PATTERN.format(data_split_name)
    input_pairs = [(feature_path, sentence_path)]
    vocab_path = VOCAB
    hdf5_dir = OUTPUT_DIR_PATTERN % split_name

    # The caller-supplied value is discarded: alignment, padding and
    # truncation are always switched on.
    aligned = True
    generator = fc7FrameSequenceGenerator(
        input_pairs,
        BUFFER_SIZE,
        vocab_path,
        max_words=MAX_WORDS,
        align=aligned,
        shuffle=True,
        pad=aligned,
        truncate=aligned,
        reverse=reverse,
    )
    generator.batch_stream_length = batch_stream_length

    hdf5_writer = HDF5SequenceWriter(generator, output_dir=hdf5_dir)
    hdf5_writer.write_to_exhaustion()
    hdf5_writer.write_filelists()

    # Only reported; no vocabulary is written out from here.
    vocab_missing = not os.path.isfile(vocab_path)
    if vocab_missing:
        print("Vocabulary not found")

    # Both dumped files share the same directory and naming fields.
    corpus_dir = OUT_CORPUS_PATH.format(data_split_name)
    name_fields = (corpus_dir, data_split_name, BUFFER_SIZE, MAX_WORDS)
    video_order_path = (
        '%s/yt_s2vtvgg_%s_vidid_order_%d_%d.txt' % name_fields)
    sequence_path = (
        '%s/yt_s2vtvgg_%s_sequence_%d_%d_recurrent.txt' % name_fields)
    generator.dump_video_file(video_order_path, sequence_path)
コード例 #3
0
def preprocess_dataset(split_name,
                       data_split_name,
                       batch_stream_length,
                       aligned=False):
    """Write feature sequence batches for one dataset split.

    Builds an ``fc7SequenceGenerator`` over the image-id file, the pooled
    feature file named after ``data_split_name`` and the label file, then
    drains it through an ``HDF5SequenceWriter``.

    Args:
        split_name: Substituted into ``OUTPUT_BASIS_DIR_PATTERN`` to form
            the writer's output directory.
        data_split_name: Substituted into ``POOLFEAT_FILE_PATTERN`` to pick
            the feature file.
        batch_stream_length: Assigned to the generator's
            ``batch_stream_length`` before writing.
        aligned: Ignored. The value is overwritten with True before the
            generator is built.

    Returns:
        None.
    """
    # IMAGEID_FILE_PATTERN and LABEL_FILE_PATTERN are passed through as-is;
    # only the pooled-feature pattern is formatted with the split name.
    filenames = [(IMAGEID_FILE_PATTERN,
                  POOLFEAT_FILE_PATTERN.format(data_split_name),
                  LABEL_FILE_PATTERN)]
    vocab_filename = VOCAB
    output_basis_path = OUTPUT_BASIS_DIR_PATTERN % split_name
    # The caller-supplied value is deliberately discarded: alignment,
    # padding and truncation are always enabled.
    aligned = True
    fsg = fc7SequenceGenerator(filenames,
                               vocab_filename,
                               VIDEO_STREAMS,
                               max_frames=MAX_FRAMES,
                               align=aligned,
                               shuffle=True,
                               pad=aligned,
                               truncate=aligned)
    fsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(fsg, output_dir=output_basis_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Single-argument call form prints the same text on Python 2 and also
    # parses on Python 3, unlike the print statement it replaces.
    if not os.path.isfile(vocab_filename):
        print("Vocabulary not found")