def preprocess_dataset(split_name, data_split_name, batch_stream_length, aligned=False):
    """Preprocess a COCO-style split into basis (pool-feature) and text HDF5 batches.

    Selects per-split image-id / feature / sentence / label files, streams
    them through ``fc7SequenceGenerator`` and ``TextSequenceGenerator``, and
    writes exhaustive HDF5 batches plus file lists for each.

    Args:
        split_name: Split selector; must be 'train' or 'valid', otherwise
            an error is printed and the function returns early.
        data_split_name: Underlying data split name (unused in this variant).
        batch_stream_length: Batch stream length assigned to both generators.
        aligned: Currently ignored — unconditionally forced to True below.
            # NOTE(review): parameter is overridden; confirm intent.
    """
    if split_name == 'train':
        imgid_file = IMAGEID_FILE_PATTERN.format('train2014')
        feat_files = [POOLFEAT_FILE_PATTERN.format('train2014')]
        sent_files = [SENTS_FILE_PATTERN.format('train2014')]
        label_file = [LABEL_FILE_PATTERN.format('train2014')]
    elif split_name == 'valid':
        imgid_file = IMAGEID_FILE_PATTERN.format('val2014')
        feat_files = [
            POOLFEAT_FILE_PATTERN.format('train2014'),
            POOLFEAT_FILE_PATTERN.format('mytest'),
        ]
        sent_files = [
            SENTS_FILE_PATTERN.format('trainvallstm'),
            SENTS_FILE_PATTERN.format('mytest'),
        ]
        label_file = [
            LABEL_FILE_PATTERN.format('vallstm2014'),
            LABEL_FILE_PATTERN.format('mytest'),
        ]
    else:
        # Bug fix: the condition tests split_name, so report split_name
        # (the original printed data_split_name, the wrong variable).
        # Also converted from a Python 2 print statement to print().
        print('Error. Invalid split_name: %s' % split_name)
        return
    filenames = [(imgid_file, feat_files, sent_files, label_file)]
    vocab_filename = VOCAB
    output_basis_path = OUTPUT_BASIS_DIR_PATTERN % split_name
    # NOTE(review): the 'aligned' keyword argument is clobbered here;
    # confirm whether the caller's value should be honored instead.
    aligned = True
    fsg = fc7SequenceGenerator(filenames, vocab_filename, VIDEO_STREAMS,
                               max_frames=MAX_FRAMES, align=aligned,
                               shuffle=True, pad=aligned, truncate=aligned)
    fsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(fsg, output_dir=output_basis_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Second pass: build the text (sentence) sequence stream from the
    # lines produced by the basis generator above.
    output_text_path = OUTPUT_TEXT_DIR_PATTERN % split_name
    fsg_lines = fsg.lines
    tsg = TextSequenceGenerator(fsg_lines, vocab_filename, BUFFER_SIZE,
                                max_words=MAX_WORDS, pad=aligned,
                                truncate=aligned)
    tsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(tsg, output_dir=output_text_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Warn (don't fail) if the vocabulary file was never materialized.
    if not os.path.isfile(vocab_filename):
        print("Vocabulary not found")
def preprocess_dataset(split_name, data_split_name, batch_stream_length, aligned=False, reverse=False):
    """Stream one data split into HDF5 frame-sequence batches and dump
    the video-id order and recurrent frame-sequence text files.

    Args:
        split_name: Name used to format the HDF5 output directory.
        data_split_name: Name used to locate frame-feature/sentence inputs
            and to format the corpus output paths.
        batch_stream_length: Batch stream length assigned to the generator.
        aligned: Ignored — forced to True below (matches original behavior).
        reverse: Passed through to the frame-sequence generator.
    """
    frame_feat_file = FRAMEFEAT_FILE_PATTERN.format(data_split_name)
    sentence_file = SENTS_FILE_PATTERN.format(data_split_name)
    filenames = [(frame_feat_file, sentence_file)]
    vocab_filename = VOCAB
    output_path = OUTPUT_DIR_PATTERN % split_name
    aligned = True  # unconditionally forced on, as in the original
    seq_gen = fc7FrameSequenceGenerator(
        filenames, BUFFER_SIZE, vocab_filename, max_words=MAX_WORDS,
        align=aligned, shuffle=True, pad=aligned, truncate=aligned,
        reverse=reverse)
    seq_gen.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(seq_gen, output_dir=output_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Warn (don't fail) if the vocabulary file is missing on disk.
    if not os.path.isfile(vocab_filename):
        print("Vocabulary not found")
    out_path = OUT_CORPUS_PATH.format(data_split_name)
    vid_id_order_outpath = '%s/yt_s2vtvgg_%s_vidid_order_%d_%d.txt' % \
        (out_path, data_split_name, BUFFER_SIZE, MAX_WORDS)
    frame_sequence_outpath = '%s/yt_s2vtvgg_%s_sequence_%d_%d_recurrent.txt' % \
        (out_path, data_split_name, BUFFER_SIZE, MAX_WORDS)
    seq_gen.dump_video_file(vid_id_order_outpath, frame_sequence_outpath)
def preprocess_dataset(split_name, data_split_name, batch_stream_length, aligned=False):
    """Preprocess one data split into basis (pool-feature) HDF5 batches.

    Streams image-id / pool-feature / label files through
    ``fc7SequenceGenerator`` and writes exhaustive HDF5 batches and
    file lists to the split's basis output directory.

    Args:
        split_name: Name used to format the basis output directory.
        data_split_name: Name used to locate the pool-feature input file.
        batch_stream_length: Batch stream length assigned to the generator.
        aligned: Currently ignored — unconditionally forced to True below.
            # NOTE(review): parameter is overridden; confirm intent.
    """
    filenames = [(IMAGEID_FILE_PATTERN,
                  POOLFEAT_FILE_PATTERN.format(data_split_name),
                  LABEL_FILE_PATTERN)]
    vocab_filename = VOCAB
    output_basis_path = OUTPUT_BASIS_DIR_PATTERN % split_name
    # NOTE(review): the 'aligned' keyword argument is clobbered here;
    # confirm whether the caller's value should be honored instead.
    aligned = True
    fsg = fc7SequenceGenerator(filenames, vocab_filename, VIDEO_STREAMS,
                               max_frames=MAX_FRAMES, align=aligned,
                               shuffle=True, pad=aligned, truncate=aligned)
    fsg.batch_stream_length = batch_stream_length
    writer = HDF5SequenceWriter(fsg, output_dir=output_basis_path)
    writer.write_to_exhaustion()
    writer.write_filelists()
    # Bug fix: Python 2 print statement converted to print() so the
    # module parses under Python 3.
    if not os.path.isfile(vocab_filename):
        print("Vocabulary not found")