def main(unused_argv):
  # Assumes module-level imports/definitions from the surrounding file:
  # tf, gfile, FLAGS, mylog, process_one_file.
  print("tensorflow version: %s" % tf.__version__)

  all_frame_files = gfile.Glob(FLAGS.input_data_pattern)
  f_fullpath = all_frame_files[FLAGS.file_from:FLAGS.file_to]
  f_fns = [x.split('/')[-1] for x in f_fullpath]

  # Skip inputs whose C-prefixed outputs already exist.
  exist_files = gfile.Glob(FLAGS.output_path + "C*tfrecord")
  exist_fn = [x.split('/')[-1].replace('CAtr', 'Atr') for x in exist_files]

  yet_2_split = [x for x, y in zip(f_fullpath, f_fns) if y not in exist_fn]
  vf = [FLAGS.output_path + 'C' + x.split('/')[-1] for x in yet_2_split]

  mylog('number of files suggested: %d' % len(f_fullpath))
  mylog('number of files yet to process: %d' % len(yet_2_split))

  if FLAGS.parallel:
    from concurrent import futures
    executor = futures.ProcessPoolExecutor(max_workers=2)
    executor.map(process_one_file, zip(yet_2_split, vf))
  else:
    for filenames in zip(yet_2_split, vf):
      #mylog('processing: {}'.format(filenames))
      process_one_file(filenames)

  mylog("done")
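# Note (assumption, not in the original): Executor.map returns a lazy result
# iterator, so exceptions raised inside process_one_file are silently dropped
# in the parallel branch above. A minimal sketch of a safer variant, assuming
# process_one_file is a picklable module-level function:
def run_parallel(pairs, max_workers=2):
  from concurrent import futures
  with futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
    for _ in executor.map(process_one_file, pairs):
      pass  # consuming each result re-raises any exception from its worker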
def main(unused_argv):
  logging.set_verbosity(tf.logging.ERROR)
  print("tensorflow version: %s" % tf.__version__)

  all_frame_files = gfile.Glob(FLAGS.input_data_pattern)
  f_fullpath = all_frame_files[FLAGS.file_from:FLAGS.file_to]
  f_fns = [x.split('/')[-1] for x in f_fullpath]

  # Skip inputs whose E-prefixed outputs already exist; strip the prefix
  # back off the train/validate/test filenames for the comparison.
  exist_files = gfile.Glob(FLAGS.output_path + "E*tfrecord")
  exist_fn = [x.split('/')[-1].replace('Etr', 'tr') for x in exist_files]
  exist_fn = [x.split('/')[-1].replace('Eval', 'val') for x in exist_fn]
  exist_fn = [x.split('/')[-1].replace('Etes', 'tes') for x in exist_fn]

  yet_2_split = [x for x, y in zip(f_fullpath, f_fns) if y not in exist_fn]
  # Three outputs per input: O = original, E = first half, F = second half.
  vf0 = [FLAGS.output_path + 'O' + x.split('/')[-1] for x in yet_2_split]
  vf1 = [FLAGS.output_path + 'E' + x.split('/')[-1] for x in yet_2_split]
  vf2 = [FLAGS.output_path + 'F' + x.split('/')[-1] for x in yet_2_split]

  mylog('number of files suggested: %d' % len(f_fullpath))
  mylog('number of files yet to process: %d' % len(yet_2_split))

  #with tf.device("/gpu:0"):
  with tf.Graph().as_default():
    build_graph()
    split_files(zip(yet_2_split, vf0, vf1, vf2))

  mylog("done")
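# Sketch (assumption, not the original build_graph): the collections read back
# in split_files() suggest a graph that parses one serialized frame-level
# SequenceExample, assuming the YouTube-8M schema (quantized uint8 "rgb" and
# "audio" frame features, "video_id" and "labels" in the context).
def build_graph_sketch():
  seq_example_bytes = tf.placeholder(tf.string)
  contexts, features = tf.parse_single_sequence_example(
      seq_example_bytes,
      context_features={
          "video_id": tf.FixedLenFeature([], tf.string),
          "labels": tf.VarLenFeature(tf.int64)},
      sequence_features={
          "rgb": tf.FixedLenSequenceFeature([], tf.string),
          "audio": tf.FixedLenSequenceFeature([], tf.string)})
  # Each frame is a bytes string holding the quantized feature vector.
  rgb_tsr = tf.reshape(tf.decode_raw(features["rgb"], tf.uint8), [-1, 1024])
  audio_tsr = tf.reshape(tf.decode_raw(features["audio"], tf.uint8), [-1, 128])
  tf.add_to_collection("vid_tsr", contexts["video_id"])
  tf.add_to_collection("labs_tsr",
                       tf.sparse_tensor_to_dense(contexts["labels"]))
  tf.add_to_collection("rgb_tsr", rgb_tsr)
  tf.add_to_collection("audio_tsr", audio_tsr)
  tf.add_to_collection("seq_example_bytes", seq_example_bytes)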
def pick_features_from_file(input_fn, out_fn, feats=None):
  start_time = time.time()
  opts = tf.python_io.TFRecordOptions(
      tf.python_io.TFRecordCompressionType.ZLIB)
  ex_iter = tf.python_io.tf_record_iterator(input_fn, options=opts)

  num_examples = 0
  with tf.python_io.TFRecordWriter(out_fn, options=opts) as tfwriter:
    out_examples = []
    # Two loops, to keep the read and write phases separate.
    for input_bytes in ex_iter:
      input_example = tf.train.Example.FromString(input_bytes)
      out_examples.append(select_features_from_tfexample(input_example, feats))
    for example in out_examples:
      tfwriter.write(example.SerializeToString())
      num_examples += 1

  seconds_per_file = time.time() - start_time
  num_examples_per_sec = num_examples / seconds_per_file
  mylog("Processed in {:.0f} sec: {}, Examples: {}, Examples/second: {:.0f}."
        .format(seconds_per_file, input_fn, num_examples, num_examples_per_sec))
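# Sketch (assumption, not the original helper): select_features_from_tfexample
# is called above but not defined in this section. Copying only the requested
# feature keys into a fresh Example would look like this; feats=None keeps
# every feature.
def select_features_from_tfexample_sketch(input_example, feats=None):
  out_example = tf.train.Example()
  for name, feature in input_example.features.feature.items():
    if feats is None or name in feats:
      out_example.features.feature[name].CopyFrom(feature)
  return out_example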
]
trfiles = [
    "YouTube.Kaggle/input/frame_level_link/train/train" + x + ".tfrecord"
    for x in trfs
]

# Only the last glob takes effect; the earlier assignments are overwritten.
allFiles = glob.glob(
    "YouTube.Kaggle/input/frame_level_link/validate/*.tfrecord")
allFiles = glob.glob("YouTube.Kaggle/input/video_level/test/*.tfrecord")
allFiles = glob.glob("YouTube.Kaggle/input/GENERATED_DATA/f2test/*.tfrecord")
allFiles = glob.glob(
    "YouTube.Kaggle/input/GENERATED_DATA/f2test/Atest-a.tfrecord")

files = allFiles[0:len(allFiles)]
#files = trfiles
mylog("Number of files to review: {}.".format(len(files)))

file_cnt = 0
i = 0
opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

num_examples_file = open('/tmp/num_examples_by_file.csv', 'w')
num_examples_file.write('filename,num_examples\n')

for filename in files:
  file_cnt = file_cnt + 1
  #print('checking %d/%d %s' % (cnt, filesSize, filename))
  if file_cnt % 200 == 0:
    mylog("Checked {} files.".format(file_cnt))
  num_examples = 0
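  # Plausible completion (assumption; the original loop body is truncated in
  # this excerpt): count the records in each file and emit one CSV row,
  # matching the 'filename,num_examples' header written above.
  for _ in tf.python_io.tf_record_iterator(filename, options=opts):
    num_examples += 1
  num_examples_file.write('{},{}\n'.format(filename, num_examples))

num_examples_file.close()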
def split_files(filenames):
  t0 = time.time()
  with tf.Session() as sess:
    # Tensors registered by build_graph().
    vid_tsr = tf.get_collection("vid_tsr")[0]
    labs_tsr = tf.get_collection("labs_tsr")[0]
    rgb_tsr = tf.get_collection("rgb_tsr")[0]
    audio_tsr = tf.get_collection("audio_tsr")[0]
    seq_example_bytes = tf.get_collection("seq_example_bytes")[0]

    for k, file_grp in enumerate(filenames):
      start_time = time.time()
      infn, outfn0, outfn1, outfn2 = file_grp
      ex_iter = tf.python_io.tf_record_iterator(infn)

      v0Examples = []
      v1Examples = []
      v2Examples = []
      for in_ex in ex_iter:
        vid, labs, rgb, audio = sess.run(
            [vid_tsr, labs_tsr, rgb_tsr, audio_tsr],
            feed_dict={seq_example_bytes: in_ex})
        nframes = audio.shape[0]
        half = nframes // 2
        if nframes > 10:
          rgb_1 = rgb[:half]
          rgb_2 = rgb[half:]
          audio_1 = audio[:half]
          audio_2 = audio[half:]
        else:
          # Short video: keep it whole in both halves, or skip it entirely
          # when FLAGS.skip_shorts is set.
          mylog("Short video encountered: {}, num_frames: {}, labels: {}"
                .format(vid, nframes, labs))
          rgb_1 = rgb
          rgb_2 = rgb
          audio_1 = audio
          audio_2 = audio
          if FLAGS.skip_shorts:
            continue
        #try:
        v0Examples.append(np_2_vExample(vid, labs, rgb, audio))
        v1Examples.append(np_2_vExample(vid, labs, rgb_1, audio_1))
        v2Examples.append(np_2_vExample(vid, labs, rgb_2, audio_2))
        #except:
        #  mylog("failed. nframes: {}, rgb shape: {}".format(nframes, rgb.shape))

      opts = tf.python_io.TFRecordOptions(
          tf.python_io.TFRecordCompressionType.ZLIB)
      with tf.python_io.TFRecordWriter(outfn0, options=opts) as tfwriter0:
        for v0_ex in v0Examples:
          tfwriter0.write(v0_ex.SerializeToString())
      with tf.python_io.TFRecordWriter(outfn1, options=opts) as tfwriter1:
        for v1_ex in v1Examples:
          tfwriter1.write(v1_ex.SerializeToString())
      with tf.python_io.TFRecordWriter(outfn2, options=opts) as tfwriter2:
        for v2_ex in v2Examples:
          tfwriter2.write(v2_ex.SerializeToString())

      seconds_per_file = time.time() - start_time
      num_examples_per_sec = len(v1Examples) / seconds_per_file
      mylog("Processed file number {} in {:.2f} sec: {}, Examples: {}, "
            "Examples/second: {:.0f}.".format(
                k, seconds_per_file, infn.split('/')[-1],
                len(v1Examples), num_examples_per_sec))

  ttl_time = time.time() - t0
  # enumerate() starts at 0, so k + 1 files were processed in total.
  mylog("Processed total {} files in {:.2f} sec.".format(k + 1, ttl_time))
  return k + 1
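# Sketch (assumption, not the original helper): np_2_vExample is called above
# but not defined in this section. Given the video-level style of the outputs,
# one plausible implementation mean-pools the frame features into a
# video-level Example under the YouTube-8M video-level feature names.
def np_2_vExample_sketch(vid, labs, rgb, audio):
  return tf.train.Example(features=tf.train.Features(feature={
      "video_id": tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[vid])),
      "labels": tf.train.Feature(
          int64_list=tf.train.Int64List(value=labs)),
      "mean_rgb": tf.train.Feature(
          float_list=tf.train.FloatList(value=rgb.mean(axis=0))),
      "mean_audio": tf.train.Feature(
          float_list=tf.train.FloatList(value=audio.mean(axis=0))),
  }))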