def controleren(b, e):
    """Check that each converted h5 file in the [b:e) slice has the same
    frame count as its source mp4.

    Parameters
    ----------
    b, e : int
        Slice bounds into the os.listdir() listing of converted h5 files.

    Returns
    -------
    list of str
        Paths of mp4 videos whose h5 counterpart has a mismatched frame
        count and therefore still needs reconversion.

    Raises
    ------
    ValueError
        If an h5 file has no matching mp4 in any split.
    """
    def get_frames_mp4(bn):
        # Locate the mp4 whose path contains base name `bn` and read its
        # frame count from the ffprobe metadata.
        for j, v in enumerate(all_videos):
            if bn in v:
                return int(skvideo.io.ffprobe(v)['video']['@nb_frames']), v
        # BUG FIX: the original fell off the end and returned None here,
        # which crashed later with an opaque TypeError on tuple unpacking.
        raise ValueError('no mp4 found for base name %s' % bn)

    def get_frames_h5(p):
        # One key of the h5 file is non-frame metadata, hence the -1.
        # NOTE(review): assumes exactly one such key — confirm against writer.
        with h5py.File(p, 'r') as mf:
            frs = len(mf.keys()) - 1
        return frs

    still_todo = []
    all_videos = DU.get_all_videos('train') + DU.get_all_videos('test') + DU.get_all_videos('val')
    all_h5 = os.listdir(P.CHALEARN30_ALL_DATA)

    for i, h5 in enumerate(all_h5[b:e]):
        print(i)
        base_name = h5.split('.h5')[0]
        h5_path = os.path.join(P.CHALEARN30_ALL_DATA, h5)
        h5_frames = get_frames_h5(h5_path)
        mp4_frames, v_path = get_frames_mp4(base_name)
        if h5_frames != mp4_frames:
            still_todo.append(v_path)
            print('ohno, %s' % (v_path))

    print('still todo', len(still_todo))
    return still_todo
def get_left_off_index():
    """Group the already-converted training videos by thousand-index range.

    Compares the cluster output directory against the main directory to find
    h5 files present only on the cluster, maps each back to its training-video
    index, and buckets those indices per thousand-range.

    Returns
    -------
    tuple of five lists
        Training-video paths for index ranges [0,2000), [2000,3000),
        [3000,4000), [4000,5000), [5000,6000) that have already been
        converted on the cluster.
    """
    p_main = '/scratch/users/gabras/data/chalearn30/all_data'  # probably all complete
    p_cluster = '/scratch/users/gabras/data/chalearn30/chalearn30/all_data'
    diff = set(os.listdir(p_cluster)) - set(os.listdir(p_main))

    all_train = DU.get_all_videos('train')

    # one bucket of indices per thousand-range, keyed by the exclusive upper bound
    bounds = (2000, 3000, 4000, 5000, 6000)
    buckets = {upper: [] for upper in bounds}

    def collect(token):
        # first training video containing the token decides the bucket
        for idx, vid in enumerate(all_train):
            if token in vid:
                for upper in bounds:
                    if idx < upper:
                        buckets[upper].append(idx)
                        break
                break

    for name in diff:
        collect(name.split('.h5')[0])

    for upper in bounds:
        buckets[upper].sort()

    arr = np.array(all_train)
    # return ones that have already been converted
    return tuple(list(arr[buckets[upper]]) for upper in bounds)
def move_cluster_completeness():
    """Move verified h5 files from the cluster directory into the main one.

    For every h5 present only on the cluster, compare its frame count with
    the matching training mp4; move the file only when the counts agree.
    Files that fail to open (still being written, corrupt) are reported and
    skipped. Aborts entirely if an h5 has no matching mp4.
    """
    p_main = '/scratch/users/gabras/data/chalearn30/all_data'  # probably all complete
    p_cluster = '/scratch/users/gabras/data/chalearn30/chalearn30/all_data'
    main_list = os.listdir(p_main)
    cluster_list = os.listdir(p_cluster)
    diff = set(cluster_list) - set(main_list)

    cnt = 0
    completed_which_moved = 0
    # get all the mp4s
    all_train = DU.get_all_videos('train')

    for n in diff:
        cnt += 1
        print(cnt)
        h5 = os.path.join(p_cluster, n)
        try:
            # BUG FIX: the original opened the file manually and only closed
            # it on the success path — a raise between open and close leaked
            # the handle. The context manager releases it unconditionally.
            with h5py.File(h5, 'r') as mf:
                # one key is non-frame metadata, hence the -1
                frames_h5 = len(mf.keys()) - 1
            print('opened and closed')
            n_base = n.split('.h5')[0]
            frames_mp4 = 0
            # find matching mp4
            for v in all_train:
                if n_base in v:
                    frames_mp4 = DU.mp4_to_arr(v).shape[0]
                    break
            if frames_mp4 == 0:
                # no matching mp4 at all: stop rather than move bad data
                print('ohboi')
                return
            if frames_h5 == frames_mp4:
                src = os.path.join(p_cluster, n)
                dst = os.path.join(p_main, n)
                shutil.move(src=src, dst=dst)
                completed_which_moved += 1
                print(completed_which_moved, cnt)
            else:
                print('frames should be %d, but are %d' % (frames_mp4, frames_h5))
        except (OSError, RuntimeError) as e:
            # h5py raises OSError/RuntimeError for unreadable/partial files
            print(h5, 'failed', e)
def parallel_convert_mod(which, b, e, func, number_processes=20):
    """Convert the [b:e) slice of a split in parallel, skipping videos that
    get_left_off_index() reports as already converted on the cluster.

    Parameters
    ----------
    which : str
        Split name ('train', 'test' or 'val') passed to DU.get_all_videos.
    b, e : int
        Slice bounds into the video list; b also selects which
        already-converted bucket to subtract.
    func : callable
        Conversion function applied to each video path.
    number_processes : int, optional
        Worker pool size (default 20).
    """
    all_videos = DU.get_all_videos(which)
    a12, a23, a34, a45, a56 = get_left_off_index()
    all_videos = all_videos[b:e]

    # subtract the already-converted bucket that corresponds to this slice
    done_for_b = {1000: a12, 2000: a23, 3000: a34, 4000: a45, 5000: a56}.get(b)
    if done_for_b is not None:
        all_videos = list(set(all_videos) - set(done_for_b))

    pool = Pool(processes=number_processes)
    try:
        # BUG FIX: the original also called pool.apply_async(func) with no
        # arguments, which invoked func() without a video path in a worker
        # (the error was swallowed because the result was never retrieved).
        pool.map(func, all_videos)
    finally:
        # BUG FIX: the original never closed the pool, leaking worker
        # processes on every call.
        pool.close()
        pool.join()
def get_missing():
    """Return training mp4 paths whose converted h5 exists only on the cluster.

    Diffs the cluster output directory against the main one and maps each
    cluster-only h5 base name back to every training video path containing it.
    """
    p_main = '/scratch/users/gabras/data/chalearn30/all_data'  # probably all complete
    p_cluster = '/scratch/users/gabras/data/chalearn30/chalearn30/all_data'
    diff = set(os.listdir(p_cluster)) - set(os.listdir(p_main))

    all_train = DU.get_all_videos('train')

    missing = []
    for base in (p.split('.h5')[0] for p in diff):
        # every training video whose path contains this base name
        missing.extend(v for v in all_train if base in v)
    return missing
def normal_convert(which, b, e):
    """Sequentially convert every video in the [b:e) slice of split `which`.

    Parameters
    ----------
    which : str
        Split name ('train', 'test' or 'val') passed to DU.get_all_videos.
    b, e : int
        Slice bounds into the split's video list.
    """
    for video_path in DU.get_all_videos(which)[b:e]:
        convert(video_path)
def parallel_convert(which, b, e, func, number_processes=20):
    """Apply `func` to every video in the [b:e) slice of a split in parallel.

    Parameters
    ----------
    which : str
        Split name ('train', 'test' or 'val') passed to DU.get_all_videos.
    b, e : int
        Slice bounds into the split's video list.
    func : callable
        Conversion function applied to each video path.
    number_processes : int, optional
        Worker pool size (default 20).
    """
    all_videos = DU.get_all_videos(which)[b:e]
    pool = Pool(processes=number_processes)
    try:
        # BUG FIX: the original also called pool.apply_async(func) with no
        # arguments, which invoked func() without a video path in a worker
        # (the error was swallowed because the result was never retrieved).
        pool.map(func, all_videos)
    finally:
        # BUG FIX: the original never closed the pool, leaking worker
        # processes on every call.
        pool.close()
        pool.join()