def _create_file_list(
    min_sequence_length: int,
    output_file: str,
    samples_per_video: int,
    source_dir_root: str,
):
    file_list = FileList(
        root=source_dir_root,
        classes=[AVSPEECH_NAME],
        min_sequence_length=min_sequence_length,
    )

    source_dir_root = Path(source_dir_root)

    # split between train and val
    videos = sorted(source_dir_root.iterdir())
    train = videos[: int(len(videos) * 0.9)]
    val = videos[int(len(videos) * 0.9):]

    for split, split_name in [(train, TRAIN_NAME), (val, VAL_NAME)]:
        for video_folder in sorted(split):
            images = sorted(video_folder.glob("*.png"))
            filtered_images_idx = []

            # find all frames that have at least min_sequence_length - 1
            # preceding frames
            sequence_start = img_name_to_int(images[0])
            last_idx = sequence_start
            for list_idx, image in enumerate(images):
                image_idx = img_name_to_int(image)
                if last_idx + 1 != image_idx:
                    sequence_start = image_idx
                elif image_idx - sequence_start >= min_sequence_length - 1:
                    filtered_images_idx.append(list_idx)
                last_idx = image_idx

            selected_frames = select_frames(
                len(filtered_images_idx), samples_per_video
            )
            sampled_images_idx = np.asarray(filtered_images_idx)[selected_frames]

            file_list.add_data_points(
                path_list=images,
                target_label=AVSPEECH_NAME,
                split=split_name,
                sampled_images_idx=sampled_images_idx,
            )

    file_list.save(output_file)
    logger.info(f"{output_file} created.")
    return file_list
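# A minimal usage sketch for the AVSpeech variant above. The paths and
# parameter values here are assumptions for illustration only, not taken
# from the repository configuration.
if __name__ == "__main__":
    _create_file_list(
        min_sequence_length=8,                              # assumed value
        output_file="/data/file_lists/avspeech_100.json",   # hypothetical path
        samples_per_video=100,                              # assumed value
        source_dir_root="/data/avspeech/extracted_images",  # hypothetical path
    )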
def _create_file_list(
    compressions,
    data_types,
    min_sequence_length,
    output_file,
    samples_per_video,
    source_dir_root,
):
    file_list = FileList(
        root=source_dir_root,
        classes=FaceForensicsDataStructure.METHODS,
        min_sequence_length=min_sequence_length,
    )

    # use FaceForensicsDataStructure to iterate elegantly over the correct
    # image folders
    source_dir_data_structure = FaceForensicsDataStructure(
        source_dir_root, compressions=compressions, data_types=data_types
    )

    _min_sequence_length = _get_min_sequence_length(source_dir_data_structure)
    if _min_sequence_length < samples_per_video:
        logger.warning(
            f"There is a sequence that has fewer frames than you would like to "
            f"sample: {_min_sequence_length} < {samples_per_video}"
        )

    for split, split_name in [(TRAIN, TRAIN_NAME), (VAL, VAL_NAME), (TEST, TEST_NAME)]:
        for source_sub_dir, target in zip(
            source_dir_data_structure.get_subdirs(), file_list.classes
        ):
            for video_folder in sorted(source_sub_dir.iterdir()):
                if video_folder.name.split("_")[0] in split:
                    images = sorted(video_folder.glob("*.png"))
                    filtered_images_idx = []

                    # find all frames that have at least min_sequence_length - 1
                    # preceding frames
                    sequence_start = img_name_to_int(images[0])
                    last_idx = sequence_start
                    for list_idx, image in enumerate(images):
                        image_idx = img_name_to_int(image)
                        if last_idx + 1 != image_idx:
                            sequence_start = image_idx
                        elif image_idx - sequence_start >= min_sequence_length - 1:
                            filtered_images_idx.append(list_idx)
                        last_idx = image_idx

                    selected_frames = select_frames(
                        len(filtered_images_idx), samples_per_video
                    )
                    sampled_images_idx = np.asarray(filtered_images_idx)[
                        selected_frames
                    ]

                    file_list.add_data_points(
                        path_list=images,
                        target_label=target,
                        split=split_name,
                        sampled_images_idx=sampled_images_idx,
                    )

    file_list.save(output_file)
    logger.info(f"{output_file} created.")
    return file_list
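# Simplified, self-contained illustration of the consecutive-frame filter that
# both _create_file_list variants above apply before sampling: a frame is kept
# only if at least min_sequence_length - 1 directly preceding frame numbers
# exist. Plain integers stand in for img_name_to_int(image); this is a sketch,
# not code from the repository.
def _filter_consecutive(frame_numbers, min_sequence_length):
    filtered = []
    sequence_start = frame_numbers[0]
    last_idx = sequence_start
    for list_idx, image_idx in enumerate(frame_numbers):
        if last_idx + 1 != image_idx:
            # gap in the numbering -> a new sequence starts here
            sequence_start = image_idx
        elif image_idx - sequence_start >= min_sequence_length - 1:
            filtered.append(list_idx)
        last_idx = image_idx
    return filtered


# frames 0-4 are consecutive; frames 10-11 form a sequence that is too short
print(_filter_consecutive([0, 1, 2, 3, 4, 10, 11], min_sequence_length=4))
# -> [3, 4]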
samples_per_video = 100

# up to samples_per_video frame indices per video, evenly spaced
# (np.linspace(7, len(images), ...) - 1 yields indices from 6 to the last frame)
for label in train:
    images = sorted(label.glob("*.png"))
    f.add_data_points(
        images,
        "avspeech",
        "train",
        np.rint(
            np.linspace(7, len(images), min(samples_per_video, len(images))) - 1
        ).astype(int),
    )
for label in val:
    images = sorted(label.glob("*.png"))
    f.add_data_points(
        images,
        "avspeech",
        "val",
        np.rint(
            np.linspace(7, len(images), min(samples_per_video, len(images))) - 1
        ).astype(int),
    )

f.root = str(f.root)
f.save("/data/ssd1/file_lists/fid/moria_100.json")

print(f.get_dataset("train"))
print(f.get_dataset("val"))
print(f.get_dataset("test"))
f = FileList(str(root_dir), classes=["FAKE", "REAL"], min_sequence_length=1)

# blocks 5-49 become the train split, blocks 0-4 are used for val (and test)
train_data_numbers = list(range(5, 50))
val_data_numbers = list(range(5))

for train_data_number in train_data_numbers:
    block = root_dir / f"extracted_images_{train_data_number}"
    if block.exists():
        for label in block.iterdir():
            images = list(label.glob("*/*.png"))
            f.add_data_points(images, label.name, "train", np.arange(0, len(images)))

for val_data_number in val_data_numbers:
    block = root_dir / f"extracted_images_{val_data_number}"
    if block.exists():
        for label in block.iterdir():
            images = list(label.glob("*/*.png"))
            f.add_data_points(images, label.name, "val", np.arange(0, len(images)))
            f.add_data_points(images, label.name, "test", np.arange(0, len(images)))

f.save("/data/ssd1/file_lists/dfdc/5_45_split.json")

for split in [TRAIN_NAME, VAL_NAME, TEST_NAME]:
    data_set = FileList.get_dataset_form_file(
        "/data/ssd1/file_lists/dfdc/5_45_split.json", split
    )
    print(f"{split}-data-set: {data_set}")