def process(self, dataset_obj: CRAFTDataset):
    """Collect the per-video original QA files into one minimal dataset file.

    For every video instance, reads the original questions JSON produced by
    the generator, converts each question object into the dataset's question
    format, writes all of them to ``<output_folder_path>/dataset_minimal.json``,
    and finally re-loads the result as a fresh CRAFTDataset.
    """
    self.__dataset_obj = dataset_obj
    original_questions = []
    instance_ids = sorted(dataset_obj.video_index_to_questions_map.keys())
    total = len(instance_ids)
    for i, instance_id in enumerate(instance_ids):
        # All questions of a video share the same simulation id; take the first.
        sid = int(dataset_obj.video_index_to_questions_map[instance_id][0]
                  ["simulation_id"])
        original_qa_json = FileIO.read_json(
            self.__dataset_obj.get_original_questions_path(sid, instance_id))
        for qa in original_qa_json["questions"]:
            original_questions.append(
                dataset_obj.get_question_from_question_obj(qa, sid))
        # Fix: report progress as "processed/total". The original logged the
        # raw video index (e.g. "Processed: 004213/5000"), which reads like a
        # count but is not one; sibling stages log i/len.
        logger.info(f"Processed: {i + 1}/{total}")
    os.makedirs(self.output_folder_path, exist_ok=True)
    FileIO.write_json(original_questions,
                      f"{self.output_folder_path}/dataset_minimal.json")
    # Re-read the freshly written minimal dataset so downstream stages use it.
    self.__dataset_obj = CRAFTDataset(self.output_folder_path,
                                      self.__dataset_obj.metadata)
def process(self, dataset_obj: CRAFTDataset):
    """Balance the dataset by under-sampling and swap the balanced file in.

    Writes pre-balancing statistics, runs the under-sampler, keeps a copy of
    the imbalanced dataset, overwrites ``dataset_minimal.json`` with the
    balanced questions, then reloads the dataset and writes post-balancing
    statistics.
    """
    logger.info("Initiating dataset balancing stage...")
    root = dataset_obj.dataset_folder_path
    dataset_obj.generate_statistics(output_folder=f"{root}/stats/imbalanced")

    logger.info("Performing various under-sampling operations on dataset...")
    balanced_path = f"{root}/balanced_dataset.json"
    sampler = DatasetUnderSampler(dataset_obj, balanced_path)
    sampler.balance_answers_within_each_template_and_simulation_ids(self.purge).dump()

    # Preserve the original questions before overwriting them with the
    # balanced set.
    logger.info("Copying imbalanced dataset to its file")
    FileIO.copy(f"{root}/dataset_minimal.json", f"{root}/imbalanced_dataset.json")
    logger.info("Copying balanced dataset to original file")
    FileIO.copy(f"{root}/balanced_dataset.json", f"{root}/dataset_minimal.json")

    self.balanced_dataset = CRAFTDataset(root, dataset_obj.metadata)
    self.balanced_dataset.generate_statistics(output_folder=f"{root}/stats/balanced")
def process(self, dataset_obj: CRAFTDataset):
    """Merge each video's annotation file into a single large JSON file.

    Streams a JSON object of the form ``{"<video_index>": {...}, ...}`` into
    ``<dataset_folder_path>/<output_file_name>``, relativizing any absolute
    dataset paths embedded in the annotation blobs.
    """
    logger.info("Collecting annotations...")
    self.__dataset_obj = dataset_obj
    with open(f"{dataset_obj.dataset_folder_path}/{self.output_file_name}",
              "w") as annotations_file:
        # The output object is assembled by hand so each (possibly large)
        # annotation blob can be streamed out without holding all of them in
        # memory at once.
        annotations_file.write("{")
        instance_ids = sorted(
            list(dataset_obj.video_index_to_questions_map.keys()))
        for i, instance_id in enumerate(instance_ids):
            # All questions of a video share one simulation id.
            sid = int(dataset_obj.video_index_to_questions_map[instance_id]
                      [0]["simulation_id"])
            annotations_file_path = dataset_obj.get_simulation_with_variations_output_path(
                sid, instance_id)
            with open(annotations_file_path) as this_annotations_file:
                annotations = json.dumps(json.load(this_annotations_file))
            # Relativize paths: replace the absolute dataset folder prefix
            # with "." so the merged file is portable.
            annotations = annotations.replace(
                Path(dataset_obj.dataset_folder_path).resolve().as_posix(),
                ".")
            annotations_file.write(f"""
"{instance_id:06d}": {annotations}
""")
            # Commas between entries, none after the last (valid JSON).
            if i != len(instance_ids) - 1:
                annotations_file.write(",")
            if i % 10 == 0:
                logger.info(
                    f"Collecting annotations: {i}/{len(instance_ids)}")
        annotations_file.write("}")
def process(self, config: DatasetGenerationConfig):
    """Run the dataset generator and load the produced dataset.

    Executes a DatasetGenerator with the given configuration, then wraps the
    generator's output folder in a CRAFTDataset using the metadata file named
    in the configuration.
    """
    logger.info("Initiating dataset generation process...")
    generator = DatasetGenerator(config)
    generator.execute()
    metadata = FileIO.read_json(config.dataset_metadata_file_path)
    self.__dataset = CRAFTDataset(generator.config.output_folder_path, metadata)
def main():
    """Entry point: load the human-eval dataset and list candidate videos."""
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset("../human_eval/data", metadata)
    # Other question categories that can be sampled the same way:
    # get_videos_by_number_of_question(dataset, "Descriptive", 10)
    # get_videos_by_number_of_question(dataset, "Prevent", 3)
    # get_videos_by_number_of_question(dataset, "Counterfactual", 6)
    # get_videos_by_number_of_question(dataset, "Enable", 3)
    get_videos_by_number_of_question(dataset, "Cause", 3)
def process(self, dataset_obj: CRAFTDataset):
    """Discard a fixed fraction of Descriptive questions to rebalance types.

    Randomly (with a fixed seed) removes ``self.percentage`` of the
    Descriptive questions, while trying not to remove a video's last
    remaining question, then sorts and rewrites the dataset.
    """
    logger.info("Balancing descriptive questions...")
    rnd = Random(42)  # fixed seed: the same questions are discarded on every run
    qtype_q = dataset_obj.build_question_type_question_map()
    qtype_len, qtype_dist = self.__compute_dist(qtype_q)
    N_to_discard = round(qtype_len["Descriptive"] * self.percentage)
    logger.info(f"Discarding {N_to_discard} descriptive questions")
    rnd.shuffle(qtype_q["Descriptive"])
    while N_to_discard > 0:
        # NOTE(review): list.pop(0) is O(n) per iteration, and if every
        # remaining Descriptive question were its video's only question this
        # loop would cycle forever; presumed not to happen on real data —
        # confirm. Also, video_index_to_questions_map is not updated as
        # questions are discarded here, so the <=1 guard works on the
        # pre-discard counts — verify this is intended.
        q = qtype_q["Descriptive"].pop(0)
        if len(dataset_obj.video_index_to_questions_map[
                q["video_index"]]) <= 1:
            # Keep the question: it is the only one left for its video.
            qtype_q["Descriptive"].append(q)
            continue
        N_to_discard -= 1
    logger.info(
        f"{qtype_len['Descriptive'] - len(qtype_q['Descriptive'])} descriptive questions are discarded."
    )
    # Re-assemble the full question list from all types.
    balanced_questions = []
    for qs in qtype_q.values():
        balanced_questions.extend(qs)
    logger.info(f"Sorting...")
    # Restore canonical (video, question) ordering.
    balanced_questions.sort(
        key=lambda q: (q["video_index"], q["question_index"]))
    dataset_obj.questions = balanced_questions
    # Log the post-balancing distribution for comparison.
    self.__compute_dist(dataset_obj.build_question_type_question_map())
    self.balanced_dataset = dataset_obj
    self.balanced_dataset.prepare_auxiliaries()
    self.__write_dataset()
class BalancingStage(Stage):
    """Pipeline stage that balances question answers via under-sampling.

    Questions grouped by (template_id, simulation_id) are under-sampled so
    that each answer occurs equally often within a group.
    """

    def __init__(self, purge_single_answers=False):
        """
        In this stage, questions in a tuple (template_id, simulation_id) are balanced
        according to the answers.

        :param purge_single_answers: Removes all questions if there are only one answer for that pair.
        """
        super().__init__(name="Balancing Stage")
        self.purge = purge_single_answers
        self.balanced_dataset = None

    def process(self, dataset_obj: CRAFTDataset):
        """Under-sample the dataset, archive the imbalanced copy, and reload."""
        logger.info("Initiating dataset balancing stage...")
        folder = dataset_obj.dataset_folder_path
        dataset_obj.generate_statistics(output_folder=f"{folder}/stats/imbalanced")

        logger.info("Performing various under-sampling operations on dataset...")
        under_sampler = DatasetUnderSampler(dataset_obj,
                                            f"{folder}/balanced_dataset.json")
        under_sampler.balance_answers_within_each_template_and_simulation_ids(
            self.purge).dump()

        # Keep the pre-balancing questions around, then overwrite the minimal
        # dataset file with the balanced ones.
        logger.info("Copying imbalanced dataset to its file")
        FileIO.copy(f"{folder}/dataset_minimal.json",
                    f"{folder}/imbalanced_dataset.json")
        logger.info("Copying balanced dataset to original file")
        FileIO.copy(f"{folder}/balanced_dataset.json",
                    f"{folder}/dataset_minimal.json")

        self.balanced_dataset = CRAFTDataset(folder, dataset_obj.metadata)
        self.balanced_dataset.generate_statistics(
            output_folder=f"{folder}/stats/balanced")

    def cleanup(self):
        """Nothing to clean up for this stage."""
        pass

    def get_output(self):
        """Return the balanced dataset produced by :meth:`process`."""
        return self.balanced_dataset
def load_dataset(self):
    """Load the dataset from the folder typed into the UI and refresh lists.

    Persists the chosen folder to the ``.state`` file and publishes the
    loaded dataset through the module-level ``g_dataset``.
    """
    global g_dataset
    logger.info("Loading dataset...")
    folder = self.le_dataset_folder.text()
    self.path = folder
    # Remember the last-used folder across runs.
    with open(".state", "w") as state_file:
        state_file.write(folder)
    metadata = FileIO.read_json("../svqa/metadata.json")
    g_dataset = CRAFTDataset(folder, metadata)
    logger.info(f"Dataset at {folder} loaded...")
    self.populate_lists()
def generate_random_parts(nparts: int):
    """Split the human-eval test questions into ``nparts`` disjoint parts.

    Reads the "random" and "hard" split files, gathers the test questions
    listed in them (tagging each in ``split_setting``), builds a dataset
    containing only those questions, and randomly assigns its videos to
    ``nparts`` parts. Each chosen question gets a 1-based ``"part"`` key.

    Relies on module-level ``dataset``, ``dataset_folder_path``, ``metadata``
    and ``random_simulation_select``.

    :return: tuple (parts, chosen_qs, split_setting, test_questions)
    """
    split_info_random = FileIO.read_json(
        f"{dataset.dataset_folder_path}/split_info_random.json")
    split_info_hard = FileIO.read_json(
        f"{dataset.dataset_folder_path}/split_info_hard.json")

    test_questions = []
    split_setting = {}
    # The two split files are processed identically except for the tag.
    for setting, split_info in (("random", split_info_random),
                                ("hard", split_info_hard)):
        for pair in split_info["test"]:
            video_index = pair["video_index"]
            question_index = pair["question_index"]
            for question in dataset.video_index_to_questions_map[video_index]:
                if question["question_index"] == question_index:
                    test_questions.append(question)
                    split_setting[f"{video_index}-{question_index}"] = setting
                    # Fix: stop scanning once the question is found; the
                    # original used `continue`, which kept iterating and
                    # would append duplicates if a question_index repeated.
                    break

    human_test_dataset = CRAFTDataset(dataset_folder_path, metadata,
                                      load_immediately=False)
    human_test_dataset.questions = test_questions
    human_test_dataset.prepare_auxiliaries()
    human_test_dataset.build_sid_vi_q_map()

    # Randomly pick disjoint sets of video indices, one set per part.
    visited = set()
    parts = []
    for i in range(nparts):
        vis = random_simulation_select(i + 3123,
                                       human_test_dataset.sid_vi_q_map,
                                       visited)
        parts.append(vis)

    # Tag every question of every chosen video with its 1-based part number.
    chosen_qs = []
    for part_no, part in enumerate(parts, start=1):
        for vi in part:
            qlist = human_test_dataset.video_index_to_questions_map[vi]
            for q in qlist:
                q["part"] = part_no
            chosen_qs.extend(qlist)

    return parts, chosen_qs, split_setting, test_questions
def cleanup(self):
    """Reload the dataset from disk so the in-memory object reflects the
    post-processed minimal dataset file."""
    logger.info("Re-reading post-processed minimal dataset...")
    folder = self.__dataset_obj.dataset_folder_path
    self.__dataset_obj = CRAFTDataset(folder, self.__dataset_obj.metadata)
def process(self, dataset_obj: CRAFTDataset):
    """Filter and fix questions in place before the balancing stage.

    Per video: removes shape questions when only one shape is present,
    removes hexagon questions, removes basket-collision templates, and
    corrects known typos in some question texts. The per-video question
    lists are mutated in place, then the dataset file is rewritten.
    """
    logger.info("Initiating post process stage before balancing...")
    self.__dataset_obj = dataset_obj
    for i, instance_id in enumerate(
            sorted(dataset_obj.video_index_to_questions_map.keys())):
        question_list = dataset_obj.video_index_to_questions_map[instance_id]
        sid = int(question_list[0]["simulation_id"])
        annotations = FileIO.read_json(
            dataset_obj.get_simulation_with_variations_output_path(
                sid, instance_id))
        objects_in_scene = annotations["original_video_output"][
            "scene_states"][0]["scene"]["objects"]
        # bodyType == 2 marks dynamic (movable) objects — TODO confirm
        # against the simulator's body-type enum.
        dynamic_objects = [
            object for object in objects_in_scene if object["bodyType"] == 2
        ]
        new_questions_list = []
        for question in question_list:
            # Postprocess Before Balancing 1: Do not ask shape if only one
            # shape is present in the scene.
            answer_type = dataset_obj.get_answer_type_for_answer(
                question["answer"])
            if answer_type == "Shape":
                if len(
                        set([
                            f"{object['shape']}" for object in dynamic_objects
                        ])) <= 1:
                    # Remove the question that asks shape even though there's
                    # only one shape present
                    logger.info(
                        f"Question asks shape even though there's only 1 "
                        f"shape present in the scene. Removing {question['video_index']}/{question['question_index']}"
                    )
                    continue
            if "hexagon" in question["question"]:
                logger.info(
                    f"Question asks about hexagons, which are not present in any of the videos. "
                    f"Removing {question['video_index']}/{question['question_index']}"
                )
                continue
            # Postprocess Before Balancing 2: Remove questions regarding
            # collisions with the basket to avoid ambiguity. Note that these
            # are not yet removed from the question template files in
            # svqa/SVQA_1.0_templates. Following can be removed from post
            # processing once they are removed from the question template
            # files and if the dataset is generated according to the updated
            # question templates.
            if question["template_id"] in [
                    "cause_2",
                    "cause_5",
                    "counterfactual_2",
                    "counterfactual_5",
                    "counterfactual_8",
                    "descriptive_12",
                    "descriptive_13",
                    "descriptive_14",
                    "descriptive_15",
                    "descriptive_20",
                    "descriptive_21",
                    "descriptive_30",
                    "descriptive_31",
                    "descriptive_36",
                    "descriptive_37",
                    "enable_2",
                    "enable_5",
                    "prevent_2",
                    "prevent_5",
            ]:
                continue
            # Postprocess Before Balancing 3: Correct typos in the question
            # templates. These are also corrected in the question template
            # files in svqa/SVQA_1.0_templates, so the following can be
            # deleted.
            # NOTE(review): "counterfactual_2" (and "prevent_2") are filtered
            # out by the list above, so this branch appears unreachable —
            # confirm before removing.
            if question["template_id"] == "counterfactual_2":
                question_text: str = question["question"]
                if question_text.startswith("Will"):
                    question_text = question_text.replace(
                        "the basket the", "the basket if the")
                    question_text = question_text.replace(
                        "the container the", "the container if the")
                    question_text = question_text.replace(
                        "the bucket the", "the bucket if the")
                    question["question"] = question_text
            if question["template_id"] in [
                    "prevent_0", "prevent_1", "prevent_2"
            ]:
                question_text: str = question["question"]
                if question_text.startswith("Is"):
                    question_text = question_text.replace(
                        "is prevented by", "prevented by")
                    question_text = question_text.replace(
                        "is kept by", "kept by")
                    question_text = question_text.replace(
                        "is held by", "held by")
                    question_text = question_text.replace(
                        "is blocked by", "blocked by")
                    question["question"] = question_text
            new_questions_list.append(question)
        # Mutate the shared list in place so the dataset's map stays valid.
        question_list[:] = new_questions_list
        logger.info(
            f"Processed: {i}/{len(dataset_obj.video_index_to_questions_map.keys())}"
        )
    # Continue postprocessing before balancing here
    self.__rewrite_dataset()
N = len(min_present_qcat[1]) for qcat in qcat_qs_map: qs = list(qcat_qs_map[qcat]) rnd.shuffle(qs) undersampled = qs[:N] chosen_qs_qcat_balanced.extend(undersampled) return chosen_qs_qcat_balanced if __name__ == '__main__': output_folder_path = "./human_eval_CRAFT_10K_balanced" dataset_folder_path = "../../framework/out/CRAFT_10K" metadata = FileIO.read_json("../../svqa/metadata.json") dataset = CRAFTDataset(dataset_folder_path, metadata) os.makedirs(f"{output_folder_path}/", exist_ok=True) parts, chosen_qs, split_setting, test_questions = generate_random_parts(5) chosen_qs_qcat_balanced = undersample(chosen_qs) undersampled_human_test_dataset = CRAFTDataset(dataset_folder_path, metadata, load_immediately=False) undersampled_human_test_dataset.questions = chosen_qs_qcat_balanced undersampled_human_test_dataset.prepare_auxiliaries() undersampled_human_test_dataset.build_sid_vi_q_map() FileIO.write_json(undersampled_human_test_dataset.questions,
# NOTE(review): the statements below are the tail of a `split_info` function
# whose definition begins outside this chunk; indentation reconstructed.
        print(s, len(split_to_vid[s]))
    print("Questions")
    # Count questions per split by summing over each split's video indices.
    question_count_per_split = defaultdict(int)
    for s in split_to_vid:
        for vi in split_to_vid[s]:
            question_count_per_split[s] += len(
                dataset.video_index_to_questions_map[vi])
    for s, c in question_count_per_split.items():
        print(s, c)


def proof_read():
    """Check that every question's video file actually exists on disk,
    printing the path of each missing file."""
    with open("./dataset_minimal.json", "r") as dataset_file:
        questions = json.load(dataset_file)
    for q in questions:
        if not os.path.isfile(q["video_file_path"]):
            print(q["video_file_path"], False)


if __name__ == '__main__':
    # Load the CRAFT_10K dataset and print split statistics.
    dataset_folder_path = "../../framework/out/CRAFT_10K"
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset(dataset_folder_path, metadata)
    print("Number of videos:",
          len(dataset.video_index_to_questions_map.keys()))
    split_info(dataset, "random")
    split_info(dataset, "hard")
def start_experiment(dataset: CRAFTDataset):
    """Perturbation experiment: re-simulate each video with noise, re-ask the
    original questions on the perturbed outputs, and measure answer
    similarity against the originals.
    """
    logger.info(f"Starting experiment with noise amount %{NOISE_AMOUNT * 100}")
    os.makedirs("./perturbed_outputs", exist_ok=True)
    os.makedirs("./perturbed_controllers", exist_ok=True)
    # Collect the unique (video_index, simulation_id) pairs in the dataset.
    video_sid_set = set()
    for question in dataset.questions:
        video_index = question["video_index"]
        simulation_id = question["simulation_id"]
        video_sid_set.add((video_index, simulation_id))
    simulation_jobs = []
    simulation_args = []
    video_sid_set = list(video_sid_set)
    video_sid_set.sort(key=lambda x: x[0])
    # Perturbation of videos
    original_questions = []
    outputs = []
    for video_sid in video_sid_set:
        # Test with only 10 videos for now
        video_index = video_sid[0]
        simulation_id = video_sid[1]
        original_variations_output_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/{video_index:06d}.json"
        original_questions_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/qa_{video_index:06d}.json"
        old_controller_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/debug/controller_{video_index:06d}.json"
        simulation_jobs.append(run_simulation_instance)
        simulation_args.append([
            original_variations_output_file_path, old_controller_file_path,
            simulation_id, video_index
        ])
        new_variations_output_file_path = f"./perturbed_outputs/variations_{simulation_id}_{video_index:06d}.json"
        outputs.append(
            (video_index, simulation_id, new_variations_output_file_path,
             original_questions_file_path,
             original_variations_output_file_path))
        original_questions.extend(dataset.get_questions_for_video(video_index))
    logger.info(f"{len(simulation_jobs)} simulations will be perturbed")
    # Run the perturbed simulations with 4 parallel workers.
    parallel_worker = ParallelWorker(simulation_jobs, simulation_args, 4)
    parallel_worker.execute_all()
    question_ask_jobs = []
    question_ask_args = []
    # Regenerate answers for perturbed simulations
    qa_outputs = []
    for output in outputs:
        video_index = output[0]
        simulation_id = output[1]
        new_variations_output_file_path = output[2]
        original_questions_file_path = output[3]
        original_variations_output_file_path = output[4]
        new_perturbed_qa_file_path = f"./perturbed_outputs/qa_{video_index:06d}.json"
        question_ask_jobs.append(regenerate_answers)
        question_ask_args.append([
            original_variations_output_file_path,
            new_variations_output_file_path, original_questions_file_path,
            new_perturbed_qa_file_path, simulation_id, video_index
        ])
        qa_outputs.append(
            (video_index, simulation_id, new_perturbed_qa_file_path))
    logger.info(f"Asking questions for perturbed simulations")
    # Question answering is lighter than simulation; use 8 workers.
    parallel_worker = ParallelWorker(question_ask_jobs, question_ask_args, 8)
    parallel_worker.execute_all()
    # Gather all regenerated questions from the per-video QA files.
    questions_perturbed = []
    for qa in qa_outputs:
        video_index = qa[0]
        simulation_id = qa[1]
        qa_file_path = qa[2]
        qa_file = FileIO.read_json(qa_file_path)
        questions_perturbed.extend(qa_file["questions"])
    logger.info(f"Measuring similarity, this might take a while...")
    data, orig_size, found, ratio = measure_similarity(original_questions,
                                                       questions_perturbed)
    logger.info(f"Number of questions from original simulations: {orig_size}")
    logger.info(
        f"Number of questions from perturbed simulations: {len(questions_perturbed)}"
    )
    logger.info(f"Number of perturbed counterparts: {found}")
    logger.info(f"Match ratio: {found / orig_size}")
    logger.info(f"Correctness: {ratio}")
    logger.info(f"Dumping analysis data...")
    FileIO.write_json(
        data, f"analysis_data_{datetime.now().strftime('%m%d%Y_%H%M')}.json")
# NOTE(review): the statements below are the tail of a function whose
# definition begins outside this chunk; indentation reconstructed.
    logger.info(f"Measuring similarity, this might take a while...")
    data, orig_size, found, ratio = measure_similarity(original_questions,
                                                       questions_perturbed)
    logger.info(f"Number of questions from original simulations: {orig_size}")
    logger.info(
        f"Number of questions from perturbed simulations: {len(questions_perturbed)}"
    )
    logger.info(f"Number of perturbed counterparts: {found}")
    logger.info(f"Match ratio: {found / orig_size}")
    logger.info(f"Correctness: {ratio}")
    logger.info(f"Dumping analysis data...")
    FileIO.write_json(
        data, f"analysis_data_{datetime.now().strftime('%m%d%Y_%H%M')}.json")


if __name__ == '__main__':
    # Log to a timestamped file in addition to the default sink.
    logger.add(f"perturbation_{datetime.now().strftime('%m%d%Y_%H%M')}.log")
    metadata = FileIO.read_json("../../svqa/metadata.json")
    logger.info(f"Reading the dataset...")
    # NOTE(review): non-raw Windows path literal contains unrecognized escape
    # sequences (e.g. "\d"); it works today but should be a raw string or use
    # forward slashes — confirm and fix upstream.
    dataset = CRAFTDataset(
        "D:\Library\Research\datasets\Dataset_3000_230920cpy\dataset.json",
        metadata)
    logger.info(
        f"{len(dataset.questions)} questions have been loaded into memory")
    start_experiment(dataset)