    def process(self, dataset_obj: CRAFTDataset):

        self.__dataset_obj = dataset_obj

        original_questions = []

        instance_ids = sorted(
            list(dataset_obj.video_index_to_questions_map.keys()))
        for i, instance_id in enumerate(instance_ids):
            sid = int(dataset_obj.video_index_to_questions_map[instance_id][0]
                      ["simulation_id"])
            original_qa_json = FileIO.read_json(
                self.__dataset_obj.get_original_questions_path(
                    sid, instance_id))
            for qa in original_qa_json["questions"]:
                original_questions.append(
                    dataset_obj.get_question_from_question_obj(qa, sid))

            logger.info(f"Processed: {instance_id}/{len(instance_ids)}")

        os.makedirs(self.output_folder_path, exist_ok=True)

        FileIO.write_json(original_questions,
                          f"{self.output_folder_path}/dataset_minimal.json")

        self.__dataset_obj = CRAFTDataset(self.output_folder_path,
                                          self.__dataset_obj.metadata)
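
# A hedged illustration (values are made up, not taken from the real dataset):
# each entry written to dataset_minimal.json is a flat question record; the
# keys below are the ones referenced elsewhere in this file.
example_question_record = {
    "simulation_id": "2",  # scene type id, stored as a string (hypothetical value)
    "video_index": 42,
    "question_index": 3,
    "template_id": "descriptive_12",
    "question": "What shape is the first object to enter the basket?",  # hypothetical text
    "answer": "cube",
    "video_file_path": "./videos/sid_2/000042.mpg",  # hypothetical path
}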
    def process(self, dataset_obj: CRAFTDataset):
        logger.info("Collecting annotations...")
        self.__dataset_obj = dataset_obj
        with open(f"{dataset_obj.dataset_folder_path}/{self.output_file_name}",
                  "w") as annotations_file:
            annotations_file.write("{")
            instance_ids = sorted(
                list(dataset_obj.video_index_to_questions_map.keys()))
            for i, instance_id in enumerate(instance_ids):
                sid = int(dataset_obj.video_index_to_questions_map[instance_id]
                          [0]["simulation_id"])
                annotations_file_path = dataset_obj.get_simulation_with_variations_output_path(
                    sid, instance_id)
                with open(annotations_file_path) as this_annotations_file:
                    annotations = json.dumps(json.load(this_annotations_file))
                    # Relativize paths
                    annotations = annotations.replace(
                        Path(dataset_obj.dataset_folder_path).resolve().
                        as_posix(), ".")
                    annotations_file.write(
                        f'"{instance_id:06d}": {annotations}')
                    if i != len(instance_ids) - 1:
                        annotations_file.write(",")
                    if i % 10 == 0:
                        logger.info(
                            f"Collecting annotations: {i}/{len(instance_ids)}")
            annotations_file.write("}")
    def process(self, config: DatasetGenerationConfig):
        logger.info("Initiating dataset generation process...")
        dataset_generator = DatasetGenerator(config)
        dataset_generator.execute()
        dataset_folder_path = dataset_generator.config.output_folder_path
        self.__dataset = CRAFTDataset(
            dataset_folder_path,
            FileIO.read_json(config.dataset_metadata_file_path))
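
# Hedged sketch: the stage objects in this file share a process/cleanup/get_output
# interface (BalancingStage below shows the full set of methods). A minimal,
# hypothetical runner for stages implementing that interface could look like
# this; it is not the framework's actual pipeline implementation.
def run_stages(stages, initial_input):
    current = initial_input
    for stage in stages:
        stage.process(current)
        output = stage.get_output()  # may be None for stages with no output
        stage.cleanup()
        if output is not None:
            current = output
    return current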
def main():
    dataset_folder_path = "../human_eval/data"
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset(dataset_folder_path, metadata)

    #get_videos_by_number_of_question(dataset, "Descriptive", 10)
    #get_videos_by_number_of_question(dataset, "Prevent", 3)
    #get_videos_by_number_of_question(dataset, "Counterfactual", 6)
    #get_videos_by_number_of_question(dataset, "Enable", 3)
    get_videos_by_number_of_question(dataset, "Cause", 3)
    def process(self, dataset_obj: CRAFTDataset):
        logger.info("Balancing descriptive questions...")
        rnd = Random(42)

        qtype_q = dataset_obj.build_question_type_question_map()

        qtype_len, qtype_dist = self.__compute_dist(qtype_q)

        N_to_discard = round(qtype_len["Descriptive"] * self.percentage)

        logger.info(f"Discarding {N_to_discard} descriptive questions")

        rnd.shuffle(qtype_q["Descriptive"])

        while N_to_discard > 0:
            q = qtype_q["Descriptive"].pop(0)
            if len(dataset_obj.video_index_to_questions_map[
                    q["video_index"]]) <= 1:
                qtype_q["Descriptive"].append(q)
                continue
            N_to_discard -= 1

        logger.info(
            f"{qtype_len['Descriptive'] - len(qtype_q['Descriptive'])} descriptive questions are discarded."
        )

        balanced_questions = []
        for qs in qtype_q.values():
            balanced_questions.extend(qs)

        logger.info(f"Sorting...")
        balanced_questions.sort(
            key=lambda q: (q["video_index"], q["question_index"]))

        dataset_obj.questions = balanced_questions

        self.__compute_dist(dataset_obj.build_question_type_question_map())

        self.balanced_dataset = dataset_obj
        self.balanced_dataset.prepare_auxiliaries()

        self.__write_dataset()
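
    # Hedged sketch of the private helper used above; the real __compute_dist
    # is not shown in this file. Assumed behavior: return the per-type question
    # counts and their ratios, and log the distribution.
    def __compute_dist(self, qtype_q: dict):
        qtype_len = {qtype: len(qs) for qtype, qs in qtype_q.items()}
        total = sum(qtype_len.values())
        qtype_dist = {
            qtype: (count / total if total else 0.0)
            for qtype, count in qtype_len.items()
        }
        logger.info(f"Question type distribution: {qtype_dist}")
        return qtype_len, qtype_dist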
class BalancingStage(Stage):
    def __init__(self, purge_single_answers=False):
        """
        In this stage, questions grouped by each (template_id, simulation_id) pair are balanced according to their answers.
        :param purge_single_answers: If True, removes all questions of a pair that has only one distinct answer.
        """
        super().__init__(name="Balancing Stage")
        self.purge = purge_single_answers
        self.balanced_dataset = None

    def process(self, dataset_obj: CRAFTDataset):
        logger.info("Initiating dataset balancing stage...")

        dataset_obj.generate_statistics(
            output_folder=f"{dataset_obj.dataset_folder_path}/stats/imbalanced"
        )

        logger.info(
            f"Performing various under-sampling operations on dataset...")
        balanced_dataset_output_path = f"{dataset_obj.dataset_folder_path}/balanced_dataset.json"
        DatasetUnderSampler(dataset_obj, balanced_dataset_output_path) \
            .balance_answers_within_each_template_and_simulation_ids(self.purge) \
            .dump()
        logger.info(f"Copying imbalanced dataset to its file")
        FileIO.copy(
            f"{dataset_obj.dataset_folder_path}/dataset_minimal.json",
            f"{dataset_obj.dataset_folder_path}/imbalanced_dataset.json")
        logger.info(f"Copying balanced dataset to original file")
        FileIO.copy(f"{dataset_obj.dataset_folder_path}/balanced_dataset.json",
                    f"{dataset_obj.dataset_folder_path}/dataset_minimal.json")

        self.balanced_dataset = CRAFTDataset(dataset_obj.dataset_folder_path,
                                             dataset_obj.metadata)

        self.balanced_dataset.generate_statistics(
            output_folder=f"{dataset_obj.dataset_folder_path}/stats/balanced")

    def cleanup(self):
        pass

    def get_output(self):
        return self.balanced_dataset
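
# Hedged usage sketch: run the balancing stage on an already-generated dataset,
# outside the full pipeline. The paths below match the ones used elsewhere in
# this file but are effectively placeholders.
if __name__ == '__main__':
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset("../../framework/out/CRAFT_10K", metadata)
    stage = BalancingStage(purge_single_answers=True)
    stage.process(dataset)
    balanced = stage.get_output()
    logger.info(f"Balanced dataset has {len(balanced.questions)} questions")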
    def load_dataset(self):
        logger.info("Loading dataset...")
        self.path = self.le_dataset_folder.text()

        with open(".state", "w") as state_file:
            state_file.write(self.path)

        global g_dataset
        g_dataset = CRAFTDataset(self.path,
                                 FileIO.read_json("../svqa/metadata.json"))
        logger.info(f"Dataset at {self.path} loaded...")
        self.populate_lists()
Example #9
def generate_random_parts(nparts: int):
    split_info_random = FileIO.read_json(
        f"{dataset.dataset_folder_path}/split_info_random.json")
    split_info_hard = FileIO.read_json(
        f"{dataset.dataset_folder_path}/split_info_hard.json")
    test_questions = []
    split_setting = {}
    for pair in split_info_random["test"]:
        video_index = pair["video_index"]
        question_index = pair["question_index"]

        for question in dataset.video_index_to_questions_map[video_index]:
            if question["question_index"] == question_index:
                test_questions.append(question)
                split_setting[f"{video_index}-{question_index}"] = "random"
                break  # question indices are unique within a video
    for pair in split_info_hard["test"]:
        video_index = pair["video_index"]
        question_index = pair["question_index"]

        for question in dataset.video_index_to_questions_map[video_index]:
            if question["question_index"] == question_index:
                test_questions.append(question)
                split_setting[f"{video_index}-{question_index}"] = "hard"
                break  # question indices are unique within a video
    human_test_dataset = CRAFTDataset(dataset_folder_path,
                                      metadata,
                                      load_immediately=False)
    human_test_dataset.questions = test_questions
    human_test_dataset.prepare_auxiliaries()
    human_test_dataset.build_sid_vi_q_map()
    visited = set()
    parts = []
    for i in range(nparts):
        vis = random_simulation_select(i + 3123,
                                       human_test_dataset.sid_vi_q_map,
                                       visited)
        parts.append(vis)
    chosen_qs = []
    for i in range(len(parts)):
        part = parts[i]
        for vi in part:
            qlist = human_test_dataset.video_index_to_questions_map[vi]
            for q in qlist:
                q["part"] = i + 1
            chosen_qs.extend(qlist)

    return parts, chosen_qs, split_setting, test_questions
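
# Hedged sketch of random_simulation_select, which is not shown in this file.
# Assumed behavior, inferred from the call site above: with a fixed seed, pick
# one not-yet-visited video index per simulation id so that each part covers
# every scene type at most once. The structure of sid_vi_q_map (simulation id
# -> video index -> questions) is also an assumption.
def random_simulation_select(seed: int, sid_vi_q_map: dict, visited: set) -> list:
    rnd = Random(seed)
    selected = []
    for sid in sorted(sid_vi_q_map.keys()):
        candidates = [vi for vi in sid_vi_q_map[sid] if vi not in visited]
        if not candidates:
            continue
        vi = rnd.choice(candidates)
        visited.add(vi)
        selected.append(vi)
    return selected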
    def cleanup(self):
        logger.info(f"Re-reading post-processed minimal dataset...")
        self.__dataset_obj = CRAFTDataset(
            self.__dataset_obj.dataset_folder_path,
            self.__dataset_obj.metadata)
    def process(self, dataset_obj: CRAFTDataset):
        logger.info("Initiating post process stage before balancing...")

        self.__dataset_obj = dataset_obj

        for i, instance_id in enumerate(
                sorted(dataset_obj.video_index_to_questions_map.keys())):
            question_list = dataset_obj.video_index_to_questions_map[
                instance_id]
            sid = int(question_list[0]["simulation_id"])

            annotations = FileIO.read_json(
                dataset_obj.get_simulation_with_variations_output_path(
                    sid, instance_id))
            objects_in_scene = annotations["original_video_output"][
                "scene_states"][0]["scene"]["objects"]
            dynamic_objects = [
                obj for obj in objects_in_scene if obj["bodyType"] == 2
            ]

            new_questions_list = []
            for question in question_list:
                # Postprocess Before Balancing 1: Do not ask shape if only one shape is present in the scene.
                answer_type = dataset_obj.get_answer_type_for_answer(
                    question["answer"])
                if answer_type == "Shape":
                    if len({obj["shape"] for obj in dynamic_objects}) <= 1:
                        # Remove the question that asks shape even though there's only one shape present
                        logger.info(
                            f"Question asks shape even though there's only 1 "
                            f"shape present in the scene. Removing {question['video_index']}/{question['question_index']}"
                        )
                        continue

                if "hexagon" in question["question"]:
                    logger.info(
                        f"Question asks about hexagons, which are not present in any of the videos. "
                        f"Removing {question['video_index']}/{question['question_index']}"
                    )
                    continue

                # Postprocess Before Balancing 2: Remove questions regarding collisions with the basket
                # to avoid ambiguity. Note that these are not yet removed from the question template
                # files in svqa/SVQA_1.0_templates. The following can be removed from post-processing once
                # they are removed from the question template files and the dataset is regenerated
                # according to the updated question templates.
                if question["template_id"] in [
                        "cause_2",
                        "cause_5",
                        "counterfactual_2",
                        "counterfactual_5",
                        "counterfactual_8",
                        "descriptive_12",
                        "descriptive_13",
                        "descriptive_14",
                        "descriptive_15",
                        "descriptive_20",
                        "descriptive_21",
                        "descriptive_30",
                        "descriptive_31",
                        "descriptive_36",
                        "descriptive_37",
                        "enable_2",
                        "enable_5",
                        "prevent_2",
                        "prevent_5",
                ]:
                    continue

                # Postprocess Before Balancing 3: Correct typos in the question templates.
                # These are also corrected in the question template files in svqa/SVQA_1.0_templates,
                # so the following can be deleted.
                if question["template_id"] == "counterfactual_2":
                    question_text: str = question["question"]
                    if question_text.startswith("Will"):
                        question_text = question_text.replace(
                            "the basket the", "the basket if the")
                        question_text = question_text.replace(
                            "the container the", "the container if the")
                        question_text = question_text.replace(
                            "the bucket the", "the bucket if the")
                        question["question"] = question_text

                if question["template_id"] in [
                        "prevent_0", "prevent_1", "prevent_2"
                ]:
                    question_text: str = question["question"]
                    if question_text.startswith("Is"):
                        question_text = question_text.replace(
                            "is prevented by", "prevented by")
                        question_text = question_text.replace(
                            "is kept by", "kept by")
                        question_text = question_text.replace(
                            "is held by", "held by")
                        question_text = question_text.replace(
                            "is blocked by", "blocked by")
                        question["question"] = question_text

                new_questions_list.append(question)

            question_list[:] = new_questions_list

            logger.info(
                f"Processed: {i}/{len(dataset_obj.video_index_to_questions_map.keys())}"
            )

        # Continue postprocessing before balancing here

        self.__rewrite_dataset()
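
# Illustrative check of the counterfactual_2 typo fix applied above, using a
# hypothetical question text (not taken from the dataset):
_sample = "Will the small circle enter the basket the big cube is removed?"
assert (_sample.replace("the basket the", "the basket if the") ==
        "Will the small circle enter the basket if the big cube is removed?")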
Example #12
def undersample(chosen_qs: list) -> list:
    # NOTE: only the loop below survived in the original snippet; the grouping
    # step, the grouping key and the random seed are assumed reconstructions.
    rnd = Random(42)
    qcat_qs_map = defaultdict(list)
    for q in chosen_qs:
        qcat_qs_map[q["template_id"].split("_")[0]].append(q)
    min_present_qcat = min(qcat_qs_map.items(), key=lambda kv: len(kv[1]))
    chosen_qs_qcat_balanced = []
    N = len(min_present_qcat[1])

    for qcat in qcat_qs_map:
        qs = list(qcat_qs_map[qcat])
        rnd.shuffle(qs)
        undersampled = qs[:N]
        chosen_qs_qcat_balanced.extend(undersampled)

    return chosen_qs_qcat_balanced


if __name__ == '__main__':
    output_folder_path = "./human_eval_CRAFT_10K_balanced"
    dataset_folder_path = "../../framework/out/CRAFT_10K"
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset(dataset_folder_path, metadata)

    os.makedirs(f"{output_folder_path}/", exist_ok=True)

    parts, chosen_qs, split_setting, test_questions = generate_random_parts(5)

    chosen_qs_qcat_balanced = undersample(chosen_qs)

    undersampled_human_test_dataset = CRAFTDataset(dataset_folder_path,
                                                   metadata,
                                                   load_immediately=False)
    undersampled_human_test_dataset.questions = chosen_qs_qcat_balanced
    undersampled_human_test_dataset.prepare_auxiliaries()
    undersampled_human_test_dataset.build_sid_vi_q_map()

    # NOTE: the second argument (the output path) was truncated in the original
    # snippet; the destination below is an assumed placeholder.
    FileIO.write_json(undersampled_human_test_dataset.questions,
                      f"{output_folder_path}/dataset_minimal.json")
def split_info(dataset: CRAFTDataset, split_name: str):
    # NOTE: the opening of this function was lost; this header is an assumed
    # reconstruction based on the split files written by generate_random_parts.
    split_info_json = FileIO.read_json(
        f"{dataset.dataset_folder_path}/split_info_{split_name}.json")
    split_to_vid = defaultdict(set)  # a set avoids double-counting videos
    for split, pairs in split_info_json.items():
        for pair in pairs:
            split_to_vid[split].add(pair["video_index"])

    print("Videos")
    for s in split_to_vid:
        print(s, len(split_to_vid[s]))

    print("Questions")
    question_count_per_split = defaultdict(int)
    for s in split_to_vid:
        for vi in split_to_vid[s]:
            question_count_per_split[s] += len(
                dataset.video_index_to_questions_map[vi])
    for s, c in question_count_per_split.items():
        print(s, c)


def proof_read():
    with open("./dataset_minimal.json", "r") as dataset_file:
        questions = json.load(dataset_file)

        for q in questions:
            if not os.path.isfile(q["video_file_path"]):
                print(q["video_file_path"], False)


if __name__ == '__main__':
    dataset_folder_path = "../../framework/out/CRAFT_10K"
    metadata = FileIO.read_json("../../svqa/metadata.json")
    dataset = CRAFTDataset(dataset_folder_path, metadata)

    print("Number of videos:",
          len(dataset.video_index_to_questions_map.keys()))
    split_info(dataset, "random")
    split_info(dataset, "hard")
Example #14
def start_experiment(dataset: CRAFTDataset):
    logger.info(f"Starting experiment with noise amount %{NOISE_AMOUNT * 100}")

    os.makedirs("./perturbed_outputs", exist_ok=True)
    os.makedirs("./perturbed_controllers", exist_ok=True)

    video_sid_set = set()
    for question in dataset.questions:
        video_index = question["video_index"]
        simulation_id = question["simulation_id"]
        video_sid_set.add((video_index, simulation_id))

    simulation_jobs = []
    simulation_args = []

    video_sid_set = list(video_sid_set)
    video_sid_set.sort(key=lambda x: x[0])

    # Perturbation of videos
    original_questions = []
    outputs = []
    for video_sid in video_sid_set:
        video_index = video_sid[0]
        simulation_id = video_sid[1]
        original_variations_output_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/{video_index:06d}.json"
        original_questions_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/qa_{video_index:06d}.json"
        old_controller_file_path = f"{dataset.intermediates_folder_path}/sid_{simulation_id}/debug/controller_{video_index:06d}.json"
        simulation_jobs.append(run_simulation_instance)
        simulation_args.append([
            original_variations_output_file_path, old_controller_file_path,
            simulation_id, video_index
        ])
        new_variations_output_file_path = f"./perturbed_outputs/variations_{simulation_id}_{video_index:06d}.json"
        outputs.append(
            (video_index, simulation_id, new_variations_output_file_path,
             original_questions_file_path,
             original_variations_output_file_path))
        original_questions.extend(dataset.get_questions_for_video(video_index))

    logger.info(f"{len(simulation_jobs)} simulations will be perturbed")
    parallel_worker = ParallelWorker(simulation_jobs, simulation_args, 4)
    parallel_worker.execute_all()

    question_ask_jobs = []
    question_ask_args = []

    # Regenerate answers for perturbed simulations
    qa_outputs = []
    for output in outputs:
        video_index = output[0]
        simulation_id = output[1]
        new_variations_output_file_path = output[2]
        original_questions_file_path = output[3]
        original_variations_output_file_path = output[4]
        new_perturbed_qa_file_path = f"./perturbed_outputs/qa_{video_index:06d}.json"
        question_ask_jobs.append(regenerate_answers)
        question_ask_args.append([
            original_variations_output_file_path,
            new_variations_output_file_path, original_questions_file_path,
            new_perturbed_qa_file_path, simulation_id, video_index
        ])
        qa_outputs.append(
            (video_index, simulation_id, new_perturbed_qa_file_path))

    logger.info(f"Asking questions for perturbed simulations")
    parallel_worker = ParallelWorker(question_ask_jobs, question_ask_args, 8)
    parallel_worker.execute_all()

    questions_perturbed = []
    for qa in qa_outputs:
        video_index = qa[0]
        simulation_id = qa[1]
        qa_file_path = qa[2]
        qa_file = FileIO.read_json(qa_file_path)
        questions_perturbed.extend(qa_file["questions"])

    logger.info(f"Measuring similarity, this might take a while...")
    data, orig_size, found, ratio = measure_similarity(original_questions,
                                                       questions_perturbed)
    logger.info(f"Number of questions from original simulations: {orig_size}")
    logger.info(
        f"Number of questions from perturbed simulations: {len(questions_perturbed)}"
    )
    logger.info(f"Number of perturbed counterparts: {found}")
    logger.info(f"Match ratio: {found / orig_size}")
    logger.info(f"Correctness: {ratio}")
    logger.info(f"Dumping analysis data...")
    FileIO.write_json(
        data, f"analysis_data_{datetime.now().strftime('%m%d%Y_%H%M')}.json")


if __name__ == '__main__':
    logger.add(f"perturbation_{datetime.now().strftime('%m%d%Y_%H%M')}.log")

    metadata = FileIO.read_json("../../svqa/metadata.json")

    logger.info(f"Reading the dataset...")
    dataset = CRAFTDataset(
        "D:\Library\Research\datasets\Dataset_3000_230920cpy\dataset.json",
        metadata)

    logger.info(
        f"{len(dataset.questions)} questions have been loaded into memory")

    start_experiment(dataset)