# Example 1
def generate_test(output_dir, image_count, min_shapes=7, max_shapes=15):
    """Generate `image_count` random test images and their questions.

    Each image gets a random two-color 500x500 background and a random
    number of shapes (between min_shapes and max_shapes, inclusive), each
    colored differently from the background. Images are saved under
    <output_dir>/images_test and the accumulated questions are written to
    <output_dir>/questions_test.json.
    """
    collected = []
    for index in range(image_count):
        background = Background(ColorPool.default().random(count=2),
                                (500, 500), "random")
        image = Image(background, index)
        shape_total = np.random.randint(min_shapes, max_shapes + 1)
        for _ in range(shape_total):
            # pick a random shape, filled with a color not used by the bg
            fill = ColorPool.default().random(background.colors)
            image.add_shape(ShapePool.default().random().new(color=fill))
        image.save(os.path.join(output_dir, "images_test"))
        generated = QuestionsGroup().generate_all(image)
        show_progress(index, image_count, prefix="Test Generation")
        collected.extend(generated)

    print()
    output_path = os.path.join(output_dir, "questions_test.json")
    with open(output_path, "w") as out_file:
        json.dump(collected, out_file)
        print("Test data file:", out_file.name)
# Example 2
def _group_by_answer(questions):
    """Bucket question dicts by the first element of their "answer" field."""
    table = {}
    for question in questions:
        table.setdefault(question["answer"][0], []).append(question)
    return table


def generate_train_validation(output_dir,
                              image_repeats=1,
                              balance_approach="group",
                              validation_split_ratio=0.01):
    """Generate train/validation images and balanced question files.

    For every (bg color, shape, shape color, repeat) combination — skipping
    combinations where background and shape colors coincide — renders an
    empty image and a one-shape image to <output_dir>/images and generates
    questions for both. The question pool is then balanced according to
    `balance_approach`:

      * "group" / "categorical": trim every answer within a group
        (colors, shapes) down to that group's smallest answer count.
      * "blindfold": trim every answer down to the globally smallest count.
      * "none": keep all questions.

    The balanced pool is finally split into questions_train.json and
    questions_validation.json under `output_dir`.

    Raises:
        Exception: if `balance_approach` is not one of the values above.
    """
    # generate data
    print()
    all_data = []
    all_space = pool_iteration("colors", "shapes", "colors", image_repeats)
    color_count = ColorPool.default().colors_len()
    total_iterations = (color_count * ShapePool.default().shapes_len() *
                        color_count * image_repeats)
    for i, (bg_color, shape, shape_color, image_index) in enumerate(all_space):
        # a shape the same color as its background would be invisible
        if bg_color == shape_color:
            continue

        # generate invalid (empty) image and its questions
        bg = Background(bg_color, (50, 50))
        image = Image(bg)
        image.save(os.path.join(output_dir, "images"))
        all_data.extend(QuestionsGroup().generate_all(image))

        # generate valid (one-shape) image and its questions; a distinct
        # image index is only needed when the space is iterated repeatedly
        bg = Background(bg_color, (50, 50))
        image = Image(bg, image_index + 1) if image_repeats > 1 else Image(bg)
        image.add_shape(shape.new(shape_color))
        image.save(os.path.join(output_dir, "images"))
        all_data.extend(QuestionsGroup().generate_all(image))
        show_progress(i, total_iterations, prefix="Train Generation")

    print()
    print("Total generated questions:", len(all_data))

    # balance data
    random.shuffle(all_data)
    data_table = _group_by_answer(all_data)
    answer_count = {k: len(v) for k, v in data_table.items()}

    print("Number of each answer:")
    for answer, count in answer_count.items():
        print(answer, count)

    balanced_data = []

    # NOTE(review): "group" (the declared default) previously matched no
    # branch and always raised; it is treated as an alias of "categorical".
    if balance_approach in ("group", "categorical"):
        data_groups = {
            "colors":
            [c.name for c in pool_iteration("colors")] + ["unknown_color"],
            "shapes":
            [s.name for s in pool_iteration("shapes")] + ["unknown_shape"]
        }

        for gname, glist in data_groups.items():
            print(gname, glist)

        print("Group minimums:")
        for gname, glist in data_groups.items():
            # smallest count across the group's answers; 0 if any is absent
            group_min = min(answer_count.get(a, 0) for a in glist)
            print(gname, group_min)
            if group_min == 0:
                # an unrepresented answer makes the group unbalanceable
                continue
            for a in glist:
                balanced_data.extend(data_table[a][:group_min])

    elif balance_approach == "blindfold":
        answers_min_count = min(answer_count.values())
        for qlist in data_table.values():
            balanced_data.extend(qlist[:answers_min_count])

    elif balance_approach == "none":
        balanced_data = all_data

    else:
        raise Exception(
            f"{balance_approach} is not a valid data balance approach")

    print("Total number of questions after balancing", len(balanced_data))
    data_table = _group_by_answer(balanced_data)

    print("Number of each answer after balancing:")
    for answer, qlist in data_table.items():
        print(answer, len(qlist))

    # splitting into train and validation sets
    from sklearn.model_selection import train_test_split
    data_train, data_validation = train_test_split(
        balanced_data, test_size=validation_split_ratio)
    print("Total number of train data", len(data_train))
    print("Total number of validation data", len(data_validation))

    with open(os.path.join(output_dir, "questions_train.json"), "w") as file:
        json.dump(data_train, file)
        print("Train data file:", file.name)

    with open(os.path.join(output_dir, "questions_validation.json"),
              "w") as file:
        json.dump(data_validation, file)
        print("Validation data file:", file.name)