Beispiel #1
0
    def _gen_mnist(c_per):
        """Build a subsample of (anchor, near, far) triplets from digit distances.

        The pairwise distance matrix is loaded from ``mnist_saved_matrix.npy``
        when that file exists. Otherwise it is computed from ``STE_NUM_DIGITS``
        digits subsampled from ``read_mnist`` and saved under that name so the
        next call can reuse it.

        For every anchor index the 50 smallest-distance and the 50
        largest-distance indices of its row are combined pairwise. The anchor
        is not filtered out of its own row, so it can appear among its nearest
        indices. Each pair is emitted as ``[anchor, near, far]``, or swapped to
        ``[anchor, far, near]`` when ``rand() < c_per``.

        Args:
            c_per: Threshold compared against ``rand()`` for every triplet;
                presumably the fraction of triplets to contaminate (verify
                against callers).

        Returns:
            The result of ``subsample(constraints, 3000)``.
        """
        constraints = []

        if os.path.exists("mnist_saved_matrix.npy"):
            distance_matrix = np.load("mnist_saved_matrix.npy")
        else:
            x_test, _y_test, _class_distribution = read_mnist(subsample=False)

            # Subsample STE_NUM_DIGITS digits
            subsample_idxs = subsample(range(len(x_test)), STE_NUM_DIGITS)
            subsampled_x = [x_test[idx] for idx in subsample_idxs]

            # Create the distance matrix
            distance_matrix = np.zeros((STE_NUM_DIGITS, STE_NUM_DIGITS))

            for i in tqdm(range(len(subsampled_x)), desc="Distance Generation: ", position=BAR_POSITION_OFFSET + 1,
                          leave=False):
                for j in range(len(subsampled_x)):
                    distance_matrix[i, j] = np.linalg.norm(subsampled_x[i] - subsampled_x[j], ord=2)

            # np.save appends ".npy", which is the name checked above.
            np.save("mnist_saved_matrix", distance_matrix)

        for anchor in tqdm(range(STE_NUM_DIGITS), desc="Triplet Generation : ", position=BAR_POSITION_OFFSET + 2,
                           leave=False):
            row = np.ravel(distance_matrix[anchor, :])
            candidates = range(len(row))

            # Both rankings depend only on the anchor, so they are computed
            # once here instead of once per close index.
            closest_indices = sorted(candidates, key=lambda idx: row[idx])[:50]
            farthest_indices = sorted(candidates, key=lambda idx: row[idx], reverse=True)[:50]

            for close_index in closest_indices:
                for far_index in farthest_indices:
                    if rand() >= c_per:
                        constraints.append([anchor, close_index, far_index])
                    else:
                        # Contaminated triplet: near and far are swapped.
                        constraints.append([anchor, far_index, close_index])

        # Subsample again to reduce the number of constraints
        return subsample(constraints, 3000)
Beispiel #2
0
def _create_sin(contamination_percentage):
    """Sample a noisy sine curve and return a subsample of its triplets.

    ``ROE_SAMPLES`` evenly spaced x values in [0, 1] are mapped to
    ``3 * sin(20 * x)`` plus uniform noise in [0, 2). The pairwise Euclidean
    distances between the resulting 2-D points are handed to
    ``format_triplets_from_distance``.

    Args:
        contamination_percentage: Forwarded as ``poison_perc`` to
            ``format_triplets_from_distance``.

    Returns:
        The result of ``subsample(constraints, 3000)``.
    """
    xs = np.linspace(0, 1, ROE_SAMPLES)
    ys = 3 * np.sin(20 * xs) + np.random.rand(ROE_SAMPLES) * 2
    # Row 0 holds the x coordinates, row 1 the y coordinates.
    points = np.array([xs, ys])

    n_points = points.shape[1]
    pairwise = np.zeros((n_points, n_points))
    for row in tqdm(range(n_points), desc="Distance Generation: ", leave=False):
        for col in range(n_points):
            pairwise[row, col] = np.linalg.norm(points[:, row] - points[:, col], ord=2)

    triplets = format_triplets_from_distance(pairwise, poison_perc=contamination_percentage)
    return subsample(triplets, 3000)
Beispiel #3
0
def _create_n_density_squares(contamination_percentage):
    """Generate three square clusters of different density and their triplets.

    Each cluster holds ``int(ROE_SAMPLES / 3)`` uniformly drawn 2-D points:
    one in [0, 0.25)^2, one in [0.5, 0.5 + 1/3)^2 and one in [1, 1.5)^2.
    The pairwise Euclidean distances between all points are handed to
    ``format_triplets_from_distance``.

    The distance matrix is sized from the generated dataset rather than from
    ``ROE_SAMPLES``: when ``ROE_SAMPLES`` is not divisible by 3 the dataset
    has fewer than ``ROE_SAMPLES`` rows, and indexing it up to
    ``ROE_SAMPLES`` would raise ``IndexError``.

    Args:
        contamination_percentage: Forwarded as ``poison_perc`` to
            ``format_triplets_from_distance``.

    Returns:
        The result of ``subsample(constraints, 3000)``.
    """
    points_per_cluster = int(ROE_SAMPLES / 3)
    close_to_zero = np.random.rand(points_per_cluster, 2) / 4
    mid_from_zero = np.random.rand(points_per_cluster, 2) / 3 + 0.5
    far_from_zero = np.random.rand(points_per_cluster, 2) / 2 + 1
    dataset = np.concatenate((close_to_zero, mid_from_zero, far_from_zero))

    # Use the real number of generated points (3 * points_per_cluster).
    n_points = len(dataset)
    distance_matrix = np.zeros((n_points, n_points))

    for i in tqdm(range(n_points), desc="Distance Generation: ", leave=False):
        for j in range(n_points):
            distance_matrix[i, j] = np.linalg.norm(dataset[i, :] - dataset[j, :], ord=2)

    constraints = format_triplets_from_distance(distance_matrix, poison_perc=contamination_percentage)
    return subsample(constraints, 3000)
Beispiel #4
0
    def _create_dd_squares(contamination_percentage):
        """Generate two overlapping squares of points and return their triplets.

        ``outer_density`` (taken from the enclosing scope) splits
        ``2 * ROE_SAMPLES`` points between an outer square, drawn uniformly
        from [-1, 1)^2, and an inner square, drawn uniformly from
        [-0.5, 0.5)^2. Pairwise Euclidean distances between all points are
        handed to ``format_triplets_from_distance``.

        Args:
            contamination_percentage: Accepted but not used; triplets are
                always generated with ``poison_perc=0``.
                NOTE(review): confirm that ignoring this argument is
                intentional and not a leftover debugging override.

        Returns:
            The result of ``subsample(constraints, 3000)``.
        """
        inner_count = int((1 - outer_density) * ROE_SAMPLES * 2)
        outer_count = int(outer_density * ROE_SAMPLES * 2)

        # The outer square is drawn first, then the inner one.
        outer_square = np.random.rand(outer_count, 2) * 2 - 1
        inner_square = np.random.rand(inner_count, 2) - 0.5
        points = np.concatenate((outer_square, inner_square))

        total = inner_count + outer_count
        pairwise = np.zeros((total, total))
        for row in tqdm(range(total), desc="Distance Generation: ", leave=False):
            for col in range(total):
                pairwise[row, col] = np.linalg.norm(points[row, :] - points[col, :], ord=2)

        # Contamination is fixed at zero regardless of the argument.
        triplets = format_triplets_from_distance(pairwise, poison_perc=0)
        return subsample(triplets, 3000)
Beispiel #5
0
def format_ml_dataset(x, y, using="features", dataset_name="None", subsample_factor=0.0):
    """Turn a labelled dataset into a list of ``[anchor, near, far]`` triplets.

    Args:
        x: Indexable collection of feature vectors; only read when
            ``using == "features"``.
        y: Iterable of labels; only read when ``using == "labels"``.
        using: ``"features"`` derives triplets from pairwise Euclidean
            distances, pairing each sample's 50 smallest-distance indices with
            its 50 largest-distance indices (the sample itself is not
            excluded from its own row). ``"labels"`` emits every ``[i, j, k]``
            of distinct indices where ``y[i] == y[j]`` and ``y[i] != y[k]``,
            then tries to subsample the list to ``len(y) * 70 * 70`` entries.
            Any other value yields an empty list.
        dataset_name: Upper-cased and used as the progress bar prefix.
        subsample_factor: Currently unused.

    Returns:
        The list of triplets (subsampled in the ``"labels"`` case when
        ``subsample`` succeeds).
    """
    constraints = []

    if using == "features":
        n_samples = len(x)
        distance_matrix = np.zeros((n_samples, n_samples))
        for i in tqdm(range(n_samples), desc=f"[{dataset_name.upper()}]Distance Generation: ", leave=False):
            for j in range(n_samples):
                distance_matrix[i, j] = np.linalg.norm(x[i] - x[j], ord=2)

        for anchor in tqdm(range(n_samples), desc=f"[{dataset_name.upper()}]Triplet Generation : ",
                           position=BAR_POSITION_OFFSET + 2,
                           leave=False):
            row = np.ravel(distance_matrix[anchor, :])
            candidates = range(len(row))

            # Both rankings depend only on the anchor, so they are computed
            # once here instead of once per close index.
            closest_indices = sorted(candidates, key=lambda idx: row[idx])[:50]
            farthest_indices = sorted(candidates, key=lambda idx: row[idx], reverse=True)[:50]

            for close_index in closest_indices:
                for far_index in farthest_indices:
                    constraints.append([anchor, close_index, far_index])

    elif using == "labels":
        for i, el_1 in tqdm(enumerate(y), desc=f"[{dataset_name.upper()}]Triplet Generation : "):
            for j, el_2 in enumerate(y):
                # Skip the whole inner scan when j cannot be the "near" item.
                if j == i or not (el_1 == el_2):
                    continue
                for k, el_3 in enumerate(y):
                    if k != i and k != j and el_1 != el_3:
                        constraints.append([i, j, k])

        try:
            constraints = subsample(constraints, len(y) * 70 * 70)
        except Exception:
            # Best effort: if subsampling is not possible (presumably because
            # fewer triplets exist than requested), keep the full list.
            # Catching Exception rather than everything lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    return constraints
Beispiel #6
0
def _load_random_triplets(path):
    """Read ``i,j,k`` lines from *path* into a list of ``[i, j, k]`` int lists."""
    triplets = []
    with open(path) as random_ds:
        for line in random_ds:
            if not line.strip():
                # Tolerate a trailing blank line instead of failing on int("").
                continue
            i, j, k = (int(value) for value in line.split(","))
            triplets.append([i, j, k])
    return triplets


def _save_random_triplets(path, triplets):
    """Write *triplets* to *path* as ``i,j,k`` lines without a trailing newline."""
    # Create the cache directory so a finished computation is not lost to a
    # FileNotFoundError on the first run.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as random_ds:
        random_ds.write("\n".join(f"{i},{j},{k}" for i, j, k in triplets))


def create_random_dataset(contamination_percentage=CONTAMINATION_PERCENTAGE, sparsify=False):
    """Return triplets for a random dataset, cached per contamination level.

    Lookup order:

    1. If ``./datasets/random/random-{contamination_percentage}.txt`` exists,
       its triplets are returned as they are.
    2. Otherwise, if ``./datasets/random/random-0.0.txt`` exists, its triplets
       are contaminated, written to the file from step 1 and returned.
    3. Otherwise ``ROE_SAMPLES`` random points are generated, each point's 50
       smallest-distance indices are paired with its 50 largest-distance
       indices (the point itself is not excluded), the triplets are
       contaminated, subsampled to 3000 and written to the file from step 1.

    Contaminating a triplet means swapping its last two entries; a triplet is
    contaminated when ``rand() < contamination_percentage``.

    Args:
        contamination_percentage: Threshold compared against ``rand()`` for
            every triplet; also part of the cache file name.
        sparsify: Only honoured in step 3. When true, the subsampled triplets
            are passed to ``sparsify_instance`` and that result is returned
            directly, without being cached (its shape is defined by
            ``sparsify_instance`` - verify against callers).

    Returns:
        ``(triplets, ROE_SAMPLES)``, or the result of ``sparsify_instance``
        in the sparsify case described above.
    """
    target_path = f"./datasets/random/random-{contamination_percentage}.txt"
    clean_path = "./datasets/random/random-0.0.txt"

    # First look up whether a dataset with that contamination already exists.
    if os.path.exists(target_path):
        print("Using old dataset", file=sys.stderr)
        return _load_random_triplets(target_path), ROE_SAMPLES

    # Next, derive it from the stored uncontaminated dataset if there is one.
    if os.path.exists(clean_path):
        print("Using old dataset", file=sys.stderr)
        constraints = [
            [i, j, k] if rand() >= contamination_percentage else [i, k, j]
            for i, j, k in _load_random_triplets(clean_path)
        ]
        _save_random_triplets(target_path, constraints)
        return constraints, ROE_SAMPLES

    # Nothing cached: create the dataset from scratch.
    print("Creating dataset", file=sys.stderr)
    dataset = [np.random.rand(1, 10) * 1 / 20 for _ in range(ROE_SAMPLES)]
    n_points = len(dataset)
    distance_matrix = np.zeros((n_points, n_points))
    constraints = []

    for i in tqdm(range(n_points), desc="Distance Generation: ", leave=False):
        for j in range(n_points):
            distance_matrix[i, j] = np.linalg.norm(dataset[i] - dataset[j], ord=2)

    for anchor in tqdm(range(n_points), desc="Triplet Generation : ", leave=False):
        row = np.ravel(distance_matrix[anchor, :])
        candidates = range(len(row))

        # Both rankings depend only on the anchor, so they are computed once
        # here instead of once per close index.
        closest_indices = sorted(candidates, key=lambda idx: row[idx])[:50]
        farthest_indices = sorted(candidates, key=lambda idx: row[idx], reverse=True)[:50]

        for close_index in closest_indices:
            for far_index in farthest_indices:
                if rand() >= contamination_percentage:
                    constraints.append([anchor, close_index, far_index])
                else:
                    # Always swap, as the cached-file branch does; a random
                    # permutation would leave about half of the selected
                    # triplets unchanged.
                    constraints.append([anchor, far_index, close_index])

    subsampled_constraints = subsample(constraints, 3000)

    if sparsify:
        return sparsify_instance(subsampled_constraints)

    # Save it as a file for later calls.
    _save_random_triplets(target_path, subsampled_constraints)
    return subsampled_constraints, ROE_SAMPLES