def main():
    delimiter = ' '
    data_filename_location = '../Data/SIFT/Samples/SIFT__ss-5000__1.train'
    data_train = np.genfromtxt(data_filename_location,
                               delimiter=delimiter,
                               dtype=float)  # np.float was removed in recent NumPy
    print("DONE reading training set")

    data_train_norm = normalize_data(data_train)
    print("DONE normalizing training set")

    X_std = data_train_norm

    # -- Step 2.1: Do Covariance Matrix -- #
    mean_vec = np.mean(X_std, axis=0)
    # cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0] - 1)
    # print('Covariance matrix \n%s' % cov_mat)
    # print('np covariance matrix: \n%s' % np.cov(X_std.T))

    # -- Step 2.2: Do eigendecomposition on the covariance matrix -- #
    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' % eig_vecs)
    print('\nEigenvalues \n%s' % eig_vals)

    # -- 4. Calculate Variance and Plot it for #n principal components -- #
    show_pc_variance_plot(eig_vals, 65)
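
# NOTE: `normalize_data` and `show_pc_variance_plot` are not defined in this snippet.
# Below is a minimal sketch of what they might look like, assuming min-max scaling into
# [0, 1] and a cumulative explained-variance plot; the exact behavior is an assumption.
import numpy as np
import matplotlib.pyplot as plt


def normalize_data(data):
    # Min-max scale each column into [0, 1]; constant columns are left at 0.
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0
    return (data - col_min) / col_range


def show_pc_variance_plot(eig_vals, n_components):
    # Plot individual and cumulative explained variance (%) for the top n_components.
    total = np.sum(eig_vals)
    var_exp = np.sort(100.0 * eig_vals / total)[::-1][:n_components]
    cum_var_exp = np.cumsum(var_exp)
    plt.bar(range(1, len(var_exp) + 1), var_exp, label='individual explained variance')
    plt.step(range(1, len(var_exp) + 1), cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.xlabel('Principal component index')
    plt.ylabel('Explained variance (%)')
    plt.legend(loc='best')
    plt.show()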
    def load_train_data(self, data_iter: Sequence[str]):
        # Each row is a comma-separated string; the last value is the outcome label.
        # typing.Sequence (rather than Iterable) because len() and indexing are used.
        n_rows = len(data_iter)
        n_columns = len(data_iter[0].split(","))
        self.features = np.empty((n_rows, n_columns - 1))
        for idx, row in enumerate(data_iter):
            values = row.split(",")
            self.features[idx] = np.array([float(val) for val in values[:-1]])
            self.outcomes.append(values[-1])
        self.features = normalize_data(self.features)
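
# NOTE: `load_train_data` is a method of a larger class; `self.outcomes`, `self.k_neighbors`
# and `self._calculate_distance` (used by the `predict` method further below) are defined
# elsewhere. A minimal, hypothetical skeleton of that k-NN classifier follows; the class
# name, the defaults and the Euclidean distance are assumptions, not the original code.
import numpy as np


class KNNClassifier:
    def __init__(self, k_neighbors=5):
        self.k_neighbors = k_neighbors    # neighbours considered by predict()
        self.features = np.empty((0, 0))  # filled in by load_train_data()
        self.outcomes = []                # class labels, appended row by row

    @staticmethod
    def _calculate_distance(point_a, point_b):
        # Plain Euclidean distance between two feature vectors.
        return np.linalg.norm(point_a - point_b)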
def main():
    # -- Experiment Info: description && motivation && data -- #
    print(
        "# -- This checks whether the data, after PCA, looks uniformly distributed along the new axes/PCs. -- #"
    )

    # -- Read original data -- #
    # delimiter = ' '
    # data_filename_location = "../Data/Handmade/h3.train"
    # data_train_original = np.genfromtxt(data_filename_location, delimiter=delimiter, dtype=float)
    # data_train_original_norm = normalize_data(data_train_original)

    rng = np.random.RandomState(1)
    data_train_original = np.dot(rng.rand(2, 2), rng.randn(2, 1000)).T
    data_train_original_norm = normalize_data(data_train_original)

    # print(data_train_original_norm)

    # -- Plot 2D data before PCA-ing it -- #

    sns.set(style="darkgrid", color_codes=True, font_scale=1.5)
    data_train_original_norm_2D = data_train_original_norm[:, 0:2].copy()
    # print("# -- data_train_original_norm_2D -- #")
    # print(data_train_original_norm_2D)

    # -- Try stuff from: https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html -- #
    # - 1. Get the data - #
    X = data_train_original_norm_2D
    # plt.scatter(X[:, 0], X[:, 1])
    # plt.axis('equal');
    # plt.show()

    plot_pca_to_show_variance_introduction(X)

    # -- PCA as dimensionality reduction -- #
    pca = PCA(n_components=1)
    pca.fit(X)
    X_pca = pca.transform(X)
    print("original shape:   ", X.shape)
    print("transformed shape:", X_pca.shape)

    print("X: ", X)
    print("X_pca: ", X_pca)
    X_new = pca.inverse_transform(X_pca)
    print("X_new: ", X_new)

    plt.scatter(X[:, 0], X[:, 1], alpha=0.5, color="#55A868", s=30)
    # plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=1)
    plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.9, color="#C44F53", s=25)
    plt.axis('equal')
    plt.xlabel("Dimension d = 0")
    plt.ylabel("Dimension d = 1")
    plt.show()
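
# NOTE: `plot_pca_to_show_variance_introduction` is not defined here. A rough sketch,
# assuming it follows the referenced PythonDataScienceHandbook example: scatter the 2D
# data and draw the principal axes scaled by their explained variance (styling is a guess).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


def plot_pca_to_show_variance_introduction(X):
    pca = PCA(n_components=2)
    pca.fit(X)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.3)
    for length, vector in zip(pca.explained_variance_, pca.components_):
        # Draw an arrow from the data mean along each principal axis.
        v = vector * 3 * np.sqrt(length)
        plt.annotate('', xy=pca.mean_ + v, xytext=pca.mean_,
                     arrowprops=dict(arrowstyle='->', linewidth=2))
    plt.axis('equal')
    plt.show()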
def visualize_PCAed_data_more_versions(dataset_blobs_norm):
    plot_2D(dataset_blobs_norm)
    # -- PCA from 2D -> 1D -- #
    pca = PCA(n_components=1)
    dataset_blobs_norm_PCAed = pca.fit_transform(dataset_blobs_norm)

    zeros_col = np.zeros((len(dataset_blobs_norm_PCAed), 1))
    dataset_blobs_norm_PCAed_with_zeros_col = np.hstack([
        dataset_blobs_norm_PCAed,
        zeros_col
    ])
    print(dataset_blobs_norm_PCAed_with_zeros_col)
    plot_2D(normalize_data(dataset_blobs_norm_PCAed_with_zeros_col))

    # -- PCA from 2D -- #
    pca = PCA(n_components=2)
    dataset_blobs_norm_PCAed_2D = pca.fit_transform(dataset_blobs_norm)
    plot_2D(normalize_data(dataset_blobs_norm_PCAed_2D))
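
# NOTE: `plot_2D` is not defined in this snippet; a minimal sketch of the assumed behavior
# (scatter the first two columns with equal axis scaling):
import matplotlib.pyplot as plt


def plot_2D(data):
    plt.scatter(data[:, 0], data[:, 1], alpha=0.5, s=20)
    plt.axis('equal')
    plt.show()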
def main():
    # -- Experiment Info: description && motivation && data -- #
    print("# -- This is a small experiment on a 3D -> 2D dataset, such that one dimension gets gradually squeezed. -- #")
    print("# -- The plan is to see how modes get affected, as well as how omega_zero reacts -- #")
    print("# -- The starting point targets dataset from file => \"Data/Random/Small_Exp_Modes/blobs_n-100_d-3_blobs-1_seed-1.train\". -- #")


    # -- Read original data -- #
    delimiter = ' '
    data_filename_location = "../../Data/Random/Small_Exp_Modes/blobs_n-100_d-3_blobs-1_seed-1.train"
    data_train_original = np.genfromtxt(data_filename_location, delimiter=delimiter, dtype=float)
    data_train_original_norm = normalize_data(data_train_original)

    # data_train_original_norm__squeezed_1 = data_train_original_norm.copy()
    # data_train_original_norm__squeezed_1[:, 0] *= 390.0
    # data_train_original_norm__squeezed_1[:, 1] = 1
    # data_train_original_norm__squeezed_1[:, 2] = 0.5
    #
    # data_train_original_norm__squeezed_2 = data_train_original_norm.copy()
    # data_train_original_norm__squeezed_2[:, 0] *= 0.9
    # data_train_original_norm__squeezed_2[:, 1] *= 0.1
    # data_train_original_norm__squeezed_2[:, 2] *= 0.5

    # -- Make a simpler example, from 2D -> 2D -- #
    data_train_original_norm__v1 = data_train_original_norm[:, 0:2].copy()
    data_train_original_norm__v2 = data_train_original_norm[:, 0:2].copy()

    # data_train_original_norm__v2[:, 0] *= 0.1
    data_train_original_norm__v2[:, 0] *= 1e-8

    plot_3subfigs_2D(data_train_original_norm__v1, data_train_original_norm__v2, data_train_original_norm__v1)

    # -- Generate gradually squeezed data and plot all variants in 3D -- #
    # plot_3subfigs_3D(data_train_original_norm, data_train_original_norm__squeezed_1, data_train_original_norm__squeezed_2)
    # plot_3subfigs_2D(data_train_original_norm, data_train_original_norm__squeezed_1, data_train_original_norm__squeezed_2)

    # print(data_train_original_norm[0])
    # print(data_train_original_norm__squeezed_1[0])
    # print(data_train_original_norm__squeezed_2[0])

    # -- Save all the variants (3, so far) to files so I can train on them afterwards -- #
    # output_destination = "../../Data/Random/Small_Exp_Modes/blobs_n-100_d-3_blobs-1_seed-1"
    # save_variants_to_files(output_destination, data_train_original_norm, data_train_original_norm__squeezed_1, data_train_original_norm__squeezed_2)

    output_destination = "../../Data/Random/Small_Exp_Modes/blobs_n-100_d-2_blobs-1_seed-1"
    np.savetxt(output_destination + "_unsqueezed.train", data_train_original_norm__v1, delimiter=' ')
    np.savetxt(output_destination + "_squeezed.train", data_train_original_norm__v2, delimiter=' ')
def main():
    # -- Experiment Info: description && motivation && data -- #
    print(
        "# -- This is a small experiment 2D generated/artificial dataset which should contain a blob and an outlier. -- #"
    )
    print(
        "# -- The plan is to see how PCAed data looks and also test SH on it by encoding to 2 bits. -- #"
    )

    # -- Step 1: Create 2D blob-- #
    n_samples = 1000
    n_dimensions = 2
    blobs_std = 0.5
    blobs = 1
    seed = 1
    # `datasets.samples_generator` was removed in newer scikit-learn; make_blobs is in sklearn.datasets
    dataset_blobs, color = datasets.make_blobs(
        n_samples=n_samples,
        n_features=n_dimensions,
        cluster_std=blobs_std,
        centers=blobs,
        shuffle=True,
        random_state=seed)

    print("dataset_blobs => ")
    print(dataset_blobs)
    dataset_blobs_for_rotation = dataset_blobs.copy()
    # -- Step 2: Add outlier to the data -- #
    outlier_1 = [10, 10]
    dataset_blobs = np.vstack([dataset_blobs.copy(), outlier_1])
    # print("dataset_blobs with outlier => ")
    # print(dataset_blobs)

    dataset_blobs_norm = normalize_data(dataset_blobs)
    output_file = "../../Results/Experiments/Small_Exp_Outliers/blob_outlier_1.data"
    # np.savetxt(output_file, dataset_blobs, delimiter=' ')

    # -- *** See what PC axes PCA picks in 2 cases: 2D -> 2D and 2D -> 1D *** -- #
    # visualize_PCAed_data_more_versions(dataset_blobs_norm.copy())

    # -- *** See what PC axes PCA picks in 2 cases: 2D -> 2D and 2D -> 1D, in case I rotate that blob *** -- #
    # (Point): If I increase the number of points in the blob, the PC axis will rotate as well, until at some point the outlier won't even matter anymore, because it will itself get projected onto the chosen PC axis.
    # visualize_PC_1D_axis(dataset_blobs_norm.copy())

    # -- *** See in case 2D -> 1D how PC axis is chosen *** -- #
    # (Point): The more round the blob is, the farther apart the outlier and the blob will be.
    dataset_blobs_outlier_rotated = visualize_PC_1D_axis_with_rotated_blob(
        dataset_blobs_for_rotation.copy(), outlier_1)
def visualize_PC_1D_axis_with_rotated_blob(dataset_blobs_for_rotation,
                                           outlier_1):
    # for angle = 243., the created blob with 1000 samples is perfectly round (or 244. and 10 000 samples)
    # for angle = 13., the created blob with 10 000 samples projects the outlier on the PC axis

    dataset_blobs_rotated = rotate_data(dataset_blobs_for_rotation.copy(),
                                        243.)
    dataset_blobs_outlier_rotated = np.vstack(
        [dataset_blobs_rotated.copy(), outlier_1])
    dataset_blobs_outlier_rotated_norm = normalize_data(
        dataset_blobs_outlier_rotated)
    print(dataset_blobs_outlier_rotated_norm.shape)

    plot_2D(dataset_blobs_outlier_rotated_norm)
    visualize_PC_1D_axis(dataset_blobs_outlier_rotated_norm.copy())

    return dataset_blobs_outlier_rotated
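
# NOTE: `rotate_data` is not defined in this snippet. A minimal sketch assuming it rotates
# 2D points counter-clockwise around the origin by an angle given in degrees:
import numpy as np


def rotate_data(data, angle_degrees):
    theta = np.radians(angle_degrees)
    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta)],
                                [np.sin(theta),  np.cos(theta)]])
    return data.dot(rotation_matrix.T)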
def make_testing_file_for_profi_vs_blob_outlier(n_samples, n_dimensions,
                                                blobs):
    # `datasets.samples_generator` was removed in newer scikit-learn; make_blobs is in sklearn.datasets
    testing_data, color_testing = datasets.make_blobs(
        n_samples=int(n_samples / 10),
        n_features=n_dimensions,
        cluster_std=1,
        centers=blobs,
        shuffle=True,
        random_state=13)

    testing_data_rotated = rotate_data(testing_data.copy(), 100.)

    print(testing_data_rotated.shape)
    plot_2D(normalize_data(testing_data_rotated))

    output_file = "../../Results/Experiments/Small_Exp_Outliers/blob-outlier_testing-on-1000-perfectly-round_ss=100.test"
    np.savetxt(output_file, testing_data_rotated, delimiter=' ')
def main():
    """Create the plot"""
    args = read_args()
    compressor_file = args.compressor
    compressor = importlib.import_module(compressor_file)
    model = pickle.load(open(args.model, 'rb'))
    input_data = np.genfromtxt(args.input, delimiter=' ', dtype=float)

    principal_components = model.pc_from_training
    input_data_norm = normalize_data(input_data)
    _, hash_codes = compressor.compress(input_data_norm, model,
                                        model.training_filename)

    # Split depending on assigned hash values
    hash_buckets = [[] for _ in range(0, 2**model.n_bits)]
    for point, hash_val in zip(input_data_norm, hash_codes[:, 0]):
        hash_buckets[hash_val].append(point)

    # Plot differently colored points, depending on hash
    legend_handles = []
    for hash_val, points in enumerate(hash_buckets):
        x_coords = list(map(lambda x: x[0], points))
        y_coords = list(map(lambda x: x[1], points))
        points, = plt.plot(x_coords,
                           y_coords,
                           '.',
                           label=to_binary(hash_val, model.n_bits))
        legend_handles.append(points)

    # Plot principal components
    for principal_component in principal_components:
        # Eigenvectors were flipped during training
        plt.plot([0.5, 0.5 + principal_component[1] / 2],
                 [0.5, 0.5 + principal_component[0] / 2], 'r')

    # Show legend outside to the right
    plt.legend(bbox_to_anchor=(1.04, 0.5),
               loc="center left",
               handles=legend_handles)
    # Ensure there is enough space for the legend
    plt.subplots_adjust(right=0.8)
    plt.show()
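
# NOTE: `to_binary` is not shown; a minimal sketch that formats a hash value as a
# zero-padded bit string for the legend (assumed behavior):
def to_binary(value, n_bits):
    return format(value, '0{}b'.format(n_bits))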
    def predict(self, test_data: Sequence[str]):
        # Each test row is a comma-separated string; the last value (label column) is ignored.
        n_rows = len(test_data)
        n_columns = len(test_data[0].split(","))
        test_matrix = np.empty((n_rows, n_columns - 1))
        for idx, row in enumerate(test_data):
            values = row.split(",")
            test_matrix[idx] = np.array([float(val) for val in values[:-1]])
        test_matrix = normalize_data(test_matrix)

        predictions = np.empty(len(test_data))
        for test_idx, test_row in enumerate(test_matrix):
            distances = [(train_idx, self._calculate_distance(data_point, test_row))
                         for train_idx, data_point in enumerate(self.features)]
            distances.sort(key=itemgetter(1))
            k_outcomes = [
                self.outcomes[train_idx]
                for train_idx, _ in distances[:self.k_neighbors]
            ]
            outcome_counts = Counter(k_outcomes)
            # Majority vote among the k nearest neighbours.
            predictions[test_idx] = max(outcome_counts, key=outcome_counts.get)
        return predictions
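
# Hypothetical usage of the two methods above, assuming the `KNNClassifier` skeleton
# sketched earlier and comma-separated rows whose last column is the class label:
if __name__ == '__main__':
    knn = KNNClassifier(k_neighbors=3)
    knn.load_train_data(["1.0,2.0,0", "1.1,2.1,0", "5.0,5.0,1", "5.1,4.9,1"])
    print(knn.predict(["1.05,2.05,0", "5.05,4.95,1"]))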
def main():
    """Run the test"""
    args = read_args()
    compressor = importlib.import_module(args.compressor)
    print('Reading data')
    # Some of the datasets have row indexes in the first column
    indexed = args.indexed
    bits = args.bits

    if indexed:
        # Drop the index column
        data = np.genfromtxt(args.data, delimiter=' ', dtype=float)[:, 1:]
    else:
        data = np.genfromtxt(args.data, delimiter=' ', dtype=float)
    # Basic data about the dataset
    data_dimensionality = len(data[0])
    datasets_no_points = len(data)
    # Normalize the data
    data = normalize_data(data)
    print('Splitting data')
    training_data, test_data = split_dataset(data, args.samples)
    print('Creating model')

    model = train_sh(training_data, bits, args.data, '__test.log')
    print('Compressing dataset')
    _, compressed_data = compressor.compress(
        np.concatenate((test_data, training_data), axis=0),
        model,
        model.training_filename)
    compressed_training_data = compressed_data[args.samples:]
    compressed_test_data = compressed_data[:args.samples]
    # Get the bucket sizes
    bucket_sizes = check_bucket_sizes(compressed_data)
    # values of k used for measuring
    measured_ks = [10, 50, 100, 500]
    print('Computing recall and precision')
    recalls = [[[] for _ in range(0, bits+1)] for _ in measured_ks] # k*dist array of lists
    precisions = [[[] for _ in range(0, bits+1)] for _ in measured_ks] # k*dist array of lists
    num_returned = [[[] for _ in range(0, bits+1)] for _ in measured_ks]
    for idx, hashcode in enumerate(compressed_test_data):
        print("Computing {}".format(idx))
        hamming_dists = [
            sum([bin(xi^hashcode[i]).count('1') for i, xi in enumerate(x)])
            for x in compressed_training_data
        ]
        hamming_indices = np.argsort(hamming_dists)
        real_dists = [np.linalg.norm(x-test_data[idx]) for x in training_data]
        real_indices = np.argsort(real_dists)

        num_found = 0
        for distance in range(0, bits+1):
            while (num_found < len(hamming_indices)
                   and hamming_dists[hamming_indices[num_found]] <= distance):
                num_found += 1
            for k_idx, k in enumerate(measured_ks):
                recall, precision = compare(hamming_indices[:num_found], real_indices[:k])
                recalls[k_idx][distance].append(recall)
                precisions[k_idx][distance].append(precision)
                num_returned[k_idx][distance].append(num_found)
    print_results(
        measured_ks, recalls, precisions, 
        num_returned, data_dimensionality, 
        datasets_no_points, bucket_sizes)
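
# NOTE: `compare` is not shown above. A minimal sketch assuming it returns the recall and
# precision of the retrieved candidate set with respect to the true k nearest neighbours:
def compare(retrieved_indices, true_indices):
    retrieved = set(retrieved_indices)
    true_neighbors = set(true_indices)
    hits = len(retrieved & true_neighbors)
    recall = hits / len(true_neighbors) if true_neighbors else 0.0
    precision = hits / len(retrieved) if retrieved else 0.0
    return recall, precision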