Code example #1
def make_batch_assignation_evaluation(X, centroids):
    """
    Assign `size_batch` random samples of `X` to some of the centroids.
    All the samples are assigned at the same time using a single matrix multiplication.
    Time is recorded.

    `size_batch` is read from `paraman["--batch-assignation-time"]`.

    :param X: The input data from which to take the samples.
    :param centroids: The centroids to which to assign the samples (must have the same dimension as `X`).

    :return: None
    """
    size_batch = paraman["--batch-assignation-time"]
    if size_batch > X.shape[0]:
        logger.warning(
            "Batch size for batch assignation evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(size_batch, X.shape[0]))
        size_batch = X.shape[0]
        paraman["--batch-assignation-time"] = size_batch

    # precomputed_centroid_norms = get_squared_froebenius_norm(centroids)
    precomputed_centroid_norms = None
    indexes_batch = np.random.permutation(X.shape[0])[:size_batch]
    start_time = time.time()
    _ = get_distances(X[indexes_batch],
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
    stop_time = time.time()

    resprinter.add({
        "batch_assignation_mean_time": (stop_time - start_time) / size_batch,
    })
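For intuition, here is a minimal standalone sketch of the same idea, timing the assignment of a whole batch through one matrix product. It does not use the project's `get_distances`; the distance formula below (squared Euclidean distances expanded as norms plus a dot product) is an assumption about what that helper computes, and all names besides `X`, `centroids` and `size_batch` are illustrative.

import time
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 64))       # fake data set
centroids = rng.normal(size=(32, 64))   # centroids of the same dimension as X
size_batch = 1_000

batch = X[rng.permutation(X.shape[0])[:size_batch]]

start_time = time.time()
# squared Euclidean distances ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, for the whole batch at once
distances = (np.sum(batch ** 2, axis=1)[:, None]
             - 2 * batch @ centroids.T
             + np.sum(centroids ** 2, axis=1)[None, :])
assignments = np.argmin(distances, axis=1)
stop_time = time.time()

print("batch_assignation_mean_time:", (stop_time - start_time) / size_batch)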
Code example #2
File: utils.py Project: lucgiffon/qkmeans
def check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster, counts,
                            indicator_vector):
    """
    Check that each cluster contains at least one data point. If a cluster is empty, re-seed it with a
    random point taken from the most populated cluster.

    :param X_data: The (n, d) data matrix.
    :param X_centroids_hat: The (K, d) matrix of centroids (modified in place).
    :param K_nb_cluster: The number of clusters.
    :param counts: The number of points in each cluster (modified in place).
    :param indicator_vector: The cluster index assigned to each data point (modified in place).
    :return: None
    """
    for c in range(K_nb_cluster):

        cluster_data = X_data[indicator_vector == c]
        if len(cluster_data) == 0:
            biggest_cluster_index = np.argmax(counts)  # type: int
            biggest_cluster_data_indexes_bool = indicator_vector == biggest_cluster_index
            biggest_cluster_actual_data_indexes = np.where(
                biggest_cluster_data_indexes_bool)[0]

            random_index_in_biggest_cluster = np.random.choice(
                biggest_cluster_actual_data_indexes, size=1)[0]
            random_point_in_biggest_cluster = X_data[
                random_index_in_biggest_cluster]

            logger.warning(
                "cluster has lost data, add new cluster. cluster idx: {}".
                format(c))
            X_centroids_hat[c] = random_point_in_biggest_cluster.reshape(1, -1)
            counts[biggest_cluster_index] -= 1
            counts[c] = 1

            indicator_vector[random_index_in_biggest_cluster] = c
Code example #3
def make_assignation_evaluation(X, centroids):
    """
    Assign `nb_eval` random samples of `X` to the centroids, one sample at a time, and record the mean
    and standard deviation of the per-sample assignation time.

    `nb_eval` is read from `paraman["--assignation-time"]`.
    """
    nb_eval = paraman["--assignation-time"]
    if nb_eval > X.shape[0]:
        logger.warning(
            "Batch size for assignation evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(nb_eval, X.shape[0]))
        nb_eval = X.shape[0]
        paraman["--assignation-time"] = nb_eval

    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1),
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
Code example #4
def make_ami_evaluation(y_train, x_test, y_test, U_centroids,
                        indicator_vector_train):
    n_sample = paraman["--ami"]
    if n_sample > y_train.shape[0]:
        logger.warning(
            "Batch size for ami evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, y_train.shape[0]))
        n_sample = y_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(y_train.shape[0])[:n_sample]
    y_train = y_train[indexes_samples]
    indicator_vector_train = indicator_vector_train[indexes_samples]

    if isinstance(U_centroids, SparseFactors):
        U_centroids = U_centroids.compute_product()
    indicator_vector_test, _ = assign_points_to_clusters(x_test, U_centroids)

    train_ami = adjusted_mutual_info_score(y_train, indicator_vector_train)
    test_ami = adjusted_mutual_info_score(y_test, indicator_vector_test)

    ami_results = {
        "train_ami": train_ami,
        "test_ami": test_ami,
    }

    resprinter.add(ami_results)
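For reference, `adjusted_mutual_info_score` from scikit-learn compares two labelings while correcting for chance and ignoring label permutations, which is why it can be applied directly to cluster indices versus class labels as above. A tiny self-contained check (values follow from the metric's definition, not from the code above):

from sklearn.metrics import adjusted_mutual_info_score

# identical clusterings up to a permutation of the labels -> perfect score of 1.0
print(adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]))
# unrelated labelings score around 0 (the chance correction can make it slightly negative)
print(adjusted_mutual_info_score([0, 0, 1, 1], [0, 1, 0, 1]))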
Code example #5
File: utils.py Project: lucgiffon/qkmeans
def update_clusters_with_integrity_check(X_data, X_data_norms, X_centroids_hat,
                                         K_nb_cluster, counts,
                                         indicator_vector, distances,
                                         cluster_names, cluster_names_sorted):
    """
    Check whether any cluster has lost all of its points; if so, re-seed it with the point of the most
    populated cluster that is farthest from it. Non-empty clusters get their centroid updated to the
    mean of their points.

    All changes are made in place, except for `counts` and `cluster_names_sorted` which are returned.

    :param X_data: The (n, d) data matrix.
    :param X_data_norms: The precomputed squared norms of the data points.
    :param X_centroids_hat: The (K, d) matrix of centroids (modified in place).
    :param K_nb_cluster: The number of clusters.
    :param counts: The number of points in each cluster.
    :param indicator_vector: The cluster index assigned to each data point (modified in place).
    :param distances: The (n, K) matrix of distances between data points and centroids (modified in place).
    :param cluster_names: The names of the clusters.
    :param cluster_names_sorted: The cluster names in sorted order.
    :return: The updated `counts` and `cluster_names_sorted`.
    """

    for c in range(K_nb_cluster):
        biggest_cluster_index = np.argmax(counts)  # type: int
        biggest_cluster = cluster_names[biggest_cluster_index]
        biggest_cluster_data_indexes = indicator_vector == biggest_cluster
        index_of_farthest_point_in_biggest_cluster = np.argmax(
            distances[:, c][biggest_cluster_data_indexes])
        farthest_point_in_biggest_cluster = X_data[
            biggest_cluster_data_indexes][
                index_of_farthest_point_in_biggest_cluster]
        absolute_index_of_farthest_point_in_biggest_cluster = np.where(
            biggest_cluster_data_indexes
        )[0][index_of_farthest_point_in_biggest_cluster]

        cluster_data = X_data[indicator_vector == c]
        if len(cluster_data) == 0:
            logger.warning(
                "cluster has lost data, add new cluster. cluster idx: {}".
                format(c))
            X_centroids_hat[c] = farthest_point_in_biggest_cluster.reshape(
                1, -1)
            counts = list(counts)
            counts[biggest_cluster_index] -= 1
            counts.append(1)
            counts = np.array(counts)
            cluster_names_sorted = list(cluster_names_sorted)
            cluster_names_sorted.append(c)
            cluster_names_sorted = np.array(cluster_names_sorted)

            indicator_vector[
                absolute_index_of_farthest_point_in_biggest_cluster] = c
            distances_to_new_cluster = get_distances(
                X_data,
                X_centroids_hat[c].reshape(1, -1),
                precomputed_data_points_norm=X_data_norms)
            distances[:, c] = distances_to_new_cluster.flatten()
        else:
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c, :], 0)

    return counts, cluster_names_sorted
Code example #6
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    # verify sample size for evaluation
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None

    # Make nystrom approximation
    # nys_obj = Nystroem(gamma=gamma, n_components=landmarks.shape[0])
    # nys_obj.fit(landmarks)
    # nystrom_embedding = nys_obj.transform(sample)
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric, landmarks_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Create real kernel matrix
    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # evaluation reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm

    # start svm + nystrom classification
    if x_test is not None:
        logger.info("Start classification")

        x_train_nystrom_embedding = nystrom_transformation(x_train, landmarks, metric, landmarks_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, landmarks, metric, landmarks_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

    else:
        accuracy_nystrom_svm = None

    return reconstruction_error_nystrom, accuracy_nystrom_svm
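For clarity, the reconstruction error computed above is the relative Frobenius error between the Nyström approximation and the exact RBF kernel matrix on the evaluated sample (notation mine, not from the code):

$$\mathrm{err} = \frac{\lVert \tilde K - K \rVert_F}{\lVert K \rVert_F}, \qquad \tilde K = \Phi\,\Phi^\top,$$

where $\Phi$ is the Nyström embedding of the sample and $K$ the exact RBF kernel matrix on that same sample.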
Code example #7
def build_df(path_results_dir, dct_output_files_by_root, col_to_delete=()):
    lst_df_results = []
    for root_name, dct_results in dct_output_files_by_root.items():
        try:
            result_file = path_results_dir / dct_results["results"]
            df_expe = pd.read_csv(result_file)
            df_expe["oar_id"] = root_name
            lst_df_results.append(df_expe)
        except KeyError:
            logger.warning(
                "No 'results' entry for root name {}".format(root_name))

    df_results = pd.concat(lst_df_results)

    for c in col_to_delete:
        df_results = df_results.drop([c], axis=1)
    return df_results
Code example #8
File: kernel.py Project: lucgiffon/qkmeans
def prepare_nystrom(landmarks, landmarks_norm, gamma):
    """
    Return the K^{-1/2} matrix of Nyström: the metric used for the transformation.

    It uses the rbf kernel.

    :param landmarks: The matrix of landmark points
    :param landmarks_norm: The column vector of squared norms of the landmark points (or None)
    :param gamma: The gamma value to use in the rbf kernel.
    :return:
    """
    landmarks_norm_T = landmarks_norm.T if hasattr(landmarks_norm, "T") else None
    basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma, landmarks_norm, landmarks_norm_T)
    U, S, V = scipy.linalg.svd(basis_kernel_W)
    Sprim = np.maximum(S, 1e-12)
    if (Sprim != S).any():
        logger.warning("One value of S in singular decomposition of W was lower than 1e-12")
    S = Sprim

    normalization_ = np.dot(U / np.sqrt(S), V)

    return normalization_
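A short note on what `prepare_nystrom` computes (the notation below is mine, not from the code). With $W = k(L, L)$ the kernel matrix of the landmark points and $W = U\,\mathrm{diag}(s)\,V$ its SVD, the returned matrix is $U\,\mathrm{diag}(1/\sqrt{s_i})\,V \approx W^{-1/2}$, since $W$ is symmetric positive semi-definite. The Nyström embedding is then $\phi(x) = k(x, L)\,W^{-1/2}$, so that

$$\phi(X)\,\phi(X)^\top = K(X, L)\,W^{-1}\,K(L, X),$$

which is the usual Nyström approximation of the full kernel matrix $K(X, X)$. Clipping the singular values at $10^{-12}$ avoids dividing by a numerically zero singular value when $W$ is close to singular.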
Code example #9
"""
from copy import deepcopy

import numpy as np
from numpy.linalg import norm
from numpy.linalg import multi_dot
import matplotlib.pyplot as plt
from qkmeans.palm.utils import compute_objective_function

from qkmeans.utils import get_side_prod, logger

# TODO avoid conversions between dense ndarray and sparse matrices
# TODO init palm with SparseFactors

logger.warning(
    "The module {} shouldn't be used because it hasn't been maintained in a long time"
    .format(__file__))


def palm4msa(arr_X_target: np.ndarray,
             lst_S_init: list,
             nb_factors: int,
             lst_projection_functions: list,
             f_lambda_init: float,
             nb_iter: int,
             update_right_to_left=True,
             graphical_display=False):
    """
    `lst_S_init` contains the factors in decreasing index order (i.e. the order along which they are
    multiplied in the product), for example: S5 S4 S3 S2 S1
Code example #10
import matplotlib.pyplot as plt

import logging
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)

import copy

import numpy as np
from qkmeans.core.utils import compute_objective, assign_points_to_clusters, get_squared_froebenius_norm_line_wise
from qkmeans.utils import logger, DataGenerator
from sklearn import datasets

logger.warning(
    "Module {} hasn't been tested and shouldn't be used.  It is work in progress"
    .format(__file__))


def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization,
                     batch_size):
    """

    :param X_data: The data matrix of n examples of dimension d, with shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :return:
    """
Code example #11
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size `n_sample` of the input data set, where
    `n_sample` is read from `paraman["--nystrom"]`.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.

    :return: None
    """

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################

    # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0])
    # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    # sklearn_transfo = sklearn_nystrom.transform(sample)
    # kernel_sklearn_nys = sklearn_transfo  @ sklearn_transfo.T

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)
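A side note on the kddcup04 branch above: assuming binary 0/1 labels (and at least one predicted and one actual positive), the hand-rolled recall/precision/F1 matches scikit-learn's `f1_score`. A small sanity check, independent of the code above:

import numpy as np
from sklearn.metrics import f1_score

y_test = np.array([1, 0, 1, 1, 0, 0])
predictions = np.array([1, 1, 0, 1, 0, 0])

recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
f1_manual = 2 * precision * recall / (precision + recall)

print(f1_manual, f1_score(y_test, predictions))  # both print 0.666...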
Code example #12
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    def scikit_evaluation(str_type):
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():

        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        start_inference_time = time.time()
        distances = get_distances(x_test, U_centroids)
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1,
                                                                        -1))[0]

        stop_inference_time = time.time()
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()
    if paraman["kmeans"]:
        lst_knn_types = ["brute", "ball_tree", "kd_tree"]
        for knn_type in lst_knn_types:
            signal.signal(signal.SIGALRM, timeout_signal_handler)
            signal.alarm(int(kmean_tree_time * 10))
            try:
                logger.info(
                    "1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning(
                    "Timeout during execution of 1-nn with {} version: {}".
                    format(knn_type, te))
            signal.alarm(0)
Code example #13
from collections import OrderedDict
from pprint import pformat

import numpy as np
from numpy.linalg import multi_dot
from qkmeans.palm.palm import hierarchical_palm4msa, palm4msa
from qkmeans.core.kmeans import kmeans
from qkmeans.core.utils import build_constraint_set_smart, compute_objective, get_distances, get_squared_froebenius_norm_line_wise
from qkmeans.utils import visual_evaluation_palm4msa
from sklearn import datasets
import matplotlib.pyplot as plt

from qkmeans.utils import logger

logger.warning(
    "The module {} hasn't been maintained in a long time and shouldn't be used anymore."
    .format(__file__))


def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples of dimension d, with shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
Code example #14
                        for idx_bar, xcoor in enumerate(x_indices + bar_width *
                                                        idx_sparsy_val):
                            try:
                                nb_param = df_sparsy_val[
                                    df_sparsy_val["--nb-cluster"] ==
                                    nb_cluster_values[idx_bar]][
                                        "nb_param_centroids"].mean()
                                ax.text(xcoor,
                                        mean_task_values[idx_bar] +
                                        std_task_values[idx_bar],
                                        '{}'.format(int(round(nb_param))),
                                        horizontalalignment='center',
                                        verticalalignment='bottom',
                                        rotation='vertical')
                            except Exception:
                                logger.warning("nb param empty")
                                continue

                    except Exception as e:
                        if "empty dataframe" in str(e):
                            logger.warning("{} for sparsy val {}".format(
                                str(e), sparsy_val))
                        else:
                            raise e

                    try:
                        # kmeans palm
                        ##############
                        df_sparsy_val_kmeans_palm = df_hierarchical_kmeans_palm[
                            df_hierarchical_kmeans_palm["--sparsity-factor"] ==
                            sparsy_val]
Code example #15
def get_dct_result_files_by_root(src_results_dir,
                                 old_filename_objective=False,
                                 tpl_results=("centroids", "results",
                                              "objective")):
    """
    From a directory containing the results of OAR jobs, build the dictionary of result files for each
    experiment. The files are:

    * OAR.`jobid`.stderr
    * OAR.`jobid`.stdout

    * `idexpe`_objective_`nameobjective`.csv contains the objective function values;
    * `idexpe`_results.csv contains the parameters of the experiments and the various metric measures;
    * `idexpe`_centroids.npy contains the numpy array of the centroids that were computed.

    where:

     * `jobid` corresponds to OAR's own job identifier;
     * `nameobjective` corresponds to the name of the objective function being printed;
     * `idexpe` corresponds to the name prefix shared by the output files of one experiment.

    The returned dictionary gives:

    {
        "OAR.`jobid`": {
            "centroids": "`idexpe`_centroids.npy",
            "results": "`idexpe`_results.csv",
            "objective": "`idexpe`_objective_`objective_name`.csv"
        }
    }

    :param src_results_dir: Path to the directory containing the result files.
    :return: The dictionary of result file names keyed by the stem of each stdout file.
    """
    files = src_results_dir.glob('**/*')
    files = [x for x in files if x.is_file()]
    lst_str_filenames = [file.name for file in files]

    dct_output_files_by_root = {}
    count_complete = 0
    count_has_printed_results = 0
    count_total = 0

    for pth_file in files:
        if pth_file.suffix != '.stdout' and pth_file.suffix != '.out':
            continue
        # if "_results.csv" not in pth_file.name:
        #     continue

        count_total += 1

        with open(pth_file, 'r') as stdoutfile:
            lines = stdoutfile.readlines()
            for i_line, lin in enumerate(lines):
                if lin[:2] == "--":
                    break
            else:
                logger.warning("file {} didn't contain anything".format(
                    pth_file.name))
                dct_output_files_by_root[pth_file.stem] = {}
                continue
            count_has_printed_results += 1

            data = "".join(lines[i_line:i_line + 2])

        io_data = StringIO(data)
        df = pd.read_csv(io_data)

        try:
            root_name = df["--output-file_resprinter"][0].split("_")[0]
        except KeyError:
            logger.warning("no key for resprinter in {}".format(pth_file.name))
            continue  # without the resprinter output file, the experiment's root name cannot be recovered

        dct_files = {}
        complete = True

        if old_filename_objective:
            used_output_file_end_re = output_file_end_re_old
        else:
            used_output_file_end_re = output_file_end_re

        for type_file, root_re in used_output_file_end_re.items():
            if type_file not in tpl_results:
                continue
            forged_re_compiled = re.compile(r"{}".format(root_name) + root_re)
            try:
                dct_files[type_file] = list(
                    filter(forged_re_compiled.match, lst_str_filenames))[0]
            except IndexError:
                logger.warning("{} not found for root name {}".format(
                    type_file, root_name))
                complete = False

        if complete:
            count_complete += 1

        dct_output_files_by_root[pth_file.stem] = dct_files

    return dct_output_files_by_root
Code example #16
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    """
    Do the 1-nearest neighbor classification using `x_train`, `y_train` as support and `x_test`, `y_test` as
    evaluation set.

    The scikit-learn classifiers (brute, kd_tree and ball_tree) are called only when it is the kmeans
    version of the program that is run (for simplicity: so that they are not run many times).

    Time is recorded.
    Classification accuracy is recorded.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object
    :param indicator_vector: The indicator vector for this matrix of centroids and this train data.

    :return:
    """
    def scikit_evaluation(str_type):
        """
        Run the scikit-learn version of nearest neighbor classification (used for comparison).

        :param str_type:
        :return:
        """
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)
        log_memory_usage(
            "Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation"
        )

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in scikit_evaluation of make_1nn_evaluation"
        )

        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():
        """
        Run the K-means partitioning version of nearest neighbor classification.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]
        log_memory_usage(
            "Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation"
        )
        # precomputed_centroid_norms = get_squared_froebenius_norm(landmarks)
        precomputed_centroid_norms = None
        start_inference_time = time.time()
        distances = get_distances(
            x_test,
            U_centroids,
            precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.time()
        get_distance_time = stop_get_distances_time - start_inference_time
        log_memory_usage(
            "Memory after distances computation with clusters in kmean_tree_evaluation of make_1nn_evaluation"
        )
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1,
                                                                        -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation"
        )
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()
    #
    if paraman["kmeans"]:
        lst_knn_types = ["brute", "ball_tree", "kd_tree"]
        for knn_type in lst_knn_types:
            # the classification must not take more than 10 times the time taken by the k-means 1-nn
            # classification, otherwise it is interrupted.
            signal.signal(signal.SIGALRM, timeout_signal_handler)
            signal.alarm(int(kmean_tree_time * 10))  # start alarm
            try:
                logger.info(
                    "1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning(
                    "Timeout during execution of 1-nn with {} version: {}".
                    format(knn_type, te))
            signal.alarm(0)  # stop alarm for next evaluation
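The timeout mechanism above relies on the standard `signal` module: `signal.alarm(n)` asks the OS to deliver SIGALRM after n seconds, and `timeout_signal_handler` presumably raises `TimeoutError` from inside the handler so that the slow evaluation is aborted. A minimal self-contained sketch of the same pattern (handler name and durations are illustrative; SIGALRM is only available on Unix):

import signal
import time

def timeout_signal_handler(signum, frame):
    raise TimeoutError("evaluation took too long")

signal.signal(signal.SIGALRM, timeout_signal_handler)
signal.alarm(2)          # deliver SIGALRM (hence raise TimeoutError) after 2 seconds
try:
    time.sleep(10)       # stands for the slow 1-nn evaluation
except TimeoutError as te:
    print("timed out:", te)
finally:
    signal.alarm(0)      # always cancel any pending alarm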
Code example #17
def make_nystrom_evaluation(x_train, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size `n_sample` of the input data set, where
    `n_sample` is read from `paraman["--nystrom"]`.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.

    :return: None
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    # centroids_norm = get_squared_froebenius_norm(landmarks)
    centroids_norm = None

    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.time()
    basis_kernel_W = special_rbf_kernel(U_centroids, U_centroids, gamma,
                                        centroids_norm, centroids_norm)
    log_memory_usage(
        "Memory after K_11 computation in make_nystrom_evaluation")
    U, S, V = np.linalg.svd(basis_kernel_W)
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    S = np.maximum(S, 1e-12)
    normalization_ = np.dot(U / np.sqrt(S), V)
    nystrom_build_stop_time = time.time()
    # STOP

    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    # samples_norm = np.linalg.norm(sample, axis=1) ** 2
    samples_norm = None

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.time()
    nystrom_embedding = special_rbf_kernel(U_centroids, sample, gamma,
                                           centroids_norm,
                                           samples_norm).T @ normalization_
    log_memory_usage(
        "Memory after embedding computation in make_nystrom_evaluation")
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    nystrom_inference_time_stop = time.time()
    ## STOP

    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }

    resprinter.add(nystrom_results)
Code example #18
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size `n_sample` of the input data set, where
    `n_sample` is read from `paraman["--nystrom"]`.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.

    :return: None
    """
    def prepare_nystrom(landmarks, landmarks_norm):
        basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma,
                                            landmarks_norm, landmarks_norm)
        U, S, V = np.linalg.svd(basis_kernel_W)
        S = np.maximum(S, 1e-12)
        normalization_ = np.dot(U / np.sqrt(S), V)

        return normalization_

    def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm,
                               x_input_norm):
        nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma,
                                               landmarks_norm,
                                               x_input_norm).T @ p_metric
        return nystrom_embedding

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric,
                                               centroids_norm, samples_norm)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(
        x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage(
        "Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm)
    log_memory_usage(
        "Memory after SVD computation in uniform part of make_nystrom_evaluation"
    )

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(
            x_train, U_centroids, metric, centroids_norm, None)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, U_centroids, metric, centroids_norm, None)

        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding,
                                                    y_test)
        time_classification_stop = time.process_time()

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform":
        sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)