Example #1
def get_score(data, labels, fold_pairs, name, model, param):
    """
    Function to get score for a classifier.

    Parameters
    ----------
    data: array_like
        Data from which to derive score.
    labels: array_like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array_like
        A list of train/test indices for each fold.
        dhjelm: (Why can't we just use the KFold object?)
    name: str
        Name of classifier.
    model: WRITEME
    param: WRITEME
        Parameters for the classifier.

    Returns
    -------
    classifier: WRITEME
    fScore: WRITEME
    """
    assert isinstance(name, str)
    logger.info("Classifying %s" % name)

    ksplit = len(fold_pairs)
    if name not in NAMES:
        raise ValueError("Classifier %s not supported. "
                         "Did you enter it properly?" % name)

    # Redefine the parameters to be used for RBF SVM (dependent on
    # training data)

    if True:  # TODO: replace with a proper flag for serial vs. multiprocess execution
        logger.info("Attempting to use grid search...")
        fScore = []
        for i, fold_pair in enumerate(fold_pairs):
            print("Classifying a %s the %d-th out of %d folds..." %
                  (name, i + 1, len(fold_pairs)))
            classifier = get_classifier(name, model, param,
                                        data[fold_pair[0], :])
            area = classify(data, labels, fold_pair, classifier)
            fScore.append(area)
    else:
        logger.warning("Multiprocessing splits not tested yet.")
        pool = Pool(processes=min(ksplit, PROCESSORS))
        # NOTE: pool.map expects a callable plus an iterable of arguments;
        # the original functools.partial call passed it only one argument.
        # A standard multiprocessing.Pool also cannot pickle a lambda, which
        # is likely why this branch is flagged as untested.
        classify_func = lambda f: classify(
            data,
            labels,
            fold_pairs[f],
            classifier=get_classifier(
                name, model, param, data=data[fold_pairs[f][0], :]))
        fScore = pool.map(classify_func, range(ksplit))
        pool.close()
        pool.join()

    return classifier, fScore
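
A minimal usage sketch for the function above (an illustration, not part of the original source): it assumes the module-level helpers NAMES, PROCESSORS, get_classifier and classify are defined elsewhere in the same module, that "RBF SVM" is one of the supported NAMES, and it leaves model and param as placeholders because the docstring marks them WRITEME.

import numpy as np
from sklearn.model_selection import StratifiedKFold

data = np.random.rand(100, 20)           # 100 samples, 20 features
labels = np.random.randint(0, 2, 100)    # binary labels

# Build (train, test) index pairs in the format get_score expects.
skf = StratifiedKFold(n_splits=5)
fold_pairs = [(train, test) for train, test in skf.split(data, labels)]

# model and param are hypothetical placeholders here.
classifier, fScore = get_score(data, labels, fold_pairs,
                               name="RBF SVM", model=None, param={})
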
Example #2
def get_score(data, labels, fold_pairs,
              name, model, param):
    """
    Function to get score for a classifier.

    Parameters
    ----------
    data: array-like
        Data from which to derive score.
    labels: array-like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array-like
        A list of train/test indices for each fold.
        (Why can't we just use the KFold object?)
    name: string
        Name of classifier.
    model: WRITEME
    param: WRITEME
        Parameters for the classifier.
    """
    assert isinstance(name, str)
    logger.info("Classifying %s" % name)

    ksplit = len(fold_pairs)
    if name not in NAMES:
        raise ValueError("Classifier %s not supported. "
                         "Did you enter it properly?" % name)

    # Redefine the parameters to be used for RBF SVM (dependent on
    # training data)

    if True:  # TODO: replace with a proper flag for serial vs. multiprocess execution
        logger.info("Attempting to use grid search...")
        fScore = []
        for i, fold_pair in enumerate(fold_pairs):
            print ("Classifying a %s the %d-th out of %d folds..."
                   % (name, i+1, len(fold_pairs)))
            classifier = get_classifier(name, model, param, data[fold_pair[0], :])
            area = classify(data, labels, fold_pair, classifier)
            fScore.append(area)
    else:
        warnings.warn("Multiprocessing splits not tested yet.")
        pool = Pool(processes=min(ksplit, PROCESSORS))
        # NOTE: pool.map expects a callable plus an iterable of arguments;
        # the original functools.partial call passed it only one argument.
        classify_func = lambda f: classify(
            data,
            labels,
            fold_pairs[f],
            classifier=get_classifier(
                name,
                model,
                param,
                data=data[fold_pairs[f][0], :]))
        fScore = pool.map(classify_func, range(ksplit))
        pool.close()
        pool.join()

    return classifier, fScore
Example #3
def get_score(data,
              labels,
              fold_pairs,
              name,
              model,
              param,
              numTopVars,
              rank_per_fold=None,
              parallel=True,
              rand_iter=-1):
    """
    Function to get score for a classifier.

    Parameters
    ----------
    data: array_like
        Data from which to derive score.
    labels: array_like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array_like
        A list of train/test indices for each fold.
        dhjelm: (Why can't we just use the KFold object?)
    name: str
        Name of classifier.
    model: WRITEME
    param: WRITEME
        Parameters for the classifier.
    parallel: bool
        Whether to run folds in parallel. Default: True

    Returns
    -------
    classifier: WRITEME
    allConfMats: list of lists
        Confusion matrices for the best-performing parameter set, for every
        fold and every variable-set size ([numFolds, numVarSets]).
    allTotalErrs: list of lists
        Total classification error for every fold and variable-set size.
    allFittedClassifiers: list of lists
        The fitted classifier for every fold and variable-set size.
    """
    assert isinstance(name, str)
    logging.info("Classifying %s" % name)
    ksplit = len(fold_pairs)
    #    if name not in NAMES:
    #        raise ValueError("Classifier %s not supported. "
    #                         "Did you enter it properly?" % name)

    # Redefine the parameters to be used for RBF SVM (dependent on
    # training data)
    if "SGD" in name:
        param["n_iter"] = [25]  # [np.ceil(10**3 / len(fold_pairs[0][0]))]
    classifier = get_classifier(name, model, param, rand_iter=rand_iter)

    if name == "RBF SVM":  #This doesn't use labels, but looks as ALL data
        logging.info("RBF SVM requires some preprocessing."
                     "This may take a while")
        is_data_computed_gamma = True
        if not is_data_computed_gamma:
            # The data-derived gamma choices (else branch below) seemed too
            # low, making the SVM very slow, so fixed values taken from
            # `param` are used instead.
            print(param)
            gamma = np.array(param['gamma'])
            print('gamma', gamma)
        else:
            # Euclidean distances between samples. Switched from
            # StandardScaler to RobustScaler because the original call
            # (which also passed the fitted scaler rather than transformed
            # data) raised an error:
            # dist = pdist(StandardScaler().fit(data), "euclidean").ravel()
            dist = pdist(RobustScaler().fit_transform(data),
                         "euclidean").ravel()
            print('dist', dist)
            # Estimates for sigma (10th, 50th and 90th percentiles).
            sigest = np.asarray(np.percentile(dist, [10, 50, 90]))
            print('sigest', sigest)
            # Estimates for gamma (= 1/(2*sigma^2)).
            gamma = 1. / (2 * sigest**2)
            print('gamma', gamma)
        # Set the SVM parameter grid with these values. The grid was
        # previously wrapped in a list, which caused a bug; it is now a
        # plain dict:
        # param = [{"kernel": ["rbf"],
        #           "gamma": gamma.tolist(),
        #           "C": np.logspace(-2,2,5).tolist()}]
        param = {
            "kernel": ["rbf"],
            "gamma": gamma.tolist(),
            "C": np.logspace(-2, 2, 5).tolist()
        }
    # if name not in ["Decision Tree", "Naive Bayes"]:
    if param:
        if hasattr(classifier, 'param_grid'):
            # isinstance(classifier, GridSearchCV):
            print('param', param)
            N_p = np.prod([len(l) for l in param.values()])
        elif isinstance(classifier, RandomizedSearchCV):
            N_p = classifier.n_iter
    else:
        N_p = 1


#    is_cv = isinstance(classifier, GridSearchCV) or \
#            isinstance(classifier, RandomizedSearchCV)
#    print('Name: {}, ksplit: {}, N_p: {}'.format(name, ksplit, N_p))
    if (not parallel) or ksplit <= N_p or \
            (name == "Random Forest") or ("SGD" in name):
        logging.info("Attempting to use grid search...")
        classifier.n_jobs = PROCESSORS
        classifier.pre_dispatch = 1  # np.floor(PROCESSORS/24)
        allConfMats = []
        allTotalErrs = []
        allFittedClassifiers = []
        for i, fold_pair in enumerate(fold_pairs):
            confMats = []
            totalErrs = []
            fitted_classifiers = []
            logging.info("Classifying a %s the %d-th out of %d folds..." %
                         (name, i + 1, len(fold_pairs)))
            if rank_per_fold is not None:
                rankedVars = rank_per_fold[i]
            else:
                rankedVars = np.arange(data.shape[1])
            #
            for numVars in numTopVars:
                logging.info('Classifying for top %i variables' % numVars)
                #
                # print 'rankedVars', rankedVars
                #
                confMat, totalErr, fitted_classifier = classify(
                    data[:, rankedVars[:numVars]], labels, fold_pair,
                    classifier)
                confMats.append(confMat)
                totalErrs.append(totalErr)
                fitted_classifiers.append(fitted_classifier)
            # recheck the structure of area and fScore variables
            allConfMats.append(confMats)
            allTotalErrs.append(totalErrs)
            allFittedClassifiers.append(fitted_classifiers)
    else:
        print('Running folds in parallel...')
        classifier.n_jobs = PROCESSORS
        logging.info("Multiprocessing folds for classifier {}.".format(name))
        pool = Pool(processes=min(ksplit, PROCESSORS))
        out_list = pool.map(
            per_split_classifier(data, labels, classifier, numTopVars),
            zip(rank_per_fold, fold_pairs))
        pool.close()
        pool.join()
        #allConfMats = [el[0] for el in out_list]
        #allTotalErrs = [el[1] for el in out_list]
        #allFittedClassifiers = [el[2] for el in out_list]
        allConfMats, allTotalErrs, allFittedClassifiers = tuple(zip(*out_list))
    return classifier, allConfMats, allTotalErrs, allFittedClassifiers
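
A standalone sketch of the gamma heuristic used in the "RBF SVM" branch above (an illustration with made-up data, not part of the original module): sigma candidates are the 10th, 50th and 90th percentiles of pairwise Euclidean distances on robustly scaled data, and gamma = 1 / (2 * sigma^2).

import numpy as np
from scipy.spatial.distance import pdist
from sklearn.preprocessing import RobustScaler

X = np.random.rand(50, 10)                          # toy data
dist = pdist(RobustScaler().fit_transform(X), "euclidean")
sigest = np.asarray(np.percentile(dist, [10, 50, 90]))
gamma = 1. / (2 * sigest ** 2)                      # three candidate gammas
param = {"kernel": ["rbf"],
         "gamma": gamma.tolist(),
         "C": np.logspace(-2, 2, 5).tolist()}       # grid as built above
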
Example #4
def get_rank_per_fold(data,
                      labels,
                      fold_pairs,
                      ranking_function=ttest_ind,
                      save_path=None,
                      load_file=True,
                      parallel=True):
    '''
    Applies rank_vars to the training set of each fold pair.
    Inputs:
        data: array
            features for all samples
        labels: array
            label vector of each sample
        fold_pairs: list
            list of pairs of index arrays containing train and test sets
        ranking_function: function object, default: ttest_ind
            function to apply for ranking features
        save_path: str or None
            directory in which to load and save ranking files
        load_file: bool
            Whether to try to load an existing file, default: True
        parallel: bool
            True if multicore processing is desired, default: True
    Outputs:
        rank_per_fold: list
            List of ranked feature indexes for each fold pair
    '''
    file_loaded = False
    if load_file:
        if isinstance(save_path, str):
            fname = path.join(
                save_path, "{}_{}_folds.mat".format(ranking_function.__name__,
                                                    len(fold_pairs)))
            try:
                rd = scipy.io.loadmat(fname, mat_dtype=True)
                rank_per_fold = rd['rank_per_fold']
                file_loaded = True
            except:
                # Fall back to recomputing the ranking if the file is
                # missing or cannot be read.
                pass
        else:
            print('No rank file path: Computing from scratch without saving')
    if not file_loaded:
        if not parallel:
            rank_per_fold = []
            for fold_pair in fold_pairs:
                rankedVars = rank_vars(data[fold_pair[0], :],
                                       labels[fold_pair[0]], ranking_function)
                rank_per_fold.append(rankedVars)
        else:
            pool = Pool(processes=min(len(fold_pairs), PROCESSORS))
            rank_per_fold = pool.map(
                Ranker(data, labels, ranking_function, rank_vars), fold_pairs)
            pool.close()
            pool.join()
        if isinstance(save_path, str):
            fname = path.join(
                save_path, "{}_{}_folds.mat".format(ranking_function.__name__,
                                                    len(fold_pairs)))
            with open(fname, 'wb') as f:
                scipy.io.savemat(f, {'rank_per_fold': rank_per_fold})
    return rank_per_fold
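
A minimal usage sketch for get_rank_per_fold (an illustration, not from the original source; rank_vars, Ranker and PROCESSORS are assumed to come from the same module, with ttest_ind from scipy.stats as in the default argument):

import numpy as np
from sklearn.model_selection import StratifiedKFold

data = np.random.rand(80, 200)
labels = np.random.randint(0, 2, 80)
fold_pairs = list(StratifiedKFold(n_splits=4).split(data, labels))

# Serial ranking with no caching to disk; set parallel=True to use the
# module's Pool-based path instead.
rank_per_fold = get_rank_per_fold(data, labels, fold_pairs,
                                  save_path=None, load_file=False,
                                  parallel=False)
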