def get_score(data, labels, fold_pairs, name, model, param):
    """
    Function to get score for a classifier.

    Parameters
    ----------
    data: array_like
        Data from which to derive score.
    labels: array_like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array_like
        A list of train/test indices for each fold
        dhjelm(Why can't we just use the KFold object?)
    name: str
        Name of classifier.
    model: WRITEME
    param: WRITEME
        Parameters for the classifier.

    Returns
    -------
    classifier: WRITEME
    fScore: WRITEME
    """
    assert isinstance(name, str)
    logger.info("Classifying %s" % name)
    ksplit = len(fold_pairs)
    if name not in NAMES:
        raise ValueError("Classifier %s not supported. "
                         "Did you enter it properly?" % name)

    # Redefine the parameters to be used for RBF SVM (dependent on
    # training data)
    if True:  # TODO: replace with a real serial-vs-parallel condition
        logger.info("Attempting to use grid search...")
        fScore = []
        for i, fold_pair in enumerate(fold_pairs):
            print("Classifying a %s the %d-th out of %d folds..."
                  % (name, i + 1, len(fold_pairs)))
            classifier = get_classifier(name, model, param,
                                        data[fold_pair[0], :])
            area = classify(data, labels, fold_pair, classifier)
            fScore.append(area)
    else:
        # Dead code until the condition above is made real. Note that
        # Pool.map needs a picklable, module-level callable, so the nested
        # function below would have to move to module scope (cf. Ranker).
        logger.warning("Multiprocessing splits not tested yet.")
        pool = Pool(processes=min(ksplit, PROCESSORS))

        def classify_func(f):
            return classify(data, labels, fold_pairs[f],
                            classifier=get_classifier(
                                name, model, param,
                                data=data[fold_pairs[f][0], :]))
        # pool.map takes the function plus an iterable of fold indices;
        # the original functools.partial call passed no iterable to map.
        fScore = pool.map(classify_func, range(ksplit))
        pool.close()
        pool.join()

    return classifier, fScore
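# Usage sketch (an illustration, not part of the original module): a
# fold_pairs list is just (train_index, test_index) pairs, so it can be
# built with sklearn's StratifiedKFold. The name "Linear SVM" and the
# param grid below are hypothetical examples; NAMES, get_classifier and
# classify are assumed to be defined elsewhere in this project.
#
#     from sklearn.model_selection import StratifiedKFold
#
#     skf = StratifiedKFold(n_splits=10)
#     fold_pairs = list(skf.split(data, labels))
#     clf, fScore = get_score(data, labels, fold_pairs,
#                             name="Linear SVM", model=None,
#                             param={"C": [0.1, 1, 10]})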
def get_score(data, labels, fold_pairs, name, model, param, numTopVars,
              rank_per_fold=None, parallel=True, rand_iter=-1):
    """
    Function to get score for a classifier.

    Parameters
    ----------
    data: array_like
        Data from which to derive score.
    labels: array_like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array_like
        A list of train/test indices for each fold
        dhjelm(Why can't we just use the KFold object?)
    name: str
        Name of classifier.
    model: WRITEME
    param: WRITEME
        Parameters for the classifier.
    numTopVars: list of int
        Numbers of top-ranked variables at which to evaluate the classifier.
    rank_per_fold: list of array_like, optional
        Ranked variable indices for each fold, e.g. from get_rank_per_fold.
        Default: None (use all variables, unranked).
    parallel: bool
        Whether to run folds in parallel. Default: True
    rand_iter: int
        Number of settings sampled for randomized parameter search;
        -1 disables it. Default: -1

    Returns
    -------
    classifier: WRITEME
    allConfMats: list
        Confusion matrices for all folds and all variable sets, for the
        best-performing parameter set ([numFolds, numVarSets]).
    allTotalErrs: list
        Total classification errors, same layout as allConfMats.
    allFittedClassifiers: list
        Fitted classifier for each fold and variable set.
    """
    assert isinstance(name, str)
    logging.info("Classifying %s" % name)
    ksplit = len(fold_pairs)
    # if name not in NAMES:
    #     raise ValueError("Classifier %s not supported. "
    #                      "Did you enter it properly?" % name)

    # Redefine the parameters to be used for RBF SVM (dependent on
    # training data)
    if "SGD" in name:
        param["n_iter"] = [25]  # [np.ceil(10**3 / len(fold_pairs[0][0]))]
    classifier = get_classifier(name, model, param, rand_iter=rand_iter)

    if name == "RBF SVM":  # This doesn't use labels, but looks at ALL data.
        logging.info("RBF SVM requires some preprocessing. "
                     "This may take a while.")
        # Sahil disabled the data-driven gamma computation below: the
        # computed gamma choices seemed too low, making the SVM very slow.
        # Fixed values from param are used instead; flip the flag to
        # restore the data-driven estimates.
        is_data_computed_gamma = False
        if not is_data_computed_gamma:
            print(param)
            gamma = np.array(param['gamma'])
            print('gamma', gamma)
        else:
            # Euclidean distances between samples.
            # Sahil switched from pdist(StandardScaler().fit(data), ...)
            # to fit_transform, as the former raised an error.
            dist = pdist(RobustScaler().fit_transform(data),
                         "euclidean").ravel()
            print('dist', dist)
            # Estimates for sigma (10th, 50th and 90th percentile).
            sigest = np.asarray(np.percentile(dist, [10, 50, 90]))
            print('sigest', sigest)
            # Estimates for gamma (= -1/(2*sigma^2)).
            gamma = 1. / (2 * sigest ** 2)
            print('gamma', gamma)
        # Set SVM parameters with these values. (Sahil changed this from a
        # one-element list of dicts to a plain dict to remove a bug.)
        param = {
            "kernel": ["rbf"],
            "gamma": gamma.tolist(),
            "C": np.logspace(-2, 2, 5).tolist()
        }

    # Number of candidate parameter settings; used below to decide whether
    # to parallelize over folds or leave parallelism to the search itself.
    # if name not in ["Decision Tree", "Naive Bayes"]:
    if param and hasattr(classifier, 'param_grid'):
        # e.g. GridSearchCV
        print('param', param)
        N_p = np.prod([len(l) for l in param.values()])
    elif isinstance(classifier, RandomizedSearchCV):
        N_p = classifier.n_iter
    else:
        N_p = 1

    # is_cv = isinstance(classifier, GridSearchCV) or \
    #     isinstance(classifier, RandomizedSearchCV)
    # print('Name: {}, ksplit: {}, N_p: {}'.format(name, ksplit, N_p))
    if (not parallel) or ksplit <= N_p or \
            (name == "Random Forest") or ("SGD" in name):
        logging.info("Attempting to use grid search...")
        classifier.n_jobs = PROCESSORS
        classifier.pre_dispatch = 1  # np.floor(PROCESSORS/24)
        allConfMats = []
        allTotalErrs = []
        allFittedClassifiers = []
        for i, fold_pair in enumerate(fold_pairs):
            confMats = []
            totalErrs = []
            fitted_classifiers = []
            logging.info("Classifying a %s the %d-th out of %d folds..."
                         % (name, i + 1, len(fold_pairs)))
            if rank_per_fold is not None:
                rankedVars = rank_per_fold[i]
            else:
                rankedVars = np.arange(data.shape[1])
            for numVars in numTopVars:
                logging.info('Classifying for top %i variables' % numVars)
                # print('rankedVars', rankedVars)
                confMat, totalErr, fitted_classifier = classify(
                    data[:, rankedVars[:numVars]], labels, fold_pair,
                    classifier)
                confMats.append(confMat)
                totalErrs.append(totalErr)
                fitted_classifiers.append(fitted_classifier)
            # recheck the structure of area and fScore variables
            allConfMats.append(confMats)
            allTotalErrs.append(totalErrs)
            allFittedClassifiers.append(fitted_classifiers)
    else:
        # classifier.n_jobs = PROCESSORS
        logging.info("Multiprocessing folds for classifier {}.".format(name))
        pool = Pool(processes=min(ksplit, PROCESSORS))
        out_list = pool.map(
            per_split_classifier(data, labels, classifier, numTopVars),
            zip(rank_per_fold, fold_pairs))
        pool.close()
        pool.join()
        allConfMats, allTotalErrs, allFittedClassifiers = zip(*out_list)
    return classifier, allConfMats, allTotalErrs, allFittedClassifiers
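# NOTE: per_split_classifier is used with Pool.map above but is not defined
# in this section. Below is a minimal sketch inferred only from its call
# site: Pool.map cannot pickle lambdas or closures, so it must be a
# picklable, module-level callable that takes one (ranking, fold_pair)
# tuple and returns per-variable-set results for that fold. Treat this as
# an assumption, not the project's actual implementation.
class per_split_classifier(object):
    """Picklable helper: classify one fold at several top-variable counts."""
    def __init__(self, data, labels, classifier, numTopVars):
        self.data = data
        self.labels = labels
        self.classifier = classifier
        self.numTopVars = numTopVars

    def __call__(self, ranking_and_fold):
        rankedVars, fold_pair = ranking_and_fold
        confMats, totalErrs, fitted = [], [], []
        for numVars in self.numTopVars:
            confMat, totalErr, fitted_classifier = classify(
                self.data[:, rankedVars[:numVars]], self.labels,
                fold_pair, self.classifier)
            confMats.append(confMat)
            totalErrs.append(totalErr)
            fitted.append(fitted_classifier)
        return confMats, totalErrs, fitted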
def get_rank_per_fold(data, labels, fold_pairs, ranking_function=ttest_ind,
                      save_path=None, load_file=True, parallel=True):
    '''
    Applies rank_vars to the training set of each fold pair.

    Inputs:
        data: array
            features for all samples
        labels: array
            label vector of each sample
        fold_pairs: list
            pairs of index arrays containing train and test sets
        ranking_function: function object
            function to apply for ranking features, default: ttest_ind
        save_path: str
            directory in which to load and save ranking files
        load_file: bool
            whether to try to load an existing file, default: True
        parallel: bool
            True if multicore processing is desired, default: True
    Outputs:
        rank_per_fold: list
            list of ranked feature indices for each fold pair
    '''
    file_loaded = False
    if load_file:
        if isinstance(save_path, str):
            fname = path.join(
                save_path, "{}_{}_folds.mat".format(
                    ranking_function.__name__, len(fold_pairs)))
            try:
                rd = scipy.io.loadmat(fname, mat_dtype=True)
                rank_per_fold = rd['rank_per_fold']
                file_loaded = True
            except (IOError, KeyError):
                # Missing or malformed cache file: recompute below.
                pass
        else:
            print('No rank file path: computing from scratch without saving')
    if not file_loaded:
        if not parallel:
            rank_per_fold = []
            for fold_pair in fold_pairs:
                rankedVars = rank_vars(data[fold_pair[0], :],
                                       labels[fold_pair[0]],
                                       ranking_function)
                rank_per_fold.append(rankedVars)
        else:
            pool = Pool(processes=min(len(fold_pairs), PROCESSORS))
            rank_per_fold = pool.map(
                Ranker(data, labels, ranking_function, rank_vars),
                fold_pairs)
            pool.close()
            pool.join()
        if isinstance(save_path, str):
            fname = path.join(
                save_path, "{}_{}_folds.mat".format(
                    ranking_function.__name__, len(fold_pairs)))
            with open(fname, 'wb') as f:
                scipy.io.savemat(f, {'rank_per_fold': rank_per_fold})
    return rank_per_fold
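# NOTE: Ranker is used with Pool.map above but is not defined in this
# section. A plausible minimal sketch follows, inferred from the call
# Ranker(data, labels, ranking_function, rank_vars)(fold_pair): like
# per_split_classifier, it must be a picklable, module-level callable
# (multiprocessing cannot pickle lambdas or closures) that ranks features
# on one fold's training indices. An assumption, not necessarily the
# project's actual implementation.
class Ranker(object):
    """Picklable helper: rank features on a single fold's training set."""
    def __init__(self, data, labels, ranking_function, rank_vars):
        self.data = data
        self.labels = labels
        self.ranking_function = ranking_function
        self.rank_vars = rank_vars

    def __call__(self, fold_pair):
        train_idx = fold_pair[0]
        return self.rank_vars(self.data[train_idx, :],
                              self.labels[train_idx],
                              self.ranking_function)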