Example #1
def _evalFunction(individual, searchobj, name_values, X, y, scorer, cv, iid, fit_params,
                  verbose=0, error_score='raise'):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0
    for train, test in cv:
        paramkey = str(parameters)
        if paramkey in searchobj.score_cache:
            searchobj.num_cache_hits += 1
            _score = searchobj.score_cache[paramkey]
        else:
            _score, _, _ = _fit_and_score(estimator=individual.est, X=X, y=y, scorer=scorer,
                                     train=train, test=test, verbose=verbose,
                                     parameters=parameters, fit_params=fit_params,
                                     error_score=error_score)
            searchobj.num_evaluations += 1
            searchobj.score_cache[paramkey] = _score
        if searchobj.verbose and (searchobj.num_evaluations + searchobj.num_cache_hits) % searchobj.population_size == 0:
            print "Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
                searchobj.num_evaluations, searchobj.num_cache_hits, searchobj.num_evaluations + searchobj.num_cache_hits)
        if iid:
            score += _score*len(test)
            n_test += len(test)
        else:
            score += _score
            n_test += 1
    score /= float(n_test)

    return (score,)
Example #2
def _evalFunction(individual,
                  name_values,
                  X,
                  y,
                  scorer,
                  cv,
                  iid,
                  fit_params,
                  verbose=0,
                  error_score='raise',
                  score_cache={}):
    """ Developer Note:
        --------------------
        score_cache was purposefully moved to parameters, and given a dict reference.
        It will be modified in-place by _evalFunction based on it's reference.
        This is to allow for a managed, paralell memoization dict,
        and also for different memoization per instance of EvolutionaryAlgorithmSearchCV.
        Remember that dicts created inside function definitions are presistent between calls,
        So unless it is replaced this function will be memoized each call automatically. """

    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0

    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv.split(X, y):
            assert len(train) > 0 and len(
                test
            ) > 0, "Training and/or testing not long enough for evaluation."
            try:
                _score = _fit_and_score(estimator=individual.est,
                                        X=X,
                                        y=y,
                                        scorer=scorer,
                                        train=train,
                                        test=test,
                                        verbose=verbose,
                                        parameters=parameters,
                                        fit_params=fit_params,
                                        error_score=error_score)[0]
            except Exception:
                return (-np.inf, )

            if iid:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1

        assert n_test > 0, "No fitting was accomplished, check data and cross validation method."
        score /= float(n_test)
        score_cache[paramkey] = score

    return (score, )
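A note on the developer comment in Example #2: a mutable default argument such as score_cache={} is created once, at function definition time, and is shared by every call that does not pass its own dict, which is exactly what makes the in-function memoization work. A minimal, self-contained sketch of that behavior (cached_square is a hypothetical function, not part of the example above):

def cached_square(n, cache={}):
    # The default `cache` dict is created once at definition time and persists
    # across calls, so results are memoized unless a fresh dict is passed in.
    if n not in cache:
        cache[n] = n * n
    return cache[n]

cached_square(3)            # computes and stores 9 in the shared default dict
cached_square(3)            # served from the shared default dict
cached_square(3, cache={})  # per-call dict: no sharing, as the note describes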
Example #3
    def evaluate(self, dataset, pipelines):
        if not self.is_valid(dataset):
            raise AssertionError("Dataset is not appropriate for evaluation")
        for subject in dataset.subject_list:
            # check if we already have result for this subject/pipeline
            # we might need a better granularity, if we query the DB
            run_pipes = self.results.not_yet_computed(pipelines, dataset, subject)
            if len(run_pipes) == 0:
                continue

            # get the data
            X, y, metadata = self.paradigm.get_data(
                dataset, [subject], self.return_epochs
            )
            le = LabelEncoder()
            y = y if self.mne_labels else le.fit_transform(y)
            groups = metadata.session.values
            scorer = get_scorer(self.paradigm.scoring)

            for name, clf in run_pipes.items():
                # we want to store one result per session
                cv = LeaveOneGroupOut()
                for train, test in cv.split(X, y, groups):
                    t_start = time()
                    if isinstance(X, BaseEpochs):
                        cvclf = clone(clf)
                        cvclf.fit(X[train], y[train])
                        score = scorer(cvclf, X[test], y[test])
                    else:
                        result = _fit_and_score(
                            clone(clf),
                            X,
                            y,
                            scorer,
                            train,
                            test,
                            verbose=False,
                            parameters=None,
                            fit_params=None,
                            error_score=self.error_score,
                        )
                        score = result["test_scores"]
                    duration = time() - t_start
                    nchan = X.info["nchan"] if isinstance(X, BaseEpochs) else X.shape[1]
                    res = {
                        "time": duration,
                        "dataset": dataset,
                        "subject": subject,
                        "session": groups[test][0],
                        "score": score,
                        "n_samples": len(train),
                        "n_channels": nchan,
                        "pipeline": name,
                    }
                    yield res
Example #4
def _evalFunction(individual,
                  name_values,
                  X,
                  y,
                  scorer,
                  cv,
                  uniform,
                  fit_params,
                  verbose=0,
                  error_score='raise',
                  score_cache={}):
    """[Evaluación del modelo]
	Arguments:
		individual {[creator.Individual]} -- [Individuo]
		name_values {[list]} -- [parámetros en general]
		X {[array]} -- [Input]
		y {[array]} -- [Output]
		scorer {[string]} -- [Parámetro de evaluación, precisión]
		cv {[int | cross-validation]} -- [Especificación de los folds]
		uniform {[boolean]} -- [True hace que la data se distribuya uniformemente en los folds]
		fit_params {[dict | None]} -- [parámetros para estimator.fit]
	Keyword Arguments:
		verbose {integer} -- [Mensajes de descripción] (default: {0})
		error_score {numerico} -- [valor asignado si ocurre un error en fitting] (default: {'raise'})
		score_cache {dict} -- [description] (default: {{}})
	"""
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0
    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv.split(X, y):
            _score = _fit_and_score(estimator=individual.est,
                                    X=X,
                                    y=y,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=verbose,
                                    parameters=parameters,
                                    fit_params=fit_params,
                                    error_score=error_score)[0]
            if uniform:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1
        assert n_test > 0, "No se completo el fitting, Verificar data."
        score /= float(n_test)
        score_cache[paramkey] = score
    return (score, )
Example #5
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                    X=features,
                                    y=target,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    parameters=None,
                                    fit_params=sample_weight_dict)
                                for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
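The docstring above assumes the standard sklearn scorer convention: a callable with signature scorer(estimator, X, y) that returns a single float. A minimal sketch of that convention using only public sklearn APIs (the dataset and estimator here are illustrative, not taken from the example):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring

X, y = load_iris(return_X_y=True)
est = LogisticRegression(max_iter=500).fit(X, y)
scorer = check_scoring(est, scoring="accuracy")  # builds a scorer(estimator, X, y) callable
print(scorer(est, X, y))                         # a single float: accuracy on (X, y)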
Example #6
 def fun(tup):
     # DO NOT REFERENCE `self` ANYWHERE IN THIS FUNCTION.
     # IT WILL CAUSE A SPARK-5063 ERROR.
     (index, (parameters, train, test)) = tup
     local_estimator = clone(base_estimator)
     local_X = X_bc.value
     local_y = y_bc.value
     res = _fit_and_score(local_estimator, local_X, local_y, scorers,
         train, test, verbose, parameters, fit_params=fit_params,
         return_train_score=return_train_score,
         return_n_test_samples=True, return_times=True,
         error_score=error_score)
     return (index, res)
Example #7
def _evalFunction(individual, name_values, X, y, scorer, cv, uniform, fit_params,
                  verbose=0, error_score='raise', score_cache={}, result_cache=[]):
    parameters = _individual_to_params(individual, name_values)
    nombreModelo = str(individual.est).split('(')[0] # individual.est.__class__.__name__
    score = 0
    paramkey = nombreModelo+str(individual)
    if 'genCount' in score_cache:
        score_cache['genCount'] = score_cache['genCount'] + 1
    else:
        score_cache['genCount'] = 1
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        try:
            resultIndividuo = []
            scorer = { 'mae': make_scorer(mae), 'mse': make_scorer(mse), 'approach': make_scorer(distance2d) }
            for train, test in cv.split(X, y):
                resultIndividuo.append(_fit_and_score(estimator=individual.est, X=X, y=y, scorer=scorer,  parameters=parameters,
                        train=train, test=test, verbose=verbose, fit_params=None, return_times=True))
            df = pd.DataFrame(list(map(lambda x: _evalfs(x), resultIndividuo)))
            accuracy = np.array(resultIndividuo)[:, 0]  # accuracy
            runtime = np.array(resultIndividuo)[:, 2] + np.array(resultIndividuo)[:, 1]  # runtime train+test
            score = df['approach'].mean()
            score_cache[paramkey] = score
            dict_result = parameters
            dict_result['Accuracy'] = score
            dict_result['stdApproach'] = df['approach'].std()
            dict_result['MSE'] = df['mse'].mean()
            dict_result['stdMSE'] = df['mse'].std()
            dict_result['MAE'] = df['mae'].mean()
            dict_result['stdMAE'] = df['mae'].std()
            dict_result['Runtime'] = df['time'].mean()
            dict_result['stdRuntime'] = df['time'].std()
            dict_result['genCount'] = score_cache['genCount']
            result_cache.append(dict_result)
        except Exception as ex:
            print(ex)
            score_cache[paramkey] = 0
            dict_result = parameters
            dict_result['Accuracy'] = 0
            dict_result['stdApproach'] = 0
            dict_result['Runtime'] = 0
            dict_result['stdRuntime'] = 0
            dict_result['MSE'] = 100
            dict_result['stdMSE'] = 100
            dict_result['MAE'] = 100
            dict_result['stdMAE'] = 100
            dict_result['genCount'] = score_cache['genCount']
            result_cache.append(dict_result)
    return (score,)
Example #8
 def getModelAccuracy(self, parametros, individual, score_cache, resultados,
                      generacion):
     params = _individual_to_params(individual, parametros)
     score = 0
     scoring = "accuracy"
     nombreModelo = str(self.estimator).split('(')[0]
     paramkey = nombreModelo + str(np.int32(individual))
     if paramkey in score_cache:
         score = score_cache[paramkey]
     else:
         try:
             resultIndividuo = []
             cv = KFold(n_splits=10, shuffle=False)
             scorer = check_scoring(self.estimator, scoring=scoring)
             for train, test in cv.split(self.X, self.y):
                 resultIndividuo.append(
                     _fit_and_score(estimator=self.estimator,
                                    X=self.X,
                                    y=self.y,
                                    scorer=scorer,
                                    parameters=params,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    fit_params=None,
                                    return_times=True))
             accuracy = np.array(resultIndividuo)[:, 0]  # accuracy
             runtime = np.array(resultIndividuo)[:, 2] + np.array(
                 resultIndividuo)[:, 1]  # runtime train+test
             score = accuracy.mean()
             score_cache[paramkey] = score
             dict_result = params
             dict_result['Accuracy'] = score
             dict_result['stdAccuracy'] = accuracy.std()
             dict_result['Runtime'] = runtime.mean()
             dict_result['stdRuntime'] = runtime.std()
             dict_result['generacion'] = generacion
             resultados.append(dict_result)
         except Exception as ex:
             print(ex)
             score_cache[paramkey] = 0
             dict_result = params
             dict_result['Accuracy'] = 0
             dict_result['stdAccuracy'] = 0
             dict_result['Runtime'] = 0
             dict_result['stdRuntime'] = 0
             dict_result['generacion'] = generacion
             resultados.append(dict_result)
     return score
Example #9
        def evaluate_candidates(candidate_params):
            if isinstance(candidate_params, dict) or isinstance(
                    candidate_params, defaultdict):
                candidate_params = list(candidate_params)
            n_candidates = len(candidate_params)

            if self.verbose > 0:
                print(
                    "Fitting {0} folds for each of {1} remaining candidates, totalling {2} fits"
                    .format(n_splits, n_candidates, n_candidates * n_splits))

            # print('list(cv.split(X, y, groups)): %s' % list(cv.split(X, y, groups)))
            # print('list(product(candidate_params, cv.split(X, y, groups))): %s' % list(product(candidate_params, cv.split(X, y, groups))))

            fold_num = 0
            for parameters, (train, test) in product(candidate_params,
                                                     cv.split(X, y, groups)):
                print('product index/fold number: %d' % fold_num)
                # print('\tparams: %s' % parameters)
                # print('\ttrain: %s' % train)
                # print('\ttest: %s' % test)
                out = _fit_and_score(estimator=clone(base_estimator),
                                     X=X,
                                     y=y,
                                     train=train,
                                     test=test,
                                     parameters=parameters,
                                     **fit_and_score_kwargs)
                print('\tout: %s' % out)

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)
                # nonlocal keyword is exactly what it sounds like, uses the outer function scope: w3schools.com/python/ref_keyword_nonlocal.asp
                nonlocal results
                # results = self._format_results(all_candidate_params, scorers, n_splits, all_out)
                result = self._format_result(candidate_param=parameters,
                                             scorer=scorers,
                                             n_splits=n_splits,
                                             out=out)
                results.append(result)
                self.cv_results.append(result)
                # Just finished training a model, should cv_results be saved?
                if fold_num % self.cv_results_save_freq == 0:
                    self._save_cv_results()
                fold_num += 1
            return self.cv_results
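The nonlocal comment in Example #9 refers to standard Python scoping: nonlocal rebinds a name in the nearest enclosing function scope instead of creating a new local. A tiny illustrative sketch (make_counter is hypothetical, unrelated to the example):

def make_counter():
    count = 0
    def bump():
        nonlocal count  # rebind the enclosing `count` rather than shadowing it
        count += 1
        return count
    return bump

counter = make_counter()
print(counter(), counter())  # 1 2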
Example #10
def _evalFunction(individual,
                  name_values,
                  X,
                  y,
                  scorer,
                  cv,
                  iid,
                  fit_params,
                  verbose=0,
                  error_score='raise',
                  score_cache={}):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0

    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv.split(X, y):
            assert len(train) > 0 and len(
                test
            ) > 0, "Training and/or testing not long enough for evaluation."
            _score = _fit_and_score(estimator=individual.est,
                                    X=X,
                                    y=y,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=verbose,
                                    parameters=parameters,
                                    fit_params=fit_params,
                                    error_score=error_score)[0]

            if iid:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1

        assert n_test > 0, "No fitting was accomplished, check data and cross validation method."
        score /= float(n_test)
        score_cache[paramkey] = score

    return (score, )
Example #11
 def obj(x):
     cand_params = {}
     _idx = 0
     for _param in self.params_list:
         _param_num_range = self.params[_param]
         if _param_num_range.VType != 'hdreal':
             if _param_num_range.VType == 'integer':
                 cand_params[_param] = int(round(x[_idx]))
             elif _param_num_range.VType == 'categorical':
                 cand_params[_param] = _param_num_range.items[int(round(x[_idx]))]
             else:
                 cand_params[_param] = x[_idx]
             _idx += 1
         else:
             _cls_dict = {}
             for i_ in range(_param_num_range.n):
                 _cls_dict[target_classes[i_]] = x[_idx]
                 _idx += 1
             cand_params[_param] = _cls_dict
     cl = clone(self.estimator)
     cl.set_params(**cand_params)
     score = 0
     n_test = 0
     for train, test in cv_dat:
         #if True:
         try:
             _score = _fit_and_score(estimator=cl, X=X, y=y, scorer=self.scorer_,
                                     train=train, test=test, verbose=self.verbose,
                                     parameters=cand_params, fit_params=self.fit_params,
                                     error_score=self.error_score)[0]
             if self.iid:
                 score += _score * len(test)
                 n_test += len(test)
             else:
                 score += _score
                 n_test += 1
         #else:
         except ValueError:
             pass
         except:# LightGBMError:
             pass
     score /= float(max(n_test, 1))
     if is_classifier(self.estimator):
         return - score
     else:
         return score
Example #12
def getModelAccuracy(parametros, individual, estimator, score_cache,
                     resultados):
    X, y = _createDataset(
        "Tx_0x06"
    )  # adapted from the configuration search; needs to be adapted to factor this out
    params = _individual_to_params(individual, parametros)
    score = 0
    scoring = "accuracy"
    nombreModelo = str(estimator).split('(')[0]
    paramkey = nombreModelo + str(np.int32(individual))
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        resultIndividuo = []
        cv = KFold(n_splits=10, shuffle=False)
        scorer = check_scoring(estimator, scoring=scoring)
        for train, test in cv.split(X, y):
            resultIndividuo.append(
                _fit_and_score(estimator=estimator,
                               X=X,
                               y=y,
                               scorer=scorer,
                               parameters=params,
                               train=train,
                               test=test,
                               verbose=0,
                               fit_params=None,
                               return_times=True))
        accuracy = np.array(resultIndividuo)[:, 0]  #accuracy
        runtime = np.array(resultIndividuo)[:, 2] + np.array(
            resultIndividuo)[:, 1]  #runtime train+test
        # error = distance_error(estimator, X, y)
        score = accuracy.mean()
        score_cache[paramkey] = score
        dict_result = {
            'Modelo': nombreModelo,
            'Parametros': params,
            'Accuracy': accuracy.mean(),
            'stdAccuracy': accuracy.std(),
            'Runtime': runtime.mean(),
            'accuracy_values': accuracy,
            'runtime_values': runtime,
        }
        resultados.append(dict_result)
    return score
Example #13
 def getModelApproach(self, parametros, individual, score_cache, resultados, generacion):
     params = _individual_to_params(individual, parametros)
     score = 0
     scoring = "mse"
     nombreModelo = str(self.estimator).split('(')[0]
     paramkey = nombreModelo + str(np.int32(individual))
     if paramkey in score_cache:
         score = score_cache[paramkey]
     else:
         try:
             resultIndividuo = []
             cv = KFold(n_splits=10, shuffle=True, random_state=self.seed)
             scorer = scoring_reg = { 'mae': make_scorer(mae), 'mse': make_scorer(mse), 'approach': make_scorer(distance2d) }
             for train, test in cv.split(self.X, self.y):
                 resultIndividuo.append(_fit_and_score(estimator=self.estimator, X=self.X, y=self.y, scorer=scorer, parameters=params,
                                 train=train, test=test, verbose=0, fit_params=None, return_times=True))
             df = pd.DataFrame(list(map(lambda x: _evalfs(x), resultIndividuo)))
             score = df['approach'].mean()
             score_cache[paramkey] = score
             dict_result = params
             dict_result['Accuracy'] = score
             dict_result['stdApproach'] = df['approach'].std()
             dict_result['MSE'] = df['mse'].mean()
             dict_result['stdMSE'] = df['mse'].std()
             dict_result['MAE'] = df['mae'].mean()
             dict_result['stdMAE'] = df['mae'].std()
             dict_result['Runtime'] = df['time'].mean()
             dict_result['stdRuntime'] = df['time'].std()
             dict_result['generacion'] = generacion
             resultados.append(dict_result)
         except Exception as ex:
             print(ex)
             score_cache[paramkey] = 0
             dict_result = params
             dict_result['Accuracy'] = 0
             dict_result['stdApproach'] = 0
             dict_result['Runtime'] = 0
             dict_result['stdRuntime'] = 0
             dict_result['MSE'] = 100
             dict_result['stdMSE'] = 100
             dict_result['MAE'] = 100
             dict_result['stdMAE'] = 100
             dict_result['generacion'] = generacion
             resultados.append(dict_result)
     return score
Example #14
    def evaluate(self, dataset, pipelines):
        if not self.is_valid(dataset):
            raise AssertionError('Dataset is not appropriate for evaluation')
        for subject in dataset.subject_list:
            # check if we already have result for this subject/pipeline
            # we might need a better granularity, if we query the DB
            run_pipes = self.results.not_yet_computed(pipelines, dataset,
                                                      subject)
            if len(run_pipes) == 0:
                continue

            # get the data
            X, y, metadata = self.paradigm.get_data(dataset, [subject])
            le = LabelEncoder()
            y = le.fit_transform(y)
            groups = metadata.session.values
            scorer = get_scorer(self.paradigm.scoring)

            for name, clf in run_pipes.items():

                # we want to store one result per session
                cv = LeaveOneGroupOut()
                for train, test in cv.split(X, y, groups):
                    t_start = time()
                    score = _fit_and_score(clone(clf),
                                           X,
                                           y,
                                           scorer,
                                           train,
                                           test,
                                           verbose=False,
                                           parameters=None,
                                           fit_params=None)[0]
                    duration = time() - t_start
                    res = {
                        'time': duration,
                        'dataset': dataset,
                        'subject': subject,
                        'session': groups[test][0],
                        'score': score,
                        'n_samples': len(train),
                        'n_channels': X.shape[1],
                        'pipeline': name
                    }
                    yield res
Example #15
def _evalFunction(individual, gaobject, estimator, X, y, cv, scorer, verbose, fit_params, caching):
    individual_sum = np.sum(individual, axis=0)
    if individual_sum == 0:
        return -10000, individual_sum
    individual_tuple = tuple(individual)
    if caching and individual_tuple in gaobject.scores_cache:
        return gaobject.scores_cache[individual_tuple], individual_sum
    # `individual` is a boolean feature mask; np.bool was removed in NumPy >= 1.24
    X_selected = X[:, np.array(individual, dtype=bool)]
    scores = []
    for train, test in cv.split(X, y):
        score = _fit_and_score(estimator=estimator, X=X_selected, y=y, scorer=scorer,
                               train=train, test=test, verbose=verbose, parameters=None,
                               fit_params=fit_params)
        scores.append(score)
    scores_mean = np.mean(scores)
    if caching:
        gaobject.scores_cache[individual_tuple] = scores_mean
    return scores_mean, individual_sum
Example #16
            def spark_task(tup):
                (index, (parameters, train, test)) = tup
                local_estimator = clone(base_estimator)
                local_X = X_bc.value
                local_y = y_bc.value

                error = None
                with warnings.catch_warnings(record=True) as warns:
                    try:
                        res = _fit_and_score(
                            local_estimator, X=local_X, y=local_y, train=train,
                            test=test, parameters=parameters,
                            **fit_and_score_kwargs)
                    except Exception as e:
                        res = None
                        error = e

                return index, {"results": res, "errors": error, "warnings": warns}
Example #17
    def _fit_score_and_log(self,
                           estimator,
                           X,
                           y,
                           scorer,
                           train,
                           test,
                           verbose,
                           parameters,
                           fit_params,
                           return_train_score=False,
                           return_parameters=False,
                           return_n_test_samples=False,
                           return_times=False,
                           return_estimator=False,
                           error_score=np.nan):
        fit_result = _fit_and_score(estimator, X, y, scorer, train, test,
                                    verbose, parameters, fit_params, True,
                                    True, True, True, True, error_score)

        test_score = fit_result[0]['score']
        train_score = fit_result[1]['score']
        sample_count = fit_result[2]
        durations = [fit_result[3], fit_result[4]]
        parameters = fit_result[5]
        estimator = fit_result[6]

        self.log_results(train_score, test_score, sample_count, durations,
                         parameters, estimator)

        if return_train_score:
            ret = [fit_result[0], fit_result[1]]
        else:
            ret = [fit_result[0]]
        if return_n_test_samples:
            ret.append(sample_count)
        if return_times:
            ret.extend(durations)
        if return_parameters:
            ret.append(parameters)
        if return_estimator:
            ret.append(estimator)

        return ret
Example #18
        def test_one_parameter(task):
            (index, (parameters, split_idx)) = task
            local_estimator = clone(base_estimator)
            local_x = x_bc.value
            local_y = y_bc.value
            local_groups = groups_bc.value

            train, test = next(
                islice(cv.split(local_x, local_y, local_groups), split_idx,
                       split_idx + 1))
            res = _fit_and_score(local_estimator,
                                 local_x,
                                 local_y,
                                 train=train,
                                 test=test,
                                 parameters=parameters,
                                 **fit_and_score_kwargs)

            return index, res
Example #19
def grid_search_cv(model, param_grid, precomputed_kernels, y, cv=5):
	cv = StratifiedKFold(n_splits=cv, shuffle=False)
	results = []
	for train_index, test_index in cv.split(precomputed_kernels[0], y):
		split_results = []
		params = []
		for idx, K in enumerate(precomputed_kernels):
			for p in list(ParameterGrid(param_grid)):
				sc = _fit_and_score(clone(model), K, y, scorer=make_scorer(accuracy_score),
					train=train_index, test=test_index, verbose=0, parameters=p,fit_params=None)
				split_results.append(sc)
				params.append({'K_idx':idx, 'params':p})
		results.append(split_results)
	results = np.array(results)
	fin_results = results.mean(axis=0)
	best_idx = np.argmax(fin_results)
	print(best_idx, fin_results[best_idx])
	ret_model = clone(model).set_params(**params[best_idx]['params'])
	return ret_model.fit(precomputed_kernels[params[best_idx]['K_idx']], y), params[best_idx]
Example #20
def _evalFunction(individual, gaobject, estimator, X, y, cv, scorer, verbose, fit_params,
                  max_features, caching):
    individual_sum = np.sum(individual, axis=0)
    if individual_sum == 0 or individual_sum > max_features:
        return -10000, individual_sum
    individual_tuple = tuple(individual)
    if caching and individual_tuple in gaobject.scores_cache:
        return gaobject.scores_cache[individual_tuple], individual_sum
    # `individual` is a boolean feature mask; np.bool was removed in NumPy >= 1.24
    X_selected = X[:, np.array(individual, dtype=bool)]
    scores = []
    for train, test in cv.split(X, y):
        score = _fit_and_score(estimator=estimator, X=X_selected, y=y, scorer=scorer,
                               train=train, test=test, verbose=verbose, parameters=None,
                               fit_params=fit_params)
        scores.append(score)
    scores_mean = np.mean(scores)
    if caching:
        gaobject.scores_cache[individual_tuple] = scores_mean
    return scores_mean, individual_sum
Example #21
def _evalFunctionClassifier(individual, name_values, X, y, scorer, cv, uniform, fit_params,
                  verbose=0, error_score='raise', score_cache={}, result_cache=[]):
    parameters = _individual_to_params(individual, name_values)
    nombreModelo = str(individual.est).split('(')[0]
    score = 0
    paramkey = nombreModelo+str(individual)
    if 'genCount' in score_cache:
        score_cache['genCount'] = score_cache['genCount'] + 1
    else:
        score_cache['genCount'] = 1
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        try:
            resultIndividuo = []
            scorer = check_scoring(individual.est, scoring="accuracy")
            for train, test in cv.split(X, y):
                resultIndividuo.append(_fit_and_score(estimator=individual.est, X=X, y=y, scorer=scorer,
                            train=train, test=test, verbose=verbose, parameters=parameters, fit_params=None, return_times=True))
            accuracy = np.array(resultIndividuo)[:, 0]  # accuracy
            runtime = np.array(resultIndividuo)[:, 2] + np.array(resultIndividuo)[:, 1]  # runtime train+test
            score = accuracy.mean()
            score_cache[paramkey] = score
            dict_result = parameters
            dict_result['Accuracy'] = score
            dict_result['stdAccuracy'] = accuracy.std()
            dict_result['Runtime'] = runtime.mean()
            dict_result['stdRuntime'] = runtime.std()
            dict_result['genCount'] = score_cache['genCount']
            result_cache.append(dict_result)
        except Exception as ex:
            print(ex)
            score_cache[paramkey] = 0
            dict_result = parameters
            dict_result['Accuracy'] = 0
            dict_result['stdAccuracy'] = 0
            dict_result['Runtime'] = 0
            dict_result['stdRuntime'] = 0
            dict_result['genCount'] = score_cache['genCount']
            result_cache.append(dict_result)
    return (score,)
Example #22
def _evalFunction(individual,
                  name_values,
                  X,
                  y,
                  scorer,
                  cv,
                  uniform,
                  fit_params,
                  verbose=0,
                  error_score='raise',
                  score_cache={}):

    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0
    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv.split(X, y):
            _score = _fit_and_score(estimator=individual.est,
                                    X=X,
                                    y=y,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=verbose,
                                    parameters=parameters,
                                    fit_params=fit_params,
                                    error_score=error_score)[0]
            if uniform:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1
        assert n_test > 0, "No se completó el fitting, verificar data."
        score /= float(n_test)
        score_cache[paramkey] = score
    return (score, )
Example #23
def fit_and_get_score(estimator, X, y):
    cv = check_cv(cv=K_FOLD_CROSS_VALIDATION,
                  y=y,
                  classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=None)

    scores = []
    for train, test in cv.split(X, y):
        score = _fit_and_score(estimator=estimator,
                               X=X,
                               y=y,
                               scorer=scorer,
                               train=train,
                               test=test,
                               verbose=0,
                               parameters=None,
                               fit_params=None)
        scores.append(score)

    scores_mean = np.mean(scores)

    return scores_mean
Example #24
 def parallel_fit_score(cl, cand_params, X, y, scorer, train, test,
                        verbose, fit_params, error_score):
     cl.set_params(**cand_params)
     try:
         _score = _fit_and_score(
             estimator=cl,
             X=X,
             y=y,
             scorer=scorer,  #
             train=train,
             test=test,
             verbose=verbose,  #
             parameters=cand_params,
             fit_params=fit_params,  #
             error_score=error_score,  #
         )[0]
         return _score
     except ValueError:
         pass
     except:  # LightGBMError:
         pass
     return None
Example #25
def _evalFunction(individual, name_values, X, y, scorer, cv, uniform, fit_params,
                verbose=0, error_score='raise', score_cache={}):
    """[Evaluación del modelo]
    Arguments:
        individual {[creator.Individual]} -- [Individuo]
        name_values {[list]} -- [parámetros en general]
        X {[array]} -- [Input]
        y {[array]} -- [Output]
        scorer {[string]} -- [Parámetro de evaluación, precisión]
        cv {[int | cross-validation]} -- [Especificación de los folds]
        uniform {[boolean]} -- [True hace que la data se distribuya uniformemente en los folds]
        fit_params {[dict | None]} -- [parámetros para estimator.fit]
    Keyword Arguments:
        verbose {integer} -- [Mensajes de descripción] (default: {0})
        error_score {numerico} -- [valor asignado si ocurre un error en fitting] (default: {'raise'})
        score_cache {dict} -- [description] (default: {{}})
    """
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0
    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv.split(X, y):
            _score = _fit_and_score(estimator=individual.est, X=X, y=y, scorer=scorer,
                        train=train, test=test, verbose=verbose,
                        parameters=parameters, fit_params=fit_params,
                        error_score=error_score)[0]
            if uniform:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1
        assert n_test > 0, "No se completo el fitting, Verificar data."
        score /= float(n_test)
        score_cache[paramkey] = score
    return (score,)
Example #26
def fit_grid_point(X,
                   y,
                   estimator,
                   parameters,
                   train,
                   test,
                   scorer,
                   verbose,
                   error_score=np.nan,
                   **fit_params):
    check_scoring(estimator, scorer)
    scores, n_samples_test = _fit_and_score(estimator,
                                            X,
                                            y,
                                            scorer,
                                            train,
                                            test,
                                            verbose,
                                            parameters,
                                            fit_params=fit_params,
                                            return_n_test_samples=True,
                                            error_score=error_score)
    return scores, parameters, n_samples_test
Example #27
    def process_batch(self, work_batch):
        fit_params = self.fit_params if self.fit_params is not None else {}

        LOG.debug("Node %d received %d work items", comm_rank, len(work_batch))

        results = []
        for fold_id, train_index, test_index, parameters in work_batch:
            ret = _fit_and_score(clone(self.estimator),
                                 self._data_X, self._data_y,
                                 self.scorer, train_index, test_index,
                                 self.verbose, parameters, fit_params,
                                 return_n_test_samples=True,
                                 return_times=True)

            result = parameters.copy()
            result['score'] = ret[0]
            result['n_samples_test'] = ret[1]
            result['scoring_time'] = ret[2]
            result['fold'] = fold_id
            results.append(result)

        LOG.debug("Node %d is done with fold %d", comm_rank, fold_id)
        return results
Example #28
    def _evaluate_one(self, estimator, data_preproc, scorers):
        res = []
        for X_train, X_test, y_train, y_test in data_preproc:
            X = np.vstack([X_train, X_test])
            if y_train.ndim < 2 and y_test.ndim < 2:
                y = np.hstack([y_train, y_test])
            else:
                y = np.vstack([y_train, y_test])
            train = np.arange(len(X_train))
            test = np.arange(len(X_train), len(X_test) + len(X_train))
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore',
                                        category=UndefinedMetricWarning)
                test_scores = _fit_and_score(estimator,
                                             X,
                                             y,
                                             scorer=scorers,
                                             train=train,
                                             test=test,
                                             parameters={},
                                             fit_params={},
                                             verbose=self.verbose)[0]
            res.append(test_scores)

        res_mean = pd.DataFrame(res).mean(axis=0)
        try:
            # show only last step of pipeline for simplicity
            name = nice_repr(estimator.steps[-1][1])
        except AttributeError:
            name = nice_repr(estimator)

        if self.verbose:
            print("Running {}".format(name))
            print(_format_scores(res_mean))
        res_mean.name = name
        self.log_.append(res_mean)
        return res_mean
Example #29
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                                    for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
Example #30
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None,
                             use_dask=False,
                             random_state=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)
    features, target, groups = indexable(features, target, groups)
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)
    if isinstance(cv, float):
        try:

            def split(l, test_size=0.2):
                random.seed(random_state)
                n_total = len(l)
                offset = int(n_total * test_size)
                if n_total == 0 or offset < 1:
                    return [], l
                random.shuffle(l)
                sublist_1 = l[:offset]
                sublist_2 = l[offset:]
                return sublist_1, sublist_2

            train_indices, test_indices = split(list(range(len(features))),
                                                test_size=cv)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                score = _fit_and_score(estimator=clone(sklearn_pipeline),
                                       X=features,
                                       y=target,
                                       scorer=scorer,
                                       train=train_indices,
                                       test=test_indices,
                                       verbose=0,
                                       parameters=None,
                                       fit_params=sample_weight_dict)
                return score[0]
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except Exception as e:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies.\n{}".format(
                e)
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [
            cv_results['split{}_test_score'.format(i)] for i in range(n_splits)
        ]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [
                    _fit_and_score(estimator=clone(sklearn_pipeline),
                                   X=features,
                                   y=target,
                                   scorer=scorer,
                                   train=train,
                                   test=test,
                                   verbose=0,
                                   parameters=None,
                                   fit_params=sample_weight_dict)
                    for train, test in cv_iter
                ]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
Example #31
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None,
                             use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: cross-validation generator
        Object to be used as a cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except Exception as e:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies.\n{}".format(
                e)
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [
            cv_results['split{}_test_score'.format(i)] for i in range(n_splits)
        ]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [
                    _fit_and_score(estimator=clone(sklearn_pipeline),
                                   X=features,
                                   y=target,
                                   scorer=scorer,
                                   train=train,
                                   test=test,
                                   verbose=0,
                                   parameters=None,
                                   error_score='raise',
                                   return_estimator=True,
                                   fit_params=sample_weight_dict)
                    for train, test in cv_iter
                ]
                if isinstance(scores[0], list):  #scikit-learn <= 0.23.2
                    CV_score = np.array(scores)[:, 0]
                elif isinstance(scores[0], dict):  # scikit-learn >= 0.24
                    from sklearn.model_selection._validation import _aggregate_score_dicts
                    CV_score = _aggregate_score_dicts(scores)["test_scores"]
                    CV_fitted_pipeline = _aggregate_score_dicts(
                        scores)["estimator"]
                else:
                    raise ValueError(
                        "Incorrect output format from _fit_and_score!")
                fit_and_score_details = dict()
                fit_and_score_details["CV_score_mean"] = np.nanmean(CV_score)
                fit_and_score_details[
                    "CV_fitted_best_pipeline"] = CV_fitted_pipeline[0]
            return fit_and_score_details
        except TimeoutException:
            fit_and_score_details = dict()
            fit_and_score_details["CV_score_mean"] = "Timeout"
            fit_and_score_details["CV_fitted_best_pipeline"] = None
            return fit_and_score_details
        except Exception as e:
            fit_and_score_details = dict()
            fit_and_score_details["CV_score_mean"] = -float('inf')
            fit_and_score_details["CV_fitted_best_pipeline"] = None
            return fit_and_score_details
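The non-dask branch above scores each CV split separately and then averages the per-split test scores with np.nanmean. As a hedged, self-contained sketch (not part of the example above), the same pattern can be expressed with scikit-learn's public cross_validate API on a toy pipeline and synthetic data:

# Sketch only: public-API analogue of the per-split scoring and NaN-aware
# averaging above; the pipeline and data below are placeholders.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

results = cross_validate(pipeline, X, y, cv=cv, scoring="accuracy",
                         return_estimator=True, error_score=float("-inf"))

# Mirrors the CV_score_mean / CV_fitted_best_pipeline bookkeeping above.
cv_score_mean = np.nanmean(results["test_score"])
first_fitted_pipeline = results["estimator"][0]
print(cv_score_mean, first_fitted_pipeline)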
Example #32
0
def fit_and_score(estimator,
                  X,
                  y,
                  scorer,
                  train,
                  test,
                  para,
                  fit_params=None,
                  return_train_score=True,
                  return_n_test_samples=True,
                  return_times=True,
                  return_parameters=True,
                  error_score='raise',
                  verbose=True):
    '''
    Fit an estimator to a dataset and score the performance. The following
    methods can currently be applied as preprocessing before fitting in
    this order:
    1. Select features based on type group.
    2. Apply feature imputation.
    3. Apply feature selection based on variance of feature among patients.
    4. Scale features with e.g. z-scoring.
    5. Select features based on a fit with a LASSO model.
    6. Select features using PCA.

    All of the steps are optional.

    Parameters
    ----------
    estimator: sklearn estimator, mandatory
            Unfitted estimator which will be fit.

    X: array, mandatory
            Array containing, for each object (rows), the feature values
            (1st column) and the associated feature label (2nd column).

    y: list(?), mandatory
            List containing the labels of the objects.

    scorer: sklearn scorer, mandatory
            Function used as optimization criterion for the hyperparameter optimization.

    train: list, mandatory
            Indices of the objects to be used as training set.

    test: list, mandatory
            Indices of the objects to be used as testing set.

    para: dictionary, mandatory
            Contains the settings used for the above preprocessing functions
            and the fitting. TODO: Create a default object and show the
            fields.

    fit_params: dictionary, default None
            Parameters supplied to the estimator for fitting. See the SKlearn
            site for the parameters of the estimators.

    return_train_score: boolean, default True
            Save the training score to the final SearchCV object.

    return_n_test_samples: boolean, default True
            Save the number of times each sample was used in the test set
            to the final SearchCV object.

    return_times: boolean, default True
            Save the time spent on each fit to the final SearchCV object.

    return_parameters: boolean, default True
            Return the parameters used in the final fit to the final SearchCV
            object.

    error_score: numeric or "raise" by default
            Value to assign to the score if an error occurs in estimator
            fitting. If set to "raise", the error is raised. If a numeric
            value is given, FitFailedWarning is raised. This parameter
            does not affect the refit step, which will always raise the error.

    verbose: boolean, default=True
            If True, print intermediate progress to command line. Warnings are
            always printed.

    Returns
    ----------
    ret: list
            Output of the _fit_and_score call, with the original parameter
            dictionary appended.

    GroupSel: fitted SelectGroups object, or None if group selection was not used.

    VarSel: fitted variance-based selector, or None if not used.

    SelectModel: fitted SelectFromModel object, or None if not used.

    feature_labels: labels of the features remaining after selection.

    scaler: fitted scaler, or None if not used.

    imputer: fitted imputer, or None if not used.

    pca: fitted PCA object, or None if not used.

    StatisticalSel: fitted StatisticalTestThreshold object, or None if not used.

    '''
    # We copy the parameter object so we can alter it and keep the original
    para_estimator = para.copy()

    # X is a tuple: split in two arrays
    feature_values = np.asarray([x[0] for x in X])
    feature_labels = np.asarray([x[1] for x in X])

    # ------------------------------------------------------------------------
    # Groupwise feature selection
    if 'SelectGroups' in para_estimator:
        if verbose:
            print("Selecting groups of features.")
        del para_estimator['SelectGroups']
        # TODO: more elegant way to solve this
        feature_groups = [
            "histogram_features", "orientation_features", "patient_features",
            "semantic_features", "shape_features", "texture_features",
            "coliage_features", 'vessel_features', "phase_features",
            "log_features"
        ]
        parameters_featsel = dict()
        for group in feature_groups:
            if group not in para_estimator:
                # Default: do use the group
                value = True
            else:
                value = para_estimator[group]
                del para_estimator[group]

            parameters_featsel[group] = value

        GroupSel = SelectGroups(parameters=parameters_featsel)
        GroupSel.fit(feature_labels[0])
        if verbose:
            print("Original Length: " + str(len(feature_values[0])))
        feature_values = GroupSel.transform(feature_values)
        if verbose:
            print("New Length: " + str(len(feature_values[0])))
        feature_labels = GroupSel.transform(feature_labels)
    else:
        GroupSel = None

    # Check whether there are any features left
    if len(feature_values[0]) == 0:
        # TODO: Make a specific PREDICT exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably all feature groups were set to False. Parameters:'
            )
            print(para)

        # Return a zero performance dummy
        VarSel = None
        scaler = None
        SelectModel = None
        pca = None
        StatisticalSel = None
        imputer = None

        # Delete the non-used fields
        para_estimator = delete_nonestimator_parameters(para_estimator)

        ret = [0, 0, 0, 0, 0, para_estimator, para]
        return ret, GroupSel, VarSel, SelectModel, feature_labels[
            0], scaler, imputer, pca, StatisticalSel

    # ------------------------------------------------------------------------
    # Feature imputation
    if 'Imputation' in para_estimator.keys() and para_estimator['Imputation'] == 'True':
        imp_type = para_estimator['ImputationMethod']
        imp_nn = para_estimator['ImputationNeighbours']

        imputer = Imputer(missing_values='NaN',
                          strategy=imp_type,
                          n_neighbors=imp_nn,
                          axis=0)
        imputer.fit(feature_values)
        feature_values = imputer.transform(feature_values)
    else:
        imputer = None

    if 'Imputation' in para_estimator.keys():
        del para_estimator['Imputation']
        del para_estimator['ImputationMethod']
        del para_estimator['ImputationNeighbours']

    # ------------------------------------------------------------------------
    # FIXME: When only using LBP feature, X is 3 dimensional with 3rd dimension length 1
    if len(feature_values.shape) == 3:
        feature_values = np.reshape(
            feature_values, (feature_values.shape[0], feature_values.shape[1]))
    if len(feature_labels.shape) == 3:
        feature_labels = np.reshape(
            feature_labels, (feature_labels.shape[0], feature_labels.shape[1]))

    # Remove any NaN feature values if these are still left after imputation
    feature_values = replacenan(feature_values,
                                verbose=verbose,
                                feature_labels=feature_labels[0])

    # --------------------------------------------------------------------
    # Feature selection based on variance
    if para_estimator['Featsel_Variance'] == 'True':
        if verbose:
            print("Selecting features based on variance.")
        if verbose:
            print("Original Length: " + str(len(feature_values[0])))
        try:
            feature_values, feature_labels, VarSel =\
                selfeat_variance(feature_values, feature_labels)
        except ValueError:
            if verbose:
                print(
                    '[WARNING]: No features meet the selected Variance threshold! Skipping selection.'
                )
            VarSel = None
        if verbose:
            print("New Length: " + str(len(feature_values[0])))
    else:
        VarSel = None
    del para_estimator['Featsel_Variance']

    # Check whether there are any features left
    if len(feature_values[0]) == 0:
        # TODO: Make a specific PREDICT exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably you selected a feature group that is not in your feature file. Parameters:'
            )
            print(para)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        # Return a zero performance dummy
        scaler = None
        SelectModel = None
        pca = None
        StatisticalSel = None
        ret = [0, 0, 0, 0, 0, para_estimator, para]
        return ret, GroupSel, VarSel, SelectModel, feature_labels[
            0], scaler, imputer, pca, StatisticalSel

    # --------------------------------------------------------------------
    # Feature selection based on a statistical test
    if 'StatisticalTestUse' in para_estimator.keys():
        if para_estimator['StatisticalTestUse'] == 'True':
            metric = para_estimator['StatisticalTestMetric']
            threshold = para_estimator['StatisticalTestThreshold']
            if verbose:
                print(
                    "Selecting features based on statistical test. Method {}, threshold {}.".format(
                        metric, round(threshold, 2)))
            if verbose:
                print("Original Length: " + str(len(feature_values[0])))

            StatisticalSel = StatisticalTestThreshold(metric=metric,
                                                      threshold=threshold)

            StatisticalSel.fit(feature_values, y)
            feature_values = StatisticalSel.transform(feature_values)
            feature_labels = StatisticalSel.transform(feature_labels)
            if verbose:
                print("New Length: " + str(len(feature_values[0])))
        else:
            StatisticalSel = None
        del para_estimator['StatisticalTestUse']
        del para_estimator['StatisticalTestMetric']
        del para_estimator['StatisticalTestThreshold']
    else:
        StatisticalSel = None

    # Check whether there are any features left
    if len(feature_values[0]) == 0:
        # TODO: Make a specific PREDICT exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably you selected a feature group that is not in your feature file. Parameters:'
            )
            print(para)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        # Return a zero performance dummy
        scaler = None
        SelectModel = None
        pca = None
        ret = [0, 0, 0, 0, 0, para_estimator, para]
        return ret, GroupSel, VarSel, SelectModel, feature_labels[
            0], scaler, imputer, pca, StatisticalSel

    # ------------------------------------------------------------------------
    # Feature scaling
    if 'FeatureScaling' in para_estimator:
        if verbose:
            print("Fitting scaler and transforming features.")

        if para_estimator['FeatureScaling'] == 'z_score':
            scaler = StandardScaler().fit(feature_values)
        elif para_estimator['FeatureScaling'] == 'minmax':
            scaler = MinMaxScaler().fit(feature_values)
        else:
            scaler = None

        if scaler is not None:
            feature_values = scaler.transform(feature_values)
        del para_estimator['FeatureScaling']
    else:
        scaler = None

    # ------------------------------------------------------------------------
    # Perform feature selection using a model
    if 'SelectFromModel' in para_estimator.keys(
    ) and para_estimator['SelectFromModel'] == 'True':
        if verbose:
            print("Selecting features using lasso model.")
        # Use lasso model for feature selection

        # First, draw a random value for alpha and the penalty ratio
        alpha = scipy.stats.uniform(loc=0.0, scale=1.5).rvs()
        # l1_ratio = scipy.stats.uniform(loc=0.5, scale=0.4).rvs()

        # Create and fit lasso model
        lassomodel = Lasso(alpha=alpha)
        lassomodel.fit(feature_values, y)

        # Use fit to select optimal features
        SelectModel = SelectFromModel(lassomodel, prefit=True)
        if verbose:
            print("Original Length: " + str(len(feature_values[0])))
        feature_values = SelectModel.transform(feature_values)
        if verbose:
            print("New Length: " + str(len(feature_values[0])))
        feature_labels = SelectModel.transform(feature_labels)
    else:
        SelectModel = None
    if 'SelectFromModel' in para_estimator.keys():
        del para_estimator['SelectFromModel']

    # ----------------------------------------------------------------
    # PCA dimensionality reduction
    # Principal Component Analysis
    if 'UsePCA' in para_estimator.keys() and para_estimator['UsePCA'] == 'True':
        print('Fitting PCA')
        if para_estimator['PCAType'] == '95variance':
            # Select first X components that describe 95 percent of the explained variance
            pca = PCA(n_components=None)
            pca.fit(feature_values)
            evariance = pca.explained_variance_ratio_
            num = 0
            total = 0
            while total < 0.95:
                total += evariance[num]
                num += 1

            # Make a PCA based on the determined amount of components
            pca = PCA(n_components=num)
            pca.fit(feature_values)
            feature_values = pca.transform(feature_values)
            feature_labels = pca.transform(feature_labels)

        else:
            # Assume a fixed number of components
            n_components = int(para_estimator['PCAType'])
            pca = PCA(n_components=n_components)
            pca.fit(feature_values)
            feature_values = pca.transform(feature_values)
            feature_labels = pca.transform(feature_labels)
    else:
        pca = None

    if 'UsePCA' in para_estimator.keys():
        del para_estimator['UsePCA']
        del para_estimator['PCAType']

    # ----------------------------------------------------------------
    # Fitting and scoring
    # Only when using fastr this is an entry
    if 'Number' in para_estimator.keys():
        del para_estimator['Number']

    # For certainty, we delete all parameters again
    para_estimator = delete_nonestimator_parameters(para_estimator)

    ret = _fit_and_score(estimator, feature_values, y, scorer, train, test,
                         verbose, para_estimator, fit_params,
                         return_train_score, return_parameters,
                         return_n_test_samples, return_times, error_score)

    # Paste original parameters in performance
    ret.append(para)

    return ret, GroupSel, VarSel, SelectModel, feature_labels[
        0], scaler, imputer, pca, StatisticalSel
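The preprocessing chain in this example (group selection, imputation, variance filtering, statistical testing, scaling, Lasso-based SelectFromModel, PCA) is fitted step by step so that each fitted object can be returned separately. For orientation only, here is a hedged sketch of the scikit-learn parts of that ordering expressed as a plain Pipeline; the parameter values are placeholders, not the settings drawn from para above:

# Sketch only: the scikit-learn preprocessing order above as a Pipeline.
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([
    ("variance", VarianceThreshold(threshold=0.0)),       # Featsel_Variance
    ("scale", StandardScaler()),                           # FeatureScaling == 'z_score'
    ("lasso_select", SelectFromModel(Lasso(alpha=0.1))),   # SelectFromModel == 'True'
    ("pca", PCA(n_components=0.95)),                       # UsePCA, 95% explained variance
    ("clf", SVC()),                                        # placeholder estimator
])
# Usage (with hypothetical X_train / y_train arrays):
# pipe.fit(X_train, y_train); pipe.score(X_test, y_test)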
Example #33
0
def _fit_and_score_ckpt(workdir=None,
                        checkpoint=True,
                        force_refresh=False,
                        **fit_and_score_kwargs):
    """Fit estimator and compute scores for a given dataset split.

    This function wraps
    :func:`sklearn:sklearn.model_selection._validation._fit_and_score`,
    while also saving checkpoint files containing the estimator, parameters,
    and scores. This is useful if fitting and scoring is costly or if it is
    being performed within a large cross-validation experiment.

    To avoid collisions with scores computed for other CV splits, this
    function computes a hash from a nested dictionary containing all keyword
    arguments as well as estimator parameters. It then saves the scores and
    parameters in <hash>_params.h5 and the estimator itself in
    <hash>_estimator.pkl.

    Parameters
    ----------
    workdir : path-like object, default=None
        A string or :term:`python:path-like-object` indicating the directory
        in which to store checkpoint files

    checkpoint : bool, default=True
        If True, checkpoint the parameters, estimators, and scores.

    force_refresh : bool, default=False
        If True, recompute scores even if the checkpoint file already exists.
        Otherwise, load scores from checkpoint files and return.

    **fit_and_score_kwargs : kwargs
        Key-word arguments passed to
        :func:`sklearn:sklearn.model_selection._validation._fit_and_score`

    Returns
    -------
    train_scores : dict of scorer name -> float
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None
        The parameters that have been evaluated.

    estimator : estimator object
        The fitted estimator
    """
    if not checkpoint:
        return _fit_and_score(**fit_and_score_kwargs)

    if workdir is None:
        raise ValueError(
            "If checkpoint is True, you must supply a working directory "
            "through the ``workdir`` argument.")

    estimator = fit_and_score_kwargs.pop("estimator", None)
    estimator_params = _serialize_estimator_params(estimator.get_params())
    all_params = {
        "estimator_params": estimator_params,
        "fit_and_score_kwargs": fit_and_score_kwargs,
    }

    cv_hash = hashlib.md5(
        json.dumps(all_params, sort_keys=True, ensure_ascii=True,
                   default=str).encode()).hexdigest()

    h5_file = os.path.join(workdir, cv_hash + "_params.h5")
    pkl_file = os.path.join(workdir, cv_hash + "_estimator.pkl")

    if not force_refresh and os.path.exists(h5_file):
        ckpt_dict = ddio.load(h5_file)

        scores = ckpt_dict["scores"]

        if fit_and_score_kwargs.get("return_estimator", False):
            with open(pkl_file, "rb") as fp:
                estimator = pickle.load(fp)

            scores.append(estimator)

        return scores
    else:
        scores = _fit_and_score(estimator, **fit_and_score_kwargs)
        os.makedirs(workdir, exist_ok=True)
        if fit_and_score_kwargs.get("return_estimator", False):
            estimator = scores[-1]
            with open(pkl_file, "wb") as fp:
                pickle.dump(estimator, fp)

            ckpt_scores = scores[:-1]
            if isinstance(estimator, Pipeline):
                # steps[-1] is a (name, estimator) tuple; take the estimator
                model = estimator.steps[-1][1]
            else:
                model = estimator

            estimator_params = _serialize_estimator_params(
                estimator.get_params())
            fitted_params = {
                "alpha_": getattr(model, "alpha_", None),
                "alphas_": getattr(model, "alphas_", None),
                "l1_ratio_": getattr(model, "l1_ratio_", None),
                "mse_path_": getattr(model, "mse_path_", None),
                "scoring_path_": getattr(model, "scoring_path_", None),
                "intercept_": getattr(model, "intercept_", None),
                "coef_": getattr(model, "coef_", None),
            }
        else:
            estimator_params = None
            fitted_params = None
            ckpt_scores = scores

        fit_and_score_kwargs.pop("X")
        fit_and_score_kwargs.pop("y")

        if "scorer" in fit_and_score_kwargs:
            fit_and_score_kwargs["scorer"] = list(
                fit_and_score_kwargs["scorer"].keys())

        ckpt_dict = {
            "scores": ckpt_scores,
            "fit_and_score_kwargs": fit_and_score_kwargs,
            "estimator_params": estimator_params,
            "fitted_params": fitted_params,
        }

        ddio.save(h5_file, ckpt_dict)
        return scores
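The checkpoint key in this example is just an MD5 digest of a nested dictionary holding the estimator parameters and the fit/score keyword arguments. A hedged, stand-alone sketch of that hashing step, with made-up argument values:

# Sketch only: derive a stable checkpoint key from estimator params and
# fit/score kwargs; the estimator and kwargs below are illustrative.
import hashlib
import json

from sklearn.linear_model import LassoCV

estimator = LassoCV(cv=3)
fit_and_score_kwargs = {"train": [0, 1, 2], "test": [3, 4], "scorer": "r2"}

all_params = {
    "estimator_params": estimator.get_params(),
    "fit_and_score_kwargs": fit_and_score_kwargs,
}

# sort_keys and default=str keep the digest stable, even for values that
# are not natively JSON serializable (e.g. numpy arrays).
cv_hash = hashlib.md5(
    json.dumps(all_params, sort_keys=True, ensure_ascii=True,
               default=str).encode()).hexdigest()

h5_file = cv_hash + "_params.h5"
pkl_file = cv_hash + "_estimator.pkl"
print(h5_file, pkl_file)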
Example #34
0
def fit_and_score(estimator,
                  X,
                  y,
                  scorer,
                  train,
                  test,
                  para,
                  fit_params=None,
                  return_train_score=True,
                  return_n_test_samples=True,
                  return_times=True,
                  return_parameters=True,
                  error_score='raise',
                  verbose=True):
    '''
    Fit an estimator to a dataset and score the performance. The following
    methods can currently be applied as preprocessing before fitting in
    this order:
    1. Apply feature selection based on type group.
    2. Apply feature selection based on variance of feature among patients.
    3. Scale features with e.g. z-scoring.

    Parameters
    ----------
    estimator: sklearn estimator, mandatory
            Unfitted estimator which will be fit.

    X: array, mandatory
            Array containing the feature values (columns) for each object (rows).

    y: list(?), mandatory
            List containing the labels of the objects.

    scorer: sklearn scorer, mandatory
            Function used as optimization criterion for the hyperparameter optimization.

    train: list, mandatory
            Indices of the objects to be used as training set.

    test: list, mandatory
            Indices of the objects to be used as testing set.

    para: dictionary, mandatory
            Contains the settings used for the above preprocessing functions
            and the fitting. TODO: Create a default object and show the
            fields.

    fit_params: dictionary, default None
            Parameters supplied to the estimator for fitting. See the SKlearn
            site for the parameters of the estimators.

    return_train_score: boolean, default True
            Save the training score to the final SearchCV object.

    return_n_test_samples: boolean, default True
            Save the number of times each sample was used in the test set
            to the final SearchCV object.

    return_times: boolean, default True
            Save the time spent on each fit to the final SearchCV object.

    return_parameters: boolean, default True
            Return the parameters used in the final fit to the final SearchCV
            object.

    error_score: numeric or "raise" by default
            Value to assign to the score if an error occurs in estimator
            fitting. If set to "raise", the error is raised. If a numeric
            value is given, FitFailedWarning is raised. This parameter
            does not affect the refit step, which will always raise the error.

    verbose: boolean, default=True
            If True, print intermediate progress to command line. Warnings are
            always printed.

    '''

    pca = None

    # We copy the parameter object so we can alter it and keep the original
    para_estimator = para.copy()
    for i in range(len(X)):
        if len(X[i][0]) != len(X[i][1]):
            raise IOError(
                'Length of feature values ({}) does not match length of feature labels ({})\n \
                           Check CalcFeatures'.format(len(X[i][0]),
                                                      len(X[i][1])))

    # X is a tuple: split in two arrays
    feature_values = np.asarray([x[0] for x in X])
    feature_labels = np.asarray([x[1] for x in X])

    # Perform feature selection if required
    if 'SelectGroups' in para_estimator:
        if verbose:
            print("Selecting groups of features.")
        del para_estimator['SelectGroups']
        # TODO: more elegant way to solve this
        feature_groups = [
            "histogram_features", "orientation_features", "patient_features",
            "semantic_features", "shape_features", "texture_features",
            "coliage_features", 'vessel_features', "phase_features",
            "log_features"
        ]
        parameters_featsel = dict()
        for group in feature_groups:
            if group not in para_estimator:
                # Default: do use the group
                value = True
            else:
                value = para_estimator[group]
                del para_estimator[group]

            parameters_featsel[group] = value
        GroupSel = SelectGroups(parameters=parameters_featsel)
        GroupSel.fit(feature_labels[0])
        if verbose:
            print("Original Length: " + str(len(feature_values[0])))
        feature_values = GroupSel.transform(feature_values)
        if verbose:
            print("New Length: " + str(len(feature_values[0])))
        feature_labels = GroupSel.transform(feature_labels)
    else:
        GroupSel = None

    if len(feature_values[0]) == 0:
        # TODO: Make a specific PREDICT exception for this warning.
        print(
            '[WARNING]: No features are selected! Probably all feature groups were set to False. Parameters:'
        )
        print(para)

        # Return a zero performance dummy
        VarSel = None
        scaler = None
        SelectModel = None

        # Delete the non-used fields
        if 'Featsel_Variance' in para_estimator.keys():
            del para_estimator['Featsel_Variance']
        if 'FeatureScaling' in para_estimator.keys():
            del para_estimator['FeatureScaling']

        ret = [0, 0, 0, 0, 0, para_estimator, para]
    else:
        # FIXME: When only using LBP feature, X is 3 dimensional with 3rd dimension length 1
        if len(feature_values.shape) == 3:
            feature_values = np.reshape(
                feature_values,
                (feature_values.shape[0], feature_values.shape[1]))
        if len(feature_labels.shape) == 3:
            feature_labels = np.reshape(
                feature_labels,
                (feature_labels.shape[0], feature_labels.shape[1]))

        if para_estimator['Featsel_Variance'] == 'True':
            if verbose:
                print("Selecting features based on variance.")
            if verbose:
                print("Original Length: " + str(len(feature_values[0])))
            try:
                feature_values, feature_labels, VarSel =\
                    selfeat_variance(feature_values, feature_labels)
            except ValueError:
                print(
                    '[WARNING]: No features meet the selected Variance threshold! Skipping selection.'
                )
                VarSel = None
            if verbose:
                print("New Length: " + str(len(feature_values[0])))
        else:
            VarSel = None
        del para_estimator['Featsel_Variance']

        # Fit and score the classifier
        if len(feature_values[0]) == 0:
            # TODO: Make a specific PREDICT exception for this warning.
            print(
                '[WARNING]: No features are selected! Probably you selected a feature group that is not in your feature file. Parameters:'
            )
            print(para)

            # Return a zero performance dummy
            scaler = None
            SelectModel = None

            ret = [0, 0, 0, 0, 0, para_estimator, para]
            del para_estimator['FeatureScaling']
        else:
            if 'FeatureScaling' in para_estimator:
                if verbose:
                    print("Fitting scaler and transforming features.")
                if para_estimator['FeatureScaling'] == 'z_score':
                    scaler = StandardScaler().fit(feature_values)
                elif para_estimator['FeatureScaling'] == 'minmax':
                    scaler = MinMaxScaler().fit(feature_values)
                else:
                    scaler = None

                if scaler is not None:
                    feature_values = scaler.transform(feature_values)
            else:
                scaler = None
            del para_estimator['FeatureScaling']

            # Only when using fastr this is an entry
            if 'Number' in para_estimator.keys():
                del para_estimator['Number']

            # Perform feature selection using a model
            # NOTE: the flag is hard-coded to False here, so the Lasso-based
            # selection below is effectively disabled in this example.
            para_estimator['SelectFromModel'] = False
            if para_estimator['SelectFromModel']:
                if verbose:
                    print("Selecting features using lasso model.")
                # Use lasso model for feature selection

                # First, draw a random value for alpha and the penalty ratio
                alpha = scipy.stats.uniform(loc=0.5, scale=1.5).rvs()
                # l1_ratio = scipy.stats.uniform(loc=0.5, scale=0.4).rvs()

                # Create and fit lasso model
                lassomodel = Lasso(alpha=alpha)
                lassomodel.fit(feature_values, y)

                # Use fit to select optimal features
                SelectModel = SelectFromModel(lassomodel, prefit=True)
                if verbose:
                    print("Original Length: " + str(len(feature_values[0])))
                feature_values = SelectModel.transform(feature_values)
                if verbose:
                    print("New Length: " + str(len(feature_values[0])))
                feature_labels = SelectModel.transform(feature_labels)
            else:
                SelectModel = None
            del para_estimator['SelectFromModel']

            # Principal Component Analysis
            do_PCA = False
            n_components = 39
            if len(feature_values[0]) < n_components:
                n_components = len(feature_values[0])
            if do_PCA:
                pca = PCA(n_components=n_components)
                pca.fit(feature_values)
                feature_values = pca.transform(feature_values)
                print('Fitting PCA')

            ret = _fit_and_score(estimator, feature_values, y, scorer, train,
                                 test, verbose, para_estimator, fit_params,
                                 return_train_score, return_parameters,
                                 return_n_test_samples, return_times,
                                 error_score)

            # Paste original parameters in performance
            ret.append(para)

    return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler
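The SelectFromModel branch above (disabled here by the hard-coded flag) draws a random Lasso alpha and keeps only the features with non-zero coefficients. A hedged sketch of that idea on synthetic regression data; every name and value below is illustrative:

# Sketch only: Lasso-based feature selection with a randomly drawn alpha.
import scipy.stats
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

X, y = make_regression(n_samples=100, n_features=30, n_informative=5,
                       random_state=0)

# Draw alpha uniformly from [0.5, 2.0), as in the snippet above.
alpha = scipy.stats.uniform(loc=0.5, scale=1.5).rvs(random_state=0)

lassomodel = Lasso(alpha=alpha)
lassomodel.fit(X, y)

selector = SelectFromModel(lassomodel, prefit=True)
X_selected = selector.transform(X)
print(X.shape[1], "->", X_selected.shape[1], "features kept")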
Example #35
0
def grid_search_cv(
    clf,
    train_indices,
    n_folds,
    param_grid,
    kernel_matrices,
):
    """
    Internal grid search routine for a set of kernel matrices. The
    routine uses a pre-defined set of train indices for the grid
    search. Other indices will *not* be considered, so information
    leakage is prevented.


    :param clf: Classifier to fit
    :param train_indices: Indices permitted to be used for cross-validation
    :param n_folds: Number of folds for the cross-validation
    :param param_grid: Parameters for the grid search
    :param kernel_matrices: Kernel matrices to check; each one of them
    is assumed to represent a different choice of parameter. They will
    *all* be checked iteratively by the routine.

    :return: Best classifier, i.e. the classifier with the best
    parameters. Needs to be refit prior to predicting labels on
    the test data set. Moreover, the best-performing matrix, in
    terms of the grid search, is returned. It has to be used in
    all subsequent prediction tasks. Additionally, the function
    also returns a dictionary of the best parameters.
    """

    y = kernel_matrices["y"][train_indices]

    best_clf = None
    best_accuracy = 0.0
    best_parameters = {}

    # Iterate over parameters in the outermost loop to avoid issues
    # with matrix normalization: when we loop over the matrices in
    # the inner loop, parameters['normalize'] is fixed to either
    # True or False.
    for parameters in list(param_grid):

        for K_param, K in kernel_matrices.items():

            # Skip labels; we could also remove them from the set of
            # matrices but this would make the function inconsistent
            # because it should *not* fiddle with the input data set
            # if it can be avoided.
            if K_param == "y":
                continue

            # This ensures that we *cannot* access the test indices,
            # even if we try :)
            K = K[train_indices, :][:, train_indices]

            # normalize kernel matrix if parameters['normalize'] == True
            if parameters["normalize"]:
                K = normalize(K)

            # Remove the parameter because it does not pertain to
            # the classifier below.
            clf_parameters = {
                key: value
                for key, value in parameters.items()
                if key not in ["normalize"]
            }

            # we have to create a new cv instance for each parameter
            # tuple, because StratifiedKFold returns a generator
            cv = StratifiedKFold(
                n_splits=n_folds,
                shuffle=True,
                random_state=42,  # TODO: make configurable
            )

            # initialize empty list to store fold accuracies of the
            # current parameters in.
            accuracy_list = []

            # From this point on, `train_index` and `test_index` are supposed to
            # be understood *relative* to the input training indices.
            for train_index, test_index in cv.split(train_indices, y):

                accuracy, params = _fit_and_score(
                    clone(clf),
                    K,
                    y,
                    scorer=make_scorer(accuracy_score),
                    train=train_index,
                    test=test_index,
                    verbose=0,
                    parameters=clf_parameters,
                    fit_params=None,  # No additional parameters for `fit()`
                    return_parameters=True,
                )
                accuracy_list.append(accuracy)

            # compute accuracy mean of current parameters to compare to
            # previously best result.
            accuracy_mean = np.mean(accuracy_list)

            # Note that when storing the best parameters, we re-use the
            # original parameter dict because we also want to record the
            # normalization setting.
            if accuracy_mean > best_accuracy:
                best_clf = clone(clf).set_params(**params)
                best_accuracy = accuracy_mean

                # Make a copy of the dictionary to ensure that we are
                # not updating it with parameters that cannot be used
                # in the grid search (such as `K`).
                best_parameters = dict(parameters)

                # Update kernel matrix parameter to indicate which
                # matrix was used to obtain these results. The key
                # will also be returned later on.
                best_parameters["K"] = K_param

    # Retrieve the kernel matrix of the best performing
    # model and normalize if `best_parameters['normalize']`
    # is True
    best_K = kernel_matrices[best_parameters["K"]]
    if best_parameters["normalize"]:
        best_K = normalize(best_K)

    return best_clf, best_K, best_parameters
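The leakage protection in this last example comes from slicing every kernel matrix down to the training indices before any fold is evaluated, and from fitting on a precomputed kernel. A hedged, stand-alone sketch of that slicing pattern, using an RBF kernel on synthetic data as a stand-in for one of the kernel matrices:

# Sketch only: fold-wise scoring of an SVM on a precomputed kernel matrix
# restricted to the training indices; all data below is synthetic.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

X, y_full = make_classification(n_samples=120, n_features=10, random_state=0)
K_full = rbf_kernel(X)                 # stand-in for one kernel matrix
train_indices = np.arange(100)         # indices permitted for model selection

K = K_full[train_indices, :][:, train_indices]
y = y_full[train_indices]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_index, test_index in cv.split(train_indices, y):
    clf = SVC(kernel="precomputed", C=1.0)
    # Rows are the fold's samples; columns are the samples the kernel is
    # evaluated against (the training part of the fold).
    clf.fit(K[train_index][:, train_index], y[train_index])
    fold_scores.append(clf.score(K[test_index][:, train_index], y[test_index]))

print(np.mean(fold_scores))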