Example #1
0
def verbose_wait(amr, clientview, return_train_scores):
    """Wait for every task of ``amr`` to finish while reporting progress.

    The view is polled for the message ids in ``amr.msg_ids``.  For each
    message that finished since the previous poll, one line is printed per
    entry of its result (engine id, parameters, score, elapsed time).
    After each poll a single status line is rewritten in place with the
    number of completed tasks.  Once nothing is pending, a short summary
    (wall time, summed engine time, speedup, engine count) is printed.

    Parameters
    ----------
    amr : object
        Must provide ``len()``, ``msg_ids``, ``elapsed`` and ``_metadata``.
        NOTE(review): presumably an IPython.parallel ``AsyncMapResult``;
        confirm against the caller.

    clientview : object
        Must provide ``wait``, ``outstanding`` and ``get_result``.

    return_train_scores : bool
        When true the test score is read from ``result[1]``, otherwise
        from ``result[0]``.

    Raises
    ------
    RemoteError
        Re-raised, after printing its traceback, when reading a finished
        result fails.
    """
    n_tasks = len(amr)
    waiting = set(amr.msg_ids)
    while waiting:
        try:
            clientview.wait(waiting, 1e-3)
        except parallel.TimeoutError:
            # The tiny timeout turns ``wait`` into a poll; timing out just
            # means some tasks are still running.
            pass

        n_completed = n_tasks - len(clientview.outstanding)
        done = waiting.difference(clientview.outstanding)
        waiting = waiting.difference(done)

        if done:
            # Move off the in-place status line before printing results.
            print()

        for msg_id in done:
            ar = clientview.get_result(msg_id)
            try:
                for result in ar.result:
                    elapsed = result[-2]
                    params = result[-1]
                    if return_train_scores:
                        test_score = result[1]
                    else:
                        test_score = result[0]
                    engine_id = ar.engine_id
                    param_text = ', '.join(
                        '{}={}'.format(k, v) for k, v in params.items())
                    left = '[CV engine={}] {}   '.format(engine_id,
                                                         param_text)
                    right = '  score = {:5f}  {}'.format(
                        test_score, short_format_time(elapsed))
                    print(left + right.rjust(70-len(left), '-'))
            except RemoteError as e:
                e.print_traceback()
                raise

        # Status line: always refreshed, whether or not anything finished.
        status = '\r[Parallel] {0:d}/{1:d}  tasks finished'.format(
            n_completed, n_tasks)
        timing = 'elapsed {0}         '.format(
            short_format_time(amr.elapsed))
        print(status + timing.rjust(71-len(status)), end='')
        sys.stdout.flush()
        # Aim to wake up close to the next whole second of elapsed time.
        time.sleep(1 + round(amr.elapsed) - amr.elapsed)

    engine_ids = {meta['engine_id'] for meta in amr._metadata}
    n_engines = len(engine_ids)
    busy = datetime.timedelta()
    for meta in amr._metadata:
        busy = busy + (meta.completed - meta.submitted)
    engine_time = busy.total_seconds()

    m1 = 'Elapsed walltime:    {}'.format(short_format_time(amr.elapsed))
    m2 = 'Elapsed engine time: {}'.format(short_format_time(engine_time))
    width = len(m2)
    speedup_label = 'Parallel speedup:'
    speedup_value = '{:.3f}'.format(engine_time/ amr.elapsed).rjust(
        width - len(speedup_label))
    engines_label = 'Number of engines:'
    engines_value = '{}'.format(n_engines).rjust(width - len(engines_label))
    rule = '-' * width

    summary = (
        '\n\nTasks completed',
        rule,
        m1,
        m2,
        speedup_label + speedup_value,
        engines_label + engines_value,
        rule,
    )
    for line in summary:
        print(line)
Example #2
0
def fit_grid_point(base_estimator, parameters,
                   X, y, sample_weight,
                   train, test, verbose,
                   **fit_params):
    """Fit a clone of ``base_estimator`` on the training split.

    The clone receives ``parameters`` through ``set_params`` and is fitted
    on the samples selected by ``train``.  Nothing is scored here; ``test``
    is only passed through to the caller.

    Parameters
    ----------
    base_estimator : estimator object
        Template estimator; it is cloned, never fitted itself.

    parameters : dict
        Parameters applied to the clone with ``set_params``.

    X, y, sample_weight :
        Full data set, split by ``_safe_split``.  ``sample_weight`` may be
        None, in which case no weights are forwarded to ``fit``.

    train, test :
        Indices of the training and test samples.

    verbose : int
        Progress lines are printed when greater than 1.

    **fit_params :
        Extra keyword arguments forwarded to ``estimator.fit``.

    Returns
    -------
    estimator, parameters, train, test : tuple
        The fitted clone followed by the unchanged inputs.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        # Parenthesised single-argument print behaves identically as a
        # Python 2 statement and as the Python 3 function.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    # Only the training split is needed for fitting; the test split that
    # used to be computed here was never read.
    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)

    if sample_weight is not None:
        fit_params = dict(fit_params, sample_weight=sample_weight_train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return estimator, parameters, train, test
Example #3
0
def fit_grid_point(base_estimator, parameters, X, y, sample_weight, train,
                   test, verbose, **fit_params):
    """Clone ``base_estimator``, configure it and fit it on ``train``.

    ``parameters`` are applied to the clone via ``set_params``; the clone
    is then fitted on the training subset returned by ``_safe_split``.
    No scoring happens here and ``test`` is returned untouched.

    Parameters
    ----------
    base_estimator : estimator object
        Template that is cloned; the original is not fitted.

    parameters : dict
        Settings passed to ``set_params`` on the clone.

    X, y, sample_weight :
        Complete data, subset by ``_safe_split``.  When ``sample_weight``
        is None no weights are handed to ``fit``.

    train, test :
        Training / test sample indices.

    verbose : int
        Values above 1 enable progress output.

    **fit_params :
        Additional keyword arguments for ``estimator.fit``.

    Returns
    -------
    tuple
        ``(estimator, parameters, train, test)`` where ``estimator`` is
        the fitted clone.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        # print(...) with one parenthesised argument works the same on
        # Python 2 and Python 3.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    # The test subset is not needed to fit, so it is no longer extracted.
    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)

    if sample_weight is not None:
        fit_params = dict(fit_params, sample_weight=sample_weight_train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return estimator, parameters, train, test
Example #4
0
def _fit_and_score(estimator, depthmaps, offset_points_projected,
                   direction_vectors, true_joints, scorer, train, test,
                   verbose, parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on one train/test split and score it.

    The four data collections are split with ``_safe_split``; the
    estimator is fitted on the training depth maps, offsets and
    directions, and scored against the true joints.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (when given) and fitted in place.

    depthmaps, offset_points_projected, direction_vectors, true_joints :
        Data collections handed to ``_safe_split``.

    scorer : callable
        Forwarded to ``_score``.

    train, test :
        Indices selecting the training and test samples.

    verbose : int
        Above 1, start/end lines are printed; above 2, the end line also
        contains the test score.

    parameters : dict or None
        Parameters for ``estimator.set_params``.

    fit_params : dict or None
        Keyword arguments for ``estimator.fit``.

    return_train_score : bool
        Also compute and return the training score.

    return_parameters : bool
        Append ``parameters`` to the result.

    error_score : 'raise' or number
        ``'raise'`` re-raises a failure of ``fit``; a number is used as
        the score instead and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_score (optional), test_score, n_test_samples,
        scoring_time, parameters (optional)]``.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % item for item in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Route every fit parameter through _index_param_value with the
    # training indices (original note: adjust length of sample weights).
    if fit_params is None:
        fit_params = {}
    fit_params = {key: _index_param_value(depthmaps, value, train)
                  for key, value in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    data = (depthmaps, offset_points_projected, direction_vectors,
            true_joints)
    depth_train, offsets_train, directions_train, truths_train = \
        _safe_split(*(data + (train,)))
    depth_test, offsets_test, directions_test, truths_test = \
        _safe_split(*(data + (test,)))

    try:
        estimator.fit(depth_train, offsets_train, directions_train,
                      **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # Substitute the placeholder score and tell the user why.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, depth_test, truths_test, scorer)
        if return_train_score:
            train_score = _score(estimator, depth_train, truths_train,
                                 scorer)

    scoring_time = time.time() - start_time

    if verbose > 1:
        if verbose > 2:
            msg += ", score=%f" % test_score
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(depth_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (when given) and fitted in place.

    Z : indexable
        Data set; indexed directly with ``train`` and ``test``.

    scorer : callable
        Forwarded to ``_score``.

    train, test :
        Index objects used to subset ``Z``.

    verbose : int
        Above 1, start/end lines are printed; above 2, the end line also
        contains the test score.

    parameters : dict or None
        Parameters for ``estimator.set_params``.

    fit_params : dict or None
        Keyword arguments for ``estimator.fit`` (passed through as is).

    return_train_score : bool
        Also compute and return the training score.

    return_parameters : bool
        Append ``parameters`` to the result.

    error_score : 'raise' or number
        ``'raise'`` re-raises a failure of ``fit``; a number is used as
        the score instead and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_score (optional), test_score, n_test_samples,
        scoring_time, parameters (optional)]``.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % item for item in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train, Z_test = Z[train], Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # Substitute the placeholder score and tell the user why.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 1:
        if verbose > 2:
            msg += ", score=%f" % test_score
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Example #6
0
def score_each_boost(estimator, parameters,
                     min_n_estimators,
                     X, y, sample_weight,
                     score_func, train, test,
                     verbose):
    """Score every boosting stage of ``estimator`` on the test split.

    ``estimator.staged_predict`` is evaluated on the test samples and each
    stage from ``min_n_estimators`` onwards is scored with ``score_func``.
    ``fit`` is not called here, so the estimator must already be fitted.
    If fewer stages are produced than ``estimator.n_estimators`` (boosting
    stopped early), the last score is repeated so that the result always
    has ``estimator.n_estimators - min_n_estimators + 1`` entries.

    Parameters
    ----------
    estimator : estimator object
        Must provide ``staged_predict`` and ``n_estimators``.

    parameters : dict
        Parameter setting being evaluated; copied into every result entry
        with ``n_estimators`` overwritten by the stage count.

    min_n_estimators : int
        Stages with fewer estimators are skipped.

    X, y, sample_weight :
        Full data set, split by ``_safe_split``.  When ``sample_weight``
        is not None the test weights are passed to ``score_func`` as
        ``sample_weight``.

    score_func : callable
        Called as ``score_func(y_test, y_pred, **kwargs)``.

    train, test :
        Training / test sample indices.

    verbose : int
        Progress lines are printed when greater than 1.

    Returns
    -------
    all_scores : list
        One score per stage.

    all_clf_params : list of dict
        Matching parameter dicts, each with its own ``n_estimators``.

    n_test_samples : list
        Number of test samples, repeated once per score.

    Raises
    ------
    IndexError
        If padding is required but no stage was scored at all.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        # Single-argument print(...) is valid on Python 2 and Python 3.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        n_stages = i + 1
        if n_stages < min_n_estimators:
            continue
        all_scores.append(score_func(y_test, y_pred, **test_score_params))
        clf_para = copy(parameters)
        clf_para['n_estimators'] = n_stages
        all_clf_params.append(clf_para)

    # boosting may have stopped early
    n_expected = estimator.n_estimators - min_n_estimators + 1
    n_scored = len(all_scores)
    if n_scored < n_expected:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Continue numbering from the last real stage.  Labelling padded
        # entries with ``list index + 1`` is only right when
        # min_n_estimators == 1; otherwise the labels would collide with
        # stages that were actually scored.
        last_n_estimators = last_clf_params['n_estimators']
        for step in range(1, n_expected - n_scored + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_estimators + step
            all_clf_params.append(clf_para)

    n_test_samples = [this_n_test_samples] * len(all_scores)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return all_scores, all_clf_params, n_test_samples
Example #7
0
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Compute one test score per boosting stage of ``estimator``.

    The staged predictions of ``estimator`` on the test samples are scored
    with ``score_func``, starting at stage ``min_n_estimators``.  The
    estimator is not fitted here and therefore has to be fitted already.
    When boosting produced fewer stages than ``estimator.n_estimators``,
    the final score is carried forward so that the result always contains
    ``estimator.n_estimators - min_n_estimators + 1`` entries.

    Parameters
    ----------
    estimator : estimator object
        Must provide ``staged_predict`` and ``n_estimators``.

    parameters : dict
        Parameter setting under evaluation; each result entry holds a copy
        with ``n_estimators`` replaced by the stage count.

    min_n_estimators : int
        Smallest stage count that is scored.

    X, y, sample_weight :
        Full data set, split by ``_safe_split``.  Test weights are passed
        to ``score_func`` as ``sample_weight`` unless ``sample_weight`` is
        None.

    score_func : callable
        Called as ``score_func(y_test, y_pred, **kwargs)``.

    train, test :
        Training / test sample indices.

    verbose : int
        Values above 1 enable progress output.

    Returns
    -------
    all_scores : list
        One score per stage.

    all_clf_params : list of dict
        Parameter dicts aligned with ``all_scores``.

    n_test_samples : list
        Test-sample count, repeated once per score.

    Raises
    ------
    IndexError
        If padding is required but no stage was scored at all.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        # print(...) with one argument runs on both Python 2 and 3.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(estimator, X, y,
                                                     sample_weight, test,
                                                     train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        n_stages = i + 1
        if n_stages < min_n_estimators:
            continue
        all_scores.append(score_func(y_test, y_pred, **test_score_params))
        clf_para = copy(parameters)
        clf_para['n_estimators'] = n_stages
        all_clf_params.append(clf_para)

    # boosting may have stopped early
    n_expected = estimator.n_estimators - min_n_estimators + 1
    n_scored = len(all_scores)
    if n_scored < n_expected:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Padded entries continue the stage numbering of the last scored
        # stage.  Using ``list index + 1`` as the label is off by
        # ``min_n_estimators - 1`` and duplicates real stages whenever
        # min_n_estimators > 1.
        last_n_estimators = last_clf_params['n_estimators']
        for step in range(1, n_expected - n_scored + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_estimators + step
            all_clf_params.append(clf_para)

    n_test_samples = [this_n_test_samples] * len(all_scores)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return all_scores, all_clf_params, n_test_samples
Example #8
0
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, return_n_support=True,
                            error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_support : boolean, optional, default: True
        Append ``estimator.n_support_`` to the result.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    n_support : object or None, optional
        Value of ``estimator.n_support_``, returned only if
        `return_n_support` is `True`. When fitting failed and a numeric
        `error_score` was substituted, this is ``None`` if the estimator
        has no ``n_support_`` attribute.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    if return_n_support:
        if fit_failed:
            # The fit did not complete, so ``n_support_`` may not exist.
            # Reading it unguarded would raise AttributeError and defeat
            # the numeric ``error_score`` fallback.
            ret.append(getattr(estimator, 'n_support_', None))
        else:
            ret.append(estimator.n_support_)
    return ret
Example #9
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise',
                   return_estimator=False, return_idx=False):
    """Fit an estimator on one split and evaluate it with one or more scorers.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        Estimator that is configured and fitted in place.

    X : array-like of shape at least 2D
        Data to fit.

    y : array-like, optional, default: None
        Target variable for supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        A callable yields a single float for ``train_scores`` and
        ``test_scores``; anything that is not callable is treated as a
        mapping from scorer name to scorer and yields dicts.

        Each scorer should have the signature ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        Verbosity level; above 1 start/end lines are printed, above 2 the
        end line also lists the test scores.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters passed on to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return the score(s) on the training set.

    return_parameters : boolean, optional, default: False
        Append ``parameters`` to the result.

    return_n_test_samples : boolean, optional, default: False
        Append the number of test samples to the result.

    return_times : boolean, optional, default: False
        Append the fit and score times to the result.

    error_score : 'raise' (default) or numeric
        Score substituted when fitting fails. ``'raise'`` re-raises the
        error; a number is used as every score and a FitFailedWarning is
        issued.

    return_estimator : boolean, optional, default: False
        Append the estimator to the result.

    return_idx : boolean, optional, default: False
        Append ``train`` and ``test`` to the result.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score(s) on the training set, only if `return_train_score` is
        `True`.

    test_scores : dict of scorer name -> float
        Score(s) on the test set.

    n_test_samples : int, optional
        Number of test samples.

    fit_time : float, optional
        Time spent fitting, in seconds.

    score_time : float, optional
        Time spent scoring the test set, in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    estimator : estimator object, optional
        The estimator that was fitted.

    train, test : optional
        The index arrays that were passed in.

    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = ', '.join('%s=%s' % item for item in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    if fit_params is None:
        fit_params = {}
    fit_params = {key: _index_param_value(X, value, train)
                  for key, value in fit_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    # do it for each patient
    X_train, y_train, X_test, y_test = _safe_split_multi(
        estimator, X, y, train, test)

    is_multimetric = not callable(scorer)
    if is_multimetric:
        n_scorers = len(scorer.keys())
    else:
        n_scorers = 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        if is_multimetric:
            # Every scorer gets the same placeholder value.
            names = scorer.keys()
            test_scores = dict(zip(names, [error_score] * n_scorers))
            if return_train_score:
                train_scores = dict(zip(names, [error_score] * n_scorers))
        else:
            test_scores = error_score
            if return_train_score:
                train_scores = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        # Measured before training scores are computed, so it covers the
        # test-set scoring only.
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 1:
        if verbose > 2:
            if is_multimetric:
                for scorer_name, score in test_scores.items():
                    msg += ", %s=%s" % (scorer_name, score)
            else:
                msg += ", score=%s" % test_scores
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    if return_train_score:
        ret = [train_scores, test_scores]
    else:
        ret = [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret += [fit_time, score_time]
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    if return_idx:
        ret += [train, test]
    return ret
Example #10
0
def _fit_and_score(estimator, frame, feature_names, target_feature,
                   scorer, parameters, verbose, scoring_params,
                   train, test, is_regression, act_args,
                   cv_fold, iteration):
    """Fits the current fold on the current parameters.

    Parameters
    ----------

    estimator : H2OPipeline or H2OEstimator
        The estimator to fit

    frame : H2OFrame, shape=(n_samples, n_features)
        The training frame

    feature_names : iterable (str)
        The feature names on which to train

    target_feature : str
        The name of the target feature

    scorer : H2OScorer
        The scoring function

    parameters : dict or None
        The parameters to set in the estimator clone. ``None`` is
        treated as an empty dict.

    verbose : int
        The level of verbosity

    scoring_params : dict or None
        The parameters to pass as kwargs to the scoring function. Only
        used when ``act_args`` is None; ``None`` is treated as an empty
        dict.

    train : iterable, shape=(n_train_samples,)
        The train fold indices

    test : iterable, shape=(n_test_samples,)
        The test fold indices

    is_regression : bool
        Whether we are fitting a continuous target

    act_args : dict or None
        :class:``skutil.metrics.GainsStatisticalReport`` args if called
        from a :class:``skutil.h2o.H2OGainsRandomizedSearchCV``. Otherwise,
        these are unused. When given, its ``'expo'``, ``'loss'`` and
        ``'prem'`` entries are sliced by ``test`` and replace
        ``scoring_params`` as the scorer kwargs.

    cv_fold : int
        The fold number for reporting

    iteration : int
        The iteration number for reporting

    Returns
    -------

    out : list, shape=(4,)
        test_score : float
            The score produced by the ``_score`` method
            on the test fold of the training set.

        len(test) : int
            The number of samples included in the
            test fold of the training set. Used later
            for IID normalizing of test scores.

        estimator : ``H2OEstimator`` or ``H2OPipeline``
            The fit pipeline or estimator. Used for later
            scoring on the validation set.

        parameters : dict
            The parameters used to fit this estimator.

    Raises
    ------
    ValueError
        If ``estimator`` is a bare ``H2OEstimator`` and a parameter name
        contains ``'__'``.
    """
    if parameters is None:
        parameters = {}

    if verbose > 1:
        if not parameters:
            msg = ''
        else:
            msg = 'Target: %s; %s' % (
                target_feature,
                ', '.join('%s=%s' % (k, v) for k, v in parameters.items()))
        print("[CV (iter %i, fold %i)] %s %s"
              % (iteration, cv_fold, msg, (64 - len(msg)) * '.'))

    # h2o doesn't currently re-order rows... and sometimes will
    # complain for some reason. We need to sort our train/test idcs
    train = sorted(train)
    test = sorted(test)

    # if act_args, then it's a gains search. We just need to slice
    # our existing numpy arrays
    if act_args is not None:
        prem = act_args['prem']
        kwargs = {
            'expo': act_args['expo'][test],
            'loss': act_args['loss'][test],
            'prem': prem[test] if prem is not None else None
        }
    else:
        # ``**None`` would raise a TypeError at the scoring call, so a
        # missing dict is treated like an empty one (as for ``parameters``).
        kwargs = scoring_params if scoring_params is not None else {}

    # generate split
    train_frame = frame[train, :]
    test_frame = frame[test, :]

    start_time = time.time()

    # it's probably a pipeline
    is_h2o_est = isinstance(estimator, H2OEstimator)
    if not is_h2o_est:
        # the feature/target names are expected to be set pre-clone
        estimator.set_params(**parameters)

        # do fit
        estimator.fit(train_frame)
    else:  # it's just an H2OEstimator
        # Validate every name first so a bad key cannot leave the
        # estimator half-configured.
        for k in parameters:
            if '__' in k:
                raise ValueError('only one estimator passed to grid search, '
                                 'but multiple named parameters passed: %s' % k)

        for k, v in parameters.items():
            estimator._parms[k] = v

        # do train
        estimator.train(training_frame=train_frame, x=feature_names,
                        y=target_feature)

    # score model
    test_score = _score(estimator, test_frame, target_feature, scorer,
                        is_regression, **kwargs)

    # h2o is verbose.. if we are too, print a new line:
    if verbose > 1:
        print()  # new line

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ', score=%f' % test_score
    if verbose > 1:
        end_msg = '%s -%s' % (msg, logger.short_format_time(scoring_time))
        print('[CV (iter %i, fold %i)] %s %s'
              % (iteration, cv_fold, (64 - len(end_msg)) * '.', end_msg))
        print()  # new line
        print()  # new line

    out = [test_score, len(test), estimator, parameters]
    return out
Example #11
0
def fit_grid_point(X, y, sample_weight, base_clf,
                   clf_params, train, test, verbose,
                   **fit_params):
    """Fit a clone of ``base_clf`` on the training samples.

    The clone receives ``clf_params`` through ``set_params`` and is fitted
    on the rows selected by ``train``.  Nothing is scored here; ``test`` is
    only passed through to the caller.

    Parameters
    ----------
    X, y :
        Data and targets; both are passed through ``check_arrays`` first.
        ``y`` may be None.

    sample_weight : indexable or None
        Per-sample weights; the training subset is forwarded to ``fit`` as
        ``sample_weight`` when not None.

    base_clf : estimator object
        Template estimator; it is cloned, never fitted itself.  If it has
        a truthy ``_pairwise`` attribute, ``X`` is treated as a precomputed
        square kernel matrix.

    clf_params : dict
        Parameters applied to the clone.

    train, test :
        Training / test sample indices.

    verbose : int
        Progress lines are printed when greater than 1.

    **fit_params :
        Extra keyword arguments forwarded to ``clf.fit``.

    Returns
    -------
    clf, clf_params, train, test : tuple
        The fitted clone followed by the unchanged inputs.

    Raises
    ------
    ValueError
        If ``base_clf.kernel`` is callable, or if ``base_clf`` is pairwise
        and ``X`` is not square.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        # Single-argument print(...) is valid on Python 2 and Python 3.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X, y = check_arrays(X, y)
    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_clf)
    clf.set_params(**clf_params)

    if hasattr(base_clf, 'kernel') and hasattr(base_clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    # Only the training slices are needed to fit; the test slices that
    # used to be built here were never read.
    if getattr(base_clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_train = X[np.ix_(train, train)]
    else:
        X_train = X[safe_mask(X, train)]

    y_train = y[safe_mask(y, train)] if y is not None else None

    if sample_weight is not None:
        fit_params = dict(
            fit_params,
            sample_weight=sample_weight[safe_mask(sample_weight, train)])

    clf.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return clf, clf_params, train, test
Example #12
0
def score_each_boost(X, y, sample_weight,
                     clf, clf_params,
                     min_n_estimators,
                     train, test, loss_func,
                     score_func, verbose):
    """Score ``clf`` on the test fold after every boosting stage.

    ``clf`` is never fitted here: only its ``staged_predict`` /
    ``staged_score`` generators are consumed, so it must already be usable
    for prediction when it is passed in.

    Parameters
    ----------
    X, y : data and targets
        Both are passed through ``check_arrays`` before being sliced.
    sample_weight : indexable or None
        When given, and neither ``loss_func`` nor ``score_func`` is, its test
        slice is forwarded to ``clf.staged_score``.
    clf : estimator exposing ``staged_predict`` / ``staged_score`` and
        ``n_estimators``.
    clf_params : dict
        Parameters describing ``clf``; copied once per recorded stage with
        ``n_estimators`` overwritten by the stage number.
    min_n_estimators : int
        Stages numbered below this value are skipped.
    train, test : masks or indices selecting the folds.
    loss_func, score_func : callable or None
        ``loss_func`` takes precedence and its result is negated; when both
        are None ``clf.staged_score`` is used.
    verbose : int
        Progress lines are printed when greater than 1.

    Returns
    -------
    all_scores : list
        One score per recorded stage.
    all_clf_params : list of dict
        Parameter dict matching each entry of ``all_scores``.
    n_test_samples : list of int
        Size of the test fold, repeated once per recorded stage.

    Raises
    ------
    ValueError
        If ``clf.kernel`` is callable, or ``clf._pairwise`` is set and ``X``
        is not square.
    """
    if hasattr(clf, 'kernel') and hasattr(clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    X, y = check_arrays(X, y)

    # Only the test slices are needed: nothing is fitted in this function.
    if getattr(clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_test = X[np.ix_(test, train)]
    else:
        X_test = X[safe_mask(X, test)]

    y_test = y[safe_mask(y, test)] if y is not None else None

    if sample_weight is not None:
        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
    else:
        sample_weight_test = None

    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Count the samples that are actually scored (the test fold), not the
    # whole data set.
    scored = y_test if y_test is not None else X_test
    if hasattr(scored, 'shape'):
        this_n_test_samples = scored.shape[0]
    else:
        this_n_test_samples = len(scored)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    def _record(score, base_params, n_estimators):
        # Append one (score, params, fold size) triple for a boosting stage.
        params = copy(base_params)
        params['n_estimators'] = n_estimators
        all_scores.append(score)
        all_clf_params.append(params)
        n_test_samples.append(this_n_test_samples)

    # TODO: include support for sample_weight in score functions
    if loss_func is not None:
        stages = clf.staged_predict(X_test)
        to_score = lambda y_pred: -loss_func(y_test, y_pred)
    elif score_func is not None:
        stages = clf.staged_predict(X_test)
        to_score = lambda y_pred: score_func(y_test, y_pred)
    elif sample_weight_test is not None:
        stages = clf.staged_score(X_test, y_test,
                                  sample_weight=sample_weight_test)
        to_score = None
    else:
        stages = clf.staged_score(X_test, y_test)
        to_score = None

    for stage, staged in enumerate(stages, 1):
        if stage < min_n_estimators:
            continue
        _record(staged if to_score is None else to_score(staged),
                clf_params, stage)

    # boosting may have stopped early: repeat the last score for the stages
    # that were never built
    expected = clf.n_estimators - min_n_estimators + 1
    if len(all_scores) < expected:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Position p in the lists holds stage p + first_stage, because the
        # stages below min_n_estimators were skipped above.
        first_stage = max(min_n_estimators, 1)
        for position in range(len(all_scores), expected):
            _record(last_score, last_clf_params, position + first_stage)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.',
                                             end_msg))
    return all_scores, all_clf_params, n_test_samples
Example #13
0
def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
                   score_func, verbose, param_id=None, **fit_params):
    """Fit a copy of ``base_clf`` on one split and score it on the other.

    Parameters
    ----------
    X : list, tuple, array or sparse matrix
        Lists and tuples are filtered element-wise by the truthiness of
        ``train`` / ``test``; sparse input is converted to CSR and the masks
        to row indices.
    y : indexable or None
        Targets, indexed with ``train`` / ``test``.
    base_clf : estimator
        Deep-copied, then configured with ``clf_params``; the original object
        is left untouched.
    clf_params : dict
        Parameters applied with ``set_params``.
    train, test : masks selecting the two folds.
    loss_func, score_func : callable or None
        ``loss_func`` takes precedence and its result is negated; when both
        are None ``clf.score`` is used.
    verbose : int
        Progress lines are printed when greater than 1; the score is added to
        the closing line when greater than 2.
    param_id : object, optional
        Opaque identifier handed back unchanged.
    **fit_params
        Forwarded to ``clf.fit``.

    Returns
    -------
    (param_id, clf_params, this_score) : tuple

    Raises
    ------
    ValueError
        If ``base_clf`` has a ``kernel_function`` attribute, or its kernel is
        ``'precomputed'`` and ``X`` is not square.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    # FIXME we should be doing a clone here
    clf = copy.deepcopy(base_clf)
    clf.set_params(**clf_params)

    if isinstance(X, (list, tuple)):
        X_train = [X[i] for i, cond in enumerate(train) if cond]
        X_test = [X[i] for i, cond in enumerate(test) if cond]
    else:
        if sp.issparse(X):
            # For sparse matrices, slicing only works with indices
            # (no masked array). Convert to CSR format for efficiency and
            # because some sparse formats don't support row slicing.
            X = sp.csr_matrix(X)
            ind = np.arange(X.shape[0])
            train = ind[train]
            test = ind[test]
        if hasattr(base_clf, 'kernel_function'):
            # cannot compute the kernel values with custom function
            raise ValueError(
                "Cannot use a custom kernel function. "
                "Precompute the kernel matrix instead.")
        if getattr(base_clf, 'kernel', '') == 'precomputed':
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[train]
            X_test = X[test]

    if y is not None:
        y_test = y[test]
        y_train = y[train]
    else:
        y_test = None
        y_train = None

    clf.fit(X_train, y_train, **fit_params)

    if loss_func is not None:
        # Losses are negated so that larger is always better.
        this_score = -loss_func(y_test, clf.predict(X_test))
    elif score_func is not None:
        this_score = score_func(y_test, clf.predict(X_test))
    else:
        this_score = clf.score(X_test, y_test)

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
    return param_id, clf_params, this_score
Example #14
0
def _extended_fit_and_score(estimator,
                            X,
                            y,
                            scorer,
                            train,
                            test,
                            verbose,
                            parameters,
                            fit_params,
                            return_train_score=False,
                            return_parameters=False,
                            error_score='raise',
                            extraOut="auto"):
    """Fit ``estimator`` on one split, score it, and collect extra outputs.

    Parameters
    ----------
    estimator : object with ``fit`` and ``set_params``
        Configured in place with ``parameters`` and then fitted.
    X, y : data and targets (``y`` may be None).
    scorer : passed through to ``_score``.
    train, test : arrays
        Fold selectors; their ``shape[0]`` is reported under ``"counts"``.
    verbose : int
        Progress lines are printed when greater than 1; the test size and
        score are added when greater than 2.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Keyword arguments for ``fit``; each value goes through
        ``_index_param_value`` with the training selector.
    return_train_score : bool
        Also score the training fold and put that score first.
    return_parameters : bool
        Append ``parameters`` to the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit failure; a number is used as the score
        and a ``FitFailedWarning`` is issued.
    extraOut : None, "auto" or container of str
        Selects the entries of the trailing dict: ``"counts"`` is always
        present unless ``extraOut`` is None; ``"estimator"`` when requested;
        ``"probabilities"`` (``predict_proba`` output keyed by the entries of
        ``test``) for "auto" or when "predictions" is requested;
        ``"importances"`` for "auto" or when requested, if the estimator has
        ``feature_importances_``.

    Returns
    -------
    list
        ``[train_score (optional), test_score, n_test_samples, scoring_time,
        parameters (optional), extras_dict]``.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither 'raise' nor a number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            # NOTE(review): this still runs after a fit failure that was
            # absorbed by a numeric error_score -- confirm the estimator can
            # predict in that situation.
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            extraRVs["probabilities"] = dict(zip(test, probabilities))
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(
                estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)

    return ret
Example #15
0
def _fit_and_score(
    estimator,
    X,
    y,
    scorer,
    train,
    test,
    verbose,
    parameters,
    fit_params,
    return_train_score=False,
    return_parameters=False,
    return_n_test_samples=False,
    return_times=False,
    error_score="raise",
):
    """
    Fit estimator and compute scores for a given dataset split.

    ``scorer`` is iterated, so every score entry is a list with one value per
    scorer. After a fit failure absorbed by a numeric ``error_score`` the
    score entries are that bare number instead.

    Parameters
    ----------
    estimator : object with ``fit`` and ``set_params``
        Configured in place with ``parameters`` and then fitted.
    X, y : data and targets (``y`` may be None).
    scorer : iterable of scorers, each passed to ``_score``.
    train, test : fold selectors handed to ``_safe_split``.
    verbose : int
        Progress is logged when greater than 1; scores are added to the
        closing line when greater than 2.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Keyword arguments for ``fit``; each value goes through
        ``_index_param_value`` with the training selector.
    return_train_score, return_parameters, return_n_test_samples,
    return_times : bool
        Select the optional entries of the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit failure; a number is used as the score
        and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_score (optional), test_score, n_test_samples (optional),
        fit_time, score_time (optional pair), parameters (optional)]``.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither 'raise' nor a number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ""
        else:
            msg = ", ".join("%s=%s" % (k, v) for k, v in parameters.items())
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * ".")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == "raise":
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e),
                FitFailedWarning,
            )
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [
                _score(estimator, X_train, y_train, s) for s in scorer
            ]

    if verbose > 2:
        # test_score is a bare number after an absorbed fit failure; wrap it
        # so the message can be built the same way in both cases.
        if isinstance(test_score, numbers.Number):
            reported = [test_score]
        else:
            reported = test_score
        # Prefix every score, including the first one.
        msg += "".join(", score=%f" % ts for ts in reported)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * ".", end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #16
0
def _fit_and_score_clean_test(estimator,
                              X,
                              y,
                              scorer,
                              train,
                              test,
                              verbose,
                              parameters,
                              fit_params,
                              return_train_score=False,
                              return_parameters=False,
                              return_n_test_samples=False,
                              return_times=False,
                              error_score='raise'):
    """Fit on the training fold and score against optional clean test targets.

    Works like a regular fit-and-score step, except that a ``'y_clean'``
    entry in ``fit_params`` is removed before fitting and, when it is not
    None, used instead of ``y`` to build the test targets.

    ``scorer`` is either a callable (single metric) or a mapping of scorers
    (multi-metric); in the latter case the scores are dicts keyed like
    ``scorer``.

    Returns a list ``[train_scores (optional), test_scores,
    n_test_samples (optional), fit_time, score_time (optional pair),
    parameters (optional)]``.

    Raises ``ValueError`` when fitting fails and ``error_score`` is neither
    ``'raise'`` nor a number.
    """
    if verbose > 1:
        msg = ''
        if parameters is not None:
            msg = ', '.join('%s=%s' % item for item in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Work on a private copy so the caller's dict keeps its 'y_clean' entry.
    fit_kwargs = ({} if fit_params is None else fit_params).copy()
    # MOD Get y_clean if available
    y_clean = fit_kwargs.pop('y_clean', None)
    # Adjust length of sample weights
    fit_kwargs = {key: _index_param_value(X, value, train)
                  for key, value in fit_kwargs.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    # MOD use y_clean for test if available
    y_for_test = y if y_clean is None else y_clean
    X_test, y_test = _safe_split(estimator, X, y_for_test, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    def _failure_scores():
        # Score container to report when fitting failed.
        if is_multimetric:
            return dict(zip(scorer.keys(), [error_score] * n_scorers))
        return error_score

    try:
        if y_train is not None:
            estimator.fit(X_train, y_train, **fit_kwargs)
        else:
            estimator.fit(X_train, **fit_kwargs)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        test_scores = _failure_scores()
        if return_train_score:
            train_scores = _failure_scores()
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, e), FitFailedWarning)

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            msg += ''.join(", %s=%s" % item for item in test_scores.items())
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        end_msg = "%s, total=%s" % (
            msg, logger.short_format_time(score_time + fit_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [test_scores]
    if return_train_score:
        ret.insert(0, train_scores)
    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret += [fit_time, score_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Example #17
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, scorer_params, return_train_score=False,
                   return_parameters=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like or None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function, handed to ``_score`` together
        with the matching slice of ``scorer_params``.

    train : array-like, shape = (n_train_samples,)
        Indices of training samples.

    test : array-like, shape = (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``. Values whose
        length equals the number of samples are restricted to ``train``.

    scorer_params : dict or None
        Parameters that will be passed to the scorer. Values whose length
        equals the number of samples are restricted to ``train`` or ``test``
        depending on which fold is being scored.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    n_samples = _num_samples(X)

    def _restrict(params, indices):
        # Slice every per-sample value (length == n_samples) down to
        # ``indices``; pass everything else through untouched.
        params = params if params is not None else {}
        return {k: (np.asarray(v)[indices]
                    if hasattr(v, '__len__') and len(v) == n_samples else v)
                for k, v in params.items()}

    # Adjust length of sample weights
    fit_params = _restrict(fit_params, train)

    # Same, but take both slices
    train_scorer_params = _restrict(scorer_params, train)
    test_scorer_params = _restrict(scorer_params, test)

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer,
                        **test_scorer_params)
    if return_train_score:
        train_score = _score(estimator, X_train, y_train, scorer,
                             **train_scorer_params)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #18
0
def fit_grid_point_extended(X, y, base_estimator, parameters, train, test, scorer,
                   verbose, loss_func=None, extraOut="auto", **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    base_estimator : estimator object
        This estimator will be cloned and then fitted.

    parameters : dict
        Parameters to be set on base_estimator clone for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None.
        If provided must be a scorer callable object / function with
        signature ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.

    loss_func : object, optional
        Accepted but not used by this function.

    extraOut : None, "auto" or container of str
        Selects the entries of the returned ``extras`` dict: ``"estimator"``
        when requested; ``"predictions"`` for "auto" or when requested;
        ``"importances"`` for "auto" or when requested, if the fitted clone
        has ``feature_importances_``. None yields an empty dict.

    **fit_params : kwargs
        Additional parameter passed to the fit function of the estimator.

    Returns
    -------
    score : float
        Score of this parameter setting on given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.

    extras : dict
        Optional outputs selected by ``extraOut``. ``"predictions"`` maps
        each test sample index to the prediction made for that sample.

    Raises
    ------
    ValueError
        For a callable kernel, a pairwise estimator given non-array or
        non-square ``X``, or a scorer that does not return a number.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_estimator)
    clf.set_params(**parameters)

    if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    is_pairwise = getattr(base_estimator, "_pairwise", False)
    if not hasattr(X, "shape"):
        if is_pairwise:
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    elif is_pairwise:
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_train = X[np.ix_(train, train)]
        X_test = X[np.ix_(test, train)]
    else:
        X_train = X[safe_mask(X, train)]
        X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
        clf.fit(X_train, y_train, **fit_params)

        if scorer is not None:
            this_score = scorer(clf, X_test, y_test)
        else:
            this_score = clf.score(X_test, y_test)
    else:
        clf.fit(X_train, **fit_params)
        if scorer is not None:
            this_score = scorer(clf, X_test)
        else:
            this_score = clf.score(X_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    extraRVs = {}
    if extraOut is not None:
        if "estimator" in extraOut:
            extraRVs["estimator"] = clf
        if extraOut == "auto" or "predictions" in extraOut:
            # Predict on the test fold only and key each prediction by the
            # index of the sample it belongs to. A boolean mask is turned
            # into the positions it selects first.
            test_indices = np.asarray(test)
            if test_indices.dtype == bool:
                test_indices = np.flatnonzero(test_indices)
            predictions = clf.predict(X_test)
            extraRVs["predictions"] = dict(zip(test_indices, predictions))
        if ((extraOut == "auto" or "importances" in extraOut)
                and hasattr(clf, "feature_importances_")):
            extraRVs["importances"] = clf.feature_importances_
    rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
    return rvs
def monkeypatch_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                              parameters, fit_params, return_train_score=False,
                              return_parameters=False, return_n_test_samples=False,
                              return_times=False, return_estimator=False,
                              error_score='raise-deprecating'):
    """Fit and score one split, transforming the data once for Pipelines.

    When ``estimator`` is a ``Pipeline`` its last step is popped off (the
    pipeline object is modified in place), the remaining steps are fitted and
    used to transform the train and test data, and only the popped step is
    fitted and scored on the transformed data. Fit parameters prefixed with
    ``<last step name>__`` are routed to that step; the rest go to the
    remaining pipeline.

    Parameters
    ----------
    estimator : estimator or Pipeline
        Configured in place with ``parameters`` before fitting.
    X, y : data and targets (``y`` may be None).
    scorer : callable or mapping of scorers
        A non-callable is treated as multi-metric; scores are then dicts
        keyed like ``scorer``.
    train, test : fold selectors handed to ``_safe_split``.
    verbose : int
        Progress lines are printed when greater than 1; scores are added to
        the closing line when greater than 2.
    parameters : dict or None
        Parameters set on ``estimator``.
    fit_params : dict or None
        Keyword arguments for fitting; each value goes through
        ``_index_param_value`` with the training selector.
    return_train_score, return_parameters, return_n_test_samples,
    return_times : bool
        Select the optional entries of the result.
    return_estimator : bool
        Accepted for signature compatibility; not used by this function.
    error_score : 'raise', 'raise-deprecating' or number
        Either string re-raises a fit failure; a number is used as the score
        and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_scores (optional), test_scores, n_test_samples (optional),
        fit_time, score_time (optional pair), parameters (optional)]``.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is not one of the accepted
        values.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    # ===================================================================
    # BEGIN MONKEYPATCH MODIFICATION
    # ===================================================================

    try:
        if isinstance(estimator, Pipeline):
            pipe = estimator
            est_name, estimator = pipe.steps.pop()

            fit_params_est = {}
            # Iterate over a snapshot of the keys: entries are popped from
            # fit_params inside the loop.
            for pname in list(fit_params):
                step, sep, param = pname.partition('__')
                if sep and step == est_name:
                    fit_params_est[param] = fit_params.pop(pname)

        else:
            pipe = None

        if y_train is None:
            if pipe is not None:
                X_train = pipe.fit_transform(X_train, **fit_params)
                X_test = pipe.transform(X_test, **fit_params)
                fit_params = fit_params_est
            estimator.fit(X_train, **fit_params)
        else:
            if pipe is not None:
                X_train = pipe.fit_transform(X_train, y_train, **fit_params)
                X_test = pipe.transform(X_test, **fit_params)
                fit_params = fit_params_est
            estimator.fit(X_train, y_train, **fit_params)

    # ===================================================================
    # END MONKEYPATCH MODIFICATION
    # ===================================================================

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score in ('raise', 'raise-deprecating'):
            # The default 'raise-deprecating' must surface the original
            # failure rather than a complaint about error_score itself.
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #20
0
def nested_fit_and_score(
        estimator, X, y, scorer, train, test, verbose=1,
        parameters=None, fit_params=None, return_train_score=False,
        return_times=False, error_score='raise'):
    """Fit ``estimator`` on one train/test split and score it.

    In addition to the score computed with ``scorer``, the accuracy on the
    test split is computed with
    ``check_scoring(estimator, scoring='accuracy')``.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        split.
    X, y : indexable
        Data and targets, split with ``_safe_split``.
        NOTE(review): the logging and scoring code calls ``sum(y_train)``
        and ``set(y_test)``, so ``y`` is presumably a 1-D array of 0/1
        labels -- confirm against callers.
    scorer : callable
        Scorer handed to ``_score``.
    train, test : array-like
        Indices of the train and test samples.
    verbose : int, default 1
        ``> 0`` logs the iteration time and scores; ``> 1`` additionally
        logs the sizes and label sums of both splits.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value`` with the ``train`` indices.
    return_train_score : bool, default False
        Also score the train split and report it under ``'train'``.
    return_times : bool, default False
        Also report ``[fit_time, score_time]`` under ``'times'``.
    error_score : 'raise' or numeric, default 'raise'
        With ``'raise'`` an exception raised by ``fit`` propagates. With a
        number the failure is logged and that number is used for the test
        score, the accuracy and (if requested) the train score.

    Returns
    -------
    ret : dict
        ``{'test': {'score': ..., 'accuracy': ...}}`` plus the optional
        ``'train'`` and ``'times'`` entries. ``score`` is ``None`` when the
        test split holds a single label value and scoring was skipped.
    estimator : estimator object
        The estimator passed in, after the fit attempt.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info('CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
                 len(X_train), len(X_train) - sum(y_train), sum(y_train),
                 len(X_test), len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            # The accuracy is logged and returned below, so it needs a value
            # on this path too; otherwise a tolerated fit failure would end
            # in a NameError.
            acc_score = error_score
            if return_train_score:
                train_score = error_score
            LOG.warning("Classifier fit failed. The score on this train-test"
                        " partition for these parameters will be set to %f. "
                        "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        # The scorer is only applied when the test split holds more than one
        # distinct label value.
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning('Test set has no positive labels, scoring has been '
                        'skipped in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {
        'test': {'score': test_score, 'accuracy': acc_score}
    }

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
Example #21
0
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=False,
                   error_score='raise',
                   to_evaluate=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        split.
    X, y : indexable
        Data and targets, split with ``_safe_split``.
    scorer : iterable
        Each item is handed to ``_score`` and used as a key of the test
        score dict. NOTE(review): items are concatenated with ``"_"`` when
        ``to_evaluate`` is given, so they are presumably scorer names
        (strings) -- confirm against ``_score``.
    train, test : array-like
        Indices of the train and test samples.
    verbose : int
        Not consulted: progress is always written to ``LOG``.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value`` with the ``train`` indices.
    return_train_score : bool, default False
        Prepend the train scores (a list, one entry per scorer).
    return_parameters : bool, default False
        Append ``parameters`` to the result.
    return_n_test_samples : bool, default False
        Append ``_num_samples(X_test)`` to the result.
    return_times : bool, default False
        Append ``fit_time`` and ``score_time`` to the result.
    error_score : 'raise' or numeric, default 'raise'
        With ``'raise'`` an exception raised by ``fit`` propagates. With a
        number a ``FitFailedWarning`` is issued and that number replaces the
        test (and train) score.
    to_evaluate : dict or None
        Maps a dataset name to a dict with keys ``"x"`` and ``"y"``. Each
        dataset is scored with every scorer and the results are appended as
        one dict keyed ``"<dataset>_<scorer>"``.

    Returns
    -------
    numpy.ndarray
        ``np.squeeze`` of the list ``[train_score?, test_score,
        n_test_samples?, fit_time?, score_time?, parameters?,
        evaluation?]``. ``test_score`` is a dict keyed by scorer after a
        successful fit and the scalar ``error_score`` otherwise.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    # Progress is logged regardless of ``verbose``.
    if parameters is None:
        msg = ''
    else:
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
    LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to {}. "
                "Details: \n{} \n model: {}".format(error_score, e, estimator),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = {s: _score(estimator, X_test, y_test, s) for s in scorer}

        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [
                _score(estimator, X_train, y_train, s) for s in scorer
            ]

    # The format string needs a placeholder, otherwise the score is silently
    # dropped from the log line.
    msg += ", score={}".format(test_score)
    total_time = score_time + fit_time
    end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
    LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))

    if return_times:
        ret.extend([fit_time, score_time])

    if return_parameters:
        ret.append(parameters)

    if to_evaluate:
        # Score every extra dataset with every scorer.
        # NOTE(review): this also runs after a tolerated fit failure, where
        # the estimator may not be usable -- confirm intended behaviour.
        res_evaluation = dict()
        for ds_name, ds_vals in to_evaluate.items():
            for s in scorer:
                res_evaluation[ds_name + "_" + s] = _score(
                    estimator, ds_vals["x"], ds_vals["y"], s)
        ret.append(res_evaluation)

    return np.squeeze(ret)
Example #22
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        split.
    X, y : indexable
        Data and targets, split with ``_safe_split``.
    scorer : iterable
        Each item is handed to ``_score``; one score per item is produced.
    train, test : array-like
        Indices of the train and test samples.
    verbose : int
        ``> 1`` logs the start and end of the iteration to ``LOG``;
        ``> 2`` additionally includes the test scores.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value`` with the ``train`` indices.
    return_train_score : bool, default False
        Prepend the train scores to the result.
    return_parameters : bool, default False
        Append ``parameters`` to the result.
    return_n_test_samples : bool, default False
        Append ``_num_samples(X_test)`` to the result.
    return_times : bool, default False
        Append ``fit_time`` and ``score_time`` to the result.
    error_score : 'raise' or numeric, default 'raise'
        With ``'raise'`` an exception raised by ``fit`` propagates. With a
        number a ``FitFailedWarning`` is issued and that number replaces the
        test (and train) score.

    Returns
    -------
    list
        ``[train_score?, test_score, n_test_samples?, fit_time?,
        score_time?, parameters?]``. The scores are lists (one entry per
        scorer) after a successful fit and the scalar ``error_score``
        otherwise.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        # After a tolerated fit failure ``test_score`` is the scalar
        # ``error_score`` rather than a list, so normalise before iterating.
        logged = test_score if isinstance(test_score, list) else [test_score]
        # Every score gets its own ", score=" label; using the label as a
        # join separator would glue the first score onto the message.
        msg += "".join(", score=%f" % ts for ts in logged)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #23
0
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   error_score='raise'):
    """Train ``estimator`` on one split and evaluate it on the held-out part.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        Model that is configured, fitted and scored.

    X : array-like of shape at least 2D
        Full data set; the split is taken with ``train`` / ``test``.

    y : array-like, optional, default: None
        Targets for supervised learning. When the train targets are
        ``None`` the estimator is fitted on ``X`` alone.

    scorer : callable
        Scorer with signature ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices selecting the training samples.

    test : array-like, shape (n_test_samples,)
        Indices selecting the test samples.

    verbose : integer
        Above 1 the start and end of the run are printed; above 2 the test
        score is included in the closing line.

    parameters : dict or None
        Passed to ``estimator.set_params`` before fitting.

    fit_params : dict or None
        Keyword arguments forwarded to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Also evaluate on the training samples and return that score first.

    return_parameters : boolean, optional, default: False
        Append ``parameters`` to the returned list.

    error_score : 'raise' (default) or numeric
        What to do when fitting fails: ``'raise'`` re-raises the error, a
        number is used as the score and a FitFailedWarning is issued.

    Returns
    -------
    ret : list
        ``[train_score]`` (only with ``return_train_score``) followed by
        ``test_score``, the number of test samples, the elapsed time in
        seconds for fitting and scoring, and finally ``parameters`` (only
        with ``return_parameters``).
    """
    announce = verbose > 1
    if announce:
        if parameters is not None:
            described = ', '.join('%s=%s' % (name, value)
                                  for name, value in parameters.items())
            msg = '%s' % (described)
        else:
            msg = "no parameters to be set"
        padding = (64 - len(msg)) * '.'
        print("[CV] %s %s" % (msg, padding))

    # Restrict per-sample fit parameters to the training indices.
    if fit_params is None:
        fit_params = {}
    fit_params = {name: _index_param_value(X, value, train)
                  for name, value in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    started = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is not None:
            estimator.fit(X_train, y_train, **fit_params)
        else:
            estimator.fit(X_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        # Fitting failed but the caller asked for a placeholder score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, e), FitFailedWarning)

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - started

    if verbose > 2:
        msg += ", score=%f" % test_score
    if announce:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        padding = (64 - len(end_msg)) * '.'
        print("[CV] %s %s" % (padding, end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Example #24
0
def _fit_and_score_keras2(method,
                          X,
                          y,
                          scorer,
                          train,
                          test,
                          verbose,
                          parameters,
                          fit_params,
                          type="Classification",
                          return_train_score=False,
                          return_parameters=False,
                          return_n_test_samples=False,
                          return_times=False,
                          error_score='raise'):
    """Fit estimator and compute scores for a given dataset split for KerasClassifier and KerasRegressor.

    A fresh TensorFlow session (with ``allow_growth`` enabled) is installed
    as the Keras session on entry. On exit -- whether the call returns or
    raises -- the estimator reference is dropped, ``K.clear_session()`` is
    called and the garbage collector is run, so that GPU memory is not left
    allocated between cross-validation iterations.

    Parameters
    ----------
    method : callable
        Model-building function, passed as ``build_fn`` to the Keras
        scikit-learn wrapper.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    type : str, default: "Classification"
        ``"Classification"`` wraps ``method`` in a ``KerasClassifier``; any
        other value wraps it in a ``KerasRegressor``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    from keras import backend as K
    import tensorflow as tf
    tf.logging.set_verbosity(
        tf.logging.ERROR)  # This is useful to avoid the info log of tensorflow
    # The next 4 lines are for avoiding tensorflow to allocate all the GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    estimator = None
    try:
        if verbose > 1:
            if parameters is None:
                msg = ''
            else:
                msg = '%s' % (', '.join('%s=%s' % (k, v)
                                        for k, v in parameters.items()))
            print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

        # Adjust length of sample weights
        fit_params = fit_params if fit_params is not None else {}
        fit_params = {k: _index_param_value(X, v, train)
                      for k, v in fit_params.items()}

        test_scores = {}
        train_scores = {}
        if type == "Classification":
            from keras.wrappers.scikit_learn import KerasClassifier
            estimator = KerasClassifier(build_fn=method, verbose=0)
        else:
            from keras.wrappers.scikit_learn import KerasRegressor
            estimator = KerasRegressor(build_fn=method, verbose=0)

        if parameters is not None:
            estimator.set_params(**parameters)

        start_time = time.time()

        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)

        is_multimetric = not callable(scorer)
        n_scorers = len(scorer.keys()) if is_multimetric else 1

        try:
            if y_train is None:
                estimator.fit(X_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

        except Exception as e:
            # Note fit time as time until error
            fit_time = time.time() - start_time
            score_time = 0.0
            if error_score == 'raise':
                raise
            elif isinstance(error_score, numbers.Number):
                if is_multimetric:
                    test_scores = dict(
                        zip(scorer.keys(), [
                            error_score,
                        ] * n_scorers))
                    if return_train_score:
                        train_scores = dict(
                            zip(scorer.keys(), [
                                error_score,
                            ] * n_scorers))
                else:
                    test_scores = error_score
                    if return_train_score:
                        train_scores = error_score
                warnings.warn(
                    "Classifier fit failed. The score on this train-test"
                    " partition for these parameters will be set to %f. "
                    "Details: \n%r" % (error_score, e), FitFailedWarning)
            else:
                raise ValueError(
                    "error_score must be the string 'raise' or a"
                    " numeric value. (Hint: if using 'raise', please"
                    " make sure that it has been spelled correctly.)")

        else:
            fit_time = time.time() - start_time
            # _score will return dict if is_multimetric is True
            test_scores = _score(estimator, X_test, y_test, scorer,
                                 is_multimetric)
            score_time = time.time() - start_time - fit_time
            if return_train_score:
                train_scores = _score(estimator, X_train, y_train, scorer,
                                      is_multimetric)

        if verbose > 2:
            if is_multimetric:
                for scorer_name, score in test_scores.items():
                    msg += ", %s=%s" % (scorer_name, score)
            else:
                msg += ", score=%s" % test_scores
        if verbose > 1:
            total_time = score_time + fit_time
            end_msg = "%s, total=%s" % (msg,
                                        logger.short_format_time(total_time))
            print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

        ret = ([train_scores, test_scores]
               if return_train_score else [test_scores])

        if return_n_test_samples:
            ret.append(_num_samples(X_test))
        if return_times:
            ret.extend([fit_time, score_time])
        if return_parameters:
            ret.append(parameters)
        return ret
    finally:
        # Cleanup runs on every exit path; previously a propagated exception
        # skipped it and left the session allocated.
        # The estimator is erased
        del estimator
        # Clean the session
        K.clear_session()
        # The garbage collector is called in order to ensure that the
        # estimator is erased from memory
        for _ in range(15):
            gc.collect()
Example #25
0
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    """Fit and score one split, then append a dict of extra outputs.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        split.
    X, y : indexable
        Data and targets, split with ``_safe_split``.
    scorer : callable
        Scorer handed to ``_score``.
    train, test : array
        Indices of the train and test samples; ``.shape[0]`` is read from
        both when ``extraOut`` is not ``None``.
    verbose : int
        ``> 1`` prints the start and end of the run; ``> 2`` includes the
        test score.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value`` with the ``train`` indices.
    return_train_score : bool, default False
        Prepend the train score to the result.
    return_parameters : bool, default False
        Append ``parameters`` to the result (before the extras dict).
    error_score : 'raise' or numeric, default 'raise'
        With ``'raise'`` an exception raised by ``fit`` propagates. With a
        number a ``FitFailedWarning`` is issued and that number replaces the
        test (and train) score.
    extraOut : None, "auto" or container of str, default "auto"
        Selects the extras. ``None`` yields an empty dict. Otherwise
        ``"counts"`` is always included; ``"estimator"`` adds the estimator;
        ``"predictions"`` (or ``"auto"``) adds ``"probabilities"``, a dict
        mapping each test index to its ``predict_proba`` row;
        ``"importances"`` (or ``"auto"``) adds ``feature_importances_`` when
        the estimator has that attribute. The last two are skipped when the
        fit failed.

    Returns
    -------
    list
        ``[train_score?, test_score, n_test_samples, scoring_time,
        parameters?, extras]``.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        want_predictions = extraOut == "auto" or "predictions" in extraOut
        want_importances = extraOut == "auto" or "importances" in extraOut
        # A tolerated fit failure leaves the estimator unfitted; querying it
        # would turn the handled failure back into an exception.
        if want_predictions and not fit_failed:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            extraRVs["probabilities"] = dict(zip(test, probabilities))
        if (want_importances and not fit_failed
                and hasattr(estimator, "feature_importances_")):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)

    return ret
Example #26
0
def nested_fit_and_score(estimator,
                         X,
                         y,
                         scorer,
                         train,
                         test,
                         verbose=1,
                         parameters=None,
                         fit_params=None,
                         return_train_score=False,
                         return_times=False,
                         error_score='raise'):
    """Fit ``estimator`` on one train/test split and score it.

    In addition to the score computed with ``scorer``, the accuracy on the
    test split is computed with
    ``check_scoring(estimator, scoring='accuracy')``.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        split.
    X, y : indexable
        Data and targets, split with ``_safe_split``.
        NOTE(review): the logging and scoring code calls ``sum(y_train)``
        and ``set(y_test)``, so ``y`` is presumably a 1-D array of 0/1
        labels -- confirm against callers.
    scorer : callable
        Scorer handed to ``_score``.
    train, test : array-like
        Indices of the train and test samples.
    verbose : int, default 1
        ``> 0`` logs the iteration time and scores; ``> 1`` additionally
        logs the sizes and label sums of both splits.
    parameters : dict or None
        Parameters set on ``estimator`` before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value`` with the ``train`` indices.
    return_train_score : bool, default False
        Also score the train split and report it under ``'train'``.
    return_times : bool, default False
        Also report ``[fit_time, score_time]`` under ``'times'``.
    error_score : 'raise' or numeric, default 'raise'
        With ``'raise'`` an exception raised by ``fit`` propagates. With a
        number the failure is logged and that number is used for the test
        score, the accuracy and (if requested) the train score.

    Returns
    -------
    ret : dict
        ``{'test': {'score': ..., 'accuracy': ...}}`` plus the optional
        ``'train'`` and ``'times'`` entries. ``score`` is ``None`` when the
        test split holds a single label value and scoring was skipped.
    estimator : estimator object
        The estimator passed in, after the fit attempt.

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {
        k: _index_param_value(X, v, train)
        for k, v in fit_params.items()
    }

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info(
            'CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
            len(X_train),
            len(X_train) - sum(y_train), sum(y_train), len(X_test),
            len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            # The accuracy is logged and returned below, so it needs a value
            # on this path too; otherwise a tolerated fit failure would end
            # in a NameError.
            acc_score = error_score
            if return_train_score:
                train_score = error_score
            LOG.warning(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        # The scorer is only applied when the test split holds more than one
        # distinct label value.
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning(
                'Test set has no positive labels, scoring has been skipped '
                'in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {'test': {'score': test_score, 'accuracy': acc_score}}

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
Example #27
0
def _fit_and_score(estimator,
                   Z,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Parameters
    ----------
    estimator : estimator object
        Configured with ``parameters`` (if given) and fitted on the train
        part of ``Z``.
    Z : indexable
        Data container; ``Z[train]`` and ``Z[test]`` select the two parts.
    scorer : callable
        Scorer handed to ``_score``.
    train, test : index
        Selectors applied to ``Z``.
    verbose : int
        Above 1 the start and end of the run are printed; above 2 the test
        score is included in the closing line.
    parameters : dict or None
        Passed to ``estimator.set_params`` before fitting.
    fit_params : dict or None
        Keyword arguments forwarded to ``estimator.fit``.
    return_train_score : bool, default False
        Also score the train part and return that score first.
    return_parameters : bool, default False
        Append ``parameters`` to the returned list.
    error_score : 'raise' or numeric, default 'raise'
        What to do when fitting fails: ``'raise'`` re-raises the error, a
        number is used as the score and a FitFailedWarning is issued.

    Returns
    -------
    list
        ``[train_score?, test_score, n_test_samples, scoring_time,
        parameters?]``.
    """
    announce = verbose > 1
    if announce:
        if parameters is not None:
            described = ', '.join('%s=%s' % (name, value)
                                  for name, value in parameters.items())
            msg = '%s' % (described)
        else:
            msg = "no parameters to be set"
        padding = (64 - len(msg)) * '.'
        print("[CV] %s %s" % (msg, padding))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    started = time.time()

    Z_train, Z_test = Z[train], Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        # Fitting failed but the caller asked for a placeholder score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - started

    if verbose > 2:
        msg += ", score=%f" % test_score
    if announce:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        padding = (64 - len(end_msg)) * '.'
        print("[CV] %s %s" % (padding, end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Example #28
0
def _fit_and_score_multisignal(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise', logger=logger):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level. Above 1, a start and an end line are logged at
        INFO level; above 2, the end line also carries the test score(s).

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``. Each value is
        first passed through ``_index_param_value`` with the ``train``
        indices.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is re-raised. If a numeric value is
        given, that value is used for every score and a warning describing
        the failure is logged through ``logger``.

    logger : logger-like object
        Object whose ``info`` and ``warning`` methods receive the progress
        and failure messages. Defaults to the module-level ``logger``.

    Returns
    -------
    ret : list
        The items below, in this order; optional items are simply absent
        when not requested.

        train_scores : dict of scorer name -> float, or float
            Score on training set (for all the scorers),
            returned only if `return_train_score` is `True`.

        test_scores : dict of scorer name -> float, or float
            Score on testing set (for all the scorers). Always returned.

        n_test_samples : int
            Number of test samples, returned only if
            `return_n_test_samples` is `True`.

        fit_time : float
            Time spent for fitting in seconds, returned only if
            `return_times` is `True`.

        score_time : float
            Time spent for scoring the test set in seconds (0.0 when the fit
            failed), returned only if `return_times` is `True`.

        parameters : dict or None
            The parameters that have been evaluated, returned only if
            `return_parameters` is `True`.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        logger.info("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split_multisignal(estimator, X, y, train)
    X_test, y_test = _safe_split_multisignal(estimator, X, y, test, train)

    # Anything that is not a single callable is treated as a dict of scorers.
    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            # The message is formatted up front and passed alone: any extra
            # positional argument would be treated by the logger as a
            # %-format argument and break emission of the record.
            logger.warning("Classifier fit failed. The score on this train-test"
                           " partition for these parameters will be set to %f. "
                           "Details: \n%r" % (error_score, e))
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        # Only test-set scoring is timed; train scoring below is excluded.
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, short_format_time(total_time))
        logger.info("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #29
0
def _fit_and_score(estimator, frame, feature_names, target_feature, scorer,
                   parameters, verbose, scoring_params, train, test,
                   is_regression, act_args, cv_fold, iteration):
    """Train on one CV fold with one parameter setting and score the rest.

    Parameters
    ----------
    estimator : H2OPipeline or H2OEstimator
        The estimator to fit. An ``H2OEstimator`` has the parameters written
        into its ``_parms`` and is trained via ``train``; anything else is
        configured via ``set_params`` and fit via ``fit``.

    frame : H2OFrame, shape=(n_samples, n_features)
        The training frame; it is row-sliced by ``train`` and ``test``.

    feature_names : iterable (str)
        The feature names on which to train (used for a bare
        ``H2OEstimator`` only).

    target_feature : str
        The name of the target feature.

    scorer : H2OScorer
        The scoring function, forwarded to ``_score``.

    parameters : dict or None
        The parameters to set on the estimator. ``None`` is treated as an
        empty dict.

    verbose : int
        The level of verbosity. Above 1, progress lines are printed; above
        2, the end line also carries the test score.

    scoring_params : dict
        Keyword arguments for the scoring function. Used only when
        ``act_args`` is None.

    train : iterable, shape=(n_train_samples,)
        The train fold indices (sorted before use).

    test : iterable, shape=(n_test_samples,)
        The test fold indices (sorted before use).

    is_regression : bool
        Whether we are fitting a continuous target.

    act_args : dict or None
        :class:``skutil.metrics.GainsStatisticalReport`` args if called
        from a :class:``skutil.h2o.H2OGainsRandomizedSearchCV``. When given,
        its ``'expo'``, ``'loss'`` and (if not None) ``'prem'`` entries are
        indexed by the test fold and used as the scoring keyword arguments.

    cv_fold : int
        The fold number for reporting.

    iteration : int
        The iteration number for reporting.

    Returns
    -------
    out : list, shape=(4,)
        ``[test_score, len(test), estimator, parameters]``: the value
        returned by ``_score`` on the test fold, the number of test
        indices, the fitted estimator, and the parameters that were applied.

    Raises
    ------
    ValueError
        If ``estimator`` is a bare ``H2OEstimator`` and a parameter name
        contains ``'__'``.
    """
    parameters = {} if parameters is None else parameters
    chatty = verbose > 1

    if chatty:
        if parameters:
            settings = ', '.join(
                '%s=%s' % (name, value) for name, value in parameters.items())
            msg = 'Target: %s; %s' % (target_feature, settings)
        else:
            msg = ''
        padding = (64 - len(msg)) * '.'
        print("[CV (iter %i, fold %i)] %s %s" % (iteration, cv_fold, msg,
                                                 padding))

    # Per the original author's note: h2o does not re-order rows and may
    # complain otherwise, so both index sets are sorted before slicing.
    train, test = sorted(train), sorted(test)

    if act_args is None:
        score_kwargs = scoring_params
    else:
        # Gains search: slice the existing arrays down to the test fold.
        score_kwargs = {}
        for key in ('expo', 'loss'):
            score_kwargs[key] = act_args[key][test]
        prem = act_args['prem']
        score_kwargs['prem'] = prem[test] if prem is not None else None

    # Materialise the split.
    train_frame, test_frame = frame[train, :], frame[test, :]

    start_time = time.time()

    if isinstance(estimator, H2OEstimator):
        # A bare estimator: write each parameter straight into ``_parms``.
        for key, value in six.iteritems(parameters):
            if '__' in key:
                raise ValueError('only one estimator passed to grid search, '
                                 'but multiple named parameters passed: %s' %
                                 key)
            estimator._parms[key] = value

        estimator.train(training_frame=train_frame,
                        x=feature_names,
                        y=target_feature)
    else:
        # Probably a pipeline; feature/target names are expected to have
        # been set on it before it was cloned.
        estimator.set_params(**parameters)
        estimator.fit(train_frame)

    test_score = _score(estimator, test_frame, target_feature, scorer,
                        is_regression, **score_kwargs)

    # h2o prints its own output; break the line before ours.
    if chatty:
        print()

    scoring_time = time.time() - start_time

    if chatty:
        if verbose > 2:
            msg += ', score=%f' % test_score
        end_msg = '%s -%s' % (msg, logger.short_format_time(scoring_time))
        print('[CV (iter %i, fold %i)] %s %s' %
              (iteration, cv_fold, (64 - len(end_msg)) * '.', end_msg))
        # Two blank lines after each fold report.
        for _ in range(2):
            print()

    return [test_score, len(test), estimator, parameters]