Example #1
        def func(x):
            parameters = self._list_to_grid_point(x, parameter_iterable)

            n_test_samples = 0
            score = 0
            all_scores = []

            for train, test in cv:
                this_score, this_n_test_samples, _, parameters = \
                        _fit_and_score(clone(base_estimator), X, y, self.scorer_,
                                       train, test, self.verbose, parameters,
                                       self.fit_params, return_parameters=True,
                                       error_score=self.error_score)
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score

            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)

            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))

            #print 'In func:', x, score
            return score
Example #2
def fas_mp(base_estimator, key, scorer, train, test, verbose,
           parameters, fit_params, return_parameters, error_score):
    samples = pickle.loads(red.get(key))
    return _fit_and_score(base_estimator, samples['X'], samples['y'], scorer,
                          train, test, verbose, parameters,
                          fit_params, return_parameters=True,
                          error_score=error_score)
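
The fas_mp helper above assumes a Redis client named red in the enclosing module and that the dataset was stored under key as a pickled dict with 'X' and 'y' entries. A minimal, hypothetical sketch of that producer side (the key name and connection settings are illustrative, not from the original project):

import pickle

import numpy as np
import redis

# Hypothetical Redis connection; the original example only shows the consumer side.
red = redis.StrictRedis(host='localhost', port=6379)

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

# Store the samples so that fas_mp can recover them with pickle.loads(red.get(key)).
red.set('samples:demo', pickle.dumps({'X': X, 'y': y}))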
Example #3
    def score(self, test_parameter):
        """
		The score function to call in order to evaluate the quality 
		of the parameter test_parameter

		Parameters
		----------
		tested_parameter : dict, the parameter to test

		Returns
		-------
		score : the CV score, either the list of all cv results or
			the mean (depending of score_format)
		"""

        if not self._callable_estimator:
            cv = check_cv(self.cv,
                          self.X,
                          self.y,
                          classifier=is_classifier(self.estimator))
            cv_score = [
                _fit_and_score(clone(self.estimator),
                               self.X,
                               self.y,
                               self.scorer_,
                               train,
                               test,
                               False,
                               test_parameter,
                               self.fit_params,
                               return_parameters=True) for train, test in cv
            ]

            n_test_samples = 0
            mean_score = 0
            detailed_score = []
            for tmp_score, tmp_n_test_samples, _, _ in cv_score:
                detailed_score.append(tmp_score)
                tmp_score *= tmp_n_test_samples
                n_test_samples += tmp_n_test_samples
                mean_score += tmp_score
            mean_score /= float(n_test_samples)

            if (self.score_format == 'avg'):
                score = mean_score
            else:  # format == 'cv'
                score = detailed_score

        else:
            if (self.score_format == 'avg'):
                score = [self.estimator(test_parameter)]
            else:  # format == 'cv'
                score = self.estimator(test_parameter)

        return score
Example #4
def _fit_and_score_helper(args):
    import numpy as np
    from sklearn.externals import six
    from sklearn.externals.joblib import load
    from sklearn.cross_validation import _fit_and_score
    args = list(args)
    if isinstance(args[1], six.string_types):
        args[1] = load(args[1], mmap_mode='c')
        if isinstance(args[1], np.memmap):
            args[1] = np.asarray(args[1])
    return _fit_and_score(*args)
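
The helper above accepts either an in-memory X or a file path in args[1], memory-mapping the array on the worker. A minimal sketch (assuming the pre-0.20 sklearn.cross_validation API used throughout these examples; the path and parameter grid are illustrative) of how the argument tuples could be prepared so that only a path is shipped to each worker:

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.externals.joblib import dump
from sklearn.metrics import get_scorer
from sklearn.svm import SVC

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)

x_path = '/tmp/X_shared.joblib'  # illustrative location
dump(X, x_path)  # workers will load this with mmap_mode='c'

# Each tuple follows the positional order of _fit_and_score:
# (estimator, X, y, scorer, train, test, verbose, parameters, fit_params)
tasks = [(SVC(), x_path, y, get_scorer('accuracy'), train, test, 0, {'C': 1.0}, {})
         for train, test in KFold(len(y), n_folds=3)]

results = [_fit_and_score_helper(task) for task in tasks]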
Example #5
    def run(self, X, Y):
        """
        Lengthy and compute-intensive part.
        Returns a Result object.
        """
        train_score, test_score, n_test_samples, scoring_time = \
            _fit_and_score(self.estimator, X, Y, self.scorer,
                           self.train_index, self.test_index, self.verbose,
                           self.meta_parameter_set, None, self.eval_on_training)
        return Result(self.get_unique_token(), train_score, test_score,
                      scoring_time, self.meta_parameter_set)
Example #6
    def score(self, test_parameter):
        """
        The score function called to evaluate the quality of the parameter
        ``test_parameter``.

        Parameters
        ----------
        test_parameter : dict
            The parameter setting to test.

        Returns
        -------
        score : the CV score, either the list of all CV results or their
            mean, depending on ``score_format``.
        """

        if not self._callable_estimator:
            cv = check_cv(self.cv, self.X, self.y,
                          classifier=is_classifier(self.estimator))
            cv_score = [_fit_and_score(clone(self.estimator), self.X, self.y,
                                       self.scorer_, train, test, False,
                                       test_parameter, self.fit_params,
                                       return_parameters=True)
                        for train, test in cv]

            n_test_samples = 0
            mean_score = 0
            detailed_score = []
            for tmp_score, tmp_n_test_samples, _, _ in cv_score:
                detailed_score.append(tmp_score)
                tmp_score *= tmp_n_test_samples
                n_test_samples += tmp_n_test_samples
                mean_score += tmp_score
            mean_score /= float(n_test_samples)

            if self.score_format == 'avg':
                score = mean_score
            else:  # format == 'cv'
                score = detailed_score

        else:
            if self.score_format == 'avg':
                score = [self.estimator(test_parameter)]
            else:  # format == 'cv'
                score = self.estimator(test_parameter)

        return score
Example #7
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, scoring=None, cv=None,
                            verbose=0, fit_params=None):

    print("dynamic cross val with %s" % (esa_feature_list + unigram_feature_list))
    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)
    scores = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X at every CV step
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to the ESA vector for this step
            for feature in esa_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the ESA feature
            for feature in unigram_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the unigram feature

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(cross_validation.clone(estimator), X, y, scorer,
                                                      train, test, verbose, None, fit_params))

        cross_val_step += 1

    return np.array(scores)[:, 0]
Example #8
    def process_batch(self, work_batch):
        fit_params = self.fit_params if self.fit_params is not None else {}

        LOG.debug("Node %d received %d work items", comm_rank, len(work_batch))

        results = []
        for fold_id, train_index, test_index, parameters in work_batch:
            ret = _fit_and_score(clone(self.estimator), self._data_X,
                                 self._data_y, self.scorer, train_index,
                                 test_index, self.verbose, parameters,
                                 fit_params)

            result = parameters.copy()
            result['score'] = ret[0]
            result['n_samples_test'] = ret[1]
            result['scoring_time'] = ret[2]
            result['fold'] = fold_id
            results.append(result)

        LOG.debug("Node %d is done with fold %d", comm_rank, fold_id)
        return results
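
The work_batch items consumed by process_batch above are 4-tuples of (fold_id, train_index, test_index, parameters). A hedged sketch of how such a batch could be assembled before being handed to the worker nodes (the MPI distribution itself is not shown; the data and grid here are illustrative stand-ins):

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.grid_search import ParameterGrid

y = np.random.randint(0, 2, size=50)  # illustrative targets
param_grid = {'C': [0.1, 1.0, 10.0]}

# One work item per (fold, parameter setting) combination.
work_batch = [(fold_id, train_index, test_index, parameters)
              for parameters in ParameterGrid(param_grid)
              for fold_id, (train_index, test_index) in enumerate(KFold(len(y), n_folds=5))]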
Example #9
def my_fit_and_score(train_test_parameters,
                     estimator=None,
                     X=None,
                     y=None,
                     verbose=False,
                     fit_params=None,
                     return_parameters=True,
                     scorer=None,
                     x_is_index=True,
                     names=('X', 'y')):
    from runner import bac_scorer, bac_error, confusion_matrix, process_cm

    train, test, parameters = train_test_parameters

    if x_is_index:
        index = X
        X = None
    if X is None:
        if 'X' in globals():
            X = globals()[names[0]]
            y = globals()[names[1]]
        else:
            X, y = loader(names[0], names[1])()
            globals()[names[0]] = X
            globals()[names[1]] = y

    if x_is_index:
        X = X[index]
        y = y[index]

    return _fit_and_score(estimator=estimator,
                          X=X,
                          y=y,
                          verbose=verbose,
                          parameters=parameters,
                          fit_params=fit_params,
                          return_parameters=return_parameters,
                          train=train,
                          test=test,
                          scorer=bac_scorer)
Example #10
def _evalFunction(individual,
                  name_values,
                  X,
                  y,
                  scorer,
                  cv,
                  iid,
                  fit_params,
                  verbose=0,
                  error_score='raise'):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0

    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv:
            _score, _, _ = _fit_and_score(estimator=individual.est,
                                          X=X,
                                          y=y,
                                          scorer=scorer,
                                          train=train,
                                          test=test,
                                          verbose=verbose,
                                          parameters=parameters,
                                          fit_params=fit_params,
                                          error_score=error_score)
            if iid:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1
        score /= float(n_test)
        score_cache[paramkey] = score

    return (score, )
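
The trailing comma in return (score, ) is deliberate: this _evalFunction appears to follow the DEAP convention used by evolutionary hyperparameter searches, where a fitness is always a tuple (even for a single objective) so its length matches the weights declared on the fitness class. A minimal sketch of that convention, assuming the deap package; the toolbox names are generic DEAP usage, not from the source:

import random

from deap import base, creator, tools

creator.create('FitnessMax', base.Fitness, weights=(1.0,))  # one weight -> 1-tuple fitness
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register('attr_float', random.random)
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_float, n=3)
toolbox.register('evaluate', lambda ind: (sum(ind),))  # evaluation returns a 1-tuple, like (score, )

ind = toolbox.individual()
ind.fitness.values = toolbox.evaluate(ind)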
Example #11
    def my_fit_and_score(index_train_test_parameters,
                         estimator=None,
                         loader=None,
                         fit_params=None,
                         scorer=None,
                         fit_callback=None):
        from sklearn.cross_validation import _fit_and_score
        gs_index, (train, test, parameters) = index_train_test_parameters

        if loader is None:
            raise ValueError('loader is missing')
        X, y = loader()
        estimator = clone(estimator)
        result = _fit_and_score(estimator=estimator,
                                X=X,
                                y=y,
                                verbose=False,
                                parameters=parameters,
                                fit_params=fit_params,
                                return_parameters=True,
                                train=train,
                                test=test,
                                scorer=scorer)

        if fit_callback:
            fit_callback(
                gs_index, {
                    'estimator': estimator,
                    'X': X,
                    'y': y,
                    'parameters': parameters,
                    'fit_params': fit_params,
                    'train': train,
                    'test': test,
                    'scorer': scorer
                })

        return gs_index, result
Example #12
def _evalFunction(individual, name_values, X, y, scorer, cv, iid, fit_params,
                  verbose=0, error_score='raise'):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0
    for train, test in cv:
        paramkey = str(individual)
        if paramkey in score_cache:
            _score = score_cache[paramkey]
        else:
            _score, _, _ = _fit_and_score(estimator=individual.est, X=X, y=y, scorer=scorer,
                                     train=train, test=test, verbose=verbose,
                                     parameters=parameters, fit_params=fit_params,
                                     error_score=error_score)
            score_cache[paramkey] = _score
        if iid:
            score += _score * len(test)
            n_test += len(test)
        else:
            score += _score
            n_test += 1
    score /= float(n_test)

    return (score,)
Example #13
    def my_fit_and_score(index_train_test_parameters,
                         estimator=None,
                         X=None,
                         y=None,
                         fit_params=None,
                         scorer=None,
                         x_is_index=True,
                         loader=None,
                         fit_callback=None):
        """
        Function representing a single task execution for GridSearchCVParallel;
        it is executed on remote machines.
        :param index_train_test_parameters: tuple of (task index, (train index, test index, hyperparameters))
        :param estimator: sklearn's BaseEstimator subtype
        :param X: the data, or index (if x_is_index is True), or None
        :param y: the target variable
        :param fit_params: parameters for fit function of estimator
        :param scorer: sklearn scorer(estimator, X, y)
        :param x_is_index: True if x is to be used to get a subset of the data
        :param loader: function to load data on remote machines
        :return: tuple of 2 (index, result of sklearn.cross_validation._fit_and_score)
        """
        from sklearn.cross_validation import _fit_and_score
        gs_index, (train, test, parameters) = index_train_test_parameters

        if x_is_index:
            index = X
            X = None
        if X is None:
            if loader is None:
                raise ValueError('loader is missing, X is None')
            X, y = loader()

        if x_is_index:
            X = X[index]
            y = y[index]

        # setup_kfold_patch(10)

        result = _fit_and_score(estimator=estimator,
                                X=X,
                                y=y,
                                verbose=False,
                                parameters=parameters,
                                fit_params=fit_params,
                                return_parameters=True,
                                train=train,
                                test=test,
                                scorer=scorer)

        if fit_callback:
            fit_callback(
                gs_index, {
                    'estimator': estimator,
                    'X': X,
                    'y': y,
                    'parameters': parameters,
                    'fit_params': fit_params,
                    'train': train,
                    'test': test,
                    'scorer': scorer
                })

        return gs_index, result
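
A hedged usage sketch for the task function above. Everything here is illustrative: the loader, data, and grid are stand-ins, and in the original project the tuples are presumably built and dispatched by GridSearchCVParallel itself. It only shows the shape of index_train_test_parameters and how the keyword arguments can be bound:

import numpy as np
from functools import partial
from sklearn.cross_validation import KFold
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import get_scorer
from sklearn.svm import SVC

def toy_loader():
    # Stand-in for the remote-side loader argument.
    X = np.random.rand(60, 4)
    y = np.random.randint(0, 2, size=60)
    return X, y

# Each task is (grid-search index, (train indices, test indices, parameter dict)).
tasks = list(enumerate((train, test, params)
                       for params in ParameterGrid({'C': [0.1, 1.0]})
                       for train, test in KFold(60, n_folds=3)))

worker = partial(my_fit_and_score,
                 estimator=SVC(),
                 scorer=get_scorer('accuracy'),
                 x_is_index=False,
                 loader=toy_loader)
results = [worker(task) for task in tasks]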
Example #14
def run_experiments(experiments, methods, cache={}, grid_test=False, ignore_cache=False):
    """
    TODO
    """

    def _check_cache(cache, mode, clf_name, feature_hash, method):
        if mode not in cache:
            cache[mode] = {}

        if clf_name not in cache[mode]:
            cache[mode][clf_name] = {}

        if feature_hash not in cache[mode][clf_name]:
            cache[mode][clf_name][feature_hash] = {}

        if method in cache[mode][clf_name][feature_hash]:
            return cache[mode][clf_name][feature_hash][method]

        return None

    def _most_common_dict(param_list):
        param_list = [frozenset(p.items()) for p in param_list]
        mode = max(set(param_list), key=param_list.count)
        return dict(mode)

    # Defaults
    labels = pd.read_csv(LABEL_FILE, index_col=0)
    with open(CV_PARAMS_FILE, 'r') as fp:
        cv_params_dict = json.load(fp)

    results = []

    for exp_id, clf_name, feature_files in tqdm(experiments, total=len(experiments)):
        feature_files = sorted(feature_files.split())
        features = load_features(feature_files)

        logger.info('Experiment {0} ({1} - {2})'.format(exp_id, clf_name, feature_files))

        for method in tqdm(methods, total=len(methods), leave=False):

            splits, iterations = METHODS[method]
            base_clf = classifier_dict[clf_name]
            clf = clone(base_clf)
            feature_hash = ":".join(feature_files)
            cv_param_grid = cv_params_dict[clf_name]

            logger.info('CV ParamGrid {0}'.format(cv_param_grid))

            logger.info('Method {0}'.format(method))

            # Check CV cache

            scores = []
            cv_params = []
            cv_cache = _check_cache(cache, 'cv', clf_name, feature_hash, method)

            cv_cache = not ignore_cache and cv_cache

            if cv_cache:
                cv_params = _check_cache(cache, 'cv', clf_name, feature_hash, method)
                logger.info('CV_CACHE Params: {0}'.format(cv_params))

            test_params = []
            test_cache = _check_cache(cache, 'test', clf_name, feature_hash, method)

            test_cache = not ignore_cache and test_cache

            if test_cache:
                test_params = _check_cache(cache, 'test', clf_name, feature_hash, method)
                logger.info('TEST_CACHE Params: {0}'.format(test_params))

            for j in tqdm(range(iterations), total=iterations, leave=False):
                parameters = _check_cache(cache, 'cv', clf_name, feature_hash, method)

                train_ix, test_ix = split_labels(labels, splits)
                X_train, X_test, y_train, y_test = train_test_unpack(features, labels, train_ix, test_ix)

                if isinstance(cv_params, dict):
                    clf.set_params(**cv_params)
                    clf, parameters, (t, r) = train_classifier(clf, X_train, y_train)
                else:
                    logger.info('It {0}: CV_START search'.format(j))

                    train_labels = labels.iloc[train_ix]
                    cv_iter = cv_split_labels(train_labels, method)

                    clf, parameters, (t, r) = train_classifier(clf, X_train, y_train, cv_param_grid, cv_iter)
                    cv_params.append(parameters)

                    logger.info('It {0}: CV_END search. Params: {1}'.format(j, parameters))

                logger.info('TRAIN_THR t:{0}, r:{1}'.format(t, r))
                y_pred = predict(clf, X_train, t, r)
                sc_train, (sc_train_low, sc_train_up) = cinc_confidence_interval(y_train, y_pred, t, r, 100)
                y_pred = predict(clf, X_test,  t, r)
                sc_test,  (sc_test_low, sc_test_up)   = cinc_confidence_interval(y_test,  y_pred, t, r, 100)

                if grid_test:
                    X, y = features.values, labels.values.squeeze()
                    if isinstance(test_params, dict):
                        sc_test_best, _, _ = _fit_and_score(clf, X, y, cinc_cv_scorer, train_ix, test_ix, 0, test_params, None)

                    else:
                        logger.info('It {0}: TEST_START search'.format(j))
                        cv_param_grid = cv_params_dict[clf_name]
                        sc_test_best, parameters = search_test_params(base_clf, cv_param_grid, X, y,
                                                                      train_ix, test_ix,
                                                                      scoring=cinc_cv_scorer)
                        test_params.append(parameters)

                        logger.info('It {0}: TEST_END search. Params: {1}'.format(j, parameters))
                else:
                    sc_test_best = np.nan

                scores.append(np.round([sc_train, sc_train_low, sc_train_up, sc_test, sc_test_low, sc_test_up, sc_test_best], 3))
                logger.info('It {0}/{1} - Scores {2}'.format(j, iterations - 1, scores[-1]))

            scores = np.mean(np.array(scores), axis=0)

            if not cv_cache:
                best_cv_params = _most_common_dict(cv_params)
                cache['cv'][clf_name][feature_hash][method] = best_cv_params
                logger.info('BEST_CV params: {0}'.format(best_cv_params))

            if not test_cache:
                best_test_params = _most_common_dict(test_params)
                cache['test'][clf_name][feature_hash][method] = best_test_params
                logger.info('BEST_TEST params: {0}'.format(best_test_params))

            cv_test_gap = np.round(scores[-1] - scores[-4], 3)
            scores = np.array([exp_id, method] + list(scores))
            results.append(scores)

            logger.info('Mean Scores: {0}'.format(scores))
            if grid_test:
                logger.info('CV_TEST gap: {0}'.format(cv_test_gap))

    df_results = pd.DataFrame(data=results, columns=HEADER)
    return df_results, cache