def func(x):
    parameters = self._list_to_grid_point(x, parameter_iterable)
    n_test_samples = 0
    score = 0
    all_scores = []
    for train, test in cv:
        this_score, this_n_test_samples, _, parameters = \
            _fit_and_score(clone(base_estimator), X, y, self.scorer_,
                           train, test, self.verbose, parameters,
                           self.fit_params, return_parameters=True,
                           error_score=self.error_score)
        all_scores.append(this_score)
        if self.iid:
            this_score *= this_n_test_samples
            n_test_samples += this_n_test_samples
        score += this_score
    if self.iid:
        score /= float(n_test_samples)
    else:
        score /= float(n_folds)
    grid_scores.append(
        _CVScoreTuple(parameters, score, np.array(all_scores)))
    # print 'In func:', x, score
    return score

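# Hedged illustration of the iid weighting used in func above: with iid=True
# each fold's score is weighted by its number of test samples before averaging;
# with iid=False the folds are averaged uniformly. The numbers are made up.
fold_scores = [0.80, 0.90]
fold_sizes = [40, 60]
iid_score = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / float(sum(fold_sizes))  # 0.86
uniform_score = sum(fold_scores) / float(len(fold_scores))                                # 0.85
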
def fas_mp(base_estimator, key, scorer, train, test, verbose, parameters,
           fit_params, return_parameters, error_score):
    samples = pickle.loads(red.get(key))
    return _fit_and_score(base_estimator, samples['X'], samples['y'], scorer,
                          train, test, verbose, parameters, fit_params,
                          return_parameters=True, error_score=error_score)

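# Hedged usage sketch for fas_mp, assuming a running Redis server, a
# module-level client named `red` (which the function looks up as a global),
# and the pre-0.18 sklearn.cross_validation API. The key name, dataset and
# parameters below are illustrative only.
import pickle

import numpy as np
import redis
from sklearn.cross_validation import KFold, _fit_and_score  # _fit_and_score is needed by fas_mp
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.svm import SVC

red = redis.StrictRedis()  # shared store the worker function reads from
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
red.set('samples', pickle.dumps({'X': X, 'y': y}))

scorer = make_scorer(accuracy_score)
results = [fas_mp(SVC(), 'samples', scorer, train, test, 0,
                  {'C': 1.0}, {}, True, 'raise')
           for train, test in KFold(len(y), n_folds=3)]
# each entry is [test_score, n_test_samples, scoring_time, parameters]
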
def score(self, test_parameter):
    """
    The score function to call in order to evaluate the quality
    of the parameter test_parameter

    Parameters
    ----------
    test_parameter : dict, the parameter to test

    Returns
    -------
    score : the CV score, either the list of all CV results or
        the mean (depending on score_format)
    """
    if not self._callable_estimator:
        cv = check_cv(self.cv, self.X, self.y,
                      classifier=is_classifier(self.estimator))
        cv_score = [_fit_and_score(clone(self.estimator), self.X, self.y,
                                   self.scorer_, train, test, False,
                                   test_parameter, self.fit_params,
                                   return_parameters=True)
                    for train, test in cv]

        n_test_samples = 0
        mean_score = 0
        detailed_score = []
        for tmp_score, tmp_n_test_samples, _, _ in cv_score:
            detailed_score.append(tmp_score)
            tmp_score *= tmp_n_test_samples
            n_test_samples += tmp_n_test_samples
            mean_score += tmp_score
        mean_score /= float(n_test_samples)

        if self.score_format == 'avg':
            score = mean_score
        else:  # format == 'cv'
            score = detailed_score
    else:
        if self.score_format == 'avg':
            score = [self.estimator(test_parameter)]
        else:  # format == 'cv'
            score = self.estimator(test_parameter)

    return score

def _fit_and_score_helper(args):
    import numpy as np
    from sklearn.externals import six
    from sklearn.externals.joblib import load
    from sklearn.cross_validation import _fit_and_score

    args = list(args)
    if isinstance(args[1], six.string_types):
        # X was passed as a file path: memory-map it instead of copying it.
        args[1] = load(args[1], mmap_mode='c')
    if isinstance(args[1], np.memmap):
        args[1] = np.asarray(args[1])
    return _fit_and_score(*args)

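# Hedged usage sketch for _fit_and_score_helper: X is dumped to disk once with
# joblib and each worker re-loads it memory-mapped, so the array is not pickled
# per task. The file path, estimator, scorer and pool size are illustrative
# assumptions; on platforms without fork this should run under an
# `if __name__ == '__main__':` guard.
from multiprocessing import Pool

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.externals.joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score

X = np.random.rand(1000, 20)
y = np.random.randint(0, 2, size=1000)
dump(X, '/tmp/X.joblib')  # workers reload this path with mmap_mode='c'

scorer = make_scorer(accuracy_score)
tasks = [(LogisticRegression(), '/tmp/X.joblib', y, scorer, train, test, 0, None, None)
         for train, test in KFold(len(y), n_folds=5)]

pool = Pool(2)
fold_results = pool.map(_fit_and_score_helper, tasks)  # [[test_score, n_test, time], ...]
pool.close()
pool.join()
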
def run(self, X, Y):
    """
    Lengthy and compute-intensive part.

    Returns a Result object.
    """
    train_score, test_score, n_test_samples, scoring_time = _fit_and_score(
        self.estimator, X, Y, self.scorer, self.train_index, self.test_index,
        self.verbose, self.meta_parameter_set, None, self.eval_on_training)
    return Result(self.get_unique_token(), train_score, test_score,
                  scoring_time, self.meta_parameter_set)

def score(self, test_parameter):
    """
    The score function to call in order to evaluate the quality
    of the parameter test_parameter

    Parameters
    ----------
    `test_parameter` : dict, the parameter to test

    Returns
    -------
    `score` : the CV score, either the list of all cv results or
        the mean (depending on score_format)
    """
    if not self._callable_estimator:
        cv = check_cv(self.cv, self.X, self.y,
                      classifier=is_classifier(self.estimator))
        cv_score = [_fit_and_score(clone(self.estimator), self.X, self.y,
                                   self.scorer_, train, test, False,
                                   test_parameter, self.fit_params,
                                   return_parameters=True)
                    for train, test in cv]

        n_test_samples = 0
        mean_score = 0
        detailed_score = []
        for tmp_score, tmp_n_test_samples, _, _ in cv_score:
            detailed_score.append(tmp_score)
            tmp_score *= tmp_n_test_samples
            n_test_samples += tmp_n_test_samples
            mean_score += tmp_score
        mean_score /= float(n_test_samples)

        if self.score_format == 'avg':
            score = mean_score
        else:  # format == 'cv'
            score = detailed_score
    else:
        if self.score_format == 'avg':
            score = [self.estimator(test_parameter)]
        else:  # format == 'cv'
            score = self.estimator(test_parameter)

    return score

def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list,
                            dynamic_X, y=None, scoring=None, cv=None, verbose=0,
                            fit_params=None):
    print("dynamic cross val with %s" % (esa_feature_list + unigram_feature_list))

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y,
                                   classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)

    scores = []
    cross_val_step = 0
    for train, test in cv:
        fv_copy = copy.deepcopy(fv)

        # rebuild X at every step
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec

            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(
            cross_validation.clone(estimator), X, y, scorer, train, test,
            verbose, None, fit_params))

        cross_val_step += 1

    return np.array(scores)[:, 0]

def process_batch(self, work_batch):
    fit_params = self.fit_params if self.fit_params is not None else {}

    LOG.debug("Node %d received %d work items", comm_rank, len(work_batch))

    results = []
    for fold_id, train_index, test_index, parameters in work_batch:
        ret = _fit_and_score(clone(self.estimator), self._data_X, self._data_y,
                             self.scorer, train_index, test_index,
                             self.verbose, parameters, fit_params)

        result = parameters.copy()
        result['score'] = ret[0]
        result['n_samples_test'] = ret[1]
        result['scoring_time'] = ret[2]
        result['fold'] = fold_id
        results.append(result)

    LOG.debug("Node %d is done with fold %d", comm_rank, fold_id)
    return results

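# Hedged sketch of the data process_batch above consumes and produces: each
# work item is (fold_id, train_index, test_index, parameters) and each result
# is the parameter dict extended with score, n_samples_test, scoring_time and
# fold. Indices and values are illustrative only.
import numpy as np

work_batch = [
    (0, np.arange(0, 80), np.arange(80, 100), {'C': 1.0}),
    (1, np.arange(20, 100), np.arange(0, 20), {'C': 10.0}),
]
# a returned batch would look roughly like:
# [{'C': 1.0, 'score': 0.91, 'n_samples_test': 20, 'scoring_time': 0.004, 'fold': 0},
#  {'C': 10.0, 'score': 0.88, 'n_samples_test': 20, 'scoring_time': 0.005, 'fold': 1}]
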
def my_fit_and_score(train_test_parameters, estimator=None, X=None, y=None,
                     verbose=False, fit_params=None, return_parameters=True,
                     scorer=None, x_is_index=True, names=('X', 'y')):
    from runner import bac_scorer, bac_error, confusion_matrix, process_cm

    train, test, parameters = train_test_parameters

    if x_is_index:
        index = X
        X = None

    if X is None:
        if 'X' in globals():
            X = globals()[names[0]]
            y = globals()[names[1]]
        else:
            X, y = loader(names[0], names[1])()
            globals()[names[0]] = X
            globals()[names[1]] = y

    if x_is_index:
        X = X[index]
        y = y[index]

    return _fit_and_score(estimator=estimator, X=X, y=y, verbose=verbose,
                          parameters=parameters, fit_params=fit_params,
                          return_parameters=return_parameters,
                          train=train, test=test, scorer=bac_scorer)

def _evalFunction(individual, name_values, X, y, scorer, cv, iid, fit_params,
                  verbose=0, error_score='raise'):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0

    paramkey = str(individual)
    if paramkey in score_cache:
        score = score_cache[paramkey]
    else:
        for train, test in cv:
            _score, _, _ = _fit_and_score(estimator=individual.est, X=X, y=y,
                                          scorer=scorer, train=train, test=test,
                                          verbose=verbose, parameters=parameters,
                                          fit_params=fit_params,
                                          error_score=error_score)
            if iid:
                score += _score * len(test)
                n_test += len(test)
            else:
                score += _score
                n_test += 1
        score /= float(n_test)
        score_cache[paramkey] = score

    return (score,)

def my_fit_and_score(index_train_test_parameters, estimator=None, loader=None,
                     fit_params=None, scorer=None, fit_callback=None):
    from sklearn.cross_validation import _fit_and_score

    gs_index, (train, test, parameters) = index_train_test_parameters

    if loader is None:
        raise ValueError('loader is missing')

    X, y = loader()

    estimator = clone(estimator)
    result = _fit_and_score(estimator=estimator, X=X, y=y, verbose=False,
                            parameters=parameters, fit_params=fit_params,
                            return_parameters=True, train=train, test=test,
                            scorer=scorer)

    if fit_callback:
        fit_callback(gs_index, {
            'estimator': estimator,
            'X': X,
            'y': y,
            'parameters': parameters,
            'fit_params': fit_params,
            'train': train,
            'test': test,
            'scorer': scorer
        })

    return gs_index, result

def _evalFunction(individual, name_values, X, y, scorer, cv, iid, fit_params,
                  verbose=0, error_score='raise'):
    parameters = _individual_to_params(individual, name_values)
    score = 0
    n_test = 0

    for train, test in cv:
        paramkey = str(individual)
        if paramkey in score_cache:
            _score = score_cache[paramkey]
        else:
            _score, _, _ = _fit_and_score(estimator=individual.est, X=X, y=y,
                                          scorer=scorer, train=train, test=test,
                                          verbose=verbose, parameters=parameters,
                                          fit_params=fit_params,
                                          error_score=error_score)
            score_cache[paramkey] = _score

        if iid:
            score += _score * len(test)
            n_test += len(test)
        else:
            score += _score
            n_test += 1

    score /= float(n_test)
    return (score,)

def my_fit_and_score(index_train_test_parameters, estimator=None, X=None, y=None,
                     fit_params=None, scorer=None, x_is_index=True, loader=None,
                     fit_callback=None):
    """
    Function representing a single task execution for GridSeearchCVParallel;
    it is executed on remote machines.

    :param index_train_test_parameters: tuple of (task index, (train index,
        test index, hyperparameters))
    :param estimator: sklearn's BaseEstimator subtype
    :param X: the data, or an index into it (if x_is_index is True), or None
    :param y: the target variable
    :param fit_params: parameters for the fit function of the estimator
    :param scorer: sklearn scorer(estimator, X, y)
    :param x_is_index: True if X is an index used to select a subset of the data
    :param loader: function to load data on remote machines
    :return: tuple of (index, result of sklearn.cross_validation._fit_and_score)
    """
    from sklearn.cross_validation import _fit_and_score

    gs_index, (train, test, parameters) = index_train_test_parameters

    if x_is_index:
        index = X
        X = None

    if X is None:
        if loader is None:
            raise ValueError('loader is missing, X is None')
        X, y = loader()

    if x_is_index:
        X = X[index]
        y = y[index]

    # setup_kfold_patch(10)
    result = _fit_and_score(estimator=estimator, X=X, y=y, verbose=False,
                            parameters=parameters, fit_params=fit_params,
                            return_parameters=True, train=train, test=test,
                            scorer=scorer)

    if fit_callback:
        fit_callback(gs_index, {
            'estimator': estimator,
            'X': X,
            'y': y,
            'parameters': parameters,
            'fit_params': fit_params,
            'train': train,
            'test': test,
            'scorer': scorer
        })

    return gs_index, result

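# Hedged sketch of the dispatcher side for my_fit_and_score above: one task per
# (parameter setting, fold), with the fixed arguments bound via functools.partial.
# The estimator, parameter grid, scorer and iris_loader are illustrative
# assumptions; in practice the worker calls would be shipped to remote machines.
from functools import partial

from sklearn.cross_validation import KFold
from sklearn.datasets import load_iris
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.svm import SVC

def iris_loader():
    data = load_iris()
    return data.data, data.target

folds = list(KFold(150, n_folds=3))  # iris has 150 samples
tasks = []
for i, params in enumerate(ParameterGrid({'C': [0.1, 1.0]})):
    for j, (train, test) in enumerate(folds):
        tasks.append((i * len(folds) + j, (train, test, params)))

worker = partial(my_fit_and_score, estimator=SVC(), X=None, y=None,
                 fit_params=None, scorer=make_scorer(accuracy_score),
                 x_is_index=False, loader=iris_loader)
results = [worker(task) for task in tasks]
# each entry is (task index, [test_score, n_test_samples, scoring_time, parameters])
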
def run_experiments(experiments, methods, cache={}, grid_test=False, ignore_cache=False):
    """
    TODO
    """
    def _check_cache(cache, mode, clf_name, feature_hash, method):
        if mode not in cache:
            cache[mode] = {}
        if clf_name not in cache[mode]:
            cache[mode][clf_name] = {}
        if feature_hash not in cache[mode][clf_name]:
            cache[mode][clf_name][feature_hash] = {}
        if method in cache[mode][clf_name][feature_hash]:
            return cache[mode][clf_name][feature_hash][method]
        return None

    def _most_common_dict(param_list):
        param_list = [frozenset(p.items()) for p in param_list]
        mode = max(set(param_list), key=param_list.count)
        return dict(mode)

    # Defaults
    labels = pd.read_csv(LABEL_FILE, index_col=0)
    with open(CV_PARAMS_FILE, 'r') as fp:
        cv_params_dict = json.load(fp)

    results = []
    for exp_id, clf_name, feature_files in tqdm(experiments, total=len(experiments)):
        feature_files = sorted(feature_files.split())
        features = load_features(feature_files)
        logger.info('Experiment {0} ({1} - {2})'.format(exp_id, clf_name, feature_files))

        for method in tqdm(methods, total=len(methods), leave=False):
            splits, iterations = METHODS[method]
            base_clf = classifier_dict[clf_name]
            clf = clone(base_clf)
            feature_hash = ":".join(feature_files)
            cv_param_grid = cv_params_dict[clf_name]
            logger.info('CV ParamGrid {0}'.format(cv_param_grid))
            logger.info('Method {0}'.format(method))

            # Check CV cache
            scores = []
            cv_params = []
            cv_cache = _check_cache(cache, 'cv', clf_name, feature_hash, method)
            cv_cache = not ignore_cache and cv_cache
            if cv_cache:
                cv_params = _check_cache(cache, 'cv', clf_name, feature_hash, method)
                logger.info('CV_CACHE Params: {0}'.format(cv_params))

            test_params = []
            test_cache = _check_cache(cache, 'test', clf_name, feature_hash, method)
            test_cache = not ignore_cache and test_cache
            if test_cache:
                test_params = _check_cache(cache, 'test', clf_name, feature_hash, method)
                logger.info('TEST_CACHE Params: {0}'.format(test_params))

            for j in tqdm(range(iterations), total=iterations, leave=False):
                parameters = _check_cache(cache, 'cv', clf_name, feature_hash, method)
                train_ix, test_ix = split_labels(labels, splits)
                X_train, X_test, y_train, y_test = train_test_unpack(features, labels,
                                                                     train_ix, test_ix)

                if isinstance(cv_params, dict):
                    clf.set_params(**cv_params)
                    clf, parameters, (t, r) = train_classifier(clf, X_train, y_train)
                else:
                    logger.info('It {0}: CV_START search'.format(j))
                    train_labels = labels.iloc[train_ix]
                    cv_iter = cv_split_labels(train_labels, method)
                    clf, parameters, (t, r) = train_classifier(clf, X_train, y_train,
                                                               cv_param_grid, cv_iter)
                    cv_params.append(parameters)
                    logger.info('It {0}: CV_END search. Params: {1}'.format(j, parameters))

                logger.info('TRAIN_THR t:{0}, r:{1}'.format(t, r))
                y_pred = predict(clf, X_train, t, r)
                sc_train, (sc_train_low, sc_train_up) = cinc_confidence_interval(y_train, y_pred,
                                                                                 t, r, 100)
                y_pred = predict(clf, X_test, t, r)
                sc_test, (sc_test_low, sc_test_up) = cinc_confidence_interval(y_test, y_pred,
                                                                              t, r, 100)

                if grid_test:
                    X, y = features.values, labels.values.squeeze()
                    if isinstance(test_params, dict):
                        sc_test_best, _, _ = _fit_and_score(clf, X, y, cinc_cv_scorer,
                                                            train_ix, test_ix, 0,
                                                            test_params, None)
                    else:
                        logger.info('It {0}: TEST_START search'.format(j))
                        cv_param_grid = cv_params_dict[clf_name]
                        sc_test_best, parameters = search_test_params(base_clf, cv_param_grid,
                                                                      X, y, train_ix, test_ix,
                                                                      scoring=cinc_cv_scorer)
                        test_params.append(parameters)
                        logger.info('It {0}: TEST_END search. Params: {1}'.format(j, parameters))
                else:
                    sc_test_best = np.nan

                scores.append(np.round([sc_train, sc_train_low, sc_train_up,
                                        sc_test, sc_test_low, sc_test_up,
                                        sc_test_best], 3))
                logger.info('It {0}/{1} - Scores {2}'.format(j, iterations - 1, scores[-1]))

            scores = np.mean(np.array(scores), axis=0)

            if not cv_cache:
                best_cv_params = _most_common_dict(cv_params)
                cache['cv'][clf_name][feature_hash][method] = best_cv_params
                logger.info('BEST_CV params: {0}'.format(best_cv_params))
            if not test_cache:
                best_test_params = _most_common_dict(test_params)
                cache['test'][clf_name][feature_hash][method] = best_test_params
                logger.info('BEST_TEST params: {0}'.format(best_test_params))

            cv_test_gap = np.round(scores[-1] - scores[-4], 3)
            scores = np.array([exp_id, method] + list(scores))
            results.append(scores)
            logger.info('Mean Scores: {0}'.format(scores))
            if grid_test:
                logger.info('CV_TEST gap: {0}'.format(cv_test_gap))

    df_results = pd.DataFrame(data=results, columns=HEADER)
    return df_results, cache