def evaluate(gridsearch=True, gen_error=True):
    """Evaluate model

    Compute either an estimate for the generalization error for f1_macro
    with a nested gridsearch or evaluate the parameter grid in a simple
    gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed
    gen_error : boolean, if True an estimate for the generalization error
        is computed.

    Returns
    -------
    NOTHING but SAVES the results of the performed computations
    """
    # since there are no hyper parameters to be optimized we only need
    # the generalization error estimate
    MODEL.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(
            results=MODEL.grid_search_.cv_results_, folder='lda')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores, folder='lda',
                               name='gen_error.npy')
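# NOTE: MODEL, PARAM_GRID and shared are module-level names that are not part
# of this snippet. A minimal sketch of how they might be set up, assuming the
# SMSGuruModel signature used in the other evaluation scripts and pipeline
# parameter names analogous to the ensemble grid; this is illustrative only,
# not the project's actual configuration.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfTransformer

import model   # project module providing SMSGuruModel (import path assumed)
import shared  # project module with shared constants (import path assumed)

CLASSIFIER = LDA()  # LDA itself exposes no hyperparameters tuned here
MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None)
PARAM_GRID = {'union__bow__vectorize__min_df': shared.MIN_DF,
              'union__bow__tfidf': [None, TfidfTransformer()]}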
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for f1_macro
    with a nested gridsearch or evaluate the parameter grid in a simple
    gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed
    gen_error : boolean, if True an estimate for the generalization error
        is computed.
    memory : boolean, if True the memory option is used

    Returns
    -------
    NOTHING but SAVES the results of the performed computations
    """
    MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None,
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(
            results=MODEL.grid_search_.cv_results_, folder='logreg')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores, folder='logreg',
                               name='gen_error.npy')
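# Usage sketch (assuming this module is run as a script; the actual driver
# code is not shown in this snippet):
if __name__ == '__main__':
    # evaluate the parameter grid and estimate the generalization error
    evaluate(gridsearch=True, gen_error=True, memory=True)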
def evaluate(gridsearch=True, gen_error=True):
    """Evaluate model

    Compute either an estimate for the generalization error for f1_macro
    with a nested gridsearch or evaluate the parameter grid in a simple
    gridsearch, analogous to the other evaluation scripts.
    """
    # since there are no hyper parameters to be optimized we only need
    # the generalization error estimate
    MODEL.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(results=MODEL.grid_search_.cv_results_,
                               folder='multinb')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores, folder='multinb',
                               name='gen_error.npy')
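# NOTE: MODEL and PARAM_GRID are module-level names that are not part of this
# snippet. A minimal sketch under assumptions, mirroring the 'mnb' entries of
# the ensemble grid below; names and values are illustrative only.
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

import model   # project module providing SMSGuruModel (import path assumed)
import shared  # project module with shared constants (import path assumed)

MODEL = model.SMSGuruModel(classifier=MultinomialNB(), reduction=None)
PARAM_GRID = {'union__bow__vectorize__min_df': shared.MIN_DF,
              'union__bow__tfidf': [None, TfidfTransformer()]}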
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for f1_macro
    with a nested gridsearch or evaluate the parameter grid in a simple
    gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed
    gen_error : boolean, if True an estimate for the generalization error
        is computed.
    memory : boolean, if True the memory option is used

    Returns
    -------
    NOTHING but SAVES the results of the performed computations
    """
    MODEL = model.SMSGuruModel(classifier=CLASSIFIER,
                               pre_reduction=PRE_REDUCTION,
                               reduction=LDA(),
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID_DIM,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(results=MODEL.grid_search_.cv_results_,
                               folder='lda_svm')

    if gen_error:
        # since in this case the higher the dimension the better the
        # estimator performs, we do not include the lower dimensions in
        # this search
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores, folder='lda_svm',
                               name='gen_error.npy')
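# NOTE on the two grids used above: PARAM_GRID_DIM additionally varies the
# LDA output dimension, while PARAM_GRID, used for the nested CV, leaves it
# at the (highest) best-performing value. A sketch under assumptions; the
# parameter names, C range and dimensions below are illustrative only.
import numpy as np

import shared  # project module with shared constants (import path assumed)

C_RANGE = np.logspace(-5, 5, 11)  # assumed SVM regularization range
PARAM_GRID = {'classifier__base_estimator__C': C_RANGE,
              'union__bow__vectorize__min_df': shared.MIN_DF}
PARAM_GRID_DIM = dict(PARAM_GRID, reduction__n_components=[100, 500, 1000])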
corr_micro = []
corr_macro = []
for train_index, test_index in skf.split(questions, categoryids):
    q_train, q_test = questions[train_index], questions[test_index]
    cat_train, cat_test = categoryids[train_index], categoryids[test_index]

    # fit all classifiers
    for clf in clfs:
        clf.fit(q_train, cat_train)

    # predict_proba for all classifiers with best param config
    probas = [clf.predict_proba(q_test) for clf in clfs]

    # micro-averaged corrcoef
    probas_micro = np.asarray([prob.reshape(-1) for prob in probas])
    corr_micro.append(np.corrcoef(probas_micro))

    # macro-averaged corrcoef
    probas_macro = np.asarray(probas)
    corr_macro_class = [np.corrcoef(probs)
                        for probs in np.rollaxis(probas_macro, 2)]
    corr_macro.append(np.mean(np.asarray(corr_macro_class), 0))

corr_micro = np.mean(np.asarray(corr_micro), axis=0)
corr_macro = np.mean(np.asarray(corr_macro), axis=0)

shared.save_and_report(results={'corr_micro': corr_micro,
                                'corr_macro': corr_macro},
                       folder='ensemble', name='corr')
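# NOTE: the correlation analysis above assumes that skf, clfs, questions and
# categoryids are defined beforehand. A minimal sketch of that setup under
# assumptions (the chosen classifiers, splitter and import paths are
# illustrative, not the project's actual code):
import numpy as np
from sklearn.model_selection import StratifiedKFold

import question_loader as ql  # project module (import path assumed)
import shared                 # project module (import path assumed)

question_loader = ql.QuestionLoader(qfile=shared.QFILE,
                                    catfile=shared.CATFILE,
                                    subcats=False, metadata=True,
                                    verbose=True)
questions = np.asarray(question_loader.questions)
categoryids = np.asarray(question_loader.categoryids)

# the three tuned pipelines whose prediction diversity is measured (assumed)
clfs = [shared.SVM_parentcats, shared.MNB_parentcats, shared.LDA]
skf = StratifiedKFold(n_splits=5)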
def evaluate(subcats=False, comb_method='avg', gen_error=False,
             gridsearch=False,
             save_avg_path='./results/gridsearch/ensemble/raw/'):
    """Run an ensemble method.

    A voting classifier is used with three inner classifiers (SVM, mNB,
    LDA). The ensemble method is then either evaluated in a gridsearch to
    evaluate the associated parameter grid or in a nested gridsearch to get
    an estimate for the generalization error.

    There is also the beginning of a bagging classifier implemented, but it
    is not working so far.

    Parameters
    ----------
    subcats : boolean, if True subcategories are used as labels, else parent
        categories
    comb_method : string, determines the method used to combine the
        classifiers in the voting classifier. Can be either 'mult' or 'avg';
        the classifiers are then combined by multiplying or averaging,
        respectively.
    gen_error : boolean, if True a nested gridsearch is performed to
        estimate the generalization error.
    gridsearch : boolean, if True a gridsearch is performed to find the best
        parameter combination from the associated grid.
    save_avg_path : string, determines where the probabilities from the
        voting classifier are saved during the nested gridsearch.
    """
    print('subcats: {}, comb_method: {}'
          ', save_avg_path: {}'.format(subcats, comb_method, save_avg_path))

    if not os.path.exists(save_avg_path):
        print('create directory: {}'.format(save_avg_path))
        os.makedirs(save_avg_path)

    question_loader = ql.QuestionLoader(qfile=shared.QFILE,
                                        catfile=shared.CATFILE,
                                        subcats=subcats,
                                        metadata=True,
                                        verbose=True)

    cv = 5
    verbose = 100

    if comb_method != 'bagging':
        # If a classifier is changed the grid might have to be changed, too.
        # Put the estimator with the best expected performance at the first
        # position! Then its probability output will be saved!
        SVM = shared.SVM_subcats if subcats else shared.SVM_parentcats
        MNB = shared.MNB_subcats if subcats else shared.MNB_parentcats
        ensemble = VotingClassifierB(estimators=[('svm', SVM),
                                                 ('mnb', MNB),
                                                 ('lda', shared.LDA)],
                                     voting='soft',
                                     comb_method=comb_method,
                                     save_avg_path=save_avg_path)

        # ##################### without gridsearch ###########################
        # scores = cross_val_score(
        #     ensemble, question_loader.questions,
        #     question_loader.categoryids, cv=cv,
        #     scoring='f1_macro', n_jobs=-1, verbose=verbose)
        #
        # shared.save_and_report(
        #     results=scores, folder='ensemble', name='gen_error.npy')

        # ##################### with gridsearch ###############################
        # svm param
        C_RANGE = np.logspace(-5, 5, 11)

        # grid
        PARAM_GRID_l = {'svm__classifier__base_estimator__C': C_RANGE,
                        'svm__union__bow__vectorize__min_df': shared.MIN_DF,
                        'svm__union__bow__tfidf': [None, TfidfTransformer()],
                        'mnb__union__bow__vectorize__min_df': shared.MIN_DF,
                        'mnb__union__bow__tfidf': [None, TfidfTransformer()],
                        'lda__union__bow__vectorize__min_df': shared.MIN_DF,
                        'lda__union__bow__tfidf': [None, TfidfTransformer()]}

        PARAM_GRID_s = {'svm__classifier__base_estimator__C': C_RANGE}

        PARAM_GRID_m = {'svm__classifier__base_estimator__C': C_RANGE,
                        'svm__union__bow__vectorize__min_df': shared.MIN_DF,
                        'mnb__union__bow__vectorize__min_df': shared.MIN_DF,
                        'lda__union__bow__vectorize__min_df': shared.MIN_DF}

        PARAM_GRID = PARAM_GRID_m

        if gridsearch:
            grid = GridSearchCV(estimator=ensemble, cv=cv,
                                param_grid=PARAM_GRID, refit=False,
                                error_score=-1, n_jobs=-1, verbose=verbose,
                                scoring='f1_macro')
            grid.fit(question_loader.questions, question_loader.categoryids)

            if subcats:
                name = comb_method + 'subcats' + 'grid.npy'
            else:
                name = comb_method + 'grid.npy'
            shared.save_and_report(results=grid.cv_results_,
                                   folder='ensemble', name=name)

        if gen_error:
            clf = GridSearchCVB(estimator=ensemble,
                                param_grid=PARAM_GRID,
                                cv=cv,
                                n_jobs=-1,
                                scoring='f1_macro',
                                verbose=verbose)
            nested_cv_scores = cross_val_score(clf,
                                               X=question_loader.questions,
                                               y=question_loader.categoryids,
                                               cv=cv,
                                               scoring=f1_macroB,
                                               verbose=verbose)

    if comb_method == 'bagging':
        base_estimator = shared.SVM
        base_estimator.set_params(question_created_at=None,
                                  union__bow__selector=None)
        clf = BaggingClassifier(base_estimator, n_estimators=50,
                                max_samples=1.0)
        X = [pair['question'] for pair in question_loader.questions]
        # X = np.asarray(X).reshape((-1, 1))
        nested_cv_scores = cross_val_score(clf,
                                           X=X,
                                           y=question_loader.categoryids,
                                           cv=cv,
                                           scoring=f1_macroB,
                                           verbose=verbose)

    if gen_error:
        if subcats:
            name = comb_method + 'subcats' + 'gen.npy'
        else:
            name = comb_method + 'gen.npy'
        shared.save_and_report(results=nested_cv_scores, folder='ensemble',
                               name=name)
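# Usage sketch (assumptions about the driver code, which is not shown here):
if __name__ == '__main__':
    # evaluate the parameter grid for both combination methods ...
    evaluate(comb_method='avg', gridsearch=True)
    evaluate(comb_method='mult', gridsearch=True)
    # ... then estimate the generalization error with averaging
    evaluate(comb_method='avg', gen_error=True)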