def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Run the logistic-regression evaluation and persist its results.

    Depending on the flags, a plain grid search over ``PARAM_GRID``
    and/or a nested cross-validation (used as an estimate of the
    generalization error; the original notes mention f1_macro) is run
    on a freshly built ``SMSGuruModel``.

    Parameters
    ----------
    gridsearch : boolean
        If True, the grid search is performed and its ``cv_results_``
        are handed to ``shared.save_and_report``.
    gen_error : boolean
        If True, the nested cross-validation is performed and its
        scores are handed to ``shared.save_and_report`` under the name
        ``gen_error.npy``.
    memory : boolean
        Forwarded unchanged as the ``memory`` argument of
        ``SMSGuruModel``.

    Returns
    -------
    None. Results are only passed on to ``shared.save_and_report``.
    """
    out_folder = 'logreg'

    # Build a fresh model for every call so that `memory` is honoured.
    guru = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None,
                              memory=memory)
    guru.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        guru.gridsearch(param_grid=PARAM_GRID, n_jobs=shared.N_JOBS,
                        CV=shared.CV)
        # Read the results only after the search has been run.
        cv_results = guru.grid_search_.cv_results_
        shared.save_and_report(results=cv_results, folder=out_folder)

    if gen_error:
        shared.save_and_report(
            results=guru.nested_cv(param_grid=PARAM_GRID, CV=shared.CV),
            folder=out_folder,
            name='gen_error.npy')
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Run the LDA + SVM evaluation and persist its results.

    Depending on the flags, a plain grid search over ``PARAM_GRID_DIM``
    and/or a nested cross-validation over ``PARAM_GRID`` (used as an
    estimate of the generalization error; the original notes mention
    f1_macro) is run on a freshly built ``SMSGuruModel``.

    Parameters
    ----------
    gridsearch : boolean
        If True, the grid search is performed and its ``cv_results_``
        are handed to ``shared.save_and_report``.
    gen_error : boolean
        If True, the nested cross-validation is performed and its
        scores are handed to ``shared.save_and_report`` under the name
        ``gen_error.npy``.
    memory : boolean
        Forwarded unchanged as the ``memory`` argument of
        ``SMSGuruModel``.

    Returns
    -------
    None. Results are only passed on to ``shared.save_and_report``.
    """
    out_folder = 'lda_svm'

    # Build a fresh model for every call so that `memory` is honoured.
    guru = model.SMSGuruModel(classifier=CLASSIFIER,
                              pre_reduction=PRE_REDUCTION,
                              reduction=LDA(),
                              memory=memory)
    guru.set_question_loader(subcats=shared.SUBCATS)

    if gridsearch:
        # The plain grid search uses the grid that includes the
        # number of components.
        guru.gridsearch(param_grid=PARAM_GRID_DIM, n_jobs=shared.N_JOBS,
                        CV=shared.CV)
        cv_results = guru.grid_search_.cv_results_
        shared.save_and_report(results=cv_results, folder=out_folder)

    if gen_error:
        # since in this case the higher the dimension the better the
        # estimator, the lower dimensions are not included in this
        # search (PARAM_GRID instead of PARAM_GRID_DIM)
        shared.save_and_report(
            results=guru.nested_cv(param_grid=PARAM_GRID, CV=shared.CV),
            folder=out_folder,
            name='gen_error.npy')
""" The :mod: `lda` implements the model and the constants needed for the evaluation of LDA as classifier""" # Author: Ingo Guehring from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.feature_extraction.text import TfidfTransformer import evaluation.shared as shared import model MODEL = model.SMSGuruModel(classifier=LDA(), reduction=None, memory=True) # PARAM_GRID = {} # PARAM_GRID = dict(classifier__solver=['svd']) PARAM_GRID = {'union__bow__vectorize__min_df': shared.MIN_DF, 'union__bow__tfidf': [None, TfidfTransformer()]} def evaluate(gridsearch=True, gen_error=True): """Evaluate model Compute either an estimate for the generalization error for f1_macro with a nested gridsearch or evaluate the parameter grid in a simple gridsearch. Parameters ----------- gridsearch : boolean, if True the gridsearch is performed
# Script section: fit the LDA-based model on the training part of the first
# stratified fold and transform that fold's test questions.
from time import gmtime, strftime
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold
import model
import evaluation.shared as shared

# LDA restricted to two components is used as the classifier; no separate
# reduction step is configured.
MODEL = model.SMSGuruModel(classifier=LDA(n_components=2), reduction=None,
                           memory=True)
MODEL.set_question_loader(subcats=shared.SUBCATS)

# Stratified 5-fold splitter; only its first split is actually used below.
skf = StratifiedKFold(n_splits=5)

# Data exposed by the question loader that was attached above.
X = MODEL.question_loader_.questions
y = MODEL.question_loader_.categoryids
categories = MODEL.question_loader_.categories

# Stays None if the splitter yields no fold at all.
projected_test = None

# `i` acts as a "first fold already handled" flag: it is 0 before the first
# fold and 1 afterwards, so the remaining folds are iterated but skipped.
i = 0
for train_index, test_index in skf.split(X, y):
    if i == 0:
        # Converted to numpy arrays so the index arrays from the splitter
        # can be used for selection.
        X_test = np.array(X)[test_index]
        y_test = np.array(y)[test_index]
        X_train = np.array(X)[train_index]
        y_train = np.array(y)[train_index]
        MODEL.model.fit(X_train, y_train)
        # NOTE(review): presumably the 2-component LDA projection of the
        # test questions, intended for plotting — confirm.
        projected_test = MODEL.model.transform(X_test)
        i = 1

# NOTE(review): presumably the output folder used by code further down in
# the file — confirm.
folder = 'lda/plot'
# this could also be used: classifier_kernel=kernels, PARAM_GRID_DIM = [ dict(reduce_dim__n_components=N_COMPONENTS_RANGE, classifier__estimator__gamma=GAMMA_RANGE, classifier__estimator__C=C_RANGE) ] PARAM_GRID = [ dict(classifier__estimator__gamma=GAMMA_RANGE, classifier__estimator__C=C_RANGE) ] # model for use in train_apply_classifier MODEL = model.SMSGuruModel(classifier=CLASSIFIER, pre_reduction=PRE_REDUCTION, reduction=LDA()) def evaluate(gridsearch=True, gen_error=True, memory=True): """Evaluate model Compute either an estimate for the generalization error for f1_macro with a nested gridsearch or evaluate the parameter grid in a simple gridsearch. Parameters ----------- gridsearch : boolean, if True the gridsearch is performed gen_error : boolean, if True an estimate for the generalization
import numpy as np from sklearn.naive_bayes import MultinomialNB from sklearn.calibration import CalibratedClassifierCV from sklearn.feature_extraction.text import TfidfTransformer import evaluation.shared as shared import model # MODEL = model.SMSGuruModel(classifier=MultinomialNB(), reduction=None, # metadata=False, memory=True) # # PARAM_GRID = dict(classifier__alpha=np.array([1])) MODEL = model.SMSGuruModel(CalibratedClassifierCV(MultinomialNB(), method='isotonic'), reduction=None, metadata=False, memory=True) PARAM_GRID = { 'union__bow__vectorize__min_df': shared.MIN_DF, 'union__bow__tfidf': [None, TfidfTransformer()] } def evaluate(gridsearch=True, gen_error=True): # since there are no hyper parameters to be optimized we only need # the generalization error estimate MODEL.set_question_loader(subcats=shared.SUBCATS) if gridsearch: MODEL.gridsearch(param_grid=PARAM_GRID,
import evaluation.shared as shared import model import model.knn # algorithm='brute' see stackoverflow --> Zotero, since X is sparse, # which is good for metric (no real metric) CLASSIFIER = KNeighborsClassifier(weights=model.knn.cosine_dist_to_sim, metric=model.knn.cosine_semi_metric, n_jobs=shared.N_JOBS) # grid N_NEIGHBORS_RANGE = np.arange(5, 65, 5) PARAM_GRID = dict(classifier__n_neighbors=N_NEIGHBORS_RANGE) MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None, metadata=False) def evaluate(gridsearch=True, gen_error=True, memory=True): """Evaluate model Compute either an estimate for the generalization error for f1_macro with a nested gridsearch or evaluate the parameter grid in a simple gridsearch. Parameters ----------- gridsearch : boolean, if True the gridsearch is performed gen_error : boolean, if True an estimate for the generalization
from sklearn.linear_model import LogisticRegression import evaluation.shared as shared import model CLASSIFIER = LogisticRegression() # new wider range # C_RANGE = shared.C_RANGE C_RANGE = np.logspace(-5, 5, 11) PARAM_GRID = [dict(classifier__C=C_RANGE)] # model for use in train_apply_classifier MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None) def evaluate(gridsearch=True, gen_error=True, memory=True): """Evaluate model Compute either an estimate for the generalization error for f1_macro with a nested gridsearch or evaluate the parameter grid in a simple gridsearch. Parameters ----------- gridsearch : boolean, if True the gridsearch is performed gen_error : boolean, if True an estimate for the generalization error is computed.
# from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier import evaluation.shared as shared import model # CLASSIFIER = RandomForestClassifier(n_estimators=500) CLASSIFIER = GradientBoostingClassifier(n_estimators=500) # MAX_FEATURES_RANGE = [.2, .4, .6, .8, 'log2', 'sqrt'] MAX_FEATURES_RANGE = ['log2'] PARAM_GRID = [dict(classifier__max_features=MAX_FEATURES_RANGE)] # model for use in train_apply_classifier MODEL = model.SMSGuruModel(classifier=CLASSIFIER, metadata=False) def evaluate(gridsearch=True, gen_error=True, memory=True): """Evaluate model Compute either an estimate for the generalization error for f1_macro with a nested gridsearch or evaluate the parameter grid in a simple gridsearch. Parameters ----------- gridsearch : boolean, if True the gridsearch is performed gen_error : boolean, if True an estimate for the generalization error is computed.