Example #1
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
        error is computed.

    memory : boolean, if True memory option is used

    Returns
    ---------
    Nothing. The results of the performed computations are saved to disk.
    """
    MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None,
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID, n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(
            results=MODEL.grid_search_.cv_results_,
            folder='logreg')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='logreg',
                               name='gen_error.npy')
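
For context, here is a minimal sketch of the nested cross-validation pattern that MODEL.nested_cv presumably wraps (an assumption; the SMSGuru internals are not shown in this snippet): an inner GridSearchCV tunes the hyperparameters on each outer training fold, and an outer cross_val_score over that search object estimates the generalization error for f1_macro. Toy data stands in for the SMS Guru questions.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

# toy multi-class data as a stand-in for the vectorized questions
X, y = make_classification(n_samples=300, n_classes=3, n_informative=5,
                           random_state=0)

# inner loop: hyperparameter search on each outer training fold
inner_search = GridSearchCV(LogisticRegression(max_iter=1000),
                            param_grid={'C': np.logspace(-3, 3, 7)},
                            scoring='f1_macro', cv=3)

# outer loop: estimate of the generalization error of the tuned model
nested_scores = cross_val_score(inner_search, X, y, scoring='f1_macro', cv=5)
print(nested_scores.mean(), nested_scores.std())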
Example #2
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
        error is computed.

    Returns
    ---------
    Nothing. The results of the performed computations are saved to disk.
    """

    MODEL = model.SMSGuruModel(classifier=CLASSIFIER,
                               pre_reduction=PRE_REDUCTION,
                               reduction=LDA(),
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID_DIM,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(results=MODEL.grid_search_.cv_results_,
                               folder='lda_svm')

    if gen_error:
        # since in this case the estimator gets better the higher the
        # dimension is, we do not include the lower dimensions in this search
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='lda_svm',
                               name='gen_error.npy')
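
The reduce_dim__n_components entries in PARAM_GRID_DIM address the LDA step inside the model's pipeline. The following is a hedged sketch of that pattern with plain scikit-learn; the step names and the TruncatedSVD pre-reduction are assumptions, not the SMSGuru implementation.

from sklearn.datasets import load_digits
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)

# pre-reduction -> LDA -> classifier, mirroring pre_reduction/reduction above
pipe = Pipeline([('pre_reduction', TruncatedSVD(n_components=30)),
                 ('reduce_dim', LDA()),
                 ('classifier', SVC())])

# step name + '__' + parameter name addresses a parameter of that step
param_grid = dict(reduce_dim__n_components=[2, 5, 9],
                  classifier__gamma=[1e-3, 1e-2],
                  classifier__C=[1, 10])

search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_macro', cv=3)
search.fit(X, y)
print(search.best_params_)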
Example #3
"""
The :mod: `lda` implements the model and the constants needed
for the evalutation of LDA as classifier"""
# Author: Ingo Guehring

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfTransformer

import evaluation.shared as shared
import model


MODEL = model.SMSGuruModel(classifier=LDA(), reduction=None, memory=True)

# PARAM_GRID = {}
# PARAM_GRID = dict(classifier__solver=['svd'])
PARAM_GRID = {'union__bow__vectorize__min_df': shared.MIN_DF,
              'union__bow__tfidf': [None, TfidfTransformer()]}


def evaluate(gridsearch=True, gen_error=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed
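
The keys in PARAM_GRID descend through the nested pipeline one double underscore at a time (union -> bow -> vectorize -> min_df), and setting a pipeline step to None removes it, which is how [None, TfidfTransformer()] toggles tf-idf on and off. A small self-contained sketch of that addressing follows; the union/bow/vectorize step names are taken from the grid keys above, everything else is a made-up stand-in.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline

bow = Pipeline([('vectorize', CountVectorizer()),
                ('tfidf', TfidfTransformer())])
pipe = Pipeline([('union', FeatureUnion([('bow', bow)])),
                 ('classifier', LogisticRegression())])

# each '__' descends one level: union -> bow -> vectorize -> min_df
pipe.set_params(union__bow__vectorize__min_df=2,
                union__bow__tfidf=None)  # None drops the tf-idf step

docs = ["free offer today", "free offer now", "limited offer now",
        "team meeting today", "meeting at noon", "meeting later today"]
labels = [1, 1, 1, 0, 0, 0]
pipe.fit(docs, labels)
print(pipe.predict(["free offer at noon"]))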
Example #4
from time import gmtime, strftime
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold

import model
import evaluation.shared as shared

MODEL = model.SMSGuruModel(classifier=LDA(n_components=2),
                           reduction=None,
                           memory=True)
MODEL.set_question_loader(subcats=shared.SUBCATS)

skf = StratifiedKFold(n_splits=5)
X = MODEL.question_loader_.questions
y = MODEL.question_loader_.categoryids
categories = MODEL.question_loader_.categories
# use only the first CV fold: fit on its training part and project its
# test part onto the two LDA components
train_index, test_index = next(skf.split(X, y))
X_train, y_train = np.array(X)[train_index], np.array(y)[train_index]
X_test, y_test = np.array(X)[test_index], np.array(y)[test_index]

MODEL.model.fit(X_train, y_train)
projected_test = MODEL.model.transform(X_test)

folder = 'lda/plot'
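
What the snippet prepares, presumably a scatter plot saved under lda/plot (an assumption, since the plotting code is not shown), can be sketched self-containedly on a small dataset: fit LDA with two components on the training part of the first fold, project the test part, and plot it coloured by class.

import matplotlib
matplotlib.use('Agg')  # headless backend; adjust if a display is available
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold

X, y = load_iris(return_X_y=True)  # stand-in for the SMS Guru questions
train_index, test_index = next(StratifiedKFold(n_splits=5).split(X, y))

lda = LDA(n_components=2).fit(X[train_index], y[train_index])
projected_test = lda.transform(X[test_index])

fig, ax = plt.subplots()
for label in np.unique(y[test_index]):
    mask = y[test_index] == label
    ax.scatter(projected_test[mask, 0], projected_test[mask, 1],
               s=15, label=str(label))
ax.set_xlabel('LDA component 1')
ax.set_ylabel('LDA component 2')
ax.legend(title='category')
fig.savefig('lda_projection.png')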
Example #5
# this could also be used: classifier_kernel=kernels,
PARAM_GRID_DIM = [
    dict(reduce_dim__n_components=N_COMPONENTS_RANGE,
         classifier__estimator__gamma=GAMMA_RANGE,
         classifier__estimator__C=C_RANGE)
]

PARAM_GRID = [
    dict(classifier__estimator__gamma=GAMMA_RANGE,
         classifier__estimator__C=C_RANGE)
]

# model for use in train_apply_classifier
MODEL = model.SMSGuruModel(classifier=CLASSIFIER,
                           pre_reduction=PRE_REDUCTION,
                           reduction=LDA())


def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
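
The classifier__estimator__gamma / classifier__estimator__C keys indicate that CLASSIFIER wraps an inner SVM exposed as estimator, for example through a one-vs-rest wrapper; that is an assumption about the truncated setup above. A minimal sketch of this parameter addressing without the surrounding pipeline (so the classifier__ prefix is dropped here):

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)

# the wrapper exposes the inner SVC's parameters under 'estimator__'
clf = OneVsRestClassifier(SVC())
param_grid = dict(estimator__gamma=np.logspace(-3, -2, 2),
                  estimator__C=np.logspace(0, 1, 2))

search = GridSearchCV(clf, param_grid=param_grid, scoring='f1_macro', cv=3)
search.fit(X, y)
print(search.best_params_)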
Example #6
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfTransformer

import evaluation.shared as shared
import model

# MODEL = model.SMSGuruModel(classifier=MultinomialNB(), reduction=None,
#                            metadata=False, memory=True)
#
# PARAM_GRID = dict(classifier__alpha=np.array([1]))
MODEL = model.SMSGuruModel(CalibratedClassifierCV(MultinomialNB(),
                                                  method='isotonic'),
                           reduction=None,
                           metadata=False,
                           memory=True)

PARAM_GRID = {
    'union__bow__vectorize__min_df': shared.MIN_DF,
    'union__bow__tfidf': [None, TfidfTransformer()]
}


def evaluate(gridsearch=True, gen_error=True):
    # since there are no hyperparameters to be optimized, we only need
    # the generalization error estimate
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
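
CalibratedClassifierCV wraps MultinomialNB so that predict_proba returns isotonically calibrated probabilities. A toy sketch of that wrapper outside the SMSGuru pipeline (the corpus, labels, and step names here are made up):

from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

docs = ["free offer now", "limited offer today", "win a free prize now",
        "offer expires today", "team meeting at noon", "lunch meeting later",
        "meeting notes attached", "see you at the meeting"]
labels = [1, 1, 1, 1, 0, 0, 0, 0]

pipe = Pipeline([
    ('vectorize', CountVectorizer()),
    ('classifier', CalibratedClassifierCV(MultinomialNB(),
                                          method='isotonic', cv=2)),
])
pipe.fit(docs, labels)
print(pipe.predict_proba(["free meeting offer"]))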
Example #7
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

import evaluation.shared as shared
import model
import model.knn

# algorithm='brute' (see the Stack Overflow reference in Zotero): X is sparse,
# which suits the cosine measure used here (not a real metric)
CLASSIFIER = KNeighborsClassifier(weights=model.knn.cosine_dist_to_sim,
                                  metric=model.knn.cosine_semi_metric,
                                  n_jobs=shared.N_JOBS)

# grid
N_NEIGHBORS_RANGE = np.arange(5, 65, 5)

PARAM_GRID = dict(classifier__n_neighbors=N_NEIGHBORS_RANGE)
MODEL = model.SMSGuruModel(classifier=CLASSIFIER,
                           reduction=None,
                           metadata=False)


def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
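
model.knn.cosine_semi_metric and model.knn.cosine_dist_to_sim belong to the project and are not shown in this snippet; the sketch below uses hypothetical re-implementations with the same roles, i.e. a cosine distance passed as metric and a weights callable that turns neighbour distances into similarity weights.

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def cosine_semi_metric(u, v):
    # cosine distance 1 - cos(u, v); only a semi-metric (no triangle inequality)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return 1.0 if denom == 0 else 1.0 - np.dot(u, v) / denom

def cosine_dist_to_sim(distances):
    # weights callable: map the returned distances to similarity weights
    return 1.0 - distances

clf = KNeighborsClassifier(n_neighbors=3,
                           weights=cosine_dist_to_sim,
                           metric=cosine_semi_metric,
                           algorithm='brute')

X = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
y = np.array([0, 0, 1, 1])
clf.fit(X, y)
print(clf.predict([[0.8, 0.2]]))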
Example #8
import numpy as np
from sklearn.linear_model import LogisticRegression

import evaluation.shared as shared
import model


CLASSIFIER = LogisticRegression()

# new wider range
# C_RANGE = shared.C_RANGE
C_RANGE = np.logspace(-5, 5, 11)

PARAM_GRID = [dict(classifier__C=C_RANGE)]

# model for use in train_apply_classifier
MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None)


def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
        error is computed.
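
The grid spans C over eleven logarithmically spaced values from 1e-5 to 1e5. Below is a compact sketch of the simple (non-nested) grid search that the gridsearch branch presumably runs via MODEL.gridsearch (an assumption), with toy data in place of the SMS Guru questions.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=400, n_classes=3, n_informative=6,
                           random_state=0)

search = GridSearchCV(LogisticRegression(max_iter=1000),
                      param_grid={'C': np.logspace(-5, 5, 11)},
                      scoring='f1_macro', cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)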
Example #9
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import evaluation.shared as shared
import model

# CLASSIFIER = RandomForestClassifier(n_estimators=500)
CLASSIFIER = GradientBoostingClassifier(n_estimators=500)

# MAX_FEATURES_RANGE = [.2, .4, .6, .8, 'log2', 'sqrt']
MAX_FEATURES_RANGE = ['log2']

PARAM_GRID = [dict(classifier__max_features=MAX_FEATURES_RANGE)]

# model for use in train_apply_classifier
MODEL = model.SMSGuruModel(classifier=CLASSIFIER, metadata=False)


def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    -----------
    gridsearch : boolean, if True the gridsearch is performed

    gen_error : boolean, if True an estimate for the generalization
        error is computed.
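
max_features controls how many features each split of the boosted trees considers: a float is interpreted as a fraction of all features, while 'sqrt' and 'log2' are the usual rules of thumb. A small sketch of searching over the wider, commented-out range on toy data follows; n_estimators is reduced here only to keep the sketch fast.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, n_features=20, n_informative=8,
                           random_state=0)

search = GridSearchCV(GradientBoostingClassifier(n_estimators=50),
                      param_grid={'max_features': [.2, .4, .6, .8,
                                                   'log2', 'sqrt']},
                      scoring='f1_macro', cv=3)
search.fit(X, y)
print(search.best_params_)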