Example #1
0
def test_regression_estimators():
    """Test all regression estimators"""
    df_iris = load_dataset('iris')

    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'petal_width'

    # Run every registered regression estimator through the scoring test,
    # once with its registered parameter overrides (if any), once with defaults.
    for est_name, est_class in _reg_estimators.items():
        overrides = _reg_params.get(est_name, {})
        if overrides:
            # API expects params namespaced by the estimator step name.
            model_params = {
                f'{est_name}__{key}': val
                for key, val in overrides.items()
            }
            estimator = est_class(**overrides)
        else:
            model_params = None
            estimator = est_class()
        pipeline = make_pipeline(StandardScaler(), clone(estimator))
        do_scoring_test(
            X,
            y,
            data=df_iris,
            api_params={
                'model': est_name,
                'model_params': model_params,
                'problem_type': 'regression'
            },
            sklearn_model=pipeline,
            scorers=['neg_root_mean_squared_error', 'r2'])
Example #2
0
def test_multiclass_estimators():
    """Test all multiclass estimators"""
    df_iris = load_dataset('iris')

    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    # Exercise each registered classifier on the 3-class iris problem.
    for est_name, est_class in _clf_estimators.items():
        overrides = _clf_params.get(est_name, {})
        if overrides:
            # API expects params namespaced by the estimator step name.
            model_params = {
                f'{est_name}__{key}': val
                for key, val in overrides.items()
            }
            estimator = est_class(**overrides)
        else:
            model_params = None
            estimator = est_class()
        pipeline = make_pipeline(StandardScaler(), clone(estimator))
        do_scoring_test(
            X,
            y,
            data=df_iris,
            api_params={
                'model': est_name,
                'model_params': model_params,
                'problem_type': 'multiclass_classification'
            },
            sklearn_model=pipeline,
            scorers=['accuracy'])
Example #3
0
def test_binary_estimators():
    """Test all binary estimators.

    Runs every registered classifier on a two-class subset of iris,
    first with accuracy, then (except for 'dummy') with target-dependent
    scores using ``pos_labels``.
    """
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    for t_mname, t_model_class in _clf_estimators.items():
        m_params = _clf_params.get(t_mname, {})
        model_params = None
        if len(m_params) > 0:
            # API expects params namespaced by the estimator step name.
            model_params = {
                f'{t_mname}__{t_param}': t_value
                for t_param, t_value in m_params.items()
            }
            t_model = t_model_class(**m_params)
        else:
            t_model = t_model_class()
        scorers = ['accuracy']
        api_params = {'model': t_mname, 'model_params': model_params}
        clf = make_pipeline(StandardScaler(), clone(t_model))
        do_scoring_test(X,
                        y,
                        data=df_iris,
                        api_params=api_params,
                        sklearn_model=clf,
                        scorers=scorers)
        if t_mname != 'dummy':
            # now let's try target-dependent scores
            scorers = ['recall', 'precision', 'f1']
            # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin int is the documented equivalent dtype.
            sk_y = (df_iris[y].values == 'setosa').astype(int)
            api_params = {
                'model': t_mname,
                'pos_labels': 'setosa',
                'model_params': model_params
            }
            clf = make_pipeline(StandardScaler(), clone(t_model))
            do_scoring_test(X,
                            y,
                            data=df_iris,
                            api_params=api_params,
                            sklearn_model=clf,
                            scorers=scorers,
                            sk_y=sk_y)
Example #4
0
def test_scoring_y_transformer():
    """Test scoring with y transformer"""
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    # Reference labels for the sklearn-side comparison.
    sk_y = df_iris[y].values
    pipeline = make_pipeline(StandardScaler(), svm.SVC(probability=True))

    do_scoring_test(
        X,
        y,
        data=df_iris,
        api_params={'model': 'svm', 'preprocess_y': LabelBinarizer()},
        sklearn_model=pipeline,
        scorers=['accuracy', 'balanced_accuracy'],
        sk_y=sk_y)
Example #5
0
def test_feature_transformers():
    """Test transform X.

    Runs each registered feature (X) transformer as a preprocessing step in
    front of an SVC and compares scoring against an equivalent sklearn
    pipeline. Transformers that only work with regression targets get a
    numerically-encoded copy of the target.
    """
    # Fix: removed the stale second bare-string "docstring"
    # ("Test simple binary classification") — a no-op statement copy-pasted
    # from another test and misleading here.
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    scorers = ['accuracy']

    for tr_name, tr_klass in _features_transformers.items():
        api_params = {'model': 'svm', 'preprocess_X': tr_name}
        if tr_name in _transformer_params:
            # API expects params namespaced by the transformer step name.
            model_params = {
                f'{tr_name}__{k}': v
                for k, v in _transformer_params[tr_name].items()
            }
            api_params['model_params'] = model_params
            tr = tr_klass(**_transformer_params[tr_name])
        else:
            tr = tr_klass()
        clf = make_pipeline(tr, svm.SVC())
        if tr_name in _works_only_with_regression:
            # Encode species numerically; only setosa/virginica remain after
            # the filter above, so every row maps successfully.
            df_test = df_iris.copy()
            df_test[y] = df_iris[y].apply(lambda x: {
                'setosa': 0,
                'versicolor': 1,
                'virginica': 3
            }[x])
        else:
            df_test = df_iris.copy()
        do_scoring_test(X,
                        y,
                        data=df_test,
                        api_params=api_params,
                        sklearn_model=clf,
                        scorers=scorers)
Example #6
0
def test_simple_binary():
    """Test simple binary classification.

    Covers plain accuracy scores, target-dependent scores via
    ``pos_labels``, probability-based roc_auc, and decision_function-based
    roc_auc.
    """
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    scorers = ['accuracy', 'balanced_accuracy']
    api_params = {'model': 'svm'}
    clf = make_pipeline(StandardScaler(), svm.SVC())
    do_scoring_test(X, y, data=df_iris, api_params=api_params,
                    sklearn_model=clf, scorers=scorers)

    # now let's try target-dependent scores
    scorers = ['recall', 'precision', 'f1']
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented equivalent dtype.
    sk_y = (df_iris[y].values == 'setosa').astype(int)
    api_params = {'model': 'svm', 'pos_labels': 'setosa'}
    clf = make_pipeline(StandardScaler(), svm.SVC())
    do_scoring_test(X, y, data=df_iris, api_params=api_params,
                    sklearn_model=clf, scorers=scorers, sk_y=sk_y)

    # now let's try proba-dependent scores
    scorers = ['roc_auc']
    sk_y = (df_iris[y].values == 'setosa').astype(int)
    model = svm.SVC(probability=True)
    api_params = {'model': model, 'pos_labels': 'setosa'}
    # NOTE(review): the sklearn reference pipeline uses a fresh svm.SVC()
    # (probability=False), not the `model` above — roc_auc then relies on
    # decision_function on the sklearn side. Confirm this asymmetry is
    # intentional.
    clf = make_pipeline(StandardScaler(), svm.SVC())
    do_scoring_test(X, y, data=df_iris, api_params=api_params,
                    sklearn_model=clf, scorers=scorers, sk_y=sk_y)

    # now let's try for decision_function based scores
    # e.g. svm with probability=False

    scorers = ['roc_auc']
    sk_y = (df_iris[y].values == 'setosa').astype(int)
    model = svm.SVC(probability=False)
    api_params = {'model': model, 'pos_labels': 'setosa'}
    clf = make_pipeline(StandardScaler(), svm.SVC())
    do_scoring_test(X, y, data=df_iris, api_params=api_params,
                    sklearn_model=clf, scorers=scorers, sk_y=sk_y)
Example #7
0
def test_naive_bayes_estimators():
    """Test all naive bayes estimators.

    Each NB estimator is scored on the binary (setosa/virginica) subset and
    on the full multiclass problem; except for 'nb_bernoulli', target-
    dependent scores with ``pos_labels`` are also checked.
    """
    df_iris = load_dataset('iris')

    # keep only two species
    df_binary = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    for t_mname, t_model_class in _nb_estimators.items():
        m_params = _nb_params.get(t_mname, {})
        model_params = None
        if len(m_params) > 0:
            # API expects params namespaced by the estimator step name.
            model_params = {
                f'{t_mname}__{t_param}': t_value
                for t_param, t_value in m_params.items()
            }
            t_model = t_model_class(**m_params)
        else:
            t_model = t_model_class()
        t_df_binary = df_binary.copy(deep=True)
        t_df = df_iris.copy(deep=True)
        if t_mname in ['nb_categorical']:
            # CategoricalNB needs discrete features: binarize at the mean.
            t_df_binary[X] = t_df_binary[X] > t_df_binary[X].mean()
            t_df[X] = t_df[X] > t_df[X].mean()
        scorers = ['accuracy']
        api_params = {
            'model': t_mname,
            'model_params': model_params,
            'preprocess_X': None
        }
        clf = make_pipeline(clone(t_model))
        do_scoring_test(X,
                        y,
                        data=t_df_binary,
                        api_params=api_params,
                        sklearn_model=clf,
                        scorers=scorers)
        api_params = {
            'model': t_mname,
            'model_params': model_params,
            'preprocess_X': None,
            'problem_type': 'multiclass_classification'
        }
        clf = make_pipeline(clone(t_model))
        do_scoring_test(X,
                        y,
                        data=t_df,
                        api_params=api_params,
                        sklearn_model=clf,
                        scorers=scorers)
        if t_mname not in ['nb_bernoulli']:
            # now let's try target-dependent scores
            scorers = ['recall', 'precision', 'f1']
            # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin int is the documented equivalent dtype.
            sk_y = (t_df_binary[y].values == 'setosa').astype(int)
            api_params = {
                'model': t_mname,
                'pos_labels': 'setosa',
                'model_params': model_params,
                'preprocess_X': None
            }
            clf = make_pipeline(clone(t_model))
            do_scoring_test(X,
                            y,
                            data=t_df_binary,
                            api_params=api_params,
                            sklearn_model=clf,
                            scorers=scorers,
                            sk_y=sk_y)