Example #1
    ac_filepath = "tmp/sample_ac.pickle"
    al_filepath = "tmp/sample_al.pickle"
    df = pd.read_csv('data/train.csv')

    if not (os.path.exists(ac_filepath) and os.path.exists(al_filepath)):
        ac = AutoConverter(target='Survived')
        X, y = ac.fit_transform(df)
        al = AutoLearn(customized_clf_list=[('LogisticRegression',
                                             LogisticRegression())],
                       metric='roc_auc',
                       cv_num=5,
                       pos_label=1,
                       n_jobs=1,
                       verbose=0)
        results = al.learn(X, y)
        print(results['name'])
        print(results['eval_df'])

        pred = al.predict(X)
        print(pred)

        ac.save(ac_filepath)
        al.save(al_filepath)

    ac = AutoConverter.load(ac_filepath)
    al = AutoLearn.load(al_filepath)

    e = Evaluate(ac, al)
    orig_eval_s = e.evaluate_performance(df)
    col_imp_df = e.calculate_column_importance(df)
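The `os.path.exists` guard above implements a fit-once cache: the fitted converter and learner are serialized on the first run and reloaded afterwards. A minimal standalone sketch of the same pattern with plain pickle (whether learnit's `save`/`load` helpers wrap pickle exactly like this is an assumption; `fit_or_load` is a hypothetical helper):

import os
import pickle

def fit_or_load(path, fit_fn):
    """Reload a pickled object if it exists; otherwise fit, pickle, return."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    obj = fit_fn()
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    return obj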
Example #2
def __reg_single_validation(X, y,
                            est,
                            validation_ratio=0.2,
                            verbose=0):
    """Run single validation for regression.

    Args:
        X (np.array): Feature matrix
        y (np.array): Label vector
        est (sklearn.base.RegressorMixin): Regressor object
        validation_ratio (float): size of validation data
        n_jobs (int): The number of jobs to run parallel processes
        verbose (int): Controls the verbosity

    Returns:
        {'cv_df': pd.DataFrame(data_list,
                              columns=['metric_test',
                                       'metric_train']),
            'train_eval_df': None,
            'test_eval_df': None,
            'sample_est': est}

    """

    if not isinstance(verbose, int):
        raise ValueError('Verbose parameter must be an integer')

    if hasattr(est, 'verbose'):
        est.verbose = verbose

    data_list = []
    train_eval_s_list = []
    test_eval_s_list = []
    metric_func_dict = MetricCatalog.get_basic_metrics(task_type='regression')

    ss = ShuffleSplit(n_splits=1, test_size=validation_ratio,
                      random_state=0)
    train_idx, test_idx = next(ss.split(X, y))
    y_train, y_test = y[train_idx], y[test_idx]
    X_train, X_test = X[train_idx], X[test_idx]
    est.fit(X_train, y_train)

    y_pred = est.predict(X_test)
    y_pred_train = est.predict(X_train)

    # Note: unlike __reg_cross_validation, MAE is reported here unnegated,
    # so lower values are better
    metric_test = mean_absolute_error(y_test, y_pred)
    metric_train = mean_absolute_error(y_train, y_pred_train)

    data_list.append([metric_test,
                      metric_train])
    train_eval_s = Evaluate.run_metric_functions(y_train,
                                                 y_pred_train,
                                                 None,
                                                 metric_func_dict,
                                                 task_type="regression")
    train_eval_s_list.append(train_eval_s)
    test_eval_s = Evaluate.run_metric_functions(y_test,
                                                y_pred,
                                                None,
                                                metric_func_dict,
                                                task_type="regression")
    test_eval_s_list.append(test_eval_s)

    return {'cv_df': pd.DataFrame(data_list,
                                  columns=['metric_test',
                                           'metric_train']),
            'train_eval_df': pd.concat(train_eval_s_list, axis=1).T,
            'test_eval_df': pd.concat(test_eval_s_list, axis=1).T,
            'sample_est': est}
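With `n_splits=1`, `ShuffleSplit.split` yields exactly one `(train_idx, test_idx)` pair, which is why the function above can unpack it with a single `next()` call. A self-contained illustration:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(20).reshape(10, 2)
y = np.arange(10, dtype=float)

ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, test_idx = next(ss.split(X, y))

# 80/20 split: 8 training rows, 2 held-out rows, disjoint by construction.
print(len(train_idx), len(test_idx))   # 8 2
print(set(train_idx) & set(test_idx))  # set()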
Example #3
def __run_cross_validation(X, y,
                           clf,
                           metric,
                           cv_num=5,
                           pos_label=1,
                           verbose=0):
    """Run cross validation for evaluation.

    Args:
        X (np.array): Feature matrix
        y (np.array): Label vector
        clf (sklearn.base.ClassifierMixin): Classifier object
        metric (str) : Evaluation metric
                       metric in ['roc_auc', 'neg_log_loss']
        cv_num (int): Number of fold for cross validation
        pos_label (int): Positive label name (used for binary classification)
        verbose (int): Controls the verbosity

    Returns:
        {'cv_df': pd.DataFrame(data_list,
                               columns=['metric_test',
                                        'metric_train']),
            'y_error': y_error,
            'y_pred': y_pred,
            'sample_clf': clf}

    """

    if metric == 'neg_mean_absolute_error':
        # regression task: using separate function
        return __reg_cross_validation(X=X, y=y, est=clf, cv_num=cv_num,
                                      verbose=verbose)

    data_list = []
    num_class = len(np.unique(y))
    kf = StratifiedKFold(n_splits=cv_num, shuffle=True,
                         random_state=1)  # TODO(Yoshi): random_state

    # accuracy, precision, recall
    metric_func_dict = MetricCatalog.get_basic_metrics()
    train_eval_s_list = []
    test_eval_s_list = []

    # TODO(Yoshi): If clf (e.g., GridSearchCV) has an inner classifier object
    # that has a `verbose` parameter, the logic below does not handle it.
    assert isinstance(verbose, int)
    if hasattr(clf, 'verbose'):
        clf.verbose = verbose

    if num_class > 2:
        y_error = np.zeros((len(y), num_class))
        y_pred_all = np.zeros((len(y), num_class))
    else:
        y_error = np.zeros(len(y))
        y_pred_all = np.zeros(len(y))

    for train_idx, test_idx in kf.split(X, y):
        y_train, y_test = y[train_idx], y[test_idx]
        X_train, X_test = X[train_idx], X[test_idx]
        clf.fit(X_train, y_train)

        # Take out class information from estimator or GridSearch object
        if hasattr(clf, 'classes_'):
            classes_ = clf.classes_
        else:
            assert hasattr(clf.best_estimator_, 'classes_')
            classes_ = clf.best_estimator_.classes_

        if not hasattr(clf, 'predict_proba'):
            clf = CalibratedClassifierCV(clf, cv='prefit')
            clf.fit(X_train, y_train)

        # predict/predict_proba
        if metric in ['roc_auc']:
            assert num_class == 2

            # Binary classification
            y_pred = clf.predict(X_test)
            pos_idx = np.where(np.array(classes_) == pos_label)[0][0]
            y_prob = clf.predict_proba(X_test)[:, pos_idx]
            y_pred_train = clf.predict(X_train)
            y_prob_train = clf.predict_proba(X_train)[:, pos_idx]
            y_error[test_idx] = np.abs(y_test - y_prob)
            y_pred_all[test_idx] = y_prob

            # Calculate evaluation metric
            fpr_test, tpr_test, _ = roc_curve(y_test,
                                              y_prob,
                                              pos_label=pos_label)
            metric_test = auc(fpr_test, tpr_test)
            fpr_train, tpr_train, _ = roc_curve(y_train,
                                                y_prob_train,
                                                pos_label=pos_label)
            metric_train = auc(fpr_train, tpr_train)
            train_eval_s = Evaluate.run_metric_functions(y_train,
                                                         y_pred_train,
                                                         y_prob_train,
                                                         metric_func_dict,
                                                         "binary")
            train_eval_s_list.append(train_eval_s)
            test_eval_s = Evaluate.run_metric_functions(y_test,
                                                        y_pred,
                                                        y_prob,
                                                        metric_func_dict,
                                                        "binary")
            test_eval_s_list.append(test_eval_s)

        elif metric in ['neg_log_loss']:
            if verbose > 0:
                print("metric in [neg_log_loss]")
            # Multi-class classification - we should not run it with binary!
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)  # matrix
            y_pred_train = clf.predict(X_train)
            y_prob_train = clf.predict_proba(X_train)  # matrix

            y_pred_all[test_idx] = y_prob

            # TODO(Yoshi): Cannot simply define y_error for multi
            y_error[test_idx] = np.nan

            print("Evaluate neg_log_loss")

            # Calculate evaluation metric.
            # Add the negative sign to make it a "score"
            metric_test = - log_loss(y_test, y_prob)
            metric_train = - log_loss(y_train, y_prob_train)

            train_eval_s = Evaluate.run_metric_functions(y_train,
                                                         y_pred_train,
                                                         y_prob_train,
                                                         metric_func_dict,
                                                         "multi")
            train_eval_s_list.append(train_eval_s)
            test_eval_s = Evaluate.run_metric_functions(y_test,
                                                        y_pred,
                                                        y_prob,
                                                        metric_func_dict,
                                                        "multi")
            test_eval_s_list.append(test_eval_s)

        else:
            raise ValueError("Metric not supported: {}".format(metric))

        data_list.append([metric_test,
                          metric_train])

    # y_pred_all holds the out-of-fold probabilities accumulated across folds
    return {'cv_df': pd.DataFrame(data_list,
                                  columns=['metric_test',
                                           'metric_train']),
            'train_eval_df': pd.concat(train_eval_s_list, axis=1).T,
            'test_eval_df': pd.concat(test_eval_s_list, axis=1).T,
            'y_error': y_error,
            'y_pred': y_pred_all,
            'sample_clf': clf}
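The `y_pred_all` bookkeeping above is the standard out-of-fold pattern: because StratifiedKFold's test folds partition the data, each sample receives exactly one held-out probability. A compact standalone version of the loop, using only scikit-learn:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=100, random_state=0)
oof_prob = np.zeros(len(y))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_idx, test_idx in skf.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    # Column of the positive class, looked up via classes_ as in the
    # function above
    pos_idx = np.where(clf.classes_ == 1)[0][0]
    oof_prob[test_idx] = clf.predict_proba(X[test_idx])[:, pos_idx]

# Every sample now has one out-of-fold probability and one error value.
print(np.abs(y - oof_prob).mean())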
Example #4
def __reg_cross_validation(X, y, est, cv_num=5, n_jobs=1, verbose=0):
    """Cross validation for regression case

    Args:
        X (np.array): Feature matrix
        y (np.array): Label vector
        est (sklearn.base.Regressor): Regressor object
        cv_num (int): Number of fold for cross validation
        n_jobs (int): The number of jobs to run parallel processes
        verbose (int): Controls the verbosity

    Returns:
        {'cv_df': pd.DataFrame(data_list,
                               columns=['metric_test',
                                        'metric_train'])
        'train_eval_df':
        'test_eval_df' :
        sample_est: est
        }

    """

    data_list = []
    train_eval_s_list = []
    test_eval_s_list = []
    metric_func_dict = MetricCatalog.get_basic_metrics(task_type='regression')

    kf = KFold(n_splits=cv_num, shuffle=True,
               random_state=1)  # TODO(Yoshi): random_state

    assert isinstance(verbose, int)
    if hasattr(est, 'verbose'):
        est.verbose = verbose

    if hasattr(est, 'n_jobs'):
        est.n_jobs = n_jobs

    for train_idx, test_idx in kf.split(X, y):

        y_train, y_test = y[train_idx], y[test_idx]
        X_train, X_test = X[train_idx], X[test_idx]
        est.fit(X_train, y_train)

        y_pred = est.predict(X_test)
        y_pred_train = est.predict(X_train)
        # Add the negative sign to make MAE a "score" (higher is better)
        metric_test = - mean_absolute_error(y_test, y_pred)
        metric_train = - mean_absolute_error(y_train, y_pred_train)

        data_list.append([metric_test,
                          metric_train])
        train_eval_s = Evaluate.run_metric_functions(y_train,
                                                     y_pred_train,
                                                     None,
                                                     metric_func_dict,
                                                     "regression")
        train_eval_s_list.append(train_eval_s)

        test_eval_s = Evaluate.run_metric_functions(y_test,
                                                    y_pred,
                                                    None,
                                                    metric_func_dict,
                                                    "regression")
        test_eval_s_list.append(test_eval_s)

    return {'cv_df': pd.DataFrame(data_list, columns=['metric_test',
                                                      'metric_train']),
            'train_eval_df': pd.concat(train_eval_s_list, axis=1).T,
            'test_eval_df': pd.concat(test_eval_s_list, axis=1).T,
            'sample_est': est}
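Negating MAE turns a loss into a score so that "higher is better" holds for every metric a caller might maximize; scikit-learn's built-in 'neg_mean_absolute_error' scorer applies the same convention, as this short sketch shows:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = X @ np.array([1.0, 2.0, 3.0]) + 0.1 * rng.randn(50)

scores = cross_val_score(LinearRegression(), X, y,
                         scoring='neg_mean_absolute_error', cv=5)
print(scores)  # all <= 0; values nearer 0 indicate smaller error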
Example #5
def __run_single_validation(X, y,
                            clf,
                            metric,
                            validation_ratio=0.2,
                            pos_label=1,
                            verbose=0):
    """Run validation for evaluation.

    Args:
        X (np.array): Feature matrix
        y (np.array): Label vector
        clf (sklearn.base.ClassifierMixin): Classifier object
        metric (str) : Evaluation metric
                       metric in ['roc_auc', 'neg_log_loss']
        validation_ratio (float): size of validation data
        pos_label (int): Positive label name (used for binary classification)
        verbose (int): Controls the verbosity

    Returns:
        {'cv_df': pd.DataFrame(data_list,
                              columns=['metric_test',
                                       'metric_train']),
            'y_error': y_error,
            'y_pred': y_pred,
            'sample_clf': clf}

    """

    # TODO(Yoshi): Overall function should be able to merge with
    # run_cross_validation()

    data_list = []
    metric_func_dict = MetricCatalog.get_basic_metrics()
    train_eval_s_list = []
    test_eval_s_list = []
    num_class = len(np.unique(y))
    sss = StratifiedShuffleSplit(n_splits=1, test_size=validation_ratio,
                                 random_state=0)
    train_idx, test_idx = next(sss.split(X, y))
    y_train, y_test = y[train_idx], y[test_idx]
    X_train, X_test = X[train_idx], X[test_idx]
    clf.fit(X_train, y_train)
    # predict/predict_proba
    if metric in ['roc_auc']:
        assert num_class == 2
        # Take out class information from estimator or GridSearch object
        if hasattr(clf, 'classes_'):
            classes_ = clf.classes_
        else:
            assert hasattr(clf.best_estimator_, 'classes_')
            classes_ = clf.best_estimator_.classes_
        # Binary classification
        pos_idx = np.where(np.array(classes_) == pos_label)[0][0]
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)[:, pos_idx]
        y_pred_train = clf.predict(X_train)
        y_prob_train = clf.predict_proba(X_train)[:, pos_idx]
        # Per-sample error on the held-out split: as in the cross-validation
        # variant, the absolute gap between the true label and the
        # positive-class probability
        y_error = np.abs(y_test - y_prob)
        # Calculate evaluation metric
        fpr_test, tpr_test, _ = roc_curve(y_test,
                                          y_prob,
                                          pos_label=pos_label)
        metric_test = auc(fpr_test, tpr_test)
        fpr_train, tpr_train, _ = roc_curve(y_train,
                                            y_prob_train,
                                            pos_label=pos_label)
        metric_train = auc(fpr_train, tpr_train)

        train_eval_s = Evaluate.run_metric_functions(y_train,
                                                     y_pred_train,
                                                     y_prob_train,
                                                     metric_func_dict,
                                                     "binary")
        train_eval_s_list.append(train_eval_s)
        test_eval_s = Evaluate.run_metric_functions(y_test,
                                                    y_pred,
                                                    y_prob,
                                                    metric_func_dict,
                                                    "binary")
        test_eval_s_list.append(test_eval_s)
    elif metric in ['neg_log_loss']:
        if verbose > 0:
            print("metric in [neg_log_loss]")
        # Multi-class classification - we should not run it with binary!
        # TODO(Bublin): y_pred does not carry the original sample index
        # (do we have to return y_pred and y_error?)
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)  # matrix
        y_pred_train = clf.predict(X_train)
        y_prob_train = clf.predict_proba(X_train)  # matrix
        # TODO(Yoshi): Cannot simply define y_error for multi
        y_error = np.nan
        print("Evaluate neg_log_loss")
        # Calculate evaluation metric.
        # Add the negative sign to make it a "score"
        metric_test = - log_loss(y_test, y_prob)
        metric_train = - log_loss(y_train, y_prob_train)

        train_eval_s = Evaluate.run_metric_functions(y_train,
                                                     y_pred_train,
                                                     y_prob_train,
                                                     metric_func_dict,
                                                     "multi")
        train_eval_s_list.append(train_eval_s)
        test_eval_s = Evaluate.run_metric_functions(y_test,
                                                    y_pred,
                                                    y_prob,
                                                    metric_func_dict,
                                                    "multi")
        test_eval_s_list.append(test_eval_s)
    else:
        raise ValueError("Metric not supported: {}".format(metric))
    data_list.append([metric_test,
                      metric_train])

    return {'cv_df': pd.DataFrame(data_list,
                                  columns=['metric_test',
                                           'metric_train']),
            'train_eval_df': pd.concat(train_eval_s_list, axis=1).T,
            'test_eval_df': pd.concat(test_eval_s_list, axis=1).T,
            'y_error': y_error,
            'y_pred': y_pred,
            'test_index': test_idx,
            'sample_clf': clf}
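Unlike the plain ShuffleSplit used in the regression variant, StratifiedShuffleSplit preserves the class ratio in both halves of the split, which matters for imbalanced labels. A small demonstration:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.zeros((10, 1))
y = np.array([0] * 8 + [1] * 2)  # imbalanced: 80% / 20%

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
train_idx, test_idx = next(sss.split(X, y))

# Both halves keep the 80/20 class ratio.
print(y[train_idx].mean(), y[test_idx].mean())  # 0.2 0.2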
Example #6
class EvaluateTestCase(unittest.TestCase):
    def setUp(self):
        self.df1 = pd.read_csv('data/train.csv')
        ac1 = AutoConverter(target='Survived')
        X1, y1 = ac1.fit_transform(self.df1)
        al1 = AutoLearn(level=1)
        al1.learn(X1, y1)
        self.e1 = Evaluate(ac=ac1, alearn=al1)

        clf1 = LogisticRegression()
        clf1.fit(X1, y1)
        self.e1a = Evaluate(ac=ac1, alearn=clf1)

        with self.assertRaises(ValueError):
            Evaluate(alearn=al1)

        self.e1b = Evaluate(alearn=al1, feature_names=ac1.feature_names)

        data = datasets.load_iris()
        self.df2 = pd.DataFrame(np.c_[data.target.reshape(-1, 1), data.data],
                                columns=["class"] + data.feature_names)
        ac2 = AutoConverter(target="class")
        al2 = AutoLearn(level=1)
        X2, y2 = ac2.fit_transform(self.df2)
        al2.learn(X2, y2)
        self.e2 = Evaluate(ac=ac2, alearn=al2)

        clf2 = LogisticRegression()
        clf2.fit(X2, y2)
        self.e2a = Evaluate(ac=ac2, alearn=clf2)

        # subtable
        dirpath = "data/kaggle-kkbox-churn-prediction-challenge-1k"
        members_df = pd.read_csv(os.path.join(dirpath, "members_train.csv"))
        transactions_df = pd.read_csv(os.path.join(dirpath,
                                                   "transactions.csv"))
        user_logs_df = pd.read_csv(os.path.join(dirpath, "user_logs.csv"))

        subtables3 = {
            "transactions": {
                "table": transactions_df,
                "link_key": "msno",
                "group_key": "msno"
            },
            "user_logs": {
                "table": user_logs_df,
                "link_key": "msno",
                "group_key": "msno"
            }
        }

        ac3 = AutoConverter(target="is_churn")
        X3, y3 = ac3.fit_transform(df=members_df, subtables=subtables3)
        al3 = AutoLearn(level=1)
        al3.learn(X3, y3)
        self.e3 = Evaluate(ac=ac3, alearn=al3)

        self.df4 = members_df
        ac4 = AutoConverter(target="is_churn", task_type="regression")
        X4, y4 = ac4.fit_transform(df=members_df)
        al4 = AutoLearn(level=1, task="regression")
        al4.learn(X4, y4)
        e4 = Evaluate(alearn=al4, ac=ac4)
        self.e4 = e4

    def test_calculate_column_importance(self):
        for e in [self.e1, self.e1a, self.e2, self.e2a, self.e3, self.e4]:
            try:
                e.calculate_column_importance()
            except Exception as exc:  # avoid shadowing the loop variable e
                self.fail(str(exc))

    def test_evaluate(self):
        for e in [self.e1, self.e1a, self.e2, self.e2a, self.e3, self.e4]:
            orig_eval_s = e.evaluate_performance()
            col_imp_df = e.calculate_column_importance()
            self.assertEqual(orig_eval_s.index.tolist(),
                             col_imp_df.columns.tolist())

        # Both calls should raise ValueError because X and y were not given
        with self.assertRaises(ValueError):
            self.e1b.evaluate_performance()

        with self.assertRaises(ValueError):
            self.e1b.calculate_column_importance()

    def test_get_top_column(self):
        self.assertEqual(5, len(self.e1.get_top_columns(n=5)))
        for table_colname in self.e3.get_top_columns(n=3):
            tablename = table_colname.split("..")[0]
            self.assertTrue(tablename in list(self.e3.ac.subtables_.keys()) +
                            ["main"])

    def test_get_mispredictions(self):
        for e, df in [(self.e1, self.df1), (self.e1a, self.df1),
                      (self.e2, self.df2), (self.e2a, self.df2)]:
            mispred_df = e.get_mispredictions(df)
            orig_colset = set(df.columns.tolist())
            mispred_colset = set(mispred_df.columns.tolist())

            # All columns in mispred_df should be in df
            self.assertEqual(len(mispred_colset & orig_colset),
                             len(mispred_colset))

    def test_stratify_errors(self):
        for e, df in [(self.e1, self.df1), (self.e1a, self.df1)]:
            es = e.stratify_errors(df)
            self.assertIsNotNone(es)
            self.assertIsInstance(es, ErrorSummary)
            self.assertIsNotNone(es.diversity)
            self.assertIsNotNone(es.error_dist)
            self.assertIsNotNone(es.errors)
            self.assertEqual(es.error_dist.index.levels[0].tolist(),
                             es.diversity.index.tolist())

        # None should be returned for the Iris dataset
        self.assertIsNone(self.e2.stratify_errors(self.df2))
        self.assertIsNone(self.e2a.stratify_errors(self.df2))

    def test_get_explanations(self):
        e_df1 = self.e1.get_explanations(self.df1)
        self.assertEqual(e_df1.shape[0], self.df1.shape[0])
        e_df1a = self.e1a.get_explanations(self.df1)
        self.assertEqual(e_df1a.shape[0], self.df1.shape[0])
        e_df2 = self.e2.get_explanations(self.df2)
        self.assertEqual(e_df2.shape[0], self.df2.shape[0])
        e_df2a = self.e2a.get_explanations(self.df2)
        self.assertEqual(e_df2a.shape[0], self.df2.shape[0])
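The `for e in [...]` loops above abort at the first failing fixture; unittest's subTest context manager (Python 3.4+) reports each iteration independently instead. A minimal sketch of the idiom, independent of the learnit fixtures:

import unittest

class LoopedAssertions(unittest.TestCase):
    def test_each_fixture(self):
        for n in [1, 2, 3]:
            with self.subTest(n=n):
                # A failure here is reported per value of n instead of
                # aborting the whole loop.
                self.assertEqual(n * n, n ** 2)

if __name__ == '__main__':
    unittest.main()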
Example #7
import pandas as pd

from sklearn.linear_model import LogisticRegression

# AutoConverter and AutoLearn live elsewhere in the learnit package;
# the exact module paths below are assumptions.
from learnit.autoconverter.autoconverter import AutoConverter
from learnit.autolearn.autolearn import AutoLearn
from learnit.autolearn.evaluate import Evaluate
from learnit.autolearn.blueprints import StackedXGBoost, AverageBlender

if __name__ == '__main__':
    df = pd.read_csv('data/train.csv')
    ac = AutoConverter(target='Survived')
    X, y = ac.fit_transform(df)
    al = AutoLearn(customized_clf_list=[('LogisticRegression',
                                         LogisticRegression())],
                   metric='roc_auc',
                   cv_num=5,
                   pos_label=1,
                   n_jobs=1,
                   verbose=0)
    results = al.learn(X, y)
    print(results['name'])
    print(results['eval_df'])

    pred = al.predict(X)
    print(pred)

    # Alternative clf_list configurations, kept for reference:
    # name, clf, cv_result = autolearn(
    #     X, y, verbose=3,
    #     clf_list=[('AverageBlender',
    #                AverageBlender(scoring='roc_auc', random_state=1,
    #                               verbose=3))])
    # name, clf, cv_result = autolearn(
    #     X, y, verbose=3,
    #     clf_list=[('LogisticRegression', LogisticRegression())])

    e = Evaluate(ac, al)
    orig_eval_s = e.evaluate_performance()
    col_imp_df = e.calculate_column_importance()
    explain_df = e.get_explanations(df)

    X_test = ac.transform(df, prediction=True)