Ejemplo n.º 1
0
 def test_quantiles_property(self):
     """Quantiles default to None and reflect whatever is assigned to them."""
     fi = FeatureImpact()
     self.assertEqual(None, fi.quantiles)
     # Assigning list data (empty or not) must round-trip as a DataFrame.
     for values in ([], [[1, 2], [3, 4]]):
         fi.quantiles = values
         assert_array_almost_equal(pandas.DataFrame(values), fi.quantiles, 6)
Ejemplo n.º 2
0
 def test_make_quantiles(self):
     """make_quantiles rejects n_quantiles=0 and stores per-feature quantiles."""
     fi = FeatureImpact()
     # Requesting zero quantiles on empty data is invalid and must raise.
     self.assertRaises(FeatureImpactError,
                       fi.make_quantiles,
                       X=[],
                       n_quantiles=0)
     data = [[1, 2, 3], [4, 2, 6], [7, 2, 9]]
     # With n_quantiles equal to the sample count the quantiles equal the data.
     fi.make_quantiles(data, n_quantiles=3)
     assert_array_almost_equal(numpy.array(data), fi.quantiles, 6)
Ejemplo n.º 3
0
    def test_compute_impact_zero_prediction(self):
        """A model that always predicts zero yields an all-zero impact matrix."""

        class ZeroModel:
            # Constant prediction: perturbing any feature changes nothing.
            def predict(self, _):
                return numpy.zeros(3)

        fi = FeatureImpact()
        features = numpy.array([[1, 2, 3], [4, 2, 6], [7, 2, 9]], dtype=float)
        fi.quantiles = features.transpose()
        result = fi.compute_impact(ZeroModel(), features)
        assert_array_almost_equal(numpy.zeros((3, 3)), result, 6)
Ejemplo n.º 4
0
 def test(self):
     """Timing smoke test: measure make_quantiles and compute_impact on a
     large random feature matrix and print the elapsed times."""
     n_samples = 100000
     n_features = 100
     features = get_features(n_samples, n_features)
     target = numpy.random.rand(n_samples)
     fi = FeatureImpact()
     stopwatch = Timer()
     fi.make_quantiles(features)
     print('')
     print("make_quantiles: {}".format(stopwatch))
     stopwatch.reset()
     impact = fi.compute_impact(Model(target), features)
     print("compute_impact: {}".format(stopwatch))
Ejemplo n.º 5
0
    def test_compute_impact_real_prediction(self):
        """A model that echoes successive feature columns produces a
        one-hot-like impact pattern."""

        class ColumnCycler:
            """Returns feature column 0, 1, 2, ... on successive predict calls,
            wrapping back to 0 after the last column."""

            def __init__(self):
                self._next_col = 0

            def predict(self, X):
                if self._next_col >= X.shape[1]:
                    self._next_col = 0
                column = numpy.array(X.iloc[:, self._next_col])
                self._next_col += 1
                return column

        fi = FeatureImpact()
        features = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)
        fi.quantiles = features.transpose()
        result = fi.compute_impact(ColumnCycler(), features)
        expected = numpy.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=float)
        assert_array_almost_equal(expected, result, 6)
Ejemplo n.º 6
0
def main():
    """Command-line entry point for the Titanic pipeline.

    Exactly one of --quality / --train / --evaluate / --submission is
    expected; the first matching branch runs and the function returns.
    Models are pickled to / loaded from the ``output`` directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--quality",
                        help="Ensure good data quality",
                        type=str,
                        choices=['train', 'test'])
    parser.add_argument("--train",
                        help="Train the model",
                        type=str,
                        choices=['cv', 'best'])
    parser.add_argument("--evaluate",
                        help="Evaluate the model",
                        type=str,
                        choices=['cv', 'best'])
    parser.add_argument("--submission",
                        help="Generate submission on test data",
                        type=str,
                        choices=['cv', 'best'])
    args = parser.parse_args()

    # Idempotent directory creation (replaces try/except FileExistsError).
    os.makedirs('output', exist_ok=True)

    if args.quality:
        # Inspect the transformed features: dtypes, head, NaN counts, stats,
        # plus histograms and correlations for a visual sanity check.
        model = Pipeline(base_pipeline())
        if args.quality == 'train':
            X, y = train_data()
            X = model.fit_transform(X)
            # Append the target so it participates in the inspection below.
            X = pd.concat([X, y], axis=1)
        else:  # test
            X, _ = test_data()
            X = model.fit_transform(X)
        print('X.dtypes Start -------------')
        print(X.dtypes)
        print('X.dtypes End -------------')
        print('X.head() Start -------------')
        print(X.head())
        print('X.head() End -------------')
        print('X.isna().sum() Start -------------')
        print(X.isna().sum())
        print('X.isna().sum() End -------------')
        print('X.describe() Start -------------')
        print(X.describe())
        print('X.describe() End -------------')
        assert_all_finite(X)
        hist_all(X)
        corr_all(X)
        if args.quality == 'train':
            # Target factor plots only make sense when the target is present.
            factor_all(X, 'Survived')

        plt.show(block=False)
        input("Press [enter] to continue.")
        return

    if args.train == 'cv':
        X, y = train_data()
        model = train_pipeline()
        scores = cross_validate(model, X, y, scoring='accuracy',
                                cv=10)  # be aware of the accuracy paradox
        print('scores =', scores['test_score'])
        print('mean score =', scores['test_score'].mean())
        print('std score =', scores['test_score'].std())
        # Re-create the pipeline so the persisted model is fit fresh on the
        # full training set rather than reusing the cross-validated object.
        model = train_pipeline()
        model.fit(X, y)
        with open('output/modelcv.pickle', 'wb') as f:
            pickle.dump(model, f)
        return

    if args.train == 'best':
        X, y = train_data()
        model = train_pipeline()
        # SVC hyper-parameter grid; alternative grids kept below for reference.
        params = {
            'model__gamma': [1e-4, 1e-3, 1e-2, 1e-1],
            'model__C': [1e0, 1e1, 1e2, 1e3],
            #            'pca__n_components': [10, 11, 12, 13, 14, 15],
            #            'fare_imputer__fill_value': range(0, 100, 5),
            #            'cabin_deck_imputer__fill_value': range(ord('A'), ord('T'), 1),
            #             'embarked_imputer__fill_value': ['C', 'Q', 'S'],
            # 'dropper__columns': ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'family_size',
            #                    'family_group_x0_alone',
            #                    'family_group_x0_small', 'title_x0_Master.', 'title_x0_Miss.',
            #                    'title_x0_Mrs.', 'title_x0_Rare', 'ticket_number',
            #                    'cabin_deck', 'Embarked_x0_C', 'Embarked_x0_Q',
            #                    'Embarked_x0_S', []],
        }
        # MLP
        # params = {
        #     'model__hidden_layer_sizes': range(120, 180, 10),
        #     'model__learning_rate_init': [1e-3, 1e-2, 1e-1],
        # }
        gridcv = GridSearchCV(model,
                              params,
                              verbose=2,
                              cv=5,
                              scoring='accuracy')
        gridcv.fit(X, y)
        print('best score =', gridcv.best_score_)
        print('best params =', gridcv.best_params_)
        # With the default refit=True, best_estimator_ is already refit on the
        # full training data, so no extra fit(X, y) call is needed here.
        best = gridcv.best_estimator_
        with open('output/modelbest.pickle', 'wb') as f:
            pickle.dump(best, f)
        return

    if args.submission:
        X, ids = test_data()
        with open('output/model{}.pickle'.format(args.submission), 'rb') as f:
            model = pickle.load(f)
        y_pred = model.predict(X)
        y_pred = pd.DataFrame(y_pred, columns=['Survived'])
        # Align prediction rows with the passenger ids before concatenating.
        y_pred.index = ids.index
        submission = pd.concat([ids, y_pred], axis=1)
        submission.to_csv('output/submission{}.csv'.format(args.submission),
                          index=False)
        return

    if args.evaluate:
        # Impact of features (uses current code)
        X, y = train_data()
        base = Pipeline(base_pipeline())
        X = base.fit_transform(X)
        model = Pipeline(top_pipeline())
        model.fit(X, y)
        fi = FeatureImpact()
        fi.make_quantiles(X)
        X, _ = test_data()
        X = base.transform(X)
        impact = averaged_impact(fi.compute_impact(model, X))
        # Series.items() — iteritems() was deprecated and removed in pandas 2.0.
        for key, imp in impact.items():
            print(key, imp)

        # ROC curve (uses pre-trained model)
        X, y = train_data()
        with open('output/model{}.pickle'.format(args.evaluate), 'rb') as f:
            model = pickle.load(f)
        y_pred = model.predict_proba(X)[:, 1]
        fpr, tpr, thresholds = roc_curve(y.values, y_pred)
        roc_auc = auc(fpr, tpr)
        plt.figure("ROC")
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        plt.show(block=False)
        input("Press [enter] to continue.")
        return
Ejemplo n.º 7
0
# Training
# Fit three regressors on the same (X, y) so their native coefficient /
# importance measures can be compared against FeatureImpact scores below.
linreg = LinearRegression()
linreg.fit(X, y)

forest = RandomForestRegressor(n_estimators=100)
forest.fit(X, y)

svr = SVR(gamma='scale')
svr.fit(X, y)

# Get linreg and forest coefficients
# Absolute values for linreg so the sign does not affect the magnitude
# comparison; SVR exposes no comparable per-feature measure.
coefs_linreg = numpy.abs(linreg.coef_)
coefs_forest = forest.feature_importances_

# Computing the impact
fi = FeatureImpact()
fi.make_quantiles(X)
# NOTE(review): the linreg impact is kept per-sample and reduced with
# .mean() at print time below, whereas forest/svr are averaged here via
# averaged_impact — presumably equivalent; confirm against that helper.
impact_linreg = fi.compute_impact(linreg, X)
impact_forest = averaged_impact(fi.compute_impact(forest, X))
impact_svr = averaged_impact(fi.compute_impact(svr, X))

print("Impact vs LinearRegression coeffs:")
for i, imp in enumerate(impact_linreg):
    print(i, impact_linreg[imp].mean(), coefs_linreg[i])

print("Impact vs RandomForestRegressor coeffs:")
for i, imp in enumerate(impact_forest):
    print(i, imp, coefs_forest[i])

print("Impact on SVR:")
for i, imp in enumerate(impact_svr):