def test_plot_partial_dependence():
    """Exercise ``plot_partial_dependence`` on a fitted regressor.

    The same three plots (two single features and their pair) are requested
    three ways: with integer column indices, with string names resolved
    against the dataset's array of feature names, and with string names
    resolved against a plain list of feature names.
    """
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    cases = [
        # integer features with the dataset's own feature names
        ([0, 1, (0, 1)], boston.feature_names),
        # str features and array feature names
        (['CRIM', 'ZN', ('CRIM', 'ZN')], boston.feature_names),
        # str features and list feature names
        (['CRIM', 'ZN', ('CRIM', 'ZN')], boston.feature_names.tolist()),
    ]
    for features, feature_names in cases:
        fig, axs = plot_partial_dependence(clf, boston.data, features,
                                           grid_resolution=grid_resolution,
                                           feature_names=feature_names)
        assert len(axs) == 3
        # ``has_data`` is a method; without the call the bound method object
        # is always truthy and this assertion could never fail.
        assert all(ax.has_data() for ax in axs)
def test_plot_partial_dependence_multiclass():
    """Exercise ``plot_partial_dependence`` on multi-class classifiers.

    Covers a classifier fitted on integer class labels, one fitted on string
    class labels, and the two error cases: a label the classifier was not
    fitted on, and no label at all.
    """
    grid_resolution = 25

    # integer class labels
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label=0,
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    # ``has_data`` is a method; it must be called for the check to mean
    # anything (the bound method object itself is always truthy).
    assert all(ax.has_data() for ax in axs)

    # now with symbol labels
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label='setosa',
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # label not in gbrt.classes_
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1], label='foobar',
                  grid_resolution=grid_resolution)

    # label not provided
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1],
                  grid_resolution=grid_resolution)
Esempio n. 3
0
 def partial_dependence(self, X_train, fnames):
     """Show partial dependence plots for all features, most important first.

     Column indices are ordered by descending ``self.feature_importances_``
     and plotted for ``self.model`` against ``X_train``; ``fnames`` is passed
     on as the feature names. The figure is displayed with ``plt.show()``.
     """
     ranked_columns = np.argsort(self.feature_importances_)[::-1]
     plot_partial_dependence(
         self.model,
         X_train,
         ranked_columns,
         feature_names=fnames,
         figsize=(12, 10)
     )
     heading = self.model.__class__.__name__ + " Partial Dependence"
     plt.title(heading)
     plt.tight_layout()
     plt.show()
Esempio n. 4
0
def plot_skboost_partial_dependence(model, ax, X_train):
    """Draw partial dependence plots for the first two columns of ``X_train``.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``plot_partial_dependence``.
    ax : object
        Not used by this function; kept so existing callers keep working.
    X_train : object
        Training data; must expose a ``feature_names`` attribute, whose first
        three entries are used as feature names.
    """
    # The original discarded the return value and then used an unbound
    # ``fig``. As in the other call sites in this file, the plotting helper is
    # expected to return ``(fig, axs)``.
    fig, _ = plot_partial_dependence(model,
                                     X_train, [0, 1],
                                     feature_names=X_train.feature_names[0:3],
                                     n_jobs=-1,
                                     grid_resolution=50)
    fig.suptitle('Partial Dependence Plot')
    fig.set_figwidth(15)
Esempio n. 5
0
def make_plots_of_chemical_features(df_pos_confirmed, df_neg_confirmed):
    """Save similarity histograms and a partial dependence figure.

    For each of the three similarity columns, the confirmed-positive and
    confirmed-negative distributions are overlaid on one histogram and the
    figure is saved as PNG, PDF and EPS under ``path_to_output``. A partial
    dependence figure is then drawn and saved in the same three formats.

    The partial dependence section relies on ``clf``, ``input_file`` and
    ``path_to_data`` being defined outside this function.
    """
    import seaborn as sns
    sns.set_context('talk', font_scale=1.2)

    path_to_output = '/home/kimlab1/strokach/working/chemical_interactions/results/14-11-07/'

    def save_current_figure(png_name, pdf_name, eps_name):
        # Only the PNG is written with an explicit resolution.
        plt.savefig(path_to_output + png_name, bbox_inches='tight', dpi=150)
        plt.savefig(path_to_output + pdf_name, bbox_inches='tight')
        plt.savefig(path_to_output + eps_name, bbox_inches='tight')

    # (column, histogram range, x-axis label, (png, pdf, eps) file names)
    histogram_specs = [
        ('side_effect_similarity', (0, 0.6), 'Side effect similarity',
         ('side_effect_similarity_hist.png',
          'side_effect_similarity.pdf',
          'side_effect_similarity.eps')),
        ('chemical_similarity', (0, 1), 'Chemical similarity',
         ('chemical_similarity_hist.png',
          'chemical_similarity_hist.pdf',
          'chemical_similarity_hist.eps')),
        ('atc_similarity', (0, 5), 'ATC code similarity',
         ('atc_code_similarity_hist.png',
          'atc_code_similarity_hist.pdf',
          'atc_code_similarity_hist.eps')),
    ]
    for column, value_range, x_label, file_names in histogram_specs:
        fg, ax = plt.subplots(figsize=(10, 6))
        df_pos_confirmed[column].hist(range=value_range, bins=10, ax=ax)
        # The negative set is semi-transparent so both distributions stay visible.
        df_neg_confirmed[column].hist(range=value_range, bins=10, ax=ax, alpha=0.7)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Number of drug pairs')
        ax.legend(['Confirmed positive', 'Confirmed negative'])
        save_current_figure(*file_names)

    # Make a feature dependence plot
    features = [0, 2, 1]
    pred = ci.Predictor(input_file, path_to_data)
    data_train, labels_train = get_data_and_labels(pred.predictor_df)
    fg, ax = plt.subplots(figsize=(8, 10), facecolor='white')
    display_names = ['ATC similarity', 'Chemical similarity', 'Side effect similarity']
    plot_partial_dependence(
        clf, data_train, features,
        n_cols=2,
        percentiles=(0.01, 0.99),
        feature_names=display_names,
        n_jobs=3,
        grid_resolution=100,
        ax=ax)
    save_current_figure('drug_pair_feature_importances.png',
                        'drug_pair_feature_importances.pdf',
                        'drug_pair_feature_importances.eps')
Esempio n. 6
0
def plot_dependence(data):
    """Fit a gradient boosting regressor and plot its top partial dependences.

    ``data`` is split into the ``RESPONSE_VAR`` column (the target) and all
    remaining columns (the predictors). The four predictors with the highest
    feature importance are plotted; the figure is saved as
    ``fig/pdp_py_<dataset>.png`` and then shown. ``dataset`` is read from
    outside this function.
    """
    target = data[RESPONSE_VAR].copy()
    predictors = data.copy().drop(RESPONSE_VAR, axis=1)

    # train a gbm
    reg = GradientBoostingRegressor(random_state=SEED,
                                    n_estimators=500,
                                    max_features=1 / 3)
    reg.fit(predictors, target)

    # column indices ordered from most to least important
    ranking = np.argsort(reg.feature_importances_)[::-1]
    top_features = list(ranking[:4])

    fig, axs = plot_partial_dependence(
        reg,
        predictors,
        top_features,
        feature_names=predictors.columns,
        n_jobs=3,
        grid_resolution=50,
        n_cols=2)
    plt.tight_layout()
    output_path = 'fig/pdp_py_{}.png'.format(dataset)
    plt.savefig(output_path, format='png', dpi=200, transparent=False)
    plt.show()
Esempio n. 7
0
def main():
    """Train a GBRT on California housing and show partial dependence plots.

    Two figures are produced: the convenience grid from
    ``plot_partial_dependence`` for four single features plus one feature
    pair, and a custom 3D surface built from ``partial_dependence`` for the
    feature pair ``(1, 5)``. Both are displayed by the final ``plt.show()``.
    """
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names
    print('_' * 80)
    print("Training GBRT...")

    clf = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=0.1,
                                    loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")
    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    # A bare ``print`` is a no-op expression on Python 3; print('') emits the
    # intended blank line on both Python 2 and Python 3.
    print('')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf,
                                       X_train,
                                       features,
                                       feature_names=names,
                                       n_jobs=3,
                                       grid_resolution=50)
    fig.suptitle(
        'Partial dependence of house value on nonlocation features for the California housing dataset'
    )
    # leave room at the top of the figure for the suptitle
    plt.subplots_adjust(top=0.9)
    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print('')

    fig = plt.figure()
    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf,
                                               target_feature,
                                               X=X_train,
                                               grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    # reshape the flat partial dependence values onto the meshgrid layout
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')

    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle(
        'Partial dependence of house value on median age and average occupancy'
    )
    plt.subplots_adjust(top=0.9)
    plt.show()
def partial_dependence(df, y):
    '''
    Plot partial dependence for the six most important features of a
    gradient boosting classifier.

    INPUT: df = raw features (presumably a DataFrame; the engineered test
                set must expose ``.columns``)
           y = target variable
    OUTPUT: None; the plots are drawn by plot_partial_dependence

    The data is split by oversample_train_test, both splits are run through
    the feature-engineering pipeline, a GradientBoostingClassifier is fitted
    on the training split and partial dependence is plotted on the test
    split.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Apply the pipeline as fitted on the training split. Re-fitting it on the
    # test split (as the original did) lets test data shape the features and
    # can produce columns that do not line up with what the model was fitted on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    # argsort is ascending, so the last six entries are the most important
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
def make_part_plot(mdoel, features):
    """Plot partial dependence for the features ranked 11th to 25th by importance.

    Parameters
    ----------
    mdoel : estimator
        Fitted estimator exposing ``feature_importances_``. The misspelt
        parameter name is kept so existing keyword callers keep working.
    features : sequence
        Passed to ``plot_partial_dependence`` as ``feature_names``.

    The data being plotted is ``df``, which is read from outside this
    function. The figure is displayed with ``plt.show()``.
    """
    # The original body read ``model``, which is not the parameter name, so
    # the argument was silently ignored (or a NameError was raised).
    model = mdoel

    ranking = np.argsort(model.feature_importances_)[::-1]
    selected = ranking[10:25]

    fig, axs = plot_partial_dependence(model, df, selected,
                                       feature_names=features,
                                       n_jobs=3, grid_resolution=50)

    for ax in axs:
        name = ax.get_xlabel()
        ax.set_xlabel(name, fontsize=16)
        # common y-range first; individual features override it below
        ax.set_ylim(-4, 4)

        if name == 'average_gap':
            ax.set_xlim(0, 6)
        if name == 'highest_like':
            ax.set_ylim(-10, 40)
        if name == 'highest_topic_percent':
            ax.set_xlim(0.5, 0.8)
        if name == 'url_length':
            ax.set_xlim(0, 25)

    fig.suptitle('Partial Dependence Plot for Selected Features \n Effect on Subscribers Count', fontsize=24)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    plt.show()
def partial_dependence(df, y):
    '''
    Plot partial dependence for the six most important features of a
    gradient boosting classifier.

    INPUT: df = raw features (presumably a DataFrame; the engineered test
                set must expose ``.columns``)
           y = target variable
    OUTPUT: None; the plots are drawn by plot_partial_dependence

    The data is split by oversample_train_test, both splits are run through
    the feature-engineering pipeline, a GradientBoostingClassifier is fitted
    on the training split and partial dependence is plotted on the test
    split.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)

    feature_engineering = Pipeline([('lists', ListSplitter()),
                                    ('race', RaceDummies()),
                                    ('crime_sentence', CrimeAndSentence()),
                                    ('feat_eng', FeatureEngineer()),
                                    ('columns', ColumnFilter(prejudice=False))
                                    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Apply the pipeline as fitted on the training split. Re-fitting it on the
    # test split (as the original did) lets test data shape the features and
    # can produce columns that do not line up with what the model was fitted on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    # argsort is ascending, so the last six entries are the most important
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc,
                                       X_test,
                                       feats,
                                       feature_names=names,
                                       n_jobs=3,
                                       grid_resolution=50)
Esempio n. 11
0
def main():
    """Fit a classifier for the "Accident" class on PCA-projected data and plot it.

    The data split comes from ``get_binary_encoded_xy_split``. Features are
    projected with ``RandomizedPCA``, a gradient boosting classifier is fitted
    on the projected training data against the "Accident" column of the
    binarized labels, and partial dependence is plotted for the first two
    projected components.
    """
    X_train, X_test, y_train, y_test, y_encoder = get_binary_encoded_xy_split(5000)
    # NOTE(review): the original comment said this reduces the data to 11
    # dimensions, but RandomizedPCA() is created without n_components here.
    X_train_randPCA = RandomizedPCA()
    X_train_randPCA.fit(X_train)
    print("pca fit")

    X_train_reduced = X_train_randPCA.transform(X_train)
    X_test_reduced = X_train_randPCA.transform(X_test)

    print("Reduced components")
    print("Begin classifier")
    clf = GradientBoostingClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, random_state=1)
    print(y_train.shape, y_test.shape)
    print(y_encoder.classes_)
    print(y_encoder.transform(["Accident"]))
    print(np.where(y_encoder.classes_ == "Accident"))
    clf.fit(X_train_reduced, y_train[:, np.where(y_encoder.classes_=="Accident")[0]])
    print("Fitted")
    print("_" * 80)
    feature_vals = y_encoder.transform(y_encoder.classes_)
    feature_labels = y_encoder.classes_
    print(feature_vals)
    print(feature_labels)
    # The classifier was fitted on the PCA-projected data, so partial
    # dependence must be evaluated in that same space (the original passed
    # the untransformed X_train).
    fig, axs = plot_partial_dependence(clf, X_train_reduced, [0, 1], n_jobs=4, grid_resolution=100)
    plt.show()
Esempio n. 12
0
def main():
    """Fit a GBRT on the standardized Boston data and show partial dependence.

    The features are standardized with ``StandardScaler``, split with
    ``cross_validation.train_test_split``, and a gradient boosting regressor
    is fitted on the training part. Partial dependence is plotted for four
    single features and one feature pair and shown with ``plt.show()``.
    """
    # Progress messages use single-argument print() calls so the output is
    # the same on Python 2 and Python 3 (the original used Python 2 print
    # statements, which do not parse on Python 3).
    print('loading data')
    boston = datasets.load_boston()
    print('data loaded')
    X, y = boston.data, boston.target
    print('X shape: {}'.format(X.shape))
    print('y shape: {}'.format(y.shape))

    scaler = StandardScaler()

    # I need to fit and transform the data with the scaler.. how do I put
    # this into pipeline?

    scaledX = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        scaledX, y)

    # then I will plot partial dependence to see how the features work
    clf = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=.1,
                                    loss='huber',
                                    random_state=1)

    print('training {} {}'.format(X_train.shape, y_train.shape))
    clf.fit(X_train, y_train)
    print('trained')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf,
                                       X_train,
                                       features,
                                       feature_names=None,
                                       n_jobs=3,
                                       grid_resolution=50)

    # NOTE(review): the title mentions the California housing dataset, but
    # the data loaded above is the Boston dataset - confirm the wording.
    fig.suptitle(
        'Partial dependence of house value on nonlocation features\n' +
        'for the California housing dataset')
    plt.subplots_adjust(top=0.9)

    plt.show()

    # then I will PCA and plot partial dependence there
    # then lasso PCA

    # then select the parameters
    # plots are nice but shouldn't I be selecting based on multi-dimensional data?

    # then generate a list of multi-parameter algorithms
    # then do a parameter search with gradient boost and a few other multi-parameter algorithms
Esempio n. 13
0
def plot_partial_dependence(tmodel, X, col_names, cols_to_plot):
    """Plot partial dependence of ``tmodel`` on one or two named columns.

    ``cols_to_plot`` must be a list of fewer than three column names; each
    name is looked up in ``col_names`` to get its column position. With two
    names, both single-feature plots and the pairwise plot are drawn; with
    one name, only that feature is drawn.
    """
    assert isinstance(cols_to_plot, list)
    assert len(cols_to_plot) < 3

    positions = []
    for col in cols_to_plot:
        # first position at which the name occurs in col_names
        matches = np.where(col_names == col)[0]
        positions.append(matches[0])

    if len(positions) == 2:
        first, second = positions
        features = (first, second, (first, second))
    else:
        features = [positions[0]]

    fig, axs = pdep.plot_partial_dependence(tmodel,
                                            X,
                                            features,
                                            feature_names=col_names)
Esempio n. 14
0
 def figure_plot(self):
     """Plot partial dependence for every named feature and show the figure.

     The estimator and training features are taken from this instance; the
     feature names stored on ``self.__all`` serve both as the features to
     plot and as their display names. Plots are laid out in three columns.
     """
     plot_options = dict(
         features=self.__all.feature_names,
         feature_names=self.__all.feature_names,
         grid_resolution=100,
         n_cols=3
     )
     figure, _axes = plot_partial_dependence(self.__gbc,
                                             self.__train_feature,
                                             **plot_options)
     plt.show()
Esempio n. 15
0
def plot_partial_dependence(est,
                            X,
                            features,
                            fnames,
                            tag,
                            n_jobs=-1,
                            verbosity=0,
                            directory=None):
    r"""Display a Partial Dependence Plot.

    Parameters
    ----------
    est : estimator
        The scikit-learn estimator for calculating partial dependence.
    X : numpy array
        The data on which the estimator was trained.
    features : list of int
        Feature numbers of ``X``.
    fnames : list of str
        The feature names to plot.
    tag : str
        Unique identifier for the plot
    n_jobs : int, optional
        The maximum number of parallel jobs.
    verbosity : int, optional
        The amount of logging from 0 (minimum) and higher.
    directory : str
        Directory where the plot will be stored.

    Returns
    -------
    None : None.

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_partial_dependence.html#sphx-glr-auto-examples-ensemble-plot-partial-dependence-py

    """

    # This wrapper has the same name as scikit-learn's plotting function, so
    # the unqualified call in the original resolved to the wrapper itself and
    # failed on the unknown ``feature_names`` keyword. Import the library
    # function under a distinct local name instead.
    from sklearn.ensemble.partial_dependence import (
        plot_partial_dependence as sk_plot_partial_dependence)

    logger.info("Generating Partial Dependence Plot")

    # Plot partial dependence

    fig, axs = sk_plot_partial_dependence(est,
                                          X,
                                          features,
                                          feature_names=fnames,
                                          grid_resolution=50,
                                          n_jobs=n_jobs,
                                          verbose=verbosity)
    title = "Partial Dependence Plot"
    fig.suptitle(title)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    # Save the plot
    # NOTE(review): ``model`` is not a parameter of this function; it must be
    # defined at module level for this call to work - confirm.
    write_plot(model, 'matplotlib', plt, 'partial_dependence', tag, directory)
def main():
    """Train a GBRT on California housing and show partial dependence plots.

    Returns early with a message if the dataset download raises
    ``HTTPError``. Otherwise two figures are produced: the convenience grid
    from ``plot_partial_dependence`` for four single features plus one
    feature pair, and a custom 3D surface built from ``partial_dependence``
    for the feature pair ``(1, 5)``.
    """
    # fetch California housing dataset
    try:
        cal_housing = fetch_california_housing()
    except HTTPError:
        print("Failed downloading california housing data.")
        return

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    # A bare ``print`` is a no-op expression on Python 3; print('') emits the
    # intended blank line on both Python 2 and Python 3.
    print('')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print('')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    # reshape the flat partial dependence values onto the meshgrid layout
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
Esempio n. 17
0
def plot_gradiant(clf, X_train, y_train, features):
    """Fit ``clf`` and show its partial dependence plots.

    Parameters
    ----------
    clf : estimator
        Fitted in place on ``X_train`` / ``y_train`` before plotting.
    X_train, y_train
        Training data and targets.
    features : dict
        Its keys are passed as the features to plot and its values as the
        feature names.
    """
    clf.fit(X_train, y_train)
    # On Python 3, dict.keys()/dict.values() are views that cannot be
    # indexed; the plotting helper needs real sequences.
    feature_ids = list(features.keys())
    feature_labels = list(features.values())
    fig, axs = plot_partial_dependence(clf, X_train, feature_ids,
                                       feature_names=feature_labels,
                                       grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # leave room for the suptitle
    # NOTE(review): this opens a second, empty figure before show(); it looks
    # like a leftover from the example this was adapted from - confirm.
    fig = plt.figure()
    plt.show()
Esempio n. 18
0
 def plot(self, f_0, f_1=0):
     """Print feature importances and show partial dependence per class label.

     For each label from 1 to 7, partial dependence of ``self.clf`` on
     ``f_0``, on ``f_1`` and on the pair ``(f_0, f_1)`` is plotted from
     ``self.X``, shown, and the figure is cleared afterwards.
     """
     # Single-argument print() calls give the same output on Python 2 and
     # Python 3 (the original used Python 2 print statements).
     print('feature importance:')
     for index, value in enumerate(self.clf.feature_importances_):
         print('{} :  {}'.format(index, value))
     features = [f_0, f_1, (f_0, f_1)]
     for i in range(1, 8):
         fig, axs = plot_partial_dependence(self.clf, self.X, features, label=i)
         pl.show()
         pl.clf()
Esempio n. 19
0
def get_GB_cls_metrics(data_fh,info):
    """
    Plot the metrics of Gradient Boost classification models

    Reads the pickled training results from `data_fh` and, for each plot
    whose output file does not exist yet, saves under `<info.prj_dh>/data_ml/`:

    - `partial_dep.<basename>.pdf`: partial dependence plots for those of the
      six top-ranked features whose name does not contain both ') ' and ' ('.
    - `featimps.<basename>.pdf`: horizontal bar chart of the ten largest
      feature importances.

    :param data_fh: path to file containing Classification training data
    :param info: object providing `prj_dh` (output root) and `cores`
    :returns: False if the file cannot be read or has no 'gs_cv' entry,
              otherwise None
    """
    from pylab import figtext
    try:
        dpkl=read_pkl(data_fh)
    except Exception:
        # Best-effort: an unreadable file means there is nothing to plot.
        # (Narrowed from a bare except so Ctrl-C / SystemExit still propagate.)
        return False
    if 'gs_cv' not in dpkl:
        return False
    dXy=dpkl['dXy_final']
    ycol=dpkl['ycol']
    gs_cv=dpkl['gs_cv']
    feat_imp = dpkl['feat_imp']

    Xcols=[c for c in dXy.columns.tolist() if c!=ycol]
    est=gs_cv.best_estimator_
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same array on old and new pandas.
    X=dXy.loc[:,Xcols].values

    #partial dep
    plot_type='partial_dep'
    plot_fh='%s/data_ml/%s.%s.pdf' % (info.prj_dh,plot_type,basename(data_fh))
    if not exists(plot_fh):
        feats_indi=[s for s in feat_imp.head(6).index.tolist() if not ((') ' in s) and (' (' in s))]
        features=[Xcols.index(f) for f in feats_indi]
        feature_names=linebreaker(Xcols)
        from sklearn.ensemble.partial_dependence import plot_partial_dependence
        fig, axs = plot_partial_dependence(est, X, features,
                                           feature_names=feature_names,
                                           n_jobs=int(info.cores), grid_resolution=50,
                                           n_cols=2,
                                           line_kw={'color':'r'},
                                           figsize=[7,9])
        figtext(0.9,-0.2,'AUC = %.2f' % gs_cv.best_score_,ha='right',color='b')
        saveplot(plot_fh,form='pdf',tight_layout=False)

    #relimp
    plot_type='featimps'
    plot_fh='%s/data_ml/%s.%s.pdf' % (info.prj_dh,plot_type,basename(data_fh))
    if not exists(plot_fh):
        featst=10
        fig=plt.figure(figsize=(3,featst*0.75))
        ax=plt.subplot(111)
        # ascending sort, so tail(featst) holds the largest importances
        feat_imp=feat_imp.sort_values(by='Feature importance',ascending=True)
        feat_imp.index=linebreaker(feat_imp.index, break_pt=30)
        feat_imp.tail(featst).plot(kind='barh',ax=ax, color='red')
        ax.set_xlabel('Feature Importance')
        ax.legend([])
        figtext(0.9,-0.2,'AUC = %.2f' % gs_cv.best_score_,ha='right',color='b')
        saveplot(plot_fh,form='pdf',tight_layout=False)
Esempio n. 20
0
def plot_gbm(gbm, model_name, train_X, train_Y, test_X, test_Y, train_fea_name_list):
    """Report metrics for a fitted GBM and save its diagnostic plots.

    Prints train/test AUC and accuracy (as returned by ``data_predict``) and
    the feature importances in descending order, then saves three images
    under ``./data/<model_name>/``: the combined train/test ROC curves, the
    relative feature importances, and partial dependence plots.

    ``train_fea_name_list`` gives the feature names, in the same order as
    ``gbm.feature_importances_``.
    """
    img_dir = './data/' + model_name + '/'

    train_img_path = img_dir + 'roc_train_' + model_name + '.png'
    test_img_path = img_dir + 'roc_test_' + model_name + '.png'
    all_img_path = img_dir + 'roc_all_' + model_name + '.png'
    importance_img_path = img_dir + 'importance_' + model_name + '.png'
    pdp_img_path = img_dir + 'pdp_' + model_name + '.png'

    train_fpr, train_tpr, train_auc, train_accuracy = data_predict(gbm, train_X, train_Y, train_img_path)
    test_fpr, test_tpr, test_auc, test_accuracy = data_predict(gbm, test_X, test_Y, test_img_path)

    # Single-argument print() calls give the same output on Python 2 and
    # Python 3 (the original used Python 2 print statements).
    print('train data auc %.4f, test data auc %.4f' % (train_auc, test_auc))
    print('train data accuracy %.4f, test data accuracy %.4f' % (train_accuracy, test_accuracy))
    print('**************** feature importance *****************')
    imp_items = zip(train_fea_name_list, gbm.feature_importances_)
    sorted_imp_items = sorted(imp_items, key=lambda x: x[1], reverse=True)
    for name, imp in sorted_imp_items:
        print('%s: %.4f' % (name, imp))

    # *********** plot auc **************
    plt.figure()
    plt.plot(train_fpr, train_tpr,
             label='train_auc ' + "%.2f" % (train_auc) + ', acc: ' + "%.2f" % (train_accuracy))
    plt.plot(test_fpr, test_tpr,
             label='test_auc ' + "%.2f" % (test_auc) + ', acc: ' + "%.2f" % (test_accuracy))
    plt.legend()
    plt.savefig(all_img_path)

    # *************** plot importance ***************
    feature_importance = gbm.feature_importances_
    # scale so the most important feature is 100
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure()
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, np.array(train_fea_name_list)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.legend()
    plt.savefig(importance_img_path)

    # *************** plot partial dependence ***************
    plt.figure()
    fig, axs = plot_partial_dependence(
        gbrt=gbm, X=train_X,
        features=[
            'PDnnSim', 'Bm25Sim', 'QueryLen', 'DocIdf',
            ('PDnnSim', 'Bm25Sim')
        ],
        feature_names=np.array(train_fea_name_list),
        n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95))
    plt.legend()
    plt.savefig(pdp_img_path)
def show_the_pdp(clf, xtrain, feature_li, feature_nam):
    """Show partial dependence plots of ``clf`` in a three-column grid.

    ``feature_li`` selects the features to plot from ``xtrain`` and
    ``feature_nam`` supplies the feature names. The figure gets a fixed
    title, its subplot spacing is widened, and it is displayed with
    ``plt.show()``.
    """
    figure, _ = plot_partial_dependence(
        clf, xtrain, feature_li,
        feature_names=feature_nam,
        grid_resolution=100,
        n_cols=3)
    heading = "Partial dependence plots for the tick activity using Gradient Boosting method"
    figure.suptitle(heading, size=20)
    # extra top margin for the title, extra gaps between the subplots
    figure.subplots_adjust(top=0.8, hspace=0.7, wspace=0.5)
    plt.show()
Esempio n. 22
0
def plot_gradiant(clf, X_train, y_train, features):
    """Fit ``clf`` and show its partial dependence plots.

    Parameters
    ----------
    clf : estimator
        Fitted in place on ``X_train`` / ``y_train`` before plotting.
    X_train, y_train
        Training data and targets.
    features : dict
        Its keys are passed as the features to plot and its values as the
        feature names.
    """
    clf.fit(X_train, y_train)
    # On Python 3, dict.keys()/dict.values() are views that cannot be
    # indexed; the plotting helper needs real sequences.
    feature_ids = list(features.keys())
    feature_labels = list(features.values())
    fig, axs = plot_partial_dependence(clf,
                                       X_train,
                                       feature_ids,
                                       feature_names=feature_labels,
                                       grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # leave room for the suptitle
    # NOTE(review): this opens a second, empty figure before show(); it looks
    # like a leftover from the example this was adapted from - confirm.
    fig = plt.figure()
    plt.show()
Esempio n. 23
0
    def plotPartial(self, nFeat=2):
        """Plot partial dependence of ``self.gbr`` on the first ``nFeat`` features.

        The features are the first ``nFeat`` entries of ``self.indices``;
        feature names come from ``final_cols``, which is read from outside
        this method. The figures are displayed with ``plt.show()``.
        """
        # Single-argument print() calls give the same output on Python 2 and
        # Python 3 (the original used Python 2 print statements).
        features = self.indices[:nFeat]
        print("features {}".format(features))
        featNames = final_cols
        print("FeatureNames {}".format(featNames))
        fig, axs = plot_partial_dependence(self.gbr, self.X, features, feature_names=featNames)

        print('_' * 80)
        print('Custom 3d plot via ``partial_dependence``')
        print('')
        # NOTE(review): a new figure is created here but nothing is drawn on
        # it; the announced 3d plot is not implemented in this method.
        fig = plt.figure()
        plt.show()
Esempio n. 24
0
def rph_graph(X, y, columns):
    """Fit a gradient boosting regressor and plot its first three partial dependences.

    ``X`` is passed through a default ``SimpleImputer`` before fitting;
    ``columns`` supplies the labels shown on the graphs. Nothing is returned.
    """
    imputed_X = SimpleImputer().fit_transform(X)

    regressor = GradientBoostingRegressor()
    regressor.fit(imputed_X, y)

    plot_partial_dependence(
        regressor,
        features=[0, 1, 2],     # column numbers of the plots to show
        X=imputed_X,            # predictors after imputation
        feature_names=columns,  # labels on graphs
        grid_resolution=10)     # number of values to plot on x axis
Esempio n. 25
0
def main():
    """Fit a GBRT on the standardized Boston data and show partial dependence.

    The features are standardized with ``StandardScaler``, split with
    ``cross_validation.train_test_split``, and a gradient boosting regressor
    is fitted on the training part. Partial dependence is plotted for four
    single features and one feature pair and shown with ``plt.show()``.
    """
    # Progress messages use single-argument print() calls so the output is
    # the same on Python 2 and Python 3 (the original used Python 2 print
    # statements, which do not parse on Python 3).
    print('loading data')
    boston = datasets.load_boston()
    print('data loaded')
    X, y = boston.data, boston.target
    print('X shape: {}'.format(X.shape))
    print('y shape: {}'.format(y.shape))

    scaler = StandardScaler()

    # I need to fit and transform the data with the scaler.. how do I put
    # this into pipeline?

    scaledX = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(scaledX, y)

    # then I will plot partial dependence to see how the features work
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=.1, loss='huber',
                                    random_state=1)

    print('training {} {}'.format(X_train.shape, y_train.shape))
    clf.fit(X_train, y_train)
    print('trained')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=None,
                                       n_jobs=3, grid_resolution=50)

    # NOTE(review): the title mentions the California housing dataset, but
    # the data loaded above is the Boston dataset - confirm the wording.
    fig.suptitle('Partial dependence of house value on nonlocation features\n' + 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)

    plt.show()

    # then I will PCA and plot partial dependence there
    # then lasso PCA

    # then select the parameters
    # plots are nice but shouldn't I be selecting based on multi-dimensional data?

    # then generate a list of multi-parameter algorithms
    # then do a parameter search with gradient boost and a few other multi-parameter algorithms
def multi_case_partial_dependence(df, cases, ests, stdzrs,
                n_oversamps, c_true, c_pred):
    """Show 2D and 3D partial dependence plots for several fitted models.

    For each case the columns whose name contains ``case[1]`` are dropped
    from a copy of ``df``, the data is split in time at ``'2016-06-01'``
    via ``train_test_split_time``, and two figures are drawn from the
    training part: the ``plot_partial_dependence`` grid for the hard-coded
    feature indices and a custom 3D surface for the pair ``(9, 18)``.
    ``plt.show()`` is called once per case.

    Parameters
    ----------
    df : DataFrame
        Source data; copied per case so drops do not accumulate.
    cases : sequence
        Items are indexed as ``case[0]`` (passed to the splitter, used in
        titles) and ``case[1]`` (substring selecting columns to drop).
    ests : sequence
        Estimators handed to the partial dependence functions, one per case.
    stdzrs, n_oversamps, c_true, c_pred : sequence
        Not used by the plotting itself; they only take part in the ``zip``
        and therefore bound the number of cases processed.
    """
    # Earlier versions created four unused result lists and an empty
    # ``plt.subplots`` figure here; the blank figure showed up as a stray
    # window on the first ``plt.show()``, so both were dropped.
    for case, est, _stdzr, _n, _c_t, _c_p in zip(cases, ests, stdzrs,
            n_oversamps, c_true, c_pred):
        data_df = df.copy() # copy to read all columns after dropping
        print('case: {}'.format(case[0]))

        # drop other binary and probability column
        c_drop = [c for c in list(df.columns) if case[1] in c]
        data_df.drop(c_drop, axis=1, inplace=True)

        # train test split in time; only the training features are plotted
        X_train, _y_train, _X_test, _y_test = train_test_split_time(data_df,
            '2016-06-01', case[0])
        names = list(X_train.columns)
        features = [11, 12, 13, 14, (9, 18)]
        # plot
        fig, axs = plot_partial_dependence(est, X_train, features,
                                           feature_names=names,
                                           n_jobs=3, grid_resolution=50)
        fig.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

        print('Custom 3d plot via ``partial_dependence``')
        fig = plt.figure()

        target_feature = (9, 18)
        pdp, axes = partial_dependence(est, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        # Reshape the first pdp row onto the grid spanned by both axes.
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu, edgecolor='k')
        ax.set_xlabel(names[target_feature[0]])
        ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        #  pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)

        plt.show()
Esempio n. 27
0
    def plot_partial_dependencies(self, colnames):
        """Show partial dependence plots for the ten highest-ranked features.

        Features are ranked by ``self.model.feature_importances_`` in
        descending order; the plots are computed on ``self.X`` and labelled
        with ``colnames``. The figure is 20x20 inches and is displayed with
        ``plt.show()``.
        """
        ascending_rank = np.argsort(self.model.feature_importances_)
        top10_colindex = ascending_rank[::-1][:10]

        pdp_options = dict(features=top10_colindex,
                           feature_names=colnames,
                           figsize=(20, 20),
                           grid_resolution=100)
        fig, _ = plot_partial_dependence(self.model, self.X, **pdp_options)

        # Size is enforced again after creation, then the layout is tightened.
        fig.set_figwidth(20)
        fig.set_figheight(20)
        fig.tight_layout()
        plt.show()
Esempio n. 28
0
def generatePDP(modelObj,
                featureVector,
                trainingX,
                outputFolder,
                importance=10):
    """Save one partial dependence plot per sufficiently important feature.

    Parameters
    ----------
    modelObj
        Fitted model handed to ``returnFeatureImportance`` and
        ``plot_partial_dependence``.
    featureVector
        Object exposing ``feature_names_``; the position of a name in that
        sequence is used as the feature id.
    trainingX
        Data the partial dependence is evaluated on.
    outputFolder : str
        Path prefix for the images; files are written to
        ``outputFolder + <feature name> + "_PD.png"``, so it should end
        with a path separator.
    importance : number, default 10
        Only features whose 'Relative Importance' exceeds this are plotted.
    """
    #Create Partial Dependenct directory to hold all PD plots
    pdDir = outputFolder
    #if the output Partial Dependency Directory doesn't exist, create it
    pdParent = os.path.dirname(pdDir)
    # An empty dirname means "current directory": nothing to create
    # (os.mkdir('') would raise). makedirs also handles nested paths.
    if pdParent and not os.path.exists(pdParent):
        print("Output Directory: " + pdDir + " Doesn't exist. Creating it now")
        os.makedirs(pdParent)
    # to generate feature importance
    featureImportanceDF = returnFeatureImportance(modelObj, featureVector)
    #Select only the important features
    featureImportanceDF = featureImportanceDF[
        featureImportanceDF['Relative Importance'] > importance]
    # to generate PDP, create a list of features
    featureName = list(featureVector.feature_names_)
    featureId = list(range(len(featureName)))
    features = pd.DataFrame([featureId, featureName]).transpose()
    features.columns = ['FeatureId', 'FeatureName']
    #Get the feature id for the important features
    featureImportanceDF = pd.merge(featureImportanceDF,
                                   features,
                                   how='left',
                                   on='FeatureName')

    #Generate PD Plots
    for featId, rawName in zip(featureImportanceDF['FeatureId'],
                               featureImportanceDF['FeatureName']):
        # '/' would be read as a directory separator in the file name
        featName = rawName.replace('/', '_')
        fig, axs = plot_partial_dependence(modelObj,
                                           trainingX,
                                           [featId],
                                           featureVector.feature_names_,
                                           n_jobs=-1)
        plt.subplots_adjust(top=0.9)
        #save the plot in the output directory with the feature name as file name
        fig.savefig(pdDir + featName + "_PD.png")
        plt.close(fig)
Esempio n. 29
0
def plot_features(model, feature_names, target, x):
    """Plot the partial dependence of every feature in the feature set.

    Parameters
    ----------
    model
        Fitted estimator passed to ``plot_partial_dependence``.
    feature_names : sequence
        Names of the columns of ``x``; one plot is drawn per name.
    target
        Name of the predicted quantity, shown in the figure title.
    x
        Data the partial dependence is evaluated on.
    """
    plt.figure(figsize=(20, 10))
    fig, _ = plot_partial_dependence(model,
                                     x,
                                     range(len(feature_names)),
                                     feature_names=feature_names,
                                     n_jobs=-1,
                                     n_cols=4,
                                     grid_resolution=50)
    # The title previously had no placeholder, so ``target`` never appeared.
    fig.suptitle('Partial dependence of features predicting {}'.format(target))
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
    plt.subplots_adjust(right=1.5)
    print('_' * 80)
    print('Custom plot via ``partial_dependence``')
    # A bare ``print`` is a no-op expression on Python 3; print an explicit
    # empty line instead.
    print('')
    #fig_size = plt.rcParams["figure.figsize"]
    plt.rcParams["figure.figsize"] = [12, 8]
    plt.show()
def plot_features(model, feature_names, target, x):
    """Plot the partial dependence of every feature in the feature set.

    Parameters
    ----------
    model
        Fitted estimator passed to ``plot_partial_dependence``.
    feature_names : sequence
        Names of the columns of ``x``; one plot is drawn per name.
    target
        Name of the predicted quantity, shown in the figure title.
    x
        Data the partial dependence is evaluated on.
    """
    plt.figure(figsize=(20, 10))
    fig, _ = plot_partial_dependence(model,
                                    x,
                                    range(len(feature_names)),
                                    feature_names=feature_names,
                                    n_jobs=-1,
                                    n_cols=4,
                                    grid_resolution=50)
    # The title previously had no placeholder, so ``target`` never appeared.
    fig.suptitle('Partial dependence of features predicting {}'.format(target))
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
    plt.subplots_adjust(right=1.5)
    print('_' * 80)
    print('Custom plot via ``partial_dependence``')
    # A bare ``print`` is a no-op expression on Python 3; print an explicit
    # empty line instead.
    print('')
    #fig_size = plt.rcParams["figure.figsize"]
    plt.rcParams["figure.figsize"] = [12, 8]
    plt.show()
def make_part_plot(mdoel, features):
    """Plot partial dependence for the 15 most important features.

    Parameters
    ----------
    mdoel
        Fitted estimator exposing ``feature_importances_``. The parameter
        name is a misspelling of "model" and is kept so existing keyword
        callers continue to work.
    features : sequence
        Feature names used to label the plots.

    Returns
    -------
    (fig, axs)
        The figure and axes returned by ``plot_partial_dependence``.
    """
    # The body used to read a global ``model`` and silently ignore the
    # argument; bind the argument to the name the body uses.
    model = mdoel

    # Indices of the 15 largest importances, most important first.
    importance = np.argsort(model.feature_importances_)[::-1][:15]

    # NOTE(review): ``df`` is not a parameter; it is resolved from the
    # enclosing module scope.
    fig, axs = plot_partial_dependence(model, df, importance,
                                       feature_names=features,
                                       n_jobs=3, grid_resolution=50,
                                       label='no traction')

    for ax in axs:
        name = ax.get_xlabel()
        ax.set_xlabel(name, fontsize=16)

        # Axes starting above 1500 are clamped to 2004-2016; presumably
        # these are year-valued features -- TODO confirm.
        if ax.get_xlim()[0] > 1500:
            ax.set_xlim(2004, 2016)

    fig.suptitle('Partial Dependence Plot for Selected Features \n'
                 ' Effect on Post gaining more likes and comments',
                 fontsize=24)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    plt.show()
    return fig, axs
Esempio n. 32
0
def plot_partial_dependence(X_train, y_train, include_features=None, n_ways=1):
    """Plot one-way or two-way partial dependencies (cf. Friedman 2001 or ESL).

    A model is trained with ``train_gbrt(X_train, y_train)`` and its partial
    dependence on the selected features is drawn.

    Parameters
    ----------
    X_train
        Training features; iterating over it must yield the raw column
        names (their position is used as the feature index).
    y_train
        Training target, forwarded to ``train_gbrt``.
    include_features : container or None
        If given, only raw names contained in it are plotted; otherwise all
        names present in ``FEATURE_NAMES`` (the non-categoricals) are.
    n_ways : int
        1 for single-feature plots, 2 for all pairs of selected features.

    Raises
    ------
    ValueError
        If ``n_ways`` is neither 1 nor 2.
    """
    raw_features = list(X_train)
    features, feature_names = [], []
    for i, raw_name in enumerate(raw_features):
        if raw_name in FEATURE_NAMES: # everything but categoricals
            # feature_name indexes match those of full training data column no.
            feature_names.append(FEATURE_NAMES[raw_name])
            if include_features is None or raw_name in include_features:
                features.append(i)
        else:
            # will never be used because categoricals are excluded but we
            # should keep track of indices nevertheless
            feature_names.append('Some categorical')
    assert len(feature_names) == len(raw_features)
    sys.stderr.write('Plotting %d-way partial dependence for %d features\n' %
                     (n_ways, len(features)))

    if n_ways == 1:
        target_features = features # one-way pdp
    elif n_ways == 2:
        target_features = list(combinations(features, 2)) # two-way pdp
    else:
        # ValueError is still an Exception, so broad handlers keep working.
        raise ValueError('only one-way and two-way partial dependence plots '
                         'allowed, %d given' % int(n_ways))

    reg = train_gbrt(X_train, y_train)
    fig, axs = partial_dependence.plot_partial_dependence(
        reg, X_train, target_features, figsize=(22, 12),
        feature_names=feature_names, n_jobs=3, grid_resolution=50
    )
    for ax in axs:
        ax.yaxis.label.set_size(8)
        ax.grid(True)
        for tick in ax.xaxis.get_major_ticks():
            # ``Tick.label`` was an alias of ``label1`` and has been removed
            # from matplotlib; ``label1`` works on old and new versions.
            tick.label1.set_fontsize(8)
    fig.tight_layout()
Esempio n. 33
0
def plot_partial(clf, X_train, features, feature_ids):
    """Save a single-curve partial dependence image for each requested feature.

    Parameters
    ----------
    clf
        Pipeline-like object; its ``named_steps['gbm']`` step is the model
        whose partial dependence is computed.
    X_train
        Data the partial dependence is evaluated on.
    features : sequence of str
        Feature names, indexed by feature id; used for the axis label and
        the output file name ``partial_dependence_<name>.png``.
    feature_ids : iterable of int
        Ids of the features to plot.
    """
    for i in feature_ids:

        pdp_fig, axs = plot_partial_dependence(clf.named_steps['gbm'], X_train,
                                               [i],
                                               feature_names=features,
                                               n_jobs=14,
                                               grid_resolution=30)

        curve = axs[0].lines[0]
        x = curve.get_xdata()
        y = curve.get_ydata()
        # The sklearn figure is only a source for the curve data; close it
        # so one hidden figure per feature does not pile up in pyplot.
        plt.close(pdp_fig)

        fig, ax = plt.subplots()
        fig.set_size_inches(5, 5)
        fig.subplots_adjust(left = 0.18, right = 0.9, bottom = 0.15, top = 0.9)
        ax.plot(x, y, '-', color = 'black', linewidth = 1)

        #ax.set_ylim(-1, 0.5)
        ax.set_ylabel('Partial Dependence', fontsize = 13)

        ax.set_xlabel(features[i], fontsize = 14)
        fig.savefig("partial_dependence_" + features[i] +  ".png")
Esempio n. 34
0
 def plot_2d(self,
             feature_2d,
             top=0.9,
             n_jobs=3,
             grid_resolution=50,
             figsize=(8, 9),
             subtitle=""):
     """Render partial dependence plots for ``feature_2d`` and save them.

     The plots are computed from ``self.model`` on ``self.feature_df`` and
     labelled with ``self.feature_list``. ``subtitle`` becomes the figure
     title and ``top`` the upper subplot margin; the remaining margins and
     spacings are fixed. The image is written to
     ``self.output_path``/``self.fig_file`` and the figure is then closed.
     """
     pdp_options = dict(gbrt=self.model,
                        X=self.feature_df,
                        features=feature_2d,
                        feature_names=self.feature_list,
                        n_jobs=n_jobs,
                        grid_resolution=grid_resolution,
                        figsize=figsize)
     fig, _ = plot_partial_dependence(**pdp_options)
     fig.suptitle(subtitle)

     layout = dict(top=top,
                   left=0.16,
                   bottom=0.07,
                   right=0.81,
                   wspace=0.98,
                   hspace=0.63)
     plt.subplots_adjust(**layout)

     target_file = os.path.join(self.output_path, self.fig_file)
     plt.savefig(target_file)
     plt.close()
def partial_dependence(df, y):
    # Train a gradient-boosted classifier on the engineered training
    # features and draw partial dependence plots, evaluated on the
    # engineered test features, for its six most important features.
    #
    # df, y: raw features and target, split by ``oversample_train_test``.
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Apply the transformers fitted on the training split. Re-fitting on
    # the test split (the previous ``fit_transform``) lets test data shape
    # the features and can produce columns that no longer line up with the
    # ones the classifier was trained on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    # argsort is ascending, so the last six entries are the most important.
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    mean_mse = mse.mean()
    mean_r2 = r2.mean()

    params = estimator.get_params()
    name = estimator.__class__.__name__
    print '%s Train CV | MSE: %.3f | R2: %.3f' % (name, mean_mse, mean_r2)
    return mean_mse, mean_r2 
    
# Evaluate both tuned models with ``cross_val`` on the training data ...
cross_val(gd_best, train_x, np.array(train_y))
cross_val(rf_best, train_x, np.array(train_y))

# ... and again on the test data.
cross_val(gd_best, test_x, test_y)
cross_val(rf_best, test_x, test_y)


# Horizontal bar chart of the gradient-boosting feature importances.
col_names = X.columns
# ascending order, so the most important feature ends up as the top bar
indices = np.argsort(gd_best.feature_importances_)
# plot as bar chart
figure = plt.figure(figsize=(10,7))
plt.barh(np.arange(len(col_names)),gd_best.feature_importances_[indices],
         align='center', alpha=.5)
# label each bar with the column name that belongs to its importance
plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14)
plt.xticks(fontsize=14)
_ = plt.xlabel('Relative importance', fontsize=18)

# One partial dependence plot per column of X, computed on the training data.
fig, axs = plot_partial_dependence(gd_best, train_x, range(X.shape[1]) ,
                                   feature_names=col_names, figsize=(15, 10))
fig.tight_layout()

Esempio n. 37
0
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.ensemble.partial_dependence import plot_partial_dependence


def get_some_data():
    """Load three predictors and the sale price from ``train.csv``.

    Returns the predictors 'LotArea', 'YearBuilt' and 'GrLivArea' after
    passing them through a default ``SimpleImputer``, together with the
    ``SalePrice`` column.
    """
    frame = pd.read_csv('train.csv')
    predictors = frame[['LotArea', 'YearBuilt', 'GrLivArea']]
    target = frame.SalePrice
    return SimpleImputer().fit_transform(predictors), target


# Imputed predictors and target, as returned by get_some_data().
X, y = get_some_data()
# scikit-learn originally implemented partial dependence plots only for Gradient Boosting models
# this was due to an implementation detail, and a future release will support all model types.
my_model = GradientBoostingRegressor()
# fit the model as usual
my_model.fit(X, y)
# Here we make the plot: two of the three predictors, addressed by column
# position in X.
my_plots = plot_partial_dependence(
    my_model,
    features=[0, 2],  # column numbers of plots we want to show
    X=X,  # raw predictors data.
    feature_names=['LotArea', 'YearBuilt', 'GrLivArea'],  # labels on graphs
    grid_resolution=10)  # number of values to plot on x axis
Esempio n. 38
0
# Target as an (n_samples, 1) array. ``DataFrame.as_matrix`` was deprecated
# in pandas 0.23 and removed in 1.0; selecting the column list and taking
# ``.values`` yields the same array on old and new pandas.
np_y = train[['Sales']].values


clf = ensemble.GradientBoostingRegressor(n_estimators=1000,
                                         max_depth=5,
                                         max_features=5,
                                         min_samples_split=6,
                                         min_samples_leaf=6,
                                         learning_rate=0.1, loss='ls')
cross_validate(np_x, np_y, np_weekInd, 10, estimator=clf)

# Bare expression: only displays something in an interactive session.
clf.feature_importances_

from sklearn.ensemble.partial_dependence import plot_partial_dependence
# Two single-feature plots and their pairwise interaction.
features = [0,1,(0, 1)]
plot_partial_dependence(clf, np_x, features)




test = read_test_df()

# Rows with a missing 'Open' flag are treated as open.
test.loc[test['Open'].isnull(), 'Open'] = 1

# Constant placeholder values for these columns of the test frame.
test['Promo2'] = 0
test['StoreType'] = 0
test['Assortment'] = 0
test['CompetitionDistance'] = 0
test['HasCompetitor'] = -1
test['CompetingMonths'] = 0
Esempio n. 39
0
# Confusion matrix of the gradient-boosting predictions. The predictions
# are passed as the first argument and the true outcomes as the second.
confusion_matrix(res_gb, test_outcome)
#precision_recall_curve(test_outcome, res_gb)
## empirical misclassification error: one minus (diagonal sum / total count)
1 - (np.diag(confusion_matrix(res_gb, test_outcome)).sum())/(confusion_matrix(res_gb, test_outcome).sum())
# 0.22595596755504055

## misclassification error per class:

# Diagonal entries (agreements) and column totals for the random forest
# (rf) and gradient boosting (gb) predictions.
drf = np.diag(confusion_matrix(res_rf, test_outcome))
dgb = np.diag(confusion_matrix(res_gb, test_outcome))
crf = confusion_matrix(res_rf, test_outcome).sum(axis=0)
cgb = confusion_matrix(res_gb, test_outcome).sum(axis=0)
# NOTE(review): the buffers are sized 7 while the loop runs over
# len(fault_types); presumably there are seven fault types -- confirm.
errors_rf = np.zeros(7)
errors_gb = np.zeros(7)
for i in range(len(fault_types)):
	errors_rf[i] = 1 - (drf[i])/(crf[i])
	errors_gb[i] = 1 - (dgb[i])/(cgb[i])

#partial dependence plots for the first class of the fitted booster
features = [1, 10, 14, (10, 14)]
fig, axs = plot_partial_dependence(gbfit, trainset, features,label=gbfit.classes_[0],n_jobs=2, grid_resolution=50)
fig.suptitle('Partial dependence of X_Maximum, Length_of_Conveyer and Edges_Index for Pastry faults')
####### comments #######
# The better mse of the GBM is very likely due to a better recognition
# of the Z_scratch fault. A small contribution also comes from Bumps.
# The most frequently mistaken fault types are bumpiness and other faults.
# In the case of other faults, this was quite expected: firstly because the
# class is large compared to the others. Secondly, because the class
# 'other' is too broad and not well defined. Therefore it is likely that it
# shares many common features with the remaining fault types. Plot only
# this group to see if there are better clusters.
Esempio n. 40
0
    'learning_rate': 0.01,
    'loss': 'ls'
}

# Fit the regressor with the hyper-parameters collected in ``params``.
gbr = GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)
# Cross-tabulate actual targets against predictions rounded to integers.
# Bare expression: only displays something in an interactive session.
pd.crosstab(y_test,
            gbr.predict(X_test).round(),
            rownames=['Actual'],
            colnames=['Predicted'])

# Feature importances, largest first (also display-only).
pd.DataFrame({
    'Variable': X_test.columns,
    'Importance': gbr.feature_importances_
}).sort_values('Importance', ascending=False)

# Partial dependence plots for three features selected by name.
fig, axs = plot_partial_dependence(
    gbr,
    X=X_test,
    features=['Parhelion Patrol', 'Rubblebelt Boar', 'Hammer Dropper'],
    feature_names=feature_list,
    n_jobs=1,
    grid_resolution=10)

# Maps a feature name to the values taken from the first result of
# partial_dependence for that feature.
allpd = {}

# NOTE(review): the range stops one short of len(feature_list), so the last
# entry of feature_list is skipped -- confirm this is intended.
for i in range(len(feature_list) - 1):
    key, values = partial_dependence(gbr, target_variables=i, X=X_test)
    # zip over a one-element list pairs the name with key.tolist()[0] only
    allpd.update(dict(zip([feature_list[i]], key.tolist())))

# One column per feature; wrapping in Series lets columns differ in length.
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in allpd.items()]))
# classify 10% of others

# Create partial dependence plot on most important features for gbm

importances = pd.DataFrame(gbm_grid.best_estimator_.feature_importances_, 
index = df.columns, columns = ['importance'])

# ``DataFrame.sort(columns=...)`` was removed from pandas; ``sort_values``
# is the replacement and performs the same in-place descending sort.
importances.sort_values(by='importance', ascending=False, inplace=True)
# Parenthesised single-argument print works on Python 2 and 3 alike.
print(importances)

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Column positions of the three most important features.
features = [i for i,j in enumerate(df.columns.tolist()) if j in
importances.importance[0:3].index.tolist()]

fix, axs = plot_partial_dependence(gbm_grid.best_estimator_, df,
features, feature_names = df.columns)


################################
# Read in the testing set and prep it
################################

## Read in the test dataset
df_test = pd.read_csv("C:\\Users\\garauste\\Dropbox\\General Assembly\\Project\\Titanic\\Titanic Data\\test.csv")
df_test.head()

# NOTE: this binds a second name to the same frame; it is not a copy.
df_submit = df_test

## Creating a function to pull out the titles of the Passengers

def find_between( s, first, last ):
# Feature names of the California housing data, used as plot labels.
names = cal_housing.feature_names

print('_' * 80)
print("Training GBRT...")
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(X_train, y_train)
print("done.")

print('_' * 80)
print('Convenience plot with ``partial_dependence_plots``')
# NOTE(review): a bare ``print`` only emits a blank line as a Python 2
# statement; as a Python 3 expression it does nothing.
print

# Four single features and one (5, 1) interaction, by column index.
features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print
fig = pl.figure()

# Two-way partial dependence for the feature pair (1, 5), evaluated on a
# 50-point grid per feature and reshaped onto the mesh for a 3D surface.
target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                           X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
params = [100,500,1000,1500,650,700,750]

for max_leaf_nodes in params:
	mea = getmea(max_leaf_nodes,train_x,val_x,train_y,val_y)
    #test_scores.append(np.mean(mea)))
	print("Max_leaf_nodes: %d ,mea: %d" %(max_leaf_nodes,mea))
plt.plot(params, test_scores)
plt.title("max_leaf_nodes Error" + str(params));
plt.show()
'''

my_model = GradientBoostingRegressor(n_estimators=10)
my_model.fit(X, y)
my_plots = plot_partial_dependence(my_model, 
                                   features=[0,1,2], 
                                   X=X, 
                                   feature_names=cols_to_use, 
                                   grid_resolution=20)
plt.show()
#melbourne_predictors = ['Rooms','Bathroom','Landsize','BuildingArea','YearBuilt','Lattitude','Longtitude']
#X = melbourne_data[melbourne_predictors]

# split data into train and validation
# how to know test_size and random_state?
#train_x,val_x,train_y,val_y = train_test_split(X,y,test_size=0.25,random_state = 0)

# find max_leaf_nodes, then get 400
'''
def getmea(max_leaf_nodes,mea_train_x,mea_test_x,mea_train_y,mea_test_y):
	model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes,random_state = 0)
	model.fit(mea_train_x,mea_train_y)
Esempio n. 44
0
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Names of the model's input columns, in column order.
selected_features = ['body_length','fb_published','gts','has_analytics','has_header', 'has_logo',
'name_length','num_order','num_payouts','org_facebook','org_twitter','sale_duration','sale_duration2',
'show_map','user_age','min_price','max_price', 'mean_price', 'total_revenue', 'total_tix_sold',
'total_tix_offered','A','C','E','G','M','N','U','num_caps_freq','email_.com','email_.gov',
'email_.org','email_.other','delivery_method_0.0','delivery_method_1.0','delivery_method_3.0','delivery_method_nan',
'state_GREATER_LONDON','state_FL','state_LONDON','state_GT_LON','state_DE','state_BIRMINGHAM','state_PA',
'state_NV','state_NH','state_GA','state_ENGLAND','country_US','country_IE','country_FR','country_CA',
'country_GB','country_AU','country_ES','country_NL','country_DE','country_VN','country_NZ','country_PK',
'country_MA','country_A1','country_other','previous_payout']

# As an array, so that ``==`` compares element-wise in the lookups below.
selected_features = np.array(selected_features)

# Positions of the two features to plot, looked up by name.
features = [np.where(selected_features == 'total_tix_sold')[0][0], np.where(selected_features == 'mean_price')[0][0]]


fig, axs = plot_partial_dependence(gb4000_clf, X_train, features, feature_names = selected_features,
                                   n_jobs = -1, grid_resolution = 100)
fig.suptitle('Partial dependence of fraud detection features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
#fig.tight_layout()

plt.show()
Esempio n. 45
0
# from sklearn.ensemble.partial_dependence import partial_dependence
# from sklearn.ensemble import GradientBoostingClassifier

# pdp, axes = partial_dependence(clf, [0], X=X)
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, y)
# print pdp
# print axes

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Synthetic data from make_hastie_10_2, seeded for reproducibility.
X, y = make_hastie_10_2(random_state=0)
# Boosted depth-1 trees (max_depth=1), fitted in the same expression.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, y)
# Features 0 and 1 individually, plus their two-way interaction.
features = [0, 1, (0, 1)]
fig, axs = plot_partial_dependence(clf, X, features) 
Esempio n. 46
0
# Everything from ``offset`` onwards is held out for testing.
X_test, y_test = X[offset:], y[offset:]

params = {'n_estimators': 500, 'learning_rate': 0.08, 'max_depth': 3, 'min_samples_leaf': 1}

clf = GradientBoostingRegressor(**params)

# Parenthesised single-argument prints work on Python 2 and 3 alike.
print('Training...')
clf.fit(X_train, y_train)

mse = mean_squared_error(y_test, clf.predict(X_test))
print("RMSE: %.4f" % np.sqrt(mse))

# NOTE(review): joblib.load unpickles the file; only load model files from
# a trusted source.
clf_full_data = joblib.load('model/model.pkl')
print('Generating graphs - partial dependance...')
# One image per feature name, written to graphs/_<lower_snake_name>.png.
for idx, x in enumerate(features):
    fig, axs = partial_dependence.plot_partial_dependence(clf_full_data, X, [features[idx]], feature_names=list(features))
    fig.savefig('graphs/_%s.png' %x.lower().replace(' ', '_'))

###############################################################################
# Plot training deviance

# compute test set deviance
# The loss is real-valued: an int64 buffer would truncate every stored
# value towards zero, so the buffer must be floating point.
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

deviance_plot = plt
deviance_plot.figure(figsize=(12, 6))
deviance_plot.title('Deviance')
deviance_plot.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
Esempio n. 47
0
# Missing values are replaced by 0 before modelling.
df_for_reg.fillna(0, inplace=True)
X = df_for_reg[num_predictor] 
y = df_for_reg['domestic_gross']
    
# Ordinary least squares with an intercept column as a baseline.
X_mat = sm.add_constant(X)
linmodel = sm.OLS(y, X_mat).fit()
# Parenthesised single-argument prints work on Python 2 and 3 alike.
print(linmodel.summary())
# Residuals and fitted values against the observed target (same axes).
plt.scatter(y, linmodel.resid)
plt.scatter(y, linmodel.fittedvalues)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,learning_rate=0.1)
clf.fit(X = df_for_reg[num_predictor], y = df_for_reg['domestic_gross'])                                
# ``feature_names`` is indexed by feature position. Passing the bare string
# 'wide_release_log' labelled feature 0 with its first character ('w');
# pass the column names of the training frame instead.
fig, axs = plot_partial_dependence(clf, df_for_reg[num_predictor], [0],
                                   feature_names=list(num_predictor),
                                   grid_resolution=50)


# Scatter plot of every predictor against the target, one figure each.
for item in num_predictor:
    print(item)
    plt.scatter(df_for_reg[item] , df_for_reg['domestic_gross'])
    plt.show()






Esempio n. 48
0
from gradientboost import GradientBoost
from sklearn.ensemble.partial_dependence import plot_partial_dependence, partial_dependence
import pandas as pd
import pickle
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load model files from a trusted source.
    with open('modelg2.p', 'rb') as f:
        model = pickle.load(f)

    # All columns of the model's data except the 'fraud' column.
    feature_names = list(model.data.columns)
    feature_names.remove('fraud')

    # Resolved merge conflict: the file contained unresolved
    # ``<<<<<<<`` / ``=======`` / ``>>>>>>>`` markers, which are a syntax
    # error. The HEAD side (an explicit selection of features plus a
    # title) is kept; the other side plotted every feature untitled.
    features = ['event_delay', 'name_length', 'user_created', 'venue_address', 'avg_price', 'num_payouts']

    fig, axs = plot_partial_dependence(model.m, model.X_train, features, feature_names = feature_names)
    # Figures have no ``set_title``; the figure-level title is ``suptitle``.
    fig.suptitle('Partial Dependency Plots')
Esempio n. 49
0
test = pd.read_csv('pickle_cellar/test_data.csv')
# ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` returns the
# same array on old and new pandas.
np_test_x = test.values
test_y_hat = clf.predict(np_test_x)
# Submission ids are 1-based row numbers.
ind = range(1, test_y_hat.shape[0] + 1)
# Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator, which
# older pandas versions refuse as DataFrame input.
result = list(zip(ind, test_y_hat))
submission = pd.DataFrame(result, columns=["Id","Sales"])
submission.to_csv('submissions/gb_storeid_dow_model.csv', index=False)

from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

# Two-way plots for every ordered pair of the first 16 features.
# NOTE(review): this opens one figure per pair without closing any, and
# each unordered pair is plotted again by the combinations loop below.
for i in range(0, 16):
    for t in range(0, 16):
        if i != t:
            fig, axs = plot_partial_dependence(clf, np_x, [(i,t)], feature_names=train_x.columns,
                                   n_jobs=-1, grid_resolution=20)

features = [3, 14, (3, 14)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=train_x.columns,
                                   n_jobs=-1, grid_resolution=20)


from itertools import combinations
# Each unordered pair of the first 16 features exactly once.
aa = combinations(range(0, 16), 2)
for i,t in aa:
    fig, axs = plot_partial_dependence(clf, np_x, [(i,t)], feature_names=train_x.columns,
                                   n_jobs=-1, grid_resolution=20)



pred = clf.predict(X_test)
# Using gradient boost regressor to plot partial dependence plot
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.impute import SimpleImputer

# The PDP will show us the relationship between the target and its features
# Predictor columns; the list is also reused below as the plot labels.
features = ['Distance', 'Landsize', 'BuildingArea']
data = pd.read_csv('../data/melb_data.csv')
y = data.Price
X = data[features]
# Pass the predictors through a default SimpleImputer; the result replaces X.
imputer = SimpleImputer()
X = imputer.fit_transform(X)

model = GradientBoostingRegressor()
model.fit(X, y)
# One partial dependence plot per predictor, addressed by column position
# in X, each evaluated on a 40-point grid.
fig, plots = plot_partial_dependence(model,
                                     features=[0, 1, 2],
                                     X=X,
                                     feature_names=features,
                                     grid_resolution=40)

fig.show()
# Wait for the user to press enter before the script continues.
input('Press enter to continue...')
 print predictors[i], temp_pd[0].shape
 
 if temp_pd[0].shape[1] == gr:
     temp_output = numpy.empty(gr,dtype=[('model', '|S255'), ('n_split', 'i1'), ('lr','f4'),('n_tree','i4'),('pred','|S255'),('pdp_x','f4'),('pdp_y','f4')])
     temp_output['model'] = shelf_file
     temp_output['n_split'] = tree_depth
     temp_output['lr'] = learning_rate
     temp_output['n_tree'] = n_trees
     temp_output['pred'] = predictors[i]
     temp_output['pdp_x'] = temp_pd[0]
     temp_output['pdp_y'] = numpy.array(temp_pd[1]) 
         
     numpy.savetxt(fname=pd_table,X=temp_output,delimiter=',',fmt=['%s','%d','%0.4f','%d','%s','%0.4f','%0.4f'])
     
 #fig, axs = plot_partial_dependence(clf, X_train, [i], grid_resolution=30) #, feature_names=predictors[i])    #, n_jobs=32, grid_resolution=50)
 fig, axs = plot_partial_dependence(clf, X_temp, [i], grid_resolution=gr) #, feature_names=predictors[i])    #, n_jobs=32, grid_resolution=50)
 
 # set x and y axis limits
 if prd=="eco_l1":
     xmin = numpy.nanmin(X_temp[:,i])
     xmax = numpy.nanmax(X_temp[:,i])
 else:
     xmin = numpy.nanpercentile(a=X_temp[:,i], q=2.5)
     xmax = numpy.nanpercentile(a=X_temp[:,i], q=97.5)
 
 plt.xlim( (xmin, xmax) )
 print predictors[i], xmin, numpy.nanmean(X_temp[:,i]), xmax
 
 plt.xlabel(predictors[i])
 plt.savefig( pd_dir + predictors[i] + ".png", dpi=100)
 plt.close(fig)
Esempio n. 52
0
def gradientBoosting():
    """Cross-validate a gradient boosting regressor and plot its diagnostics.

    Reads the module-level ``boston_X`` / ``boston_Y`` arrays, runs a K-fold
    cross-validation and, for every fold, prints MAE / MSE / R2 and shows a
    predicted-vs-true scatter plot. After the loop it prints the averaged
    metrics and shows, in order:

    * the averaged relative feature importances,
    * the averaged train / test deviance per boosting iteration,
    * the MSE of each fold,
    * the partial dependence of every feature, using the estimator and the
      training split left over from the last fold.

    Takes no arguments and returns ``None``; all results are printed or
    plotted.
    """
    num_estimadores = 350
    # Single source of truth for the fold count and the feature count, which
    # were previously repeated as the literals 10 and 13.
    n_folds = 10
    n_features = boston_X.shape[1]
    clf = ensemble.GradientBoostingRegressor(n_estimators=num_estimadores, max_depth=2, learning_rate=0.1, loss='ls', subsample=0.5)

    importancias = np.zeros(n_features, dtype=np.float64)
    mae, mse, mr2, cont = 0, 0, 0, 0
    test_score = np.zeros((num_estimadores,), dtype=np.float64)
    train_score = np.zeros((num_estimadores,), dtype=np.float64)
    # One entry per fold; no placeholder value, so the per-fold plot below
    # only contains real measurements.
    mseVector = []

    kf = KFold(len(boston_Y), n_folds=n_folds, indices=True)
    for train, test in kf:
        trainX, testX = boston_X[train], boston_X[test]
        trainY, testY = boston_Y[train], boston_Y[test]

        clf.fit(trainX, trainY)
        pred = clf.predict(testX)

        maeGradient = metrics.mean_absolute_error(testY, pred)
        mseGradient = metrics.mean_squared_error(testY, pred)
        r2 = metrics.r2_score(testY, pred)

        mae = mae + maeGradient
        mse = mse + mseGradient
        mr2 = mr2 + r2
        mseVector.append(mseGradient)
        cont = cont + 1

        # Accumulate the test loss after each boosting stage.
        for i, y_pred in enumerate(clf.staged_decision_function(testX)):
            test_score[i] += clf.loss_(testY, y_pred)

        # Accumulate the per-stage training loss recorded by the estimator.
        train_score += clf.train_score_

        # Importances rescaled so the strongest feature of the fold is 100.
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        importancias += feature_importance

        print("Iteracción " + str(cont) + " de la validacion cruzada")
        print("\tError medio absoluto:  " + str(maeGradient))
        print("\tError medio cuadrado:  " + str(mseGradient))
        print("\tr2:  " + str(r2))

        # Plot the predicted points on top of the true values
        pl.plot(testY, testY, label='Valor verdadero')
        pl.plot(testY, pred, 'ro', label='Prediccion Gradient')
        pl.legend(bbox_to_anchor=(1.05, 1), borderaxespad=0., prop = FontProperties(size='smaller'))
        pl.show()

    print(mseVector)
    mae = mae / n_folds
    mse = mse / n_folds
    mr2 = mr2 / n_folds
    print("Error medio absoluto: " + str(mae) + "\tError medio cuadratico: " + str(mse) + "\tR2: " + str(mr2))

    # Average the importances over the folds and plot them in ascending order.
    importancias /= n_folds
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5

    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()

    # Average the deviance curves over the folds.
    test_score /= n_folds
    train_score /= n_folds

    pl.figure(figsize=(12, 6))
    pl.subplot(1, 1, 1)
    pl.title('Desviacion')
    pl.plot(np.arange(num_estimadores) + 1, train_score, 'b-', label='Error en el conjunto de Training')
    pl.plot(np.arange(num_estimadores) + 1, test_score, 'r-', label='Error en el conjunto de Test')
    pl.legend(loc='upper right')
    pl.xlabel('Iteracciones del Boosting (numero de arboles)')
    pl.ylabel('Desviacion')
    pl.show()

    # MSE of each fold, numbered from 1. No legend: the line carries no label.
    pl.subplot(1, 1, 1)
    pl.plot(np.arange(1, len(mseVector) + 1), mseVector, 'b-')
    pl.xlabel('Iteraccion de la validacion cruzada')
    pl.ylabel('Erro Medio Cuadratico')
    pl.show()

    # Partial dependence of every feature, based on the last fold's fit.
    fig, axs = plot_partial_dependence(clf, trainX, list(range(n_features)))

    fig.suptitle('Dependencia parcial del valor de las casas')

    pl.subplots_adjust(top=0.9)

    pl.show()
Esempio n. 53
0
expected = y[size:]
predicted = regressor.predict(x_norm[size:])
# Pearson correlation between the observed targets and the predictions
pearson = pearsonr(expected, predicted)[0]
print("Pearson coefficient: %s" % str(pearson))
# The value printed is the square root of the mean squared error, so it is
# labelled RMSE (it is expressed in the same units as the target)
print("RMSE : %s" % np.sqrt(mean_squared_error(expected, predicted)))

### feature importance
# Importances rescaled so that the strongest feature is 100
feature_importance = regressor.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)

## relative importance
# plot relative importance (ascending order, so the strongest feature is the
# last bar)
bar_positions = np.arange(len(x_norm.columns))
plt.barh(bar_positions, regressor.feature_importances_[sorted_idx])
plt.yticks(bar_positions + 0.25, np.array(x_norm.columns)[sorted_idx])
_ = plt.xlabel('Importance relative')
plt.savefig("relative_importance.png", bbox_inches='tight')

# shell info
print("most important features:")
# argsort is ascending; reverse it so that rank 1 is the most important feature
ranked_idx = sorted_idx[::-1]
for i, (f, w) in enumerate(zip(x_norm.columns[ranked_idx], feature_importance[ranked_idx]), start=1):
    print("%d) %s : %d" % (i, f, w))
    # plot partial dependence / feature
    features = [f]
    fig, axs = plot_partial_dependence(regressor, x_norm, features, feature_names=x_norm.columns, figsize=(8, 6))
    name = f + "_partial_dependence.png"
    # Save this specific figure and close it, so figures do not pile up in
    # memory over the loop
    fig.savefig(name, bbox_inches='tight')
    plt.close(fig)
Esempio n. 54
0
# Using the gradient boosting regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
regr = DecisionTreeRegressor(max_depth=4)
clf = GradientBoostingRegressor(n_estimators=100,
                                max_depth=4,
                                learning_rate=0.1,
                                loss='huber',
                                random_state=1)
clf.fit(X, y)
clf.feature_importances_
# The requested plots use feature indices 0-3, and feature_names[i] must name
# feature i, so a name is supplied for each of those four features.
fig, axs = plot_partial_dependence(clf,
                                   X, [0, 1, (1, 2), (2, 3)],
                                   feature_names=['A', 'B', 'C', 'D'],
                                   n_jobs=3,
                                   grid_resolution=50)

# Doing cross validation
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)
list_train_test = [(X_train[train_index], X_train[test_index],
                    y_train[train_index], y_train[test_index])
                   for (train_index, test_index) in ss.split(X_train)]
# Split-local names keep X_train / y_train intact after the loop.
# NOTE(review): `linEx` is expected to be a scoring callable defined elsewhere
# in the file; it was previously rebound to an empty list right before being
# called, which raises TypeError. Confirm where the scorer is defined.
for X_fit, X_val, y_fit, y_val in list_train_test:
    clf.fit(X_fit, y_fit)
    print(linEx(y_val, clf.predict(X_val)))
# Export graphviz
plt.ylabel('True Positive Rate (Sensitivity)')
## what does this tell us for this sample?
# this tells us that random forest is the best model

## create partial dependence plot on most important features for gbm.

importances = pandas.DataFrame(gbm_grid.best_estimator_.feature_importances_, index=explanatory_df.columns, columns=['importance'])

# DataFrame.sort(columns=...) has been removed from pandas; sort_values is the
# supported equivalent. Sort in place, most important feature first.
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Column positions (in explanatory_df) of the three most important features.
# The name set is built once instead of once per column.
top_features = set(importances.index[:3])
features = [i for i, j in enumerate(explanatory_df.columns.tolist()) if j in top_features]

fig, axs = plot_partial_dependence(gbm_grid.best_estimator_, explanatory_df, features, feature_names=explanatory_df.columns)

#                  importance
#totalruns           0.156319
#shutouts            0.085167
#errors              0.077250
#teamID_Nothing      0.071030
#totalRBI            0.064376
#earnedruns          0.061927
#stolenbases         0.049815
#atbats              0.045446
#totalhomeruns       0.044935
#totalgames          0.042369
#timewithouts        0.036490
#doubleplays         0.036110
#totalhits           0.036078
Esempio n. 56
0
# look at partial dependence plot on most important features for gbm

importances = pandas.DataFrame(gbm_grid.best_estimator_.feature_importances_,
                               index=explanatory_df.columns,
                               columns=['importance'])

# DataFrame.sort(columns=...) has been removed from pandas; sort_values is the
# supported equivalent. Sort in place, most important feature first.
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)
# does not necessarily say whether it is a positive or negative importance

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Names of the three most important features, built once rather than once per
# column inside the comprehension.
top_features = set(importances.index[:3])
features = [
    i for i, j in enumerate(explanatory_df.columns.tolist())
    if j in top_features
]
# match feature importance for the first 3 importances
# i is the index in the column list where the name occurred - finds the feature
# j is the feature name

fig, axs = plot_partial_dependence(gbm_grid.best_estimator_,
                                   explanatory_df,
                                   features,
                                   feature_names=explanatory_df.columns)

# compare the mean ROC AUC
print("Neural Networks Mean ROC AUC %f" % roc_scores_nn.mean())
print("Boosting Tree Mean ROC AUC %f" % roc_scores_gbm.mean())
print("Random Forest Mean ROC AUC %f" % roc_scores_rf.mean())
print("Decision Tree Mean ROC AUC %f" % roc_score_tree.mean())
Esempio n. 57
0
 def dependence(self, forest, train, feature_set):
     """Plot the partial dependence of ``forest`` on ``feature_set`` and show it.

     Prints a banner line, calls ``plot_partial_dependence(forest, train,
     features=feature_set)`` and then displays the result with ``plt.show()``.

     NOTE(review): presumably ``forest`` is a fitted estimator accepted by
     ``plot_partial_dependence`` and ``train`` is the data it was fitted on --
     confirm against the caller.
     """
     print "******************this is the output of dependences of features"
     # The returned figure and axes are not used in the lines visible here.
     fig, axs = plot_partial_dependence(forest, train, features=feature_set
                                        )
     plt.show()