Esempio n. 1
0
def load_data(test_size=0.3, random_state=None):
    '''Load the churn data and return a float32 train/test split.

    Only the first of the two frames returned by load_and_clean_data() is
    used. Its 'last_trip_date' and 'signup_date' columns are discarded,
    'churn' becomes the target and every remaining column is a feature.

    Parameters
    ----------
    test_size : float or int, optional
        Forwarded unchanged to train_test_split (default 0.3).
    random_state : int, RandomState or None, optional
        Forwarded unchanged to train_test_split. Pass a fixed value to get
        the same split on every call; None keeps the unseeded behaviour.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, y_test, y_train_ohe), where X_train and
        X_test are cast to theano.config.floatX, y_train and y_test are
        returned as produced by the split, and y_train_ohe is
        np_utils.to_categorical(y_train).

    Side effects
    ------------
    Sets the global theano.config.floatX to 'float32'.
    '''
    dfn, _ = load_and_clean_data()

    # NOTE(review): presumably the date columns are dropped because they
    # cannot be cast to float below -- confirm against load_and_clean_data.
    for column in ('last_trip_date', 'signup_date'):
        dfn.pop(column)

    # pop() removes 'churn' first, so .values afterwards holds features only.
    y = dfn.pop('churn').values
    X = dfn.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    theano.config.floatX = 'float32'
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)
    y_train_ohe = np_utils.to_categorical(y_train)
    return X_train, y_train, X_test, y_test, y_train_ohe
Esempio n. 2
0
        churn_values.append(dfc[column_name][(dfc[column_name]==val)&(dfc['churn']==1)].count())
        not_churn_values.append(dfc[column_name][(dfc[column_name]==val)&(dfc['churn']==0)].count())
    x = np.array(range(len(column_values)))
    ax = plt.subplot(111)
    ax.bar(x-0.2, churn_values, width=0.2, color='blue', label='churn')
    ax.bar(x, not_churn_values, width=0.2, color='red', label='not churn')
    ax.legend(loc=0)
    ax.set_xticklabels(column_values)
    ax.set_xticks(range(len(column_values)))
    ax.set_title('Churn vs {}'.format(column_name))
    plt.savefig('plots/churn_vs_{}.png'.format(column_name))
    plt.show()

if __name__ == "__main__":
    # Script entry point: select the plotting theme, load both cleaned
    # frames, then separate the 'churn' target from the feature columns.
    plt.style.use('fivethirtyeight')
    dfn, dfc = load_and_clean_data()

    # Exploratory plots, currently disabled; uncomment the ones to regenerate.
    # plot_distance_vs_churn(dfc)
    # plot_churn_vs_avg_rating_by_driver(dfn)
    # plot_churn_vs_avg_rating_of_driver(dfn)
    # plot_churn_vs_surge_percentage(dfn)
    # plot_churn_vs_average_surge(dfn)
    # plot_churn_bar(dfc, 'phone', ['iPhone', 'Android'])
    # plot_churn_bar(dfc, 'city', ['Astapor', 'Winterfell', 'King\'s Landing'])
    # plot_churn_bar(dfc, 'luxury_car_user', [True, False])

    # Tuple elements evaluate left to right: 'churn' is popped off the frame
    # before .values is read, so each X holds the remaining columns only.
    yn, Xn = dfn.pop('churn').values, dfn.values
    yc, Xc = dfc.pop('churn').values, dfc.values
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from eda import load_and_clean_data
from churn import get_scores

if __name__ == '__main__':

    dfn, dfc = load_and_clean_data('data/churn_sample.csv')
    dfn.pop('last_trip_date')
    dfn.pop('signup_date')
    dfc.pop('last_trip_date')
    dfc.pop('signup_date')
    y_n = dfn.pop('churn').values
    X_n = dfn.values
    y_c = dfc.pop('churn').values
    X_c = dfc.values

    X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n,
                                                                y_n,
                                                                test_size=0.3)
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c,
                                                                y_c,
                                                                test_size=0.3)

    print "    Model,               Accuracy, Precision, Recall"

    abc_acc, abc_prec, abc_rec = get_scores(AdaBoostClassifier,
                                            X_train_n,