def load_data():
    """Load the cleaned churn frames and build train/test arrays.

    Calls ``load_and_clean_data()`` to obtain two frames, removes the
    ``last_trip_date`` and ``signup_date`` columns from both, and pops the
    ``churn`` column of each as its target vector. Both frames are split
    with ``train_test_split(..., test_size=0.3)``; only the split of the
    first frame is returned. The split of the second frame is computed
    but not used.

    Side effect: sets ``theano.config.floatX`` to ``'float32'``.

    Returns:
        A 5-tuple ``(X_train, y_train, X_test, y_test, y_train_ohe)``.
        ``X_train`` and ``X_test`` are cast to ``theano.config.floatX``;
        ``y_train_ohe`` is ``np_utils.to_categorical(y_train)``.
    """
    frame_n, frame_c = load_and_clean_data()

    # Drop the date columns from both frames before extracting arrays.
    for frame in (frame_n, frame_c):
        for date_column in ('last_trip_date', 'signup_date'):
            frame.pop(date_column)

    target_n = frame_n.pop('churn').values
    features_n = frame_n.values
    target_c = frame_c.pop('churn').values
    features_c = frame_c.values

    split_n = train_test_split(features_n, target_n, test_size=0.3)
    # The second split is never used below; the call is kept so the
    # function performs exactly the same work as before.
    _train_c, _test_c, _y_train_c, _y_test_c = train_test_split(
        features_c, target_c, test_size=0.3)
    train_features, test_features, train_target, test_target = split_n

    theano.config.floatX = 'float32'
    X_train = train_features.astype(theano.config.floatX)
    X_test = test_features.astype(theano.config.floatX)
    y_train_ohe = np_utils.to_categorical(train_target)

    return X_train, train_target, X_test, test_target, y_train_ohe
churn_values.append(dfc[column_name][(dfc[column_name]==val)&(dfc['churn']==1)].count()) not_churn_values.append(dfc[column_name][(dfc[column_name]==val)&(dfc['churn']==0)].count()) x = np.array(range(len(column_values))) ax = plt.subplot(111) ax.bar(x-0.2, churn_values, width=0.2, color='blue', label='churn') ax.bar(x, not_churn_values, width=0.2, color='red', label='not churn') ax.legend(loc=0) ax.set_xticklabels(column_values) ax.set_xticks(range(len(column_values))) ax.set_title('Churn vs {}'.format(column_name)) plt.savefig('plots/churn_vs_{}.png'.format(column_name)) plt.show() if __name__ == "__main__": plt.style.use('fivethirtyeight') dfn, dfc = load_and_clean_data() # plot_distance_vs_churn(dfc) # plot_churn_vs_avg_rating_by_driver(dfn) # plot_churn_vs_avg_rating_of_driver(dfn) # plot_churn_vs_surge_percentage(dfn) # plot_churn_vs_average_surge(dfn) # plot_churn_bar(dfc, 'phone', ['iPhone', 'Android']) # plot_churn_bar(dfc, 'city', ['Astapor', 'Winterfell', 'King\'s Landing']) # plot_churn_bar(dfc, 'luxury_car_user', [True, False]) yn = dfn.pop('churn').values Xn = dfn.values yc = dfc.pop('churn').values Xc = dfc.values
import pandas as pd import numpy as np from sklearn.cross_validation import train_test_split from sklearn.metrics import accuracy_score, precision_score, recall_score from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from eda import load_and_clean_data from churn import get_scores if __name__ == '__main__': dfn, dfc = load_and_clean_data('data/churn_sample.csv') dfn.pop('last_trip_date') dfn.pop('signup_date') dfc.pop('last_trip_date') dfc.pop('signup_date') y_n = dfn.pop('churn').values X_n = dfn.values y_c = dfc.pop('churn').values X_c = dfc.values X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n, y_n, test_size=0.3) X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.3) print " Model, Accuracy, Precision, Recall" abc_acc, abc_prec, abc_rec = get_scores(AdaBoostClassifier, X_train_n,