def fitAndScore(features):
    data = ChurnData(features)
    model = LogisticRegression()
    model.fit(**data.split_train)
    scores = data.getScores(model, 'split_val')

    return {'model': model, 'scores': scores, 'features': features}

def printTestSetResultsRandFor():
    with open(_RESULT_PATH + 'grid_search_result.pkl', 'rb') as handle:
        grid = pickle.load(handle)
    model = RandomForestClassifier(**grid.best_params_)
    data = ChurnData()
    model.fit(**data.train)

    return data.printScores(model)

def printTestSetResultsLogReg():
    with open(_RESULT_PATH + 'logRegL2_grid.pkl', 'rb') as handle:
        gridL2 = pickle.load(handle)
    accL2 = gridL2['accuracy']
    params = accL2.best_params_
    model = LogisticRegression(**params)
    data = ChurnData()
    model.fit(**data.train)

    return data.printScores(model)

def runGridSearch():
    param_grid = {
        'n_estimators': [100],
        'max_features': range(10, 30),
        'max_depth': range(1, 20),
        'min_samples_leaf': range(5, 25)
    }
    data = ChurnData()
    model = RandomForestClassifier()

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # default scoring is accuracy
    grid = GridSearchCV(
        estimator=model, param_grid=param_grid, verbose=1, n_jobs=64, cv=cv)
    grid.fit(**data.train)

    with open(_RESULT_PATH + 'grid_search_result.pkl', 'wb') as handle:
        pickle.dump(grid, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return grid

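# Usage sketch (not part of the original module): once runGridSearch() has
# run, the winning configuration can be read straight off the fitted
# GridSearchCV object. `_demoInspectGrid` is a hypothetical helper added for
# illustration only.
def _demoInspectGrid():
    grid = runGridSearch()
    print(grid.best_params_)  # best RandomForest configuration found
    print(grid.best_score_)   # mean 10-fold CV accuracy of that configuration
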
def getFeatureImportances():
    model = getBestModel()
    data = ChurnData()
    importances = pd.DataFrame(
        list(zip(data.features, model.feature_importances_)))
    importances.columns = ['feature', 'importance']

    return importances.sort_values(by='importance', ascending=False)

def _runFeatureElimination(numFeatures, features, train=None, test=None):
    model = LogisticRegression()
    data = ChurnData(features)

    # select the numFeatures best features on the training fold
    rfe = RFE(model, n_features_to_select=numFeatures)
    fit = rfe.fit(data.train['X'][train], data.train['y'][train])
    features = data.features[fit.support_]

    # refit on the selected features and score on the held-out fold
    data = ChurnData(features)
    model.fit(data.train['X'][train], data.train['y'][train])
    scores = data.getScores(
        model, X=data.train['X'][test], y=data.train['y'][test])

    return {
        'features': features,
        'accuracy': scores['accuracy'],
        'roc_auc': scores['auc']
    }

def bestModelAuc():
    # auc = 0.8026
    params = getResultGrid().best_params_
    data = ChurnData()

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    aucs = []
    for i, (trainIndex, testIndex) in enumerate(cv.split(**data.train)):
        print('Split {} out of 10'.format(i + 1))
        model = RandomForestClassifier(**params)
        model.fit(data.train['X'][trainIndex], data.train['y'][trainIndex])
        aucs.append(
            data.getScores(
                model,
                X=data.train['X'][testIndex],
                y=data.train['y'][testIndex])['auc'])

    return np.mean(aucs)

def storeModel(model, **model_params):
    data = ChurnData(predict='deltaNextHours')
    m = model(data, **model_params)
    m.fit(data.split_train_df)

    with open(model.RESULT_PATH + 'model.pkl', 'wb') as handle:
        pickle.dump(m, handle, protocol=pickle.HIGHEST_PROTOCOL)

    pred_val = m.cf.predict_expectation(data.split_val_df).values.reshape(-1)

    with open(model.RESULT_PATH + 'pred_val.pkl', 'wb') as handle:
        pickle.dump(pred_val, handle, protocol=pickle.HIGHEST_PROTOCOL)

def runFeatureElimination(includeFeat='all'):
    """
    Performs feature elimination.

    Runs RFE for each fold, then averages the AUC and accuracy scores
    across folds at each elimination step.

    :includeFeat: 'avg' or 'wght' -- include wght avg or avg only
    """
    # load data
    pool = Pool(64)

    # all features
    data = ChurnData()
    features = data.features

    if includeFeat == 'avg':
        # only avg deltaPrev
        features = list(
            set(features) -
            set(['logDeltaPrev_wght_avg', 'deltaPrev_wght_avg']))
    elif includeFeat == 'wght':
        # only weighted deltaPrev
        features = list(
            set(features) - set(['logDeltaPrev_avg', 'deltaPrev_avg']))

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = [0] * 10

    for i, (train_ind, test_ind) in enumerate(cv.split(**data.train)):
        print('Fold: {} out of 10'.format(i + 1))
        scores[i] = pool.map(
            partial(
                _runFeatureElimination,
                features=features,
                train=train_ind,
                test=test_ind),
            range(1, len(features) + 1))

    pool.close()

    features = [[s['features'] for s in ss] for ss in scores]
    accuracy = np.array([[s['accuracy'] for s in ss] for ss in scores]).mean(0)
    roc_auc = np.array([[s['roc_auc'] for s in ss] for ss in scores]).mean(0)

    res = {'features': features, 'accuracy': accuracy, 'roc_auc': roc_auc}

    with open('{}logReg_rfe_{}.pkl'.format(_RESULT_PATH, includeFeat),
              'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res

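# Usage sketch (hypothetical helper, for illustration only): run the RFE
# sweep and pick the subset size with the highest fold-averaged AUC. Index i
# of res['roc_auc'] corresponds to keeping i + 1 features.
def _demoPickBestSubsetSize(includeFeat='avg'):
    res = runFeatureElimination(includeFeat=includeFeat)
    best = int(np.argmax(res['roc_auc']))
    print('best subset size:', best + 1)
    print('mean AUC at that size:', res['roc_auc'][best])
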
def findPearsonCor():
    data = ChurnData()
    df = pd.DataFrame(data.X_split_train)
    df.columns = data.features
    df['churned'] = data.y_split_train

    # Pearson correlation of every feature with the churn label
    corr = df.corr().churned
    keys = corr.keys()

    feat_noWght = ['numSessions', 'recency']
    feat = [
        'deltaPrev', 'dayOfMonth', 'dayOfWeek', 'hourOfDay', 'sessionLen',
        'price', 'numDivisions', 'numInteractions', 'numItemsViewed'
    ]
    feat_dev = ['Desktop', 'Mobile', 'Ios', 'Android', 'Unknown']

    corrs_feat_noWght = pd.DataFrame(columns=['feature', 'plain', 'log'])
    corrs_feat_noWght.feature = feat_noWght
    corrs_feat_noWght.plain = [corr[f] for f in feat_noWght]
    corrs_feat_noWght.log = [corr['log' + upperfirst(f)] for f in feat_noWght]

    corrs_feat = pd.DataFrame(
        columns=['feature', 'avg', 'log_avg', 'wght_avg', 'log_wght_avg'])
    corrs_feat.feature = feat
    corrs_feat.avg = [corr[f + '_avg'] for f in feat]
    corrs_feat.log_avg = [
        corr['log' + upperfirst(f) + '_avg']
        if 'log' + upperfirst(f) + '_avg' in keys else np.nan for f in feat
    ]
    corrs_feat.wght_avg = [corr[f + '_wght_avg'] for f in feat]
    corrs_feat.log_wght_avg = [
        corr['log' + upperfirst(f) + '_wght_avg']
        if 'log' + upperfirst(f) + '_wght_avg' in keys else np.nan
        for f in feat
    ]

    corrs_dev = pd.DataFrame(columns=['feature', 'plain', 'wght'])
    corrs_dev.feature = feat_dev
    corrs_dev.plain = [corr['device' + f] for f in feat_dev]
    corrs_dev.wght = [
        corr['device' + upperfirst(f) + '_wght'] for f in feat_dev
    ]

    return corrs_feat_noWght, corrs_feat, corrs_dev

def runBayesOpt(model, include_recency=False, error='concordance',
                maximise=True):
    """
    Cross-validated search for parameters
    """
    nFolds = 10
    nPools = 10
    bounds = {'penalizer': (1000, 5000)}
    n_iter = 20
    print(model.RESULT_PATH)

    # load churn data for splitting fold stratas
    churnData = ChurnData()
    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    f = partial(
        _evaluatePenalizer,
        model=model,
        splits=splits,
        nPools=nPools,
        include_recency=include_recency,
        error=error,
        maximise=maximise)

    bOpt = BayesianOptimization(f, bounds)
    bOpt.maximize(
        init_points=2, n_iter=n_iter, acq='ucb', kappa=5, kernel=Matern())

    with open(
            model.RESULT_PATH + 'bayes_opt_{}{}.pkl'.format(
                error, '_rec' if include_recency else ''), 'wb') as handle:
        pickle.dump(bOpt, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return bOpt

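# Sketch of reading the stored optimiser back (illustrative helper, not part
# of the original module; shown for include_recency=False). The attribute
# layout depends on the bayes_opt release: older versions expose the best
# point as bOpt.res['max'], newer ones as bOpt.max.
def _demoLoadBayesOpt(model, error='concordance'):
    path = model.RESULT_PATH + 'bayes_opt_{}.pkl'.format(error)
    with open(path, 'rb') as handle:
        bOpt = pickle.load(handle)
    print(getattr(bOpt, 'max', None) or bOpt.res['max'])
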
def runGridSearch(model, include_recency=False):
    """
    Cross-validated search for parameters
    """
    nFolds = 10
    nPools = 10
    bounds = (2000, 3000)
    n_iter = 21
    space = np.linspace(bounds[0], bounds[1], n_iter)
    print(model.RESULT_PATH)

    # load churn data for splitting fold stratas
    churnData = ChurnData()
    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    scores = []
    for p in space:
        print(p)
        scores.append(
            _evaluatePenalizer(
                p,
                model=model,
                splits=splits,
                nPools=nPools,
                include_recency=include_recency,
                error=None))

    res = {
        'penalties': space,
        'scores': {k: [d[k] for d in scores] for k in scores[0]}
    }

    with open(
            model.RESULT_PATH + 'grid_search{}_{}.pkl'.format(
                '_rec' if include_recency else '', n_iter), 'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res

def crossValidate(model, penalizer=2045, include_recency=False, nFolds=10):
    churnData = ChurnData()
    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    pool = Pool(nFolds)
    scores = pool.map(
        partial(
            _scoreModel,
            model=model,
            penalizer=penalizer,
            include_recency=include_recency), splits)

    res = {
        key: np.mean([score[key] for score in scores])
        for key in scores[0].keys()
    }

    pool.close()
    pool.join()

    return res

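# Usage sketch (hypothetical helper; penalizer values are illustrative):
# compare a handful of penalizer settings with the cross-validation helper
# above before committing to a full grid or Bayesian search.
def _demoComparePenalizers(model):
    for p in (1000, 2045, 3000):
        print(p, crossValidate(model, penalizer=p))
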
def runL2GridSearch():
    """
    Runs a grid search for the logistic regression model with L2 penalty.

    Uses 10-fold cross validation.
    """
    param_grid = {'penalty': ['l2'], 'C': np.logspace(-6, 0, 800)}
    data = ChurnData()
    model = LogisticRegression(penalty='l2')

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    grid_acc = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        verbose=1,
        n_jobs=64,
        cv=cv,
        scoring='accuracy')
    grid_acc.fit(**data.train)

    grid_auc = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        verbose=1,
        n_jobs=64,
        cv=cv,
        scoring='roc_auc')
    grid_auc.fit(**data.train)

    res = {'accuracy': grid_acc, 'roc_auc': grid_auc}

    with open('{}logRegL2_grid.pkl'.format(_RESULT_PATH), 'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res

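# Sketch (illustrative helper): the pickled result holds one fitted
# GridSearchCV per scorer, so the C values favoured by accuracy and AUC can
# be compared directly.
def _demoBestC():
    res = runL2GridSearch()
    print('best C by accuracy:', res['accuracy'].best_params_['C'])
    print('best C by AUC:', res['roc_auc'].best_params_['C'])
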
import sys

import numpy as np
import pandas as pd  # needed for pd.Timestamp below; missing from the original imports
from lifelines import CoxPHFitter

# make the utils package (which provides ChurnData) importable
sys.path.insert(0, '../utils')

predPeriod = {
    'start': pd.Timestamp('2016-02-01'),
    'end': pd.Timestamp('2016-06-01')
}
predPeriodHours = (
    predPeriod['end'] - predPeriod['start']) / np.timedelta64(1, 'h')


class CoxChurnModel:
    def __init__(self):
        self.cf = CoxPHFitter()

    def fit(self, dataset, pred_col='deltaNextHours', event_col='observed'):
        self.cf.fit(dataset, pred_col, event_col=event_col)

    def predict(self, df):
        # a user is predicted to churn if the expected return time falls
        # beyond the prediction window
        pred = self.cf.predict_expectation(df)
        churned = (pred - df.recency.values.reshape((-1, 1))) > predPeriodHours
        return churned.values.reshape(-1)

    def predict_proba(self, df):
        return np.zeros(len(df))


model = CoxChurnModel()
data = ChurnData(dataset='cox')

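# Usage sketch, assuming ChurnData exposes the split_train_df / split_val_df
# frames used elsewhere in this repo (hypothetical helper, illustration only):
# fit the Cox model on the training split and inspect the predicted churn
# rate on the validation split.
def _demoCoxModel():
    m = CoxChurnModel()
    m.fit(data.split_train_df)
    churned = m.predict(data.split_val_df)
    print('predicted churn rate:', churned.mean())
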
def main():
    data = ChurnData()
    model = MajorityPredictor()
    model.fit(**data.train)
    data.getScores(model)

def __init__(self, include_recency=False):
    self.data = ChurnData(
        predict='deltaNextHours')  # , features=['recency', 'logNumSessions']
    self.include_recency = include_recency