Example #1
def fitAndScore(features):
    data = ChurnData(features)
    model = LogisticRegression()
    model.fit(**data.split_train)
    scores = data.getScores(model, 'split_val')

    return {'model': model, 'scores': scores, 'features': features}
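
These examples all lean on a ChurnData helper that is never shown. Inferred purely from the calls that follow, its interface looks roughly like the sketch below; every name and default here is an assumption reconstructed from usage, not the real class.

class ChurnData:
    # Assumed interface, reconstructed from usage in the examples on this page.
    def __init__(self, features=None, predict='churned', dataset=None):
        self.features = ...        # array of feature names
        self.train = ...           # {'X': ..., 'y': ...} for the full training set
        self.split_train = ...     # {'X': ..., 'y': ...} for the train split
        self.split_train_df = ...  # DataFrame variants used by the survival models
        self.split_val_df = ...

    def getScores(self, model, split='split_val', X=None, y=None):
        # returns a dict of metrics, e.g. {'accuracy': ..., 'auc': ...}
        ...

    def printScores(self, model):
        # pretty-prints test-set metrics for a fitted model
        ...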
Example #2
def printTestSetResultsRandFor():
    with open(_RESULT_PATH + 'grid_search_result.pkl', 'rb') as handle:
        grid = pickle.load(handle)
    model = RandomForestClassifier(**grid.best_params_)
    data = ChurnData()

    model.fit(**data.train)

    return data.printScores(model)
Example #3
def printTestSetResultsLogReg():
    with open(_RESULT_PATH + 'logRegL2_grid.pkl', 'rb') as handle:
        gridL2 = pickle.load(handle)
    accL2 = gridL2['accuracy']
    params = accL2.best_params_
    model = LogisticRegression(**params)
    data = ChurnData()

    model.fit(**data.train)

    return data.printScores(model)
Example #4
def runGridSearch():
    param_grid = {
        'n_estimators': [100],
        'max_features': range(10, 30),
        'max_depth': range(1, 20),
        'min_samples_leaf': range(5, 25)
    }

    data = ChurnData()
    model = RandomForestClassifier()

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # default scoring is accuracy
    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        verbose=1,
                        n_jobs=64,
                        cv=cv)
    grid.fit(**data.train)

    with open(_RESULT_PATH + 'grid_search_result.pkl', 'wb') as handle:
        pickle.dump(grid, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return grid
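
For scale: the grid above spans 1 * 20 * 19 * 20 = 7,600 parameter combinations, so 10-fold CV fits 76,000 forests (hence n_jobs=64). A quick sanity check, assuming param_grid is still in scope:

from sklearn.model_selection import ParameterGrid

# count the candidates GridSearchCV will evaluate, and the total fits with cv=10
n_candidates = len(ParameterGrid(param_grid))
print(n_candidates, n_candidates * 10)  # 7600 76000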
Example #5
def getFeatureImportances():
    model = getBestModel()
    data = ChurnData()

    importances = pd.DataFrame(
        list(zip(data.features, model.feature_importances_)),
        columns=['feature', 'importance'])

    return importances.sort_values(by='importance', ascending=False)
Example #6
def _runFeatureElimination(numFeatures, features, train=None, test=None):
    model = LogisticRegression()
    data = ChurnData(features)

    rfe = RFE(model, n_features_to_select=numFeatures)
    fit = rfe.fit(data.train['X'][train], data.train['y'][train])

    features = data.features[fit.support_]
    data = ChurnData(features)
    model.fit(data.train['X'][train], data.train['y'][train])
    scores = data.getScores(model,
                            X=data.train['X'][test],
                            y=data.train['y'][test])

    return {
        'features': features,
        'accuracy': scores['accuracy'],
        'roc_auc': scores['auc']
    }
Example #7
def bestModelAuc():
    # auc = 0.8026

    params = getResultGrid().best_params_
    data = ChurnData()

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    aucs = []

    for i, (trainIndex, testIndex) in enumerate(cv.split(**data.train)):
        print('Split {} out of 10'.format(i + 1))
        model = RandomForestClassifier(**params)
        model.fit(data.train['X'][trainIndex], data.train['y'][trainIndex])
        aucs.append(
            data.getScores(model,
                           X=data.train['X'][testIndex],
                           y=data.train['y'][testIndex])['auc'])

    return np.mean(aucs)
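
The loop above is essentially a hand-rolled cross_val_score. Assuming data.train is the usual {'X': ..., 'y': ...} dict, a roughly equivalent version (modulo any custom logic inside getScores) would be:

from sklearn.model_selection import cross_val_score

# same folds and metric as the manual loop, parallelised by scikit-learn
aucs = cross_val_score(RandomForestClassifier(**params),
                       data.train['X'], data.train['y'],
                       cv=cv, scoring='roc_auc')
print(aucs.mean())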
Example #8
def storeModel(model, **model_params):
    data = ChurnData(predict='deltaNextHours')
    m = model(data, **model_params)
    m.fit(data.split_train_df)

    with open(model.RESULT_PATH + 'model.pkl', 'wb') as handle:
        pickle.dump(m, handle, protocol=pickle.HIGHEST_PROTOCOL)

    pred_val = m.cf.predict_expectation(data.split_val_df).values.reshape(-1)

    with open(model.RESULT_PATH + 'pred_val.pkl', 'wb') as handle:
        pickle.dump(pred_val, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #9
def runFeatureElimination(includeFeat='all'):
    """
    Performs feature elimination
    Run RFE for each fold, find average scores for AUC and accuracy for each step

    :includeFeat: 'avg' or 'wght' -- include wght avg or avg only
    """
    # worker pool for the parallel RFE runs
    pool = Pool(64)

    # all features
    data = ChurnData()
    features = data.features

    if includeFeat == 'avg':
        # only avg deltaPrev
        features = list(
            set(features) -
            set(['logDeltaPrev_wght_avg', 'deltaPrev_wght_avg']))
    elif includeFeat == 'wght':
        # only weighted deltaPrev
        features = list(
            set(features) - set(['logDeltaPrev_avg', 'deltaPrev_avg']))

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = [0] * 10

    for i, (train_ind, test_ind) in enumerate(cv.split(**data.train)):
        print('Fold: {} out of 10'.format(i + 1))
        scores[i] = pool.map(
            partial(_runFeatureElimination,
                    features=features,
                    train=train_ind,
                    test=test_ind), range(1,
                                          len(features) + 1))

    pool.close()
    pool.join()

    features = [[s['features'] for s in ss] for ss in scores]
    accuracy = np.array([[s['accuracy'] for s in ss] for ss in scores]).mean(0)
    roc_auc = np.array([[s['roc_auc'] for s in ss] for ss in scores]).mean(0)

    res = {'features': features, 'accuracy': accuracy, 'roc_auc': roc_auc}

    with open('{}logReg_rfe_{}.pkl'.format(_RESULT_PATH, includeFeat),
              'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res
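
To inspect the saved result later, read the pickle back and find the elimination step with the best average AUC; a sketch, assuming _RESULT_PATH is in scope and the 'all' variant was run:

import pickle

with open('{}logReg_rfe_{}.pkl'.format(_RESULT_PATH, 'all'), 'rb') as handle:
    res = pickle.load(handle)

# res['roc_auc'][i] is the 10-fold average AUC when keeping i + 1 features
best = res['roc_auc'].argmax()
print(best + 1, res['roc_auc'][best])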
Example #10
def findPearsonCor():
    data = ChurnData()
    df = pd.DataFrame(data.X_split_train, columns=data.features)
    df['churned'] = data.y_split_train
    corr = df.corr().churned
    keys = corr.keys()

    feat_noWght = ['numSessions', 'recency']
    feat = [
        'deltaPrev', 'dayOfMonth', 'dayOfWeek', 'hourOfDay', 'sessionLen',
        'price', 'numDivisions', 'numInteractions', 'numItemsViewed'
    ]
    feat_dev = ['Desktop', 'Mobile', 'Ios', 'Android', 'Unknown']

    corrs_feat_noWght = pd.DataFrame(columns=['feature', 'plain', 'log'])
    corrs_feat_noWght.feature = feat_noWght
    corrs_feat_noWght.plain = [corr[f] for f in feat_noWght]
    corrs_feat_noWght.log = [corr['log' + upperfirst(f)] for f in feat_noWght]

    corrs_feat = pd.DataFrame(
        columns=['feature', 'avg', 'log_avg', 'wght_avg', 'log_wght_avg'])
    corrs_feat.feature = feat
    corrs_feat.avg = [corr[f + '_avg'] for f in feat]
    corrs_feat.log_avg = [
        corr['log' + upperfirst(f) + '_avg'] if 'log' + upperfirst(f) +
        '_avg' in keys else np.nan for f in feat
    ]
    corrs_feat.wght_avg = [corr[f + '_wght_avg'] for f in feat]
    corrs_feat.log_wght_avg = [
        corr['log' + upperfirst(f) + '_wght_avg'] if 'log' + upperfirst(f) +
        '_wght_avg' in keys else np.nan for f in feat
    ]

    corrs_dev = pd.DataFrame(columns=['feature', 'plain', 'wght'])
    corrs_dev.feature = feat_dev
    corrs_dev.plain = [corr['device' + f] for f in feat_dev]
    corrs_dev.wght = [
        corr['device' + upperfirst(f) + '_wght'] for f in feat_dev
    ]

    return corrs_feat_noWght, corrs_feat, corrs_dev
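
The upperfirst helper is not shown; judging from the lookups above ('deltaPrev' -> 'logDeltaPrev_avg'), it presumably just upper-cases the first character, something like:

def upperfirst(s):
    # assumed helper: capitalize only the first character, leave the rest intact
    return s[:1].upper() + s[1:]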
Example #11
def runBayesOpt(model,
                include_recency=False,
                error='concordance',
                maximise=True):
    """
    Cross-validated search for parameters

    """
    nFolds = 10
    nPools = 10
    bounds = {'penalizer': (1000, 5000)}
    n_iter = 20

    print(model.RESULT_PATH)

    # load churn data for splitting fold stratas
    churnData = ChurnData()

    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    f = partial(_evaluatePenalizer,
                model=model,
                splits=splits,
                nPools=nPools,
                include_recency=include_recency,
                error=error,
                maximise=maximise)
    bOpt = BayesianOptimization(f, bounds)

    bOpt.maximize(init_points=2,
                  n_iter=n_iter,
                  acq='ucb',
                  kappa=5,
                  kernel=Matern())

    with open(
            model.RESULT_PATH + 'bayes_opt_{}{}.pkl'.format(
                error, '_rec' if include_recency else ''), 'wb') as handle:
        pickle.dump(bOpt, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return bOpt
Example #12
def runGridSearch(model, include_recency=False):
    """
    Cross-validated search for parameters

    """
    nFolds = 10
    nPools = 10
    bounds = (2000, 3000)
    n_iter = 21
    space = np.linspace(bounds[0], bounds[1], n_iter)

    print(model.RESULT_PATH)

    # load churn data for splitting fold stratas
    churnData = ChurnData()

    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    scores = []
    for p in space:
        print(p)
        scores.append(
            _evaluatePenalizer(p,
                               model=model,
                               splits=splits,
                               nPools=nPools,
                               include_recency=include_recency,
                               error=None))

    res = {
        'penalties': space,
        'scores': {k: [d[k] for d in scores]
                   for k in scores[0]}
    }

    with open(
            model.RESULT_PATH + 'grid_search{}_{}.pkl'.format(
                '_rec' if include_recency else '', n_iter), 'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res
Example #13
def crossValidate(model, penalizer=2045, include_recency=False, nFolds=10):
    churnData = ChurnData()
    cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=42)
    splits = np.array(list(cv.split(**churnData.train)))

    pool = Pool(nFolds)

    scores = pool.map(
        partial(_scoreModel,
                model=model,
                penalizer=penalizer,
                include_recency=include_recency), splits)

    res = {
        key: np.mean([score[key] for score in scores])
        for key in scores[0].keys()
    }

    pool.close()
    pool.join()

    return res
Example #14
def runL2GridSearch():
    """
    Runs grid search logistic regression model with L2 penalty
    Uses 10-fold cross validation
    """

    param_grid = {'penalty': ['l2'], 'C': np.logspace(-6, 0, 800)}

    data = ChurnData()
    model = LogisticRegression(penalty='l2')

    # fixed random state for cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # default scoring is accuracy
    grid_acc = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            verbose=1,
                            n_jobs=64,
                            cv=cv,
                            scoring='accuracy')
    grid_acc.fit(**data.train)

    grid_auc = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            verbose=1,
                            n_jobs=64,
                            cv=cv,
                            scoring='roc_auc')
    grid_auc.fit(**data.train)

    res = {'accuracy': grid_acc, 'roc_auc': grid_auc}

    with open('{}logRegL2_grid.pkl'.format(_RESULT_PATH), 'wb') as handle:
        pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return res
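
Example #3 above consumes this pickle via res['accuracy'].best_params_; the chosen C and its mean CV score can also be inspected directly:

res = runL2GridSearch()
for metric, grid in res.items():
    # best C on the log-spaced grid plus the corresponding mean CV score
    print(metric, grid.best_params_, grid.best_score_)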
Example #15
import sys

import numpy as np
import pandas as pd
from lifelines import CoxPHFitter

sys.path.insert(0, '../utils')


predPeriod = {
    'start': pd.Timestamp('2016-02-01'),
    'end': pd.Timestamp('2016-06-01')
}
predPeriodHours = (predPeriod['end'] - predPeriod['start']) / np.timedelta64(1, 'h')

class CoxChurnModel:
    def __init__(self):
        self.cf = CoxPHFitter()

    def fit(self, dataset, pred_col='deltaNextHours', event_col='observed'):
        # pred_col is the survival duration column passed to CoxPHFitter
        self.cf.fit(dataset, pred_col, event_col=event_col)

    def predict(self, df):
        pred = self.cf.predict_expectation(df)
        churned = (pred - df.recency.values.reshape((-1, 1))) > predPeriodHours
        return churned.values.reshape(-1)

    def predict_proba(self, df):
        return np.zeros(len(df))

model = CoxChurnModel()
data = ChurnData(dataset='cox')
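
A usage sketch for the Cox model, assuming ChurnData(dataset='cox') exposes the same split_train_df / split_val_df frames as Example #8:

# fit on the training split, then flag users predicted to churn in the validation split
model.fit(data.split_train_df)
churned = model.predict(data.split_val_df)
print(churned.mean())  # predicted churn rate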
Example #16
def main():
    data = ChurnData()
    model = MajorityPredictor()
    model.fit(**data.train)
    data.getScores(model)
Example #17
def __init__(self, include_recency=False):
    self.data = ChurnData(predict='deltaNextHours')
    # alternatively: features=['recency', 'logNumSessions']
    self.include_recency = include_recency