Ejemplo n.º 1
0
def analyse_model(train,
                  test,
                  input_prj,
                  model,
                  pred_fold,
                  pred_test,
                  variables_selected,
                  TMP=1234):
    """Analyse a fitted model and collect diagnostic plots in one PDF.

    train -- pandas dataframe (training data)
    test -- pandas dataframe (hold-out data)
    input_prj -- dictionary containing project information (response...)
    model -- fitted model object, passed through to the plotting helpers
    pred_fold -- pandas dataframe with out-of-fold predictions
    pred_test -- pandas dataframe with test-set predictions
    variables_selected -- list of variables in the model
    TMP -- suffix used to build a unique output file name (default 1234)

    Returns the (train_an, test_an) dataframes produced by
    prepare_data_analysis.
    """
    # Open the PDF that will collect every diagnostic figure.
    pp = PdfPages(input_prj['OUTPUT_PATH'] + "analysis_" + str(TMP) + ".pdf")
    # Merge predictions into the raw data for analysis.
    train_an, test_an = prepare_data_analysis(train, test, input_prj, model,
                                              pred_fold, pred_test)
    # Performance: pass exposure columns only when one is configured.
    exposure_col = input_prj['PRJ_COLUMN']['EXPOSURE']
    if exposure_col is not None:
        exposure_train_value = train_an[exposure_col]
        exposure_test_value = test_an[exposure_col]
    else:
        exposure_train_value = None
        exposure_test_value = None
    metric_pred_cv = error_metric(
        train_an[input_prj['PRJ_COLUMN']['RESPONSE']], train_an["Pred"],
        exposure_train_value, input_prj['METRIC'])
    metric_test = error_metric(test_an[input_prj['PRJ_COLUMN']['RESPONSE']],
                               test_an["Pred"], exposure_test_value,
                               input_prj['METRIC'])
    print("    Fold " + input_prj['METRIC'] + " : " + str(metric_pred_cv))
    print("    Test " + input_prj['METRIC'] + " : " + str(metric_test))
    # Variable importance
    print("    Variable importance")
    plot_var = variable_importance_plot(train, input_prj, model,
                                        variables_selected)
    pp.savefig(plot_var)
    print("    Lift chart")
    plot_lift = lift_chart(train_an, input_prj['PRJ_COLUMN']["RESPONSE"],
                           "Pred")
    pp.savefig(plot_lift)
    # Marginal effect
    print("    Marginal effect")
    marginal_effect(train, pred_fold, input_prj, model, variables_selected, pp)
    # Close the PDF so the file is flushed to disk.
    pp.close()
    return train_an, test_an
Ejemplo n.º 2
0
def lightgbm_tp(train, prj_info, setting):
    """Naive random-search tuning of LightGBM parameters.

    train -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    setting -- dictionary containing settings ('params' grid and
               'naive_tp_time' time budget in seconds)

    Returns (best_experiment, y_test, X_test, W_test, best_lightgbm,
    data_results).
    """
    # Split data on the pre-assigned folds.
    train_, test_ = split_data(train, prj_info['PRJ_COLUMN']['FOLD_ASSIGN'])

    # Build data
    y_train, y_test, X_train, X_test, W_train, W_test, O_train, monotonicity_vec = build_data(
        train_, test_, prj_info)

    # LightGBM datasets
    lgb_train = lgb.Dataset(X_train,
                            y_train,
                            weight=W_train,
                            init_score=O_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # Expand the parameter grid into the list of candidate experiments.
    param_mono = {'monotone_constraints': monotonicity_vec}
    params = setting['params']
    keys, values = zip(*params.items())
    experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Random search until the time budget is spent (and >= 3 results exist).
    data_results = pd.DataFrame([])
    timeout = time.time() + setting['naive_tp_time']
    loop_exp = 0
    exp_tested = []
    while True:
        # Break if time out
        if time.time() > timeout and data_results.shape[0] > 2:
            break
        # Copy the chosen experiment: updating the dict returned by
        # random.choice in place would mutate the shared grid entry and
        # alias entries stored in exp_tested.
        exp = dict(random.choice(experiments))
        exp.update(param_mono)
        # Model
        lightgbm = lgb.train(exp,
                             train_set=lgb_train,
                             num_boost_round=5000,
                             early_stopping_rounds=20,
                             valid_sets=lgb_eval,
                             verbose_eval=False)
        # Predict
        pred_train = lightgbm.predict(X_train,
                                      num_iteration=lightgbm.best_iteration)
        pred_test = lightgbm.predict(X_test,
                                     num_iteration=lightgbm.best_iteration)
        # Persist each candidate model so the best one can be reloaded later.
        if not os.path.exists(prj_info['OUTPUT_PATH'] + "shadow/"):
            os.makedirs(prj_info['OUTPUT_PATH'] + "shadow/")
        lightgbm.save_model(prj_info['OUTPUT_PATH'] + "shadow/" +
                            str(loop_exp) + "model.txt")

        # Metric
        metric_train = error_metric(y_train, pred_train, W_train,
                                    prj_info['METRIC'])
        metric_test = error_metric(y_test, pred_test, W_test,
                                   prj_info['METRIC'])
        # Save results (pd.concat: DataFrame.append was removed in pandas 2.x)
        data_results_ = pd.DataFrame.from_dict(exp, orient='index')
        data_results_ = data_results_.transpose()
        data_results_["train"] = metric_train
        data_results_["test"] = metric_test
        data_results = pd.concat([data_results, data_results_])

        exp_tested.append(exp)
        loop_exp = loop_exp + 1

    # Find the best experiment.
    # NOTE(review): idxmax assumes a higher test metric is better -- confirm
    # this holds for every prj_info['METRIC'] (error metrics need idxmin).
    data_results = data_results.reset_index(drop=True)
    best_experiment_index = data_results["test"].idxmax()
    best_experiment = exp_tested[best_experiment_index]
    # Reload the best model from disk.
    best_lightgbm = lgb.Booster(model_file=prj_info['OUTPUT_PATH'] +
                                "shadow/" + str(best_experiment_index) +
                                "model.txt")
    # loop_exp was incremented once per model, so it already equals the
    # number of models built (the original printed loop_exp + 1, one too many).
    print("        " + str(loop_exp) + " models built")

    return best_experiment, y_test, X_test, W_test, best_lightgbm, data_results
Ejemplo n.º 3
0
def lightgbm_final_model(train, test, prj_info, best_experiment):
    """Fit the final LightGBM model with cross-validated early stopping.

    train -- pandas dataframe
    test -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    best_experiment -- dictionary of tuned LightGBM parameters

    Returns (lightgbm, pred_fold, pred_test): the full model, out-of-fold
    predictions and test-set predictions (both as dataframes keyed on the
    project index column).
    """
    # Param
    params = best_experiment
    # Build data
    y_train, y_test, X_train, X_test, W_train, W_test, O_train, monotonicity_vec = build_data(
        train, test, prj_info)

    # CV models: one per pre-assigned fold; used to pick num_boost_round.
    metric_cv = []
    pred_fold = pd.DataFrame([])
    best_it = []
    for fold in range(1,
                      max(train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']]) + 1):
        print('         Fold ' + str(fold))
        train_fold = train[
            train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] != fold]
        test_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold]
        test_fold_idx = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] ==
                              fold][prj_info['PRJ_COLUMN']['INDEX']]
        y_train_fold, y_valid_fold, X_train_fold, X_valid_fold, W_train_fold, W_test_fold, O_train_fold, monotonicity_vec = build_data(
            train_fold, test_fold, prj_info)

        # Build dataset. Bug fix: the fold dataset must use the fold's own
        # init score (O_train_fold) -- the original passed the full-train
        # O_train, whose length does not match X_train_fold.
        lgb_train_fold = lgb.Dataset(X_train_fold,
                                     y_train_fold,
                                     weight=W_train_fold,
                                     init_score=O_train_fold)
        lgb_eval_fold = lgb.Dataset(X_valid_fold,
                                    y_valid_fold,
                                    reference=lgb_train_fold)

        # Model cv
        lightgbm_cv = lgb.train(params,
                                num_boost_round=5000,
                                early_stopping_rounds=20,
                                train_set=lgb_train_fold,
                                valid_sets=lgb_eval_fold,
                                verbose_eval=False)

        # Predict
        pred_valid_fold = lightgbm_cv.predict(
            X_valid_fold, num_iteration=lightgbm_cv.best_iteration)
        pred_fold_data = pd.DataFrame(
            data={
                prj_info['PRJ_COLUMN']['INDEX']: test_fold_idx,
                'Pred': pred_valid_fold
            })
        # pd.concat: DataFrame.append was removed in pandas 2.x.
        pred_fold = pd.concat([pred_fold, pred_fold_data])
        # Metric
        metric_test_cv = error_metric(y_valid_fold, pred_valid_fold,
                                      W_test_fold, prj_info['METRIC'])
        print(metric_test_cv)
        # Save results
        metric_cv.append(metric_test_cv)
        # Save best It
        best_it.append(lightgbm_cv.best_iteration)

    metric_cv_mean = np.mean(metric_cv)
    best_it_mean = np.mean(best_it)
    # Full model: refit on all of train with the mean best iteration count.
    lgb_train = lgb.Dataset(X_train,
                            y_train,
                            weight=W_train,
                            init_score=O_train)

    print('         Full model')
    lightgbm = lgb.train(params,
                         num_boost_round=int(round(best_it_mean)),
                         train_set=lgb_train,
                         verbose_eval=False)

    pred_test = lightgbm.predict(X_test)
    metric_test = error_metric(y_test, pred_test, W_test, prj_info['METRIC'])
    pred_test = pd.DataFrame(
        data={
            prj_info['PRJ_COLUMN']['INDEX']: test[prj_info['PRJ_COLUMN']
                                                  ['INDEX']],
            'Pred': pred_test
        })

    print("    Fold mean " + prj_info['METRIC'] + " : " + str(metric_cv_mean))
    print("    Test " + prj_info['METRIC'] + " : " + str(metric_test))

    return lightgbm, pred_fold, pred_test
Ejemplo n.º 4
0
def keras_final_model(train,test,prj_info,settings):
    """Fit the final Keras classifier with cross-validated early stopping.

    train -- pandas dataframe
    test -- pandas dataframe
    prj_info -- dictionnary containing projet information (response...)
    settings -- dictionnary containing settings

    Returns (clf, pred_fold, pred_test, variables_selected, le_X, scale_X):
    the fitted classifier, out-of-fold predictions, test-set predictions,
    the model variables, and the fitted encoders/scalers needed to prepare
    new data.
    """
    #Build data
    y_train,y_test,X_train,X_test,W_train,W_test,O_train,monotonicity_vec = build_data(train,test,prj_info)
    variables_selected = X_train.columns.values
    
    #Prep data: fit encoders/scalers on the full train set, reuse on test.
    X_train,le_X,scale_X = keras_prep_data(X_train)
    X_test,le_X,scale_X = keras_prep_data(X_test,le_X,scale_X)

    #Early stop on validation loss during the CV fits below.
    early_stop = EarlyStopping(monitor='val_loss', patience=20, mode='auto') 
    
    #Model factory passed to KerasClassifier (rebuilt per estimator).
    def bp02_model(input_dim = None):
        adam = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
        model = Sequential()
        model.add(Dense(50, input_dim = input_dim, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1)) #1500
        model.add(Dense(20, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1)) #750
        model.add(Dense(20, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1))#750
        model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer=adam)
        return model

    #Cv Model: one fit per pre-assigned fold, to estimate the epoch count.
    #NOTE(review): le_X/scale_X were fitted on the FULL train set above and
    #are reused inside each fold, so the validation fold's rows influenced
    #the transforms -- possible leakage; confirm this is intended.
    metric_cv = []
    pred_fold = pd.DataFrame([])
    best_it = []
    for fold in range(1,max(train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']])+1):
        print('         Fold ' + str(fold))
        train_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] != fold]
        test_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold]
        test_fold_idx = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold][prj_info['PRJ_COLUMN']['INDEX']]
        y_train_fold,y_valid_fold,X_train_fold,X_valid_fold,W_train_fold,W_test_fold,O_train_fold,monotonicity_vec = build_data(train_fold,test_fold,prj_info)

        #Prep data (apply the encoders/scalers fitted above, no refit)
        X_train_fold,le_X,scale_X = keras_prep_data(X_train_fold,le_X,scale_X)
        X_valid_fold,le_X,scale_X = keras_prep_data(X_valid_fold,le_X,scale_X)
        
        #Estimator
        #input_dim comes from the full X_train; assumes fold data has the
        #same columns after prep -- TODO confirm.
        clf_fold = KerasClassifier(build_fn = bp02_model,
                                   input_dim = X_train.shape[1],
                                   epochs = settings['params']['epochs'],
                                   batch_size = settings['params']['batch_size'],
                                   verbose = settings['params']['verbose'],
                                   callbacks=[early_stop])
        
        #Model cv
        history_fold =  clf_fold.fit(X_train_fold,y_train_fold, validation_data = (X_valid_fold,y_valid_fold))
        #Predict: keep only the positive-class probability (column 1).
        pred_valid_fold = clf_fold.predict_proba(X_valid_fold)
        pred_valid_fold = [item[1] for item in pred_valid_fold]
        pred_fold_data = pd.DataFrame(data={prj_info['PRJ_COLUMN']['INDEX']: test_fold_idx, 'Pred' : pred_valid_fold})
        pred_fold = pred_fold.append(pred_fold_data)
        #Metric
        metric_test_cv = error_metric(y_valid_fold,pred_valid_fold,W_test_fold,prj_info['METRIC'])
        print(metric_test_cv)
        #Save results
        metric_cv.append(metric_test_cv)
        #Save best Iteration: 1-based epoch with the lowest validation loss.
        best_it_fold = history_fold.history['val_loss'].index(min(history_fold.history['val_loss']))+1
        best_it.append(best_it_fold)

    metric_cv_mean = np.mean(metric_cv)
    best_it_mean = np.mean(best_it)
    
    #Full model: refit on all of train for the mean best epoch count,
    #without early stopping.
    print('         Full model')

    #Estimator
    clf = KerasClassifier(build_fn = bp02_model,
                          input_dim = X_train.shape[1],
                          epochs = int(round(best_it_mean)),
                          batch_size = settings['params']['batch_size'],
                          verbose = settings['params']['verbose'])
    
    clf.fit(X_train,y_train)

    pred_test = clf.predict_proba(X_test)
    pred_test = [item[1] for item in pred_test]
    metric_test = error_metric(y_test,pred_test,W_test,prj_info['METRIC'])
    pred_test = pd.DataFrame(data={prj_info['PRJ_COLUMN']['INDEX']: test[prj_info['PRJ_COLUMN']['INDEX']], 'Pred' : pred_test})
    
    print("    Fold mean " + prj_info['METRIC'] + " : " + str(metric_cv_mean))
    print("    Test " + prj_info['METRIC'] + " : " + str(metric_test))
    
    return clf,pred_fold,pred_test,variables_selected,le_X,scale_X