def runRandomForest(filename):
	df_train = pd.read_csv('../data/%s_train.csv' % filename)
	df_test = pd.read_csv('../data/%s_test.csv' % filename)

	train_y = df_train.y
	df_train.drop('y', inplace=True, axis=1)
	train_X = df_train

	test_y = df_test.y
	df_test.drop('y', inplace=True, axis=1)
	test_X = df_test

	# model: number of trees tuned per dataset
	i = 10  # fallback so i is always defined (this default is an assumption); overridden below
	if filename == 'all_after_discretion_of_continuous_val':
		i = 30
	if filename == 'all_after_expand_and_discretion':
		i = 20
	if filename == 'all_rule5':
		i = 40
	forest = RandomForestClassifier(n_estimators=i, max_depth=5, min_samples_split=10, bootstrap=True, n_jobs=3)
	print("n_estimators " + str(i))
	forest.fit(train_X, train_y)
	pred_y = forest.predict(test_X)
	prob = forest.predict_proba(test_X)
	print('auc score:', calAUC(test_y, prob[:, 1]))
	print('f1_score:', f1_score(test_y, pred_y, labels=[1], average=None))
	print('lift:', lift_score(test_y, pred_y))

	print("score: "+str(forest.score(test_X, test_y)))
	drawConfusionMatrix(pred_y, test_y)
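# runRandomForest above (and runLR below) call two helpers that are not part of
# this listing. A minimal sketch of what they might look like, assuming calAUC
# wraps sklearn's roc_auc_score and drawConfusionMatrix renders sklearn's
# confusion matrix (the names and argument order come from the calls above; the
# bodies are assumptions):
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

def calAUC(y_true, y_score):
    # AUC from the predicted probability of the positive class
    return roc_auc_score(y_true, y_score)

def drawConfusionMatrix(pred_y, test_y):
    # plot the confusion matrix for the test-set predictions
    cm = confusion_matrix(test_y, pred_y)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    plt.show()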
def test_binary_with_numpy():
    y_targ = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0])
    y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0])
    x = 1.25
    y = lift_score(y_targ, y_pred, binary=False, positive_label=1)
    assert_array_equal(x, y)
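# The expected value 1.25 can be checked by hand: lift is the precision on the
# positive class divided by the positive-class prevalence. A quick standalone
# verification (not part of the original test suite):
import numpy as np

y_targ = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0])
tp = np.sum((y_targ == 1) & (y_pred == 1))      # 6 true positives
precision = tp / np.sum(y_pred == 1)            # 6 / 9
prevalence = np.sum(y_targ == 1) / len(y_targ)  # 8 / 15
assert np.isclose(precision / prevalence, 1.25)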
def plotUnivariateROC(dataset, label_name, k, cs, rs):
    '''
    Plot single-fold ROC curves for logistic regressions over a grid of C values.

    dataset    : DataFrame holding the feature columns plus the label column
    label_name : name of the label column in dataset
    k          : number of cross-validation folds
    cs         : iterable of inverse-regularization strengths (C) to try
    rs         : random seed for shuffling the folds
    '''
    X = dataset.drop(label_name, axis=1)
    Y = dataset[label_name]
    k_fold = KFold(n_splits=k, shuffle=True, random_state=rs)
    aucs = {}
    for j, (train, test) in enumerate(k_fold.split(X)):
        if j == 9:
            # only the last fold is evaluated; the data is biased, since the
            # recent data is easier to predict
            for c in cs:
                LRcs = LogisticRegression(C=c)
                #sc = StandardScaler()
                tx = X.iloc[train, :]
                ty = Y.iloc[train].astype('int64')  # Y is 1-dimensional
                #sc.fit(tx)
                #tx = sc.transform(tx)
                rx = X.iloc[test, :]
                #rx = sc.transform(rx)
                ry = Y.iloc[test].astype('int64')
                LRcs.fit(tx, ty)

                fpr, tpr, thresholds = roc_curve(ry,
                                                 LRcs.predict_proba(rx)[:, 1])
                roc_auc = auc(fpr, tpr)
                new_auc = 'AUC_{}_{}'.format(c, j)  #name of auc
                aucs[new_auc] = roc_auc
                cl = (np.random.rand(), np.random.rand(), np.random.rand())
                #print Lift score
                LS = lift_score(ry, LRcs.predict(rx))
                print('Lift Score for c={} is :{}'.format(c, LS))
                #create a plot and set some options
                plt.plot(fpr,
                         tpr,
                         color=cl,
                         label='AUC_{} (AUC = {:.3f})'.format(c, roc_auc))

                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.0])
                plt.xlabel('FPR')
                plt.ylabel('TPR')
                plt.title('ROC', fontsize=25)
                plt.legend(loc="lower right")

    #lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr, cv = kfolds, scoring = 'roc_auc')
    #lr_grid_search.fit(X, Y)
    return aucs
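# A hypothetical usage sketch with synthetic data (the DataFrame, the grid of C
# values, and the seed below are illustrative assumptions; k=10 so that the
# fold j == 9 handled inside the function actually exists):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(200, 3), columns=['f1', 'f2', 'f3'])
df['y'] = (df['f1'] + rng.rand(200) > 1.0).astype(int)
aucs = plotUnivariateROC(df, 'y', k=10, cs=[0.01, 0.1, 1.0, 10.0], rs=42)
print(aucs)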
Example #4
def calculate_class_lift(y_test, class_predictions, model_uid):
    """
    Calculates the lift of a model, based on predicted class labels.

    :param y_test: y_test series
    :param class_predictions: class predictions series
    :param model_uid: model uid
    """
    lift = lift_score(y_test, class_predictions)
    pd.DataFrame({
        'lift': [lift]
    }).to_csv(os.path.join('modeling', model_uid, 'diagnostics',
                           'evaluation_plots', 'class_lift.csv'),
              index=False)
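# A hypothetical usage sketch: calculate_class_lift assumes the target directory
# tree already exists, so it is created first (the model_uid 'rf_v1' and the two
# series below are illustrative assumptions):
import os
import pandas as pd

os.makedirs(os.path.join('modeling', 'rf_v1', 'diagnostics', 'evaluation_plots'),
            exist_ok=True)
calculate_class_lift(pd.Series([1, 0, 1, 1, 0]), pd.Series([1, 0, 1, 0, 0]), 'rf_v1')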
def runLR(filename):
	df_train = pd.read_csv('../data/%s_train.csv' % filename)
	df_test = pd.read_csv('../data/%s_test.csv' % filename)

	train_y = df_train.y
	df_train.drop('y', inplace=True, axis=1)
	train_X = df_train

	test_y = df_test.y
	df_test.drop('y', inplace=True, axis=1)
	test_X = df_test

	#model
	clf = LogisticRegression()

	clf.fit(train_X, train_y)
	pred_y = clf.predict(test_X)
	prob = clf.predict_proba(test_X)
	print('auc score:', calAUC(test_y, prob[:, 1]))
	print('f1_score:', f1_score(test_y, pred_y))
	print('lift:', lift_score(test_y, pred_y))
	print('score: ' + str(clf.score(test_X, test_y)))
	drawConfusionMatrix(pred_y, test_y)
def test_multidimension():
    y_targ = [[1, 1, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]]
    y_pred = [[1, 0, 1, 0, 0, 1]]
    x = 1
    y = lift_score(y_targ, y_pred, binary=False, positive_label=1)
    assert_array_equal(x, y)
def test_binary():
    y_targ = [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0]
    y_pred = [1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0]
    x = 1.25
    y = lift_score(y_targ, y_pred, binary=False, positive_label=1)
    assert_array_equal(x, y)
def test_multiclass_positive_label_0():
    y_targ = [1, 1, 1, 0, 0, 2, 0, 3, 4]
    y_pred = [1, 0, 1, 0, 0, 2, 1, 3, 0]
    x = 1.5
    y = lift_score(y_targ, y_pred, binary=True, positive_label=0)
    assert_array_equal(x, y)
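# With binary=True the labels are first mapped onto "equals positive_label", so
# positive_label=0 treats class 0 as the positive class. Checking the expected
# 1.5 by hand (a standalone verification, not part of the original tests):
import numpy as np

y_targ = np.array([1, 1, 1, 0, 0, 2, 0, 3, 4]) == 0  # 3 positives out of 9
y_pred = np.array([1, 0, 1, 0, 0, 2, 1, 3, 0]) == 0  # 4 predicted positives
tp = np.sum(y_targ & y_pred)                          # 2 true positives
lift = (tp / y_pred.sum()) / (y_targ.sum() / y_targ.size)  # (2/4) / (3/9)
assert np.isclose(lift, 1.5)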
Example #9
 X_train, X_test, y_train, y_test = train_test_split(X,
                                                     Y,
                                                     test_size=0.33)
 logreg = linear_model.LogisticRegression()
 logreg.fit(X_train, y_train)
 predictions = logreg.predict(X_test)
 np.mean(y_test == predictions)
 #Obtained Accuracy : 90.44361 (with 'duration')
 lrroc.append(
     np.mean(
         cross_val_score(linear_model.LogisticRegression(),
                         X_train,
                         y_train,
                         scoring='roc_auc',
                         cv=5)))
 #roc_auc : 0.9286
 lrlift.append(lift_score(y_test, predictions))
 #ALIFT : 5.6524
 #Decision Tree
 clf = tree.DecisionTreeClassifier()
 clf = clf.fit(X_train, y_train)
 pred = clf.predict(X_test)
 np.mean(y_test == pred)
 #Obtained Accuracy : 88.39844% (with 'duration')
 #Calculating the roc_auc scores.
 dtroc.append(
     np.mean(
         cross_val_score(tree.DecisionTreeClassifier(),
                         X_train,
                         y_train,
                         scoring='roc_auc',
                         cv=5)))
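 # The listing ends before the decision-tree lift is recorded; a plausible
 # continuation mirroring the logistic-regression lines above (dtlift is
 # assumed to be a list defined alongside lrroc, lrlift, and dtroc):
 dtlift.append(lift_score(y_test, pred))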