# Legacy scikit-learn (< 0.18) API: these splitters live in
# sklearn.cross_validation; newer versions renamed them to KFold, GroupKFold,
# ShuffleSplit and GroupShuffleSplit in sklearn.model_selection.
import numpy as np
from sklearn.cross_validation import (KFold, LabelKFold, ShuffleSplit,
                                      LabelShuffleSplit)
from sklearn.preprocessing import OneHotEncoder


def classification_model(model, data, predictors, label,
                         categorical_features=None, cv_label_name=None,
                         k=5, test_size=0.1, n_iter=100, train_only=False):
    data_len = len(data)
    auc, r2, rmse, acc = [], [], [], []

    predictors = [p.strip() for p in predictors]

    # Optional grouping label: rows sharing a label never appear in both the
    # train and the test split of the same fold.
    cv_label = data[cv_label_name] if cv_label_name is not None else None

    if k is not None:
        if cv_label is not None:
            cv = LabelKFold(cv_label, n_folds=k)
        else:
            cv = KFold(data_len, n_folds=k, shuffle=True)
    elif test_size is not None and n_iter is not None:
        if cv_label is not None:
            cv = LabelShuffleSplit(cv_label, n_iter=n_iter,
                                   test_size=test_size, random_state=42)
        else:
            cv = ShuffleSplit(data_len, n_iter=n_iter,
                              test_size=test_size, random_state=42)
    else:
        raise ValueError('Set k, or both test_size and n_iter.')

    for train, test in cv:
        x_train = data[predictors].iloc[train, :]
        y_train = data[label].iloc[train]
        x_test = data[predictors].iloc[test, :]
        y_test = data[label].iloc[test]

        if categorical_features is not None:
            # One-hot encode the named columns; fitting on train + test keeps
            # the category mapping identical across both splits.
            feature_idxs = [x_train.columns.get_loc(name)
                            for name in categorical_features]
            encoder = OneHotEncoder(categorical_features=feature_idxs)
            encoder.fit(np.vstack((x_train, x_test)))
            x_train = encoder.transform(x_train)
            x_test = encoder.transform(x_test)

        model.fit(x_train, y_train)
        if train_only:
            # Score on the training split instead, e.g. to gauge overfitting.
            x_test = x_train
            y_test = y_train
        y_pred_p = model.predict_proba(x_test)[:, 1]
        y_pred_c = model.predict(x_test)

        # binary_classification_metrics is assumed to be defined elsewhere
        # and to return (auc, r2, rmse, accuracy).
        a, b, c, d = binary_classification_metrics(y_test, y_pred_p, y_pred_c)
        auc.append(a)
        r2.append(b)
        rmse.append(c)
        acc.append(d)

    return np.mean(auc), np.mean(r2), np.mean(rmse), np.mean(acc)
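For illustration, a minimal usage sketch under assumed inputs: the DataFrame, its column names, and the logistic-regression model below are hypothetical, not part of the original snippet.

# Hypothetical usage sketch: toy data with one categorical predictor and a
# per-user grouping column; none of these names come from the original code.
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'age': rng.randint(20, 60, 200),
    'segment': rng.randint(0, 3, 200),   # integer-coded categorical feature
    'user_id': rng.randint(0, 40, 200),  # grouping label for LabelKFold
    'churned': rng.randint(0, 2, 200),   # binary target
})

auc, r2, rmse, acc = classification_model(
    LogisticRegression(), df,
    predictors=['age', 'segment'], label='churned',
    categorical_features=['segment'], cv_label_name='user_id', k=5)
print('AUC %.3f, accuracy %.3f' % (auc, acc))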
def majority_model(data, label):
    # Baseline that always predicts the majority class. Note: this writes a
    # 'majority' column into the caller's DataFrame.
    mean_label = np.mean(data[label])
    data['majority'] = 1 if mean_label >= 0.5 else 0

    return binary_classification_metrics(data[label], data['majority'],
                                         data['majority'])
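A quick comparison against this baseline, reusing the hypothetical `df` from the sketch above:

# Majority-class baseline on the same hypothetical data; with constant 0/1
# "probabilities" the AUC term is not meaningful, so compare accuracy.
base_auc, base_r2, base_rmse, base_acc = majority_model(df, 'churned')
print('baseline accuracy %.3f' % base_acc)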
Example #5
# Sanity checks: all three distance implementations must agree with an
# explicit L1 (Manhattan) distance between one test and one train sample.
dists = knn_classifier.compute_distances_two_loops(binary_test_X)
assert np.isclose(dists[0, 10],
                  np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

dists = knn_classifier.compute_distances_one_loop(binary_test_X)
assert np.isclose(dists[0, 10],
                  np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

dists = knn_classifier.compute_distances_no_loops(binary_test_X)
assert np.isclose(dists[0, 10],
                  np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))
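For reference, a minimal sketch of how the no-loops variant can be written with NumPy broadcasting; the function name is hypothetical and this is an assumed implementation, not the `KNN` class's actual code:

import numpy as np

def l1_distances_no_loops(test_X, train_X):
    # Broadcasting test_X[:, None, :] against train_X[None, :, :] yields a
    # [num_test, num_train, d] difference tensor; summing |.| over the last
    # axis gives the full pairwise L1 distance matrix without Python loops.
    # The intermediate tensor costs O(num_test * num_train * d) memory,
    # which is the usual price of the fully vectorized version.
    return np.abs(test_X[:, None, :] - train_X[None, :, :]).sum(axis=-1)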

RUN_PREDICTIONS = False  # flip to True to run the (slow) KNN predictions

if RUN_PREDICTIONS:
    prediction = knn_classifier.predict(binary_test_X, num_loops=1)

    accuracy, precision, recall, f1 = binary_classification_metrics(
        prediction, binary_test_y)
    print("KNN with k = %s" % knn_classifier.k)
    print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f" %
          (accuracy, precision, recall, f1))

    knn_classifier_3 = KNN(k=3)
    knn_classifier_3.fit(binary_train_X, binary_train_y)
    prediction = knn_classifier_3.predict(binary_test_X, num_loops=1)

    accuracy, precision, recall, f1 = binary_classification_metrics(
        prediction, binary_test_y)
    print("KNN with k = %s" % knn_classifier_3.k)
    print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f" %
          (accuracy, precision, recall, f1))
Example #6
def culc_f1_score(train_folds_X, train_folds_y, val_X, val_y, num_folds, K):
    # Restrict the task to a binary problem: digit 0 vs. digit 9.
    # (num_folds is unused in this body; kept for the caller's interface.)
    binary_train_mask = (train_folds_y == 0) | (train_folds_y == 9)
    binary_train_X = train_folds_X[binary_train_mask]
    binary_train_y = train_folds_y[binary_train_mask] == 0  # True for class 0

    binary_test_mask = (val_y == 0) | (val_y == 9)
    binary_test_X = val_X[binary_test_mask]
    binary_test_y = val_y[binary_test_mask] == 0

    # Reshape images to 1-dimensional arrays [num_samples, 32*32*3].
    binary_train_X = binary_train_X.reshape(binary_train_X.shape[0], -1)
    binary_test_X = binary_test_X.reshape(binary_test_X.shape[0], -1)

    # Create the classifier and call fit to train the model;
    # KNN just remembers all the data.
    knn_classifier = KNN(k=K)
    knn_classifier.fit(binary_train_X, binary_train_y)

    dists = knn_classifier.compute_distances_no_loops(binary_test_X)
    # Sanity check against an explicit L1 distance for one sample pair.
    assert np.isclose(dists[0, 10],
                      np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

    prediction = knn_classifier.predict(binary_test_X)
    precision, recall, f1, accuracy = binary_classification_metrics(
        prediction, binary_test_y)
    return f1
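For context, a minimal sketch of a cross-validation driver that could call `culc_f1_score` to pick k; the fold-splitting code, the `train_X`/`train_y` names, and the candidate k values are assumptions, not from the original:

# Hypothetical driver: evaluate several k values with num_folds-fold CV.
# train_X / train_y and the KNN class are assumed to exist as above.
num_folds = 5
fold_X = np.array_split(train_X, num_folds)
fold_y = np.array_split(train_y, num_folds)

k_to_f1 = {}
for K in [1, 3, 5, 8, 10]:
    f1_scores = []
    for i in range(num_folds):
        # Fold i is the validation split; the rest is training data.
        tr_X = np.concatenate(fold_X[:i] + fold_X[i + 1:])
        tr_y = np.concatenate(fold_y[:i] + fold_y[i + 1:])
        f1_scores.append(culc_f1_score(tr_X, tr_y, fold_X[i], fold_y[i],
                                       num_folds, K))
    k_to_f1[K] = np.mean(f1_scores)

best_k = max(k_to_f1, key=k_to_f1.get)
print('best k = %s (mean F1 %.3f)' % (best_k, k_to_f1[best_k]))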
Example #7
# Time the fully vectorized distance computation (IPython line magic).
get_ipython().run_line_magic('timeit', 'knn_classifier.compute_distances_no_loops(binary_test_X)')

#%%
prediction = knn_classifier.predict(binary_test_X)
prediction

#%%
def print_samples(samples):
    # Display each flattened sample as a 32x32 RGB image
    # (assumes matplotlib.pyplot is imported as plt).
    for i in range(samples.shape[0]):
        image = np.reshape(samples[i], (32, 32, 3))
        plt.imshow(image.astype(np.uint8))
        plt.axis("off")
        plt.show()
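
#%%
# Hypothetical usage sketch (not in the original notebook): display the first
# three binary test samples; each flattened row reshapes to a 32x32x3 image.
print_samples(binary_test_X[:3])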

#%%
precision, recall, f1, accuracy = binary_classification_metrics(prediction, binary_test_y)
print("KNN with k = %s" % knn_classifier.k)
print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f" % (accuracy, precision, recall, f1)) 

#%%
# Let's put everything together and run KNN with k=3 and see how we do
knn_classifier_3 = KNN(k=3)
knn_classifier_3.fit(binary_train_X, binary_train_y)
prediction = knn_classifier_3.predict(binary_test_X)

precision, recall, f1, accuracy = binary_classification_metrics(prediction, binary_test_y)
print("KNN with k = %s" % knn_classifier_3.k)
print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f" % (accuracy, precision, recall, f1)) 

#%% [markdown]
# # Cross-validation