import numpy as np
# Cross-validation iterators from the pre-0.18 scikit-learn API
# (sklearn.cross_validation); in modern releases the Label* variants
# became GroupKFold / GroupShuffleSplit in sklearn.model_selection.
from sklearn.cross_validation import (KFold, LabelKFold,
                                      ShuffleSplit, LabelShuffleSplit)
from sklearn.preprocessing import OneHotEncoder


def classification_model(model, data, predictors, label, categorical_features=None,
                         cv_label_name=None, k=5, test_size=0.1, n_iter=100,
                         train_only=False):
    data_len = len(data)
    cv = None
    auc, r2, rmse, acc = [], [], [], []
    # print('Predictors:', predictors)
    predictors = [p.strip() for p in predictors]

    if cv_label_name is not None:
        cv_label = data[cv_label_name]
    else:
        cv_label = None

    # Choose a CV strategy: k-fold when k is given, repeated shuffle splits
    # otherwise; the Label* variants keep rows sharing a cv_label value
    # in the same fold.
    if k is not None and cv_label is not None:
        cv = LabelKFold(cv_label, n_folds=k)
    elif k is not None and cv_label is None:
        cv = KFold(data_len, n_folds=k, shuffle=True)
    if k is None and test_size is not None and n_iter is not None and cv_label is not None:
        cv = LabelShuffleSplit(cv_label, n_iter=n_iter, test_size=test_size,
                               random_state=42)
    if k is None and test_size is not None and n_iter is not None and cv_label is None:
        cv = ShuffleSplit(data_len, n_iter=n_iter, test_size=test_size,
                          random_state=42)

    for train, test in cv:
        x_train = data[predictors].iloc[train, :]
        y_train = data[label].iloc[train]
        x_test = data[predictors].iloc[test, :]
        y_test = data[label].iloc[test]

        if categorical_features is not None:
            feature_idxs = [x_train.columns.get_loc(name)
                            for name in categorical_features]
            encoder = OneHotEncoder(categorical_features=feature_idxs)
            # Fit on train and test together so both splits share one encoding.
            encoder.fit(np.vstack((x_train, x_test)))
            x_train = encoder.transform(x_train)
            x_test = encoder.transform(x_test)

        model.fit(x_train, y_train)

        if train_only:
            x_test = x_train
            y_test = y_train

        y_pred_p = model.predict_proba(x_test)[:, 1]
        y_pred_c = model.predict(x_test)
        a, b, c, d = binary_classification_metrics(y_test, y_pred_p, y_pred_c)
        auc.append(a)
        r2.append(b)
        rmse.append(c)
        acc.append(d)
        # print('auc:', a)
        # print('r2:', b)
        # print('rmse:', c)
        # print('accuracy:', d)

    return np.mean(auc), np.mean(r2), np.mean(rmse), np.mean(acc)
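# Hedged usage sketch, not from the original pipeline: the toy DataFrame,
# its column names, and the LogisticRegression choice are illustrative
# assumptions, and binary_classification_metrics is assumed to be defined
# elsewhere in this project.
import pandas as pd
from sklearn.linear_model import LogisticRegression

toy = pd.DataFrame({'age': np.random.randint(18, 70, 200),
                    'income': np.random.randint(20, 200, 200),
                    'churned': np.random.randint(0, 2, 200)})
auc, r2, rmse, acc = classification_model(
    LogisticRegression(), toy,
    predictors=['age', 'income'], label='churned', k=5)
print('CV AUC: %.3f, accuracy: %.3f' % (auc, acc))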
def majority_model(data, label):
    # Baseline that always predicts the majority class.
    # Note: mutates `data` by adding a constant 'majority' column.
    mean_label = np.mean(data[label])
    if mean_label >= 0.5:
        data['majority'] = 1
    else:
        data['majority'] = 0
    return binary_classification_metrics(data[label], data['majority'], data['majority'])
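# Hedged follow-up: compare the model above against the majority-class
# baseline on the same illustrative `toy` frame; any model worth keeping
# should beat these numbers.
base_auc, base_r2, base_rmse, base_acc = majority_model(toy, 'churned')
print('Baseline AUC: %.3f, accuracy: %.3f' % (base_auc, base_acc))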
# Sanity-check all three distance implementations against a hand-computed
# L1 distance between one test sample and one train sample.
dists = knn_classifier.compute_distances_two_loops(binary_test_X)
assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

dists = knn_classifier.compute_distances_one_loop(binary_test_X)
assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

dists = knn_classifier.compute_distances_no_loops(binary_test_X)
assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

if False:  # disabled evaluation block: flip to True to run it
    prediction = knn_classifier.predict(binary_test_X, num_loops=1)
    precision, recall, f1, accuracy = binary_classification_metrics(
        prediction, binary_test_y)
    print("KNN with k = %s" % knn_classifier.k)
    print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
          % (accuracy, precision, recall, f1))

    knn_classifier_3 = KNN(k=3)
    knn_classifier_3.fit(binary_train_X, binary_train_y)
    prediction = knn_classifier_3.predict(binary_test_X, num_loops=1)
    precision, recall, f1, accuracy = binary_classification_metrics(
        prediction, binary_test_y)
    print("KNN with k = %s" % knn_classifier_3.k)
    print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
          % (accuracy, precision, recall, f1))
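# A minimal sketch of the fully vectorized variant the asserts above exercise,
# under the assumption (implied by the hand-computed check) that the KNN class
# uses the L1/Manhattan distance. This is an illustration, not the assignment's
# KNN.compute_distances_no_loops itself.
import numpy as np

def l1_distances_no_loops(test_X, train_X):
    # Broadcasting (num_test, 1, F) against (1, num_train, F) yields a
    # (num_test, num_train, F) difference cube; summing |.| over the feature
    # axis gives the pairwise distance matrix. Fast, but the intermediate
    # cube costs O(num_test * num_train * F) memory.
    return np.abs(test_X[:, None, :] - train_X[None, :, :]).sum(axis=2)

# e.g. l1_distances_no_loops(binary_test_X, binary_train_X) should match
# knn_classifier.compute_distances_no_loops(binary_test_X).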
def culc_f1_score(train_folds_X, train_folds_y, val_X, val_y, num_folds, K):
    # Reduce to a binary problem: keep only classes 0 and 9, then relabel
    # class 0 as the positive class.
    binary_train_mask = (train_folds_y == 0) | (train_folds_y == 9)
    binary_train_X = train_folds_X[binary_train_mask]
    # print('binary_train_X shape =', binary_train_X.shape)  # expect (~121, 32, 32, 3)
    binary_train_y = train_folds_y[binary_train_mask] == 0
    # print('binary_train_y[:10] =', binary_train_y[:10])  # expect False/True values

    binary_test_mask = (val_y == 0) | (val_y == 9)
    binary_test_X = val_X[binary_test_mask]
    # print('binary_test_X.shape =', binary_test_X.shape)  # expect ~16 samples
    binary_test_y = val_y[binary_test_mask] == 0

    # Reshape each image to a flat vector [num_samples, 32*32*3 = 3072].
    binary_train_X = binary_train_X.reshape(binary_train_X.shape[0], -1)
    binary_test_X = binary_test_X.reshape(binary_test_X.shape[0], -1)

    # Create the classifier and call fit to train the model.
    # KNN just remembers all the data.
    knn_classifier = KNN(k=K)
    knn_classifier.fit(binary_train_X, binary_train_y)

    # Vectorized distances; timing comparisons of the loop variants live in
    # the %timeit cells below.
    dists = knn_classifier.compute_distances_no_loops(binary_test_X)
    assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

    prediction = knn_classifier.predict(binary_test_X)
    precision, recall, f1, accuracy = binary_classification_metrics(prediction, binary_test_y)
    # print("KNN with k = %s" % knn_classifier.k)
    # print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
    #       % (accuracy, precision, recall, f1))
    return f1
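# Hedged sketch of how culc_f1_score might drive k selection for the
# cross-validation section below. The per-fold lists fold_X / fold_y
# (plain Python lists of arrays, e.g. from np.array_split) and the
# candidate k values are assumptions, not code from the original notebook.
def select_best_k(fold_X, fold_y, k_choices=(1, 2, 3, 5, 8, 10)):
    num_folds = len(fold_X)
    k_to_f1 = {}
    for K in k_choices:
        scores = []
        for i in range(num_folds):
            # Hold out fold i for validation, train on the rest.
            train_X = np.concatenate(fold_X[:i] + fold_X[i + 1:])
            train_y = np.concatenate(fold_y[:i] + fold_y[i + 1:])
            scores.append(culc_f1_score(train_X, train_y,
                                        fold_X[i], fold_y[i], num_folds, K))
        k_to_f1[K] = np.mean(scores)
    return max(k_to_f1, key=k_to_f1.get), k_to_f1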
get_ipython().run_line_magic('timeit', 'knn_classifier.compute_distances_no_loops(binary_test_X)')

#%%
prediction = knn_classifier.predict(binary_test_X)
prediction

#%%
def print_samples(samples):
    # Show each flattened sample as a 32x32 RGB image.
    for i in range(samples.shape[0]):
        image = np.reshape(samples[i], (32, 32, 3))
        plt.imshow(image.astype(np.uint8))
        plt.axis("off")
        plt.show()

#%%
precision, recall, f1, accuracy = binary_classification_metrics(prediction, binary_test_y)
print("KNN with k = %s" % knn_classifier.k)
print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
      % (accuracy, precision, recall, f1))

#%%
# Let's put everything together and run KNN with k=3 and see how we do
knn_classifier_3 = KNN(k=3)
knn_classifier_3.fit(binary_train_X, binary_train_y)
prediction = knn_classifier_3.predict(binary_test_X)
precision, recall, f1, accuracy = binary_classification_metrics(prediction, binary_test_y)
print("KNN with k = %s" % knn_classifier_3.k)
print("Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
      % (accuracy, precision, recall, f1))

#%% [markdown]
# # Cross-validation