class CumlKNNFitter(FitterBase):
    """KNN fitter on GPU via cuML, with hyperopt-based hyperparameter search.

    Mirrors the project's other *Fitter classes: train on a train/eval
    split, search hyperparameters with hyperopt ``fmin`` (single split or
    k-fold), and finally train per fold producing out-of-fold train
    predictions plus test predictions averaged over folds.
    """

    def __init__(self, label='label', metric='error', opt: KNNOpt = None, max_eval=10):
        """
        Args:
            label: name of the target column in the input dataframes.
            metric: metric name understood by ``self.get_loss``
                ('error', 'auc', ...).
            opt: hyperopt search-space dataclass; a fresh ``KNNOpt`` is
                built when None (avoids a mutable default argument).
            max_eval: number of hyperopt evaluations for the searches.
        """
        super(CumlKNNFitter, self).__init__(label, metric, max_eval)
        self.opt = opt if opt is not None else KNNOpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        """Fit a KNeighborsClassifier on ``train_df``; return loss on ``eval_df``.

        ``params`` overrides ``self.opt_params`` (the result of a previous
        search) when given.
        """
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train = train_df.drop(columns=[self.label])
        y_train = train_df[self.label]
        x_eval = eval_df.drop(columns=[self.label])
        y_eval = eval_df[self.label]
        # deepcopy so dicts handed in by hyperopt are never mutated downstream.
        use_params = deepcopy(self.opt_params if params is None else params)
        self.clf = KNeighborsClassifier(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        return self.get_loss(y_pred=preds, y=y_eval)

    def _eval_pred(self, eval_df):
        """Predict on ``eval_df`` features; cast to int for non-AUC metrics.

        Shared by ``search``, ``search_k_fold`` and ``train_k_fold`` which
        previously duplicated this predict/astype branch.
        """
        y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
        if self.metric != 'auc':
            y_pred = y_pred.astype(int)
        return y_pred

    def search(self, train_df, eval_df):
        """Hyperopt search on one train/eval split; best params -> ``self.opt_params``."""
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            y_pred = self._eval_pred(eval_df)
            # NOTE(review): positional argument order here (true labels first)
            # differs from the keyword call in ``train`` (y_pred=..., y=...);
            # confirm ``get_loss``'s signature in FitterBase.
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        """Hyperopt search with k-fold CV; objective is the mean fold loss."""
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                loss.append(self.get_loss(eval_df[self.label],
                                          self._eval_pred(eval_df)))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True):
        """Train one model per fold.

        Returns:
            train_pred: out-of-fold predictions for every train row.
            test_pred: test predictions averaged over the folds.
            acc_result: per-fold loss values.
        """
        acc_result = list()
        # train_pred is fully overwritten fold by fold (k_fold.split covers
        # every index exactly once), so no initialization value is needed.
        train_pred = np.empty(train_data.shape[0])
        # BUGFIX: was np.empty — accumulating ``+=`` into uninitialized
        # memory made the averaged test predictions garbage.
        test_pred = np.zeros(test_data.shape[0])
        dtest = test_data.drop(columns=[self.label]) if drop_test_y else test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id]
            eval_df = train_data.iloc[eval_id]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=[self.label]))
            acc_result.append(self.get_loss(eval_df[self.label],
                                            self._eval_pred(eval_df)))
            test_pred += self.clf.predict(dtest)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
# Benchmark harness: time cuML KNeighborsClassifier fit/predict on a dataset
# loaded through the ``bench`` helper module.
X_train, X_test, y_train, y_test = bench.load_data(params)
# Number of classes inferred from the unique labels of the first target column.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)

# Measure time and accuracy on fitting
train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train,
                                            params=params)
if params.task == 'classification':
    y_pred = knn_clsf.predict(X_train)
    # Train-set accuracy as a percentage.
    train_acc = 100 * bench.accuracy_score(y_pred, y_train)

# Measure time and accuracy on prediction
if params.task == 'classification':
    predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test,
                                                   params=params)
    test_acc = 100 * bench.accuracy_score(yp, y_test)
else:
    # Non-classification task: time the raw neighbor search instead;
    # no accuracy is computed for this branch.
    predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test,
                                                  params=params)

if params.task == 'classification':
    # NOTE(review): this call is truncated — its remaining keyword arguments
    # continue on lines not shown here.
    bench.print_output(library='cuml',
# Visualize 30 sample digits as a 3x10 grid of 28x28 grayscale images.
for i in range(30):
    plt.subplot(3, 10, i+1)
    plt.imshow(samples[i].reshape((28, 28)), cmap=plt.cm.binary)
    plt.axis('off')
plt.subplots_adjust(wspace=-0.1, hspace=-0.1)
plt.show()

# Create 20% Validation set
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:, :-1],
                                                    train.loc[:, 'label'],
                                                    test_size=0.2,
                                                    random_state=42)

# Grid Search kNN for optimal k
accs = []
for k in range(3, 22):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_hat = knn.predict(X_test)
    # NOTE(review): ``y_test.shape`` is a tuple, so this division broadcasts
    # to a 1-element array rather than a scalar — almost certainly intended
    # ``y_test.shape[0]``. Confirm and fix.
    acc = (y_hat.to_array() == y_test.to_array()).sum()/y_test.shape
    print(k, acc)
    accs.append(acc)

# Free memory
del X_train, X_test, y_train, y_test

# Plot grid search results
plt.figure(figsize=(15, 5))
plt.plot(range(3, 22), accs)
plt.title('MNIST kNN k value versus validation acc')
plt.show()

# KFold Grid Search (cross validation)
# NOTE(review): loop body is truncated — it continues on lines not shown here.
for k in range(3, 6):