def train(self, train_df, eval_df, params=None):
    """Fit a KNN classifier on train_df and return the loss on eval_df.

    When *params* is None the previously tuned ``self.opt_params`` are
    used; the fitted model is kept on ``self.clf``.
    """
    # Move both frames onto the GPU before splitting features/label.
    gpu_train = cudf.DataFrame(train_df)
    gpu_eval = cudf.DataFrame(eval_df)
    x_train = gpu_train.drop(columns=[self.label])
    y_train = gpu_train[self.label]
    x_eval = gpu_eval.drop(columns=[self.label])
    y_eval = gpu_eval[self.label]
    # Deep-copy so the classifier cannot mutate the caller's dict.
    use_params = deepcopy(self.opt_params if params is None else params)
    self.clf = KNeighborsClassifier(**use_params)
    self.clf.fit(X=x_train, y=y_train)
    preds = self.clf.predict(X=x_eval)
    return self.get_loss(y_pred=preds, y=y_eval)
def oof_probas(X, y, v, n_folds=5, random_state=42, n_neighbors=1000,
               n_classes=206):
    """Compute out-of-fold KNN class probabilities.

    Parameters
    ----------
    X : DataFrame whose first 3 columns are passed through unscaled and
        whose remaining columns are standardized before fitting.
    y : DataFrame of targets; column 0 is assumed to be an id column and
        is dropped before fitting (``values[:, 1:]``).
    v : column indices of the scaled matrix used as model features.
    n_folds, random_state : fold construction parameters for get_folds.
    n_neighbors : neighbors per KNN model.
    n_classes : number of target columns in the returned matrix
        (default 206 keeps the original hard-coded behavior).

    Returns
    -------
    np.ndarray of shape (len(X), n_classes) with out-of-fold
    P(label == 1) for each target.
    """
    # Scale every feature except the first 3 columns, then re-attach
    # those 3 columns unscaled in front.
    X_scaled = StandardScaler().fit_transform(X.iloc(axis=1)[3:].values)
    X_scaled = np.hstack([X.iloc(axis=1)[:3].values, X_scaled])
    # Multi-label stratified fold assignment (project helper).
    df_fold = get_folds(y, n_folds=n_folds, random_state=random_state)
    # Out-of-fold probability buffer, filled fold by fold.
    oof = np.zeros((X.shape[0], n_classes))
    for fold in range(n_folds):
        fold_idx = df_fold[df_fold['fold'] != fold].index
        pp_fold = df_fold[df_fold['fold'] == fold].index
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_scaled[fold_idx][:, v], y.iloc[fold_idx].values[:, 1:])
        # predict_proba returns one (n_rows, 2) array per target;
        # 1 - P(class 0) is the positive-class probability.
        pp = model.predict_proba(X_scaled[pp_fold][:, v])
        pp = np.stack([(1 - pp[x][:, 0]) for x in range(len(pp))]).T
        oof[pp_fold, ] = pp
    return oof
# NOTE(review): fragment of a benchmark script — the opening
# parser.add_argument(...) call and the final measure_function_time(...)
# call are truncated outside this view.
        type=str, default='brute', help='Algorithm used to compute the nearest neighbors')
parser.add_argument('--metric', type=str, default='euclidean',
                    help='Distance metric to use')
params = bench.parse_args(parser)

# Load generated data
X_train, X_test, y_train, y_test = bench.load_data(params)
# Class count inferred from the first label column.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)

# Measure time and accuracy on fitting
train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train,
                                            params=params)
if params.task == 'classification':
    # Training accuracy (percentage) on the fitted set.
    y_pred = knn_clsf.predict(X_train)
    train_acc = 100 * bench.accuracy_score(y_pred, y_train)

# Measure time and accuracy on prediction
if params.task == 'classification':
    predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test,
class CumlKNNFitter(FitterBase):
    """KNN fitter on cuML with hyperopt-based hyper-parameter search.

    Wraps cuml's KNeighborsClassifier behind the project FitterBase
    interface: single train/eval, hyperopt search (single split or
    k-fold), and k-fold train/predict.
    """

    def __init__(self, label='label', metric='error', opt: KNNOpt = None, max_eval=10):
        """Store search space and evaluation settings.

        label: target column name; metric: loss name understood by
        get_loss; opt: KNN hyper-parameter space (defaults to KNNOpt());
        max_eval: hyperopt evaluation budget.
        """
        super(CumlKNNFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = KNNOpt()
        # Fitted classifier; populated by train().
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        """Fit on train_df, return loss on eval_df (model kept on self.clf)."""
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train, x_eval, y_eval = train_df.drop(columns=[self.label]), train_df[self.label], \
            eval_df.drop(columns=[self.label]), eval_df[self.label],
        # Deep-copy so the classifier cannot mutate the caller's dict.
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = KNeighborsClassifier(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)
        return output

    def search(self, train_df, eval_df):
        """Hyperopt search on a single train/eval split; stores best params."""
        self.opt_params = dict()

        def train_impl(params):
            # Objective: refit with candidate params, score on eval_df.
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                # Non-AUC metrics expect integer class labels.
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        """Hyperopt search with k-fold CV; objective is the mean fold loss."""
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None,
                     drop_test_y=True):
        """K-fold fit/predict.

        Returns (out-of-fold train predictions, fold-averaged test
        predictions, per-fold losses).
        """
        acc_result = list()
        # np.empty is safe here: every slot is assigned via eval_id below.
        train_pred = np.empty(train_data.shape[0])
        # BUG FIX: test_pred is accumulated with += across folds, so it
        # must start at zero — np.empty left uninitialized garbage in
        # the running sum.
        test_pred = np.zeros(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id]
            eval_df = train_data.iloc[eval_id]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict(
                eval_df.drop(columns=self.label))
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Accumulate this fold's test predictions; averaged below.
            test_pred += self.clf.predict(dtest)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
# NOTE(review): fragment of a cuML benchmark script — the opening
# parser.add_argument(...) call and the trailing print_output(...) call
# are truncated outside this view.
        type=str, default='brute', help='Algorithm used to compute the nearest neighbors')
parser.add_argument('--metric', type=str, default='euclidean',
                    help='Distance metric to use')
params = parse_args(parser)

# Load generated data
X_train, X_test, y_train, y_test = load_data(params)
# Class count inferred from the first label column.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)
knn_clsf.fit(X_train, y_train)

# Time predict
time, yp = measure_function_time(knn_clsf.predict, X_test, params=params)
# Accuracy reported as a percentage.
acc = 100 * accuracy_score(yp, y_test)

columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype',
           'size', 'n_neighbors', 'n_classes', 'time')
print_output(library='cuml', algorithm='knn_classification',
             stages=['prediction'], columns=columns,
# Preview 30 training digits (columns after the label, rows 5000-5029).
samples = train.iloc[5000:5030, 1:].to_pandas().values
plt.figure(figsize=(15, 4.5))
for i in range(30):
    plt.subplot(3, 10, i + 1)
    plt.imshow(samples[i].reshape((28, 28)), cmap=plt.cm.binary)
    plt.axis('off')
plt.subplots_adjust(wspace=-0.1, hspace=-0.1)
plt.show()

# Create 20% Validation set
X_train, X_test, y_train, y_test = train_test_split(
    train.iloc[:, :-1], train.loc[:, 'label'],
    test_size=0.2, random_state=42)

# Grid Search kNN for optimal k
accs = []
for k in range(3, 22):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_hat = knn.predict(X_test)
    # BUG FIX: divide by the row count (shape[0]), not the shape tuple —
    # dividing by the tuple broadcast the accuracy into a length-1 array.
    acc = (y_hat.to_array() == y_test.to_array()).sum() / y_test.shape[0]
    print(k, acc)
    accs.append(acc)

# Free memory
del X_train, X_test, y_train, y_test

# Plot grid search results
plt.figure(figsize=(15, 5))
plt.plot(range(3, 22), accs)
plt.title('MNIST kNN k value versus validation acc')
plt.show()