Code example #1
0
File: fitter.py  Project: GT-JLU/OpenCompetitionV2
    def train(self, train_df, eval_df, params=None):
        """Fit a KNN classifier on ``train_df`` and return the loss on ``eval_df``.

        When ``params`` is None, falls back to ``self.opt_params`` (presumably
        filled by a prior hyper-parameter search -- TODO confirm). The fitted
        classifier is kept on ``self.clf`` for later prediction.
        """
        train_df = cudf.DataFrame(train_df)
        eval_df = cudf.DataFrame(eval_df)

        x_train = train_df.drop(columns=[self.label])
        y_train = train_df[self.label]
        x_eval = eval_df.drop(columns=[self.label])
        y_eval = eval_df[self.label]

        # Deep-copy so hyperopt (or the caller) never sees mutated params.
        use_params = deepcopy(self.opt_params if params is None else params)

        self.clf = KNeighborsClassifier(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        return self.get_loss(y_pred=preds, y=y_eval)
def oof_probas(X, y, v, n_folds=5, random_state=42, n_neighbors=1000,
               n_targets=206):
    """Compute out-of-fold KNN probabilities for a multi-label problem.

    Parameters
    ----------
    X : DataFrame whose first 3 columns are kept unscaled (presumably id/meta
        columns -- TODO confirm) and whose remaining columns are features.
    y : DataFrame of targets; column 0 is skipped when fitting (assumed to be
        an id column -- TODO confirm), the rest are the binary labels.
    v : column indices selecting which (scaled) feature columns feed the model.
    n_folds : number of folds produced by the project helper ``get_folds``.
    random_state : seed forwarded to ``get_folds``.
    n_neighbors : KNN neighbourhood size.
    n_targets : number of target columns; generalized from the previously
        hard-coded 206 (default preserves old behavior).

    Returns
    -------
    np.ndarray of shape (len(X), n_targets) holding out-of-fold probabilities
    of the positive class for each target.
    """
    # Standardize the feature columns only; re-attach the first 3 columns.
    X_scaled = StandardScaler().fit_transform(X.iloc(axis=1)[3:].values)
    X_scaled = np.hstack([X.iloc(axis=1)[:3].values, X_scaled])

    # Multi-label stratified fold assignment (project helper).
    df_fold = get_folds(y, n_folds=n_folds, random_state=random_state)

    oof = np.zeros((X.shape[0], n_targets))
    for fold in range(n_folds):
        train_idx = df_fold[df_fold['fold'] != fold].index
        valid_idx = df_fold[df_fold['fold'] == fold].index

        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_scaled[train_idx][:, v], y.iloc[train_idx].values[:, 1:])

        # predict_proba returns one (n_samples, 2) array per target;
        # keep P(positive) = 1 - P(class 0) for each target, stacked columnwise.
        pp = model.predict_proba(X_scaled[valid_idx][:, v])
        pp = np.stack([(1 - pp[t][:, 0]) for t in range(len(pp))]).T
        oof[valid_idx] = pp

    return oof
Code example #3
0
                    type=str,
                    default='brute',
                    help='Algorithm used to compute the nearest neighbors')
# Command-line option: distance metric for the KNN benchmark.
parser.add_argument('--metric',
                    type=str,
                    default='euclidean',
                    help='Distance metric to use')
params = bench.parse_args(parser)

# Load generated data
X_train, X_test, y_train, y_test = bench.load_data(params)
# Number of distinct labels in the first target column.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)

# Measure time and accuracy on fitting
train_time, _ = bench.measure_function_time(knn_clsf.fit,
                                            X_train,
                                            y_train,
                                            params=params)
if params.task == 'classification':
    # Training-set accuracy (percent): predictions on the fitted data.
    y_pred = knn_clsf.predict(X_train)
    train_acc = 100 * bench.accuracy_score(y_pred, y_train)

# Measure time and accuracy on prediction
if params.task == 'classification':
    predict_time, yp = bench.measure_function_time(knn_clsf.predict,
                                                   X_test,
Code example #4
0
File: fitter.py  Project: GT-JLU/OpenCompetitionV2
class CumlKNNFitter(FitterBase):
    """KNN fitter backed by cuML/cudf, with hyperopt-based parameter search.

    Inherits label/metric bookkeeping and ``get_loss`` from ``FitterBase``.
    ``self.clf`` holds the most recently fitted classifier.
    """

    def __init__(self,
                 label='label',
                 metric='error',
                 opt: KNNOpt = None,
                 max_eval=10):
        super(CumlKNNFitter, self).__init__(label, metric, max_eval)
        # Hyper-parameter search space; defaults to a fresh KNNOpt().
        if opt is not None:
            self.opt = opt
        else:
            self.opt = KNNOpt()
        self.clf = None  # set by train()

    def train(self, train_df, eval_df, params=None):
        """Fit a KNN classifier on ``train_df``; return the loss on ``eval_df``.

        When ``params`` is None, uses ``self.opt_params`` (filled by
        ``search``/``search_k_fold``).
        """
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train, x_eval, y_eval = train_df.drop(columns=[self.label]), train_df[self.label], \
                                           eval_df.drop(columns=[self.label]), eval_df[self.label],

        # Deep-copy so hyperopt never sees its params dict mutated.
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = KNeighborsClassifier(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)

        return output

    def search(self, train_df, eval_df):
        """Hyperopt search on one train/eval split; stores the best
        parameters in ``self.opt_params``."""
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            # AUC uses raw predictions; other metrics expect int labels.
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        """Hyperopt search scored by the mean loss across k folds."""
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def train_k_fold(self,
                     k_fold,
                     train_data,
                     test_data,
                     params=None,
                     drop_test_y=True):
        """Train on each fold; return (oof train predictions, averaged test
        predictions, per-fold losses).

        ``train_pred`` is filled by index assignment (np.empty is safe there
        since every eval index is written exactly once across folds).
        """
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        # BUG FIX: test_pred is accumulated with += below, so it must start
        # at zero; np.empty() left uninitialized garbage in the buffer.
        test_pred = np.zeros(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id]
            eval_df = train_data.iloc[eval_id]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict(
                eval_df.drop(columns=self.label))
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Sum per-fold predictions, then average over the folds below.
            test_pred += self.clf.predict(dtest)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
Code example #5
0
                    type=str,
                    default='brute',
                    help='Algorithm used to compute the nearest neighbors')
# Command-line option: distance metric for the KNN benchmark.
parser.add_argument('--metric',
                    type=str,
                    default='euclidean',
                    help='Distance metric to use')
params = parse_args(parser)

# Load generated data
X_train, X_test, y_train, y_test = load_data(params)
# Number of distinct labels in the first target column.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)

knn_clsf.fit(X_train, y_train)
# Time predict
# NOTE(review): 'time' shadows the stdlib module of the same name.
time, yp = measure_function_time(knn_clsf.predict, X_test, params=params)

# Accuracy as a percentage.
acc = 100 * accuracy_score(yp, y_test)

# Column names for the benchmark report rows.
columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'n_neighbors', 'n_classes', 'time')

print_output(library='cuml',
             algorithm='knn_classification',
             stages=['prediction'],
             columns=columns,
Code example #6
0
File: mnist_gpu.py  Project: JulioSanchezD/MNIST-GPU
# Visualize 30 sample digits from the training set (rows 5000-5029;
# column 0 is skipped, presumably the label column -- TODO confirm).
samples = train.iloc[5000:5030, 1:].to_pandas().values
plt.figure(figsize=(15, 4.5))
for i in range(30):
    plt.subplot(3, 10, i + 1)
    plt.imshow(samples[i].reshape((28, 28)), cmap=plt.cm.binary)
    plt.axis('off')
# Negative spacing packs the 28x28 thumbnails tightly together.
plt.subplots_adjust(wspace=-0.1, hspace=-0.1)
plt.show()

# Create 20% Validation set
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:, :-1], train.loc[:, 'label'], test_size=0.2, random_state=42)

# Grid Search kNN for optimal k
accs = []
for k in range(3, 22):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_hat = knn.predict(X_test)
    # BUG FIX: divide by the sample count (shape[0]); dividing by the shape
    # *tuple* broadcast the sum into a 1-element array instead of a scalar.
    acc = (y_hat.to_array() == y_test.to_array()).sum() / y_test.shape[0]
    print(k, acc)
    accs.append(acc)

# Free memory
del X_train, X_test, y_train, y_test

# Plot grid search results
plt.figure(figsize=(15, 5))
plt.plot(range(3, 22), accs)
plt.title('MNIST kNN k value versus validation acc')
plt.show()