Esempio n. 1
0
def _test_determenistic(distance, algorithm, weight, k):
    """Compare the Daal and scikit kNN classifiers and check run-to-run stability.

    The IRIS data is split once (33% test, fixed random state).  Then, N_TRIES
    times, both classifiers are built with the same hyper-parameters, fitted
    and evaluated.  The Daal/scikit ratio of accuracy, log loss and
    one-vs-rest ROC AUC must stay within ACCURACY_RATIO, LOG_LOSS_RATIO and
    ROC_AUC_RATIO respectively.  Finally the Daal neighbor distances, neighbor
    indices and predictions of every run are required to equal those of the
    first run.

    Parameters
    ----------
    distance : passed as ``metric`` to both classifiers.
    algorithm : passed as ``algorithm`` to both classifiers.
    weight : passed as ``weights`` to both classifiers.
    k : passed as ``n_neighbors`` to both classifiers.

    Raises
    ------
    AssertionError
        If a metric ratio is out of bounds or results differ between runs.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        IRIS.data, IRIS.target, test_size=0.33, random_state=31)

    # Both implementations receive exactly the same hyper-parameters.
    knn_params = dict(n_neighbors=k,
                      weights=weight,
                      algorithm=algorithm,
                      leaf_size=30,
                      p=2,
                      metric=distance)

    alg_results = []
    for _ in range(N_TRIES):
        # Build and train a fresh pair of models on every try.
        scikit_model = ScikitKNeighborsClassifier(**knn_params)
        daal_model = DaalKNeighborsClassifier(**knn_params)
        scikit_model.fit(x_train, y_train)
        daal_model.fit(x_train, y_train)

        # Class predictions from both implementations.
        scikit_predict = scikit_model.predict(x_test)
        daal_predict = daal_model.predict(x_test)

        # Keep the Daal outputs so the runs can be compared afterwards.
        neigh_dist, neigh_ind = daal_model.kneighbors(x_test)
        alg_results.append((neigh_dist, neigh_ind, daal_predict))

        # Accuracy: Daal must reach at least ACCURACY_RATIO of scikit.
        scikit_accuracy = accuracy_score(y_test, scikit_predict)
        daal_accuracy = accuracy_score(y_test, daal_predict)
        ratio = daal_accuracy / scikit_accuracy
        assert ratio >= ACCURACY_RATIO, (
            f'kNN accuracy: scikit_accuracy={scikit_accuracy},daal_accuracy={daal_accuracy}, ratio={ratio}')

        # Class probabilities feed both the log loss and the ROC AUC checks.
        scikit_predict_proba = scikit_model.predict_proba(x_test)
        daal_predict_proba = daal_model.predict_proba(x_test)

        # Log loss: lower is better, so the ratio is bounded from above.
        scikit_log_loss = log_loss(y_test, scikit_predict_proba)
        daal_log_loss = log_loss(y_test, daal_predict_proba)
        ratio = daal_log_loss / scikit_log_loss
        assert ratio <= LOG_LOSS_RATIO, (
            f'kNN log_loss: scikit_log_loss={scikit_log_loss},daal_log_loss={daal_log_loss}, ratio={ratio}')

        # ROC AUC, computed one-vs-rest.
        scikit_roc_auc = roc_auc_score(
            y_test, scikit_predict_proba, multi_class='ovr')
        daal_roc_auc = roc_auc_score(
            y_test, daal_predict_proba, multi_class='ovr')
        ratio = daal_roc_auc / scikit_roc_auc
        assert ratio >= ROC_AUC_RATIO, (
            f'kNN roc_auc: scikit_roc_auc={scikit_roc_auc},daal_roc_auc={daal_roc_auc}, ratio={ratio}')

    # Every later run must reproduce the first run element for element.
    for run_results in alg_results[1:]:
        for res, reference in zip(run_results, alg_results[0]):
            assert (res == reference).mean() == 1, (
                f'Results are different between runs for {algorithm}, {weight}, {distance}, k={k}')
Esempio n. 2
0
    def kfold_function_template(self, data_transform_function):
        """Check that fitting kNN over 10 folds does not retain extra memory.

        Data comes from ``self.gen_clsf_data()`` and is converted with
        ``data_transform_function``.  A fresh ``KNeighborsClassifier`` is
        fitted on the training part of each of 10 ``KFold`` splits.  After the
        loop's temporaries are deleted, the growth in memory traced by
        ``tracemalloc`` must be below 25% of the reported input data size.

        Parameters
        ----------
        data_transform_function : callable
            Called as ``data_transform_function(x, y)``; must return ``(x, y)``
            where ``x`` is a ``numpy.ndarray`` or a ``pandas.DataFrame``.  For
            a DataFrame ``x``, ``y`` must support ``.iloc`` indexing.

        Raises
        ------
        TypeError
            If the transformed ``x`` is neither an ndarray nor a DataFrame.
        AssertionError
            If the retained memory exceeds the 25% budget.
        """
        tracemalloc.start()
        # try/finally guarantees tracing is switched off even when data
        # generation, splitting or fitting raises; otherwise tracemalloc
        # would stay enabled for every test that runs afterwards.
        try:
            x, y, data_memory_size = self.gen_clsf_data()
            kf = KFold(n_splits=10)
            x, y = data_transform_function(x, y)

            mem_before, _ = tracemalloc.get_traced_memory()
            for train_index, test_index in kf.split(x):
                if isinstance(x, np.ndarray):
                    x_train, x_test = x[train_index], x[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                elif isinstance(x, pd.DataFrame):
                    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                else:
                    # Fail with a clear message instead of an
                    # UnboundLocalError on x_train further down.
                    raise TypeError(
                        'Unsupported data container: '
                        '{}'.format(type(x).__name__))
                knn = KNeighborsClassifier()
                knn.fit(x_train, y_train)
            # Drop the last fold's objects so they are not counted as retained.
            del knn, x_train, x_test, y_train, y_test
            mem_after, _ = tracemalloc.get_traced_memory()
        finally:
            tracemalloc.stop()

        self.assertTrue(
            mem_after - mem_before < 0.25 * data_memory_size,
            'Size of extra allocated memory is greater than 25% of input data')
Esempio n. 3
0
def check_determenistic(distance, algorithm, weight, k):
    """Check that repeated kNN fits are accurate and give identical results.

    The dataset comes from ``make_dataset()``.  N_TRIES times a new
    ``KNeighborsClassifier`` is fitted on the training split; its neighbor
    distances, neighbor indices and predicted labels for the test split are
    recorded, and its accuracy must be at least CHECK_RATIO_KNN.  Afterwards
    the recorded results of every run must equal those of the first run.

    Parameters
    ----------
    distance : passed as ``metric`` to the classifier.
    algorithm : passed as ``algorithm`` to the classifier.
    weight : passed as ``weights`` to the classifier.
    k : passed as ``n_neighbors`` to the classifier.

    Raises
    ------
    AssertionError
        If accuracy is too low or results differ between runs.
    """
    x_train, x_test, y_train, y_test = make_dataset()

    alg_results = []
    for _ in range(N_TRIES):
        alg = KNeighborsClassifier(n_neighbors=k,
                                   weights=weight,
                                   algorithm=algorithm,
                                   leaf_size=30,
                                   p=2,
                                   metric=distance)
        alg.fit(x_train, y_train)
        distances, indices = alg.kneighbors(x_test)
        labels = alg.predict(x_test)
        alg_results.append((distances, indices, labels))
        # accuracy_score is documented as (y_true, y_pred): ground truth
        # first.  The score is symmetric, so the value is unchanged, but the
        # call now follows the documented argument order.
        accuracy = accuracy_score(y_test, labels)
        assert accuracy >= CHECK_RATIO_KNN,\
            'kNN classifier:accuracy={}'.format(accuracy)

    # Compare each later run, element by element, with the first run.
    for i in range(1, N_TRIES):
        for j, res in enumerate(alg_results[i]):
            assert (res == alg_results[0][j]).mean() == 1, \
                ('Results are different between runs for %s, %s, %s, k=%d'
                 % (algorithm, weight, distance, k))
Esempio n. 4
0
 def test_KNeighborsClassifier(self):
     """Run ``check_estimator`` on a kd_tree-based KNeighborsClassifier."""
     estimator = KNeighborsClassifier(algorithm='kd_tree')
     check_estimator(estimator)
Esempio n. 5
0
                 test_size=0.5,
                 shuffle=True):
    x, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_classes=n_classes,
                               random_state=777)
    return train_test_split(x,
                            y,
                            random_state=777,
                            test_size=test_size,
                            shuffle=shuffle)


# Estimator instances under test, keyed by a descriptive name.
ESTIMATORS = {
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=10),
    'DaalRandomForestClassifier': DaalRandomForestClassifier(
        n_estimators=10,
        random_state=777,
    ),
    'DaalRandomForestRegressor': DaalRandomForestRegressor(
        n_estimators=10,
        random_state=777,
    ),
}

# Order codes to iterate over; presumably NumPy memory layouts
# ('C' row-major, 'F' column-major) -- confirm against the consumer.
ORDERS = ['C', 'F']
# Callables used to build the input containers to iterate over.
DATA_FORMATS = [pd.DataFrame, np.array]


def check_data_formats_diff(name):
    x_train, x_test, y_train, y_test = make_dataset()
    alg_results = []
    for data_format in DATA_FORMATS:
        for order in ORDERS: