Example #1
    def SVC_test(self, kernel):
        print("SVC Testing Result:")
        # One binary SVC per scored target; collect the positive-class
        # probabilities into an (n_test_samples, n_targets) matrix.
        self.y_pred = np.zeros(
            (self.X_test.shape[0], len(self.targets_scored_col_name)))
        for i in tqdm(range(len(self.targets_scored_col_name))):
            this_target_col_name = self.targets_scored_col_name[i]
            # Skip labels with fewer than 5 positive training samples and
            # leave their predicted probabilities at zero.
            if self.y_train[this_target_col_name].values.sum() < 5:
                self.y_pred[:, i] = np.zeros(len(self.X_test))
            else:
                self.svc_model = cuml.SVC(kernel=kernel,
                                          C=100,
                                          cache_size=5000,
                                          probability=True)
                self.svc_model.fit(self.X_train[self.features],
                                   self.y_train[this_target_col_name])
                # predict_proba returns device memory; keep the
                # positive-class column and move it to the host.
                self.y_pred[:, i] = cupy.asnumpy(
                    self.svc_model.predict_proba(
                        self.X_test[self.features]).values)[:, 1]

        y_real = cupy.asnumpy(self.y_test[self.targets_scored_col_name].values)
        # Subset accuracy: a harsh multi-label metric that counts a sample
        # as correct only when every one of its labels is predicted correctly.
        self.test_acc = (self.y_pred.round() == y_real).all(axis=1).mean()
        # Flattening treats each (sample, label) pair as an independent
        # binary prediction, i.e. the mean column-wise log loss.
        self.test_loss = log_loss(y_real.ravel(), self.y_pred.ravel())
        print('Best model ACC: {:.4%}'.format(self.test_acc))
        print('Best model Log Loss: {:.4f}'.format(self.test_loss))
        return self.test_acc, self.test_loss
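Note: the attributes used above (X_train, X_test, y_train, features, targets_scored_col_name) belong to a surrounding class that is not shown. Below is a self-contained sketch of the same per-label pattern on toy data; the dataset, shapes, and hyperparameters are illustrative only, not from the original.

import cupy
import numpy as np
import cuml
from sklearn.metrics import log_loss

rng = np.random.default_rng(0)
X_train = rng.standard_normal((200, 10)).astype(np.float32)
X_test = rng.standard_normal((50, 10)).astype(np.float32)
Y_train = (rng.random((200, 3)) < 0.3).astype(np.float32)  # 3 binary targets
Y_test = (rng.random((50, 3)) < 0.3).astype(np.float32)

y_pred = np.zeros((X_test.shape[0], Y_train.shape[1]))
for i in range(Y_train.shape[1]):
    if Y_train[:, i].sum() < 5:      # same small-sample guard as above
        continue
    model = cuml.SVC(kernel='rbf', C=100, probability=True)
    model.fit(X_train, Y_train[:, i])
    y_pred[:, i] = cupy.asnumpy(model.predict_proba(X_test))[:, 1]

print('mean log loss:', log_loss(Y_test.ravel(), y_pred.ravel()))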
Example #2
    def _query5(self):
        self._loadTables('query5')

        # Fit a small polynomial SVM on random data; only the inference
        # path is being benchmarked, not model quality.
        supportVec = 8
        xVal = np.random.rand(supportVec, 12)
        # Alternate the labels so both classes are always present; a random
        # draw could produce a single class and make fit() fail.
        yVal = np.tile([-1.0, 1.0], supportVec // 2)

        svm = cuml.SVC(kernel='poly', degree=2)
        svm.fit(xVal, yVal)

        featureName = [
            'drvStat.c0', 'drvStat.c1', 'drvStat.c2', 'drvStat.c3',
            'drvStat.c4', 'drvStat.c5', 'drvStat.c6', 'drvStat.c7',
            'drvStat.c8', 'drvStat.c9', 'drvStat.c10', 'drvStat.c11'
        ]

        # Time the join, the per-driver rolling mean, and the SVM inference.
        startTime = time.time()
        join = self.driverStatusTable.merge(self.driverTable,
                                            left_on='drvStat.driverId',
                                            right_on='drv.driverId')
        groupby = join.groupby(['drv.driverId'])[featureName].rolling(
            3, min_periods=1).mean()

        prediction = svm.predict(groupby[featureName])
        endTime = time.time()

        groupby.to_csv('query5_gpu.csv', index=False)
        return endTime - startTime
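Note: _loadTables is not shown. A minimal sketch of the table layout this query appears to assume; the column names are taken from the merge and groupby above, the data itself is made up.

import cudf
import numpy as np

n_rows, n_drivers = 1000, 50
driverStatusTable = cudf.DataFrame({
    'drvStat.driverId': np.random.randint(0, n_drivers, n_rows),
    **{f'drvStat.c{i}': np.random.rand(n_rows) for i in range(12)},
})
driverTable = cudf.DataFrame({'drv.driverId': np.arange(n_drivers)})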
Example #3
def test_exact_classification_datasets(exact_shap_classification_dataset):
    X_train, X_test, y_train, y_test = exact_shap_classification_dataset

    models = []
    models.append(cuml.SVC(probability=True).fit(X_train, y_train))
    models.append(sklearn.svm.SVC(probability=True).fit(X_train, y_train))

    for mod in models:
        explainer, shap_values = get_shap_values(
            model=mod.predict_proba,
            background_dataset=X_train,
            explained_dataset=X_test,
            explainer=KernelExplainer
        )

        # Some values are very small, which means our tolerance here needs to
        # be a little looser to avoid false positives from comparisons like
        # 0.00348627 - 0.00247397. The loose tolerance still tests that the
        # distribution of the values matches.
        for idx, svs in enumerate(shap_values):
            assert_and_log(
                svs[0],
                golden_classification_result[idx],
                float(mod.predict_proba(X_test)[0][idx]),
                explainer.expected_value[idx],
                tolerance=1e-01
            )
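Note: the exact_shap_classification_dataset fixture and the get_shap_values helper are defined elsewhere in the test suite. A plausible sketch follows, reconstructed from the inline version in Example #4 below; the helper's body is an assumption.

import numpy as np
import pytest
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


@pytest.fixture
def exact_shap_classification_dataset():
    # Mirrors the dataset built inline in Example #4.
    X, y = make_classification(n_samples=101, n_features=11, random_state=42,
                               n_informative=2, n_classes=2)
    return train_test_split(X.astype(np.float32), y.astype(np.float32),
                            test_size=1, random_state=42)


def get_shap_values(model, background_dataset, explained_dataset, explainer):
    # Hypothetical helper: build the explainer and return it together with
    # the SHAP values for the explained rows.
    exp = explainer(model=model, data=background_dataset)
    return exp, exp.shap_values(explained_dataset)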
Example #4
def test_exact_classification_datasets():
    X, y = make_classification(n_samples=101,
                               n_features=11,
                               random_state=42,
                               n_informative=2,
                               n_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1,
                                                        random_state=42)

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

    mod = cuml.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.KernelExplainer(
        model=mod.predict_proba, data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    experimental_test_and_log(cu_shap_values[0],
                              golden_classification_result[0],
                              float(mod.predict_proba(X_test)[0][0]),
                              float(explainer.expected_value[0]),
                              tolerance=1e-01)

    experimental_test_and_log(cu_shap_values[1],
                              golden_classification_result[1],
                              float(mod.predict_proba(X_test)[0][1]),
                              float(explainer.expected_value[1]),
                              tolerance=1e-01)

    mod = sklearn.svm.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.KernelExplainer(
        model=mod.predict_proba, data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    # Some values are very small, which means our tolerance here needs to be
    # a little looser to avoid false positives from comparisons like
    # 0.00348627 - 0.00247397. The loose tolerance still tests that the
    # distribution of the values matches.
    experimental_test_and_log(cu_shap_values[0],
                              golden_classification_result[0],
                              float(mod.predict_proba(X_test)[0][0]),
                              float(explainer.expected_value[0]),
                              tolerance=1e-01)

    experimental_test_and_log(cu_shap_values[1],
                              golden_classification_result[1],
                              float(mod.predict_proba(X_test)[0][1]),
                              float(explainer.expected_value[1]),
                              tolerance=1e-01)
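Note: experimental_test_and_log is a helper from the test module that is not shown. A hypothetical reconstruction of what it checks; the signature is taken from the calls above, the body is an assumption.

import numpy as np
import cupy as cp


def experimental_test_and_log(cu_shap_values, golden_result, fx,
                              expected_value, tolerance=1e-02):
    # Hypothetical helper: compare against the golden values and check
    # SHAP additivity, both within the given tolerance.
    values = cp.asnumpy(cp.asarray(cu_shap_values))
    assert np.allclose(values, np.asarray(golden_result), atol=tolerance)
    assert abs(values.sum() - (fx - expected_value)) <= tolerance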
Example #5
def test_exact_classification_datasets():
    X, y = make_classification(n_samples=101,
                               n_features=11,
                               random_state=42,
                               n_informative=2,
                               n_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1, random_state=42)

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

    mod = cuml.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.PermutationExplainer(
        model=mod.predict_proba,
        data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    exp_v = explainer.expected_value
    fx = mod.predict_proba(X_test)[0]
    # SHAP additivity: for each class, the values of the explained row
    # must sum to fx - expected_value.
    assert abs(np.sum(cp.asnumpy(
        cu_shap_values[0])) - (fx[0] - exp_v[0])) <= 1e-5
    assert abs(np.sum(cp.asnumpy(
        cu_shap_values[1])) - (fx[1] - exp_v[1])) <= 1e-5

    mod = sklearn.svm.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.PermutationExplainer(
        model=mod.predict_proba,
        data=X_train)

    skl_shap_values = explainer.shap_values(X_test)

    exp_v = explainer.expected_value
    fx = mod.predict_proba(X_test)[0]
    # Same additivity check for the scikit-learn model.
    assert abs(np.sum(cp.asnumpy(
        skl_shap_values[0])) - (fx[0] - exp_v[0])) <= 1e-5
    assert abs(np.sum(cp.asnumpy(
        skl_shap_values[1])) - (fx[1] - exp_v[1])) <= 1e-5
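Both tests above check SHAP additivity only for the first explained row. A small helper that extends the same check to every row and class; the function name and signature are hypothetical.

import numpy as np
import cupy as cp


def check_additivity(explainer, predict_proba, X, atol=1e-5):
    # For every explained row and every class, the SHAP values must sum
    # to f(x) - E[f(X)] (the explainer's expected value for that class).
    shap_values = explainer.shap_values(X)
    fx = predict_proba(X)
    for cls, sv in enumerate(shap_values):
        for row in range(X.shape[0]):
            total = float(np.sum(cp.asnumpy(cp.asarray(sv[row]))))
            assert abs(total - (fx[row][cls] -
                                explainer.expected_value[cls])) <= atol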
Example #6
    def SVC_train(self, kernel):
        # Train one binary SVC per scored label and collect the
        # positive-class probabilities on the validation split.
        self.y_pred = np.zeros(
            (self.X_val.shape[0], len(self.targets_scored_col_name)))
        for i in tqdm(range(len(self.targets_scored_col_name))):
            this_target_col_name = self.targets_scored_col_name[i]
            # Skip labels with fewer than 5 positive training samples and
            # leave their predicted probabilities at zero.
            if self.y_train[this_target_col_name].values.sum() < 5:
                self.y_pred[:, i] = np.zeros(len(self.X_val))
            else:
                self.svc_model = cuml.SVC(kernel=kernel,
                                          C=100,
                                          cache_size=5000,
                                          probability=True)
                self.svc_model.fit(self.X_train[self.features],
                                   self.y_train[this_target_col_name])
                self.y_pred[:, i] = cupy.asnumpy(
                    self.svc_model.predict_proba(
                        self.X_val[self.features]).values)[:, 1]
Example #7
def test_exact_classification_datasets(exact_shap_classification_dataset):
    X_train, X_test, y_train, y_test = exact_shap_classification_dataset

    models = []
    models.append(cuml.SVC(probability=True).fit(X_train, y_train))
    models.append(sklearn.svm.SVC(probability=True).fit(X_train, y_train))

    for mod in models:
        explainer, shap_values = get_shap_values(
            model=mod.predict_proba,
            background_dataset=X_train,
            explained_dataset=X_test,
            explainer=PermutationExplainer,
        )

        fx = mod.predict_proba(X_test)
        exp_v = explainer.expected_value

        # SHAP additivity: for each of the first 3 rows and each class,
        # the values must sum to fx - expected_value.
        for i in range(3):
            print(i, fx[i][1], shap_values[1][i])  # aid debugging on failure
            assert abs(np.sum(cp.asnumpy(shap_values[0][i])) -
                       (fx[i][0] - exp_v[0])) <= 1e-5
            assert abs(np.sum(cp.asnumpy(shap_values[1][i])) -
                       (fx[i][1] - exp_v[1])) <= 1e-5