def SVC_test(self, kernel):
    print("SVC Testing Result:")
    self.y_pred = np.zeros(
        (self.X_test.shape[0], len(self.targets_scored_col_name)))
    for i in tqdm(range(len(self.targets_scored_col_name))):
        this_target_col_name = self.targets_scored_col_name[i]
        # skip labels with fewer than 5 positive samples
        if self.y_train[this_target_col_name].values.sum() < 5:
            self.y_pred[:, i] = np.zeros(len(self.X_test))
        else:
            self.svc_model = cuml.SVC(kernel=kernel, C=100,
                                      cache_size=5000, probability=True)
            self.svc_model.fit(self.X_train[self.features],
                               self.y_train[this_target_col_name])
            self.y_pred[:, i] = cupy.asnumpy(
                self.svc_model.predict_proba(
                    self.X_test[self.features]).values)[:, 1]

    y_real = self.y_test[self.targets_scored_col_name].values
    # In multi-label classification this is the subset accuracy, a harsh
    # metric: every label of a sample must be predicted correctly. Note
    # that score() reflects only the most recently fitted per-label model.
    self.test_acc = self.svc_model.score(self.X_test[self.features].values,
                                         y_real)
    self.test_loss = log_loss(cupy.asnumpy(y_real), self.y_pred)
    print('Best model ACC: {:.4%}'.format(self.test_acc))
    print('Best model Log Loss: {:.4f}'.format(self.test_loss))
    return self.test_acc, self.test_loss

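# A minimal, self-contained sketch (random data, hypothetical underscore
# names) of the cuml.SVC probability pattern used above: with cuDF input,
# predict_proba returns a cuDF DataFrame, so .values yields a CuPy array
# and cupy.asnumpy moves the class-1 column to host memory.
import cudf
import cuml
import cupy
import numpy as np

_X = cudf.DataFrame(np.random.rand(64, 4).astype(np.float32))
_y = cudf.Series(np.random.randint(0, 2, 64).astype(np.float32))

_clf = cuml.SVC(kernel='rbf', C=100, cache_size=5000, probability=True)
_clf.fit(_X, _y)

_proba = _clf.predict_proba(_X)           # cuDF DataFrame, one column per class
_pos = cupy.asnumpy(_proba.values)[:, 1]  # NumPy probabilities of class 1
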
def _query5(self):
    self._loadTables('query5')

    # fit a small polynomial SVM on random data so predict() has a model
    supportVec = 8
    xVal = np.random.rand(supportVec, 12)
    yVal = np.random.choice([-1.0, 1.0], size=supportVec)
    svm = cuml.SVC(kernel='poly', degree=2)
    svm.fit(xVal, yVal)

    featureName = [
        'drvStat.c0', 'drvStat.c1', 'drvStat.c2', 'drvStat.c3',
        'drvStat.c4', 'drvStat.c5', 'drvStat.c6', 'drvStat.c7',
        'drvStat.c8', 'drvStat.c9', 'drvStat.c10', 'drvStat.c11'
    ]

    # time the merge -> groupby -> rolling-mean -> predict pipeline
    startTime = time.time()
    join = self.driverStatusTable.merge(self.driverTable,
                                        left_on='drvStat.driverId',
                                        right_on='drv.driverId')
    groupby = join.groupby(['drv.driverId'])[featureName] \
                  .rolling(3, min_periods=1).mean()
    predict = svm.predict(groupby[featureName])
    endTime = time.time()

    groupby.to_csv('query5_gpu.csv', index=False)
    return endTime - startTime

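# A tiny hedged sketch (fabricated two-column tables, hypothetical
# underscore names) of the merge -> groupby -> rolling-mean chain being
# timed above, mirroring the same cuDF-style calls as _query5.
import time
import cudf

_status = cudf.DataFrame({'drvStat.driverId': [1, 1, 2, 2],
                          'drvStat.c0': [0.1, 0.2, 0.3, 0.4]})
_drivers = cudf.DataFrame({'drv.driverId': [1, 2]})

_t0 = time.time()
_joined = _status.merge(_drivers, left_on='drvStat.driverId',
                        right_on='drv.driverId')
_rolled = _joined.groupby(['drv.driverId'])[['drvStat.c0']] \
                 .rolling(3, min_periods=1).mean()
_elapsed = time.time() - _t0
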
def test_exact_classification_datasets(exact_shap_classification_dataset):
    X_train, X_test, y_train, y_test = exact_shap_classification_dataset

    models = []
    models.append(cuml.SVC(probability=True).fit(X_train, y_train))
    models.append(sklearn.svm.SVC(probability=True).fit(X_train, y_train))

    for mod in models:
        explainer, shap_values = get_shap_values(
            model=mod.predict_proba,
            background_dataset=X_train,
            explained_dataset=X_test,
            explainer=KernelExplainer
        )

        # Some values are very small, which means our tolerance needs to
        # be a little looser to avoid false positives from comparisons
        # like 0.00348627 - 0.00247397. The loose tolerance still tests
        # that the distribution of the values matches.
        for idx, svs in enumerate(shap_values):
            assert_and_log(
                svs[0],
                golden_classification_result[idx],
                float(mod.predict_proba(X_test)[0][idx]),
                explainer.expected_value[idx],
                tolerance=1e-01
            )

def test_exact_classification_datasets():
    X, y = make_classification(n_samples=101, n_features=11,
                               random_state=42, n_informative=2,
                               n_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1, random_state=42)

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

    mod = cuml.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.KernelExplainer(
        model=mod.predict_proba,
        data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    experimental_test_and_log(cu_shap_values[0],
                              golden_classification_result[0],
                              float(mod.predict_proba(X_test)[0][0]),
                              float(explainer.expected_value[0]),
                              tolerance=1e-01)
    experimental_test_and_log(cu_shap_values[1],
                              golden_classification_result[1],
                              float(mod.predict_proba(X_test)[0][1]),
                              float(explainer.expected_value[1]),
                              tolerance=1e-01)

    mod = sklearn.svm.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.KernelExplainer(
        model=mod.predict_proba,
        data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    # Some values are very small, which means our tolerance needs to be
    # a little looser to avoid false positives from comparisons like
    # 0.00348627 - 0.00247397. The loose tolerance still tests that the
    # distribution of the values matches.
    experimental_test_and_log(cu_shap_values[0],
                              golden_classification_result[0],
                              float(mod.predict_proba(X_test)[0][0]),
                              float(explainer.expected_value[0]),
                              tolerance=1e-01)
    experimental_test_and_log(cu_shap_values[1],
                              golden_classification_result[1],
                              float(mod.predict_proba(X_test)[0][1]),
                              float(explainer.expected_value[1]),
                              tolerance=1e-01)

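# For reference, a hedged sketch of the same check against the CPU `shap`
# package (assumes `shap` and scikit-learn are installed; underscore names
# are hypothetical); the experimental cuML KernelExplainer above mirrors
# shap's interface.
import numpy as np
import shap
import sklearn.svm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

_X, _y = make_classification(n_samples=101, n_features=11,
                             random_state=42, n_informative=2, n_classes=2)
_X_tr, _X_te, _y_tr, _y_te = train_test_split(_X, _y, test_size=1,
                                              random_state=42)

_mod = sklearn.svm.SVC(probability=True).fit(_X_tr, _y_tr)
_explainer = shap.KernelExplainer(_mod.predict_proba, _X_tr)
_shap_values = _explainer.shap_values(_X_te)  # one set of values per class
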
def test_exact_classification_datasets():
    X, y = make_classification(n_samples=101, n_features=11,
                               random_state=42, n_informative=2,
                               n_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1, random_state=42)

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

    mod = cuml.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.PermutationExplainer(
        model=mod.predict_proba,
        data=X_train)

    cu_shap_values = explainer.shap_values(X_test)

    exp_v = explainer.expected_value
    fx = mod.predict_proba(X_test)[0]
    assert (np.sum(cp.asnumpy(
        cu_shap_values[0])) - abs(fx[0] - exp_v[0])) <= 1e-5
    assert (np.sum(cp.asnumpy(
        cu_shap_values[1])) - abs(fx[1] - exp_v[1])) <= 1e-5

    mod = sklearn.svm.SVC(probability=True).fit(X_train, y_train)

    explainer = cuml.experimental.explainer.PermutationExplainer(
        model=mod.predict_proba,
        data=X_train)

    skl_shap_values = explainer.shap_values(X_test)

    exp_v = explainer.expected_value
    fx = mod.predict_proba(X_test)[0]
    assert (np.sum(cp.asnumpy(
        skl_shap_values[0])) - abs(fx[0] - exp_v[0])) <= 1e-5
    assert (np.sum(cp.asnumpy(
        skl_shap_values[1])) - abs(fx[1] - exp_v[1])) <= 1e-5

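# A hypothetical helper illustrating the local-accuracy (additivity)
# property the asserts above rely on: for an exact explainer, the SHAP
# values phi of a sample x for class k satisfy
#     sum_j phi[j] == f_k(x) - E[f_k(X)]
# so each per-class sum is compared against fx - expected_value.
import numpy as np

def _check_additivity(shap_row, fx_k, expected_value_k, tol=1e-5):
    """Return True when the SHAP row reproduces the model output."""
    return abs(float(np.sum(shap_row)) - (fx_k - expected_value_k)) <= tol
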
def SVC_train(self, kernel):
    # train one binary SVM per scored label
    self.y_pred = np.zeros(
        (self.X_val.shape[0], len(self.targets_scored_col_name)))
    for i in tqdm(range(len(self.targets_scored_col_name))):
        this_target_col_name = self.targets_scored_col_name[i]
        # skip labels with fewer than 5 positive samples
        if self.y_train[this_target_col_name].values.sum() < 5:
            self.y_pred[:, i] = np.zeros(len(self.X_val))
        else:
            self.svc_model = cuml.SVC(kernel=kernel, C=100,
                                      cache_size=5000, probability=True)
            self.svc_model.fit(self.X_train[self.features],
                               self.y_train[this_target_col_name])
            self.y_pred[:, i] = cupy.asnumpy(
                self.svc_model.predict_proba(
                    self.X_val[self.features]).values)[:, 1]

def test_exact_classification_datasets(exact_shap_classification_dataset):
    X_train, X_test, y_train, y_test = exact_shap_classification_dataset

    models = []
    models.append(cuml.SVC(probability=True).fit(X_train, y_train))
    models.append(sklearn.svm.SVC(probability=True).fit(X_train, y_train))

    for mod in models:
        explainer, shap_values = get_shap_values(
            model=mod.predict_proba,
            background_dataset=X_train,
            explained_dataset=X_test,
            explainer=PermutationExplainer,
        )

        fx = mod.predict_proba(X_test)
        exp_v = explainer.expected_value

        for i in range(3):
            print(i, fx[i][1], shap_values[1][i])
            assert (np.sum(cp.asnumpy(shap_values[0][i])) -
                    abs(fx[i][0] - exp_v[0])) <= 1e-5
            assert (np.sum(cp.asnumpy(shap_values[1][i])) -
                    abs(fx[i][1] - exp_v[1])) <= 1e-5