def test_svm_memleak_on_exception(params, n_rows=1000, n_iter=10, n_cols=1000,
                                  dataset='blobs'):
    """
    Check that no GPU memory is leaked when SVC training aborts.

    Every ``fit`` call below is expected to raise ``RuntimeError``. Per the
    original note, a poly kernel with degree=30 overflows and triggers the
    'SMO error: NaN found...' exception. Free device memory is sampled after
    a warm-up fit and again after ``n_iter`` failing fits; both readings must
    be identical.
    """
    features, labels = make_blobs(n_samples=n_rows, n_features=n_cols,
                                  random_state=137, centers=2)
    features = features.astype(np.float32)

    # Hold a named reference to the stream for the duration of the test.
    stream = cuml.cuda.Stream()
    handle = cuml.Handle(stream=stream)

    # Warm-up: some modules used by SVC allocate device memory on first use.
    # Trigger that allocation before taking the baseline reading.
    warmup_clf = cu_svm.SVC(handle=handle, **params)
    with pytest.raises(RuntimeError):
        # SMO error: NaN found during fitting.
        warmup_clf.fit(features, labels)

    baseline_free = cuda.current_context().get_memory_info()[0]

    # Repeatedly train into the failure path.
    for _ in range(n_iter):
        clf = cu_svm.SVC(handle=handle, **params)
        with pytest.raises(RuntimeError):
            # SMO error: NaN found during fitting.
            clf.fit(features, labels)

    del clf
    handle.sync()
    current_free = cuda.current_context().get_memory_info()[0]
    delta_mem = baseline_free - current_free
    print("Delta GPU mem: {} bytes".format(delta_mem))
    assert delta_mem == 0
def test_svm_memleak(params, n_rows, n_iter, n_cols, use_handle,
                     dataset='blobs'):
    """
    Check that repeated SVC fit/predict cycles do not leak GPU memory.

    .. note:: small `n_rows`, and `n_cols` values will result in small model
        size, that will not be measured by get_memory_info.
    """
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    stream = cuml.cuda.Stream()
    handle = cuml.Handle()
    handle.setStream(stream)

    # Warm-up: some modules used by SVC allocate device memory on first use.
    # Trigger that allocation before taking the baseline reading.
    warmup_clf = cu_svm.SVC(handle=handle, **params)
    warmup_clf.fit(X_train, y_train)
    model_bytes = get_memsize(warmup_clf)
    model_mib = model_bytes / (1024 * 1024.0)
    print("Memory consumtion of SVC object is {} MiB".format(model_mib))

    baseline_free = cuda.current_context().get_memory_info()[0]

    # Sanity check: a second fitted model must show up in get_memory_info
    # with at least the footprint reported by get_memsize.
    clf = cu_svm.SVC(handle=handle, **params)
    clf.fit(X_train, y_train)
    used_by_model = baseline_free - cuda.current_context().get_memory_info()[0]
    assert used_by_model >= model_bytes

    # Main loop: rebinding `clf` drops the previous model on each iteration.
    intercept_total = 0
    for _ in range(n_iter):
        clf = cu_svm.SVC(handle=handle, **params)
        clf.fit(X_train, y_train)
        intercept_total += clf.intercept_
        clf.predict(X_train)

    del clf
    handle.sync()
    delta_mem = baseline_free - cuda.current_context().get_memory_info()[0]
    print("Delta GPU mem: {} bytes".format(delta_mem))
    assert delta_mem == 0
def test_svc_weights(class_weight, sample_weight):
    """
    Compare cuML and scikit-learn linear SVC on an unbalanced two-blob problem.

    Based on the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html
    """
    X, y = make_blobs(n_samples=[1000, 100],
                      centers=[[0.0, 0.0], [2.0, 2.0]],
                      cluster_std=[1.5, 0.5],
                      random_state=137,
                      shuffle=False)

    # A truthy flag is replaced by per-sample weights that put a large weight
    # on class 1; otherwise the incoming value is passed through untouched.
    weights = y * 9 + 1 if sample_weight else sample_weight

    params = {'kernel': 'linear', 'C': 1, 'gamma': 'scale',
              'class_weight': class_weight}

    cu_model = cu_svm.SVC(**params)
    cu_model.fit(X, y, weights)

    if not (class_weight is None and weights is None):
        # Standalone check: the smaller blob must be classified correctly
        # when weights are in effect.
        minority_X = X[y == 1, :]
        minority_y = np.ones(minority_X.shape[0])
        assert cu_model.score(minority_X, minority_y) > 0.9

    skl_model = svm.SVC(**params)
    skl_model.fit(X, y, weights)
    compare_svm(cu_model, skl_model, X, y, coef_tol=1e-5, report_summary=True)
def test_svm_skl_cmp_decision_function(params, n_rows=4000, n_cols=20):
    """
    Compare the decision function of cuML SVC with scikit-learn's SVC.

    Also checks that predictions carry the label dtype and that decision
    values carry the training-data dtype.
    """
    X_train, X_test, y_train, y_test = make_dataset('classification1',
                                                    n_rows, n_cols)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    cu_model = cu_svm.SVC(**params)
    cu_model.fit(X_train, y_train)

    assert cu_model.predict(X_test).dtype == y_train.dtype

    cu_decision = cu_model.decision_function(X_test)
    assert cu_decision.dtype == X_train.dtype

    skl_model = svm.SVC(**params)
    skl_model.fit(X_train, y_train)
    skl_decision = skl_model.decision_function(X_test)

    # Looser tolerance with probability=True; see the comments in the SVC
    # decision_function method.
    tol = 2e-2 if params["probability"] else 1e-5
    assert mean_squared_error(cu_decision, skl_decision) < tol
def test_svm_gamma(params):
    """
    Check that ``gamma='scale'``-style options are computed from X.var().

    Different input array types are exercised to make sure that X.var() is
    calculated correctly regardless of the container. ``params`` may carry an
    extra ``'x_arraytype'`` entry ('numpy' by default, 'dataframe' or
    'numba'); it is removed before the remaining entries are forwarded to
    ``SVC``.
    """
    # Work on a copy: popping from the caller's dict would leak the mutation
    # into whoever owns it (e.g. a parametrized value shared between runs).
    params = dict(params)
    x_arraytype = params.pop('x_arraytype', 'numpy')

    n_rows = 500
    n_cols = 380
    # Two well-separated cluster centres, one per class.
    centers = [10 * np.ones(n_cols), -10 * np.ones(n_cols)]
    X, y = make_blobs(n_samples=n_rows, n_features=n_cols,
                      random_state=137, centers=centers)
    X = X.astype(np.float32)

    if x_arraytype == 'dataframe':
        X_df = cudf.DataFrame()
        X = X_df.from_gpu_matrix(cuda.to_device(X))
    elif x_arraytype == 'numba':
        X = cuda.to_device(X)

    # Using degree 40 polynomials and fp32 training would fail with
    # gamma = 1/(n_cols*X.std()), but it works with the correct
    # implementation: gamma = 1/(n_cols*X.var())
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X, y)
    y_pred = cuSVC.predict(X).to_array()

    n_correct = np.sum(y == y_pred)
    accuracy = n_correct * 100 / n_rows
    assert accuracy > 70
def test_svm_skl_cmp_predict_proba(in_type, n_rows=10000, n_cols=20):
    """
    Compare probabilistic cuML SVC against scikit-learn's SVC.

    The cuML model is trained on the training split converted to the
    ``in_type`` container; scikit-learn is trained on the same split as
    returned by ``train_test_split``.
    """
    params = {
        'kernel': 'rbf',
        'C': 1,
        'tol': 1e-3,
        'gamma': 'scale',
        'probability': True,
    }

    X, y = make_classification(n_samples=n_rows, n_features=n_cols,
                               n_informative=2, n_redundant=10,
                               random_state=137)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42)

    # Route the training data through cuML arrays so that it can be handed
    # to fit() in the requested container type.
    X_cuml = input_to_cuml_array(X_train).array
    y_cuml = input_to_cuml_array(y_train).array

    cu_model = cu_svm.SVC(**params)
    cu_model.fit(X_cuml.to_output(in_type), y_cuml.to_output(in_type))

    skl_model = svm.SVC(**params)
    skl_model.fit(X_train, y_train)

    compare_probabilistic_svm(cu_model, skl_model, X_test, y_test, 1e-3, 1e-2)
def test_svm_skl_cmp_multiclass(params, dataset='classification2',
                                n_rows=100, n_cols=6):
    """Compare cuML and scikit-learn SVC on a three-class dataset."""
    X_train, X_test, y_train, y_test = make_dataset(
        dataset, n_rows, n_cols, n_classes=3, n_informative=6)

    # Default to numpy output for the whole comparison.
    with cuml.using_output_type("numpy"):
        cu_model = cu_svm.SVC(**params)
        cu_model.fit(X_train, y_train)

        skl_model = svm.SVC(**params)
        skl_model.fit(X_train, y_train)

        compare_svm(cu_model, skl_model, X_test, y_test,
                    coef_tol=1e-5, report_summary=True)
def test_svm_skl_cmp_kernels(params):
    """
    Compare cuML and scikit-learn SVC for the given kernel parameters.

    Both models are trained and evaluated on the binary iris dataset; the
    decision function is included in the comparison.
    """
    features, labels = get_binary_iris_dataset()

    cu_model = cu_svm.SVC(**params)
    cu_model.fit(features, labels)

    skl_model = svm.SVC(**params)
    skl_model.fit(features, labels)

    compare_svm(cu_model, skl_model, features, labels,
                cmp_decision_func=True)
def test_svm_predict(params, n_pred):
    """
    Check prediction accuracy on two blobs centred at (-5, -5) and (5, 5).

    500 samples are used for training and ``n_pred`` samples for prediction;
    more than 99% of the predictions must be correct.
    """
    n_rows = 500
    n_cols = 2
    blob_centers = [[-5, -5], [5, 5]]
    X, y = make_blobs(n_samples=n_rows + n_pred, n_features=n_cols,
                      centers=blob_centers)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=n_rows)

    model = cu_svm.SVC(**params)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    hits = np.sum(y_test == predicted)
    accuracy = hits * 100 / n_pred
    assert accuracy > 99
def test_svm_predict_convert_dtype(train_dtype, test_dtype, classifier):
    """
    Check that predict() accepts data whose dtype differs from training.

    The model (SVC when ``classifier`` is truthy, SVR otherwise) is trained
    on ``train_dtype`` data and then asked to predict on ``test_dtype`` data.
    The test passes as long as no exception is raised.
    """
    X, y = make_classification(n_samples=50, random_state=0)
    X = X.astype(train_dtype)
    y = y.astype(train_dtype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    model = cu_svm.SVC() if classifier else cu_svm.SVR()
    model.fit(X_train, y_train)
    model.predict(X_test.astype(test_dtype))
def test_svm_numeric_arraytype(x_dtype, y_dtype):
    """
    Check SVC training on the binary iris data for various numeric dtypes.

    The features are converted to ``x_dtype`` (Fortran order) and the labels
    to ``y_dtype``. The fitted intercept and the number of support vectors
    are compared against fixed reference values, and the model must classify
    every training sample correctly.
    """
    X, y = get_binary_iris_dataset()
    X = X.astype(x_dtype, order="F")
    y = y.astype(y_dtype)

    params = {'kernel': 'rbf', 'C': 1, 'gamma': 0.25}
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X, y)

    # Reference values for this dataset / parameter combination.
    intercept_exp = 0.23468959692060373
    n_sv_exp = 15
    assert abs(cuSVC.intercept_ - intercept_exp) / intercept_exp < 1e-3
    assert cuSVC.n_support_ == n_sv_exp

    # Count mismatches instead of summing signed differences: with
    # `pred - y`, a false positive (+1) and a false negative (-1) cancel out
    # and the check could pass despite wrong predictions.
    n_pred_wrong = np.sum(cuSVC.predict(X) != y)
    assert n_pred_wrong == 0
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    """
    Compare cuML and scikit-learn SVC on a generated dataset.

    The number of support vectors may differ by up to
    ``max(2, 0.02 * n_rows)``.
    """
    # A linear kernel will not fit the gaussian dataset, but takes very long
    # on large inputs, so that combination returns early.
    slow_linear_case = (params['kernel'] == 'linear'
                        and dataset in ['gaussian', 'classification2']
                        and n_rows > 1000
                        and n_cols >= 1000)
    if slow_linear_case:
        return

    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    cu_model = cuml.svm.SVC(**params)
    cu_model.fit(X_train, y_train)

    skl_model = svm.SVC(**params)
    skl_model.fit(X_train, y_train)

    sv_tolerance = max(2, 0.02*n_rows)
    compare_svm(cu_model, skl_model, X_test, y_test, n_sv_tol=sv_tolerance,
                coef_tol=1e-5, report_summary=True)
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    """
    Compare cuML and scikit-learn SVC on a generated dataset.

    The whole comparison runs with cuML's output type set to "numpy".
    """
    # A linear kernel will not fit the gaussian dataset, but takes very long
    # on large inputs, so that combination returns early.
    slow_linear_case = (params['kernel'] == 'linear'
                        and dataset in ['gaussian', 'classification2']
                        and n_rows > 1000
                        and n_cols >= 1000)
    if slow_linear_case:
        return

    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):
        cu_model = cu_svm.SVC(**params)
        cu_model.fit(X_train, y_train)

        skl_model = svm.SVC(**params)
        skl_model.fit(X_train, y_train)

        compare_svm(cu_model, skl_model, X_test, y_test,
                    coef_tol=1e-5, report_summary=True)
def train_boundary(latent_codes,
                   scores,
                   chosen_num_or_ratio=0.02,
                   split_ratio=0.7,
                   invalid_value=None,
                   logger=None):
    """Trains boundary in latent space with offline predicted attribute scores.

    Given a collection of latent codes and the attribute scores predicted from
    the corresponding images, this function will train a linear SVM by treating
    it as a bi-classification problem. Basically, the samples with highest
    attribute scores are treated as positive samples, while those with lowest
    scores as negative. For now, the latent code can ONLY be with 1 dimension.

    NOTE: The returned boundary is with shape (1, latent_space_dim), and also
    normalized with unit norm.

    Args:
        latent_codes: Input latent codes as training data, a `numpy.ndarray`
            with shape [num_samples, latent_space_dim].
        scores: Input attribute scores used to generate training labels, a
            `numpy.ndarray` with shape [num_samples, 1].
        chosen_num_or_ratio: How many samples will be chosen as positive
            (negative) samples. If this field lies in range (0, 1],
            `chosen_num_or_ratio * latent_codes_num` will be used. Otherwise,
            it is taken as an absolute count. In both cases the result is
            capped at `latent_codes_num // 2`. (default: 0.02)
        split_ratio: Ratio to split training and validation sets.
            (default: 0.7)
        invalid_value: Samples whose score equals this value are dropped
            before sorting. (default: None)
        logger: Logger for recording log messages. If set as `None`, a default
            logger is created with `setup_logger`. (default: None)

    Returns:
        A decision boundary with type `numpy.ndarray`.

    Raises:
        ValueError: If the input `latent_codes` or `scores` are with invalid
            format, if `chosen_num_or_ratio` is not positive, or if the
            selection leaves no sample to train on.
    """
    if not logger:
        logger = setup_logger(work_dir='', logger_name='train_boundary')

    if (not isinstance(latent_codes, np.ndarray) or
            not len(latent_codes.shape) == 2):
        raise ValueError('Input `latent_codes` should be with type '
                         '`numpy.ndarray`, and shape [num_samples, '
                         'latent_space_dim]!')
    num_samples = latent_codes.shape[0]
    latent_space_dim = latent_codes.shape[1]
    if (not isinstance(scores, np.ndarray) or not len(scores.shape) == 2 or
            not scores.shape[0] == num_samples or not scores.shape[1] == 1):
        raise ValueError('Input `scores` should be with type `numpy.ndarray`, '
                         'and shape [num_samples, 1], where `num_samples` '
                         'should be exactly same as that of input '
                         '`latent_codes`!')
    if chosen_num_or_ratio <= 0:
        raise ValueError(f'Input `chosen_num_or_ratio` should be positive, '
                         f'but {chosen_num_or_ratio} received!')

    logger.info('Filtering training data.')
    if invalid_value is not None:
        # Build the mask once so codes and scores are filtered identically.
        valid_mask = scores[:, 0] != invalid_value
        latent_codes = latent_codes[valid_mask]
        scores = scores[valid_mask]

    logger.info('Sorting scores to get positive and negative samples.')
    # Descending order: highest scores first, lowest scores last.
    sorted_idx = np.argsort(scores, axis=0)[::-1, 0]
    latent_codes = latent_codes[sorted_idx]
    scores = scores[sorted_idx]
    num_samples = latent_codes.shape[0]
    if 0 < chosen_num_or_ratio <= 1:
        chosen_num = int(num_samples * chosen_num_or_ratio)
    else:
        chosen_num = int(chosen_num_or_ratio)
    # Positive and negative sets must not overlap.
    chosen_num = min(chosen_num, num_samples // 2)

    logger.info('Spliting training and validation sets:')
    train_num = int(chosen_num * split_ratio)
    val_num = chosen_num - train_num
    if train_num < 1:
        # With chosen_num == 0 the `[-chosen_num:]` / `[chosen_num:-chosen_num]`
        # slices below no longer mean "last k" / "middle", and an empty
        # training set cannot be fitted; fail early with a clear message.
        raise ValueError(f'Not enough samples to train a boundary: '
                         f'{chosen_num} sample(s) chosen per class out of '
                         f'{num_samples}, {train_num} left for training!')

    # Positive samples (top of the sorted list).
    positive_idx = np.arange(chosen_num)
    np.random.shuffle(positive_idx)
    positive_train = latent_codes[:chosen_num][positive_idx[:train_num]]
    positive_val = latent_codes[:chosen_num][positive_idx[train_num:]]
    # Negative samples (bottom of the sorted list).
    negative_idx = np.arange(chosen_num)
    np.random.shuffle(negative_idx)
    negative_train = latent_codes[-chosen_num:][negative_idx[:train_num]]
    negative_val = latent_codes[-chosen_num:][negative_idx[train_num:]]

    # Training set. `int` replaces the `np.int` alias removed in NumPy 1.24.
    train_data = np.concatenate([positive_train, negative_train], axis=0)
    train_label = np.concatenate([np.ones(train_num, dtype=int),
                                  np.zeros(train_num, dtype=int)], axis=0)
    logger.info(f'  Training: {train_num} positive, {train_num} negative.')

    # Validation set.
    val_data = np.concatenate([positive_val, negative_val], axis=0)
    val_label = np.concatenate([np.ones(val_num, dtype=int),
                                np.zeros(val_num, dtype=int)], axis=0)
    logger.info(f'  Validation: {val_num} positive, {val_num} negative.')

    # Remaining set: everything between the chosen extremes, labelled by
    # comparing each score with the midpoint of the highest and lowest score.
    remaining_num = num_samples - chosen_num * 2
    remaining_data = latent_codes[chosen_num:-chosen_num]
    remaining_scores = scores[chosen_num:-chosen_num]
    decision_value = (scores[0] + scores[-1]) / 2
    remaining_label = np.ones(remaining_num, dtype=int)
    remaining_label[remaining_scores.ravel() < decision_value] = 0
    remaining_positive_num = np.sum(remaining_label == 1)
    remaining_negative_num = np.sum(remaining_label == 0)
    logger.info(f'  Remaining: {remaining_positive_num} positive, '
                f'{remaining_negative_num} negative.')

    logger.info('Training boundary.')
    clf = svm.SVC(kernel='linear')
    classifier = clf.fit(train_data, train_label)
    logger.info('Finish training.')

    if val_num:
        val_prediction = classifier.predict(val_data)
        correct_num = np.sum(val_label == val_prediction)
        logger.info(f'Accuracy for validation set: '
                    f'{correct_num} / {val_num * 2} = '
                    f'{correct_num / (val_num * 2):.6f}')

    if remaining_num:
        remaining_prediction = classifier.predict(remaining_data)
        correct_num = np.sum(remaining_label == remaining_prediction)
        logger.info(f'Accuracy for remaining set: '
                    f'{correct_num} / {remaining_num} = '
                    f'{correct_num / remaining_num:.6f}')

    # The normal vector of the separating hyperplane, scaled to unit norm.
    a = classifier.coef_.reshape(1, latent_space_dim).astype(np.float32)
    return a / np.linalg.norm(a)