Example #1
def test_svm_memleak_on_exception(params, n_rows=1000, n_iter=10,
                                  n_cols=1000, dataset='blobs'):
    """
    Test whether there is any mem leak when we exit training with an exception.
    The poly kernel with degree=30 will overflow, and triggers the
    'SMO error: NaN found...' exception.
    """
    X_train, y_train = make_blobs(n_samples=n_rows, n_features=n_cols,
                                  random_state=137, centers=2)
    X_train = X_train.astype(np.float32)
    stream = cuml.cuda.Stream()
    handle = cuml.Handle(stream=stream)

    # Warmup. Some modules that are used in SVC allocate space on the device
    # and consume memory. Here we make sure that this allocation is done
    # before the first call to get_memory_info.
    tmp = cu_svm.SVC(handle=handle, **params)
    with pytest.raises(RuntimeError):
        tmp.fit(X_train, y_train)
        # SMO error: NaN found during fitting.

    free_mem = cuda.current_context().get_memory_info()[0]

    # Main test loop
    for i in range(n_iter):
        cuSVC = cu_svm.SVC(handle=handle, **params)
        with pytest.raises(RuntimeError):
            cuSVC.fit(X_train, y_train)
            # SMO error: NaN found during fitting.

    del cuSVC
    handle.sync()
    delta_mem = free_mem - cuda.current_context().get_memory_info()[0]
    print("Delta GPU mem: {} bytes".format(delta_mem))
    assert delta_mem == 0
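A minimal sketch of a `params` value that should drive this failure path, based on the docstring's note that a degree-30 polynomial kernel overflows; the exact fixture values are assumptions, not taken from the test suite:

# Assumed fixture: a poly kernel of degree 30 overflows in fp32 and makes
# SMO raise 'SMO error: NaN found...', which the test expects.
params = {'kernel': 'poly', 'degree': 30, 'gamma': 1.0, 'C': 1}
test_svm_memleak_on_exception(params)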
Example #2
def test_svm_memleak(params,
                     n_rows,
                     n_iter,
                     n_cols,
                     use_handle,
                     dataset='blobs'):
    """
    Test whether there is any memory leak.

    .. note:: small `n_rows`, and `n_cols` values will result in small model
        size, that will not be measured by get_memory_info.

    """
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)
    stream = cuml.cuda.Stream()
    handle = cuml.Handle()
    handle.setStream(stream)
    # Warmup. Some modules that are used in SVC allocate space on the device
    # and consume memory. Here we make sure that this allocation is done
    # before the first call to get_memory_info.
    tmp = cu_svm.SVC(handle=handle, **params)
    tmp.fit(X_train, y_train)
    ms = get_memsize(tmp)
    print("Memory consumtion of SVC object is {} MiB".format(ms /
                                                             (1024 * 1024.0)))

    free_mem = cuda.current_context().get_memory_info()[0]

    # Check first whether the get_memory_info gives us the correct memory
    # footprint
    cuSVC = cu_svm.SVC(handle=handle, **params)
    cuSVC.fit(X_train, y_train)
    delta_mem = free_mem - cuda.current_context().get_memory_info()[0]
    assert delta_mem >= ms

    # Main test loop
    b_sum = 0
    for i in range(n_iter):
        cuSVC = cu_svm.SVC(handle=handle, **params)
        cuSVC.fit(X_train, y_train)
        b_sum += cuSVC.intercept_
        cuSVC.predict(X_train)

    del cuSVC
    handle.sync()
    delta_mem = free_mem - cuda.current_context().get_memory_info()[0]
    print("Delta GPU mem: {} bytes".format(delta_mem))
    assert delta_mem == 0
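The leak check above hinges on Numba's per-context memory query, which reports free and total device memory in bytes. A minimal, self-contained sketch of the measurement pattern, assuming a CUDA device is available:

from numba import cuda

# get_memory_info() returns (free, total) in bytes for the current device.
free_before = cuda.current_context().get_memory_info()[0]
# ... run the GPU workload under test ...
free_after = cuda.current_context().get_memory_info()[0]
leaked = free_before - free_after  # 0 if every allocation was released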
Example #3
def test_svc_weights(class_weight, sample_weight):
    # We are using the following example as a test case
    # https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html
    X, y = make_blobs(n_samples=[1000, 100],
                      centers=[[0.0, 0.0], [2.0, 2.0]],
                      cluster_std=[1.5, 0.5],
                      random_state=137, shuffle=False)
    if sample_weight:
        # Put large weight on class 1
        sample_weight = y * 9 + 1

    params = {'kernel': 'linear', 'C': 1, 'gamma': 'scale'}
    params['class_weight'] = class_weight
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X, y, sample_weight)

    if class_weight is not None or sample_weight is not None:
        # Standalone test: check if smaller blob is correctly classified in the
        # presence of class weights
        X_1 = X[y == 1, :]
        y_1 = np.ones(X_1.shape[0])
        cu_score = cuSVC.score(X_1, y_1)
        assert cu_score > 0.9

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X, y, sample_weight)
    compare_svm(cuSVC, sklSVC, X, y, coef_tol=1e-5, report_summary=True)
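The linked scikit-learn example weights the minority class via a dict. A hedged sketch of one plausible `class_weight` value for this test; the dict {1: 10} mirrors that sklearn example and is an assumption about the fixture, not taken from the test suite:

# Errors on class 1 now cost 10x more, pushing the hyperplane away from
# the smaller blob.
cuSVC = cu_svm.SVC(kernel='linear', C=1, gamma='scale', class_weight={1: 10})
cuSVC.fit(X, y)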
Example #4
def test_svm_skl_cmp_decision_function(params, n_rows=4000, n_cols=20):

    X_train, X_test, y_train, y_test = make_dataset('classification1', n_rows,
                                                    n_cols)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_train, y_train)

    pred = cuSVC.predict(X_test)
    assert pred.dtype == y_train.dtype

    df1 = cuSVC.decision_function(X_test)
    assert df1.dtype == X_train.dtype

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)
    df2 = sklSVC.decision_function(X_test)

    if params["probability"]:
        tol = 2e-2  # See comments in SVC decision_function method
    else:
        tol = 1e-5
    assert mean_squared_error(df1, df2) < tol
Example #5
def test_svm_gamma(params):
    # Note: we test different array types to make sure that X.var() is
    # calculated correctly for the gamma == 'scale' option.
    x_arraytype = params.pop('x_arraytype', 'numpy')
    n_rows = 500
    n_cols = 380
    centers = [10 * np.ones(380), -10 * np.ones(380)]
    X, y = make_blobs(n_samples=n_rows,
                      n_features=n_cols,
                      random_state=137,
                      centers=centers)
    X = X.astype(np.float32)
    if x_arraytype == 'dataframe':
        X_df = cudf.DataFrame()
        X = X_df.from_gpu_matrix(cuda.to_device(X))
    elif x_arraytype == 'numba':
        X = cuda.to_device(X)
    # Using degree 40 polynomials and fp32 training would fail with
    # gamma = 1/(n_cols*X.std()), but it works with the correct implementation:
    # gamma = 1/(n_cols*X.var())
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X, y)
    y_pred = cuSVC.predict(X).to_array()
    n_correct = np.sum(y == y_pred)
    accuracy = n_correct * 100 / n_rows
    assert accuracy > 70
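The comment above refers to scikit-learn's definition of `gamma='scale'`. A one-line sketch of that formula for a NumPy array `X`, illustrative rather than part of the test:

# gamma='scale' is 1 / (n_features * Var(X)); using std() instead of var()
# is the bug the comment warns about.
gamma_scale = 1.0 / (X.shape[1] * X.var())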
Example #6
def test_svm_skl_cmp_predict_proba(in_type, n_rows=10000, n_cols=20):
    params = {
        'kernel': 'rbf',
        'C': 1,
        'tol': 1e-3,
        'gamma': 'scale',
        'probability': True
    }
    X, y = make_classification(n_samples=n_rows,
                               n_features=n_cols,
                               n_informative=2,
                               n_redundant=10,
                               random_state=137)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.8,
                                                        random_state=42)

    X_m = input_to_cuml_array(X_train).array
    y_m = input_to_cuml_array(y_train).array

    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_m.to_output(in_type), y_m.to_output(in_type))
    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)
    compare_probabilistic_svm(cuSVC, sklSVC, X_test, y_test, 1e-3, 1e-2)
Example #7
def test_svm_skl_cmp_multiclass(params,
                                dataset='classification2',
                                n_rows=100,
                                n_cols=6):
    X_train, X_test, y_train, y_test = make_dataset(dataset,
                                                    n_rows,
                                                    n_cols,
                                                    n_classes=3,
                                                    n_informative=6)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
        cuSVC.fit(X_train, y_train)

        sklSVC = svm.SVC(**params)
        sklSVC.fit(X_train, y_train)

        compare_svm(cuSVC,
                    sklSVC,
                    X_test,
                    y_test,
                    coef_tol=1e-5,
                    report_summary=True)
Example #8
def test_svm_skl_cmp_kernels(params):
    # X_train, X_test, y_train, y_test = make_dataset('gaussian', 1000, 4)
    X_train, y_train = get_binary_iris_dataset()
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True)
Example #9
def test_svm_predict(params, n_pred):
    n_rows = 500
    n_cols = 2
    X, y = make_blobs(n_samples=n_rows + n_pred, n_features=n_cols,
                      centers=[[-5, -5], [5, 5]])
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=n_rows)
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_train, y_train)
    y_pred = cuSVC.predict(X_test)
    n_correct = np.sum(y_test == y_pred)
    accuracy = n_correct * 100 / n_pred
    assert accuracy > 99
Example #10
def test_svm_predict_convert_dtype(train_dtype, test_dtype, classifier):
    X, y = make_classification(n_samples=50, random_state=0)

    X = X.astype(train_dtype)
    y = y.astype(train_dtype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    if classifier:
        clf = cu_svm.SVC()
    else:
        clf = cu_svm.SVR()
    clf.fit(X_train, y_train)
    clf.predict(X_test.astype(test_dtype))
Example #11
def test_svm_numeric_arraytype(x_dtype, y_dtype):
    X, y = get_binary_iris_dataset()
    X = X.astype(x_dtype, order="F")
    y = y.astype(y_dtype)

    params = {'kernel': 'rbf', 'C': 1, 'gamma': 0.25}
    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X, y)
    intercept_exp = 0.23468959692060373
    n_sv_exp = 15
    assert abs(cuSVC.intercept_ - intercept_exp) / intercept_exp < 1e-3
    assert cuSVC.n_support_ == n_sv_exp
    # Count mismatches; summing raw differences (+1/-1) could cancel out.
    n_pred_wrong = np.sum(cuSVC.predict(X) != y)
    assert n_pred_wrong == 0
Example #12
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # The linear kernel will not fit the gaussian dataset well, but
        # training would take very long, so skip this combination.
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    cuSVC = cuml.svm.SVC(**params)
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_test, y_test, n_sv_tol=max(2, 0.02*n_rows),
                coef_tol=1e-5, report_summary=True)
Example #13
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # The linear kernel will not fit the gaussian dataset well, but
        # training would take very long, so skip this combination.
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
        cuSVC.fit(X_train, y_train)

        sklSVC = svm.SVC(**params)
        sklSVC.fit(X_train, y_train)

        compare_svm(cuSVC, sklSVC, X_test, y_test, coef_tol=1e-5,
                    report_summary=True)
Example #14
def train_boundary(latent_codes,
                   scores,
                   chosen_num_or_ratio=0.02,
                   split_ratio=0.7,
                   invalid_value=None,
                   logger=None):
  """Trains boundary in latent space with offline predicted attribute scores.

  Given a collection of latent codes and the attribute scores predicted from the
  corresponding images, this function will train a linear SVM by treating it as
  a bi-classification problem. Basically, the samples with highest attribute
  scores are treated as positive samples, while those with lowest scores as
  negative. For now, the latent code can ONLY be with 1 dimension.

  NOTE: The returned boundary is with shape (1, latent_space_dim), and also
  normalized with unit norm.

  Args:
    latent_codes: Input latent codes as training data.
    scores: Input attribute scores used to generate training labels.
    chosen_num_or_ratio: How many samples will be chosen as positive (negative)
      samples. If this field lies in range (0, 0.5], `chosen_num_or_ratio *
      latent_codes_num` will be used. Otherwise, `min(chosen_num_or_ratio,
      0.5 * latent_codes_num)` will be used. (default: 0.02)
    split_ratio: Ratio to split training and validation sets. (default: 0.7)
    invalid_value: This field is used to filter out data. (default: None)
    logger: Logger for recording log messages. If set as `None`, a default
      logger, which prints messages from all levels to screen, will be created.
      (default: None)

  Returns:
    A decision boundary with type `numpy.ndarray`.

  Raises:
    ValueError: If the input `latent_codes` or `scores` have an invalid
      format.
  """
  if not logger:
    logger = setup_logger(work_dir='', logger_name='train_boundary')

  if (not isinstance(latent_codes, np.ndarray) or
      not len(latent_codes.shape) == 2):
    raise ValueError('Input `latent_codes` should be of type '
                     '`numpy.ndarray` with shape [num_samples, '
                     'latent_space_dim]!')
  num_samples = latent_codes.shape[0]
  latent_space_dim = latent_codes.shape[1]
  if (not isinstance(scores, np.ndarray) or not len(scores.shape) == 2 or
      not scores.shape[0] == num_samples or not scores.shape[1] == 1):
    raise ValueError('Input `scores` should be of type `numpy.ndarray` with '
                     'shape [num_samples, 1], where `num_samples` should be '
                     'exactly the same as that of input `latent_codes`!')
  if chosen_num_or_ratio <= 0:
    raise ValueError(f'Input `chosen_num_or_ratio` should be positive, '
                     f'but {chosen_num_or_ratio} received!')

  logger.info(f'Filtering training data.')
  if invalid_value is not None:
    latent_codes = latent_codes[scores[:, 0] != invalid_value]
    scores = scores[scores[:, 0] != invalid_value]

  logger.info(f'Sorting scores to get positive and negative samples.')
  sorted_idx = np.argsort(scores, axis=0)[::-1, 0]
  latent_codes = latent_codes[sorted_idx]
  scores = scores[sorted_idx]
  num_samples = latent_codes.shape[0]
  if 0 < chosen_num_or_ratio <= 1:
    chosen_num = int(num_samples * chosen_num_or_ratio)
  else:
    chosen_num = int(chosen_num_or_ratio)
  chosen_num = min(chosen_num, num_samples // 2)

  logger.info('Splitting training and validation sets:')
  train_num = int(chosen_num * split_ratio)
  val_num = chosen_num - train_num
  # Positive samples.
  positive_idx = np.arange(chosen_num)
  np.random.shuffle(positive_idx)
  positive_train = latent_codes[:chosen_num][positive_idx[:train_num]]
  positive_val = latent_codes[:chosen_num][positive_idx[train_num:]]
  # Negative samples.
  negative_idx = np.arange(chosen_num)
  np.random.shuffle(negative_idx)
  negative_train = latent_codes[-chosen_num:][negative_idx[:train_num]]
  negative_val = latent_codes[-chosen_num:][negative_idx[train_num:]]
  # Training set.
  train_data = np.concatenate([positive_train, negative_train], axis=0)
  train_label = np.concatenate([np.ones(train_num, dtype=int),
                                np.zeros(train_num, dtype=int)], axis=0)
  logger.info(f'  Training: {train_num} positive, {train_num} negative.')
  # Validation set.
  val_data = np.concatenate([positive_val, negative_val], axis=0)
  val_label = np.concatenate([np.ones(val_num, dtype=int),
                              np.zeros(val_num, dtype=int)], axis=0)
  logger.info(f'  Validation: {val_num} positive, {val_num} negative.')
  # Remaining set.
  remaining_num = num_samples - chosen_num * 2
  remaining_data = latent_codes[chosen_num:-chosen_num]
  remaining_scores = scores[chosen_num:-chosen_num]
  decision_value = (scores[0] + scores[-1]) / 2
  remaining_label = np.ones(remaining_num, dtype=int)
  remaining_label[remaining_scores.ravel() < decision_value] = 0
  remaining_positive_num = np.sum(remaining_label == 1)
  remaining_negative_num = np.sum(remaining_label == 0)
  logger.info(f'  Remaining: {remaining_positive_num} positive, '
              f'{remaining_negative_num} negative.')

  logger.info(f'Training boundary.')
  clf = svm.SVC(kernel='linear')
  classifier = clf.fit(train_data, train_label)
  logger.info('Finished training.')

  if val_num:
    val_prediction = classifier.predict(val_data)
    correct_num = np.sum(val_label == val_prediction)
    logger.info(f'Accuracy for validation set: '
                f'{correct_num} / {val_num * 2} = '
                f'{correct_num / (val_num * 2):.6f}')

  if remaining_num:
    remaining_prediction = classifier.predict(remaining_data)
    correct_num = np.sum(remaining_label == remaining_prediction)
    logger.info(f'Accuracy for remaining set: '
                f'{correct_num} / {remaining_num} = '
                f'{correct_num / remaining_num:.6f}')

  a = classifier.coef_.reshape(1, latent_space_dim).astype(np.float32)
  return a / np.linalg.norm(a)
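A hypothetical usage sketch; the shapes follow the docstring, while the random data and the 512-dimensional latent space are assumptions for illustration:

import numpy as np

latent_codes = np.random.randn(10000, 512).astype(np.float32)  # [num, dim]
scores = np.random.rand(10000, 1)                              # [num, 1]
boundary = train_boundary(latent_codes, scores, chosen_num_or_ratio=0.02)
assert boundary.shape == (1, 512)  # unit-norm normal of the hyperplane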