Example #1
    def __init__(self, dataset_properties, random_state=None):
        """
        Parameters
        ----------
        dataset_properties : dict
            Describes the dataset to work on, this can change the
            configuration space constructed by auto-sklearn. Mandatory
            properties are:
            * target_type: classification or regression


            Optional properties are:
            * multiclass: whether the dataset is a multiclass classification
              dataset.
            * multilabel: whether the dataset is a multilabel classification
              dataset
        """

        # Since all calls to get_hyperparameter_search_space will be done by the
        # pipeline on construction, it is not necessary to construct a
        # configuration space at this location!
        # self.configuration = self.get_hyperparameter_search_space(
        #     dataset_properties).get_default_configuration()

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

        # Since the pipeline will initialize the hyperparameters, it is not
        # necessary to do this upon the construction of this object
        # self.set_hyperparameters(self.configuration)
        self.choice = None
Example #2
    def _iter_test_indices(self, X, y=None, groups=None):
        """Internal method for providing scikit-learn's split with folds

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        fold : List[int]
            indexes of test samples for a given fold, yielded for each of the folds
        """
        if self.random_state:
            check_random_state(self.random_state)

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            self._prepare_stratification(y)

        self._distribute_positive_evidence(rows_used, folds, samples_with_combination, per_row_combinations)
        self._distribute_negative_evidence(rows_used, folds)

        for fold in folds:
            yield fold
Example #3
def test_number_of_subsets_of_features():
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case is to test their equivalence, refer to #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(estimator=SVC(kernel="linear"),
                  n_features_to_select=n_features_to_select, step=step)
        rfe.fit(X, y)
        # this number also equals to the maximum of ranking_
        assert_equal(np.max(rfe.ranking_),
                     formula1(n_features, n_features_to_select, step))
        assert_equal(np.max(rfe.ranking_),
                     formula2(n_features, n_features_to_select, step))

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of 'grid_scores' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step

    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5)
        rfecv.fit(X, y)

        assert_equal(rfecv.grid_scores_.shape[0],
                     formula1(n_features, n_features_to_select, step))
        assert_equal(rfecv.grid_scores_.shape[0],
                     formula2(n_features, n_features_to_select, step))
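The equivalence claimed in the comments above holds for any positive step; a minimal standalone sketch (pure NumPy, no RFE needed) that spot-checks it over a small grid of assumed parameter values:

import numpy as np

def formula1(n_features, n_features_to_select, step):
    return 1 + ((n_features + step - n_features_to_select - 1) // step)

def formula2(n_features, n_features_to_select, step):
    return 1 + np.ceil((n_features - n_features_to_select) / float(step))

# Spot-check the two expressions over a small grid of parameter values.
for n_features in range(2, 20):
    for n_features_to_select in range(1, n_features):
        for step in range(1, 5):
            assert (formula1(n_features, n_features_to_select, step) ==
                    formula2(n_features, n_features_to_select, step))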
Example #4
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [145, None, check_random_state(None), check_random_state(145)]:
        clf1 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
Example #5
    def _init_fit(self, n_features):
        """Initialize weight and bias parameters."""

        rng = check_random_state(self.random_state)
        weight_init_bound1 = np.sqrt(6. / (n_features + self.n_hidden))
        weight_init_bound2 = np.sqrt(6. / (n_features + self.n_hidden))      
        rng = check_random_state(self.random_state)
        self.coef_hidden_ = rng.uniform(-weight_init_bound1, weight_init_bound1, (n_features, self.n_hidden))
        rng = check_random_state(self.random_state)
        self.intercept_hidden_ = rng.uniform(-weight_init_bound1, weight_init_bound1, self.n_hidden)
        rng = check_random_state(self.random_state)
        self.coef_output_ = rng.uniform(-weight_init_bound2, weight_init_bound2, (self.n_hidden, n_features))
        rng = check_random_state(self.random_state)
        self.intercept_output_ = rng.uniform(-weight_init_bound2, weight_init_bound2, n_features)
Example #6
    def _iter_test_indices(self, frame, y=None):
        n_obs = frame.shape[0]
        indices = np.arange(n_obs)
        if self.shuffle:
            check_random_state(self.random_state).shuffle(indices)

        n_folds = self.n_folds
        fold_sizes = (n_obs // n_folds) * np.ones(n_folds, dtype=int)
        fold_sizes[:n_obs % n_folds] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            yield indices[start:stop]
            current = stop
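The fold-size arithmetic above gives every fold n_obs // n_folds samples and hands the remainder out one by one to the first folds; a tiny standalone sketch with assumed values n_obs=10, n_folds=3:

import numpy as np

n_obs, n_folds = 10, 3
fold_sizes = (n_obs // n_folds) * np.ones(n_folds, dtype=int)
fold_sizes[:n_obs % n_folds] += 1
print(fold_sizes)                  # [4 3 3]
assert fold_sizes.sum() == n_obs   # every sample lands in exactly one fold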
Example #7
def test_make_rng():
    """Check the check_random_state utility function behavior"""
    assert check_random_state(None) is np.random.mtrand._rand
    assert check_random_state(np.random) is np.random.mtrand._rand

    rng_42 = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == rng_42.randint(100)

    rng_42 = np.random.RandomState(42)
    assert check_random_state(rng_42) is rng_42

    rng_42 = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != rng_42.randint(100)

    assert_raises(ValueError, check_random_state, "some invalid seed")
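test_make_rng above spells out the full contract of check_random_state: None or the np.random module map to the global RNG, an int seeds a fresh RandomState, and a RandomState instance passes through unchanged. A minimal sketch of the usual estimator-style usage built on that contract; the jitter helper is hypothetical, only numpy and sklearn.utils.check_random_state are assumed:

import numpy as np
from sklearn.utils import check_random_state

def jitter(X, scale=0.1, random_state=None):
    # Accept None, an int seed, or a RandomState instance, like estimators do.
    rng = check_random_state(random_state)
    return X + scale * rng.randn(*X.shape)

X = np.zeros((3, 2))
# The same int seed reproduces the same draw; None would use the global RNG.
assert np.allclose(jitter(X, random_state=0), jitter(X, random_state=0))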
Example #8
    def run_stochastic_models(self,
                              params,
                              n_input_sample,
                              return_input_samples=True,
                              random_state=None,
                              verbose=False):
        """This function considers the model output as a stochastic function by 
        taking the dependence parameters as inputs.

        Parameters
        ----------
        params : list, or `np.ndarray`
            The list of parameters associated to the predefined copula.
        n_input_sample : int, optional (default=1)
            The number of evaluations for each parameter
        random_state : int, RandomState instance or None, optional (default=None)
            The seed or state of the random number generator.
        """
        check_random_state(random_state)
        func = self.model_func

        # Get all the input_sample
        if verbose:
            print('Time taken:', time.clock())
            print('Creating the input samples')

        input_samples = []
        for param in params:
            full_param = np.zeros((self._corr_dim, ))
            full_param[self._pair_ids] = param
            full_param[self._fixed_pairs_ids] = self._fixed_params_list
            input_sample = self._get_sample(full_param, n_input_sample)
            input_samples.append(input_sample)

        if verbose:
            print('Time taken:', time.clock())
            print('Evaluate the input samples')

        # Evaluate the input samples through the model
        outputs = func(np.concatenate(input_samples))
        # List of output sample for each param
        output_samples = np.split(outputs, len(params))

        if verbose:
            print('Time taken:', time.clock())
        if return_input_samples:
            return output_samples, input_samples
        else:
            return output_samples
Example #9
def random_spd(p, eig_min, cond, random_state=0):
    """Generate a random symmetric positive definite matrix.

    Parameters
    ----------
    p : int
        The first dimension of the array.

    eig_min : float
        Minimal eigenvalue.

    cond : float
        Condition number, defined as the ratio of the maximum eigenvalue to the
        minimum one.

    random_state : int or numpy.random.RandomState instance, optional
        random number generator, or seed.

    Returns
    -------
    output : numpy.ndarray, shape (p, p)
        A symmetric positive definite matrix with the given minimal eigenvalue
        and condition number.
    """
    random_state = check_random_state(random_state)
    mat = random_state.randn(p, p)
    unitary, _ = linalg.qr(mat)
    diag = random_diagonal(p, v_min=eig_min, v_max=cond * eig_min,
                           random_state=random_state)
    return unitary.dot(diag).dot(unitary.T)
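A quick sanity check for random_spd, assuming it and the random_diagonal helper it calls (defined further down in this collection) are in scope: the result should be symmetric, with smallest eigenvalue equal to eig_min and condition number equal to cond.

import numpy as np

mat = random_spd(p=4, eig_min=1., cond=10., random_state=0)
eigvals = np.linalg.eigvalsh(mat)
assert np.allclose(mat, mat.T)                          # symmetric
assert np.isclose(eigvals.min(), 1.)                    # smallest eigenvalue == eig_min
assert np.isclose(eigvals.max() / eigvals.min(), 10.)   # condition number == cond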
Example #10
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
Example #11
def test_reduction_to_one_component():
    # t-SNE should allow reduction to one component (issue #4154).
    random_state = check_random_state(0)
    tsne = TSNE(n_components=1)
    X = random_state.randn(5, 2)
    X_embedded = tsne.fit(X).embedding_
    assert(np.all(np.isfinite(X_embedded)))
Example #12
def _mask_and_reduce_single(masker,
                            img, confound,
                            reduction_ratio=None,
                            n_samples=None,
                            memory=None,
                            memory_level=0,
                            random_state=None):
    """Utility function for multiprocessing from MaskReducer"""
    this_data = masker.transform(img, confound)
    # Now get rid of the img as fast as possible, to free a
    # reference count on it, and possibly free the corresponding
    # data
    del img
    random_state = check_random_state(random_state)

    data_n_samples = this_data.shape[0]
    if reduction_ratio is None:
        assert n_samples is not None
        n_samples = min(n_samples, data_n_samples)
    else:
        n_samples = int(ceil(data_n_samples * reduction_ratio))

    U, S, V = cache(fast_svd, memory,
                        memory_level=memory_level,
                        func_memory_level=3)(this_data.T,
                                             n_samples,
                                             random_state=random_state)
    U = U.T.copy()
    U = U * S[:, np.newaxis]
    return U
Example #13
def test_accessible_kl_divergence():
    # Ensures that the accessible kl_divergence matches the computed value
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    tsne = TSNE(n_iter_without_progress=2, verbose=2,
                random_state=0, method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the accessible kl_divergence as the error at
    # the last iteration
    for line in out.split('\n')[::-1]:
        if 'Iteration' in line:
            _, _, error = line.partition('error = ')
            if error:
                error, _, _ = error.partition(',')
                break
    assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)
Example #14
def test_binary_perplexity_stability():
    # Binary perplexity search should be stable.
    # The binary_search_perplexity had a bug wherein the P array
    # was uninitialized, leading to sporadically failing tests.
    k = 10
    n_samples = 100
    random_state = check_random_state(0)
    distances = random_state.randn(n_samples, 2).astype(np.float32)
    # Distances shouldn't be negative
    distances = np.abs(distances.dot(distances.T))
    np.fill_diagonal(distances, 0.0)
    last_P = None
    neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
    for _ in range(100):
        P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(),
                                      3, verbose=0)
        P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0)
        # Convert the sparse matrix to a dense one for testing
        P1 = P1.toarray()
        if last_P is None:
            last_P = P
            last_P1 = P1
        else:
            assert_array_almost_equal(P, last_P, decimal=4)
            assert_array_almost_equal(P1, last_P1, decimal=4)
Example #15
def test_gradient():
    # Test gradient of Kullback-Leibler divergence.
    random_state = check_random_state(0)

    n_samples = 50
    n_features = 2
    n_components = 2
    alpha = 1.0

    distances = random_state.randn(n_samples, n_features).astype(np.float32)
    distances = np.abs(distances.dot(distances.T))
    np.fill_diagonal(distances, 0.0)
    X_embedded = random_state.randn(n_samples, n_components).astype(np.float32)

    P = _joint_probabilities(distances, desired_perplexity=25.0,
                             verbose=0)

    def fun(params):
        return _kl_divergence(params, P, alpha, n_samples, n_components)[0]

    def grad(params):
        return _kl_divergence(params, P, alpha, n_samples, n_components)[1]

    assert_almost_equal(check_grad(fun, grad, X_embedded.ravel()), 0.0,
                        decimal=5)
Example #16
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    prng = check_random_state(prng)
    g = GaussianMixture(n_mix, covariance_type=covariance_type)
    g.means_ = prng.randint(-20, 20, (n_mix, n_features))
    g.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)
    g.weights_ = normalized(prng.rand(n_mix))
    return g
Example #17
def test_oneclass_decision_function():
    # Test OneClassSVM decision function
    clf = svm.OneClassSVM()
    rnd = check_random_state(2)

    # Generate train data
    X = 0.3 * rnd.randn(100, 2)
    X_train = np.r_[X + 2, X - 2]

    # Generate some regular novel observations
    X = 0.3 * rnd.randn(20, 2)
    X_test = np.r_[X + 2, X - 2]
    # Generate some abnormal novel observations
    X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2))

    # fit the model
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    # predict things
    y_pred_test = clf.predict(X_test)
    assert_greater(np.mean(y_pred_test == 1), .9)
    y_pred_outliers = clf.predict(X_outliers)
    assert_greater(np.mean(y_pred_outliers == -1), .9)
    dec_func_test = clf.decision_function(X_test)
    assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1)
    dec_func_outliers = clf.decision_function(X_outliers)
    assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1)
Example #18
def random_diagonal(p, v_min=1., v_max=2., random_state=0):
    """Generate a random diagonal matrix.

    Parameters
    ----------
    p : int
        The first dimension of the array.

    v_min : float, optional (default to 1.)
        Minimal element.

    v_max : float, optional (default to 2.)
        Maximal element.

    random_state : int or numpy.random.RandomState instance, optional
        random number generator, or seed.

    Returns
    -------
    output : numpy.ndarray, shape (p, p)
        A diagonal matrix with the given minimal and maximal elements.

    """
    random_state = check_random_state(random_state)
    diag = random_state.rand(p) * (v_max - v_min) + v_min
    diag[diag == np.amax(diag)] = v_max
    diag[diag == np.amin(diag)] = v_min
    return np.diag(diag)
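A short usage check for random_diagonal, using only the function above and NumPy: the extreme diagonal entries are pinned exactly to v_min and v_max, and everything off the diagonal is zero.

import numpy as np

diag_mat = random_diagonal(p=5, v_min=1., v_max=2., random_state=0)
entries = np.diag(diag_mat)
assert diag_mat.shape == (5, 5)
assert np.all(diag_mat == np.diag(entries))     # off-diagonal entries are zero
assert entries.min() == 1. and entries.max() == 2.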
Example #19
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert_equal(len(rfe.ranking_), X.shape[1])

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target))
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
Example #20
def test_compute_mi_cd():
    # To test define a joint distribution as follows:
    # p(x, y) = p(x) p(y | x)
    # X ~ Bernoulli(p)
    # (Y | x = 0) ~ Uniform(-1, 1)
    # (Y | x = 1) ~ Uniform(0, 2)

    # Use the following formula for mutual information:
    # I(X; Y) = H(Y) - H(Y | X)
    # Two entropies can be computed by hand:
    # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2)
    # H(Y | X) = ln(2)

    # Now we need to implement sampling from our distribution, which is
    # done easily using conditional distribution logic.

    n_samples = 1000
    rng = check_random_state(0)

    for p in [0.3, 0.5, 0.7]:
        x = rng.uniform(size=n_samples) > p

        y = np.empty(n_samples)
        mask = x == 0
        y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
        y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

        I_theory = -0.5 * ((1 - p) * np.log(0.5 * (1 - p)) +
                           p * np.log(0.5 * p) + np.log(0.5)) - np.log(2)

        # Assert the same tolerance.
        for n_neighbors in [3, 5, 7]:
            I_computed = _compute_mi(x, y, True, False, n_neighbors)
            assert_almost_equal(I_computed, I_theory, 1)
Example #21
def test_space_net_alpha_grid_pure_spatial():
    rng = check_random_state(42)
    X = rng.randn(10, 100)
    y = np.arange(X.shape[0])
    for is_classif in [True, False]:
        assert_false(np.any(np.isnan(_space_net_alpha_grid(
            X, y, l1_ratio=0., logistic=is_classif))))
Example #22
def test_lda_dimension_warning(n_classes, n_features):
    # FIXME: Future warning to be removed in 0.23
    rng = check_random_state(0)
    n_samples = 10
    X = rng.randn(n_samples, n_features)
    # we create n_classes labels by repeating and truncating a
    # range(n_classes) until n_samples
    y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
    max_components = min(n_features, n_classes - 1)

    for n_components in [max_components - 1, None, max_components]:
        # if n_components <= min(n_classes - 1, n_features), no warning
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        assert_no_warnings(lda.fit, X, y)

    for n_components in [max_components + 1,
                         max(n_features, n_classes - 1) + 1]:
        # if n_components > min(n_classes - 1, n_features), raise warning
        # We test one unit higher than max_components, and then something
        # larger than both n_features and n_classes - 1 to ensure the test
        # works for any value of n_component
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        msg = ("n_components cannot be larger than min(n_features, "
               "n_classes - 1). Using min(n_features, "
               "n_classes - 1) = min(%d, %d - 1) = %d components." %
               (n_features, n_classes, max_components))
        assert_warns_message(ChangedBehaviorWarning, msg, lda.fit, X, y)
        future_msg = ("In version 0.23, setting n_components > min("
                      "n_features, n_classes - 1) will raise a "
                      "ValueError. You should set n_components to None"
                      " (default), or a value smaller or equal to "
                      "min(n_features, n_classes - 1).")
        assert_warns_message(FutureWarning, future_msg, lda.fit, X, y)
Example #23
def test_compute_mi_cc():
    # For two continuous variables a good approach is to test on bivariate
    # normal distribution, where mutual information is known.

    # Mean of the distribution, irrelevant for mutual information.
    mean = np.zeros(2)

    # Setup covariance matrix with correlation coeff. equal 0.5.
    sigma_1 = 1
    sigma_2 = 10
    corr = 0.5
    cov = np.array([
        [sigma_1**2, corr * sigma_1 * sigma_2],
        [corr * sigma_1 * sigma_2, sigma_2**2]
    ])

    # True theoretical mutual information.
    I_theory = (np.log(sigma_1) + np.log(sigma_2) -
                0.5 * np.log(np.linalg.det(cov)))

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)

    x, y = Z[:, 0], Z[:, 1]

    # Theory and computed values won't be very close, assert that the
    # first figures after decimal point match.
    for n_neighbors in [3, 5, 7]:
        I_computed = _compute_mi(x, y, False, False, n_neighbors)
        assert_almost_equal(I_computed, I_theory, 1)
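Since det(cov) = sigma_1^2 * sigma_2^2 * (1 - corr^2), the I_theory used above collapses to -0.5 * log(1 - corr^2); a quick numeric check of that identity with the same numbers:

import numpy as np

sigma_1, sigma_2, corr = 1, 10, 0.5
cov = np.array([
    [sigma_1**2, corr * sigma_1 * sigma_2],
    [corr * sigma_1 * sigma_2, sigma_2**2]
])
I_theory = (np.log(sigma_1) + np.log(sigma_2) -
            0.5 * np.log(np.linalg.det(cov)))
assert np.isclose(I_theory, -0.5 * np.log(1 - corr**2))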
Example #24
    def fit(self, X, y):
        """
        Class method to define class populations and store them as instance
        variables. Also stores majority class label
        """
        self.X = check_array(X)
        self.y = np.array(y).astype(np.int64)
        self.random_state_ = check_random_state(self.random_state)
        self.unique_classes_ = set(self.y)

        # Initialize all class populations with zero
        for element in self.unique_classes_:
            self.clstats[element] = 0

        # Count occurences of each class
        for element in self.y:
            self.clstats[element] += 1

        # Find majority class
        v = list(self.clstats.values())
        k = list(self.clstats.keys())
        self.maj_class_ = k[v.index(max(v))]

        if self.verbose:
            print(
                'Majority class is %s and total number of classes is %s'
                % (self.maj_class_, len(self.unique_classes_)))
Example #25
    def rvs(self, n=1, random_state=None):
        """Generate random samples from the model.

        Parameters
        ----------
        n : int
            Number of samples to generate.

        Returns
        -------
        obs : array_like, length `n`
            List of samples
        """
        random_state = check_random_state(random_state)

        startprob_pdf = self.startprob
        startprob_cdf = np.cumsum(startprob_pdf)
        transmat_pdf = self.transmat
        transmat_cdf = np.cumsum(transmat_pdf, 1)

        # Initial state.
        rand = random_state.rand()
        currstate = (startprob_cdf > rand).argmax()
        obs = [self._generate_sample_from_state(
            currstate, random_state=random_state)]

        for x in range(n - 1):
            rand = random_state.rand()
            currstate = (transmat_cdf[currstate] > rand).argmax()
            obs.append(self._generate_sample_from_state(
                currstate, random_state=random_state))

        return np.array(obs)
Example #26
def test_linearsvr_fit_sampleweight():
    # check correct result when sample_weight is 1
    # check that SVR(kernel='linear') and LinearSVC() give
    # comparable results
    diabetes = datasets.load_diabetes()
    n_samples = len(diabetes.target)
    unit_weight = np.ones(n_samples)
    lsvr = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target,
                                    sample_weight=unit_weight)
    score1 = lsvr.score(diabetes.data, diabetes.target)

    lsvr_no_weight = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target)
    score2 = lsvr_no_weight.score(diabetes.data, diabetes.target)

    assert_allclose(np.linalg.norm(lsvr.coef_),
                    np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001)
    assert_almost_equal(score1, score2, 2)

    # check that fit(X)  = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    random_weight = random_state.randint(0, 10, n_samples)
    lsvr_unflat = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target,
                                           sample_weight=random_weight)
    score3 = lsvr_unflat.score(diabetes.data, diabetes.target,
                               sample_weight=random_weight)

    X_flat = np.repeat(diabetes.data, random_weight, axis=0)
    y_flat = np.repeat(diabetes.target, random_weight, axis=0)
    lsvr_flat = svm.LinearSVR(C=1e3).fit(X_flat, y_flat)
    score4 = lsvr_flat.score(X_flat, y_flat)

    assert_almost_equal(score3, score4, 2)
Example #27
def test_linearsvc_fit_sampleweight():
    # check correct result when sample_weight is 1
    n_samples = len(X)
    unit_weight = np.ones(n_samples)
    clf = svm.LinearSVC(random_state=0).fit(X, Y)
    clf_unitweight = svm.LinearSVC(random_state=0).\
        fit(X, Y, sample_weight=unit_weight)

    # check if same as sample_weight=None
    assert_array_equal(clf_unitweight.predict(T), clf.predict(T))
    assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001)

    # check that fit(X)  = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth

    random_state = check_random_state(0)
    random_weight = random_state.randint(0, 10, n_samples)
    lsvc_unflat = svm.LinearSVC(random_state=0).\
        fit(X, Y, sample_weight=random_weight)
    pred1 = lsvc_unflat.predict(T)

    X_flat = np.repeat(X, random_weight, axis=0)
    y_flat = np.repeat(Y, random_weight, axis=0)
    lsvc_flat = svm.LinearSVC(random_state=0).fit(X_flat, y_flat)
    pred2 = lsvc_flat.predict(T)

    assert_array_equal(pred1, pred2)
    assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001)
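Both sample-weight tests above rely on the same trick: an integer weight vector drawn from check_random_state is equivalent to repeating each row that many times via np.repeat. A minimal sketch of just that data manipulation, with a small hypothetical X_small:

import numpy as np
from sklearn.utils import check_random_state

X_small = np.array([[0.0], [1.0], [2.0]])
random_state = check_random_state(0)
random_weight = random_state.randint(0, 10, X_small.shape[0])
X_flat = np.repeat(X_small, random_weight, axis=0)
# Row i of X_small appears exactly random_weight[i] times in X_flat.
assert X_flat.shape[0] == random_weight.sum()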
Example #28
def random_non_singular(p, sing_min=1., sing_max=2., random_state=0):
    """Generate a random nonsingular matrix.

    Parameters
    ----------
    p : int
        The first dimension of the array.

    sing_min : float, optional (default to 1.)
        Minimal singular value.

    sing_max : float, optional (default to 2.)
        Maximal singular value.

    random_state : int or numpy.random.RandomState instance, optional
        random number generator, or seed.

    Returns
    -------
    output : numpy.ndarray, shape (p, p)
        A nonsingular matrix with the given minimal and maximal singular
        values.
    """
    random_state = check_random_state(random_state)
    diag = random_diagonal(p, v_min=sing_min, v_max=sing_max,
                           random_state=random_state)
    mat1 = random_state.randn(p, p)
    mat2 = random_state.randn(p, p)
    unitary1, _ = linalg.qr(mat1)
    unitary2, _ = linalg.qr(mat2)
    return unitary1.dot(diag).dot(unitary2.T)
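As with random_spd, the singular values of random_non_singular come straight from random_diagonal, so the extreme ones land exactly on sing_min and sing_max; a short check assuming both helpers above are in scope:

import numpy as np

mat = random_non_singular(p=4, sing_min=1., sing_max=2., random_state=0)
sing_vals = np.linalg.svd(mat, compute_uv=False)
assert np.isclose(sing_vals.min(), 1.)   # smallest singular value == sing_min
assert np.isclose(sing_vals.max(), 2.)   # largest singular value == sing_max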
Example #29
def test_kd_tree_query(metric, k, dualtree, breadth_first):
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    kwargs = METRICS[metric]
    check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs)
Example #30
    def _generate_sample(self, X, nn_data, nn_num, row, col, step):
        """Generate a synthetic sample with an additional steps for the
        categorical features.

        Each new sample is generated the same way as in SMOTE. However, the
        categorical features are mapped to the most frequent nearest neighbors
        of the majority class.
        """
        rng = check_random_state(self.random_state)
        sample = super(SMOTENC, self)._generate_sample(X, nn_data, nn_num,
                                                       row, col, step)
        # Since only a few samples are used, convert them to a dense array to
        # avoid repeated sparse/dense conversions.
        sample = (sample.toarray().squeeze()
                  if sparse.issparse(sample) else sample)
        all_neighbors = nn_data[nn_num[row]]
        all_neighbors = (all_neighbors.toarray()
                         if sparse.issparse(all_neighbors) else all_neighbors)

        categories_size = ([self.continuous_features_.size] +
                           [cat.size for cat in self.ohe_.categories_])

        for start_idx, end_idx in zip(np.cumsum(categories_size)[:-1],
                                      np.cumsum(categories_size)[1:]):
            col_max = all_neighbors[:, start_idx:end_idx].sum(axis=0)
            # tie breaking argmax
            col_sel = rng.choice(np.flatnonzero(
                np.isclose(col_max, col_max.max())))
            sample[start_idx:end_idx] = 0
            sample[start_idx + col_sel] = 1

        return sparse.csr_matrix(sample) if sparse.issparse(X) else sample
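The "tie breaking argmax" idiom used above is worth isolating: np.argmax always returns the first maximum, whereas rng.choice over np.flatnonzero(np.isclose(...)) picks uniformly among the tied positions. A minimal sketch with made-up counts:

import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)
counts = np.array([3, 7, 7, 2])
# np.argmax(counts) would always return 1; this draws uniformly among the ties.
tied = np.flatnonzero(np.isclose(counts, counts.max()))
picked = rng.choice(tied)
assert picked in (1, 2)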
Example #31
# Author: Andreas Mueller <*****@*****.**>
#         Jaques Grobler <*****@*****.**>
# License: BSD 3 clause


import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.utils import check_random_state
from sklearn import datasets

rnd = check_random_state(1)

# set up dataset
n_samples = 100
n_features = 300

# l1 data (only 5 informative features)
X_1, y_1 = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features, n_informative=5,
                                        random_state=1)

# l2 data: non sparse, but less features
y_2 = np.sign(.5 - rnd.rand(n_samples))
X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
X_2 += 5 * rnd.randn(n_samples, n_features // 5)
Example #32
def fit(self, X, y, sample_weight=None):
    """Fit the SVM model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
            For kernel="precomputed", the expected shape of X is
            (n_samples, n_samples).

        y : array-like, shape (n_samples,)
            Target values (class labels in classification, real numbers in
            regression)

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : object

        Notes
        ------
        If X and y are not C-ordered and contiguous arrays of np.float64 and
        X is not a scipy.sparse.csr_matrix, X and/or y may be copied.

        If X is a dense array, then the other methods will not support sparse
        matrices as input.
        """
    rnd = check_random_state(self.random_state)

    sparse = sp.isspmatrix(X)
    if sparse and self.kernel == "precomputed":
        raise TypeError("Sparse precomputed kernels are not supported.")
    self._sparse = sparse and not callable(self.kernel)

    X, y = check_X_y(X,
                     y,
                     dtype=np.float64,
                     order='C',
                     accept_sparse='csr',
                     accept_large_sparse=False)
    y = self._validate_targets(y)

    sample_weight = np.asarray([] if sample_weight is None else sample_weight,
                               dtype=np.float64)
    solver_type = LIBSVM_IMPL.index(self._impl)

    # input validation
    if solver_type != 2 and X.shape[0] != y.shape[0]:
        raise ValueError("X and y have incompatible shapes.\n" +
                         "X has %s samples, but y has %s." %
                         (X.shape[0], y.shape[0]))

    if self.kernel == "precomputed" and X.shape[0] != X.shape[1]:
        raise ValueError("X.shape[0] should be equal to X.shape[1]")

    if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]:
        raise ValueError("sample_weight and X have incompatible shapes: "
                         "%r vs %r\n"
                         "Note: Sparse matrices cannot be indexed w/"
                         "boolean masks (use `indices=True` in CV)." %
                         (sample_weight.shape, X.shape))

    self._gamma = _compute_gamma(self.gamma, self.kernel, X, sparse)

    kernel = self.kernel
    if callable(kernel):
        kernel = 'precomputed'

    fit = self._sparse_fit if self._sparse else self._dense_fit
    if self.verbose:  # pragma: no cover
        print('[LibSVM]', end='')

    # see comment on the other call to np.iinfo in this file
    seed = rnd.randint(np.iinfo('i').max)

    is_support_weights = daal_check_version((2020, 'P', 2), (2021, 'B', 108)) or \
        (sample_weight.size == 0 and self.class_weight is None)

    if (not sparse and not self.probability and
            not getattr(self, 'break_ties', False) and
            kernel in ['linear', 'rbf'] and is_support_weights):

        logging.info("sklearn.svm.SVC.fit: " + method_uses_daal)
        self._daal_fit = True
        _daal4py_fit(self, X, y, sample_weight, kernel)
        self.fit_status_ = 0
    else:
        logging.info("sklearn.svm.SVC.fit: " + method_uses_sklearn)
        self._daal_fit = False
        fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)

    self.shape_fit_ = X.shape

    # In binary case, we need to flip the sign of coef, intercept and
    # decision function. Use self._intercept_ and self._dual_coef_ internally.
    if not self._daal_fit:
        self._internal_intercept_ = self.intercept_.copy()
        self._internal_dual_coef_ = self.dual_coef_.copy()
    else:
        self._internal_intercept_ = self.intercept_.copy()
        self._internal_dual_coef_ = self.dual_coef_.copy()
        if len(self.classes_) == 2:
            self._internal_dual_coef_ *= -1
            self._internal_intercept_ *= -1

    if not self._daal_fit and len(
            self.classes_) == 2 and self._impl in ['c_svc', 'nu_svc']:
        self.intercept_ *= -1
        self.dual_coef_ *= -1

    return self
Example #33
    def _fit(self, X, skip_num_points=0):
        """Private function to fit the model using X as training data."""

        X = self._validate_data(X,
                                accept_sparse=['csr', 'csc', 'coo'],
                                dtype=[np.float32, np.float64])
        if self.metric == "precomputed":
            if isinstance(self.init, str) and self.init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")

            check_non_negative(
                X, "QSNE.fit(). With metric='precomputed', X "
                "should contain positive distances.")

            if issparse(X):
                raise TypeError('QSNE does not accept sparse '
                                'precomputed distance matrix.')

        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError(
                "early_exaggeration must be at least 1, but is {}".format(
                    self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        neighbors_nn = None
        # Retrieve the distance matrix, either using the precomputed one or
        # computing it.
        if self.metric == "precomputed":
            distances = X
        else:
            if self.verbose:
                print("[q-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                distances = pairwise_distances(X,
                                               metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X,
                                               metric=self.metric,
                                               n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

        # compute the joint probability distribution for the input space
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be non-negative"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "than or equal to one")

        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
        elif self.init == 'pca':
            pca = PCA(n_components=self.n_components,
                      svd_solver='randomized',
                      random_state=random_state)
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
        elif self.init == 'random':
            # The embedding is initialized with iid samples from Gaussians with
            # standard deviation 1e-4.
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        return self._qsne(P,
                          self.q,
                          n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
Example #34
    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_m_.fit(X)
            danger_index = self._in_danger_noise(
                self.nn_m_, X_class, class_sample, y, kind="danger"
            )
            if not any(danger_index):
                continue

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(
                _safe_indexing(X_class, danger_index), return_distance=False
            )[:, 1:]

            # divergence between borderline-1 and borderline-2
            if self.kind == "borderline-1":
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_samples,
                )
                if sparse.issparse(X_new):
                    X_resampled = sparse.vstack([X_resampled, X_new])
                else:
                    X_resampled = np.vstack((X_resampled, X_new))
                y_resampled = np.hstack((y_resampled, y_new))

            elif self.kind == "borderline-2":
                random_state = check_random_state(self.random_state)
                fractions = random_state.beta(10, 10)

                # only minority
                X_new_1, y_new_1 = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    int(fractions * (n_samples + 1)),
                    step_size=1.0,
                )

                # we use a one-vs-rest policy to handle the multiclass case, in
                # which new samples will be created considering not only the
                # majority class but all other classes.
                X_new_2, y_new_2 = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    _safe_indexing(X, np.flatnonzero(y != class_sample)),
                    nns,
                    int((1 - fractions) * n_samples),
                    step_size=0.5,
                )

                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack(
                        [X_resampled, X_new_1, X_new_2]
                    )
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
                y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))

        return X_resampled, y_resampled
Example #35
    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        self.X_train_ = np.copy(X) if self.copy_X_train else X

        # Encode class labels and check that it is a binary classification
        # problem
        label_encoder = LabelEncoder()
        self.y_train_ = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        if self.classes_.size > 2:
            raise ValueError("%s supports only binary classification. "
                             "y contains classes %s" %
                             (self.__class__.__name__, self.classes_))
        elif self.classes_.size == 1:
            raise ValueError(
                "{0:s} requires 2 classes; got {1:d} class".format(
                    self.__class__.__name__, self.classes_.size))

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            # First optimize starting from theta specified in kernel
            optima = [
                self._constrained_optimization(obj_func, self.kernel_.theta,
                                               self.kernel_.bounds)
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.exp(
                        self.rng.uniform(bounds[:, 0], bounds[:, 1]))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)

        _, (self.pi_, self.W_sr_, self.L_, _, _) = \
            self._posterior_mode(K, return_temporaries=True)

        return self
Example #36
def sample_Z(n_samples, n_features, random_state):
    """Samples the elements of a (n_samples, n_features) shape 
    matrix from a uniform distribution in the [-1, 1] interval."""
    random_state = check_random_state(random_state)
    return random_state.uniform(-1., 1., size=[n_samples,
                                               n_features]).astype(np.float32)
Example #37
def sparse_random_matrix(n_components, n_features, density='auto',
                         random_state=None):
    """Generalized Achlioptas random sparse matrix for random projection

    Setting density to 1 / 3 will yield the original matrix by Dimitris
    Achlioptas while setting a lower value will yield the generalization
    by Ping Li et al.

    If we note :math:`s = 1 / density`, the components of the random matrix are
    drawn from:

      - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
      -  0                              with probability 1 - 1 / s
      - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    density : float in range ]0, 1] or 'auto', optional (default='auto')
        Ratio of non-zero component in the random projection matrix.

        If density = 'auto', the value is set to the minimum density
        as recommended by Ping Li et al.: 1 / sqrt(n_features).

        Use density = 1 / 3.0 if you want to reproduce the results from
        Achlioptas, 2001.

    random_state : integer, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array or CSR matrix with shape [n_components, n_features]
        The generated random matrix.

    See Also
    --------
    SparseRandomProjection
    gaussian_random_matrix

    References
    ----------

    .. [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://www.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf

    .. [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    """
    _check_input_size(n_components, n_features)
    density = _check_density(density, n_features)
    rng = check_random_state(random_state)

    if density == 1:
        # skip index generation if totally dense
        components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1
        return 1 / np.sqrt(n_components) * components

    else:
        # Generate location of non zero elements
        indices = []
        offset = 0
        indptr = [offset]
        for i in range(n_components):
            # find the indices of the non-zero components for row i
            n_nonzero_i = rng.binomial(n_features, density)
            indices_i = sample_without_replacement(n_features, n_nonzero_i,
                                                   random_state=rng)
            indices.append(indices_i)
            offset += n_nonzero_i
            indptr.append(offset)

        indices = np.concatenate(indices)

        # Among non zero components the probability of the sign is 50%/50%
        data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1

        # build the CSR structure by concatenating the rows
        components = sp.csr_matrix((data, indices, indptr),
                                   shape=(n_components, n_features))

        return np.sqrt(1 / density) / np.sqrt(n_components) * components
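The element distribution described in the docstring (+/- sqrt(s)/sqrt(n_components) with probability 1/(2s) each, 0 with probability 1 - 1/s, where s = 1/density) can also be sampled densely with plain NumPy; a minimal sketch of that recipe, independent of the CSR construction above:

import numpy as np
from sklearn.utils import check_random_state

n_components, n_features, density = 10, 50, 1 / 3.0
s = 1.0 / density
rng = check_random_state(42)
values = np.sqrt(s) / np.sqrt(n_components) * np.array([-1.0, 0.0, 1.0])
probs = [1 / (2 * s), 1 - 1 / s, 1 / (2 * s)]
components = rng.choice(values, size=(n_components, n_features), p=probs)
# Roughly a `density` fraction of the entries should be non-zero.
print(np.mean(components != 0))   # close to 1 / 3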
Example #38
    def partial_fit(self, X, y, monitor=None, sample_weight=None, **kwargs):
        """Fit the model on a batch of training data.

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values
        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator, and a dictionary with
            {'loss': loss_value} representing the loss calculated by the
            objective function at this iteration.
            If the callable returns True the fitting procedure is stopped.
            The monitor can be used for various things such as computing
            held-out estimates, early stopping, model introspection,
            and snapshoting.
        sample_weight : numpy array of shape [n_samples,]
            Per-sample weights. Re-scale the loss per sample.
            Higher weights force the estimator to put more emphasis
            on these samples. Sample weights are normalized per-batch.

        Returns
        -------
        self : returns an instance of self.
        """

        X, y = self._check_inputs(X, y)
        assert self.batch_size > 0, "batch_size <= 0"

        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)

        # Initialize the model if it hasn't been already by a previous call.
        if self._is_fitted:
            y = self._transform_targets(y)
        else:
            self._random_state = check_random_state(self.random_state)
            self._fit_targets(y, **kwargs)
            y = self._transform_targets(y)

            self.is_sparse_ = sp.issparse(X)
            self.input_layer_sz_ = X.shape[1]

            # Set which layer transform function points to
            if self.transform_layer_index is None:
                self._transform_layer_index = len(self.hidden_units) - 1
            else:
                self._transform_layer_index = self.transform_layer_index

            if (self._transform_layer_index < -1 or
                    self._transform_layer_index >= len(self.hidden_units)):
                raise ValueError(
                    "`transform_layer_index` must be in the range "
                    "[-1, len(hidden_units)-1]!")

            # Instantiate the graph.  TensorFlow seems easier to use by just
            # adding to the default graph, and as_default lets you temporarily
            # set a graph to be treated as the default graph.
            self.graph_ = Graph()
            with self.graph_.as_default():
                tf_random_seed.set_random_seed(
                    self._random_state.randint(0, 10000000))

                tf.get_variable_scope().set_initializer(
                    tf.contrib.layers.xavier_initializer())

                self._build_tf_graph()

                # Train model parameters.
                self._session.run(tf.global_variables_initializer())

            # Set an attributed to mark this as at least partially fitted.
            self._is_fitted = True

        # Train the model with the given data.
        with self.graph_.as_default():
            n_examples = X.shape[0]
            indices = np.arange(n_examples)

            for epoch in range(self.n_epochs):
                self._random_state.shuffle(indices)
                for start_idx in range(0, n_examples, self.batch_size):
                    batch_ind = indices[start_idx:start_idx + self.batch_size]

                    if sample_weight is None:
                        batch_sample_weight = None
                    else:
                        batch_sample_weight = sample_weight[batch_ind]

                    feed_dict = self._make_feed_dict(
                        X[batch_ind],
                        y[batch_ind],
                        sample_weight=batch_sample_weight)
                    obj_val, _ = self._session.run(
                        [self._obj_func, self._train_step],
                        feed_dict=feed_dict)
                    _LOGGER.debug("objective: %.4f, epoch: %d, idx: %d",
                                  obj_val, epoch, start_idx)

                _LOGGER.info("objective: %.4f, epoch: %d, idx: %d",
                             obj_val, epoch, start_idx)

                if monitor:
                    stop_early = monitor(epoch, self, {'loss': obj_val})
                    if stop_early:
                        _LOGGER.info(
                            "stopping early due to monitor function.")
                        return self

        return self
Example #39
    def _fit_resample(self, X, y):
        self._validate_estimator()
        random_state = check_random_state(self.random_state)
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.svm_estimator_.fit(X, y)
            support_index = self.svm_estimator_.support_[
                y[self.svm_estimator_.support_] == class_sample
            ]
            support_vector = _safe_indexing(X, support_index)

            self.nn_m_.fit(X)
            noise_bool = self._in_danger_noise(
                self.nn_m_, support_vector, class_sample, y, kind="noise"
            )
            support_vector = _safe_indexing(
                support_vector, np.flatnonzero(np.logical_not(noise_bool))
            )
            danger_bool = self._in_danger_noise(
                self.nn_m_, support_vector, class_sample, y, kind="danger"
            )
            safety_bool = np.logical_not(danger_bool)

            self.nn_k_.fit(X_class)
            fractions = random_state.beta(10, 10)
            n_generated_samples = int(fractions * (n_samples + 1))
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nn_k_.kneighbors(
                    _safe_indexing(support_vector, np.flatnonzero(danger_bool)),
                    return_distance=False,
                )[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    _safe_indexing(support_vector, np.flatnonzero(danger_bool)),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_generated_samples,
                    step_size=1.0,
                )

            if np.count_nonzero(safety_bool) > 0:
                nns = self.nn_k_.kneighbors(
                    _safe_indexing(support_vector, np.flatnonzero(safety_bool)),
                    return_distance=False,
                )[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    _safe_indexing(support_vector, np.flatnonzero(safety_bool)),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_samples - n_generated_samples,
                    step_size=-self.out_step,
                )

            if (
                np.count_nonzero(danger_bool) > 0
                and np.count_nonzero(safety_bool) > 0
            ):
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack(
                        [X_resampled, X_new_1, X_new_2]
                    )
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
                y_resampled = np.concatenate(
                    (y_resampled, y_new_1, y_new_2), axis=0
                )
            elif np.count_nonzero(danger_bool) == 0:
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack([X_resampled, X_new_2])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_2))
                y_resampled = np.concatenate((y_resampled, y_new_2), axis=0)
            elif np.count_nonzero(safety_bool) == 0:
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack([X_resampled, X_new_1])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1))
                y_resampled = np.concatenate((y_resampled, y_new_1), axis=0)

        return X_resampled, y_resampled
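# Hedged usage sketch: the method above appears to mirror imbalanced-learn's
# SVM-SMOTE resampler, which can be exercised end to end as below (assumes
# imbalanced-learn is installed; the dataset is synthetic).
from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_res, y_res = SVMSMOTE(random_state=0).fit_resample(X, y)
print(Counter(y), Counter(y_res))  # the minority class is oversampled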
    def __init__(
        self,
        dimensions,
        base_estimator="gp",
        n_random_starts=None,
        n_initial_points=10,
        acq_func="gp_hedge",
        acq_optimizer="auto",
        random_state=None,
        acq_func_kwargs=None,
        acq_optimizer_kwargs=None,
    ):
        """This is nearly identical to :meth:`skopt.optimizer.optimizer.Optimizer.__init__`. It is
        recreated here to use the modified :class:`hyperparameter_hunter.space.Space`, rather than
        the original `skopt` version. This is not an ideal solution, and other options are being
        considered

        Parameters
        ----------
        dimensions: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        base_estimator: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        n_random_starts: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        n_initial_points: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        acq_func: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        acq_optimizer: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        random_state: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        acq_func_kwargs: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`
        acq_optimizer_kwargs: See :class:`skopt.optimizer.optimizer.Optimizer.__init__`"""
        # TODO: Figure out way to override skopt Optimizer's use of skopt Space without having to rewrite __init__
        self.__repeated_ask_kwargs = {}
        self.rng = check_random_state(random_state)

        # Configure acquisition function - Store and create acquisition function set
        self.acq_func = acq_func
        self.acq_func_kwargs = acq_func_kwargs

        allowed_acq_funcs = ["gp_hedge", "EI", "LCB", "PI", "EIps", "PIps"]
        if self.acq_func not in allowed_acq_funcs:
            raise ValueError(
                f"Expected `acq_func` to be in {allowed_acq_funcs}, got {self.acq_func}"
            )

        # Treat hedging method separately
        if self.acq_func == "gp_hedge":
            self.cand_acq_funcs_ = ["EI", "LCB", "PI"]
            self.gains_ = np.zeros(3)
        else:
            self.cand_acq_funcs_ = [self.acq_func]

        if acq_func_kwargs is None:
            acq_func_kwargs = dict()
        self.eta = acq_func_kwargs.get("eta", 1.0)

        # Configure counters of points - Check `n_random_starts` deprecation first
        if n_random_starts is not None:
            warnings.warn(
                ("`n_random_starts` will be removed in favour of `n_initial_points`"
                 ),
                DeprecationWarning,
            )
            n_initial_points = n_random_starts
        if n_initial_points < 0:
            raise ValueError(
                f"Expected `n_initial_points` >= 0, got {n_initial_points}")
        self._n_initial_points = n_initial_points
        self.n_initial_points_ = n_initial_points

        # Configure estimator - Build `base_estimator` if doesn't exist
        if isinstance(base_estimator, str):
            base_estimator = cook_estimator(
                base_estimator,
                space=dimensions,
                random_state=self.rng.randint(0,
                                              np.iinfo(np.int32).max),
            )

        # Check if regressor
        if not is_regressor(base_estimator) and base_estimator is not None:
            raise ValueError(
                f"`base_estimator`={base_estimator} must be a regressor")

        # Treat per second acquisition function specially
        is_multi_regressor = isinstance(base_estimator, MultiOutputRegressor)
        if "ps" in self.acq_func and not is_multi_regressor:
            self.base_estimator_ = MultiOutputRegressor(base_estimator)
        else:
            self.base_estimator_ = base_estimator

        # Configure optimizer - Decide optimizer based on gradient information
        if acq_optimizer == "auto":
            if has_gradients(self.base_estimator_):
                acq_optimizer = "lbfgs"
            else:
                acq_optimizer = "sampling"

        if acq_optimizer not in ["lbfgs", "sampling"]:
            raise ValueError(
                'Expected `acq_optimizer` to be "lbfgs" or "sampling", got {}'.
                format(acq_optimizer))
        if not has_gradients(
                self.base_estimator_) and acq_optimizer != "sampling":
            raise ValueError(
                'The regressor {} should run with `acq_optimizer`="sampling"'.
                format(type(base_estimator)))
        self.acq_optimizer = acq_optimizer

        # Record other arguments
        if acq_optimizer_kwargs is None:
            acq_optimizer_kwargs = dict()

        self.n_points = acq_optimizer_kwargs.get("n_points", 10000)
        self.n_restarts_optimizer = acq_optimizer_kwargs.get(
            "n_restarts_optimizer", 5)
        n_jobs = acq_optimizer_kwargs.get("n_jobs", 1)
        self.n_jobs = n_jobs
        self.acq_optimizer_kwargs = acq_optimizer_kwargs

        # Configure search space - Normalize space if GP regressor
        if isinstance(self.base_estimator_, GaussianProcessRegressor):
            dimensions = normalize_dimensions(dimensions)
        self.space = Space(dimensions)

        # Record categorical and non-categorical indices
        self._cat_inds = []
        self._non_cat_inds = []
        for ind, dim in enumerate(self.space.dimensions):
            if isinstance(dim, Categorical):
                self._cat_inds.append(ind)
            else:
                self._non_cat_inds.append(ind)

        # Initialize storage for optimization
        self.models = []
        self.Xi = []
        self.yi = []

        # Initialize cache for `ask` method responses
        # This ensures that multiple calls to `ask` with n_points set return same sets of points. Reset to {} at call to `tell`
        self.cache_ = {}
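# Quick illustration of how `check_random_state` (used throughout these
# examples) normalizes the `random_state` argument: an int seeds a new
# RandomState, an existing RandomState is passed through unchanged, and None
# maps to NumPy's global RandomState.
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(42)                  # int -> freshly seeded RandomState
assert isinstance(rng, np.random.RandomState)
assert check_random_state(rng) is rng         # RandomState -> returned as-is
assert isinstance(check_random_state(None), np.random.RandomState)  # None -> global RandomState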
    def initialize(self):
        """Initialize all transformer arguments, needing initialization."""
        self._graph_bins = dict()
        if not self._initialized["n_jobs"]:
            if self.n_jobs is not None:
                warnings.warn('parallelization is not implemented for GraphletSampling')
            self._initialized["n_jobs"] = True

        if not self._initialized["random_state"]:
            self.random_state_ = check_random_state(self.random_state)
            self._initialized["random_state"] = True

        if not self._initialized["k"]:
            if type(self.k) is not int:
                raise TypeError('k must be an int')

            if self.k > 10:
                warnings.warn('graphlets are too big - '
                              'computation may be slow')
            elif self.k < 3:
                raise TypeError('k must be at least 3')

            self._initialized["k"] = True

        if not self._initialized["sampling"]:
            sampling = self.sampling
            k = self.k
            if sampling is None:
                def sample_graphlets(A):
                    return sample_graphlets_all_connected(A, k)
            elif type(sampling) is dict:
                if "n_samples" in sampling:
                    # Get the number of samples
                    n_samples = sampling["n_samples"]

                    # Display a warning if arguments ignored
                    args = [arg for arg in ["delta", "epsilon", "a"]
                            if arg in sampling]
                    if len(args):
                        warnings.warn('Number of samples defined as input, '
                                      'ignoring arguments: ' + ', '.join(args))

                    # Initialise the sample graphlets function
                    sample_graphlets = sample_graphlets_probabilistic

                elif ("delta" in sampling or "epsilon" in sampling
                        or "a" in sampling):
                    # Otherwise if delta exists
                    delta = sampling.get("delta", 0.05)
                    # or epsilon
                    epsilon = sampling.get("epsilon", 0.05)
                    # or a
                    a = sampling.get("a", -1)

                    # check the fit constraints
                    if delta > 1 or delta < 0:
                        raise TypeError('delta must be in the range [0, 1]')

                    if epsilon > 1 or epsilon < 0:
                        raise TypeError('epsilon must be in the range [0, 1]')

                    if type(a) is not int:
                        raise TypeError('a must be an integer')
                    elif a == 0:
                        raise TypeError('a cannot be zero')
                    elif a < -1:
                        raise TypeError('negative a smaller than -1 have '
                                        'no meaning')

                    if(a == -1):
                        fallback_map = {1: 1, 2: 2, 3: 4, 4: 8, 5: 19, 6: 53,
                                        7: 209, 8: 1253, 9: 13599}
                        if(k > 9):
                            warnings.warn(
                                'for graphlets of this size the number of '
                                'isomorphisms is not known - interpolation '
                                'on known values will be used')
                            # Use interpolations

                            isomorphism_prediction = \
                                interp1d(list(fallback_map.keys()),
                                         list(itervalues(fallback_map)),
                                         kind='cubic')
                            a = isomorphism_prediction(k)
                        else:
                            a = fallback_map[k]

                    # and calculate number of samples
                    n_samples = math.ceil(2*(a*np.log10(2) +
                                          np.log10(1/delta))/(epsilon**2))

                    sample_graphlets = sample_graphlets_probabilistic
                else:
                    raise ValueError('sampling doesn\'t have a valid dictionary format')
            else:
                raise TypeError('sampling can either be a dictionary or None')
            self.sample_graphlets_ = sample_graphlets
            self.k_ = k
            self.n_samples_ = n_samples
        self._initialized["sampling"] = True
Exemple #42
0
    def __init__(
        self,
        cat_cols=[],
        num_cols=[],
        embedding_dims=[],
        encoding_dim=128,
        n_layer=1,
        n_encoder=1,
        noise_std=0.0,
        swap_prob=0.2,
        mask_prob=0.0,
        dropout=0.2,
        min_obs=1,
        n_epoch=10,
        batch_size=1024,
        learning_rate=0.004,
        random_state=42,
        label_encoding=True,
        pretrained_model=None,
        freeze_embedding=True,
    ):
        """Initialize a DAE (Denoising AutoEncoder) class object.

        Args:
            cat_cols (list of str): the names of categorical features to create embeddings for.
            num_cols (list of str): the names of numerical features to train embeddings with.
            embedding_dims (int or list of int): the numbers of embedding features used for columns.
            encoding_dim (int): the numbers of hidden units in encoding/decoding layers.
            n_layer (int): the numbers of the encoding/decoding layer pairs
            n_encoder (int): the numbers of encoding layers in each of the encoding/decoding pairs
            noise_std (float): standard deviation of Gaussian noise to be added to features.
            swap_prob (float): probability to add swap noise to features.
            mask_prob (float): probability to add zero masking to features.
            dropout (float): dropout probability in embedding layers
            min_obs (int): categories observed fewer times than this are grouped together before training embeddings
            n_epoch (int): the number of epochs to train a neural network with embedding layer
            batch_size (int): the size of mini-batches in model training
            learning_rate (float): learning rate in model training
            random_state (int or np.RandomState): random seed.
            label_encoding (bool): to label-encode categorical columns (True) or not (False)
            pretrained_model (DAE): a pretrained DAE/SDAE model
            freeze_embedding (bool): whether to freeze embedding layers when loading the pretrained DAE/SDAE model
        """
        assert cat_cols or num_cols
        self.cat_cols = cat_cols
        self.num_cols = num_cols

        if isinstance(embedding_dims, int):
            self.embedding_dims = [embedding_dims] * len(cat_cols)
        elif isinstance(embedding_dims, list):
            if not embedding_dims:
                self.embedding_dims = [None] * len(cat_cols)
            else:
                assert len(cat_cols) == len(embedding_dims)
                self.embedding_dims = embedding_dims
        else:
            raise ValueError("embedding_dims should be int or list")
        self.input_dims = [None] * len(self.embedding_dims)

        assert (encoding_dim > 0) and (n_layer > 0) and (n_encoder > 0)
        self.encoding_dim = encoding_dim
        self.n_layer = n_layer
        self.n_encoder = n_encoder

        assert ((0.0 <= noise_std) and (0.0 <= swap_prob < 1.0)
                and (0.0 <= mask_prob < 1.0) and (0.0 <= dropout < 1.0))
        self.noise_std = noise_std
        self.swap_prob = swap_prob
        self.mask_prob = mask_prob
        self.dropout = dropout

        assert (min_obs > 0) and (n_epoch > 0) and (batch_size > 0)
        self.min_obs = min_obs
        self.n_epoch = n_epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # Following Scikit-Learn's coding guidelines (https://scikit-learn.org/stable/developers/develop.html):
        # 1. Every keyword argument accepted by __init__ should correspond to an attribute on the instance.
        # 2. The routine should accept a keyword random_state and use this to construct a np.random.RandomState
        #    object
        self.random_state = random_state
        self.random_state_ = check_random_state(self.random_state)

        # Get an integer seed from np.random.RandomState to use it for tensorflow
        self.seed = self.random_state_.get_state()[1][0]

        self.pretrained_model = pretrained_model
        self.freeze_embedding = freeze_embedding

        self.label_encoding = label_encoding
        if self.label_encoding:
            if self.pretrained_model is not None:
                self.lbe = deepcopy(self.pretrained_model.lbe)
            else:
                self.lbe = LabelEncoder(min_obs=min_obs)
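# Hedged sketch of the seed-extraction idiom used above: `get_state()[1][0]`
# reads the first word of the Mersenne Twister state, which for a freshly
# seeded RandomState is a deterministic function of the seed. Drawing a seed
# with `randint` is a common alternative; both yield a plain int that
# downstream libraries (e.g. TensorFlow) accept.
from sklearn.utils import check_random_state

seed_from_state = int(check_random_state(42).get_state()[1][0])
seed_from_draw = int(check_random_state(42).randint(0, 2 ** 31 - 1))
print(seed_from_state, seed_from_draw)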
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_subset, n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_subset, n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_subset, n_samples, )
            If `return_indices` is `True`, an array containing the indices of
            the samples that have been selected will be returned.

        """
        self._validate_estimator()

        random_state = check_random_state(self.random_state)

        # array to know which samples are available to be taken
        samples_mask = np.ones(y.shape, dtype=bool)

        # where the different set will be stored
        idx_under = []

        n_subsets = 0
        b_subset_search = True
        while b_subset_search:
            target_stats = Counter(
                safe_indexing(y, np.flatnonzero(samples_mask)))
            # store the index of the data to under-sample
            index_under_sample = np.empty((0, ), dtype=y.dtype)
            # value which will be picked at each round
            index_constant = np.empty((0, ), dtype=y.dtype)
            for target_class in target_stats.keys():
                if target_class in self.sampling_strategy_.keys():
                    n_samples = self.sampling_strategy_[target_class]
                    # extract the data of interest for this round from the
                    # current class
                    index_class = np.flatnonzero(y == target_class)
                    index_class_interest = index_class[samples_mask[
                        y == target_class]]
                    y_class = safe_indexing(y, index_class_interest)
                    # select randomly the desired features
                    index_target_class = random_state.choice(range(
                        y_class.size),
                                                             size=n_samples,
                                                             replace=False)
                    index_under_sample = np.concatenate(
                        (index_under_sample,
                         index_class_interest[index_target_class]),
                        axis=0)
                else:
                    index_constant = np.concatenate(
                        (index_constant, np.flatnonzero(y == target_class)),
                        axis=0)

            # store the set created
            n_subsets += 1
            subset_indices = np.concatenate(
                (index_under_sample, index_constant), axis=0)
            idx_under.append(subset_indices)

            # fit and predict using cross validation
            X_subset = safe_indexing(X, subset_indices)
            y_subset = safe_indexing(y, subset_indices)
            pred = cross_val_predict(self.estimator_, X_subset, y_subset)
            # extract the prediction about the targeted classes only
            pred_target = pred[:index_under_sample.size]
            index_classified = index_under_sample[pred_target == safe_indexing(
                y_subset, range(index_under_sample.size))]
            samples_mask[index_classified] = False

            # check the stopping criterion
            if self.n_max_subset is not None:
                if n_subsets == self.n_max_subset:
                    b_subset_search = False
            # check that there is enough samples for another round
            target_stats = Counter(
                safe_indexing(y, np.flatnonzero(samples_mask)))
            for target_class in self.sampling_strategy_.keys():
                if (target_stats[target_class] <
                        self.sampling_strategy_[target_class]):
                    b_subset_search = False

        X_resampled, y_resampled = [], []
        for indices in idx_under:
            X_resampled.append(safe_indexing(X, indices))
            y_resampled.append(safe_indexing(y, indices))

        if self.return_indices:
            return (np.array(X_resampled), np.array(y_resampled),
                    np.array(idx_under))
        else:
            return np.array(X_resampled), np.array(y_resampled)
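# Minimal sketch of the core under-sampling step above: a seeded RandomState
# draws class indices without replacement (the class labels and sizes here
# are illustrative, not taken from the original code).
import numpy as np
from sklearn.utils import check_random_state

random_state = check_random_state(0)
y = np.array([0] * 90 + [1] * 10)
majority_idx = np.flatnonzero(y == 0)
picked = random_state.choice(majority_idx, size=10, replace=False)
print(np.sort(picked))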
Exemple #44
0

# In[42]:

subimages = segment_image(image)

# In[43]:

f, axes = plt.subplots(1, len(subimages), figsize=(10, 3))
for i in range(len(subimages)):
    axes[i].imshow(subimages[i], cmap="gray")

# In[44]:

from sklearn.utils import check_random_state
random_state = check_random_state(14)
letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
shear_values = np.arange(0, 0.5, 0.05)
scale_values = np.arange(0.5, 1.5, 0.1)

# In[45]:


def generate_sample(random_state=None):
    random_state = check_random_state(random_state)
    letter = random_state.choice(letters)
    shear = random_state.choice(shear_values)
    scale = random_state.choice(scale_values)
    # We use 30,30 as the image size to ensure we get all the text in the image
    return create_captcha(letter, shear=shear, size=(30, 30),
                          scale=scale), letters.index(letter)
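# Hedged usage sketch: drawing one labelled CAPTCHA sample with the seeded
# `random_state` defined above (relies on `create_captcha` and the matplotlib
# import from earlier in this notebook; the plotting call is illustrative).
image, target = generate_sample(random_state)
plt.imshow(image, cmap='Greys')
print("The target for this image is: {0}".format(letters[target]))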
Exemple #45
0
    def fit(self, X, y):
        """
        The Gaussian Process model fitting method.

        Parameters
        ----------
        X : double array_like
            An array with shape (n_samples, n_features) with the input at which
            observations were made.

        y : double array_like
            An array with shape (n_samples, ) or shape (n_samples, n_targets)
            with the observations of the output to be predicted.

        Returns
        -------
        gp : self
            A fitted Gaussian Process model object awaiting data to perform
            predictions.
        """
        # Run input checks

        self._check_params()

        self.random_state = check_random_state(self.random_state)

        # Force data to 2D numpy.array
        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
        self.y_ndim_ = y.ndim
        if y.ndim == 1:
            y = y[:, np.newaxis]

        # Check shapes of DOE & observations
        n_samples, n_features = X.shape
        _, n_targets = y.shape

        # Run input checks
        self._check_params(n_samples)

        # Normalize data or don't
        if self.normalize:
            X_mean = np.mean(X, axis=0)
            X_std = np.std(X, axis=0)
            y_mean = np.mean(y, axis=0)
            y_std = np.std(y, axis=0)
            X_std[X_std == 0.] = 1.
            y_std[y_std == 0.] = 1.
            # center and scale X if necessary
            X = (X - X_mean) / X_std
            y = (y - y_mean) / y_std
        else:
            X_mean = np.zeros(1)
            X_std = np.ones(1)
            y_mean = np.zeros(1)
            y_std = np.ones(1)

        # Calculate matrix of distances D between samples
        D, ij = l1_cross_distances(X)
        if (np.min(np.sum(D, axis=1)) == 0. and self.corr != pure_nugget):
            raise Exception("Multiple input features cannot have the same"
                            " target value.")

        # Regression matrix and parameters
        F = self.regr(X)
        n_samples_F = F.shape[0]
        if F.ndim > 1:
            p = F.shape[1]
        else:
            p = 1
        if n_samples_F != n_samples:
            raise Exception("Number of rows in F and X do not match. Most "
                            "likely something is going wrong with the "
                            "regression model.")
        if p > n_samples_F:
            raise Exception(("Ordinary least squares problem is undetermined "
                             "n_samples=%d must be greater than the "
                             "regression model size p=%d.") % (n_samples, p))
        if self.beta0 is not None:
            if self.beta0.shape[0] != p:
                raise Exception("Shapes of beta0 and F do not match.")

        # Set attributes
        self.X = X
        self.y = y
        self.D = D
        self.ij = ij
        self.F = F
        self.X_mean, self.X_std = X_mean, X_std
        self.y_mean, self.y_std = y_mean, y_std

        # Determine Gaussian Process model parameters
        if self.thetaL is not None and self.thetaU is not None:
            # Maximum Likelihood Estimation of the parameters
            if self.verbose:
                print("Performing Maximum Likelihood Estimation of the "
                      "autocorrelation parameters...")
            self.theta_, self.reduced_likelihood_function_value_, par = \
                self._arg_max_reduced_likelihood_function()
            if np.isinf(self.reduced_likelihood_function_value_):
                raise Exception("Bad parameter region. "
                                "Try increasing upper bound")

        else:
            # Given parameters
            if self.verbose:
                print("Given autocorrelation parameters. "
                      "Computing Gaussian Process model parameters...")
            par = {}
            self.theta_ = self.theta0
            self.reduced_likelihood_function_value_, par = \
                self.log_likelihood_function(self.theta_, par)
            if np.isinf(self.reduced_likelihood_function_value_):
                raise Exception("Bad point. Try increasing theta0.")

        self.noise_var = par['noise_var']
        self.sigma2 = par['sigma2']
        self.rho = par['rho']
        self.Yt = par['Yt']
        self.C = par['C']
        self.Ft = par['Ft']
        self.G = par['G']
        self.Q = par['Q']

        # compute for beta and gamma
        self.compute_beta_gamma()

        return self
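# Small sketch of the normalization guard used in `fit` above: columns with
# zero standard deviation get their scale forced to 1 so the centering and
# scaling step never divides by zero (the data here is illustrative).
import numpy as np

X = np.array([[1.0, 5.0], [2.0, 5.0], [3.0, 5.0]])
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
X_std[X_std == 0.0] = 1.0
print((X - X_mean) / X_std)  # the constant column stays at 0 instead of NaN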
Exemple #46
0
def compute_neighbors_umap(
    X: Union[np.ndarray, csr_matrix],
    n_neighbors: int,
    random_state: Optional[Union[int, RandomState]] = None,
    metric: Union[str, Metric] = 'euclidean',
    metric_kwds: Mapping[str, Any] = {},
    angular: bool = False,
    verbose: bool = False,
):
    """This is from umap.fuzzy_simplicial_set [McInnes18]_.

    Given a set of data X, a neighborhood size, and a measure of distance
    compute the fuzzy simplicial set (here represented as a fuzzy graph in
    the form of a sparse matrix) associated to the data. This is done by
    locally approximating geodesic distance at each point, creating a fuzzy
    simplicial set for each such point, and then combining all the local
    fuzzy simplicial sets into a global one via a fuzzy union.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to be modelled as a fuzzy simplicial set.
    n_neighbors
        The number of neighbors to use to approximate geodesic distance.
        Larger numbers induce more global estimates of the manifold that can
        miss finer detail, while smaller values will focus on fine manifold
        structure to the detriment of the larger picture.
    random_state
        A state capable being used as a numpy random state.
    metric
        The metric to use to compute distances in high dimensional space.
        If a string is passed it must match a valid predefined metric. If
        a general metric is required a function that takes two 1d arrays and
        returns a float can be provided. For performance purposes it is
        required that this be a numba jit'd function. Valid string metrics
        include:
            * euclidean
            * manhattan
            * chebyshev
            * minkowski
            * canberra
            * braycurtis
            * mahalanobis
            * wminkowski
            * seuclidean
            * cosine
            * correlation
            * haversine
            * hamming
            * jaccard
            * dice
            * russelrao
            * kulsinski
            * rogerstanimoto
            * sokalmichener
            * sokalsneath
            * yule
        Metrics that take arguments (such as minkowski, mahalanobis etc.)
        can have arguments passed via the metric_kwds dictionary. At this
        time care must be taken and dictionary elements must be ordered
        appropriately; this will hopefully be fixed in the future.
    metric_kwds
        Arguments to pass on to the metric, such as the ``p`` value for
        Minkowski distance.
    angular
        Whether to use angular/cosine distance for the random projection
        forest for seeding NN-descent to determine approximate nearest
        neighbors.
    verbose
        Whether to report information on the current progress of the algorithm.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from umap.umap_ import nearest_neighbors

    random_state = check_random_state(random_state)

    knn_indices, knn_dists, forest = nearest_neighbors(
        X,
        n_neighbors,
        random_state=random_state,
        metric=metric,
        metric_kwds=metric_kwds,
        angular=angular,
        verbose=verbose,
    )

    return knn_indices, knn_dists, forest
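# Hedged usage sketch for the helper above (assumes a umap-learn version
# compatible with the `nearest_neighbors` call; the data is random and only
# the output shapes are of interest).
import numpy as np

X = np.random.RandomState(0).normal(size=(100, 5))
knn_indices, knn_dists, forest = compute_neighbors_umap(X, n_neighbors=15, random_state=0)
print(knn_indices.shape, knn_dists.shape)  # (100, 15) (100, 15)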
Exemple #47
0
    print "Error rate: %f" % (y != clf.predict(X)).mean()


@timeit
def sklearn_BNB(X, y):
    clf = BernoulliNB_SK(alpha=1, fit_prior=False)
    clf.fit(X, y)
    print "Error rate: %f" % (y != clf.predict(X)).mean()


if __name__ == '__main__':

    # Generate random data samples.
    m = 3000
    n = 500
    rs = check_random_state(seed=None)
    X = rs.randint(2, size=(m, n))
    y = rs.randint(2, size=(m, ))
    print("========== Start =========")
    print("BernouliNB:")
    clf = BernoulliNB()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print("Error rate: %f" % (y.reshape(y_pred.shape) != y_pred).mean())

    print("Comparison with sklearn implementation:")
    sklearn_BNB(X, y)

    print("\nMultinomialNB:")
    clf = MultinomialNB()
    clf.fit(X, y)
    def fit(self, X, y, sample_weight=None, monitor=None):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights given to each sample. If omitted, all samples have weight 1.

        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator and the local variables of
            ``_fit_stages`` as keyword arguments ``callable(i, self,
            locals())``. If the callable returns ``True`` the fitting procedure
            is stopped. The monitor can be used for various things such as
            computing held-out estimates, early stopping, model introspection,
            and snapshotting.

        Returns
        -------
        self : object
            Returns self.
        """
        X, event, time = check_arrays_survival(
            X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
        n_samples, self.n_features_ = X.shape

        X = X.astype(DTYPE)
        sample_weight_is_none = sample_weight is None
        if sample_weight_is_none:
            sample_weight = numpy.ones(n_samples, dtype=numpy.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)

        check_consistent_length(X, sample_weight)

        self._check_params()

        if isinstance(self.loss_,
                      (CensoredSquaredLoss, IPCWLeastSquaresError)):
            time = numpy.log(time)

        self._init_state()
        if sample_weight_is_none:
            self.init_.fit(X, (event, time))
        else:
            self.init_.fit(X, (event, time), sample_weight)

        raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)
        begin_at_stage = 0

        # The rng state must be preserved if warm_start is True
        self._rng = check_random_state(self.random_state)

        X_idx_sorted = None

        # fit the boosting stages
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', bool),
                                  ('time', numpy.float64)])
        n_stages = self._fit_stages(X, y, raw_predictions, sample_weight,
                                    self._rng, begin_at_stage, monitor,
                                    X_idx_sorted)
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
            self.train_score_ = self.train_score_[:n_stages]
            if hasattr(self, 'oob_improvement_'):
                self.oob_improvement_ = self.oob_improvement_[:n_stages]

        self.n_estimators_ = n_stages
        return self
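# Hedged sketch of a `monitor` callback for the fit above: it receives the
# iteration index, the estimator, and the local variables of `_fit_stages`,
# and fitting stops once it returns True. The 50-iteration cap and the
# `model` variable in the commented call are illustrative assumptions.
def stop_after_50_iterations(i, estimator, local_vars):
    return i >= 50

# model.fit(X, y, monitor=stop_after_50_iterations)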
Exemple #49
0
# Let's review another modelling method. Here we do not want to predict a characteristic of borrowers; rather, we want to form groups of individuals with similar characteristics. This technique is part of Unsupervised Learning - Clustering.
#
# Keeping things simple, I will use three variables from our data to form groups.
# 1. Loan amount (Variable name - loanamt; continuous)
# 2. Total monthly income of the applicant (Variable name - atotinc; continuous)
# 3. Purchase price in thousand (Variable name - price; continuous)

# In[14]:

from sklearn.cluster import KMeans
from sklearn.utils import check_random_state

cluster_data = data[['loanamt', 'atotinc', 'price']]

results = KMeans(n_clusters=8,
                 random_state=check_random_state(42)).fit(cluster_data)

# List for all cluster labels
cluster_labels = pd.DataFrame(results.labels_.astype(int),
                              columns=['Clusters'])
scatter_data = cluster_data.join(cluster_labels, how='inner')
scatter_data.head()

# ## Displaying the Results in a 3D plot

# In[15]:

# 3D Scatter plots
from mpl_toolkits.mplot3d import Axes3D

plt.close('all')
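# Hedged sketch of the 3D scatter hinted at above (column names follow the
# `scatter_data` frame built earlier; the styling choices are illustrative).
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(scatter_data['loanamt'], scatter_data['atotinc'],
           scatter_data['price'], c=scatter_data['Clusters'], cmap='viridis')
ax.set_xlabel('loanamt')
ax.set_ylabel('atotinc')
ax.set_zlabel('price')
plt.show()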
Exemple #50
0
def estimate_sigma(
    X: np.ndarray,
    subsample: Optional[int] = None,
    method: str = "median",
    percent: Optional[float] = 0.15,
    scale: float = 1.0,
    random_state: Optional[int] = None,
) -> float:
    """A function to provide a reasonable estimate of the sigma values
    for the RBF kernel using different methods. 
    Parameters
    ----------
    X : array, (n_samples, d_dimensions)
        The data matrix to be estimated.

    method : str, default: 'median'
        different methods used to estimate the sigma for the rbf kernel
        matrix.
        * Mean
        * Median
        * Silverman
        * Scott - very common for density estimation

    percent : float, default=0.15
        The kth percentage of distance chosen

    scale : float, default=1.0
        Option to scale the sigma chosen. Typically used with the
        median or mean method as they are data dependent.
    
    random_state : int, (default: None)
        controls the seed for the subsamples drawn to represent
        the data distribution

    Returns
    -------
    sigma : float
        The estimated sigma value
        
    Resources
    ---------
    - Original MATLAB function: https://goo.gl/xYoJce

    Information
    -----------
    Author : J. Emmanuel Johnson
    Email  : [email protected]
           : [email protected]
    Date   : 6 - July - 2018
    """
    X = check_array(X, ensure_2d=True)

    rng = check_random_state(random_state)

    # subsampling
    [n_samples, d_dimensions] = X.shape

    if subsample is not None:
        X = rng.permutation(X)[:subsample, :]
    # print(method, percent)
    if method == "mean" and percent is None:
        sigma = np.mean(pdist(X))

    elif method == "mean" and percent is not None:
        kth_sample = int(percent * n_samples)
        sigma = np.mean(np.sort(squareform(pdist(X)))[:, kth_sample])

    elif method == "median" and percent is None:
        sigma = np.median(pdist(X))

    elif method == "median" and percent is not None:
        kth_sample = int(percent * n_samples)
        sigma = np.median(np.sort(squareform(pdist(X)))[:, kth_sample])

    elif method == "silverman":
        sigma = np.power(n_samples * (d_dimensions + 2.0) / 4.0,
                         -1.0 / (d_dimensions + 4))

    elif method == "scott":
        sigma = np.power(n_samples, -1.0 / (d_dimensions + 4))

    else:
        raise ValueError('Unrecognized mode "{}".'.format(method))

    # scale the sigma by a factor
    if scale is not None:
        sigma *= scale

    # return sigma
    return sigma
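# Hedged usage sketch for `estimate_sigma` above: comparing two of the
# supported methods on random data (the printed values depend on the draw).
import numpy as np

X = np.random.RandomState(0).randn(200, 3)
print(estimate_sigma(X, method="median"))
print(estimate_sigma(X, method="silverman"))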
Exemple #51
0
    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 n_shapelets=10,
                 min_shapelet_size=0,
                 max_shapelet_size=1,
                 metric='euclidean',
                 metric_params=None,
                 force_dim=None,
                 random_state=None):
        """A shapelet decision tree regressor

        :param max_depth: The maximum depth of the tree. If `None` the
           tree is expanded until all leafs are pure or until all
           leafs contain less than `min_samples_split` samples
           (default: None).

        :param min_samples_split: The minimum number of samples to
           split an internal node (default: 2).

        :param n_shapelets: The number of shapelets to sample at each
           node (default: 10).

        :param min_shapelet_size: The minimum length of a sampled
           shapelet expressed as a fraction, computed as
           `max(ceil(X.shape[-1] * min_shapelet_size), 2)` (default:
           0).

        :param max_shapelet_size: The maximum length of a sampled
           shapelet, expressed as a fraction and computed as
           `ceil(X.shape[-1] * max_shapelet_size)`.

        :param metric: Distance metric used to identify the best
           match. (default: `'euclidean'`)

        :param metric_params: Parameters to the distance measure

        :param force_dim: Force the number of dimensions (default:
           None). If `int`, `force_dim` reshapes the input to the
           shape `[n_samples, force_dim, -1]` to support the
           `BaggingClassifier` interface.

        :param random_state: If `int`, `random_state` is the seed used
           by the random number generator; If `RandomState` instance,
           `random_state` is the random number generator; If `None`,
           the random number generator is the `RandomState` instance
           used by `np.random`.

        """
        if min_shapelet_size < 0 or min_shapelet_size > max_shapelet_size:
            raise ValueError(
                "`min_shapelet_size` {0} <= 0 or {0} > {1}".format(
                    min_shapelet_size, max_shapelet_size))
        if max_shapelet_size > 1:
            raise ValueError(
                "`max_shapelet_size` {0} > 1".format(max_shapelet_size))

        self.max_depth = max_depth or 2**31
        self.min_samples_split = min_samples_split
        self.random_state = check_random_state(random_state)
        self.n_shapelets = n_shapelets
        self.min_shapelet_size = min_shapelet_size
        self.max_shapelet_size = max_shapelet_size
        self.metric = metric
        self.metric_params = metric_params
        self.force_dim = force_dim
def get_hcp_2dpatches_SegmentationHR(extract_pourcent, patch_size=128, n_patches=1000, data=None):
  
    (T1s,T2s,T3s,T4s,masks) = data
	
    n_images = len(T2s)
    T1_patches = None
    T2_patches = None
    T3_patches= None
    T4_patches = None

    T1 = []
    T2 = []
    T3 = []
    T4 = []

    mask_extract = extract_pourcent
    patch_shape = (patch_size,patch_size)
    random_state = None

    for i in tqdm(range(n_images)):
		
        #Normalize data using mask
        T2_norm = array_normalization(X=T2s[i],M=masks[i],norm=0)
        mask = masks[i]
        T1_norm = T1s[i]
        T3_norm= T3s[i]
        T4_norm= T4s[i]
		

        for j in range(T1_norm.shape[2]): #Loop over the slices
            pT1 = extract_patches(T1_norm[:,:,j], patch_shape, extraction_step = 1)
            pT2 = extract_patches(T2_norm[:,:,j], patch_shape, extraction_step = 1)
            pT3 = extract_patches(T3_norm[:,:,j], patch_shape, extraction_step = 1)
            pT4 = extract_patches(T4_norm[:,:,j], patch_shape, extraction_step = 1)

            pmask = extract_patches(mask[:,:,j], patch_shape, extraction_step = 1)
            rng = check_random_state(random_state)
            i_s = rng.randint(T1_norm.shape[0] - patch_shape[0] + 1, size = n_patches)
            j_s = rng.randint(T1_norm.shape[1] - patch_shape[1] + 1, size = n_patches)
            pT1 = pT1[i_s, j_s]
            pT2 = pT2[i_s, j_s]
            pT3 = pT3[i_s, j_s]
            pT4 = pT4[i_s, j_s]

            pmask = pmask[i_s, j_s]

            #Channel last
            pT1 = pT1.reshape(-1, patch_shape[0], patch_shape[1])
            pT2 = pT2.reshape(-1, patch_shape[0], patch_shape[1])

            pT3 = pT3.reshape(-1, patch_shape[0], patch_shape[1])
            pT4 = pT4.reshape(-1, patch_shape[0], patch_shape[1])

            pmask = pmask.reshape(-1, patch_shape[0], patch_shape[1])
            pmask = pmask.reshape(pmask.shape[0],-1)

            # Remove patches with insufficient mask coverage (< extract_pourcent)
            pmT1 = pT1[ np.mean(pmask,axis=1)>=mask_extract ]
            pmT2 = pT2[ np.mean(pmask,axis=1)>=mask_extract ]
            pmT3 = pT3[ np.mean(pmask,axis=1)>=mask_extract ]
            pmT4 = pT4[ np.mean(pmask,axis=1)>=mask_extract ]
			

            T1.append(pmT1)
            T2.append(pmT2)
            T3.append(pmT3)
            T4.append(pmT4)

    T1_patches = np.concatenate(T1,axis=0)
    T2_patches = np.concatenate(T2,axis=0) 
    T3_patches = np.concatenate(T3,axis=0)
    T4_patches = np.concatenate(T4,axis=0)        

    return (T1_patches,T2_patches,T3_patches,T4_patches)
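# Small sketch of the random patch-corner sampling used above: a seeded
# RandomState draws `n_patches` top-left coordinates inside the valid range
# (the image size and patch size here are illustrative).
import numpy as np
from sklearn.utils import check_random_state

image_shape, patch_shape, n_patches = (64, 64), (16, 16), 5
rng = check_random_state(0)
i_s = rng.randint(image_shape[0] - patch_shape[0] + 1, size=n_patches)
j_s = rng.randint(image_shape[1] - patch_shape[1] + 1, size=n_patches)
print(list(zip(i_s, j_s)))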
		
Exemple #53
0
    def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
        """Build a Sequentially Bootstrapped Bagging ensemble of estimators from the training
           set (X, y).
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        max_samples : int or float, optional (default=None)
            Argument to use instead of self.max_samples.
        max_depth : int, optional (default=None)
            Override value used when constructing base estimator. Only
            supported if the base estimator has a max_depth parameter.
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.
        Returns
        -------
        self : object
        """
        random_state = check_random_state(self.random_state)
        self.X_time_index = X.index  # Remember X index for future sampling

        # Generate subsample ind_matrix (we need this during subsampling cross_validation)
        subsampled_ind_mat = self.ind_mat[:, self.timestamp_int_index_mapping.
                                          loc[self.X_time_index]]

        # Convert data (X is required to be 2d and indexable)
        X, y = check_X_y(X,
                         y, ['csr', 'csc'],
                         dtype=None,
                         force_all_finite=False,
                         multi_output=True)
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            check_consistent_length(y, sample_weight)

        # Remap output
        n_samples, self.n_features_ = X.shape
        self._n_samples = n_samples
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        # Validate max_samples
        if max_samples is None:
            max_samples = self.max_samples
        if not isinstance(max_samples, (numbers.Integral, np.integer)):
            max_samples = int(max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        # Store validated integer row sampling value
        self._max_samples = max_samples

        # Validate max_features
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        elif isinstance(self.max_features, float):
            max_features = self.max_features * self.n_features_
        else:
            raise ValueError("max_features must be int or float")

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        max_features = max(1, int(max_features))

        # Store validated integer feature sampling value
        self._max_features = max_features

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available"
                             " if warm_start=False")

        if not self.warm_start or not hasattr(self, 'estimators_'):
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_features_ = []
            self.sequentially_bootstrapped_samples_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs)
        total_n_estimators = sum(n_estimators)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        seeds = random_state.randint(MAX_INT, size=n_more_estimators)
        self._seeds = seeds

        # pylint: disable=C0330
        all_results = Parallel(
            n_jobs=n_jobs,
            verbose=self.verbose,
        )(delayed(_parallel_build_estimators)(n_estimators[i],
                                              self,
                                              X,
                                              y,
                                              subsampled_ind_mat,
                                              sample_weight,
                                              seeds[starts[i]:starts[i + 1]],
                                              total_n_estimators,
                                              verbose=self.verbose)
          for i in range(n_jobs))

        # Reduce
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.sequentially_bootstrapped_samples_ += list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        if self.oob_score:
            self._set_oob_score(X, y)

        return self
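# Small sketch of the per-estimator seeding pattern above: the parent
# RandomState draws one integer seed per estimator, and advancing it by the
# number of already-fitted estimators reproduces the warm-start behaviour
# (MAX_INT mirrors the module-level constant used in the method).
import numpy as np
from sklearn.utils import check_random_state

MAX_INT = np.iinfo(np.int32).max
rng = check_random_state(7)
rng.randint(MAX_INT, size=3)                         # seeds of the first 3 estimators
seeds_warm_start = rng.randint(MAX_INT, size=2)      # 2 more on warm start

rng2 = check_random_state(7)
rng2.randint(MAX_INT, size=3)                        # skip the first 3 draws
assert np.array_equal(rng2.randint(MAX_INT, size=2), seeds_warm_start)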
Exemple #54
0
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a shapelet tree classifier from the training set (X, y)

        :param X: array-like, shape `[n_samples, n_timesteps]` or
           `[n_samples, n_dimensions, n_timesteps]`. The training time
           series.

        :param y: array-like, shape `[n_samples, n_classes]` or
           `[n_classes]`. Target values (class labels) as integers or
           strings.

        :param sample_weight: If `None`, then samples are equally
            weighted. Splits that would create child nodes with net
            zero or negative weight are ignored while searching for a
            split in each node. Splits are also ignored if they would
            result in any single class carrying a negative weight in
            either child node.

        :param check_input: Allows bypassing several input checks.
            Don't use this parameter unless you know what you are doing.

        :returns: `self`

        """
        random_state = check_random_state(self.random_state)

        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")
            y = check_array(y, ensure_2d=False)

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimensions")

        n_samples = X.shape[0]
        if isinstance(self.force_dim, int):
            X = np.reshape(X, [n_samples, self.force_dim, -1])

        n_timesteps = X.shape[-1]

        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1

        if y.ndim == 1:
            self.classes_, y = np.unique(y, return_inverse=True)
        else:
            _, y = np.nonzero(y)
            if len(y) != n_samples:
                raise ValueError("Single label per sample expected.")
            self.classes_ = np.unique(y)

        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        if not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=np.intp)

        metric_params = self.metric_params
        if self.metric_params is None:
            metric_params = {}

        distance_measure = DISTANCE_MEASURE[self.metric](n_timesteps,
                                                         **metric_params)

        max_shapelet_size = int(n_timesteps * self.max_shapelet_size)
        min_shapelet_size = int(n_timesteps * self.min_shapelet_size)

        if min_shapelet_size < 2:
            min_shapelet_size = 2
        min_sample_split = self.min_samples_split
        self.n_classes_ = len(self.classes_)
        self.n_timestep_ = n_timesteps
        self.n_dims_ = n_dims

        tree_builder = ClassificationShapeletTreeBuilder(
            self.n_shapelets,
            min_shapelet_size,
            max_shapelet_size,
            self.max_depth,
            min_sample_split,
            distance_measure,
            X,
            y,
            sample_weight,
            random_state,
            self.n_classes_,
        )

        self.root_node_ = tree_builder.build_tree()
        return self
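# Numeric sketch of the shapelet-size conversion performed in `fit` above:
# the fractional bounds are scaled by the number of timesteps and the lower
# bound is clamped to 2 (the values here are illustrative).
n_timesteps, min_frac, max_frac = 100, 0.0, 1.0
max_shapelet_size = int(n_timesteps * max_frac)
min_shapelet_size = max(int(n_timesteps * min_frac), 2)
print(min_shapelet_size, max_shapelet_size)  # 2 100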
Exemple #55
0
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        if self.kind not in SMOTE_KIND:
            raise ValueError('Unknown kind for SMOTE algorithm.'
                             ' Choices are {}. Got {} instead.'.format(
                                 SMOTE_KIND, self.kind))

        random_state = check_random_state(self.random_state)

        # Define the number of samples to create.
        # We handle only two-class problems for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            self.logger.debug('Finding the %s nearest neighbours ...',
                              self.nn_k_.n_neighbors - 1)

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.
            self.nn_k_.fit(X_min)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.
            nns = self.nn_k_.kneighbors(X_min, return_distance=False)[:, 1:]

            self.logger.debug('Create synthetic samples ...')

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns,
                                              num_samples, 1.0)

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            self.logger.debug('Finding the %s nearest neighbours ...',
                              self.nn_m_.n_neighbors - 1)

            # Find the NNs for all samples in the data set.
            self.nn_m_.fit(X)

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                self.logger.debug('There are no samples in danger. No'
                                  ' borderline synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we get here, it is because some samples are in danger; we need to
            # find the NNs among the minority class to create the new synthetic
            # samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nn_k_.fit(X_min)

            # k nearest neighbours of the minority samples in danger
            nns = self.nn_k_.kneighbors(X_min[danger_index],
                                        return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                  self.min_c_, X_min, nns,
                                                  num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                return X_resampled, y_resampled

            else:
                # Borderline-2: split the synthetic samples between
                # minority-only interpolation (as in type 1) and interpolation
                # towards the majority class with a reduced step size.
                # The fraction is sampled from a beta distribution centred
                # on 0.5 with variance ~0.01.
                fractions = random_state.beta(10, 10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X_min,
                                                      nns,
                                                      int(fractions *
                                                          (num_samples + 1)),
                                                      step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X[y != self.min_c_],
                                                      nns,
                                                      int((1 - fractions) *
                                                          num_samples),
                                                      step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # SVM-SMOTE fits a support vector machine classifier to the data
            # and uses its support vectors to provide a notion of the class
            # boundary, unlike regular SMOTE, where that notion relies on the
            # proportion of nearest neighbours belonging to each class.

            # Fit the SVM to the full data set.
            self.svm_estimator_.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_estimator_.support_[y[
                self.svm_estimator_.support_] == self.min_c_]
            support_vector = X[support_index]

            # First, find the nearest neighbours of all samples to identify
            # the ones in danger and the noisy ones.
            self.logger.debug('Finding the %s nearest neighbours ...',
                              self.nn_m_.n_neighbors - 1)

            # As usual, fit a nearest neighbour model to the data
            self.nn_m_.fit(X)

            # Now, get rid of noisy support vectors
            noise_bool = self._in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector,
                                                y,
                                                kind='danger')
            safety_bool = np.logical_not(danger_bool)

            self.logger.debug(
                'Out of %s support vectors, %s are noisy, '
                '%s are in danger '
                'and %s are safe.', support_vector.shape[0],
                noise_bool.sum().astype(int),
                danger_bool.sum().astype(int),
                safety_bool.sum().astype(int))

            # Find the support vectors' nearest neighbours among the minority
            # class.
            self.logger.debug('Finding the %s nearest neighbours ...',
                              self.nn_k_.n_neighbors - 1)

            self.nn_k_.fit(X_min)

            self.logger.debug('Create synthetic samples ...')

            # Split the number of synthetic samples between interpolation and
            # extrapolation.
            # The fraction is sampled from a beta distribution with mean 0.5
            # and variance ~0.01.
            fractions = random_state.beta(10, 10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nn_k_.kneighbors(support_vector[danger_bool],
                                            return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nn_k_.kneighbors(support_vector[safety_bool],
                                            return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0
                    and np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # No support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            return X_resampled, y_resampled
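
A minimal usage sketch of the sampler above. The call is hedged: the `kind` parameter and `fit_sample` belong to the older imbalanced-learn releases this method was taken from (newer releases expose `fit_resample` and a separate `BorderlineSMOTE` class), so treat the names as assumptions tied to that older API.

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # API names assumed for an older release

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
smote = SMOTE(kind='borderline1', random_state=0)
X_res, y_res = smote.fit_sample(X, y)  # fit_resample in newer releases
print(np.bincount(y_res))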
Exemple #56
0
    def _make_samples(self,
                      X,
                      y_type,
                      nn_data,
                      nn_num,
                      n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the new samples will be created.

        y_type : str or int
            The minority target value, used to build the target vector
            returned for the synthetic samples.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used.

        nn_num : ndarray, shape (n_samples, k_nearest_neighbours)
            Indices (into ``nn_data``) of the nearest neighbours of each
            sample in ``X``.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)
        # Check the random state
        random_state = check_random_state(self.random_state)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # # Set seeds
        # seeds = random_state.randint(low=0,
        #                              high=100 * len(nn_num.flatten()),
        #                              size=n_samples)

        # Randomly pick entries of the flattened NN matrix; each entry
        # selects a (seed sample, neighbour) pair.
        samples = random_state.randint(low=0,
                                       high=len(nn_num.flatten()),
                                       size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # Rows of the NN matrix correspond to the original samples,
            # columns to their nearest neighbours.
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size in (0, 1), scaled by step_size,
            # towards the selected nearest neighbour.
            # if self.random_state is None:
            #     np.random.seed(seeds[i])
            # else:
            #     np.random.seed(self.random_state)
            step = step_size * random_state.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        self.logger.info('Generated %s new samples ...', len(X_new))

        return X_new, y_new
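
The core of the loop above is the interpolation X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]). A tiny self-contained NumPy check (variable names are local to this sketch) shows that, for a step in (0, 1), the synthetic point is a convex combination of the seed point and its neighbour:

import numpy as np

rng = np.random.RandomState(0)
x = np.array([0.0, 0.0])       # seed minority sample
x_nn = np.array([2.0, 2.0])    # one of its nearest neighbours
step = 1.0 * rng.uniform()     # step_size * U(0, 1), as in _make_samples
x_new = x - step * (x - x_nn)  # same update as in the loop above
# Equivalent to (1 - step) * x + step * x_nn, so it lies on the segment
# between x and x_nn.
assert np.all(x_new >= x) and np.all(x_new <= x_nn)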
Exemple #57
0
def gp_minimize(func, dimensions, base_estimator=None,
                n_calls=100, n_random_starts=10,
                acq_func="gp_hedge", acq_optimizer="auto", x0=None, y0=None,
                random_state=None, verbose=False, callback=None,
                n_points=10000, n_restarts_optimizer=5, xi=0.01, kappa=1.96,
                noise="gaussian", n_jobs=1, model_queue_size=None):
    """Bayesian optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever.

    The idea is to approximate the function using a Gaussian process.
    In other words, the function values are assumed to follow a multivariate
    Gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice for the next parameter
    to evaluate can then be made by optimizing the acquisition function
    over the Gaussian prior, which is much quicker to evaluate.

    The total number of evaluations, `n_calls`, are performed like the
    following. If `x0` is provided but not `y0`, then the elements of `x0`
    are first evaluated, followed by `n_random_starts` evaluations.
    Finally, `n_calls - len(x0) - n_random_starts` evaluations are
    made guided by the surrogate model. If `x0` and `y0` are both
    provided then `n_random_starts` evaluations are first made then
    `n_calls - n_random_starts` subsequent evaluations are made
    guided by the surrogate model.

    Parameters
    ----------
    func : callable
        Function to minimize. Should take a single list of parameters
        and return the objective value.
    
        If you have a search-space where all dimensions have names,
        then you can use :func:`skopt.utils.use_named_args` as a decorator
        on your objective function, in order to call it directly
        with the named arguments. See `use_named_args` for an example.

    dimensions : list, shape (n_dims,)
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(lower_bound, upper_bound)` tuple (for `Real` or `Integer`
          dimensions),

        - a `(lower_bound, upper_bound, "prior")` tuple (for `Real`
          dimensions),

        - as a list of categories (for `Categorical` dimensions), or

        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

        .. note:: The upper and lower bounds are inclusive for `Integer`
           dimensions.

    base_estimator : a Gaussian process estimator
        The Gaussian process estimator to use for optimization.
        By default, a Matern kernel is used with the following
        hyperparameters tuned.

        - All the length scales of the Matern kernel.

        - The covariance amplitude that each element is multiplied with.

        - Noise that is added to the Matern kernel. The noise is assumed
          to be iid Gaussian.

    n_calls : int, default=100
        Number of calls to `func`.

    n_random_starts : int, default=10
        Number of evaluations of `func` with random points before
        approximating it with `base_estimator`.

    acq_func : string, default=`"gp_hedge"`
        Function to minimize over the Gaussian prior. Can be either

        - `"LCB"` for lower confidence bound.

        - `"EI"` for negative expected improvement.

        - `"PI"` for negative probability of improvement.

        - `"gp_hedge"` Probabilistically choose one of the above three
          acquisition functions at every iteration. The weight
          given to these gains can be set by :math:`\eta` through
          `acq_func_kwargs`.

            - The gains `g_i` are initialized to zero.

            - At every iteration,

                - Each acquisition function is optimised independently to
                  propose a candidate point `X_i`.

                - Out of all these candidate points, the next point `X_best` is
                  chosen by :math:`softmax(\eta g_i)`

                - After fitting the surrogate model with `(X_best, y_best)`,
                  the gains are updated such that :math:`g_i -= \mu(X_i)`

        - `"EIps"` for negated expected improvement per second to take into
          account the function compute time. Then, the objective function is
          assumed to return two values, the first being the objective value and
          the second being the time taken in seconds.

        - `"PIps"` for negated probability of improvement per second. The
          return type of the objective function is assumed to be similar to
          that of `"EIps"`.

    acq_optimizer : string, `"sampling"` or `"lbfgs"`, default=`"auto"`
        Method to minimize the acquisition function. The fitted model
        is updated with the optimal value obtained by optimizing `acq_func`
        with `acq_optimizer`.

        The `acq_func` is computed at `n_points` randomly sampled points.

        - If set to `"auto"`, then `acq_optimizer` is configured on the
          basis of the space searched over.
          If the space is Categorical then this is set to be `"sampling"`.

        - If set to `"sampling"`, then the point among these `n_points`
          where the `acq_func` is minimum is the next candidate minimum.

        - If set to `"lbfgs"`, then

              - The `n_restarts_optimizer` points at which the acquisition
                function is lowest are taken as starting points.

              - `"lbfgs"` is run for 20 iterations with these points as initial
                points to find local minima.

              - The best of these local minima is used to update the prior.

    x0 : list, list of lists or `None`
        Initial input points.

        - If it is a list of lists, use it as a list of input points.

        - If it is a list, use it as a single initial input point.

        - If it is `None`, no initial input points are used.

    y0 : list, scalar or `None`
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.

        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.

        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    random_state : int, RandomState instance, or None (default)
        Set random state to something other than None for reproducible
        results.

    verbose : boolean, default=False
        Control the verbosity. It is advised to set the verbosity to True
        for long optimization runs.

    callback : callable, list of callables, optional
        If callable then `callback(res)` is called after each call to `func`.
        If list of callables, then each callable in the list is called.

    n_points : int, default=10000
        Number of points to sample to determine the next "best" point.
        Useless if acq_optimizer is set to `"lbfgs"`.

    n_restarts_optimizer : int, default=5
        The number of restarts of the optimizer when `acq_optimizer`
        is `"lbfgs"`.

    kappa : float, default=1.96
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    xi : float, default=0.01
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    noise : float or "gaussian", default="gaussian"

        - Use noise="gaussian" if the objective returns noisy observations.
          The noise of each observation is assumed to be iid with
          mean zero and a fixed variance.

        - If the variance is known before-hand, this can be set directly
          to the variance of the noise.

        - Set this to a value close to zero (1e-10) if the function is
          noise-free. Setting to zero might cause stability issues.

    n_jobs : int, default=1
        Number of cores to run in parallel while running the lbfgs
        optimizations over the acquisition function. Valid only
        when `acq_optimizer` is set to `"lbfgs"`.
        Defaults to 1 core. If `n_jobs=-1`, then number of jobs is set
        to number of cores.

    model_queue_size : int or None, default=None
        Keeps list of models only as long as the argument given. In the
        case of None, the list has no capped length.

    Returns
    -------
    res : `OptimizeResult`, scipy object
        The optimization result returned as a OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.

        - `fun` [float]: function value at the minimum.

        - `models`: surrogate models used for each iteration.

        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.

        - `func_vals` [array]: function value for each iteration.

        - `space` [Space]: the optimization space.

        - `specs` [dict]: the call specifications.

        - `rng` [RandomState instance]: State of the random state
           at the end of minimization.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html

    .. seealso:: functions :func:`skopt.forest_minimize`,
        :func:`skopt.dummy_minimize`
    """
    # Check params
    rng = check_random_state(random_state)
    space = normalize_dimensions(dimensions)

    if base_estimator is None:
        base_estimator = cook_estimator(
            "GP", space=space, random_state=rng.randint(0, np.iinfo(np.int32).max),
            noise=noise)

    return base_minimize(
        func, space, base_estimator=base_estimator,
        acq_func=acq_func,
        xi=xi, kappa=kappa, acq_optimizer=acq_optimizer, n_calls=n_calls,
        n_points=n_points, n_random_starts=n_random_starts,
        n_restarts_optimizer=n_restarts_optimizer,
        x0=x0, y0=y0, random_state=rng, verbose=verbose,
        callback=callback, n_jobs=n_jobs, model_queue_size=model_queue_size)
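
A short usage sketch, assuming the skopt package this function belongs to is importable; the quadratic objective and its bounds are made up for illustration only.

from skopt import gp_minimize

# Minimize a 1-D quadratic over [-5, 5]; the true minimum is at x = 2.
res = gp_minimize(lambda x: (x[0] - 2.0) ** 2,
                  [(-5.0, 5.0)],
                  n_calls=20,
                  n_random_starts=5,
                  random_state=0)
print(res.x, res.fun)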
Exemple #58
0
def sag_sparse(X,
               y,
               step_size,
               alpha,
               n_iter=1,
               dloss=None,
               sample_weight=None,
               sparse=False,
               fit_intercept=True,
               saga=False,
               random_state=0):
    if step_size * alpha == 1.:
        raise ZeroDivisionError("Sparse sag does not handle the case "
                                "step_size * alpha == 1")
    n_samples, n_features = X.shape[0], X.shape[1]

    weights = np.zeros(n_features)
    sum_gradient = np.zeros(n_features)
    last_updated = np.zeros(n_features, dtype=int)
    gradient_memory = np.zeros(n_samples)
    rng = check_random_state(random_state)
    intercept = 0.0
    intercept_sum_gradient = 0.0
    wscale = 1.0
    decay = 1.0
    seen = set()

    c_sum = np.zeros(n_iter * n_samples)

    # sparse data has a fixed decay of .01
    if sparse:
        decay = .01

    counter = 0
    for epoch in range(n_iter):
        for k in range(n_samples):
            # idx = k
            idx = int(rng.rand(1) * n_samples)
            entry = X[idx]
            seen.add(idx)

            # Lazy ("just-in-time") update: before using sample idx, apply to
            # each weight the accumulated average-gradient steps, recorded in
            # c_sum, that it has missed since it was last touched.
            if counter >= 1:
                for j in range(n_features):
                    if last_updated[j] == 0:
                        weights[j] -= c_sum[counter - 1] * sum_gradient[j]
                    else:
                        weights[j] -= (
                            (c_sum[counter - 1] - c_sum[last_updated[j] - 1]) *
                            sum_gradient[j])
                    last_updated[j] = counter

            p = (wscale * np.dot(entry, weights)) + intercept
            gradient = dloss(p, y[idx])

            if sample_weight is not None:
                gradient *= sample_weight[idx]

            update = entry * gradient
            gradient_correction = update - (gradient_memory[idx] * entry)
            sum_gradient += gradient_correction
            if saga:
                for j in range(n_features):
                    weights[j] -= (gradient_correction[j] * step_size *
                                   (1 - 1. / len(seen)) / wscale)

            if fit_intercept:
                gradient_correction = gradient - gradient_memory[idx]
                intercept_sum_gradient += gradient_correction
                gradient_correction *= step_size * (1. - 1. / len(seen))
                if saga:
                    intercept -= ((step_size * intercept_sum_gradient /
                                   len(seen) * decay) + gradient_correction)
                else:
                    intercept -= (step_size * intercept_sum_gradient /
                                  len(seen) * decay)

            gradient_memory[idx] = gradient

            wscale *= (1.0 - alpha * step_size)
            if counter == 0:
                c_sum[0] = step_size / (wscale * len(seen))
            else:
                c_sum[counter] = (c_sum[counter - 1] + step_size /
                                  (wscale * len(seen)))

            # If the running weight scale underflows, flush the pending lazy
            # updates, fold wscale into the weights and reset it to 1 for
            # numerical stability.
            if counter >= 1 and wscale < 1e-9:
                for j in range(n_features):
                    if last_updated[j] == 0:
                        weights[j] -= c_sum[counter] * sum_gradient[j]
                    else:
                        weights[j] -= (
                            (c_sum[counter] - c_sum[last_updated[j] - 1]) *
                            sum_gradient[j])
                    last_updated[j] = counter + 1
                c_sum[counter] = 0
                weights *= wscale
                wscale = 1.0

            counter += 1

    for j in range(n_features):
        if last_updated[j] == 0:
            weights[j] -= c_sum[counter - 1] * sum_gradient[j]
        else:
            weights[j] -= ((c_sum[counter - 1] - c_sum[last_updated[j] - 1]) *
                           sum_gradient[j])
    weights *= wscale
    return weights, intercept
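
A hedged sketch of how this helper might be called; the squared-loss derivative and the synthetic data below are assumptions made for illustration, and the call relies on the sag_sparse definition above being in scope.

import numpy as np


def squared_dloss(p, y):
    # Derivative of the squared loss 0.5 * (p - y) ** 2 with respect to p;
    # assumed here purely for illustration.
    return p - y


rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(50)

weights, intercept = sag_sparse(X, y, step_size=0.01, alpha=0.1,
                                n_iter=20, dloss=squared_dloss,
                                random_state=0)
print(weights, intercept)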
Exemple #59
0

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

tmp = np.loadtxt("week-wti.csv", dtype=str, delimiter=",")
X = tmp[1:, 0:8].astype(float)  # load the feature columns
Y = tmp[1:, 8].astype(float)  # load the label column

random_state = check_random_state(0)  # randomly permute the samples
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
Y = Y[permutation]
X = X.reshape((X.shape[0], -1))

train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(train_x, train_y)

# Make predictions using the testing set
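# A plausible continuation, inferred only from the imports and the comment
# above (the original snippet is truncated at this point):
pred_y = regr.predict(test_x)

print("Coefficients:", regr.coef_)
print("Mean squared error: %.2f" % mean_squared_error(test_y, pred_y))
print("R^2 score: %.2f" % r2_score(test_y, pred_y))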
Exemple #60
0
def make_random_state(estimator):
    rs = {}
    if estimator.__class__.__name__[-11:] == '_Supervised':
        rs['random_state'] = check_random_state(SEED)
    return rs
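
A hedged usage sketch, assuming the function above and a SEED constant live in the same module; SEED is not defined in the excerpt, and the estimator class below is a hypothetical example chosen only because its name ends in '_Supervised'.

from sklearn.utils import check_random_state

SEED = 42  # assumed value; SEED is not defined in the excerpt above


class Dummy_Supervised(object):
    """Hypothetical estimator whose class name ends in '_Supervised'."""

    def __init__(self, random_state=None):
        self.random_state = random_state


kwargs = make_random_state(Dummy_Supervised())
estimator = Dummy_Supervised(**kwargs)  # receives a seeded RandomState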