Example #1
    def pinball_loss(y_true, y_pred, probs):
        """Compute the pinball loss.

        Parameters
        ----------
        y_true : {array-like}, shape = [n_samples]
            Targets.
        y_pred : {array-like}, shape = [n_quantiles, n_samples] or [n_samples]
            Predictions.
        probs : {array-like}, shape = [n_quantiles]
            Quantile levels in (0, 1).

        Returns
        -------
        l : {array}, shape = [n_quantiles]
            Average loss for each quantile level.
        """
        probs = asarray(probs).reshape(-1)
        check_consistent_length(y_true, y_pred.T)
        y_true = check_array(y_true.reshape((-1, 1)),
                             ensure_2d=True)
        y_pred = check_array(y_pred.T.reshape((y_true.shape[0], -1)),
                             ensure_2d=True)
        residual = y_true - y_pred
        loss = npsum([fmax(prob * res, (prob - 1) * res) for (res, prob) in
                      zip(residual.T, probs)], axis=1)
        return loss / y_true.size
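For readers skimming this example, here is a minimal self-contained sketch of the same pinball-loss computation in plain NumPy (the data and quantile levels are made up; it mirrors the math above rather than calling the function, so it runs without the surrounding imports):

    import numpy as np

    # Hypothetical data: 4 samples, predictions for two quantile levels.
    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([[0.5, 2.5, 2.0, 4.5],   # predictions at the 0.1 quantile
                       [1.5, 1.0, 3.5, 3.0]])  # predictions at the 0.9 quantile
    probs = np.array([0.1, 0.9])

    residual = y_true[:, None] - y_pred.T                    # (n_samples, n_quantiles)
    loss = np.sum(np.fmax(probs * residual, (probs - 1) * residual), axis=0)
    print(loss / y_true.size)                                # average loss per quantile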
Example #2
    def fit(self, X, y=None):
        if self.encoding not in ['similarity',
                                 'target',
                                 'ordinal',
                                 'onehot',
                                 'onehot-dense',
                                 'ngram-count',
                                 'ngram-presence',
                                 'ngram-tfidf']:
            template = ("Encoding %s has not been implemented yet")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if self.handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]
        if self.encoding == 'target':
            self.Eyx_ = [{cat: np.mean(y[X[:, i] == cat])
                          for cat in self.categories_[i]}
                         for i in range(len(self.categories_))]
            self.Ey_ = [np.mean(y)
                        for i in range(len(self.categories_))]
        return self
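The encoding == 'target' branch above stores, for every column, the mean of y per category (Eyx_) and the global mean of y (Ey_). A small standalone illustration of those statistics with plain NumPy (the column values and targets are invented):

    import numpy as np

    col = np.array(['a', 'b', 'a', 'c', 'b', 'a'], dtype=object)
    y = np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0])

    Eyx = {cat: y[col == cat].mean() for cat in np.unique(col)}  # per-category mean of y
    Ey = y.mean()                                                # global mean of y
    print(Eyx, Ey)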
Example #3
    def predict_proba(self, X):
        """Create predictions: start a vw process, convert the data to vw format and send it.
        Returns class probability estimates for the given test data.

        Parameters
        ----------
        X : pandas dataframe or array-like
            Test samples.

        Returns
        -------
        proba : array-like, shape = (n_samples, n_outputs)
            Class probability estimates.

        Caveats:
        1. A Seldon-specific fork of wabbit_wappa is needed to allow vw to run in server
           mode without save_resume. save_resume seems to cause issues with the scores
           returned. Maybe connected to
           https://github.com/JohnLangford/vowpal_wabbit/issues/262
        """
        self._start_vw_if_needed("test")
        if isinstance(X,pd.DataFrame):
            df = X
            df_base = self._exclude_include_features(df)
            df_base = df_base.fillna(0)
        else:
            check_array(X)
            df_base = pd.DataFrame(X)
        df_vw = df_base.apply(self._convert_row,axis=1)
        predictions = None
        for (index,val) in df_vw.iteritems():
            prediction = self.vw.send_line(val,parse_result=True)
            self._start_raw_predictions()
            scores = self._get_full_scores()
            if predictions is None:
                predictions = np.array([scores])
            else:
                predictions = np.vstack([predictions,scores])
        return predictions
Example #4
    def __call__(self, y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
        if self.lb_ is None:
            self.lb_ = LabelBinarizer()
            T = self.lb_.fit_transform(y_true)
        else:
            T = self.lb_.transform(y_true)

        if T.shape[1] == 1:
            T = np.append(1 - T, T, axis=1)

        Y = np.clip(y_pred, eps, 1 - eps)

        if not isinstance(Y, np.ndarray):
            raise ValueError("y_pred should be an array of floats.")

        if Y.ndim == 1:
            Y = Y[:, np.newaxis]
        if Y.shape[1] == 1:
            Y = np.append(1 - Y, Y, axis=1)

        check_consistent_length(T, Y)
        T = check_array(T)
        Y = check_array(Y)
        if T.shape[1] != Y.shape[1]:
            raise ValueError("y_true and y_pred have different number of classes " "%d, %d" % (T.shape[1], Y.shape[1]))

        Y /= Y.sum(axis=1)[:, np.newaxis]
        loss = -(T * np.log(Y)).sum(axis=1)

        return _weighted_sum(loss, sample_weight, normalize)
Example #5
def test_check_array_force_all_finite_invalid(value, force_all_finite,
                                               match_msg, retype):
    X = retype(np.arange(4).reshape(2, 2).astype(float))
    X[0, 0] = value
    with pytest.raises(ValueError, match=match_msg):
        check_array(X, force_all_finite=force_all_finite,
                    accept_sparse=True)
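The test above relies on a parametrization that is not shown here. A minimal standalone illustration of the behaviour it exercises, assuming a scikit-learn version in which check_array still accepts the force_all_finite keyword:

    import numpy as np
    from sklearn.utils import check_array

    X = np.arange(4, dtype=float).reshape(2, 2)
    X[0, 0] = np.nan
    try:
        check_array(X, force_all_finite=True)            # NaN is rejected
    except ValueError as exc:
        print(exc)
    print(check_array(X, force_all_finite='allow-nan'))  # NaN passes, inf would not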
Example #6
    def fit(self, X_train, y_train, n_more_iter=0):
        """ Fit model with specified loss.

        Parameters
        ----------
        X_train : scipy.sparse.csc_matrix, shape = (n_samples, n_features)
            Training data.

        y_train : ndarray, shape = (n_samples, )
            Target values.

        n_more_iter : int
            Number of iterations to continue from the current coefficients.

        """

        check_consistent_length(X_train, y_train)
        y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

        X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                              order="F")
        self.n_iter = self.n_iter + n_more_iter

        if n_more_iter > 0:
            _check_warm_start(self, X_train)
            self.warm_start = True

        self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train)

        if self.iter_count != 0:
            self.iter_count = self.iter_count + n_more_iter
        else:
            self.iter_count = self.n_iter

        # reset to default setting
        self.warm_start = False
        return self
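The n_more_iter argument above enables warm-started training. A usage sketch along the lines of the fastFM documentation, assuming fastFM is installed (the data is synthetic and the hyperparameters are arbitrary):

    import numpy as np
    import scipy.sparse as sp
    from fastFM import als

    X = sp.csc_matrix(np.random.rand(100, 5))
    y = np.random.rand(100)

    fm = als.FMRegression(n_iter=50, rank=2)
    fm.fit(X, y)                    # initial fit
    fm.fit(X, y, n_more_iter=10)    # continue from the current coefficients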
Example #7
    def _transform_new(self, X):
        """New implementation assuming categorical input"""
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        n_samples, n_features = X.shape

        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        feature_indices = np.cumsum(n_values)

        indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = X_mask.sum(axis=1).cumsum()
        indptr = np.insert(indptr, 0, 0)
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        if not self.sparse:
            return out.toarray()
        else:
            return out
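A compact standalone illustration of the CSR construction used above, with two categorical columns already integer-encoded (the data is hypothetical and no unknown-value mask is applied):

    import numpy as np
    from scipy import sparse

    # 3 samples, 2 features with 2 and 3 categories respectively (integer codes).
    X_int = np.array([[0, 2],
                      [1, 0],
                      [0, 1]])
    n_values = np.array([0, 2, 3])
    feature_indices = np.cumsum(n_values)        # [0, 2, 5] -> column offset per feature

    indices = (X_int + feature_indices[:-1]).ravel()
    indptr = np.arange(0, X_int.size + 1, X_int.shape[1])   # one entry per feature per row
    data = np.ones(X_int.size)

    out = sparse.csr_matrix((data, indices, indptr),
                            shape=(X_int.shape[0], feature_indices[-1]))
    print(out.toarray())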
Example #8
    def fit_transform(self, X, y=None):
        """
        Generates sets of hyper-spheres for anomaly scores.

        Parameters
        ----------
        X : numpy array (nb_samples, nb_features)
            data set

        Returns
        -------
        self
        """
        t_0 = time()
        
        check_array(X)
                 
        self._sets_of_spheres = []
        if self.verbose:
            logger.info('generating sets of spheres...')
        for j in range(self.ensemble_size):
            X_s = np.random.permutation(X)[:self.sample_size,:]
            spheres = self._generate_spheres(X_s)
            self._sets_of_spheres.append(spheres)
        t_f = time() - t_0
        m,s = divmod(t_f, 60)
        h,m = divmod(m, 60)
        if self.verbose:
            logger.info('Total run time: %i:%i:%i'
                        % (h,m,s))

        return self
Example #9
    def query(self, X, **query_kwargs):
        """
        Finds the n_instances most informative points in the data provided by calling
        the query_strategy function. Returns the queried instances and their indices.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The pool of samples from which the query strategy should choose
            instances to request labels.

        query_kwargs: keyword arguments
            Keyword arguments for the query strategy function

        Returns
        -------
        query_idx: numpy.ndarray of shape (n_instances, )
            The indices of the instances from X_pool chosen to be labelled.

        X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
            The instances from X_pool chosen to be labelled.
        """
        check_array(X, ensure_2d=True)

        query_idx, query_instances = self.query_strategy(self, X, **query_kwargs)
        return query_idx, X[query_idx]
Example #10
    def fit(self, X, y):
        # check_array's second positional argument is accept_sparse, not y;
        # validate the two arrays separately instead.
        X = check_array(X)
        y = check_array(y, ensure_2d=False)

        for x_i, y_i in izip(X, y):
            self.partial_fit(x_i, y_i)

        return self
Example #11
    def _add_training_data(self, X, y):
        """
        Adds the new data and label to the known data, but does
        not retrain the model.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The new samples for which the labels are supplied
            by the expert.

        y: numpy.ndarray of shape (n_samples, )
            Labels corresponding to the new instances in X.

        Note
        ----
        If the classifier has been fitted, the features in X
        have to agree with the training samples which the
        classifier has seen.
        """
        X, y = check_array(X), check_array(y, ensure_2d=False)
        assert len(X) == len(y), 'the number of new data points and number of labels must match'

        if self._X_training is not None:
            try:
                self._X_training = np.vstack((self._X_training, X))
                self._y_training = np.concatenate((self._y_training, y))
            except ValueError:
                raise ValueError('the dimensions of the new training data and label must '
                                 'agree with the training data and labels provided so far')

        else:
            self._X_training = X
            self._y_training = y
Example #12
    def csr_to_fm(self, X_csr, return_oh=True, indices=None):
        assert (X_csr.shape == (self.n_samples, self.n_features))

        if indices is None:
            y = check_array(X_csr.data, ensure_2d=False, copy=True)
        else:
            if isinstance(indices, tuple):
                indices_samples, indices_features = indices
            elif isinstance(indices, sp.csc_matrix):
                indices_samples, indices_features = self.fm_to_indices(indices)
            y = X_csr[indices_samples, indices_features].A[0].copy()
        if not return_oh:
            return y
        else:
            X = check_array(X_csr, accept_sparse='coo',
                            force_all_finite=False)
            n_rows, n_cols = X_csr.shape
            assert ((n_rows, n_cols) == (self.n_samples, self.n_features))
            encoder = OneHotEncoder(n_values=[self.n_samples,
                                              self.n_features])
            if indices is None:
                X_ix = np.column_stack([X.row, X.col])
            else:
                assert np.all(np.sort(indices_samples) == np.sort(X.row))
                assert np.all(np.sort(indices_features) == np.sort(X.col))
                X_ix = np.column_stack([indices_samples, indices_features])
            X_oh = encoder.fit_transform(X_ix)
            return X_oh, y
Example #13
    def vote(self, X, **predict_kwargs):
        """
        Predicts the labels for the supplied data for each learner in
        the Committee.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples for which to cast votes.

        predict_kwargs: keyword arguments
            Keyword arguments to be passed to the learners' .predict() method.

        Returns
        -------
        vote: numpy.ndarray of shape (n_samples, n_learners)
            The predicted class for each learner in the Committee
            and each sample in X.
        """
        check_array(X, ensure_2d=True)
        prediction = np.zeros(shape=(X.shape[0], len(self._learner_list)))

        for learner_idx, learner in enumerate(self._learner_list):
            prediction[:, learner_idx] = learner.predict(X, **predict_kwargs)

        return prediction
Example #14
    def _transform(self, X, handle_unknown='error'):

        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        _, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        return X_int, X_mask
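A standalone illustration of the unknown-category masking performed above (the fitted categories and the input column are invented):

    import numpy as np

    categories_i = np.array(['a', 'b', 'c'], dtype=object)
    Xi = np.array(['a', 'd', 'c', 'e'], dtype=object)

    valid_mask = np.in1d(Xi, categories_i)       # [True, False, True, False]
    Xi = Xi.copy()
    Xi[~valid_mask] = categories_i[0]            # unknowns mapped to a placeholder category
    print(valid_mask, Xi)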
Example #15
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)
    # Check if dimensions are consistent.
    check_consistent_length(T, Y)
    T = check_array(T)
    Y = check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return loss 
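A quick numeric sanity check of the function above: the mean of the per-sample losses it returns should match sklearn.metrics.log_loss, up to the eps handling of the installed scikit-learn version (labels and probabilities are made up):

    import numpy as np
    from sklearn.metrics import log_loss as sk_log_loss

    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array([0.1, 0.8, 0.6, 0.3])

    per_sample = log_loss(y_true, y_pred)        # the function defined above (unreduced)
    print(per_sample.mean(), sk_log_loss(y_true, y_pred))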
Example #16
    def fit(self, X, y):
        """Fit OVK ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values. numpy.NaN for missing targets (semi-supervised
            learning).

        Returns
        -------
        self : returns an instance of self.
        """
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        y = check_array(y, force_all_finite=False, accept_sparse=False,
                        ensure_2d=False)
        if y.ndim == 1:
            y = check_array(y, force_all_finite=True, accept_sparse=False,
                            ensure_2d=False)
        self._validate_params()

        self.linop_ = self._get_kernel_map(X, y)
        Gram = self.linop_._Gram(X)
        if self.lbda > 0:
            self.dual_coefs_ = dlyap(-Gram / self.lbda, self.linop_.A,
                                     y / self.lbda)
        else:
            # TODO: Check A is invertible!!
            self.dual_coefs_ = solve(Gram, y)
        return self
Example #17
 def transform(self, X):
     check_array(X, accept_sparse=['csr', 'csc'])
     if issparse(X):
         mult = spdiags(self.weights_, 0, self.length, self.length)
         X *= mult
     else:
         X *= self.weights_
     return X
Example #18
 def __init__(self, X, y, n_classes, batch_size):
     self.X = check_array(X, dtype=np.float32, ensure_2d=False,
                          allow_nd=True)
     self.y = check_array(y, ensure_2d=False, dtype=None)
     self.n_classes = n_classes
     self.batch_size = batch_size
     self._input_shape = [batch_size] + list(X.shape[1:])
     self._output_shape = [batch_size, n_classes] if n_classes > 1 else [batch_size]
Example #19
def test_check_array_on_mock_dataframe():
    arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
    mock_df = MockDataFrame(arr)
    checked_arr = check_array(mock_df)
    assert_equal(checked_arr.dtype,
                 arr.dtype)
    checked_arr = check_array(mock_df, dtype=np.float32)
    assert_equal(checked_arr.dtype, np.dtype(np.float32))
Example #20
    def fit(self, X, y):
        X = check_array(X)
        y = check_array(y)

        for x_i, y_i in izip(X, y):
            self.partial_fit(x_i.reshape(-1, 1), y_i.reshape(1, -1))

        return self
Example #21
 def partial_fit(self, X, y):
     X = check_array(X, copy=self.copy)
     y = check_array(y, copy=self.copy)
     if self._X is None:
         self._X = X
         self._y = y
     else:
         self._X = np.vstack((self._X, X))
         self._y = np.vstack((self._y, y))
Example #22
    def fit_score(self, X, y=None):
        """
        Generate sets of hyper-spheres and return anomaly scores for all points in the dataset.

        Parameters
        ----------
        X : numpy array
            data set

        Returns
        -------
        scores : numpy array
            1-d vector with the anomaly scores for all data points
        """
        t_0 = time()

        check_array(X)
   
        self._sets_of_spheres = []
        if self.verbose:
            logger.info('generating sets of spheres...')
        for j in range(self.ensemble_size):
            X_s = np.random.permutation(X)[:self.sample_size,:]
            spheres = self._generate_spheres(X_s)
            self._sets_of_spheres.append(spheres)
    
        scores = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            if i % 1000 == 0 and self.verbose:
                logger.info('Getting anomaly score for data point %i'
                            % i)
                logger.info('X shape: %i X %i'
                            % X.shape)
            scores_i = []
            j=0
            for spheres in self._sets_of_spheres:
                score = self._score(X[i],spheres)
                if i % 1000 == 0 and j % 10 ==0 and self.verbose:
                    logger.info('Anomaly score for data point %i from estimator %i: %f'
                                % (i,j,score))
                scores_i.append(score)
                j+=1
            scores[i] = np.mean(scores_i)

        if 'X_scored' not in dir(self):
            self.X_scored = np.column_stack((X,scores))
        
        t_f = time() - t_0
        m,s = divmod(t_f, 60)
        h,m = divmod(m, 60)
        if self.verbose:
            logger.info('Total run time: %i:%i:%i'
                        % (h,m,s))

        return scores
Example #23
 def __init__(self, X, y, n_classes, batch_size):
     self.X = check_array(X, ensure_2d=False,
                          allow_nd=True, dtype=[np.float32, np.int64])
     self.y = check_array(y, ensure_2d=False, dtype=np.float32)
     self.n_classes = n_classes
     self.batch_size = batch_size
     self.input_shape, self.output_shape = _get_in_out_shape(
         self.X.shape, self.y.shape, n_classes, batch_size)
     self.input_dtype, self.output_dtype = self.X.dtype, self.y.dtype
Example #24
def test_check_array_warn_on_dtype_deprecation():
    X = np.asarray([[0.0], [1.0]])
    Y = np.asarray([[2.0], [3.0]])
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_array(X, warn_on_dtype=True)
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_X_y(X, Y, warn_on_dtype=True)
Example #25
    def _init(self, X):
        """Initialize statistic and dictionary"""
        if self.projection not in ["partial", "full"]:
            raise ValueError("projection should be in {'partial', 'full'}," " got %s" % self.projection)

        X = check_array(X, dtype="float", order="F", accept_sparse="csr")

        self.sparse_ = sp.issparse(X)

        n_rows, n_cols = X.shape

        if self.n_samples is not None:
            self.n_samples_ = self.n_samples
        else:
            self.n_samples_ = n_rows

        self.random_state_ = check_random_state(self.random_state)

        # D dictionary
        if self.dict_init is not None:
            if self.dict_init.shape != (self.n_components, n_cols):
                raise ValueError(
                    "Initial dictionary and X shape mismatch: %r != %r"
                    % (self.dict_init.shape, (self.n_components, n_cols))
                )
            self.D_ = check_array(self.dict_init, order="C", dtype="float", copy=True)
            if self.fit_intercept:
                if not (np.all(self.D_[0] == self.D_[0].mean())):
                    raise ValueError(
                        "When fitting intercept and providing "
                        "initial dictionary, first component of"
                        " the dictionary should be "
                        "proportional to [1, ..., 1]"
                    )
                self.D_[0] = 1
        else:
            self.D_ = np.empty((self.n_components, n_cols), order="C")

            if self.fit_intercept:
                self.D_[0] = 1
                U = self.random_state_.randn(n_cols, self.n_components - 1)
                Q, _ = np.linalg.qr(U)
                self.D_[1:] = Q.T
            else:
                self.D_[:] = self.random_state_.randn(self.n_components, n_cols)

        self.D_ = np.asfortranarray(enet_scale(self.D_, l1_ratio=self.l1_ratio, radius=1))

        self.A_ = np.zeros((self.n_components, self.n_components), order="F")
        self.B_ = np.zeros((self.n_components, n_cols), order="F")

        self.counter_ = np.zeros(n_cols + 1, dtype="int")

        self.n_iter_ = np.zeros(1, dtype="long")

        self.code_ = np.zeros((self.n_samples_, self.n_components))
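A standalone sketch of the intercept-aware dictionary initialisation above: the first atom is constant and the remaining atoms come from a QR factorisation of a random Gaussian matrix, so they are orthonormal (toy sizes, without the enet_scale step):

    import numpy as np

    rng = np.random.RandomState(0)
    n_components, n_cols = 4, 6

    D = np.empty((n_components, n_cols))
    D[0] = 1                                  # intercept atom proportional to [1, ..., 1]
    U = rng.randn(n_cols, n_components - 1)
    Q, _ = np.linalg.qr(U)
    D[1:] = Q.T                               # remaining atoms are orthonormal rows
    print(np.allclose(D[1:] @ D[1:].T, np.eye(n_components - 1)))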
Example #26
def test_check_array_pandas_dtype_object_conversion():
    # test that data-frame like objects with dtype object
    # get converted
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.object)
    X_df = MockDataFrame(X)
    assert_equal(check_array(X_df).dtype.kind, "f")
    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
    # smoke-test against dataframes with column named "dtype"
    X_df.dtype = "Hans"
    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
Example #27
def test_check_array_series():
    # regression test that check_array works on pandas Series
    pd = importorskip("pandas")
    res = check_array(pd.Series([1, 2, 3]), ensure_2d=False)
    assert_array_equal(res, np.array([1, 2, 3]))

    # with categorical dtype (not a numpy dtype) (GH12699)
    s = pd.Series(['a', 'b', 'c']).astype('category')
    res = check_array(s, dtype=None, ensure_2d=False)
    assert_array_equal(res, np.array(['a', 'b', 'c'], dtype=object))
Example #28
    def fit(self, X, y):
        """Fit OVK ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values. numpy.NaN for missing targets (semi-supervised
            learning).

        Returns
        -------
        self : returns an instance of self.
        """
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        y = check_array(y, force_all_finite=False, accept_sparse=False,
                        ensure_2d=False)
        if y.ndim == 1:
            y = check_array(y, force_all_finite=True, accept_sparse=False,
                            ensure_2d=False)
        self._validate_params()

        solver_params = self.solver_params or {}

        self.linop_ = self._get_kernel_map(X, y)
        Gram = self.linop_(X)
        risk = OVKRidgeRisk(self.lbda)

        if not issubdtype(y.dtype, number):
            raise ValueError("Unknown label type: %r" % y.dtype)
        if y.ndim > 1:
            is_sup = ~all(isnan(y), axis=1)
        else:
            is_sup = ~isnan(y)

        if sum(~is_sup) > 0:
            self.L_ = _graph_Laplacian(rbf_kernel(X[~is_sup, :],
                                                  gamma=self.gamma_m))
        else:
            self.L_ = empty((0, 0))

        p = y.shape[1] if y.ndim > 1 else 1
        weight, zeronan = _SemisupLinop(self.lbda_m, is_sup, self.L_, p).gen()

        self.solver_res_ = minimize(risk.functional_grad_val,
                                    zeros(Gram.shape[1]),
                                    args=(y.ravel(), Gram, weight, zeronan),
                                    method=self.solver,
                                    jac=True,
                                    options=solver_params)
        self.dual_coefs_ = self.solver_res_.x
        return self
Example #29
def test_check_array_force_all_finite_object():
    X = np.array([['a', 'b', np.nan]], dtype=object).T

    X_checked = check_array(X, dtype=None, force_all_finite='allow-nan')
    assert X is X_checked

    X_checked = check_array(X, dtype=None, force_all_finite=False)
    assert X is X_checked

    with pytest.raises(ValueError, match='Input contains NaN'):
        check_array(X, dtype=None, force_all_finite=True)
Example #30
    def __init__(self, y_true, y_pred, name='real_metrics'):
        super(BaseRealMetrics, self).__init__(name=name)

        # check inputs
        self._y_true = check_array(
            y_true, ensure_2d=False, ensure_min_samples=0)
        self._y_pred = check_array(
            y_pred, ensure_2d=False, ensure_min_samples=0)
        if self._y_true.shape != self._y_pred.shape:
            raise ValueError(
                'The sizes of true and predicted vectors must be equal')
Example #31
 def decision_function(self, X):
     check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
     X = check_array(X)
     outlier_scores = self._calculate_decision_score(X)
     return np.array(outlier_scores)
Example #32
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    X = retype(np.arange(4).reshape(2, 2).astype(float))
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
Example #33
def _fit(self, X, y, sample_weight=None, check_input=True):
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        self.fit_shape_good_for_daal_ = \
            X.ndim <= 1 or X.shape[0] >= X.shape[1]
    else:
        self.fit_shape_good_for_daal_ = False

    log_str = "sklearn.linear_model." + self.__class__.__name__ + ".fit: "
    sklearn_ready = sp.issparse(X) or not self.fit_shape_good_for_daal_ or \
        X.dtype not in [np.float64, np.float32] or sample_weight is not None

    if sklearn_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with Sklearn,
        # this assert is not required for Intel(R) oneAPI Data
        # Analytics Library
        print(type(X), X.flags['F_CONTIGUOUS'])
        if isinstance(X, np.ndarray) and \
                X.flags['F_CONTIGUOUS'] is False:
            # print(X.flags)
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only for pass tests
    # "check_estimators_fit_returns_self(readonly_memmap=True) and
    # check_regressors_train(readonly_memmap=True)
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)
    logging.info(log_str + get_patch_message("daal"))

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
Example #34
    def _fit_resample(self, X, y):
        self.n_features_ = X.shape[1]
        self._validate_estimator()

        # compute the median of the standard deviation of the minority class
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        X_continuous = X[:, self.continuous_features_]
        X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
        X_minority = _safe_indexing(
            X_continuous, np.flatnonzero(y == class_minority)
        )

        if sparse.issparse(X):
            if X.format == "csr":
                _, var = csr_mean_variance_axis0(X_minority)
            else:
                _, var = csc_mean_variance_axis0(X_minority)
        else:
            var = X_minority.var(axis=0)
        self.median_std_ = np.median(np.sqrt(var))

        X_categorical = X[:, self.categorical_features_]
        if X_continuous.dtype.name != "object":
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64
        self.ohe_ = OneHotEncoder(
            sparse=True, handle_unknown="ignore", dtype=dtype_ohe
        )
        # the input of the OneHotEncoder needs to be dense
        X_ohe = self.ohe_.fit_transform(
            X_categorical.toarray()
            if sparse.issparse(X_categorical)
            else X_categorical
        )

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be equal
        # to the median of the standard deviation as in the original paper.
        X_ohe.data = (
            np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2
        )
        X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

        X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

        # reverse the encoding of the categorical features
        X_res_cat = X_resampled[:, self.continuous_features_.size:]
        X_res_cat.data = np.ones_like(X_res_cat.data)
        X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

        if sparse.issparse(X):
            X_resampled = sparse.hstack(
                (
                    X_resampled[:, : self.continuous_features_.size],
                    X_res_cat_dec,
                ),
                format="csr",
            )
        else:
            X_resampled = np.hstack(
                (
                    X_resampled[:, : self.continuous_features_.size].toarray(),
                    X_res_cat_dec,
                )
            )

        indices_reordered = np.argsort(
            np.hstack((self.continuous_features_, self.categorical_features_))
        )
        if sparse.issparse(X_resampled):
            # the matrix is supposed to be in the CSR format after the stacking
            col_indices = X_resampled.indices.copy()
            for idx, col_idx in enumerate(indices_reordered):
                mask = X_resampled.indices == col_idx
                col_indices[mask] = idx
            X_resampled.indices = col_indices
        else:
            X_resampled = X_resampled[:, indices_reordered]

        return X_resampled, y_resampled
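A standalone illustration of the statistic computed at the top of the method: the median of the per-feature standard deviation of the (continuous) minority-class samples (the data and labels are synthetic):

    import numpy as np
    from collections import Counter

    X = np.random.rand(20, 3)
    y = np.array([0] * 15 + [1] * 5)

    target_stats = Counter(y)
    class_minority = min(target_stats, key=target_stats.get)
    X_minority = X[np.flatnonzero(y == class_minority)]
    median_std = np.median(np.sqrt(X_minority.var(axis=0)))
    print(class_minority, median_std)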
Example #35
    def _fit(self, X, Y, weights, check_input):
        time_init = time.perf_counter()

        # Check parameters and input arrays
        _check_parameters(**self.get_params())
        _check_X_Y_weights(X, Y, weights)

        self._n_scenarios = len(X)

        if self.verbose:
            logging.info("Optimal binning started.")
            logging.info("Options: check parameters.")

        _check_parameters(**self.get_params())

        # Pre-processing
        if self.verbose:
            logging.info("Pre-processing started.")

        time_preprocessing = time.perf_counter()

        self._n_samples_scenario = [len(x) for x in X]
        self._n_samples = sum(self._n_samples_scenario)

        if self.verbose:
            logging.info("Pre-processing: number of samples: {}"
                         .format(self._n_samples))

        [x_clean, y_clean, x_missing, y_missing, x_special, y_special,
         w] = split_data_scenarios(X, Y, weights, self.special_codes,
                                   check_input)

        self._time_preprocessing = time.perf_counter() - time_preprocessing

        if self.verbose:
            n_clean = len(x_clean)
            n_missing = len(x_missing)
            n_special = len(x_special)

            logging.info("Pre-processing: number of clean samples: {}"
                         .format(n_clean))

            logging.info("Pre-processing: number of missing samples: {}"
                         .format(n_missing))

            logging.info("Pre-processing: number of special samples: {}"
                         .format(n_special))

            logging.info("Pre-processing terminated. Time: {:.4f}s"
                         .format(self._time_preprocessing))

        # Pre-binning
        if self.verbose:
            logging.info("Pre-binning started.")

        time_prebinning = time.perf_counter()

        if self.user_splits is not None:
            user_splits = check_array(
                self.user_splits, ensure_2d=False, dtype=None,
                force_all_finite=True)

            user_splits = np.unique(user_splits)

            splits, n_nonevent, n_event = self._prebinning_refinement(
                user_splits, x_clean, y_clean, y_missing, y_special)
        else:
            splits, n_nonevent, n_event = self._fit_prebinning(
                w, x_clean, y_clean, y_missing, y_special, self.class_weight)

        self._n_prebins = len(n_nonevent)

        self._time_prebinning = time.perf_counter() - time_prebinning

        if self.verbose:
            logging.info("Pre-binning: number of prebins: {}"
                         .format(self._n_prebins))
            logging.info("Pre-binning: number of refinements: {}"
                         .format(self._n_refinements))

            logging.info("Pre-binning terminated. Time: {:.4f}s"
                         .format(self._time_prebinning))

        # Optimization
        self._fit_optimizer(splits, n_nonevent, n_event, weights)

        # Post-processing
        if self.verbose:
            logging.info("Post-processing started.")
            logging.info("Post-processing: compute binning information.")

        time_postprocessing = time.perf_counter()

        self._n_nonevent = 0
        self._n_event = 0
        self._binning_tables = []

        for s in range(self._n_scenarios):
            s_n_nonevent, s_n_event = bin_info(
                self._solution, n_nonevent[:, s], n_event[:, s],
                self._n_nonevent_missing[s], self._n_event_missing[s],
                self._n_nonevent_special[s], self._n_event_special[s], None,
                None, [])

            self._n_nonevent += s_n_nonevent
            self._n_event += s_n_event

            binning_table = BinningTable(
                self.name, self.dtype, self._splits_optimal, s_n_nonevent,
                s_n_event, None, None, self.user_splits)

            self._binning_tables.append(binning_table)

        self._binning_table = BinningTable(
            self.name, self.dtype, self._splits_optimal, self._n_nonevent,
            self._n_event, None, None, self.user_splits)

        self._time_postprocessing = time.perf_counter() - time_postprocessing

        if self.verbose:
            logging.info("Post-processing terminated. Time: {:.4f}s"
                         .format(self._time_postprocessing))

        self._time_total = time.perf_counter() - time_init

        if self.verbose:
            logging.info("Optimal binning terminated. Status: {}. "
                         "Time: {:.4f}s".format(
                            self._status, self._time_total))

        # Completed successfully
        self._logger.close()
        self._is_fitted = True

        return self
Example #36
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_ = X.shape[1]
    self.n_features_in_ = X.shape[1]

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    features_per_node_ = _to_absolute_max_features(self.max_features,
                                                   X.shape[1],
                                                   is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
Example #37
    def fit(self, X, seq_length=[], n_iter=2):
        X = check_array(X)
        n_seq = max(len(seq_length), 1)
        n_sts = len(self.states)
        n_obs, n_dim = X.shape
        n_mix = self.gmm_k
        if not self.initialized:
            try:
                self._init_param(X)
            except Exception:
                raise ValueError("Problem with initialization")
        if self.method == 'gmm':
            for i in range(n_iter):
                start_prob_accum = np.zeros(n_sts)
                xisum_accum = np.zeros((n_sts, n_sts))
                gamma_sum_accum = np.zeros((n_sts, n_mix))
                means_num_accum = np.zeros((n_sts, n_mix, n_dim))
                sigmas_num_accum = np.zeros((n_sts, n_mix, n_dim, n_dim))
                logA = np.log(self.A)
                obs_logprob_accum = 0
                print('in loop 1', i)
                for j in range(n_seq):
                    print('in loop 2 ', j)
                    S = self._get_seq(X, seq_length, j)
                    emitlogprob = self._log_emission(S)
                    fw = self._pass_forward(emitlogprob, logA)
                    bw = self._pass_backward(emitlogprob, logA)
                    obs_logprob_accum += self._precision_lse(fw[0], axis=0)
                    print('fw: \n', fw)
                    print('bw: \n', bw)
                    print('framelogprob: \n', emitlogprob)
                    # accumulate start prob
                    temp = fw[0] + bw[0]
                    self._log_normalize(temp, axis=0)
                    start_prob_accum += np.exp(temp)
                    # accumulate logxisum
                    logxisum = self._log_sum_xi(S, fw, bw, emitlogprob, logA)
                    print('logxisum: \n', logxisum)
                    xisum_accum += np.exp(logxisum)
                    # accumulate gamma_sum
                    gamma = np.exp(self._log_gamma(S, fw, bw))
                    print('gamma; \n', gamma)
                    gamma_sum_accum += gamma.sum(axis=0)
                    # accumulate means
                    means_num_accum += np.einsum('jik,jh->ikh', gamma, S)
                    # accumulate sigma
                    d = S[:, None, None, :] - self.means[None, :, :, :]
                    sigmas_num_accum += np.sum(
                        gamma[:, :, :, None, None] *
                        (d[:, :, :, :, None] * d[:, :, :, None, :]),
                        axis=0)
                # update start_prob
                prev_start_prob = self.start_prob.copy()
                self.start_prob = start_prob_accum
                self._normalize(self.start_prob, axis=0)
                print('update startprob: ', prev_start_prob, '->',
                      self.start_prob)
                # update A
                new_A = xisum_accum
                self._normalize(new_A, axis=1)
                # update weights
                new_weights = gamma_sum_accum.copy()
                print('new_weights num\n', new_weights)
                self._normalize(new_weights, axis=1)
                # update mean
                means_den = gamma_sum_accum[:, :, None]
                new_means = means_num_accum / means_den
                print('new_means num\n', means_num_accum)
                # update sigma
                sigmas_den = gamma_sum_accum[:, :, None, None]
                new_sigmas = sigmas_num_accum / sigmas_den
                print('new_covs num\n', sigmas_num_accum)
                # update parameter
                self._update_param({
                    'A': new_A,
                    'C': new_weights,
                    'MU': new_means,
                    'SIGMA': new_sigmas
                })
                # update Convergence
                self._update_convergence(obs_logprob_accum)
                if self._converged:
                    print('Done')
                    print('Convergence: \n', self._convergence)
                    break
        else:
            raise ValueError("\"{0}\" method not supported".format(
                self.method))

        return
Example #38
    def fit(self, X, y=None):
        """Fit the model of minimum divergence / maximum entropy subject to
        constraints on the feature expectations <f_i(X)> = X[0].

        Parameters
        ----------
        X : ndarray (dense) of shape [1, n_features]
            A row vector (1 x n_features matrix) representing desired
            expectations of features.  The curious shape is deliberate: models
            of minimum divergence / maximum entropy depend on the data only
            through the feature expectations.

        y : is not used: placeholder to allow for usage in a Pipeline.

        Returns
        -------
        self

        """

        X = np.atleast_2d(X)
        X = check_array(X)
        n_samples = X.shape[0]
        if n_samples != 1:
            raise ValueError('X must have only one row')

        # Extract a 1d array of the feature expectations
        # K = np.asarray(X[0], float)
        K = X[0]
        assert K.ndim == 1

        # Store the desired feature expectations as a member variable
        self.K = K

        self._check_features()

        # Sanity checks
        try:
            self.params
        except AttributeError:
            self.resetparams(len(K))
        else:
            assert len(self.params) == len(K)

        # Don't reset the number of function and gradient evaluations to zero
        # self.fnevals = 0
        # self.gradevals = 0

        # Make a copy of the parameters
        oldparams = np.array(self.params)

        callback = self.log

        retval = optimize.minimize(self.dual,
                                   oldparams,
                                   args=(),
                                   method=self.algorithm,
                                   jac=self.grad,
                                   tol=self.tol,
                                   options={
                                       'maxiter': self.maxiter,
                                       'disp': self.verbose
                                   },
                                   callback=callback)
        newparams = retval.x
        func_calls = retval.nfev

        # if self.algorithm == 'CG':
        #     retval = optimize.fmin_cg(self.dual, oldparams, self.grad, (), self.avegtol, \
        #                               maxiter=self.maxiter, full_output=1, \
        #                               disp=self.verbose, retall=0,
        #                               callback=callback)
        #
        #     (newparams, fopt, func_calls, grad_calls, warnflag) = retval
        #
        # elif self.algorithm == 'LBFGSB':
        #     if callback is not None:
        #         raise NotImplementedError("L-BFGS-B optimization algorithm"
        #                 " does not yet support callback functions for"
        #                 " testing with an external sample")
        #     retval = optimize.fmin_l_bfgs_b(self.dual, oldparams, \
        #                 self.grad, args=(), bounds=self.bounds, pgtol=self.maxgtol,
        #                 maxfun=self.maxfun)
        #     (newparams, fopt, d) = retval
        #     warnflag, func_calls = d['warnflag'], d['funcalls']
        #     if self.verbose:
        #         print(self.algorithm + " optimization terminated successfully.")
        #         print("\tFunction calls: " + str(func_calls))
        #         # We don't have info on how many gradient calls the LBFGSB
        #         # algorithm makes
        #
        # elif self.algorithm == 'BFGS':
        #     retval = optimize.fmin_bfgs(self.dual, oldparams, \
        #                                 self.grad, (), self.tol, \
        #                                 maxiter=self.maxiter, full_output=1, \
        #                                 disp=self.verbose, retall=0, \
        #                                 callback=callback)
        #
        #     (newparams, fopt, gopt, Lopt, func_calls, grad_calls, warnflag) = retval
        #
        # elif self.algorithm == 'Powell':
        #     retval = optimize.fmin_powell(self.dual, oldparams, args=(), \
        #                            xtol=self.tol, ftol = self.tol, \
        #                            maxiter=self.maxiter, full_output=1, \
        #                            disp=self.verbose, retall=0, \
        #                            callback=callback)
        #
        #     (newparams, fopt, direc, numiter, func_calls, warnflag) = retval
        #     # fmin_powell seems to turn newparams into a 0d array
        #     newparams = np.atleast_1d(newparams)
        #
        # elif self.algorithm == 'Nelder-Mead':
        #     retval = optimize.fmin(self.dual, oldparams, args=(), \
        #                            xtol=self.tol, ftol = self.tol, \
        #                            maxiter=self.maxiter, full_output=1, \
        #                            disp=self.verbose, retall=0, \
        #                            callback=callback)
        #
        #     (newparams, fopt, numiter, func_calls, warnflag) = retval
        #
        # else:
        #     raise AttributeError("the specified algorithm '" + str(self.algorithm)
        #             + "' is unsupported.  Options are 'CG', 'LBFGSB', "
        #             "'Nelder-Mead', 'Powell', and 'BFGS'")

        if np.any(self.params != newparams):
            self.setparams(newparams)
        self.func_calls = func_calls
        return self
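A minimal sketch of the scipy.optimize.minimize call pattern used above, with a toy quadratic standing in for self.dual and self.grad and 'CG' standing in for self.algorithm:

    import numpy as np
    from scipy import optimize

    def dual(params):
        return np.sum((params - 3.0) ** 2)

    def grad(params):
        return 2.0 * (params - 3.0)

    retval = optimize.minimize(dual, np.zeros(2), method='CG', jac=grad,
                               tol=1e-8, options={'maxiter': 100, 'disp': False})
    print(retval.x, retval.nfev)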
Example #39
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = getLogger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                assert not any(categorical)
                X_transformed = X
                imputer = Imputer(strategy='mean', copy=False)
                X_transformed = imputer.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                # categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[
                        0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            # categorical_ = categorical_transformed
            categorical_ = categorical
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info('%s: Going to calculate: %s', dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info('%s: Going to calculate: %s', dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
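The loop above defers any metafeature whose dependency has not been calculated yet by pushing the name back onto the left end of the deque. A minimal, self-contained sketch of that deferral pattern (the `deps` mapping and names are illustrative, not part of the original module):

from collections import deque

deps = {'a': None, 'b': 'a', 'c': 'b'}   # hypothetical name -> dependency map
computed = {}
to_visit = deque(deps)
while to_visit:
    name = to_visit.pop()
    dep = deps[name]
    if dep is not None and dep not in computed:
        # Defer this item until its dependency has been calculated.
        to_visit.appendleft(name)
        continue
    computed[name] = 'value of %s' % name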
Example #40
0
    def solve(self, X, missing_mask):
        if self.task_type == "Regression":
            self.sign = "MAE"
            self.loss_fn = tf.keras.losses.MeanAbsoluteError()
        elif self.task_type == 'Classification':
            self.loss_fn = tf.keras.losses.BinaryCrossentropy()
            self.sign = "BCE"
        self.group_pre_u = {}
        self.group_pre_i = {}
        X = check_array(X, force_all_finite=False)

        X_init = X.copy()
        self.loss_record = []
        self.valloss_record = []

        X_filled = X
        observed_mask = ~missing_mask
        max_singular_value = self._max_singular_value(X_filled)
        if self.verbose:
            if self.auto_tune == False:
                print("[SoftImpute] Max Singular Value of X_init = %f" %
                      (max_singular_value))

        if self.shrinkage_value:
            shrinkage_value = self.shrinkage_value
        else:
            # totally hackish heuristic: keep only components
            # with at least 1/50th the max singular value
            shrinkage_value = max_singular_value / 50.0

        if self.auto_tune == False:
            print('#####mf_training#####')

        X_reconstruction, rank, U_thresh, V_thresh, S_thresh = self._svd_step(
            X_filled, shrinkage_value, tuning=False, max_rank=self.max_rank)
        X_reconstruction = self.clip(X_reconstruction)

        converged = self._converged(X_old=X_filled,
                                    X_new=X_reconstruction,
                                    missing_mask=missing_mask)
        X_filled[missing_mask] = X_reconstruction[missing_mask]

        self.ini_u = X_filled

        for i in range(self.max_iters):
            X_reconstruction, rank, U_thresh, V_thresh, S_thresh = self._svd_step(
                X_filled, shrinkage_value, tuning=True, max_rank=self.max_rank)
            X_reconstruction = self.clip(X_reconstruction)

            pred = self.predict(X_reconstruction, self.tr_Xi)
            if self.wc == 'warm':
                predval = self.predict(X_reconstruction, self.val_Xi)
            else:
                predval = self.predict_cold(U_thresh, V_thresh, S_thresh)

            if self.task_type == 'Classification':
                #self.loss_record.append(tf.keras.losses.MeanAbsoluteError(self.tr_y.ravel(),tf.sigmoid(pred.ravel()+self.pred_tr.ravel()).numpy()).numpy())
                #self.valloss_record.append(tf.keras.losses.MeanAbsoluteError(self.val_y.ravel(),tf.sigmoid(predval.ravel()+self.pred_val.ravel()).numpy()).numpy())
                self.loss_record.append(
                    self.loss_fn(
                        self.tr_y.ravel(),
                        tf.sigmoid(pred.ravel() +
                                   self.pred_tr.ravel()).numpy()).numpy())
                self.valloss_record.append(
                    self.loss_fn(
                        self.val_y.ravel(),
                        tf.sigmoid(predval.ravel() +
                                   self.pred_val.ravel()).numpy()).numpy())
            else:
                self.loss_record.append(
                    self.loss_fn(self.tr_y.ravel(),
                                 pred.ravel() + self.pred_tr.ravel()).numpy())
                self.valloss_record.append(
                    self.loss_fn(self.val_y.ravel(),
                                 predval.ravel() +
                                 self.pred_val.ravel()).numpy())

            # print error on observed data
            if self.verbose:
                self._verbose(X_reconstruction, i, rank)

            converged = self._converged(X_old=X_filled,
                                        X_new=X_reconstruction,
                                        missing_mask=missing_mask)
            X_filled[missing_mask] = X_reconstruction[missing_mask]
            # print(X_reconstruction[observed_mask])

            if converged:
                break

        if self.verbose:
            if self.auto_tune == False:
                print("[SoftImpute] Stopped after iteration %d for lambda=%f" %
                      (i + 1, shrinkage_value))

        if self.change_mode:
            X_filled = X_reconstruction

        var_whole_u = np.var(U_thresh)
        var_whole_i = np.var(V_thresh.T)
        print('final num of user group:', len(self.match_u))
        print('final num of item group:', len(self.match_i))
        return X_filled, U_thresh, V_thresh, S_thresh, self.loss_record, self.valloss_record, self.match_u, self.match_i, self.var_u, self.var_i, var_whole_u, var_whole_i, self.group_pre_u, self.group_pre_i
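The `_svd_step` called above is not shown here; a minimal sketch of the SoftImpute-style update it presumably performs (a truncated SVD followed by soft-thresholding of the singular values; `X_filled` and `shrinkage_value` are stand-ins):

import numpy as np

def svd_soft_threshold(X_filled, shrinkage_value):
    # Shrink singular values towards zero by `shrinkage_value` and rebuild
    # a low-rank reconstruction from the surviving components.
    U, s, Vt = np.linalg.svd(X_filled, full_matrices=False)
    s_thresh = np.maximum(s - shrinkage_value, 0.0)
    rank = int(np.count_nonzero(s_thresh))
    X_reconstruction = (U[:, :rank] * s_thresh[:rank]) @ Vt[:rank]
    return X_reconstruction, rank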
Example #41
0
def robust_single_linkage(X,
                          cut,
                          k=5,
                          alpha=1.4142135623730951,
                          gamma=5,
                          metric='euclidean',
                          algorithm='best',
                          memory=Memory(cachedir=None, verbose=0),
                          leaf_size=40,
                          core_dist_n_jobs=4,
                          **kwargs):
    """Perform robust single linkage clustering from a vector array
    or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    cut : float
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling.

    k : int, optional (default=5)
        Reachability distances will be computed with regard to the `k`
        nearest neighbors.

    alpha : float, optional (default=np.sqrt(2))
        Distance scaling for reachability distance computation. Reachability
        distance is computed as
        $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.

    gamma : int, optional (default=5)
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``generic``
            * ``best``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
        (default 4)

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point.  Noisy samples are given the label -1.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    References
    ----------
    .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    """

    if not isinstance(k, int) or k < 1:
        raise ValueError('k must be an integer greater than zero!')

    if not isinstance(alpha, float) or alpha < 1.0:
        raise ValueError('alpha must be a float greater than or equal to 1.0!')

    if not isinstance(gamma, int) or gamma < 1:
        raise ValueError('gamma must be an integer greater than zero!')

    if not isinstance(leaf_size, int) or leaf_size < 1:
        raise ValueError('Leaf size must be at least one!')

    if metric == 'minkowski':
        if 'p' not in kwargs or kwargs['p'] is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if kwargs['p'] < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    X = check_array(X, accept_sparse='csr')
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            single_linkage_tree = memory.cache(_rsl_generic)(X, k, alpha,
                                                             metric, **kwargs)
        elif algorithm == 'prims_kdtree':
            single_linkage_tree = memory.cache(_rsl_prims_kdtree)(X, k, alpha,
                                                                  metric,
                                                                  **kwargs)
        elif algorithm == 'prims_balltree':
            single_linkage_tree = memory.cache(_rsl_prims_balltree)(X, k,
                                                                    alpha,
                                                                    metric,
                                                                    **kwargs)
        elif algorithm == 'boruvka_kdtree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size,
                                                  core_dist_n_jobs, **kwargs)
        elif algorithm == 'boruvka_balltree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size,
                                                    core_dist_n_jobs, **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            single_linkage_tree = memory.cache(_rsl_generic)(X, k, alpha,
                                                             metric, **kwargs)
        elif metric in KDTree.valid_metrics:
            # Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = memory.cache(_rsl_prims_kdtree)(X, k,
                                                                      alpha,
                                                                      metric,
                                                                      **kwargs)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric,
                                                        leaf_size,
                                                        core_dist_n_jobs,
                                                        **kwargs)
        else:  # Metric is a valid BallTree metric
            # Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = memory.cache(_rsl_prims_balltree)(X, k,
                                                                        alpha,
                                                                        metric,
                                                                        **kwargs)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric,
                                                        leaf_size,
                                                        core_dist_n_jobs,
                                                        **kwargs)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree.to_numpy()
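A hedged usage sketch for the function above on a small synthetic dataset, assuming it is the `robust_single_linkage` exported by the hdbscan package (the `cut` value below is illustrative and depends on the data scale):

import numpy as np
from hdbscan import robust_single_linkage  # assumption: hdbscan is installed

rng = np.random.RandomState(0)
blob_a = rng.normal(loc=0.0, scale=0.3, size=(50, 2))
blob_b = rng.normal(loc=3.0, scale=0.3, size=(50, 2))
X_demo = np.vstack([blob_a, blob_b])

labels, linkage_tree = robust_single_linkage(X_demo, cut=1.0, k=5, gamma=5)
# labels: one cluster id per sample, -1 for noise;
# linkage_tree: scipy-style (n_samples - 1, 4) linkage array.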
Example #42
0
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        X = check_array(X, accept_sparse=sparse_format, dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
            norms[norms == 0.0] = 1.0
        elif norm == 'l2':
            norms = row_norms(X)
            norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X
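For the dense, axis=1 case the function above reduces to dividing each row by its norm; a small numpy-only illustration (array values are arbitrary):

import numpy as np

X_demo = np.array([[3.0, 4.0], [0.0, 0.0], [1.0, 1.0]])
norms = np.sqrt((X_demo ** 2).sum(axis=1))   # l2 norm of every row
norms[norms == 0.0] = 1.0                    # leave all-zero rows untouched
X_unit = X_demo / norms[:, np.newaxis]       # rows now have unit l2 norm (or stay zero)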
Example #43
0
def split_data(dtype,
               x,
               y,
               special_codes=None,
               cat_cutoff=None,
               user_splits=None,
               check_input=True,
               outlier_detector=None,
               outlier_params=None,
               fix_lb=None,
               fix_ub=None,
               class_weight=None,
               sample_weight=None):
    """Split data into clean, missing and special values data.

    Parameters
    ----------
    dtype : str or None, optional (default=None)
        The variable data type. Supported data types are "numerical" for
        continuous and ordinal variables and "categorical" for categorical
        and nominal variables.

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.

    cat_cutoff : float or None, optional (default=None)
        Generate bin others with categories in which the fraction of
        occurrences is below the  ``cat_cutoff`` value. This option is
        available when ``dtype`` is "categorical".

    user_splits : array-like or None, optional (default=None)
        The list of pre-binning split points when ``dtype`` is "numerical" or
        the list of prebins when ``dtype`` is "categorical".

    check_input : bool, (default=True)
        If False, the input arrays x and y will not be checked.

    outlier_detector : str or None (default=None)
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the modified
        Z-score method.

    outlier_params : dict or None (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    fix_lb : float or None (default=None)
        Lower bound or minimum admissible value.

    fix_ub : float or None (default=None)
        Upper bound or maximum admissible value.

    class_weight : dict, "balanced" or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If None, all classes are supposed to have weight one.

    sample_weight : array-like of shape (n_samples,) (default=None)
        Array of weights that are assigned to individual samples.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples.

    y_special : array, shape = (n_special)
        Special target samples.

    y_others : array, shape = (n_others)
        Others target samples.

    categories : array, shape (n_categories)
        List of categories.

    others : array, shape (n_other_categories)
        List of other categories.

    sw_clean : array-like
        Clean data sample weight.

    sw_missing : array-like
        Missing data sample weight.

    sw_special : array-like
        Special data sample weight.

    sw_others : array-like
        Others data sample weight.
    """
    if outlier_detector is not None:
        if outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if outlier_params is not None:
            if not isinstance(outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(outlier_params))

    if fix_lb is not None:
        if not isinstance(fix_lb, numbers.Number):
            raise ValueError("fix_lb must be a number; got {}.".format(fix_lb))

    if fix_ub is not None:
        if not isinstance(fix_ub, numbers.Number):
            raise ValueError("fix_ub must be a number; got {}.".format(fix_ub))

    if fix_lb is not None and fix_ub is not None:
        if fix_lb > fix_ub:
            raise ValueError("fix_lb must be <= fix_ub; got {} <= {}.".format(
                fix_lb, fix_ub))

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None, force_all_finite=True)

        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    sample_weight = _check_sample_weight(sample_weight, x, dtype=x.dtype)

    if class_weight is not None:
        classes = np.unique(y)
        le = LabelEncoder()
        class_weight_ = compute_class_weight(class_weight, classes, y)
        sample_weight *= class_weight_[le.fit_transform(y)]

    if np.issubdtype(x.dtype, np.number) and np.issubdtype(y.dtype, np.number):
        missing_mask = np.isnan(x) | np.isnan(y)
    else:
        missing_mask = pd.isnull(x) | pd.isnull(y)

    if special_codes is None:
        clean_mask = ~missing_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = []
        y_special = []
        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values

        clean_mask = ~missing_mask & ~special_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = x[special_mask]
        y_special = y[special_mask]
        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = sample_weight[special_mask]

    if dtype == "numerical":
        if outlier_detector is not None:
            if outlier_detector == "range":
                detector = RangeDetector()
            elif outlier_detector == "zscore":
                detector = ModifiedZScoreDetector()

            if outlier_params is not None:
                detector.set_params(**outlier_params)

            mask_outlier = detector.fit(x_clean).get_support()
            x_clean = x_clean[~mask_outlier]
            y_clean = y_clean[~mask_outlier]
            sw_clean = sw_clean[~mask_outlier]

        if fix_lb is not None or fix_ub is not None:
            if fix_lb is not None and fix_ub is not None:
                mask = (x_clean >= fix_lb) & (x_clean <= fix_ub)
            elif fix_lb is not None:
                mask = x_clean >= fix_lb
            else:
                mask = x_clean <= fix_ub

            x_clean = x_clean[mask]
            y_clean = y_clean[mask]
            sw_clean = sw_clean[mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(x_clean, y_clean,
                                                     cat_cutoff)

            y_others = y_clean[mask_others]
            sw_others = sw_clean[mask_others]
            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
            sw_clean = sw_clean[~mask_others]
        else:
            y_others = []
            others = []
            sw_others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others, sw_clean, sw_missing, sw_special,
                sw_others)
    else:
        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                [], [], [], sw_clean, sw_missing, sw_special, [])
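A usage sketch for the function defined above with a numerical variable, one special code and one missing value (the data values are made up; only the first six returned arrays are unpacked here):

import numpy as np

x_demo = np.array([1.0, 2.0, np.nan, -999.0, 5.0])
y_demo = np.array([0.0, 1.0, 0.0, 1.0, 1.0])

out = split_data("numerical", x_demo, y_demo, special_codes=[-999.0])
x_clean, y_clean, x_missing, y_missing, x_special, y_special = out[:6]
# x_clean -> [1., 2., 5.], x_missing -> [nan], x_special -> [-999.]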
Example #44
0
def test_check_array_force_all_finite_object_unsafe_casting(
        X, err_msg, force_all_finite):
    # casting a float array containing NaN or inf to int dtype should
    # raise an error irrespective of the force_all_finite parameter.
    with pytest.raises(ValueError, match=err_msg):
        check_array(X, dtype=int, force_all_finite=force_all_finite)
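This test is presumably driven by ``pytest.mark.parametrize`` decorators supplying ``X``, ``err_msg`` and ``force_all_finite``; a hypothetical, simplified driver is sketched below (the test name, the array and the matched message fragment are illustrative assumptions, not the original parametrisation):

import numpy as np
import pytest
from sklearn.utils import check_array

@pytest.mark.parametrize("force_all_finite", [True, False])
def test_object_nan_to_int_raises(force_all_finite):
    # An object array holding NaN cannot be cast to int, so check_array
    # should raise regardless of the force_all_finite setting.
    X = np.array([[1, np.nan]], dtype=object)
    with pytest.raises(ValueError, match="cannot convert float NaN to integer"):
        check_array(X, dtype=int, force_all_finite=force_all_finite)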
Example #45
0
    def fit(self, X, y=None):
        """Compute clustering of the data.

        Parameters
        ----------
        X: ndarray, shape = [n_samples, n_features]
            Training data.
        y: Ignored

        Returns
        -------
        self: `ReNA` object
        """

        X = check_array(X,
                        ensure_min_features=2,
                        ensure_min_samples=2,
                        estimator=self)
        n_features = X.shape[1]

        if not isinstance(self.mask_img, (str, Nifti1Image)):
            raise ValueError("The mask image should be a Niimg-like"
                             "object. Instead a %s object was provided." %
                             type(self.mask_img))

        if self.memory is None or isinstance(self.memory, str):
            self.memory_ = Memory(cachedir=self.memory,
                                  verbose=max(0, self.verbose - 1))
        else:
            self.memory_ = self.memory

        if self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if self.n_iter <= 0:
            raise ValueError("n_iter should be an integer greater than 0."
                             " %s was provided." % str(self.n_iter))

        if self.n_clusters > n_features:
            self.n_clusters = n_features
            warnings.warn("n_clusters should be at most the number of "
                          "features. Taking n_clusters = %s instead." %
                          str(n_features))

        n_components, labels = self.memory_.cache(
            recursive_neighbor_agglomeration)(X,
                                              self.mask_img,
                                              self.n_clusters,
                                              n_iter=self.n_iter,
                                              threshold=self.threshold,
                                              verbose=self.verbose)

        sizes = np.bincount(labels)
        sizes = sizes[sizes > 0]

        self.labels_ = labels
        self.n_clusters_ = np.unique(self.labels_).shape[0]
        self.sizes_ = sizes

        return self
Example #46
0
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit model with coordinate descent.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of (n_samples, n_features)
            Data

        y : {ndarray, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_targets)
            Target. Will be cast to X's dtype if necessary

        sample_weight : float or array-like of shape (n_samples,), default=None
            Sample weight.

        check_input : bool, default=True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Notes
        -----

        Coordinate descent is an algorithm that considers each column of
        data at a time hence it will automatically convert the X input
        as a Fortran-contiguous numpy array if necessary.

        To avoid memory re-allocation it is advised to allocate the
        initial data in memory directly using that format.
        """
        #check X and y
        if check_input:
            X, y = check_X_y(X, y, copy=False, accept_sparse='csc', dtype=[np.float64, np.float32], multi_output=True, y_numeric=True)
            y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)
        else:
            #only for compliance with Sklearn, this assert is not required for Intel(R) oneAPI Data
            #Analytics Library
            if (isinstance(X, np.ndarray) and X.flags['F_CONTIGUOUS'] == False):
                raise ValueError("ndarray is not Fortran contiguous")

        if isinstance(X, np.ndarray):
            self.fit_shape_good_for_daal_ = bool(X.ndim <= 1 or X.shape[0] >= X.shape[1])
        else:
            self.fit_shape_good_for_daal_ = False

        if (sp.issparse(X) or
                sample_weight is not None or
                not self.fit_shape_good_for_daal_ or
                not (X.dtype == np.float64 or X.dtype == np.float32)):
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso.fit: " + get_patch_message("sklearn"))
            res_new = super(ElasticNet, self).fit(X, y, sample_weight=sample_weight, check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new
        self.n_iter_ = None
        self._gap = None
        # only to pass the tests check_estimators_fit_returns_self(readonly_memmap=True)
        # and check_regressors_train(readonly_memmap=True)
        if not X.flags.writeable:
            X = np.copy(X)
        if not y.flags.writeable:
            y = np.copy(y)
        logging.info("sklearn.linear_model.Lasso.fit: " + get_patch_message("daal"))
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
        if res is None:
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso.fit: " + get_patch_message("sklearn_after_daal"))
            res_new = super(ElasticNet, self).fit(X, y, sample_weight=sample_weight, check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new
        return res
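When ``check_input=False`` the method above explicitly requires a Fortran-contiguous ndarray; a small sketch of preparing data that satisfies that check (array contents are arbitrary):

import numpy as np

X_demo = np.random.rand(100, 5)          # C-contiguous by default
X_fortran = np.asfortranarray(X_demo, dtype=np.float64)
assert X_fortran.flags['F_CONTIGUOUS']   # passes the explicit check in fit()
y_demo = np.random.rand(100)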
Example #47
0
    def __init__(
        self,
        data,
        metric="euclidean",
        metric_kwds=None,
        n_neighbors=15,
        n_trees=8,
        leaf_size=15,
        pruning_level=0,
        tree_init=True,
        random_state=np.random,
        algorithm="standard",
        max_candidates=20,
        n_iters=10,
        delta=0.001,
        rho=0.5,
    ):

        self.n_trees = n_trees
        self.n_neighbors = n_neighbors
        self.metric = metric

        if metric_kwds is None:
            metric_kwds = dict()
        self.metric_kwds = metric_kwds

        self.leaf_size = leaf_size
        self.prune_level = pruning_level
        self.max_candidates = max_candidates
        self.n_iters = n_iters
        self.delta = delta
        self.rho = rho
        self.dim = data.shape[1]

        data = check_array(data).astype(np.float32)

        if not tree_init or n_trees == 0:
            self.tree_init = False
        else:
            self.tree_init = True

        self._dist_args = tuple(metric_kwds.values())

        self.random_state = check_random_state(random_state)

        self._raw_data = data.copy()

        if callable(metric):
            self._distance_func = metric
        elif metric in dist.named_distances:
            self._distance_func = dist.named_distances[metric]
        else:
            raise ValueError("Metric is neither callable nor a "
                             "recognised string")

        if metric in ("cosine", "correlation", "dice", "jaccard"):
            self._angular_trees = True
        else:
            self._angular_trees = False

        self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
            np.int64
        )

        indices = np.arange(data.shape[0])

        if self.tree_init:
            if self._angular_trees:
                self._rp_forest = [
                    flatten_tree(
                        make_angular_tree(
                            data, indices, self.rng_state, self.leaf_size
                        ),
                        self.leaf_size,
                    )
                    for i in range(n_trees)
                ]
            else:
                self._rp_forest = [
                    flatten_tree(
                        make_euclidean_tree(
                            data, indices, self.rng_state, self.leaf_size
                        ),
                        self.leaf_size,
                    )
                    for i in range(n_trees)
                ]

            leaf_array = np.vstack([tree.indices for tree in self._rp_forest])
        else:
            self._rp_forest = None
            leaf_array = np.array([[-1]])

        if algorithm == "standard" or leaf_array.shape[0] == 1:
            nn_descent = make_nn_descent(self._distance_func, self._dist_args)
            self._neighbor_graph = nn_descent(
                self._raw_data,
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                self.n_iters,
                self.delta,
                self.rho,
                True,
                leaf_array,
            )
        elif algorithm == "alternative":
            self._search = make_initialized_nnd_search(
                self._distance_func, self._dist_args
            )

            init_heaps = make_heap_initializer(self._distance_func, self._dist_args)
            graph_heap, search_heap = init_heaps(
                self._raw_data, self.n_neighbors, leaf_array
            )
            graph = lil_matrix((data.shape[0], data.shape[0]))
            graph.rows, graph.data = deheap_sort(graph_heap)
            graph = graph.maximum(graph.transpose())
            self._neighbor_graph = deheap_sort(
                self._search(
                    self._raw_data,
                    graph.indptr,
                    graph.indices,
                    search_heap,
                    self._raw_data,
                )
            )
        else:
            raise ValueError("Unknown algorithm selected")

        self._search_graph = lil_matrix(
            (data.shape[0], data.shape[0]), dtype=np.float32
        )
        self._search_graph.rows = self._neighbor_graph[0]
        self._search_graph.data = self._neighbor_graph[1]
        self._search_graph = self._search_graph.maximum(
            self._search_graph.transpose()
        ).tocsr()
        self._search_graph = prune(
            self._search_graph,
            prune_level=self.prune_level,
            n_neighbors=self.n_neighbors,
        )
        self._search_graph = (self._search_graph != 0).astype(np.int8)

        self._random_init, self._tree_init = make_initialisations(
            self._distance_func, self._dist_args
        )

        self._search = make_initialized_nnd_search(self._distance_func, self._dist_args)

        return
Example #48
0
def estimate_sigma(
    X: np.ndarray,
    subsample: Optional[int] = None,
    method: str = "median",
    percent: Optional[float] = 0.15,
    scale: float = 1.0,
    random_state: Optional[int] = None,
) -> float:
    """A function to provide a reasonable estimate of the sigma values
    for the RBF kernel using different methods. 

    Parameters
    ----------
    X : array, (n_samples, d_dimensions)
        The data matrix to be estimated.
    
    subsample : int, optional (default=None)
        If not None, the number of samples randomly drawn from X before
        the sigma estimate is computed.

    method : str, default: 'median'
        Method used to estimate the sigma for the RBF kernel matrix.
        Options (lowercase strings):
        * 'mean'
        * 'median'
        * 'silverman'
        * 'scott' - very common for density estimation
    percent : float, default=0.15
        Fraction of samples used to select which sorted pairwise distance
        (the k-th) is averaged for the 'mean' and 'median' methods.

    scale : float, default=1.0
        Option to scale the sigma chosen. Typically used with the
        median or mean method as they are data dependent.
    
    random_state : int, (default: None)
        controls the seed for the subsamples drawn to represent
        the data distribution
    
    Returns
    -------
    sigma : float
        The estimated sigma value
        
    Resources
    ---------
    - Original MATLAB function: https://goo.gl/xYoJce
    Information
    -----------
    Author : J. Emmanuel Johnson
    Email  : [email protected]
           : [email protected]
    Date   : 6 - July - 2018
    """
    X = check_array(X, ensure_2d=True)

    rng = check_random_state(random_state)

    # subsampling
    [n_samples, d_dimensions] = X.shape

    if subsample is not None:
        X = rng.permutation(X)[:subsample, :]

    if method == "mean":
        if percent is None:
            sigma = np.mean(pdist(X))
        else:
            kth_sample = int(percent * n_samples)
            sigma = np.mean(np.sort(squareform(pdist(X)))[:, kth_sample])

    elif method == "median":
        if percent is None:
            sigma = np.median(pdist(X))
        else:
            kth_sample = int(percent * n_samples)
            sigma = np.median(np.sort(squareform(pdist(X)))[:, kth_sample])

    elif method == "silverman":
        sigma = np.power(
            n_samples * (d_dimensions + 2.0) / 4.0, -1.0 / (d_dimensions + 4)
        )

    elif method == "scott":
        sigma = np.power(n_samples, -1.0 / (d_dimensions + 4))

    else:
        raise ValueError('Unrecognized mode "{}".'.format(method))

    # scale the sigma by a factor
    if scale is not None:
        sigma *= scale

    # return sigma
    return sigma
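The most common of the options above is the median heuristic; stripped of the percentile and subsampling logic it reduces to a one-liner over the pairwise distances (the data below is synthetic):

import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 5)
sigma_median = np.median(pdist(X_demo))   # median of all pairwise euclidean distances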
Example #49
0
    def _make_samples(self,
                      X,
                      y_type,
                      nn_data,
                      nn_num,
                      n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.rs_)
        seeds = np.random.randint(low=0,
                                  high=100 * len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.rs_)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            if self.rs_ is None:
                np.random.seed(seeds[i])
            else:
                np.random.seed(self.rs_)
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new
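The construction inside the loop above is the usual SMOTE-style interpolation between a sample and one of its nearest neighbours, written with the same sign convention as the code; a tiny numeric illustration:

import numpy as np

x_i = np.array([0.0, 0.0])         # original minority sample
x_nn = np.array([1.0, 2.0])        # one of its nearest neighbours
step = 0.25                        # uniform draw in (0, 1) in the code above
x_new = x_i - step * (x_i - x_nn)  # == x_i + step * (x_nn - x_i) -> [0.25, 0.5]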
Example #50
0
    def fit_transform(self, X, y, sample_weight=None):
        """ Fit and Transform data into modified features
        (before being passed to penalised regression step).
        If `linear_features=True` then this will be scaled linear features
        followed by the one-hot-encoding signifying which rules are "on".
        Otherwise this is just the one-hot-encoding signifying which rules are "on".
        
        Fitting process involves fitted bagged/boosted tree model to generate rules
        and then using these in a penalised logistic regression.
        
        X: pandas.DataFrame or numpy.ndarray
            Features
            
        y: pandas.Series or numpy.ndarray
            Target
            
        Returns
        -------
        
        sparse array
        """
        # Instantiate rule ensemble generator and set parameters
        if isinstance(self.base_estimator, XGBClassifier):
            self.base_estimator.set_params(n_estimators=self.n_estimators,
                                           silent=(self.verbose > 0),
                                           max_depth=self.max_depth,
                                           n_jobs=self.n_jobs)
        elif isinstance(self.base_estimator, RandomForestClassifier):
            warnings.warn(
                'This base_estimator implementation has not been tested in a while!'
            )
            self.base_estimator.set_params(n_estimators=self.n_estimators,
                                           verbose=self.verbose,
                                           max_depth=self.max_depth,
                                           n_jobs=self.n_jobs)
        elif isinstance(self.base_estimator, GradientBoostingClassifier):
            warnings.warn(
                'This base_estimator implementation has not been tested in a while!'
            )
            self.base_estimator.set_params(n_estimators=self.n_estimators,
                                           verbose=self.verbose,
                                           max_depth=self.max_depth,
                                           n_jobs=self.n_jobs)
        else:
            raise NotImplementedError

        # Name features
        if isinstance(X, DataFrame):
            self.features = X.columns.values
        else:
            self.features = ['f' + str(i) for i in range(X.shape[1])]

        # Check input
        X = check_array(X)

        # Generate and extract rules
        if not self.rand_tree_size:
            self.base_estimator.fit(X, y, sample_weight=sample_weight)
            if isinstance(self.base_estimator, XGBClassifier):
                self._rule_dump = self.base_estimator._Booster.get_dump()
        else:
            # TODO: work out how to incrementally train XGB
            raise NotImplementedError()

        if self.verbose > 0:
            print('fitting trees')

        # For each tree: get leaf numbers and map them to [0, num leaves]
        # before one-hot encoding them
        n_values = "auto"
        leaves_l = []
        for tree_i in self._rule_dump:
            leaves = [int(i) for i in re.findall(r'([0-9]+):leaf=', tree_i)]
            leaves_l.append(leaves)
        self._one_hot_encoder = LabelOneHotEncoder(leaves_l)

        if self.verbose > 0:
            print('setup encoding')

        # Scale and centre linear features
        X = self.ext_scaler.fit_transform(X)

        if self.linear_features:
            # Linear features must be scaled to have same weighting as an average rule
            self._scaler = FriedScaler(quantile=self.linear_feature_quantile)
            X_scale = self._scaler.fit_transform(X)
            X_transform = hstack([
                X_scale,
                self._one_hot_encoder.fit_transform(
                    self.base_estimator.apply(X).reshape(
                        -1, self.n_estimators))
            ])
        else:
            X_transform = self._one_hot_encoder.fit_transform(
                self.base_estimator.apply(X).reshape(-1, self.n_estimators))

        if self.verbose > 0:
            print('encoded')

        # Fit sparse linear model to rules (and optionally linear features)
        self.LR = LogisticRegression(C=self.C,
                                     penalty=self.penalty,
                                     class_weight=self.class_weight,
                                     warm_start=self.warm_start,
                                     solver='saga',
                                     verbose=self.verbose)
        self.LR.fit(X_transform, y, sample_weight=sample_weight)

        if self.verbose > 0:
            print('fitted')

        # Mask features with zero co-efficients
        # self.feature_mask_ = np.arange(self.LR.coef_.size)
        self.feature_mask_ = self.LR.coef_.nonzero()[1]

        self.coef_ = self.LR.coef_[0, self.feature_mask_]
        self.intercept_ = self.LR.intercept_
        self.get_feature_names()
        assert self.features_.size == self.feature_mask_.size
        return X_transform
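The leaf extraction above relies on the text dump produced by the booster's get_dump(), in which terminal nodes appear as ``<id>:leaf=<value>``; the hand-written dump string below just shows what the regular expression captures:

import re

tree_dump = "0:[f0<0.5] yes=1,no=2\n\t1:leaf=0.13\n\t2:leaf=-0.21\n"  # illustrative dump
leaf_ids = [int(i) for i in re.findall(r'([0-9]+):leaf=', tree_dump)]
# leaf_ids == [1, 2]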
Example #51
0
    def bootstrap(self, X_list, n_sampling, start_from_t=1):
        """Evaluate the statistical reliability of DAG based on the bootstrapping.

        Parameters
        ----------
        X_list : array-like, shape (X, ...)
            Longitudinal multiple datasets for training, where ``X`` is a dataset.
            The shape of ``X`` is (n_samples, n_features),
            where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        n_sampling : int
            Number of bootstrapping samples.

        Returns
        -------
        results : array-like, shape (BootstrapResult, ...)
            Returns the results of bootstrapping for multiple datasets.
        """
        # Check parameters
        if not isinstance(X_list, (list, np.ndarray)):
            raise ValueError('X_list must be an array-like.')

        if len(X_list) < 2:
            raise ValueError(
                'X_list must be a list containing at least two items')

        self._T = len(X_list)
        self._n = check_array(X_list[0]).shape[0]
        self._p = check_array(X_list[0]).shape[1]
        X_t = []
        for X in X_list:
            X = check_array(X)
            if X.shape != (self._n, self._p):
                raise ValueError('X_list must be a list with the same shape')
            X_t.append(X)

        # Bootstrapping
        adjacency_matrices = np.zeros(
            (n_sampling, self._T, 1 + self._n_lags, self._p, self._p))
        total_effects = np.zeros(
            (n_sampling, self._T * self._p, self._T * self._p))
        for i in range(n_sampling):
            resampled_X_t = np.empty((self._T, self._n, self._p))
            indices = np.random.randint(0, self._n, size=(self._n, ))
            for t in range(self._T):
                resampled_X_t[t] = X_t[t][indices, :]

            self.fit(resampled_X_t)
            adjacency_matrices[i] = self._adjacency_matrices

            # Calculate total effects
            for from_t in range(start_from_t, self._T):
                for c, from_ in enumerate(self._causal_orders[from_t]):
                    to_t = from_t
                    for to in self._causal_orders[from_t][c + 1:]:
                        total_effects[i, to_t * self._p + to,
                                      from_t * self._p +
                                      from_] = self.estimate_total_effect(
                                          X_t, from_t, from_, to_t, to)

                    for to_t in range(from_t + 1, self._T):
                        for to in self._causal_orders[to_t]:
                            total_effects[i, to_t * self._p + to,
                                          from_t * self._p +
                                          from_] = self.estimate_total_effect(
                                              X_t, from_t, from_, to_t, to)

        return LongitudinalBootstrapResult(self._T, adjacency_matrices,
                                           total_effects)
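Each bootstrap replicate above resamples the rows of all time points jointly with the same indices; the core resampling step in isolation (shapes are illustrative):

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_features, T = 100, 4, 3
X_t = [rng.randn(n_samples, n_features) for _ in range(T)]

indices = rng.randint(0, n_samples, size=n_samples)      # rows drawn with replacement
resampled_X_t = np.stack([X[indices, :] for X in X_t])   # same rows across all T datasets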
Example #52
0
def dtw_region(x,
               y,
               dist='square',
               region=None,
               return_cost=False,
               return_accumulated=False,
               return_path=False):
    """Dynamic Time Warping (DTW) distance with a constraint region.

    Parameters
    ----------
    x : array-like, shape = (n_timestamps_1,)
        First array.

    y : array-like, shape = (n_timestamps_2,)
        Second array.

    dist : 'square', 'absolute' or callable (default = 'square')
        Distance used. If 'square', the squared difference is used.
        If 'absolute', the absolute difference is used. If callable,
        it must be a function with a numba.njit() decorator that takes
        as input two numbers (two arguments) and returns a number.

    region : None or array-like, shape = (2, n_timestamps_1)
        Constraint region. If None, no constraint region is used. Otherwise,
        the first row consists of the starting indices (included) and the
        second row consists of the ending indices (excluded) of the valid rows
        for each column.

    return_cost : bool (default = False)
        If True, the cost matrix is returned.

    return_accumulated : bool (default = False)
        If True, the accumulated cost matrix is returned.

    return_path : bool (default = False)
        If True, the optimal path is returned.

    Returns
    -------
    dtw_dist : float
        The DTW distance between the two arrays.

    cost_mat : array, shape = (n_timestamps_1, n_timestamps_2)
        Cost matrix. Only returned if ``return_cost=True``.

    acc_cost_mat : array, shape = (n_timestamps_1, n_timestamps_2)
        Accumulated cost matrix. Only returned if ``return_accumulated=True``.

    path : array, shape = (2, path_length)
        The optimal path along the cost matrix. The first row consists
        of the indices of the optimal path for x while the second row
        consists of the indices of the optimal path for y. Only returned
        if ``return_path=True``.

    Examples
    --------
    >>> from pyts.metrics import dtw_region
    >>> x = [0, 1, 1]
    >>> y = [2, 0, 1]
    >>> region = [[0, 1, 1], [2, 2, 3]]
    >>> dtw_region(x, y, region=region)
    2.23...

    """
    x, y, n_timestamps_1, n_timestamps_2 = _check_input_dtw(x, y)

    if region is not None:
        region = check_array(region, dtype='int64')
        if region.shape != (2, n_timestamps_1):
            raise ValueError("If 'region' is not None, it must be array-like "
                             "with shape (2, n_timestamps_1).")

    cost_mat = cost_matrix(x, y, dist=dist, region=region)
    acc_cost_mat = accumulated_cost_matrix(cost_mat)
    dtw_dist = acc_cost_mat[-1, -1]
    if dist == 'square':
        dtw_dist = sqrt(dtw_dist)

    res = _return_results(dtw_dist, cost_mat, acc_cost_mat, return_cost,
                          return_accumulated, return_path)
    return res
Example #53
0
    def partial_fit(self, X, y, monitor=None, sample_weight=None, **kwargs):
        """Fit the model on a batch of training data.

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values
        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator, and a dictionary with
            {'loss': loss_value} representing the loss calculated by the
            objective function at this iteration.
            If the callable returns True the fitting procedure is stopped.
            The monitor can be used for various things such as computing
            held-out estimates, early stopping, model introspection,
            and snapshotting.
        sample_weight : numpy array of shape [n_samples,]
            Per-sample weights. Re-scale the loss per sample.
            Higher weights force the estimator to put more emphasis
            on these samples. Sample weights are normalized per-batch.

        Returns
        -------
        self : returns an instance of self.
        """

        X, y = self._check_inputs(X, y)
        assert self.batch_size > 0, "batch_size <= 0"

        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)

        # Initialize the model if it hasn't been already by a previous call.
        if self._is_fitted:
            y = self._transform_targets(y)
        else:
            self._random_state = check_random_state(self.random_state)
            self._fit_targets(y, **kwargs)
            y = self._transform_targets(y)

            self.is_sparse_ = sp.issparse(X)
            self.input_layer_sz_ = X.shape[1]

            # Set which layer transform function points to
            if self.transform_layer_index is None:
                self._transform_layer_index = len(self.hidden_units) - 1
            else:
                self._transform_layer_index = self.transform_layer_index

            if (self._transform_layer_index < -1 or
                    self._transform_layer_index >= len(self.hidden_units)):
                raise ValueError(
                    "`transform_layer_index` must be in the range "
                    "[-1, len(hidden_units)-1]!")

            # Instantiate the graph.  TensorFlow seems easier to use by just
            # adding to the default graph, and as_default lets you temporarily
            # set a graph to be treated as the default graph.
            self.graph_ = Graph()
            with self.graph_.as_default():
                tf_random_seed.set_random_seed(
                    self._random_state.randint(0, 10000000))

                tf.get_variable_scope().set_initializer(
                    tf.contrib.layers.xavier_initializer())

                self._build_tf_graph()

                # Train model parameters.
                self._session.run(tf.global_variables_initializer())

            # Set an attributed to mark this as at least partially fitted.
            self._is_fitted = True

        # Train the model with the given data.
        with self.graph_.as_default():
            n_examples = X.shape[0]
            indices = np.arange(n_examples)

            for epoch in range(self.n_epochs):
                self._random_state.shuffle(indices)
                for start_idx in range(0, n_examples, self.batch_size):
                    batch_ind = indices[start_idx:start_idx + self.batch_size]

                    if sample_weight is None:
                        batch_sample_weight = None
                    else:
                        batch_sample_weight = sample_weight[batch_ind]

                    feed_dict = self._make_feed_dict(
                        X[batch_ind],
                        y[batch_ind],
                        sample_weight=batch_sample_weight)
                    obj_val, _ = self._session.run(
                        [self._obj_func, self._train_step],
                        feed_dict=feed_dict)
                    _LOGGER.debug("objective: %.4f, epoch: %d, idx: %d",
                                  obj_val, epoch, start_idx)

                _LOGGER.info("objective: %.4f, epoch: %d, idx: %d",
                             obj_val, epoch, start_idx)

                if monitor:
                    stop_early = monitor(epoch, self, {'loss': obj_val})
                    if stop_early:
                        _LOGGER.info(
                            "stopping early due to monitor function.")
                        return self

        return self
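The training loop above is plain mini-batch iteration over shuffled indices; the index bookkeeping on its own looks like this (sizes are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
n_examples, batch_size = 10, 4
indices = np.arange(n_examples)
rng.shuffle(indices)
batches = [indices[start:start + batch_size]
           for start in range(0, n_examples, batch_size)]
# e.g. three batches of sizes 4, 4 and 2 covering every example exactly once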
Example #54
0
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        X = check_array(X)
        self._set_n_classes(y)
        self.train_history = defaultdict(list)
        names = locals()
        epochs = self.stop_epochs * 3
        stop = 0
        latent_size = X.shape[1]
        data_size = X.shape[0]
        # Create discriminator
        self.discriminator = create_discriminator(latent_size, data_size)
        self.discriminator.compile(optimizer=SGD(lr=self.lr_d,
                                                 decay=self.decay,
                                                 momentum=self.momentum),
                                   loss='binary_crossentropy')

        # Create k combine models
        for i in range(self.k):
            names['sub_generator' + str(i)] = create_generator(latent_size)
            latent = Input(shape=(latent_size, ))
            names['fake' + str(i)] = names['sub_generator' + str(i)](latent)
            self.discriminator.trainable = False
            names['fake' + str(i)] = self.discriminator(names['fake' + str(i)])
            names['combine_model' + str(i)] = Model(latent,
                                                    names['fake' + str(i)])
            names['combine_model' + str(i)].compile(optimizer=SGD(
                lr=self.lr_g, decay=self.decay, momentum=self.momentum),
                                                    loss='binary_crossentropy')

        # Start iteration
        for epoch in range(epochs):
            if self.verbose:
                print('Epoch {} of {}'.format(epoch + 1, epochs))
            batch_size = min(500, data_size)
            num_batches = int(data_size / batch_size)

            for index in range(num_batches):
                if self.verbose:
                    print('\nTesting for epoch {} index {}:'.format(
                        epoch + 1, index + 1))

                # Generate noise
                noise_size = batch_size
                noise = np.random.uniform(0, 1, (int(noise_size), latent_size))

                # Get training data
                data_batch = X[index * batch_size:(index + 1) * batch_size]

                # Generate potential outliers
                block = ((1 + self.k) * self.k) // 2
                for i in range(self.k):
                    if i != (self.k - 1):
                        noise_start = int(
                            (((self.k + (self.k - i + 1)) * i) / 2) *
                            (noise_size // block))
                        noise_end = int(
                            (((self.k + (self.k - i)) *
                              (i + 1)) / 2) * (noise_size // block))
                        names['noise' + str(i)] = noise[noise_start:noise_end]
                        names['generated_data' +
                              str(i)] = names['sub_generator' +
                                              str(i)].predict(names['noise' +
                                                                    str(i)],
                                                              verbose=0)
                    else:
                        noise_start = int(
                            (((self.k + (self.k - i + 1)) * i) / 2) *
                            (noise_size // block))
                        names['noise' + str(i)] = noise[noise_start:noise_size]
                        names['generated_data' +
                              str(i)] = names['sub_generator' +
                                              str(i)].predict(names['noise' +
                                                                    str(i)],
                                                              verbose=0)

                # Concatenate real data to generated data
                for i in range(self.k):
                    if i == 0:
                        x = np.concatenate(
                            (data_batch, names['generated_data' + str(i)]))
                    else:
                        x = np.concatenate(
                            (x, names['generated_data' + str(i)]))
                y = np.array([1] * batch_size + [0] * int(noise_size))

                # Train discriminator
                discriminator_loss = self.discriminator.train_on_batch(x, y)
                self.train_history['discriminator_loss'].append(
                    discriminator_loss)

                # Get the target value of sub-generator
                pred_scores = self.discriminator.predict(X).ravel()

                for i in range(self.k):
                    names['T' + str(i)] = np.percentile(
                        pred_scores, i / self.k * 100)
                    names['trick' + str(i)] = np.array(
                        [float(names['T' + str(i)])] * noise_size)

                # Train generator
                noise = np.random.uniform(0, 1, (int(noise_size), latent_size))
                if stop == 0:
                    for i in range(self.k):
                        names['sub_generator' + str(i) + '_loss'] = \
                            names['combine_model' + str(i)].train_on_batch(
                                noise, names['trick' + str(i)])
                        self.train_history['sub_generator{}_loss'.format(
                            i)].append(names['sub_generator' + str(i) +
                                             '_loss'])
                else:
                    for i in range(self.k):
                        names['sub_generator' + str(i) +
                              '_loss'] = names['combine_model' +
                                               str(i)].evaluate(
                                                   noise,
                                                   names['trick' + str(i)])
                        self.train_history['sub_generator{}_loss'.format(
                            i)].append(names['sub_generator' + str(i) +
                                             '_loss'])

                generator_loss = 0
                for i in range(self.k):
                    generator_loss = generator_loss + names['sub_generator' +
                                                            str(i) + '_loss']
                generator_loss = generator_loss / self.k
                self.train_history['generator_loss'].append(generator_loss)

                # Stop training generator
                if epoch + 1 > self.stop_epochs:
                    stop = 1

        # Detection result
        self.decision_scores_ = self.discriminator.predict(X).ravel()
        self._process_decision_scores()
        return self
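# Illustrative addition (not part of the original snippet).  The fit() above
# stores the discriminator's scores on X in ``decision_scores_``; its least
# obvious step is how one batch of noise is partitioned among the k
# sub-generators.  The stand-alone toy below reproduces that index arithmetic:
# generator i receives a slice of length (k - i) * (noise_size // block), so
# earlier generators get larger shares.
import numpy as np

k = 3
noise_size = 12
block = ((1 + k) * k) // 2  # 6, so noise_size // block == 2

for i in range(k):
    noise_start = int((((k + (k - i + 1)) * i) / 2) * (noise_size // block))
    if i != (k - 1):
        noise_end = int((((k + (k - i)) * (i + 1)) / 2) * (noise_size // block))
    else:
        noise_end = noise_size
    print('sub_generator%d gets noise[%d:%d]' % (i, noise_start, noise_end))
# sub_generator0 gets noise[0:6]
# sub_generator1 gets noise[6:10]
# sub_generator2 gets noise[10:12]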
Example #55
0
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.
    """
    estimator = pipeline.estimator
    if estimator.model_family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if estimator.model_family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Because of this issue: https://github.com/slundberg/shap/issues/1215
        if estimator.model_family == ModelFamily.XGBOOST:
            raise NotImplementedError(
                "SHAP values cannot currently be computed for xgboost models.")
        if estimator.model_family == ModelFamily.CATBOOST and pipeline.problem_type == ProblemTypes.MULTICLASS:
            # Will randomly segfault
            raise NotImplementedError(
                "SHAP values cannot currently be computed for catboost models for multiclass problems."
            )
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if estimator.model_family == ModelFamily.CATBOOST and pipeline.problem_type == ProblemTypes.BINARY:
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = shap.sample(training_data, 100)
        sampled_training_data_features = check_array(
            sampled_training_data_features)

        if pipeline.problem_type == ProblemTypes.REGRESSION:
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function,
                                             sampled_training_data_features,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        mappings = []
        for class_shap_values in shap_values:
            mappings.append(
                _create_dictionary(class_shap_values, feature_names))
        return mappings
    # regression problem
    elif isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    else:
        raise ValueError(
            f"Unknown shap_values datatype {str(type(shap_values))}!")
Example #56
0
    def fit(self, X, lengths=None):

        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
        for iter in range(self.n_iter):
            print('iteration: {}'.format(iter))
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            tt = 0
            path_list = list()

            for i, j in iter_from_X_lengths(X, lengths):
                logprob, state_sequence = self.decode(X[i:j],
                                                      algorithm="viterbi")

                curr_logprob += logprob

                epsilon = np.zeros((state_sequence.shape[0] - 1,
                                    self.n_components, self.n_components))
                gamma = np.zeros((state_sequence.shape[0], self.n_components))

                for t in range(state_sequence.shape[0] - 1):
                    epsilon[t, state_sequence[t], state_sequence[t + 1]] = 1

                # Use a distinct inner loop variable so the segment index
                # ``i`` from iter_from_X_lengths (used below in X[i:j]) is
                # not overwritten.
                for t in range(state_sequence.shape[0]):
                    for comp in range(self.n_components):
                        if t != (state_sequence.shape[0] - 1):
                            gamma[t, comp] = np.sum(epsilon[t, comp])
                        else:
                            gamma[t, comp] = gamma[t - 1, comp]

                path_list.append(state_sequence)
                self._accumulate_sufficient_statistics(stats, X[i:j], epsilon,
                                                       gamma, state_sequence,
                                                       None)
                tt += 1

            print('average loss: {}'.format(curr_logprob / tt))

            # ``fast_update`` and ``update_dnn`` are assumed to be instance
            # attributes; they are undefined as local names in this snippet.
            if not self.fast_update:
                stats['start'] /= tt
                stats['trans'] /= tt

                self._do_mstep(stats)
                if self.update_dnn:
                    temp_path = np.zeros((0, 1))
                    for k, (i, j) in enumerate(iter_from_X_lengths(X,
                                                                   lengths)):
                        temp_path = np.vstack(
                            [temp_path,
                             np.array(path_list[k]).reshape(-1, 1)])
                    self.mlp.train(X, temp_path, 20)

                acoustic_model = np.zeros(self.n_components)
                for i, j in iter_from_X_lengths(X, lengths):
                    logprob, state_sequence = self.decode(X[i:j],
                                                          algorithm="viterbi")
                    for state in state_sequence:
                        acoustic_model[state] += 1
                self.acoustic_model = acoustic_model / np.sum(acoustic_model)

            self.monitor_.report(curr_logprob)
            if self.monitor_.iter == self.monitor_.n_iter or \
                    (len(self.monitor_.history) == 2 and
                     abs(self.monitor_.history[1] - self.monitor_.history[0]) < self.monitor_.tol * abs(
                                self.monitor_.history[1])):
                break

        print('----------------------------------------------')
        return self
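# Illustrative addition (not part of the original snippet).  Toy example of
# the epsilon/gamma construction used in the Viterbi-training loop above:
# epsilon holds one-hot transition indicators per time step, and gamma holds
# hard (0/1) state occupancies, with the last frame copying the previous one
# exactly as in the loop.
import numpy as np

n_components = 3
state_sequence = np.array([0, 0, 2, 1])

epsilon = np.zeros((len(state_sequence) - 1, n_components, n_components))
for t in range(len(state_sequence) - 1):
    epsilon[t, state_sequence[t], state_sequence[t + 1]] = 1

gamma = np.zeros((len(state_sequence), n_components))
for t in range(len(state_sequence)):
    if t != len(state_sequence) - 1:
        gamma[t] = epsilon[t].sum(axis=1)
    else:
        gamma[t] = gamma[t - 1]

print(epsilon.sum(axis=0))  # transition counts (from-state x to-state)
print(gamma)                # per-frame state occupancy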
Example #57
0
 def find_bots(self, priors):
     print "Getting all user info..."
     self.users_to_query = set()
     followers_set = set(self.followers)
     print "Number of followers: " + str(len(self.followers))
     follower_counts = Counter(self.followers).most_common()
     # should fix this to be a more precise measure
     size_to_keep = int(.15*len(self.followers))
     connectedness_threshold = floor(0.3*self.n)
     tmp_followers = [f[0] for f in follower_counts if f[1] >= connectedness_threshold]
     if len(tmp_followers) < size_to_keep:
         tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1])
     followers_set = set(tmp_followers)
     print "Number of connected followers: " + str(len(followers_set))
     for follower in followers_set:
         user_info = None
         follower = str(follower)
         if follower not in self.users and follower not in self.ignore_users:
             self.cur.execute('SELECT suspended, deleted, other_error, user_info_json FROM followers WHERE user_id = %s', (follower,))
             record = self.cur.fetchone()
             if record:
                 if record[0] or record[1] or record[2]:
                     self.ignore_users.add(follower)
                     # print "User is suspended or deleted"
                     continue
                 if record[3]:
                     # print "Already have profile information for user number " + follower
                     self.user_info[follower] = ast.literal_eval(record[3])
                     continue
             self.users_to_query.add(follower)
     get_user_info(self)
     print "Getting all timeline info and extracting features"
     for follower in followers_set:
         timeline = None
         follower = str(follower)
         if follower not in self.users and follower not in self.ignore_users:
             self.users.add(follower)
             self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
             record = self.cur.fetchone()
             if record:
                 if record[0] or record[1] or record[2]:
                     self.ignore_users.add(follower)
                     # print "User is suspended or deleted"
                     continue
                 if record[3]:
                     # print "Already have timeline information for user number " + follower
                     # Have to read in file to get timeline info
                     timeline = get_timeline_from_file(self, follower)
                 else:
                     timeline = get_user_timeline(self, follower)
             else:
                 timeline = get_user_timeline(self, follower)
             if timeline and self.user_info.get(follower) and len(timeline) > 50:
                 gf = GetFeatures(follower, self.user_info[follower], timeline)
                 try:
                     gf.user_features()
                     gf.collect_tweets()
                     gf.content_features()
                     gf.temporal_features()
                 except Exception as e:
                     print "ERROR GETTING FEATURES"
                     print e
                     print follower
                     print self.user_info[follower]
                 # need to incorporate other network features
                 #gf.features['num_shared_edges'] = follower_counts[user]
                 #cself.user_features[user] = gf.features
                 self.current_level_users.append(follower)
                 self.features_list.append(gf.features)
     # Axis=0 should be vertical
     len_priors = len(priors)
     current_features = priors
     current_features.extend(self.features_list)
     print "Performing anomaly detection"
     #json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': '))
     X = self.vec.fit_transform(current_features).toarray()
     current_features = {}
     X_norm = normalize(X)
     #print np.any(np.isnan(X))
     #print np.all(np.isfinite(X))
     print(X.shape)
     # X = np.stack([current_features, priors], axis=0) Every round will find outliers, how do we stop exploring?
     clf = LocalOutlierFactor(n_neighbors=20)
     clf.fit(X)
     check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
     if X is not None:
         X = check_array(X, accept_sparse='csr')
         y_pred = clf._decision_function(X)
     else:
         y_pred = clf.negative_outlier_factor_
     #y_pred = clf.fit_predict(X)
     y_pred_new = y_pred[len_priors:]
     # Do anomaly detection and set connected followers to certain outliers
     # this line is a stand-in
     users_scores = zip(self.current_level_users, y_pred_new)
     connected_followers = [u[0] for u in users_scores if u[1] <= clf.threshold_]
     #How do I add back in the outliers to the anomaly detection? Mueen said not to so I will leave for now
     self.level += 1
     # Add highly connected followers to the clique and to_check
     for follower in connected_followers:
         self.clique.add((follower, self.level))
         self.to_check.add(follower)
     print(self.clique)
     self.n = float(len(self.clique))
     print "Current size of cluster: " + str(self.n)
Example #58
0
    def update(self, X, y=None, **fit_params):
        if "relations" not in fit_params:
            raise ValueError(
                "Aligned UMAP requires relations between data to be "
                "specified")

        new_dict_relations = fit_params["relations"]
        X = check_array(X)

        self.__dict__ = set_aligned_params(fit_params, self.__dict__,
                                           self.n_models_)
        self.n_models_ += 1

        new_mapper = UMAP(
            n_neighbors=get_nth_item_or_val(self.n_neighbors, self.n_models_),
            min_dist=get_nth_item_or_val(self.min_dist, self.n_models_),
            n_epochs=get_nth_item_or_val(self.n_epochs, self.n_models_),
            repulsion_strength=get_nth_item_or_val(self.repulsion_strength,
                                                   self.n_models_),
            learning_rate=get_nth_item_or_val(self.learning_rate,
                                              self.n_models_),
            spread=get_nth_item_or_val(self.spread, self.n_models_),
            negative_sample_rate=get_nth_item_or_val(self.negative_sample_rate,
                                                     self.n_models_),
            local_connectivity=get_nth_item_or_val(self.local_connectivity,
                                                   self.n_models_),
            set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio,
                                                 self.n_models_),
            unique=get_nth_item_or_val(self.unique, self.n_models_),
            n_components=self.n_components,
        ).fit(X)

        self.mappers_ += [new_mapper]

        # TODO: We can likely make this more efficient and not recompute each time
        self.dict_relations_ += [invert_dict(new_dict_relations)]

        if self.n_epochs is None:
            n_epochs = 200
        else:
            n_epochs = self.n_epochs

        indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
        indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
        heads = numba.typed.List.empty_list(numba.types.int32[::1])
        tails = numba.typed.List.empty_list(numba.types.int32[::1])
        epochs_per_samples = numba.typed.List.empty_list(
            numba.types.float64[::1])

        for i, mapper in enumerate(self.mappers_):
            indptr_list.append(mapper.graph_.indptr)
            indices_list.append(mapper.graph_.indices)
            heads.append(mapper.graph_.tocoo().row)
            tails.append(mapper.graph_.tocoo().col)
            if i == len(self.mappers_) - 1:
                epochs_per_samples.append(
                    make_epochs_per_sample(mapper.graph_.tocoo().data,
                                           n_epochs))
            else:
                epochs_per_samples.append(
                    np.full(mapper.embedding_.shape[0],
                            n_epochs + 1,
                            dtype=np.float64))

        new_relations = expand_relations(self.dict_relations_)
        new_regularisation_weights = build_neighborhood_similarities(
            indptr_list,
            indices_list,
            new_relations,
        )

        new_embedding = init_from_existing(self.embeddings_[-1],
                                           new_mapper.graph_,
                                           new_dict_relations)

        random_state = check_random_state(self.random_state)
        rng_state = random_state.randint(INT32_MIN, INT32_MAX,
                                         3).astype(np.int64)

        self.embeddings_.append(new_embedding)

        self.embeddings_ = optimize_layout_aligned_euclidean(
            self.embeddings_,
            self.embeddings_,
            heads,
            tails,
            n_epochs,
            epochs_per_samples,
            new_regularisation_weights,
            new_relations,
            rng_state,
            lambda_=self.alignment_regularisation,
        )
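# Illustrative addition (not part of the original snippet).  Hedged usage
# sketch for the update() method above, assuming the umap-learn AlignedUMAP
# API: fit on an initial list of data slices with a list of relation dicts,
# then append a new slice via update(), passing the relations between the
# previous slice and the new one.
import numpy as np
from umap import AlignedUMAP

rng = np.random.RandomState(42)
slice_1 = rng.normal(size=(100, 8))
slice_2 = rng.normal(size=(100, 8))
slice_3 = rng.normal(size=(100, 8))

# relations map row indices of one slice to row indices of the next slice.
relations = {i: i for i in range(100)}

mapper = AlignedUMAP(n_neighbors=10).fit([slice_1, slice_2],
                                         relations=[relations])
mapper.update(slice_3, relations=relations)

print(mapper.embeddings_[-1].shape)  # embedding of the newest slice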
Example #59
0
def test_check_array():
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    with pytest.raises(TypeError):
        check_array(X_csr)

    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got 1D array instead"):
        check_array([0, 1, 2], ensure_2d=True)

    # ensure_2d=True with scalar array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got scalar array instead"):
        check_array(10, ensure_2d=True)

    # don't allow ndim > 3
    X_ndim = np.arange(8).reshape(2, 2, 2)
    with pytest.raises(ValueError):
        check_array(X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and
                    X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X, dtype=dtype,
                                    accept_sparse=accept_sparse, copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            # XXX unreached code as of v0.22
            message = str(w[0].message)
            messages = ["object dtype is not supported by sparse matrices",
                        "Can't check dok sparse matrix for nan or inf."]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    with pytest.raises(ValueError):
        check_array(X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise

    # convert weird stuff to arrays
    X_no_array = _NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)
Example #60
0
def _fit(
    X,
    alpha=1e-2,
    gamma=1e-3,
    tol=1e-3,
    max_iter=1000,
    verbose=0,
    return_history=True,
    compute_objective=True,
    warm_start=None,
    return_n_iter=False,
    adjust_gamma=False,
    A=None,
    T=0,
    rho=1,
    update_gamma=0.5,
    line_search=False,
):
    n, d = X.shape
    if warm_start is None:
        theta = np.zeros((d, d))
    else:
        theta = check_array(warm_start)

    thetas = [theta]
    theta_new = theta.copy()
    checks = []
    for iter_ in range(max_iter):
        theta_old = thetas[-1]
        if not line_search:
            grad = _gradient_ising(X, theta, n, A, rho, T)
            theta_new = theta - gamma * grad
            theta = (theta_new + theta_new.T) / 2
            theta = soft_thresholding_od(theta, alpha * gamma)
        else:
            while True:
                grad = _gradient_ising(X, theta, n, A, rho, T)
                theta_new = theta - gamma * grad
                theta = (theta_new + theta_new.T) / 2
                theta = soft_thresholding_od(theta, alpha * gamma)
                print(theta)
                loss_new = loss(X, theta)
                loss_old = loss(X, theta_old)
                # Line search
                diff_theta2 = np.linalg.norm(theta_old - theta) ** 2
                grad_diff = np.trace(grad.dot(theta_old - theta))
                diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))

                if loss_new > diff or np.isinf(loss_new) or np.isnan(loss_new):
                    gamma = update_gamma * gamma
                    theta = theta_old - gamma * grad
                    theta = soft_thresholding_od(theta, alpha * gamma)
                    loss_new = loss(X, theta)
                    diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))
                else:
                    break
        thetas.append(theta)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            check = convergence(
                iter=iter_,
                obj=objective(X, theta, alpha),
                iter_norm=np.linalg.norm(thetas[-2] - thetas[-1]),
                iter_r_norm=(np.linalg.norm(thetas[-2] - thetas[-1]) / np.linalg.norm(thetas[-1])),
            )
        checks.append(check)
        # if adjust_gamma: # TODO multiply or divide
        if verbose:
            print("Iter: %d, objective: %.4f, iter_norm %.4f" % (check[0], check[1], check[2]))

        if np.abs(check[2]) < tol:
            break

    return_list = [thetas[-1]]
    if return_history:
        return_list.append(thetas)
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iter_)

    return return_list
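# Illustrative addition (not part of the original snippet).
# ``soft_thresholding_od`` is the proximal operator applied after each
# gradient step above but is not shown.  A sketch under the assumption that
# it soft-thresholds the off-diagonal entries of a square matrix and leaves
# the diagonal untouched:
import numpy as np


def soft_thresholding_od_sketch(theta, lamda):
    # Assumed behaviour, not the library's implementation.
    shrunk = np.sign(theta) * np.maximum(np.abs(theta) - lamda, 0.0)
    np.fill_diagonal(shrunk, np.diag(theta))
    return shrunk


theta = np.array([[2.0, 0.3, -0.1],
                  [0.3, 1.5, 0.05],
                  [-0.1, 0.05, 1.0]])
print(soft_thresholding_od_sketch(theta, 0.2))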