Ejemplo n.º 1
0
def precision_n_scores(y, y_pred, n=None):
    """Compute precision @ rank n from raw outlier scores.

    The raw scores are first binarized with ``get_label_n`` and the
    resulting labels are then scored against the ground truth with
    ``precision_score``.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        The number of outliers. Forwarded unchanged to ``get_label_n``.

    Returns
    -------
    precision_at_rank_n : float
        Precision at rank n score.

    """
    # binarize the raw decision scores
    binary_labels = get_label_n(y, y_pred, n)

    # both inputs are flattened to 1d before scoring
    return precision_score(column_or_1d(y), column_or_1d(binary_labels))
Ejemplo n.º 2
0
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal helper to check that input data shapes are consistent.

    Each (X, y) pair is validated with ``check_X_y``, both prediction
    vectors are flattened with ``column_or_1d``, the length of every
    prediction vector is compared with the matching ground truth, and
    finally the feature counts of the two sample matrices are compared.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
        The validated inputs, in the same order as the parameters. The
        prediction vectors are returned as 1d arrays.

    Raises
    ------
    ValueError
        If ``X_train`` and ``X_test`` have a different number of columns.
    """
    # validate every (samples, targets) pair
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    # predictions are flattened to 1d
    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    # predictions must line up with their ground truth
    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    if X_train.shape[1] == X_test.shape[1]:
        return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred

    raise ValueError("X_train {0} and X_test {1} have different number "
                     "of features.".format(X_train.shape, X_test.shape))
Ejemplo n.º 3
0
def savings_score(y_true, y_pred, cost_mat):
    """Savings score.

    Compares the cost of ``y_pred`` with the cost of the cheaper of two
    naive classifiers (predict all zeros, predict all ones). Every cost
    is obtained from ``cost_loss`` and the result is
    ``1 - cost(y_pred) / cost(naive)``.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_pred : array-like or label indicator matrix
        Predicted labels, as returned by a classifier.

    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.

    Returns
    -------
    score : float
        Savings of a using y_pred on y_true with cost-matrix cost-mat

        The best performance is 1.

    References
    ----------
    .. [1] A. Correa Bahnsen, A. Stojanovic, D.Aouada, B, Ottersten,
           `"Improving Credit Card Fraud Detection with Calibrated Probabilities" <http://albahnsen.com/files/%20Improving%20Credit%20Card%20Fraud%20Detection%20by%20using%20Calibrated%20Probabilities%20-%20Publish.pdf>`__, in Proceedings of the fourteenth SIAM International Conference on Data Mining,
           677-685, 2014.

    See also
    --------
    cost_loss

    Examples
    --------
    >>> import numpy as np
    >>> from costcla.metrics import savings_score, cost_loss
    >>> y_pred = [0, 1, 0, 0]
    >>> y_true = [0, 1, 1, 0]
    >>> cost_mat = np.array([[4, 1, 0, 0], [1, 3, 0, 0], [2, 3, 0, 0], [2, 1, 0, 0]])
    >>> savings_score(y_true, y_pred, cost_mat)
    0.5
    """
    # TODO: Check consistency of cost_mat
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    n_samples = len(y_true)

    # cost of the two naive models: "always 0" and "always 1"
    all_negative_cost = cost_loss(y_true, np.zeros(n_samples), cost_mat)
    all_positive_cost = cost_loss(y_true, np.ones(n_samples), cost_mat)
    cost_base = min(all_negative_cost, all_positive_cost)

    model_cost = cost_loss(y_true, y_pred, cost_mat)
    return 1.0 - model_cost / cost_base
Ejemplo n.º 4
0
def evaluate_print(clf_name, y, y_pred):
    """Print the ROC AUC and precision @ rank n of a detector.

    Both metrics are rounded to 4 decimals before being printed on a
    single line prefixed with the detector name.

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    """
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    roc = np.round(roc_auc_score(y, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y, y_pred), decimals=4)

    template = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'
    print(template.format(clf_name=clf_name, roc=roc, prn=prn))
Ejemplo n.º 5
0
    def _sigmoid_calibration(self,df, y, sample_weight=None):
        """Probability Calibration with sigmoid method (Platt 2000)

        Fits the two parameters of ``1 / (1 + exp(a * df + b))`` by
        minimizing, with BFGS, the (optionally weighted) cross-entropy
        between that sigmoid and smoothed targets derived from ``y``.

        Parameters
        ----------
        df : ndarray, shape (n_samples,)
            The decision function or predict proba for the samples.
        y : ndarray, shape (n_samples,)
            The targets. Values ``> 0`` are treated as the positive class,
            values ``<= 0`` as the negative class.
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        a : float
            The slope.
        b : float
            The intercept.

        References
        ----------
        Platt, "Probabilistic Outputs for Support Vector Machines"
        """
        df = column_or_1d(df)
        y = column_or_1d(y)

        F = df  # F follows Platt's notations in the Reference Paper
        # Smallest positive normal float, added inside the logs to avoid
        # log(0). The builtin ``float`` is used because the ``np.float``
        # alias was removed from NumPy.
        tiny = np.finfo(float).tiny

        # Bayesian priors (see Platt end of section 2.2 in the Reference Paper)
        prior0 = float(np.sum(y <= 0))
        prior1 = y.shape[0] - prior0
        # Smoothed targets: T for the positive class, T1 = 1 - T
        T = np.zeros(y.shape)
        T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
        T[y <= 0] = 1. / (prior0 + 2.)
        T1 = 1. - T

        def objective(AB):
            # From Platt (beginning of Section 2.2 in the Reference Paper)
            E = np.exp(AB[0] * F + AB[1])
            P = 1. / (1. + E)
            loss = -(T * np.log(P + tiny) + T1 * np.log(1. - P + tiny))
            if sample_weight is not None:
                return (sample_weight * loss).sum()
            else:
                return loss.sum()

        def grad(AB):
            # gradient of the objective function with respect to (A, B)
            E = np.exp(AB[0] * F + AB[1])
            P = 1. / (1. + E)
            TEP_minus_T1P = P * (T * E - T1)
            if sample_weight is not None:
                TEP_minus_T1P *= sample_weight
            dA = np.dot(TEP_minus_T1P, F)
            dB = np.sum(TEP_minus_T1P)
            return np.array([dA, dB])

        # Starting point: zero slope, intercept set from the class counts
        AB0 = np.array([0., math.log((prior0 + 1.) / (prior1 + 1.))])
        AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
        return (AB_[0], AB_[1])
Ejemplo n.º 6
0
def _check_targets_hmc(y_true, y_pred):
    """Validate a pair of target vectors and return them flattened to 1d.

    The two inputs must have consistent lengths. Their target types, as
    reported by ``type_of_target``, must be either both "multiclass" or
    a mix of "binary" and "multiclass"; any other combination raises
    ``ValueError``.
    """
    check_consistent_length(y_true, y_pred)

    target_kinds = {type_of_target(y_true), type_of_target(y_pred)}
    # a binary/multiclass mix is handled as multiclass
    if target_kinds == {"binary", "multiclass"}:
        target_kinds = {"multiclass"}
    if target_kinds != {"multiclass"}:
        raise ValueError("{0} is not supported".format(target_kinds))

    return column_or_1d(y_true), column_or_1d(y_pred)
Ejemplo n.º 7
0
def brier_score_loss(y_true, y_prob):
    """Compute the Brier score

    The score is the mean of the squared differences between ``y_true``
    and ``y_prob`` after both are flattened to 1d. The smaller the Brier
    score, the better, hence the naming with "loss".

    Across all items in a set N predictions, the Brier score measures the
    mean squared difference between (1) the predicted probability assigned
    to the possible outcomes for item i, and (2) the actual outcome.
    Therefore, the lower the Brier score is for a set of predictions, the
    better the predictions are calibrated. Note that the Brier score always
    takes on a value between zero and one, since this is the largest
    possible difference between a predicted probability (which must be
    between zero and one) and the actual outcome (which can take on values
    of only 0 and 1).

    The Brier score is appropriate for binary and categorical outcomes that
    can be structured as true or false, but is inappropriate for ordinal
    variables which can take on three or more values (this is because the
    Brier score assumes that all possible outcomes are equivalently
    "distant" from one another).

    Parameters
    ----------
    y_true : array, shape (n_samples,)
        True targets.

    y_prob : array, shape (n_samples,)
        Probabilities of the positive class.

    Returns
    -------
    score : float
        Brier score

    Examples
    --------
    >>> import numpy as np
    >>> from costcla.metrics import brier_score_loss
    >>> y_true = [0, 1, 1, 0]
    >>> y_prob = [0.1, 0.9, 0.8, 0.3]
    >>> brier_score_loss(y_true, y_prob) # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)
    0.0

    References
    ----------
    http://en.wikipedia.org/wiki/Brier_score
    """
    targets = column_or_1d(y_true)
    probabilities = column_or_1d(y_prob)
    residuals = targets - probabilities
    return np.mean(residuals ** 2)
Ejemplo n.º 8
0
def _check_clf_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task

    Both inputs are passed through ``check_arrays`` and typed with
    ``type_of_target``. A mix of "binary" and "multiclass" is treated as
    "multiclass"; any other mix of two different types raises a
    ValueError, as does a common type outside the supported set.

    For "binary" and "multiclass" targets both outputs are flattened to
    1d with ``column_or_1d``.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multilabel-sequences', \
                        'multiclass', 'binary'}
        The common target type of the two inputs, as derived from
        ``type_of_target``.

    y_true : array or indicator matrix or sequence of sequences

    y_pred : array or indicator matrix or sequence of sequences
    """
    y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    # Resolve the single target type shared by both inputs
    if type_true == type_pred:
        y_type = type_true
    elif {type_true, type_pred} == {"binary", "multiclass"}:
        y_type = "multiclass"
    else:
        raise ValueError(
            "Can't handle mix of {0} and {1}".format(type_true, type_pred))

    # No metrics support "multiclass-multioutput" format
    supported = ("binary", "multiclass",
                 "multilabel-indicator", "multilabel-sequences")
    if y_type not in supported:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ("binary", "multiclass"):
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)

    return y_type, y_true, y_pred
Ejemplo n.º 9
0
def score_to_label(pred_scores, outliers_fraction=0.1):
    """Turn raw outlier scores into binary labels (0 or 1).

    The threshold is the score at the ``100 * (1 - outliers_fraction)``
    percentile; scores strictly greater than it are labelled 1.

    Parameters
    ----------
    pred_scores : list or numpy array of shape (n_samples,)
        Raw outlier scores. Outliers are assumed have larger values.

    outliers_fraction : float in (0,1)
        Percentage of outliers.

    Returns
    -------
    outlier_labels : numpy array of shape (n_samples,)
        Integer labels: 1 where the score exceeds the threshold,
        0 otherwise.
    """
    # check input values
    pred_scores = column_or_1d(pred_scores)
    check_parameter(outliers_fraction, 0, 1)

    percentile = 100 * (1 - outliers_fraction)
    cutoff = scoreatpercentile(pred_scores, percentile)
    return (pred_scores > cutoff).astype('int')
Ejemplo n.º 10
0
def average(scores, estimator_weight=None):
    """Combine the outlier scores of several estimators by averaging.

    Without weights the result is the plain row-wise mean. With weights
    it is ``(d1*w1 + d2*w2 + ... + dn*wn) / (w1 + w2 + ... + wn)`` per
    row.

    Parameters
    ----------
    scores : numpy array of shape (n_samples, n_estimators)
        Score matrix from multiple estimators on the same samples.

    estimator_weight : list of shape (1, n_estimators)
        If specified, using weighted average

    Returns
    -------
    combined_scores : numpy array of shape (n_samples, )
        The combined outlier scores.

    """
    scores = check_array(scores)

    if estimator_weight is None:
        return np.mean(scores, axis=1).ravel()

    # one weight per estimator column, stored as a single row
    weights = column_or_1d(estimator_weight).reshape(1, -1)
    assert_equal(scores.shape[1], weights.shape[1])

    weighted_sum = np.sum(np.multiply(scores, weights), axis=1)
    combined = weighted_sum / np.sum(weights)
    return combined.ravel()
Ejemplo n.º 11
0
    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        A new `_SigmoidCalibration` instance is stored in
        `self.calibrator_` and fitted on the flattened `T`.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights, forwarded to the underlying calibrator.

        Returns
        -------
        * `self` [object]:
            `self`.
        """
        # Flatten the input to 1d
        scores = column_or_1d(T)

        # Delegate the actual fit to the sigmoid calibrator
        calibrator = _SigmoidCalibration()
        self.calibrator_ = calibrator
        calibrator.fit(scores, y, sample_weight=sample_weight)

        return self
Ejemplo n.º 12
0
    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Two `KernelDensity` estimators are fitted: `calibrator0` on the
        entries of `T` where `y == 0` and `calibrator1` on the entries
        where `y == 1`.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [None]:
            Must be `None`; an assertion rejects any other value.

        Returns
        -------
        * `self` [object]:
            `self`.
        """
        # Check input
        scores = column_or_1d(T)
        assert sample_weight is None  # not supported by KernelDensity

        # Split the scores by class
        negatives = scores[y == 0]
        positives = scores[y == 1]

        # One density estimator per class, sharing the same bandwidth
        self.calibrator0 = KernelDensity(bandwidth=self.bandwidth)
        self.calibrator1 = KernelDensity(bandwidth=self.bandwidth)

        # KernelDensity is fed a single-column 2d array
        self.calibrator0.fit(negatives.reshape(-1, 1))
        self.calibrator1.fit(positives.reshape(-1, 1))

        return self
Ejemplo n.º 13
0
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Grid-search `tau` and `lamda` and keep the best estimator.

        The targets are binarized to -1/+1 with a `LabelBinarizer`, an
        `L1L2TwoStepClassifier` is tuned with `GridSearchCV` over
        `self.taus` and `self.lamdas`, and the parameters and
        coefficients of the best estimator are copied onto `self`.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values

        sample_weight : float or array-like of shape [n_samples]
            Sample weight, forwarded to the search as a fit parameter.

        check_input : bool
            Forwarded to the search as a fit parameter.

        Returns
        -------
        self : Returns self.

        Raises
        ------
        ValueError
            If the binarizer reports a multilabel target type.
        """
        binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        self._label_binarizer = binarizer
        y = binarizer.fit_transform(y)
        if binarizer.y_type_.startswith('multilabel'):
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))
        y = column_or_1d(y, warn=False)

        base_estimator = L1L2TwoStepClassifier(
            mu=self.mu, fit_intercept=self.fit_intercept,
            use_gpu=self.use_gpu, threshold=self.threshold,
            normalize=self.normalize, precompute=self.precompute,
            max_iter=self.max_iter,
            copy_X=self.copy_X, tol=self.tol, warm_start=self.warm_start,
            positive=self.positive,
            random_state=self.random_state, selection=self.selection)

        search = GridSearchCV(
            estimator=base_estimator,
            param_grid={'tau': self.taus, 'lamda': self.lamdas},
            fit_params={'sample_weight': sample_weight,
                        'check_input': check_input},
            cv=self.cv,
            scoring=self.scoring, n_jobs=self.n_jobs, iid=self.iid,
            refit=self.refit, verbose=self.verbose,
            pre_dispatch=self.pre_dispatch, error_score=self.error_score,
            return_train_score=self.return_train_score)
        search.fit(X, y)

        # Copy the winning configuration onto this object
        best = search.best_estimator_
        self.tau_ = best.tau
        self.lamda_ = best.lamda
        self.coef_ = best.coef_
        self.intercept_ = best.intercept_
        self.best_estimator_ = best  # XXX DEBUG

        # With at most two classes the coefficients are stored as one row
        if self.classes_.shape[0] <= 2:
            self.coef_ = self.coef_.reshape(1, -1)

        return self
Ejemplo n.º 14
0
    def fit(self, X, y):
        """Fit the model to the data X and target y.

        The targets are flattened to 1d, their unique values are stored
        in `classes_`, and they are binarized with a fresh
        `LabelBinarizer` before being handed to the parent `fit`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples in the number of samples
            and n_features is the number of features.

        y : numpy array of shape (n_samples)

        Returns
        -------
        self
        """
        y = column_or_1d(y, warn=True)

        # needs a better way to check multi-label instances:
        # only the first target entry is inspected
        first_target = np.reshape(y, (-1, 1))[0][0]
        self.multi_label = isinstance(first_target, list)

        self.classes_ = np.unique(y)
        self._lbin = LabelBinarizer()
        binarized = self._lbin.fit_transform(y)

        super(MultilayerPerceptronClassifier, self).fit(X, binarized)

        return self
Ejemplo n.º 15
0
def get_color_codes(y):
    """Internal function to generate color codes for inliers and outliers.

    Every entry starts as 'b' (blue); entries whose label equals 1 are
    switched to 'r' (red).

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    Returns
    -------
    c : numpy array of shape (n_samples,)
        Color codes.

    """
    y = column_or_1d(y)

    # start with everything blue, then paint the outliers red
    colors = np.full([len(y)], 'b', dtype=str)
    colors[np.where(y == 1)] = 'r'

    return colors
Ejemplo n.º 16
0
    def _validate_y(self, y):
        """Validate the targets and return them encoded as class indices.

        Stores the sorted unique labels in ``classes_`` and their count in
        ``n_classes_``.
        """
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)

        # `encoded` holds, for each sample, the index of its label
        # inside `classes`
        classes, encoded = np.unique(y, return_inverse=True)
        self.classes_ = classes
        self.n_classes_ = len(classes)

        return encoded
Ejemplo n.º 17
0
def get_label_n(y, y_pred, n=None):
    """Turn raw outlier scores into binary labels using a top-n threshold.

    The outlier fraction is ``n / n_samples`` (or, when ``n`` is None, the
    number of non-zero entries of ``y`` divided by ``n_samples``). Scores
    strictly above the matching percentile of ``y_pred`` are labelled 1.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        The number of outliers. if not defined, infer using ground truth.

    Returns
    -------
    labels : numpy array of shape (n_samples,)
        binary labels 0: normal points and 1: outliers

    Examples
    --------
    >>> from pyod.utils.utility import get_label_n
    >>> y = [0, 1, 1, 0, 0, 0]
    >>> y_pred = [0.1, 0.5, 0.3, 0.2, 0.7, 0.1]
    >>> get_label_n(y, y_pred)
    array([0, 1, 0, 0, 1, 0])

    """
    # enforce formats of inputs
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    n_samples = len(y)

    # number of outliers: explicit, or counted from the ground truth
    n_outliers = np.count_nonzero(y) if n is None else n
    outliers_fraction = n_outliers / n_samples

    cutoff = scoreatpercentile(y_pred, 100 * (1 - outliers_fraction))
    return (y_pred > cutoff).astype('int')
Ejemplo n.º 18
0
    def _validate_y(self, y):
        """Validate binary targets and return them encoded as class indices.

        Stores the sorted unique labels in ``classes_`` and raises
        ``ValueError`` when more than two distinct labels are present.
        """
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)

        classes, encoded = np.unique(y, return_inverse=True)
        self.classes_ = classes

        if len(classes) > 2:
            raise ValueError("It's a binary classification algorithm. Use a dataset with only 2 classes to predict.")

        return encoded
    def fit_all(self, X, y, n_shop, last_obs_plan):
        """Fit (or continue fitting) the boosting stages on ``X`` and ``y``.

        When the model is not yet initialized, the initial predictions
        come from ``self.init_.predict`` applied to selected rows of
        ``last_obs_plan``. Otherwise fitting resumes from the stages
        already stored in ``self.estimators_``.

        Parameters
        ----------
        X : array-like
            Training samples; converted to a dense 2d array of ``DTYPE``.

        y : array-like
            Training targets; flattened to 1d.

        n_shop : array-like
            Used to select rows of ``last_obs_plan`` and forwarded to
            ``_fit_stages``.
            NOTE(review): presumably per-customer shopping-history
            lengths -- confirm against the caller.

        last_obs_plan : array-like
            Input handed to ``self.init_.predict`` for the initial
            predictions; indexed with the row indices derived from
            ``n_shop``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            On a warm start, if ``n_estimators`` is smaller than the number
            of stages already fitted.
        """
        # if not warmstart - clear the estimator state
        if not self.warm_start:
            self._clear_state()

        # Check input
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
        y = column_or_1d(y, warn=True)
        n_samples, n_features = X.shape
        self.n_features = n_features
        random_state = check_random_state(self.random_state)
        self._check_params()

        if not self._is_initialized():
            if self.verbose:
                # single-argument print(...) behaves identically on
                # Python 2 and Python 3
                print('Initializing gradient boosting...')
            # init state
            self._init_state()

            # fit initial model
            if not self.fix_history:
                idx = get_truncated_shopping_indices(n_shop)
            else:
                idx = np.arange(len(n_shop))

            # init predictions by averaging over the shopping histories
            y_pred = self.init_.predict(last_obs_plan[idx])
            first_accuracy = accuracy_score(y, y_pred.argmax(axis=1))
            # formatted into one string so the output matches the former
            # two-item print statement on both Python 2 and 3
            print('First training accuracy: %s' % (first_accuracy,))
            begin_at_stage = 0
        else:
            # add more estimators to fitted model
            # invariant: warm_start = True
            if self.n_estimators < self.estimators_.shape[0]:
                raise ValueError('n_estimators=%d must be larger or equal to '
                                 'estimators_.shape[0]=%d when '
                                 'warm_start==True'
                                 % (self.n_estimators,
                                    self.estimators_.shape[0]))
            begin_at_stage = self.estimators_.shape[0]
            y_pred = self.decision_function(X)
            self._resize_state()

        # fit the boosting stages
        n_stages = self._fit_stages(X, y, y_pred, random_state, begin_at_stage, n_shop)
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
            self.train_score_ = self.train_score_[:n_stages]
            if hasattr(self, 'oob_improvement_'):
                self.oob_improvement_ = self.oob_improvement_[:n_stages]
            if hasattr(self, '_oob_score_'):
                self._oob_score_ = self._oob_score_[:n_stages]

        return self
Ejemplo n.º 20
0
    def partial_fit(self, X, y, classes=None):
        """Incrementally fit the model to the data X and target y.

        Only available when ``self.algorithm`` is 'sgd'. The targets are
        flattened, binarized with ``self._lbin`` and passed to the parent
        ``partial_fit``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples in the number of samples
            and n_features is the number of features.

        y : numpy array of shape (n_samples)
             Subset of the target values.

        classes : array, shape (n_classes)
            Classes across all calls to partial_fit.
            Can be obtained by via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the algorithm is not 'sgd', if no classes are known yet and
            none are passed, or if `classes` differs from the stored ones.
        """
        if self.algorithm != 'sgd':
            raise ValueError("only SGD algorithm supports partial fit")

        known_classes = self.classes_
        if classes is None:
            if known_classes is None:
                raise ValueError("classes must be passed on the first call "
                                 "to partial_fit.")
        elif known_classes is not None:
            if np.any(known_classes != np.unique(classes)):
                raise ValueError("`classes` is not the same as on last call "
                                 "to partial_fit.")
        else:
            # first call: remember the classes for later calls
            self.classes_ = classes

        # the binarizer is created lazily on the first call
        if not hasattr(self, '_lbin'):
            self._lbin = LabelBinarizer()
            self._lbin._classes = classes

        y = column_or_1d(y, warn=True)

        # needs a better way to check multi-label instances:
        # only the first target entry is inspected
        first_target = np.reshape(y, (-1, 1))[0][0]
        self.multi_label = isinstance(first_target, list)

        binarized = self._lbin.fit_transform(y)
        super(MultilayerPerceptronClassifier, self).partial_fit(X, binarized)

        return self
Ejemplo n.º 21
0
    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Two `Histogram` estimators with identical settings are fitted:
        `calibrator0` on the entries of `T` where `y == 0` and
        `calibrator1` on the entries where `y == 1`.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights. When given, they are split by class in the same way
            as `T`; otherwise `None` is passed to both histograms.

        Returns
        -------
        * `self` [object]:
            `self`.
        """
        # Check input and split by class
        scores = column_or_1d(T)
        t0 = scores[y == 0]
        t1 = scores[y == 1]

        sw0 = None
        sw1 = None
        if sample_weight is not None:
            sw0 = sample_weight[y == 0]
            sw1 = sample_weight[y == 1]

        # "auto" derives the bin count from the number of class-0 samples
        n_bins = self.bins
        if self.bins == "auto":
            n_bins = 10 + int(len(t0) ** (1. / 3.))

        # Default range: observed min/max widened by eps, clipped to [0, 1]
        value_range = self.range
        if self.range is None:
            lowest = min(np.min(t0), np.min(t1))
            highest = max(np.max(t0), np.max(t1))
            t_min = max(0, lowest - self.eps)
            t_max = min(1, highest + self.eps)
            value_range = [(t_min, t_max)]

        # Fit
        self.calibrator0 = Histogram(bins=n_bins, range=value_range,
                                     interpolation=self.interpolation,
                                     variable_width=self.variable_width)
        self.calibrator1 = Histogram(bins=n_bins, range=value_range,
                                     interpolation=self.interpolation,
                                     variable_width=self.variable_width)

        self.calibrator0.fit(t0.reshape(-1, 1), sample_weight=sw0)
        self.calibrator1.fit(t1.reshape(-1, 1), sample_weight=sw1)

        return self
Ejemplo n.º 22
0
    def fit(self, X, y):
        """Finds the intervals of interest from the input data.

        For 2-D input, cut points are computed with `MDLPDiscretize` for
        every column listed in `continuous_features_` (all columns when it
        is None) and stored in a dict keyed by column index. For 1-D input
        a single set of cut points is stored.

        Parameters
        ----------
        X : The array containing features to be discretized. Continuous
            features should be specified by the `continuous_features`
            attribute if `X` is a 2-D array.

        y : A list or array of class labels corresponding to `X`.

        Returns
        -------
        self
        """
        self.dimensions_ = len(X.shape)

        if self.dimensions_ > 2:
            raise ValueError("Invalid input dimension for `X`. Input shape is"
                             "{0}".format(X.shape))

        X = check_array(X, force_all_finite=True, ensure_2d=False)
        y = column_or_1d(y)
        y = check_array(y, ensure_2d=False, dtype=int)
        X, y = check_X_y(X, y)

        if self.shuffle:
            # apply the same random permutation to samples and labels
            rng = check_random_state(self.random_state)
            order = rng.permutation(len(y))
            X = X[order]
            y = y[order]
        else:
            import warnings
            warnings.warn("Shuffle parameter will be removed in the future.",
                          DeprecationWarning)

        if self.dimensions_ != 2:
            # 1-D input: a single column of continuous values
            if self.continuous_features_ is not None:
                raise ValueError("Passed in a 1-d column of continuous features, "
                                 "but continuous_features is not None")
            self.continuous_features_ = None
            self.cut_points_ = MDLPDiscretize(X, y, self.min_depth)
            return self

        # 2-D input: by default every column is treated as continuous
        if self.continuous_features_ is None:
            self.continuous_features_ = np.arange(X.shape[1])

        self.cut_points_ = dict()
        for col_index, column in enumerate(X.T):
            if col_index in self.continuous_features_:
                self.cut_points_[col_index] = MDLPDiscretize(
                    column, y, self.min_depth)

        return self
Ejemplo n.º 23
0
    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        An `IsotonicRegression` is fitted and stored in `ir_`. When
        `self.interpolation` is truthy, two linear interpolators
        (`interp1_`, `interp2_`) are also built from the points where the
        fitted values increase, plus the first and last points.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights, forwarded to the isotonic regression.

        Returns
        -------
        * `self` [object]:
            `self`.
        """
        # Check input
        T = column_or_1d(T)

        # Fit isotonic regression
        self.ir_ = IsotonicRegression(y_min=self.y_min,
                                      y_max=self.y_max,
                                      increasing=self.increasing,
                                      out_of_bounds="clip")
        self.ir_.fit(T, y, sample_weight=sample_weight)

        if not self.interpolation:
            return self

        # Interpolators
        p = self.ir_.transform(T)

        # positions where the fitted value rises relative to the previous
        # entry, and the same mask shifted one position to the left
        rises = (p - np.roll(p, 1)) > 0
        rises_shifted = np.roll(rises, -1)

        # both end points are always kept
        for mask in (rises, rises_shifted):
            mask[0] = True
            mask[-1] = True

        interp_options = dict(bounds_error=False, fill_value=(0., 1.))
        self.interp1_ = interp1d(T[rises], p[rises], **interp_options)
        self.interp2_ = interp1d(T[rises_shifted], p[rises_shifted],
                                 **interp_options)

        return self
Ejemplo n.º 24
0
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    """Weighted sum over groups of a two-sample statistic.

    The reference distribution is built by ``prepare_distribution`` from
    the predictions selected by ``mask``. Each group's predictions are
    compared with it via ``_cvm_2samp_fast``, and the results are summed
    with the weights from ``compute_group_weights_by_indices``.

    Parameters
    ----------
    y_pred : array-like
        Predictions; flattened to 1d.

    mask : index or boolean mask
        Selects the entries that form the reference distribution.

    sample_weight : array-like or None
        Per-sample weights, normalized by ``check_sample_weight``.

    groups_indices : iterable
        One index collection per group.

    Returns
    -------
    result : float
        The weighted sum over all groups (``0.`` when there are none).
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)

    # reference distribution shared by every group comparison
    global_data, global_weight, global_F = prepare_distribution(
        y_pred[mask], weights=sample_weight[mask])

    total = 0.
    for indices, weight in zip(groups_indices, group_weights):
        group_statistic = _cvm_2samp_fast(
            global_data, y_pred[indices],
            global_weight, sample_weight[indices], global_F)
        total += weight * group_statistic
    return total
Ejemplo n.º 25
0
def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None):
    """Return the per-sample hinge losses ``max(0, 1 - margin)``.

    Unlike an averaged hinge loss, this returns one loss value per sample.
    ``sample_weight`` only takes part in the length-consistency check; it is
    not applied to the returned losses.

    Parameters
    ----------
    y_true : array-like
        True targets.
    pred_decision : array-like
        Decision values; 2-D (one column per class) in the multiclass case.
    labels : array-like, optional
        All class labels, used to encode ``y_true`` in the multiclass case.
    sample_weight : array-like, optional
        Only validated for consistent length.

    Returns
    -------
    losses : ndarray
        Hinge loss of every sample.

    Raises
    ------
    ValueError
        Multiclass input where ``y_true`` does not contain as many classes
        as ``pred_decision`` has columns and ``labels`` is not given.
    TypeError
        Binary input where the targets cannot be multiplied with
        ``pred_decision``.
    """
    check_consistent_length(y_true, pred_decision, sample_weight)
    pred_decision = check_array(pred_decision, ensure_2d=False)
    y_true = column_or_1d(y_true)
    observed_classes = np.unique(y_true)

    if observed_classes.size <= 2:
        # Binary case: targets are re-encoded as -1 / +1 before taking
        # the product with the decision values.
        scores = np.ravel(column_or_1d(pred_decision))

        binarizer = LabelBinarizer(neg_label=-1)
        signed_targets = binarizer.fit_transform(y_true)[:, 0]

        try:
            margin = signed_targets * scores
        except TypeError:
            raise TypeError("pred_decision should be an array of floats.")
    else:
        if labels is None:
            if (pred_decision.ndim > 1 and
                    observed_classes.size != pred_decision.shape[1]):
                raise ValueError("Please include all labels in y_true "
                                 "or pass labels as third argument")
            labels = observed_classes

        encoder = LabelEncoder()
        encoder.fit(labels)
        y_true = encoder.transform(y_true)
        n_samples = y_true.shape[0]

        # True everywhere except at each sample's true-class column.
        other_classes = np.ones_like(pred_decision, dtype=bool)
        other_classes[np.arange(n_samples), y_true] = False

        # margin = score(true class) - best score among the other classes
        margin = pred_decision[~other_classes]
        margin -= np.max(
            pred_decision[other_classes].reshape(n_samples, -1), axis=1)

    losses = 1 - margin
    # Predictions that are good enough (margin >= 1) are not penalized.
    losses[losses <= 0] = 0
    return losses
Ejemplo n.º 26
0
    def fit(self, X, y, Q=None, monitor=None):
        """Fit the ranker on samples grouped by query.

        Parameters
        ----------
        X : array
            Training samples; must support integer-array row indexing.
        y : array
            Relevance labels; must support integer-array indexing when ``Q``
            is given.
        Q : array-like of shape (n_samples,), optional
            Query id of every sample. Samples are reordered so that equal
            ids are contiguous. When omitted, all samples are placed in a
            single query.
        monitor : callable, optional
            Forwarded unchanged to the parent ``fit``.

        Returns
        -------
        Whatever the parent class ``fit`` returns.

        Raises
        ------
        ValueError
            If samples of one query are not contiguous.
        """
        if Q is None:
            # Fix: the original called np.ones(Q) with Q being None, which
            # cannot produce a per-sample vector. One group id per sample,
            # all identical, expresses "a single query".
            Q = np.ones(len(y))
        else:
            # Validate first so lists and column vectors can be sorted and
            # fancy-indexed like a 1-D array.
            Q = column_or_1d(Q, warn=True)
            ids = np.argsort(Q)
            X, y, Q = X[ids], y[ids], Q[ids]
        self.n_classes_ = 1

        # Count distinct queries while checking that each query's samples
        # are contiguous (defensive: sorting above already groups them).
        uniq_group = {Q[0]}
        last_group = Q[0]
        for g in Q[1:]:
            if g != last_group:
                # group must be unseen thus far
                if g in uniq_group:
                    raise ValueError("queries must be grouped together")
                uniq_group.add(g)
                last_group = g
        self.n_uniq_group = len(uniq_group)

        y = self._gain(column_or_1d(y, warn=True))
        return super(LambdaMART, self).fit(X, y, monitor, sample_group=Q)
Ejemplo n.º 27
0
 def __init__(self, filename='./corpus/train.csv'):
     """Load the training corpus and prepare train/validation splits.

     Parameters
     ----------
     filename : str
         Path of a CSV file that contains a ``sentiment`` column.

     When ``filename`` exists, the data is split 70/30 into
     ``X_train``/``X_val`` and ``y_train``/``y_val``, the model is loaded
     via ``load_model`` and a preprocessor is created. When it does not
     exist, a preprocessor is created and asked to process the data.
     """
     if os.path.exists(filename):
         data = pd.read_csv(filename)
         # NOTE(review): the shuffled frame is only stored on self.data;
         # the split below is built from the unshuffled `data` - confirm
         # this is intended.
         self.data = shuffle(data)
         X_data = pd.DataFrame(data.drop('sentiment', axis=1))
         Y_data = column_or_1d(data[:]['sentiment'], warn=True)
         self.X_train, self.X_val,\
         self.y_train, self.y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=1)
         self.model = None
         self.load_model()
         self.preprocessor = Preprocessor.Preprocessor()
     else:
         print('No Source!')
         # Fix: the preprocessor was only created in the branch above, so
         # this call raised AttributeError. Create it before using it.
         self.preprocessor = Preprocessor.Preprocessor()
         self.preprocessor.process_data()
Ejemplo n.º 28
0
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight):
    """Average the ``theil`` value of group efficiencies over several cuts.

    One cut on ``y_pred`` is derived per target efficiency (using the events
    selected by ``mask``). For each cut the per-group efficiencies are
    computed and passed to ``theil`` together with the group weights.

    Returns
    -------
    The mean of the per-cut ``theil`` values.
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    groups_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)
    # Weights restricted to the masked events.
    divided_weight = compute_divided_weight_by_indices(
        groups_indices, sample_weight=sample_weight * mask)
    cuts = compute_cut_for_efficiency(
        target_efficiencies, mask=mask, y_pred=y_pred,
        sample_weight=sample_weight)

    total = sum(
        (theil(compute_group_efficiencies_by_indices(
            y_pred, groups_indices=groups_indices,
            cut=threshold, divided_weight=divided_weight), groups_weights)
         for threshold in cuts),
        0.,
    )
    return total / len(cuts)
Ejemplo n.º 29
0
def compute_sde_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight=None, power=2.):
    """Compute the deviation of group efficiencies, averaged over cuts.

    One cut on ``y_pred`` is derived per target efficiency (using the events
    selected by ``mask``). For each cut, ``weighted_deviation`` of the
    per-group efficiencies is accumulated.

    Returns
    -------
    ``(mean deviation over cuts) ** (1 / power)``.
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)
    # Weights restricted to the masked events.
    divided_weight = compute_divided_weight_by_indices(
        groups_indices, sample_weight=sample_weight * mask)

    cuts = compute_cut_for_efficiency(
        target_efficiencies, mask=mask, y_pred=y_pred,
        sample_weight=sample_weight)

    accumulated = sum(
        (weighted_deviation(
            compute_group_efficiencies_by_indices(
                y_pred, groups_indices=groups_indices,
                cut=threshold, divided_weight=divided_weight),
            weights=group_weights, power=power)
         for threshold in cuts),
        0.,
    )
    mean_deviation = accumulated / len(cuts)
    return mean_deviation ** (1. / power)
Ejemplo n.º 30
0
    def fit(self, X, y, sample_weight=None):
        """Fit Naive Bayes classifier according to X, y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples], optional
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_arrays(X, y, sparse_format='csr')
        # Fix: `np.float` was only an alias of the builtin float and was
        # removed in NumPy 1.24; np.float64 is the same dtype.
        X = X.astype(np.float64)
        y = column_or_1d(y, warn=True)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # A single indicator column: add the complementary column so
            # both classes have their own column.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # convert to float to support sample weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            # Scale each sample's indicator row by its weight.
            Y *= array2d(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)
        self._count(X, Y)
        self._update_feature_log_prob()
        self._update_class_log_prior(class_prior=class_prior)
        return self
Ejemplo n.º 31
0
def remap_labels(
    y_true: Union[List, np.ndarray, pd.Series],
    y_pred: Union[List, np.ndarray, pd.Series],
    return_map: bool = False,
) -> np.ndarray:
    """
    Relabel `y_pred` so that its categories line up with those of `y_true`.

    Both inputs are :math:`n`-length categorical labelings of the same
    :math:`n` samples. The grouping described by `y_pred` is kept exactly; only
    the names of its categories are permuted, chosen so that the number of
    samples on which the two labelings agree is as large as possible.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Reference labels, i.e. the labels to map to.
    y_pred : array-like of shape (n_samples,)
        Labels to be renamed to best match `y_true`.
    return_map : bool, optional
        If True, also return the dictionary that maps each original `y_pred`
        label to its new label.

    Returns
    -------
    remapped_y_pred : np.ndarray of shape (n_samples,)
        `y_pred` with its category labels permuted to best match `y_true`.
    label_map : dict
        Mapping from original `y_pred` labels to the new labels. Only returned
        if `return_map` is True.

    Raises
    ------
    ValueError
        If either input is not a binary or multiclass target.
    TypeError
        If `return_map` is not a bool.

    Examples
    --------
    >>> y_true = np.array([0,0,1,1,2,2])
    >>> y_pred = np.array([2,2,1,1,0,0])
    >>> remap_labels(y_true, y_pred)
    array([0, 0, 1, 1, 2, 2])

    Notes
    -----
    The result is meaningful when the two labelings describe a similar
    partition of the data (as measured, for example, by
    :func:`sklearn.metrics.adjusted_rand_score`). When they do not, no sensible
    remapping exists. For instance, if one category of `y_true` is split
    exactly in half into two categories of `y_pred`, it is impossible to say
    which of the two matches the original category.
    """
    check_consistent_length(y_true, y_pred)

    allowed_kinds = {"binary", "multiclass"}
    target_kinds = (type_of_target(y_true), type_of_target(y_pred))
    if any(kind not in allowed_kinds for kind in target_kinds):
        raise ValueError(
            "Elements of `y_true` and `y_pred` must represent a valid binary or "
            "multiclass labeling, see "
            "https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html"
            " for more information."
        )

    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)

    if not isinstance(return_map, bool):
        raise TypeError("return_map must be of type bool.")

    # Rows of the confusion matrix are true labels, columns are predicted
    # labels; the assignment that maximizes the matched counts gives the
    # permutation of predicted labels.
    labels = unique_labels(y_true, y_pred)
    overlap = confusion_matrix(y_true, y_pred, labels=labels)
    true_idx, pred_idx = linear_sum_assignment(overlap, maximize=True)
    label_map = {
        old: new for old, new in zip(labels[pred_idx], labels[true_idx])
    }

    relabel = np.vectorize(label_map.get)
    remapped_y_pred = relabel(y_pred)
    return (remapped_y_pred, label_map) if return_map else remapped_y_pred
Ejemplo n.º 32
0
def transform(target, y):
    """Encode ``y`` as insertion positions in ``target``; unknown values get -1.

    Parameters
    ----------
    target : array-like
        Reference values passed to ``np.searchsorted`` (which expects them
        to be sorted).
    y : array-like
        Values to encode; flattened with ``column_or_1d``.

    Returns
    -------
    ndarray
        ``np.searchsorted(target, y)`` with every position whose value is
        not contained in ``target`` replaced by -1.
    """
    values = column_or_1d(y, warn=True)
    is_known = np.isin(values, target)
    encoded = np.searchsorted(target, values)
    encoded[np.logical_not(is_known)] = -1
    return encoded
Ejemplo n.º 33
0
 def _validate_y(self, y):
     """Default target validation: return ``y`` flattened by ``column_or_1d``."""
     flattened = column_or_1d(y, warn=True)
     return flattened
from sklearn.utils import column_or_1d

# importing the dataset
# Load the score/grade table.
dataset = pd.read_csv('./models/Prediction/dataset/score_and_grade.csv')

# Bin edges 0, 5, 10, ..., 100 used to bucket the scores.
bins = list(range(0, 101, 5))
dataset["Grade Bins"] = pd.cut(dataset['Score'], bins=bins)

# Features: first column. Target: third column.
x = dataset.iloc[:, :1].values
y = dataset.iloc[:, 2].values

# Encode the categorical target as integer codes.
label_encoding_Y = LabelEncoder()
y = column_or_1d(y, warn=True)
y = label_encoding_Y.fit_transform(y)

# Hold out a quarter of the observations for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

# Standardize features using statistics of the training split only.
sc_X = StandardScaler()
scaler_x = sc_X.fit(x_train)
x_train = scaler_x.transform(x_train)
x_test = scaler_x.transform(x_test)
Ejemplo n.º 35
0
    def fit(self, X, y, sample_weight=None, monitor=None):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights given to each sample. If omitted, all samples have weight 1.

        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator and the local variables of
            ``_fit_stages`` as keyword arguments ``callable(i, self,
            locals())``. If the callable returns ``True`` the fitting procedure
            is stopped. The monitor can be used for various things such as
            computing held-out estimates, early stopping, model introspection,
            and snapshotting.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``presort`` is True and ``X`` is sparse.
        """
        X, event, time = check_arrays_survival(
            X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
        n_samples, self.n_features_ = X.shape

        X = X.astype(DTYPE)
        sample_weight_is_none = sample_weight is None
        if sample_weight_is_none:
            sample_weight = numpy.ones(n_samples, dtype=numpy.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)

        check_consistent_length(X, sample_weight)

        self._check_params()

        # These two losses are fitted on the logarithm of time.
        if isinstance(self.loss_,
                      (CensoredSquaredLoss, IPCWLeastSquaresError)):
            time = numpy.log(time)

        self._init_state()
        # Only forward weights to the initial estimator when the caller
        # supplied them.
        if sample_weight_is_none:
            self.init_.fit(X, (event, time))
        else:
            self.init_.fit(X, (event, time), sample_weight)

        raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)
        begin_at_stage = 0

        # The rng state must be preserved if warm_start is True
        self._rng = check_random_state(self.random_state)

        if self.presort is True and issparse(X):
            raise ValueError(
                "Presorting is not supported for sparse matrices.")

        presort = self.presort
        # Allow presort to be 'auto', which means True if the dataset is dense,
        # otherwise it will be False.
        if presort == 'auto':
            presort = not issparse(X)

        X_idx_sorted = None
        if presort:
            X_idx_sorted = numpy.asfortranarray(numpy.argsort(X, axis=0),
                                                dtype=numpy.int32)

        # fit the boosting stages
        # Fix: `numpy.bool` was only an alias of the builtin bool and was
        # removed in NumPy 1.24; the builtin yields the same field dtype.
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', bool),
                                  ('time', numpy.float64)])
        n_stages = self._fit_stages(X, y, raw_predictions, sample_weight,
                                    self._rng, begin_at_stage, monitor,
                                    X_idx_sorted)
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
            self.train_score_ = self.train_score_[:n_stages]
            if hasattr(self, 'oob_improvement_'):
                self.oob_improvement_ = self.oob_improvement_[:n_stages]

        self.n_estimators_ = n_stages
        return self
Ejemplo n.º 36
0
 def _validate_y(self, y):
     """Record the distinct classes of ``y`` and return ``y`` as class indices.

     Sets ``classes_`` (sorted unique values) and ``n_classes_``; the
     returned array holds, for each sample, the index of its class in
     ``classes_``.
     """
     labels = column_or_1d(y, warn=True)
     classes, encoded = np.unique(labels, return_inverse=True)
     self.classes_ = classes
     self.n_classes_ = len(self.classes_)
     return encoded
Ejemplo n.º 37
0
 def fit(self, y):
     """Validate that ``y`` is 1-D (or a single column); learn nothing.

     Returns ``self``.
     """
     # Called only for its validation side effect (error / warning).
     column_or_1d(y, warn=True)
     return self
Ejemplo n.º 38
0
def _binary_clf_curve2(y_true, y_score, pos_label=None, sample_weight=None):
    """
    MODIFIED VERSION OF SCIKIT-LEARN API

    Calculate true and false positives per binary classification threshold.

    Unlike the scikit-learn original, NaN scores are accepted: they are
    treated as negative infinity, i.e. ranked below every real score. The
    caller's ``y_score`` is never modified.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function. May contain NaN.

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.

    Raises
    ------
    ValueError
        If ``y_true`` is not a supported target type, or if it is not
        encoded as {0, 1} / {-1, 1} and ``pos_label`` is not given.

    Example
    -------
    >>> y_true  = [      1,   1,   1,   1,   1,   1,   0]
    >>> y_score = [ np.nan, 0.2, 0.3, 0.4, 0.5, 0.6, 0.3]
    >>> sample_weight = None
    >>> pos_label = None
    >>> fps, tps, thresholds = _binary_clf_curve2(y_true, y_score)
    """
    import numpy as np
    from sklearn.utils import assert_all_finite
    from sklearn.utils import column_or_1d
    from sklearn.utils import check_consistent_length
    from sklearn.utils.multiclass import type_of_target
    from sklearn.utils.extmath import stable_cumsum
    # Check to make sure y_true is valid
    y_type = type_of_target(y_true)
    if not (y_type == "binary" or
            (y_type == "multiclass" and pos_label is not None)):
        raise ValueError("{0} format is not supported".format(y_type))

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    # y_score is deliberately NOT checked for finiteness: NaN is allowed.

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = np.unique(y_true)
    if (pos_label is None and (
            classes.dtype.kind in ('O', 'U', 'S') or
            not (np.array_equal(classes, [0, 1]) or
                 np.array_equal(classes, [-1, 1]) or
                 np.array_equal(classes, [0]) or
                 np.array_equal(classes, [-1]) or
                 np.array_equal(classes, [1])))):
        classes_repr = ", ".join(repr(c) for c in classes)
        raise ValueError("y_true takes value in {{{classes_repr}}} and "
                         "pos_label is not specified: either make y_true "
                         "take value in {{0, 1}} or {{-1, 1}} or "
                         "pass pos_label explicitly.".format(
                             classes_repr=classes_repr))
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # Transform nans into negative infinity.
    # Fix: build a new array instead of assigning in place. column_or_1d
    # may return a view of the caller's data, so the in-place write
    # silently replaced NaNs in the caller's array; it also could not store
    # -inf in an integer-typed score array.
    y_score = np.where(np.isnan(y_score), -np.inf, y_score)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.

    # (-inf) - (-inf) is NaN; silence the warning and treat it as a tie.
    with np.errstate(invalid="ignore"):
        y_diff = np.diff(y_score)
    # Set difference between -inf to zero
    fix_flags = np.isinf(y_score[:-1]) & np.isnan(y_diff)
    y_diff[fix_flags] = 0

    distinct_value_indices = np.where(y_diff)[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        # express fps as a cumsum to ensure fps is increasing even in
        # the presence of floating point errors
        fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs]
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]
Ejemplo n.º 39
0
    def fit(self, X, y):
        """Train the model through the native thundersvm library.

        Any previously trained native model is released first. After
        training, the support vectors, dual coefficients, intercepts and
        (for the linear kernel) primal coefficients are copied out of the
        native model into numpy attributes.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Training data; validated as float64, C-ordered, CSR if sparse.
        y : array-like, shape = (n_samples,)
            Targets; converted to a float64 1-D array.

        Returns
        -------
        self : object
            The fitted estimator. ``None`` is returned when the native
            training reports a failure.

        Raises
        ------
        ValueError
            If ``self.kernel`` is not one of the known kernel names.
        """
        # Release the previous native model, if any.
        if self.model is not None:
            thundersvm.model_free(c_void_p(self.model))
            self.model = None
        sparse = sp.isspmatrix(X)
        self._sparse = sparse and not callable(self.kernel)
        X, y = check_X_y(X,
                         y,
                         dtype=np.float64,
                         order='C',
                         accept_sparse='csr')
        y = column_or_1d(y, warn=True).astype(np.float64)

        solver_type = SVM_TYPE.index(self._impl)

        if self.gamma == 'auto':
            self._gamma = 1.0 / X.shape[1]
        else:
            self._gamma = self.gamma
        if self.kernel not in KERNEL_TYPE:
            # Fix: the original printed a message and called exit(), which
            # terminates the whole interpreter (with a success status) from
            # inside library code. Raise a catchable error instead.
            raise ValueError(
                "The kernel parameter not recognized, please refer to the document."
            )
        kernel = KERNEL_TYPE.index(self.kernel)

        fit = self._sparse_fit if self._sparse else self._dense_fit
        thundersvm.model_new.restype = c_void_p
        self.model = thundersvm.model_new(solver_type)
        if self.max_mem_size != -1:
            thundersvm.set_memory_size(c_void_p(self.model), self.max_mem_size)
        fit(X, y, solver_type, kernel)
        if self._train_succeed[0] == -1:
            # Best-effort behaviour kept from the original: report and
            # return None rather than raising.
            print("Training failed!")
            return

        # Copy the support vectors out of the native model (CSR layout).
        self.n_sv = thundersvm.n_sv(c_void_p(self.model))
        csr_row = (c_int * (self.n_sv + 1))()
        csr_col = (c_int * (self.n_sv * self.n_features))()
        csr_data = (c_float * (self.n_sv * self.n_features))()
        data_size = (c_int * 1)()
        sv_indices = (c_int * self.n_sv)()
        thundersvm.get_sv(csr_row, csr_col, csr_data, data_size, sv_indices,
                          c_void_p(self.model))
        self.row = np.frombuffer(csr_row, dtype=np.int32)
        # The buffers are allocated for the dense worst case; only the
        # first data_size[0] entries are used.
        self.col = np.frombuffer(csr_col, dtype=np.int32)[:data_size[0]]
        self.data = np.frombuffer(csr_data, dtype=np.float32)[:data_size[0]]

        self.support_vectors_ = sp.csr_matrix((self.data, self.col, self.row))
        if not self._sparse:
            self.support_vectors_ = self.support_vectors_.toarray(order='C')
        self.support_ = np.frombuffer(sv_indices, dtype=np.int32).astype(int)

        # Dual coefficients, shape (n_classes - 1, n_sv).
        dual_coef = (c_float * ((self.n_classes - 1) * self.n_sv))()
        thundersvm.get_coef(dual_coef, self.n_classes, self.n_sv,
                            c_void_p(self.model))

        self.dual_coef_ = np.frombuffer(dual_coef, dtype=np.float32)\
            .astype(float)\
            .reshape((self.n_classes - 1, self.n_sv))

        # One intercept per pair of classes.
        rho_size = int(self.n_classes * (self.n_classes - 1) / 2)
        self.n_binary_model = rho_size
        rho = (c_float * rho_size)()
        thundersvm.get_rho(rho, rho_size, c_void_p(self.model))
        self.intercept_ = np.frombuffer(rho, dtype=np.float32).astype(float)

        # Primal coefficients are only fetched for the linear kernel.
        if self.kernel == 'linear':
            coef = (c_float * (self.n_binary_model * self.n_features))()
            thundersvm.get_linear_coef(coef, self.n_binary_model,
                                       self.n_features, c_void_p(self.model))
            self.coef_ = np.frombuffer(coef, dtype=np.float32)\
                .astype(float)\
                .reshape((self.n_binary_model, self.n_features))

        n_support_ = (c_int * self.n_classes)()
        thundersvm.get_support_classes(n_support_, self.n_classes,
                                       c_void_p(self.model))

        self.n_support_ = np.frombuffer(n_support_, dtype=np.int32).astype(int)
        self.shape_fit_ = X.shape

        return self
Ejemplo n.º 40
0
def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
    """Compute the Brier score.

    The Brier score is the (optionally weighted) mean squared difference
    between the predicted probability of the positive class and the actual
    outcome encoded as 0/1. Smaller is better, hence the "loss" in the
    name, and the value always lies between zero and one.

    It is appropriate for binary and categorical outcomes that can be
    structured as true or false, but not for ordinal variables with three
    or more values, because all outcomes are treated as equally "distant"
    from one another. The positive label is selected with ``pos_label``.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    y_true : array, shape (n_samples,)
        True targets.
    y_prob : array, shape (n_samples,)
        Probabilities of the positive class.
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    pos_label : int (default: None)
        Label of the positive class. If None, the maximum label is used as
        positive class.

    Returns
    -------
    score : float
        Brier score

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import brier_score_loss
    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"])
    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
    >>> brier_score_loss(y_true, y_prob)  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, 1-y_prob, pos_label=0)  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham")  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)
    0.0

    References
    ----------
    http://en.wikipedia.org/wiki/Brier_score
    """
    y_true = column_or_1d(y_true)
    y_prob = column_or_1d(y_prob)

    positive = y_true.max() if pos_label is None else pos_label

    # 1 where the sample belongs to the positive class, 0 elsewhere.
    outcome = np.array(y_true == positive, dtype=int)
    outcome = _check_binary_probabilistic_predictions(outcome, y_prob)

    squared_error = (outcome - y_prob) ** 2
    return np.average(squared_error, weights=sample_weight)
Ejemplo n.º 41
0
 def fit(self, X, y=None):
     """Validate that ``X`` is 1-D (or a single column); learn nothing.

     ``y`` is ignored. Returns ``self``.
     """
     # Called only for its validation side effect (error / warning).
     column_or_1d(X, warn=True)
     return self
Ejemplo n.º 42
0
 def transform(self, X):
     """Slice every element of ``X`` with ``[self.begin:self.end]``.

     ``X`` is flattened with ``column_or_1d``, the slice is applied to
     each element through ``eval_rows`` and the result is passed through
     ``_col2d``.
     """
     X = column_or_1d(X, warn=True)

     def _take_slice(x):
         return x[self.begin:self.end]

     return _col2d(eval_rows(X, _take_slice))
Ejemplo n.º 43
0
	def fit(self, y):
		"""Store the distinct non-null values of ``y`` in ``classes_``.

		Returns ``self``.
		"""
		y = column_or_1d(y, warn = True)
		is_missing = pandas.isnull(y)
		present_values = y[~is_missing]
		self.classes_ = numpy.unique(present_values)
		return self
Ejemplo n.º 44
0
def VOC_prec_recall_curve(y_true, y_score, sample_weight=None):
    '''Precision-recall curve computed the "unstable" PASCAL VOC way.

    Unlike scikit-learn's curve, tied scores are NOT merged (every sample is
    its own threshold) and the input order is reversed before the stable
    sort, for consistency with the MATLAB reference implementation.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Binary targets encoded as {0, 1} or {-1, 1}; 1 is the positive class.
    y_score : array-like, shape (n_samples,)
        Scores; must be finite.
    sample_weight : array-like, shape (n_samples,), optional
        Per-sample weights.

    Returns
    -------
    precision : ndarray
        Precision values with a final 1 appended.
    recall : ndarray
        Decreasing recall values with a final 0 appended.
    thresholds : ndarray
        Score thresholds corresponding to the precision/recall values.

    Raises
    ------
    ValueError
        If ``y_true`` is not binary or the inputs have inconsistent lengths.

    This function heavily copies from scikit-learn's stable code.
    Licence:
    # Authors: Alexandre Gramfort <*****@*****.**>
    #          Mathieu Blondel <*****@*****.**>
    #          Olivier Grisel <*****@*****.**>
    #          Arnaud Joly <*****@*****.**>
    #          Jochen Wersdorfer <*****@*****.**>
    #          Lars Buitinck
    #          Joel Nothman <*****@*****.**>
    #          Noel Dawe <*****@*****.**>
    # License: BSD 3 clause
    '''

    from sklearn.utils import assert_all_finite, check_consistent_length, column_or_1d
    from sklearn.utils.extmath import stable_cumsum

    # sample_weight (ignored by the check when None) must match as well.
    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if not (np.array_equal(classes, [0, 1]) or
             np.array_equal(classes, [-1, 1]) or
             np.array_equal(classes, [0]) or
             np.array_equal(classes, [-1]) or
             np.array_equal(classes, [1])):
        raise ValueError("Data is not binary and pos_label is not specified")
    else:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    # first flip for consistency with buggy MATLAB version
    y_score = y_score[::-1]
    y_true = y_true[::-1]
    if sample_weight is not None:
        # Fix: the weights must be flipped together with the samples;
        # otherwise the sort indices below pick the weights of the wrong
        # samples.
        sample_weight = sample_weight[::-1]
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # scikit-learn keeps only the indices where the score changes; VOC
    # ignores ties and uses every sample as a threshold.
    threshold_idxs = np.r_[range(y_true.size)]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        fps = stable_cumsum(weight)[threshold_idxs] - tps
    else:
        fps = 1 + threshold_idxs - tps
    thresholds = y_score[threshold_idxs]

    # now copying from the caller
    precision = tps / (tps + fps)
    recall = tps / tps[-1]

    # stop when full recall attained
    # and reverse the outputs so recall is decreasing
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]
Ejemplo n.º 45
0
 def transform(x):
     """Look up every element of ``x`` in ``described_solvents``.

     Returns a ``DataFrame`` with one row per element of ``x`` and the
     columns given by ``header``.
     """
     keys = column_or_1d(x, warn=True)
     rows = [described_solvents[key] for key in keys]
     return DataFrame(rows, columns=header)
def linear_regr(X_train, y_train, X_test, y_test, poly_degree,
                interaction_only, print_coef, plot, ask_user, model_result):
    """Fit regression models on polynomial features and evaluate each one.

    The feature matrices are expanded with ``PolynomialFeatures``, the
    expanded data is dumped to CSV files in the current working directory,
    and every enabled model / ``alpha`` combination is fitted, timed and
    handed to ``evaluation``.

    Parameters
    ----------
    X_train, X_test : array-like
        Raw feature matrices; both are expanded with the transformer fitted
        on ``X_train``.
    y_train, y_test : array-like
        Targets. ``y_train`` is flattened with ``column_or_1d`` for fitting.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``. It also gates the MLP
        models (model 3 runs only for ``poly_degree <= 2``, model 4 only for
        ``poly_degree <= 3``).
    interaction_only : bool
        Passed to ``PolynomialFeatures``.
    print_coef, plot, ask_user
        Forwarded unchanged to ``evaluation``.
    model_result
        Result accumulator; it is passed to every ``evaluation`` call and
        replaced by that call's return value.

    Returns
    -------
    model_result
        The accumulator as returned by the last ``evaluation`` call.

    Raises
    ------
    SystemExit
        If a model id outside 0-4 is selected.
    """
    # create more features
    poly = preprocessing.PolynomialFeatures(poly_degree,
                                            interaction_only=interaction_only)

    X_train = poly.fit_transform(X_train)
    # Reuse the transformer fitted on the training data rather than
    # re-fitting it on the test set, so both splits get the same expansion.
    X_test = poly.transform(X_test)
    (s_n, f_n) = X_train.shape
    # Width of every MLP hidden layer: 1.2x the expanded feature count.
    l_n = int(math.ceil(1.2 * f_n))
    print("@@@ s_n = {}, f_n = {}, l_n = {}".format(s_n, f_n, l_n))

    np.savetxt("x_train.csv", X_train, delimiter=",")
    np.savetxt("y_train.csv", y_train, delimiter=",")
    np.savetxt("x_test.csv", X_test, delimiter=",")
    np.savetxt("y_test.csv", y_test, delimiter=",")

    print("### type of X_train = {}".format(type(X_train)))

    def fit_and_evaluate(regr, model_name, alpha, results):
        """Time ``regr.fit`` on the training data, then run ``evaluation``."""
        model_rt_start = timeit.default_timer()
        regr.fit(X_train, column_or_1d(y_train))
        model_runtime = timeit.default_timer() - model_rt_start
        return evaluation(X_train, y_train, X_test, y_test, poly_degree,
                          interaction_only, print_coef, plot, ask_user,
                          results, model_name, model_runtime, regr, alpha)

    def make_mlp(layer_n, alpha, **extra):
        """Build an MLPRegressor with ``layer_n`` hidden layers of width l_n."""
        return neural_network.MLPRegressor(random_state=True,
                                           hidden_layer_sizes=(l_n, ) * layer_n,
                                           alpha=alpha,
                                           **extra)

    def mlp_name(layer_n):
        return "neural_network.MLPRegressor, layer = " + str(layer_n)

    # Model ids: 0 LinearRegression, 1 Lasso, 2 Ridge, 3 and 4 MLPRegressor.
    # Only Ridge is enabled; running the MLP models takes much longer.
    for model in [2]:
        if model == 0:
            # NOTE(review): `normalize` was removed from LinearRegression in
            # scikit-learn 1.2 - drop it if this branch is ever re-enabled
            # on a recent release.
            regr = linear_model.LinearRegression(copy_X=True,
                                                 fit_intercept=True,
                                                 n_jobs=1,
                                                 normalize=False)
            model_result = fit_and_evaluate(regr,
                                            "linear_model.LinearRegression",
                                            0, model_result)
        elif model == 1:
            for alpha in [0.0001, 0.001, 0.01, 0.1, 1, 3, 10]:
                model_result = fit_and_evaluate(
                    linear_model.Lasso(alpha=alpha), "linear_model.Lasso",
                    alpha, model_result)
        elif model == 2:
            for alpha in [0.0001, 0.001, 0.01, 0.1, 1, 3, 10]:
                model_result = fit_and_evaluate(
                    linear_model.Ridge(alpha=alpha), "linear_model.Ridge",
                    alpha, model_result)
        elif model == 3:
            if poly_degree <= 2:
                for alpha in [0.0001, 0.01, 1]:
                    for layer_n in [3, 7, 11]:
                        model_result = fit_and_evaluate(
                            make_mlp(layer_n, alpha), mlp_name(layer_n),
                            alpha, model_result)
        elif model == 4:
            if poly_degree <= 3:
                for alpha in [1, 10, 1000]:
                    for layer_n in [7, 11]:
                        # Only the 7-layer variant uses the 'invscaling'
                        # learning-rate schedule.
                        if layer_n == 7:
                            regr = make_mlp(layer_n, alpha,
                                            learning_rate='invscaling')
                        else:
                            regr = make_mlp(layer_n, alpha)
                        model_result = fit_and_evaluate(
                            regr, mlp_name(layer_n), alpha, model_result)
        else:
            raise SystemExit("Model selection out of range!!!")

    return model_result
Ejemplo n.º 47
0
def multilabel_confusion_matrix(y_true,
                                y_pred,
                                sample_weight=None,
                                labels=None,
                                samplewise=False):
    """Compute a confusion matrix for each class or sample.

    .. versionadded:: 0.21

    Compute class-wise (default) or sample-wise (samplewise=True) multilabel
    confusion matrix to evaluate the accuracy of a classification, and output
    confusion matrices for each class or sample.

    In multilabel confusion matrix :math:`MCM`, the count of true negatives
    is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,
    true positives is :math:`MCM_{:,1,1}` and false positives is
    :math:`MCM_{:,0,1}`.

    Multiclass data will be treated as if binarized under a one-vs-rest
    transformation. Returned confusion matrices will be in the order of
    sorted unique labels in the union of (y_true, y_pred).

    Read more in the :ref:`User Guide <multilabel_confusion_matrix>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        of shape (n_samples, n_outputs) or (n_samples,)
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        of shape (n_samples, n_outputs) or (n_samples,)
        Estimated targets as returned by a classifier.

    sample_weight : array-like of shape = (n_samples,), optional
        Sample weights.

    labels : array-like
        A list of classes or column indices to select some (or to force
        inclusion of classes absent from the data).

    samplewise : bool, default=False
        In the multilabel case, this calculates a confusion matrix per
        sample.

    Returns
    -------
    multi_confusion : array, shape (n_outputs, 2, 2)
        A 2x2 confusion matrix corresponding to each output in the input.
        When calculating class-wise multi_confusion (default), then
        n_outputs = n_labels; when calculating sample-wise multi_confusion
        (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,
        the results will be returned in the order specified in ``labels``,
        otherwise the results will be returned in sorted order by default.

    Raises
    ------
    ValueError
        If the detected target type is not binary, multiclass or
        multilabel-indicator; if ``samplewise=True`` is used with 1-D
        targets; or if, for multilabel targets, ``labels`` contains a value
        below 0 or above the largest present label.

    See also
    --------
    confusion_matrix

    Notes
    -----
    The multilabel_confusion_matrix calculates class-wise or sample-wise
    multilabel confusion matrices, and in multiclass tasks, labels are
    binarized under a one-vs-rest way; while confusion_matrix calculates
    one confusion matrix for confusion between every two classes.

    Examples
    --------
    Multilabel-indicator case:

    >>> import numpy as np
    >>> from sklearn.metrics import multilabel_confusion_matrix
    >>> y_true = np.array([[1, 0, 1],
    ...                    [0, 1, 0]])
    >>> y_pred = np.array([[1, 0, 0],
    ...                    [0, 1, 1]])
    >>> multilabel_confusion_matrix(y_true, y_pred)
    array([[[1, 0],
            [0, 1]],
    <BLANKLINE>
           [[1, 0],
            [0, 1]],
    <BLANKLINE>
           [[0, 1],
            [1, 0]]])

    Multiclass case:

    >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
    >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
    >>> multilabel_confusion_matrix(y_true, y_pred,
    ...                             labels=["ant", "bird", "cat"])
    array([[[3, 1],
            [0, 2]],
    <BLANKLINE>
           [[5, 0],
            [1, 0]],
    <BLANKLINE>
           [[2, 1],
            [1, 2]]])
    """
    # _check_targets (defined elsewhere) yields the detected target type
    # together with the targets in the form used by the rest of this function.
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    check_consistent_length(y_true, y_pred, sample_weight)

    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("%s is not supported" % y_type)

    # n_labels remembers how many labels the caller asked for (None = all).
    # When labels are given, the labels present in the data but not requested
    # are appended after them, so `labels[:n_labels]` is always exactly the
    # caller's selection while `labels` covers every value in the data.
    present_labels = unique_labels(y_true, y_pred)
    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels,
             np.setdiff1d(present_labels, labels, assume_unique=True)])

    if y_true.ndim == 1:
        # 1-D targets (presumably the binary / multiclass case): one label
        # per sample, so per-sample matrices are not defined.
        if samplewise:
            raise ValueError("Samplewise metrics are not available outside of "
                             "multilabel classification.")

        # Encode labels as integer codes so np.bincount can count them.
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        # `tp` is a boolean mask of correctly predicted samples; `tp_bins`
        # holds the encoded label of each of those samples.
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(tp_bins,
                                 weights=tp_bins_weights,
                                 minlength=len(labels))
        else:
            # Pathological case
            # No correct prediction at all: all three names start out bound
            # to the same zeros array; true_sum / pred_sum are rebound just
            # below whenever there is any data.
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = np.bincount(y_pred,
                                   weights=sample_weight,
                                   minlength=len(labels))
        if len(y_true):
            true_sum = np.bincount(y_true,
                                   weights=sample_weight,
                                   minlength=len(labels))

        # Retain only selected labels
        # searchsorted maps each requested label to its position among the
        # encoder's classes, which also restores the caller's label order.
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]

    else:
        # 2-D targets: sum over rows (axis 0) for per-label counts, or over
        # columns (axis 1) for per-sample counts.
        sum_axis = 1 if samplewise else 0

        # All labels are index integers for multilabel.
        # Select labels:
        if not np.array_equal(labels, present_labels):
            if np.max(labels) > np.max(present_labels):
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d > %d' %
                                 (np.max(labels), np.max(present_labels)))
            if np.min(labels) < 0:
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d < 0' % np.min(labels))

        if n_labels is not None:
            # Keep only the requested label columns, in the requested order.
            y_true = y_true[:, labels[:n_labels]]
            y_pred = y_pred[:, labels[:n_labels]]

        # calculate weighted counts
        # NOTE(review): `.multiply` implies y_true is a sparse matrix here -
        # presumably _check_targets converts multilabel indicators; confirm.
        true_and_pred = y_true.multiply(y_pred)
        tp_sum = count_nonzero(true_and_pred,
                               axis=sum_axis,
                               sample_weight=sample_weight)
        pred_sum = count_nonzero(y_pred,
                                 axis=sum_axis,
                                 sample_weight=sample_weight)
        true_sum = count_nonzero(y_true,
                                 axis=sum_axis,
                                 sample_weight=sample_weight)

    # Predicted-positive minus true-positive = false positive; actual-positive
    # minus true-positive = false negative.
    fp = pred_sum - tp_sum
    fn = true_sum - tp_sum
    tp = tp_sum

    # True negatives are whatever remains of the total (weighted) count once
    # tp, fp and fn are removed. The total is per sample (weight * number of
    # label columns, or just the number of columns) when samplewise, and per
    # label (sum of weights, or number of samples) otherwise.
    if sample_weight is not None and samplewise:
        sample_weight = np.array(sample_weight)
        tp = np.array(tp)
        fp = np.array(fp)
        fn = np.array(fn)
        tn = sample_weight * y_true.shape[1] - tp - fp - fn
    elif sample_weight is not None:
        tn = sum(sample_weight) - tp - fp - fn
    elif samplewise:
        tn = y_true.shape[1] - tp - fp - fn
    else:
        tn = y_true.shape[0] - tp - fp - fn

    # Stack the four counts and reshape so each output gets a 2x2 matrix laid
    # out as [[tn, fp], [fn, tp]].
    return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)
def evaluation(X_train, y_train, X_test, y_test, poly_degree, interaction_only,
               print_coef, plot, ask_user, model_result, model_name,
               model_runtime, regr, alpha):
    """Report train/test error of a fitted regressor and record the metrics.

    Prints the metrics to stdout, appends them to
    ``logs/log_<log_timestr>.txt`` (``log_timestr`` is a module-level name
    defined elsewhere) and stores them in ``model_result``.

    Parameters
    ----------
    X_train, X_test : array with a 2-tuple ``shape``
        Feature matrices the regressor is evaluated on.
    y_train, y_test : array with a ``shape`` attribute
        Targets; flattened with ``column_or_1d`` for the MSE computation.
    poly_degree, interaction_only
        Feature-expansion settings; only reported and used in the result key.
    print_coef : bool
        When truthy, print and log ``regr.coef_`` and/or ``regr.coefs_`` if
        the regressor has them.
    plot : bool
        When equal to ``True``, call ``plot_y_test`` on the test set.
    ask_user
        Forwarded unchanged to ``plot_y_test``.
    model_result : dict
        Accumulator, updated in place. Key:
        ``(model_name, alpha, n_features, poly_degree)``; value:
        ``(mse_train, score_train, mse_test, score_test, fit_runtime,
        predict_train_runtime, predict_test_runtime)``, each rounded to
        three decimals.
    model_name : str
        Label used in the output and in the result key.
    model_runtime : float
        Fit time in seconds, measured by the caller.
    regr : fitted estimator
        Must provide ``predict`` and ``score``.
    alpha
        Regularization strength; reported and used in the result key.

    Returns
    -------
    model_result
        The same mapping that was passed in.
    """
    log_path = "logs/log_" + log_timestr + ".txt"

    print("poly_degree = {}, interaction_only = {}".format(
        poly_degree, interaction_only))
    with open(log_path, "a") as logfile:
        logfile.write("====================\n")
        logfile.write("poly_degree = {}, interaction_only = {}\n".format(
            poly_degree, interaction_only))

    print("Model: {} \n".format(model_name))
    print("Alpha (Regularization strength): {} \n".format(alpha))
    print("X_train.shape = {}".format(X_train.shape))
    print("y_train.shape = {}".format(y_train.shape))
    print("X_test.shape = {}".format(X_test.shape))
    print("y_test.shape = {}".format(y_test.shape))

    if print_coef:
        # `coef_` is what the linear models expose; `coefs_` is the
        # neural_network.MLPRegressor counterpart.
        for attr_name in ('coef_', 'coefs_'):
            if hasattr(regr, attr_name):
                # The format string is applied here; it used to be passed to
                # print() as a separate argument, which printed a literal
                # "{}" before the coefficients.
                coef_line = "Coefficients: {}\n".format(
                    getattr(regr, attr_name))
                print(coef_line)
                with open(log_path, "a") as logfile:
                    logfile.write(coef_line)

    print("For training set:")
    model_rt_predict_train_start = timeit.default_timer()
    predict_train = regr.predict(X_train)
    model_runtime_predict_train = (timeit.default_timer() -
                                   model_rt_predict_train_start)
    # column_or_1d flattens a column-vector target so the subtraction is
    # element-wise instead of broadcasting to a 2-D array.
    mse_train = float(np.mean((predict_train - column_or_1d(y_train))**2))
    score_train = regr.score(X_train, y_train)
    print("Mean squared error (train): {0:.3f} \n".format(mse_train))
    print("Variance score (train): {0:.3f} \n".format(score_train))
    print("model_runtime (training) = {0:.3f} (seconds) \n".format(
        model_runtime))
    print("model_runtime (predict train set) = {0:.3f} (seconds) \n".format(
        model_runtime_predict_train))

    print("For test set:")
    model_rt_predict_test_start = timeit.default_timer()
    predict_test = regr.predict(X_test)
    model_runtime_predict_test = (timeit.default_timer() -
                                  model_rt_predict_test_start)
    mse_test = float(np.mean((predict_test - column_or_1d(y_test))**2))
    score_test = regr.score(X_test, y_test)
    print("Mean squared error (test): {0:.3f} \n".format(mse_test))
    print("Variance score (test): {0:.3f} \n".format(score_test))
    print("model_runtime (predict test set) = {0:.3f} (seconds) \n".format(
        model_runtime_predict_test))

    with open(log_path, "a") as logfile:
        logfile.write("====================\n")
        logfile.write("Features polynomial degree: {} \n".format(poly_degree))
        logfile.write("Model: {} \n".format(model_name))
        logfile.write("Alpha (Regularization strength): {} \n".format(alpha))
        logfile.write("X_train.shape = {} \n".format(X_train.shape))
        logfile.write("y_train.shape = {} \n".format(y_train.shape))
        logfile.write("X_test.shape = {} \n".format(X_test.shape))
        logfile.write("y_test.shape = {} \n".format(y_test.shape))
        logfile.write("For training set: \n")
        logfile.write(
            "Mean squared error (train): {0:.3f} \n".format(mse_train))
        logfile.write("Variance score (train): {0:.3f} \n".format(score_train))
        logfile.write("For test set: \n")
        logfile.write("Mean squared error (test): {0:.3f} \n".format(mse_test))
        logfile.write("Variance score (test): {0:.3f} \n".format(score_test))
        logfile.write("model_runtime (training) = {0:.3f} (seconds) \n".format(
            model_runtime))
        logfile.write(
            "model_runtime (predict train set) = {0:.3f} (seconds) \n".format(
                model_runtime_predict_train))
        logfile.write(
            "model_runtime (predict test set) = {0:.3f} (seconds) \n".format(
                model_runtime_predict_test))
        logfile.write("====================\n")

    # collect info.
    (s_n, f_n) = X_train.shape
    result_key = (model_name, alpha, int(f_n), poly_degree)
    model_result[result_key] = (round(mse_train, 3), round(score_train, 3),
                                round(mse_test, 3), round(score_test, 3),
                                round(model_runtime, 3),
                                round(model_runtime_predict_train, 3),
                                round(model_runtime_predict_test, 3))

    # Equality (not truthiness) is kept on purpose so that only a value equal
    # to True triggers plotting, exactly as before.
    if plot == True:  # noqa: E712
        plot_y_test(regr, X_test, y_test, ask_user)

    return model_result
        clf = KNN()  # 初始化检测器clf
        clf.fit(X_train)  # 使用X_train训练检测器clf

        # 返回训练数据X_train上的异常标签和异常分值
        y_train_pred = clf.labels_  # 返回训练数据上的分类标签 (0: 正常值, 1: 异常值)
        y_train_scores = clf.decision_scores_  # 返回训练数据上的异常值 (分值越大越异常)
        print("On train Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        # 用训练好的clf来预测未知数据中的异常值
        y_test_pred = clf.predict(X_test)  # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值)
        y_test_scores = clf.decision_function(X_test)  # 返回未知数据上的异常值 (分值越大越异常)
        print("On Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        y_true = column_or_1d(y_test)
        y_pred = column_or_1d(y_test_scores)
        check_consistent_length(y_true, y_pred)

        roc = np.round(roc_auc_score(y_true, y_pred), decimals=4),
        prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
        knn_roc.append(roc)
        knn_prn.append(prn)


        clf_name = 'LOF'
        clf = LOF()  # 初始化检测器clf
        clf.fit(X_train)  # 使用X_train训练检测器clf

        # 返回训练数据X_train上的异常标签和异常分值
        y_train_pred = clf.labels_  # 返回训练数据上的分类标签 (0: 正常值, 1: 异常值)
Ejemplo n.º 50
0
 def _validate_input(self, X, y, incremental):
     """Validate ``X`` and ``y`` and flatten a single-column ``y``.

     Both inputs go through ``check_X_y`` (sparse CSR/CSC/COO accepted,
     multi-output allowed, numeric ``y`` required). A 2-D ``y`` with exactly
     one column is then reduced to 1-D by ``column_or_1d``. ``incremental``
     is accepted but not used here.

     Returns the validated ``(X, y)`` pair.
     """
     X, y = check_X_y(X,
                      y,
                      accept_sparse=['csr', 'csc', 'coo'],
                      multi_output=True,
                      y_numeric=True)
     is_single_column = y.ndim == 2 and y.shape[1] == 1
     if is_single_column:
         y = column_or_1d(y, warn=True)
     return X, y
Ejemplo n.º 51
0
    def label_validate(self, y):
        """Return ``y`` flattened by ``column_or_1d`` and cast to float64."""
        flat_labels = column_or_1d(y, warn=True)
        return flat_labels.astype(np.float64)
Ejemplo n.º 52
0
 def transform(self, X):
     """Apply the regex substitution to every element of ``X``.

     ``X`` is flattened with ``column_or_1d``; each element has matches of
     ``self.pattern`` replaced by ``self.replacement``. The per-element
     results from ``eval_rows`` are passed through ``_col2d`` and returned.
     """
     values = column_or_1d(X, warn=True)
     engine = _regex_engine(self.pattern)

     def substitute(x):
         return engine.sub(self.replacement, x)

     return _col2d(eval_rows(values, substitute))
Ejemplo n.º 53
0
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Grid-search ``tau`` and ``lamda`` and keep the best estimator.

        The labels are binarized to -1 / +1, an ``L1L2TwoStepClassifier`` is
        tuned with ``GridSearchCV`` over ``self.taus`` x ``self.lamdas``, and
        the winning estimator's parameters and coefficients are copied onto
        this object.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values

        sample_weight : float or array-like of shape [n_samples]
            Sample weight; handed to the grid search as a fit parameter.

        check_input : bool
            Handed to the grid search as a fit parameter.

        Returns
        -------
        self : Returns self.

        Raises
        ------
        ValueError
            If the binarizer detects a multilabel target.
        """
        binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        self._label_binarizer = binarizer
        y = binarizer.fit_transform(y)
        if binarizer.y_type_.startswith('multilabel'):
            raise ValueError("%s doesn't support multi-label classification" %
                             (self.__class__.__name__))
        y = column_or_1d(y, warn=False)

        # Every option below is forwarded under the same name it has on self.
        estimator_options = {
            name: getattr(self, name)
            for name in ('mu', 'fit_intercept', 'use_gpu', 'threshold',
                         'normalize', 'precompute', 'max_iter', 'copy_X',
                         'tol', 'warm_start', 'positive', 'random_state',
                         'selection')
        }
        search_options = {
            name: getattr(self, name)
            for name in ('cv', 'scoring', 'n_jobs', 'iid', 'refit', 'verbose',
                         'pre_dispatch', 'error_score', 'return_train_score')
        }
        gs = GridSearchCV(estimator=L1L2TwoStepClassifier(**estimator_options),
                          param_grid={
                              'tau': self.taus,
                              'lamda': self.lamdas
                          },
                          fit_params={
                              'sample_weight': sample_weight,
                              'check_input': check_input
                          },
                          **search_options)
        gs.fit(X, y)

        best = gs.best_estimator_
        self.tau_ = best.tau
        self.lamda_ = best.lamda
        self.coef_ = best.coef_
        self.intercept_ = best.intercept_
        self.best_estimator_ = best  # XXX DEBUG

        # NOTE(review): `classes_` is not assigned in this method; presumably
        # it is a property defined on the class - confirm.
        # NOTE(review): coef_ is reshaped only for at most two classes; with
        # more classes it is left as returned by the estimator - confirm
        # that this is intended.
        if self.classes_.shape[0] <= 2:
            self.coef_ = self.coef_.reshape(1, -1)

        return self
Ejemplo n.º 54
0
 def fit(self, y):
     """Store the distinct values of ``y`` as ``classes_``.

     ``y`` is flattened with ``column_or_1d`` and the result of
     ``pandas.Series.unique`` on it is kept. Returns ``self``.
     """
     labels = pd.Series(column_or_1d(y, warn=True))
     self.classes_ = labels.unique()
     return self
Ejemplo n.º 55
0
    def fit(self, X, y, weights=None):
        """Scikit-learn required: compute feature importance scores.

        The features are split into subsets (``self.make_subsets``); a deep
        copy of ``self.relief_object`` is fitted on each subset, and for
        every feature the largest score found across subsets is kept.

        Parameters
        ----------
        X : array-like {n_samples, n_features}
            Training instances to compute the feature importance scores from.
        y : array-like {n_samples}
            Training labels.
        weights : numpy.ndarray, optional
            Per-feature weights; the entries for each subset are forwarded
            to the underlying estimator. Ignored unless it is an ndarray.

        Returns
        -------
        self
            With ``feature_importances_`` and ``top_features_`` set.
        """
        X = check_array(X, force_all_finite=False)
        y = column_or_1d(y)

        # Seeds the *global* numpy and stdlib RNGs.
        if self.random_state is not None:
            np.random.seed(self.random_state)
            random.seed(self.random_state)

        # Make subsets with all the features
        num_features = X.shape[1]
        self.size_feature_subset = min(self.size_feature_subset, num_features)
        subsets = self.make_subsets(list(range(num_features)),
                                    self.num_feature_subset,
                                    self.size_feature_subset)

        # Features outside a subset get a neutral score for that subset: 0
        # when ranking by absolute value, -inf otherwise, so they never win
        # the per-feature maximum below. `-np.inf` is used because the
        # `np.NINF` alias no longer exists in NumPy 2.0.
        fill_value = 0 if self.rank_absolute else -np.inf

        # Fit each subset
        scores = []
        for subset in subsets:
            new_X = self.custom_transform(X, subset)
            copy_relief_object = copy.deepcopy(self.relief_object)
            if isinstance(weights, np.ndarray):
                copy_relief_object.fit(new_X, y, weights=weights[subset])
            else:
                copy_relief_object.fit(new_X, y)
            raw_score = copy_relief_object.feature_importances_

            # Scatter the subset's scores back to full-width feature positions.
            score = np.full(num_features, fill_value, dtype=float)
            for position, index in enumerate(subset):
                score[index] = raw_score[position]
            scores.append(score)

        scores = np.array(scores)

        # Merge results by selecting largest found weight for each feature
        max_scores = []
        for score in scores.T:
            if self.rank_absolute:
                # Largest magnitude; restore the sign if that magnitude came
                # from a negative score.
                largest = np.max(np.absolute(score))
                max_scores.append(largest if largest in score else -largest)
            else:
                max_scores.append(np.max(score))
        max_scores = np.array(max_scores)

        # Save FI as feature_importances_
        self.feature_importances_ = max_scores

        if self.rank_absolute:
            self.top_features_ = np.argsort(
                np.absolute(self.feature_importances_))[::-1]
        else:
            self.top_features_ = np.argsort(self.feature_importances_)[::-1]

        return self
Ejemplo n.º 56
0
    def fit(self, X, y, sample_weight=None, monitor=None):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights given to each sample. If omitted, all samples have weight 1.

        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator and the local variables of
            ``_fit_stages`` as keyword arguments ``callable(i, self,
            locals())``. If the callable returns ``True`` the fitting procedure
            is stopped. The monitor can be used for various things such as
            computing held-out estimates, early stopping, model introspect, and
            snapshoting.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        X, event, time = check_arrays_survival(
            X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
        n_samples, self.n_features_ = X.shape

        X = X.astype(DTYPE)
        if sample_weight is None:
            sample_weight = numpy.ones(n_samples, dtype=numpy.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
            check_consistent_length(X, sample_weight)

        self._check_params()

        self.loss_ = LOSS_FUNCTIONS[self.loss](1)
        # These two losses are fitted on log-transformed times.
        if isinstance(self.loss_, (CensoredSquaredLoss, IPCWLeastSquaresError)):
            time = numpy.log(time)

        self._init_state()
        self.init_.fit(X, (event, time), sample_weight)
        y_pred = self.init_.predict(X)
        begin_at_stage = 0

        # fit the boosting stages
        # Rebuild a structured (event, time) array from the validated (and
        # possibly log-transformed) columns. The builtin `bool` is used for
        # the event field: the `numpy.bool` alias it replaces was removed in
        # NumPy 1.24 and referred to the same type.
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', bool), ('time', numpy.float64)])
        n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
                                    begin_at_stage, monitor)
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
            self.train_score_ = self.train_score_[:n_stages]
            if hasattr(self, 'oob_improvement_'):
                self.oob_improvement_ = self.oob_improvement_[:n_stages]

        return self
Ejemplo n.º 57
0
 def __call__(self, est, X, y_true):
     """Score ``est`` on ``X`` using its probability for the second class.

     ``y_true`` is validated and flattened with ``column_or_1d``; column 1 of
     ``est.predict_proba(X)`` is used as the prediction, and both are passed
     to ``self.score_fct``, whose result is returned.
     """
     # Keep the flattened array: the call's result used to be discarded, so a
     # column-vector y_true reached the score function as 2-D.
     y_true = column_or_1d(y_true)
     y_pred = est.predict_proba(X)[:, 1]
     return self.score_fct(y_true, y_pred)
Ejemplo n.º 58
0
 def fit(self, X, y=None):
     """Store the sorted unique non-null values of ``X`` as ``classes_``.

     ``X`` is flattened with ``column_or_1d``; entries for which
     ``pandas.isnull`` is true are dropped before ``numpy.unique`` is
     applied. ``y`` is ignored. Returns ``self``.
     """
     values = column_or_1d(X, warn=True)
     not_null = ~pandas.isnull(values)
     self.classes_ = numpy.unique(values[not_null])
     return self
Ejemplo n.º 59
0
	def transform(self, y):
		"""Encode each value of ``y`` as its position in ``self.classes_``.

		``y`` is flattened with ``column_or_1d``. Null values (per
		``pandas.isnull``) are encoded as ``self.missing_value``; every other
		value is replaced by its index in ``self.classes_``. A value not
		found there raises ``ValueError`` (from ``list.index``).
		"""
		values = column_or_1d(y, warn = True)
		known = list(self.classes_)
		encoded = []
		for value in values:
			if pandas.isnull(value):
				encoded.append(self.missing_value)
			else:
				encoded.append(known.index(value))
		return numpy.array(encoded)
Ejemplo n.º 60
0
 def transform(self, X):
     """Flag every element of ``X`` that matches ``self.pattern``.

     ``X`` is flattened with ``column_or_1d``; each element maps to ``True``
     when the compiled pattern is found anywhere in it and ``False``
     otherwise. The per-element results from ``eval_rows`` are passed
     through ``_col2d`` and returned.
     """
     values = column_or_1d(X, warn=True)
     engine = _regex_engine(self.pattern)

     def matches(x):
         return bool(engine.search(x))

     return _col2d(eval_rows(values, matches))