Code example #1
File: utils.py  Project: woshahua/eli5
def fit_proba(clf,
              X,
              y_proba,
              expand_factor=10,
              sample_weight=None,
              shuffle=True,
              random_state=None,
              **fit_params):
    """
    Fit classifier ``clf`` to return probabilities close to ``y_proba``.

    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors. As a workaround, this function
    expands the dataset according to target probabilities.
    Use ``expand_factor=None`` to turn it off
    (e.g. if probability scores are 0/1 in the first place).
    """
    rng = check_random_state(random_state)
    if expand_factor:
        if sample_weight is not None:
            X, y, sample_weight = zip(
                *expand_dataset(X,
                                y_proba,
                                factor=expand_factor,
                                random_state=rng,
                                extra_arrays=[sample_weight]))
        else:
            X, y = zip(*expand_dataset(
                X, y_proba, factor=expand_factor, random_state=rng))
    else:
        y = y_proba.argmax(axis=1)

    if shuffle:
        if sample_weight is not None:
            X, y, sample_weight = _shuffle(X,
                                           y,
                                           sample_weight,
                                           random_state=rng)
        else:
            X, y = _shuffle(X, y, random_state=rng)

    param_name = _get_classifier_prefix(clf) + "sample_weight"
    fit_params.setdefault(param_name, sample_weight)
    clf.fit(X, y, **fit_params)
    return clf
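A minimal usage sketch for the function above (the classifier and toy data below are illustrative assumptions, not part of the original file):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 5))
# Soft targets: each row is a probability distribution over two classes.
p = 1.0 / (1.0 + np.exp(-X[:, 0]))
y_proba = np.column_stack([1.0 - p, p])

clf = fit_proba(LogisticRegression(), X, y_proba,
                expand_factor=10, random_state=42)
print(clf.predict_proba(X[:3]))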
Code example #2
def expanded_X_y_sample_weights(X,
                                y_proba,
                                expand_factor=10,
                                sample_weight=None,
                                shuffle=True,
                                random_state=None):
    """
    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors.
    As a workaround, this function expands the dataset according to
    target probabilities. ``expand_factor=None`` means no dataset
    expansion.
    """
    rng = check_random_state(random_state)
    if expand_factor:
        if sample_weight is not None:
            X, y, sample_weight = zip(
                *expand_dataset(X,
                                y_proba,
                                factor=expand_factor,
                                random_state=rng,
                                extra_arrays=[sample_weight]))
        else:
            X, y = zip(*expand_dataset(
                X, y_proba, factor=expand_factor, random_state=rng))
    else:
        y = y_proba.argmax(axis=1)

    if isinstance(X, (list, tuple)) and len(X) and issparse(X[0]):
        X = vstack(X)

    if shuffle:
        if sample_weight is not None:
            X, y, sample_weight = _shuffle(X,
                                           y,
                                           sample_weight,
                                           random_state=rng)
        else:
            X, y = _shuffle(X, y, random_state=rng)
    return X, y, sample_weight
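For intuition, the dataset expansion both functions rely on can be sketched in plain numpy; this is a simplified illustration of the idea, not the project's expand_dataset implementation:

import numpy as np

def expand_dataset_sketch(X, y_proba, factor=10, random_state=None):
    # Repeat each sample `factor` times, drawing a hard label each time
    # from that sample's target probability distribution.
    rng = np.random.RandomState(random_state)
    classes = np.arange(y_proba.shape[1])
    for x, probs in zip(X, y_proba):
        for label in rng.choice(classes, size=factor, p=probs):
            yield x, label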
Code example #3
def make_classification_dataset(X_top, X_bot, shuffle=False, dtype="float64",
                                random_state=None):

    y_top = np.ones(len(X_top))
    y_bot = np.zeros(len(X_bot))

    X = np.vstack([X_top, X_bot]).astype(dtype)
    y = np.hstack([y_top, y_bot])

    if shuffle:
        X, y = _shuffle(X, y, random_state=random_state)

    return X, y
Code example #4
def make_classification_dataset(X_pos,
                                X_neg,
                                shuffle=False,
                                dtype="float64",
                                random_state=None):

    X = np.vstack([X_pos, X_neg]).astype(dtype)
    y = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_neg))])

    if shuffle:
        X, y = _shuffle(X, y, random_state=random_state)

    return X, y
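A brief usage sketch for either variant (the toy arrays are assumptions): two groups of samples are stacked into one labelled dataset, with label 1 for the first group and 0 for the second.

import numpy as np

X_pos = np.random.RandomState(0).normal(loc=+1.0, size=(50, 3))
X_neg = np.random.RandomState(1).normal(loc=-1.0, size=(80, 3))

X, y = make_classification_dataset(X_pos, X_neg, shuffle=True, random_state=0)
print(X.shape, y.shape)  # (130, 3) (130,)
print(int(y.sum()))      # 50 positive labels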
Code example #5
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    data = fetch_openml('mnist_784')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
Code example #6
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    # Note: fetch_mldata was removed in scikit-learn 0.22; newer code uses
    # fetch_openml('mnist_784') instead (see the previous example).
    data = fetch_mldata('MNIST original')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
Code example #7
def SampleGenerator(images, labels, batch_size=1, random=True, augment=None):
    """ Infinite generator of batches of (input image, labelled image) """
    num_samples = len(images)
    while True:
        # Shuffle the data to avoid looping over in the same order every time
        if random:
            images, labels = _shuffle(images, labels)
        
        for cnt in range(num_samples // batch_size):
            offset = cnt * batch_size
            
            x_batch = []
            y_batch = []
            
            for idx in range(offset, offset + batch_size):
                x = images[idx]
                y = labels[idx]
                if augment is not None:
                    x, y = augment(x, y)
                
                x_batch.append(x)
                y_batch.append(y)
            
            yield _np.array(x_batch), _np.array(y_batch)
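A possible way to drive the generator (the toy image and label arrays, and the batch size, are assumptions; _np is taken to be the module's numpy alias):

import numpy as np

# Eight toy "images" with matching per-pixel label masks.
images = np.random.RandomState(0).rand(8, 32, 32, 1)
labels = np.random.RandomState(1).randint(0, 2, size=(8, 32, 32, 1))

gen = SampleGenerator(images, labels, batch_size=4, random=True)
x_batch, y_batch = next(gen)
print(x_batch.shape, y_batch.shape)  # (4, 32, 32, 1) (4, 32, 32, 1)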
Code example #8
    def iter(
        self,
        negative_samples: int = 0,
        output_dim: int = 1,
        shuffle: bool = True,
        aux_matrix: Optional[coo_matrix] = None,
        sampling_mode: str = "relative",
    ) -> Iterator[Tuple[ndarray, ndarray, float]]:
        """
        Iterate over a sequence of ([user_vector_{i}, item_vector_{i}], ratings_{i}).

        In practice, this will result in each user-item interaction being yielded,
        optionally with additional metadata, and optionally with 'n' negative sample
        instances.

        By default (with 'output_dim=1', and/or with no metadata), this will yield:

        ([user_{i}], [item_{i}], ratings_{i})

        Concretely, this may be: ([0], [1], 1). If user/item meta-data is provided, this
        will be lazily injected into the yielded value, for example:

        ([user_{i}, user_tag_{i}{1}, ..., user_tag_{i}{n}],
         [item_{i}, item_tag_{i}{1}, ..., item_tag_{i}{n}],
         ratings_{i})

        Again, concretely: ([0, 21, 82], [1, 97, 64], 1). Make sure to set 'output_dim'
        if you want your metadata rendered (if you provided it)!

        Parameters
        ----------
        negative_samples: int
            The total number of negative samples (for each positive sample) you wish
            to take from the provided interactions set (and auxiliary matrix, if
            provided).
        output_dim: int
            The output dimensions for _both_ the encoded user- and item-vectors. Note
            that this will only be applied if user/item metadata is provided, otherwise
            all output vectors will have 'output_dim=1'.
        shuffle: bool
            Indicate whether the output data should be shuffled.
        aux_matrix: coo_matrix, optional
            Provide a sparse matrix of the same shape as the 'interactions' matrix with
            additional interactions terms. These terms will be used when taking negative
            samples. This can be useful if this Dataset is a 'test' dataset, and you
            wish to draw negative samples from items a user has never interacted with
            when generating an evaluation set. This is the process described in [1].
        sampling_mode: str
            If negative sampling is used, specify the sampling mode you wish to use.
            See 'xanthus.dataset.utils.single_negative_sample' for more details.

        Returns
        -------
        output: Generator
            A generator yielding user/item vectors and the associated pairing's rating.

        See Also
        --------
            xanthus.evaluate.utils.he_sample
            xanthus.dataset.utils.single_negative_sample

        References
        ----------
        [1] He et al. https://dl.acm.org/doi/10.1145/3038912.3052569

        """

        # must cast interactions to csr so we can use indexing on the matrix.
        interactions: csr_matrix = self.interactions.tocsr()

        # setup user metadata
        if self.user_meta is not None:
            user_meta = self.user_meta.tocsr()
        else:
            user_meta = None

        # setup item metadata
        if self.item_meta is not None:
            item_meta = self.item_meta.tocsr()
        else:
            item_meta = None

        users, items = interactions.nonzero()
        ratings = interactions.data

        if negative_samples > 0:
            # the aux_matrix should include additional interactions you wish to consider
            # _exclusively_ for the purposes of generating negative samples.
            users, items, ratings = self.sampler(
                users,
                items,
                ratings,
                interactions,
                negative_samples,
                sampling_mode,
                aux_matrix,
                concat=True,
            )

        # optionally shuffle the users, items and ratings.
        if shuffle:
            users, items, ratings = _shuffle(users, items, ratings)

        ratings.reshape(-1, 1)

        # stack user ids with associated user metadata.
        if user_meta is not None and output_dim > 1:
            users = self._iter_meta(users, user_meta, output_dim)
        elif output_dim > 1:
            users = np.c_[users,
                          np.zeros((len(users), output_dim - 1), dtype=int)]
        else:
            users = users.reshape(-1, 1)

        # stack item ids with associated item metadata.
        if item_meta is not None and output_dim > 1:
            items = self._iter_meta(items, item_meta, output_dim)
        elif output_dim > 1:
            items = np.c_[items, np.zeros((len(items), output_dim - 1))]
        else:
            items = items.reshape(-1, 1)

        for (user, item, rating) in zip(users, items, ratings):
            yield user, item, rating
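The id-padding branches above (used when output_dim > 1 but no metadata is available) can be illustrated standalone; this is only the numpy behaviour, not the class itself:

import numpy as np

users = np.array([0, 1, 2])
output_dim = 3
# Pad each id with zeros so every output vector has length `output_dim`.
padded = np.c_[users, np.zeros((len(users), output_dim - 1), dtype=int)]
print(padded)
# [[0 0 0]
#  [1 0 0]
#  [2 0 0]]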
Code example #9
def make_blobs(centers=5,
               center_box=(-10., 10.),
               cluster_std=1.,
               contamination=0.02,
               n_features=25,
               n_samples=500,
               random_state=None,
               shuffle=True):
    """Generate isotropic Gaussian blobs with outliers.

    Parameters
    ----------
    centers : int or array-like of shape (n_centers, n_features), default 5
        Number of centers to generate, or the fixed center locations.

    center_box : pair of floats (min, max), default (-10.0, 10.0)
        Bounding box for each cluster center when centers are generated at
        random.

    cluster_std : float or array-like of shape (n_centers,), default 1.0
        Standard deviation of the clusters.

    contamination : float, default 0.02
        Proportion of outliers in the data set.

    n_features : int, default 25
        Number of features for each sample.

    n_samples : int, default 500
        Number of samples.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Generated data.

    y : array-like of shape (n_samples,)
        Return -1 for outliers and +1 for inliers.

    References
    ----------
    .. [#kriegel08] Kriegel, H.-P., Schubert, M., and Zimek, A.,
        "Angle-based outlier detection in high-dimensional data,"
        In Proceedings of SIGKDD, pp. 444-452, 2008.

    .. [#sugiyama13] Sugiyama, M., and Borgwardt, K.,
        "Rapid distance-based outlier detection via sampling,"
        Advances in NIPS, pp. 467-475, 2013.

    Examples
    --------
    >>> from kenchi.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=10, n_features=2, contamination=0.1)
    >>> X.shape
    (10, 2)
    >>> y.shape
    (10,)
    """

    check_contamination(contamination)

    rnd = check_random_state(random_state)

    n_inliers = int(np.round((1. - contamination) * n_samples))
    X_inlier, _ = _make_blobs(centers=centers,
                              center_box=center_box,
                              cluster_std=cluster_std,
                              n_features=n_features,
                              n_samples=n_inliers,
                              random_state=rnd,
                              shuffle=False)

    data_max = np.max(X_inlier, axis=0)
    data_min = np.min(X_inlier, axis=0)

    n_outliers = n_samples - n_inliers
    X_outlier = rnd.uniform(low=np.minimum(center_box[0], data_min),
                            high=np.maximum(center_box[1], data_max),
                            size=(n_outliers, n_features))

    X = np.concatenate([X_inlier, X_outlier])
    y = np.empty(n_samples, dtype=int)
    y[:n_inliers] = POS_LABEL
    y[n_inliers:] = NEG_LABEL

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y