def fit_proba(clf, X, y_proba, expand_factor=10, sample_weight=None,
              shuffle=True, random_state=None, **fit_params):
    """
    Fit classifier ``clf`` to return probabilities close to ``y_proba``.

    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors. As a workaround this
    function expands the dataset according to target probabilities.
    Use expand_factor=None to turn it off
    (e.g. if probability scores are 0/1 in a first place).

    Dataset preparation (expansion, stacking of sparse rows, shuffling) is
    delegated to ``expanded_X_y_sample_weights`` so both entry points build
    exactly the same training data.

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y, **fit_params)``.
    X
        Training samples.
    y_proba
        Target probabilities, one row per sample.
    expand_factor
        Passed to the dataset expansion as ``factor``; a falsy value skips
        the expansion and uses ``y_proba.argmax(axis=1)`` as labels.
    sample_weight
        Optional per-sample weights, expanded and shuffled along with ``X``.
    shuffle
        Whether to shuffle the prepared dataset before fitting.
    random_state
        Seed or random state used for expansion and shuffling.
    **fit_params
        Extra keyword arguments forwarded to ``clf.fit``. An explicitly
        supplied sample-weight entry takes precedence over ``sample_weight``.

    Returns
    -------
    The fitted ``clf`` (the same object that was passed in).
    """
    X, y, sample_weight = expanded_X_y_sample_weights(
        X, y_proba,
        expand_factor=expand_factor,
        sample_weight=sample_weight,
        shuffle=shuffle,
        random_state=random_state,
    )
    # The keyword is prefixed by _get_classifier_prefix(clf); presumably this
    # routes the weights to the final step of a pipeline -- confirm in helper.
    param_name = _get_classifier_prefix(clf) + "sample_weight"
    # setdefault: never override a weight the caller put into fit_params.
    fit_params.setdefault(param_name, sample_weight)
    clf.fit(X, y, **fit_params)
    return clf
def expanded_X_y_sample_weights(X, y_proba, expand_factor=10,
                                sample_weight=None, shuffle=True,
                                random_state=None):
    """
    Turn soft targets into a dataset with hard labels.

    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors. As a workaround this
    function expands the dataset according to target probabilities.
    ``expand_factor=None`` means no dataset expansion; in that case the
    label of each sample is ``y_proba.argmax(axis=1)``.

    Returns
    -------
    (X, y, sample_weight)
        The prepared samples, labels and weights. ``sample_weight`` is
        returned unchanged (``None``) when no weights were supplied.
    """
    rng = check_random_state(random_state)
    weighted = sample_weight is not None

    if not expand_factor:
        # No expansion requested: the most probable class is the label.
        y = y_proba.argmax(axis=1)
    elif weighted:
        rows = expand_dataset(X, y_proba, factor=expand_factor,
                              random_state=rng,
                              extra_arrays=[sample_weight])
        X, y, sample_weight = zip(*rows)
    else:
        rows = expand_dataset(X, y_proba, factor=expand_factor,
                              random_state=rng)
        X, y = zip(*rows)

    # zip() leaves a sequence of per-sample pieces; when those pieces are
    # sparse they are stacked back into a single matrix.
    is_sequence = isinstance(X, (list, tuple))
    if is_sequence and len(X) and issparse(X[0]):
        X = vstack(X)

    if not shuffle:
        return X, y, sample_weight

    if weighted:
        X, y, sample_weight = _shuffle(X, y, sample_weight, random_state=rng)
    else:
        X, y = _shuffle(X, y, random_state=rng)
    return X, y, sample_weight
def make_classification_dataset(X_top, X_bot, shuffle=False,
                                dtype="float64", random_state=None):
    """Stack two groups of samples into one binary-labelled dataset.

    Rows of ``X_top`` are labelled 1 and rows of ``X_bot`` are labelled 0.
    The stacked feature matrix is cast to ``dtype``. When ``shuffle`` is
    true, features and labels are shuffled together using ``random_state``.

    Returns
    -------
    (X, y)
        Feature matrix and the matching label vector.
    """
    n_top, n_bot = len(X_top), len(X_bot)

    features = np.vstack([X_top, X_bot]).astype(dtype)

    # Start from all zeros, then flag the leading "top" rows as positives.
    labels = np.zeros(n_top + n_bot)
    labels[:n_top] = 1.0

    if not shuffle:
        return features, labels

    features, labels = _shuffle(features, labels, random_state=random_state)
    return features, labels
def make_classification_dataset(X_pos, X_neg, shuffle=False,
                                dtype="float64", random_state=None):
    """Combine positive and negative samples into ``(X, y)``.

    ``X`` is ``X_pos`` stacked on top of ``X_neg`` and cast to ``dtype``.
    ``y`` holds 1 for every positive row followed by 0 for every negative
    row. With ``shuffle`` enabled both arrays are shuffled in unison using
    ``random_state``.
    """
    stacked = np.vstack([X_pos, X_neg])
    X = stacked.astype(dtype)

    # One label per input group, repeated once per row of that group.
    group_sizes = [len(X_pos), len(X_neg)]
    y = np.repeat([1.0, 0.0], group_sizes)

    if not shuffle:
        return X, y

    X, y = _shuffle(X, y, random_state=random_state)
    return X, y
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Fetch the ``mnist_784`` dataset and return ``(X, y)``.

    The feature matrix is validated and converted with ``check_array``
    using the requested ``dtype`` and memory ``order``. If ``shuffle`` is
    true, features and targets are shuffled together with ``seed`` as the
    random state. The features are then divided by 255 in place.
    """
    print("Loading dataset...")
    bunch = fetch_openml('mnist_784')

    features = check_array(bunch['data'], dtype=dtype, order=order)
    targets = bunch["target"]

    if shuffle:
        features, targets = _shuffle(features, targets, random_state=seed)

    # Normalize features (in place, so the requested dtype is kept).
    features /= 255
    return features, targets
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Fetch the ``MNIST original`` dataset and return ``(X, y)``.

    The feature matrix is validated and converted with ``check_array``
    using the requested ``dtype`` and memory ``order``. If ``shuffle`` is
    true, features and targets are shuffled together with ``seed`` as the
    random state. The features are then divided by 255 in place.
    """
    print("Loading dataset...")
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and
    # removed in 0.22 -- confirm the pinned scikit-learn version still has it.
    bunch = fetch_mldata('MNIST original')

    features = check_array(bunch['data'], dtype=dtype, order=order)
    targets = bunch["target"]

    if shuffle:
        features, targets = _shuffle(features, targets, random_state=seed)

    # Normalize features (in place, so the requested dtype is kept).
    features /= 255
    return features, targets
def SampleGenerator(images, labels, batch_size=1, random=True, augment=None):
    """
    Infinite generator of batches of (input image, labelled image).

    Parameters
    ----------
    images, labels
        Indexable sequences of equal length; ``labels[i]`` belongs to
        ``images[i]``.
    batch_size
        Number of samples per yielded batch. Samples that do not fill a
        complete batch at the end of a pass are skipped for that pass.
    random
        If true, images and labels are shuffled together before every pass.
    augment
        Optional callable ``augment(x, y) -> (x, y)`` applied to each sample.

    Yields
    ------
    (x_batch, y_batch)
        Two arrays holding ``batch_size`` samples each.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or larger than the number of
        samples. Without this check no batch could ever be produced and the
        generator would loop forever without yielding. As with any
        generator, the error surfaces on the first ``next()`` call.
    """
    num_samples = len(images)
    if batch_size < 1:
        raise ValueError(
            "batch_size must be at least 1, got {!r}".format(batch_size))
    batches_per_pass = num_samples // batch_size
    if batches_per_pass == 0:
        raise ValueError(
            "batch_size ({}) exceeds the number of samples ({}); "
            "no batch can be produced".format(batch_size, num_samples))

    while True:
        # Shuffle the data to avoid looping over in the same order every time
        if random:
            images, labels = _shuffle(images, labels)

        for batch_idx in range(batches_per_pass):
            offset = batch_idx * batch_size
            x_batch = []
            y_batch = []
            # Index one sample at a time so that any container supporting
            # integer indexing works, and augment() sees single samples.
            for idx in range(offset, offset + batch_size):
                x, y = images[idx], labels[idx]
                if augment is not None:
                    x, y = augment(x, y)
                x_batch.append(x)
                y_batch.append(y)
            yield _np.array(x_batch), _np.array(y_batch)
def iter(
    self,
    negative_samples: int = 0,
    output_dim: int = 1,
    shuffle: bool = True,
    aux_matrix: Optional[coo_matrix] = None,
    sampling_mode: str = "relative",
) -> Iterator[Tuple[ndarray, ndarray, float]]:
    """
    Iterate over a sequence of ([user_vector_{i}, item_vector_{i}], ratings_{i}).

    In practice, this will result in each user-item interaction being yielded,
    optionally with additional metadata, and optionally with 'n' negative sample
    instances.

    By default (with 'output_dim=1', and/or with no metadata), this will yield:

        ([user_{i}], [item_{i}], ratings_{i})

    Concretely, this may be:

        ([0], [1], 1)

    If user/item meta-data is provided, this will be lazily injected into the
    yielded value, for example:

        ([user_{i}, user_tag_{i}{1}, ..., user_tag_{i}{n}],
         [item_{i}, item_tag_{i}{1}, ..., item_tag_{i}{n}],
         ratings_{i})

    Again, concretely:

        ([0, 21, 82], [1, 97, 64], 1).

    Make sure to set 'output_dim' if you want your metadata rendered (if you
    provided it)!

    Parameters
    ----------
    negative_samples: int
        The total number of negative samples (for each positive sample) you wish to
        take from the provided interactions set (and auxiliary matrix, if provided).
    output_dim: int
        The output dimensions for _both_ the encoded user- and item-vectors. When
        'output_dim > 1' and no metadata is available for users (or items), the
        ids are padded with integer zeros up to 'output_dim' columns.
    shuffle: bool
        Indicate whether the output data should be shuffled.
    aux_matrix: coo_matrix, optional
        Provide a sparse matrix of the same shape as the 'interactions' matrix with
        additional interactions terms. These terms will be used when taking negative
        samples. This can be useful if this Dataset is a 'test' dataset, and you
        wish to draw negative samples from items a user has never interacted with
        when generating an evaluation set. This is the process described in [1].
    sampling_mode: str
        If negative sampling is used, specify the sampling mode you wish to use.
        See 'xanthus.dataset.utils.single_negative_sample' for more details.

    Returns
    -------
    output: Generator
        A generator yielding user/item vectors and the associated pairing's rating.

    See Also
    --------
    xanthus.evaluate.utils.he_sample
    xanthus.dataset.utils.single_negative_sample

    References
    ----------
    [1] He et al. https://dl.acm.org/doi/10.1145/3038912.3052569
    """

    # must cast interactions to csr so we can use indexing on the matrix.
    interactions: csr_matrix = self.interactions.tocsr()

    # setup user / item metadata (both are optional).
    user_meta = self.user_meta.tocsr() if self.user_meta is not None else None
    item_meta = self.item_meta.tocsr() if self.item_meta is not None else None

    users, items = interactions.nonzero()
    # 'nonzero' skips explicitly stored zeros while '.data' keeps them, so the
    # same filter is applied here to keep ratings aligned with users/items.
    ratings = interactions.data[interactions.data != 0]

    if negative_samples > 0:
        # the aux_matrix should include additional interactions you wish to consider
        # _exclusively_ for the purposes of generating negative samples.
        users, items, ratings = self.sampler(
            users,
            items,
            ratings,
            interactions,
            negative_samples,
            sampling_mode,
            aux_matrix,
            concat=True,
        )

    # optionally shuffle the users, items and ratings (in unison).
    if shuffle:
        users, items, ratings = _shuffle(users, items, ratings)

    # stack user ids with associated user metadata.
    if user_meta is not None and output_dim > 1:
        users = self._iter_meta(users, user_meta, output_dim)
    elif output_dim > 1:
        users = np.c_[users, np.zeros((len(users), output_dim - 1), dtype=int)]
    else:
        users = users.reshape(-1, 1)

    # stack item ids with associated item metadata.
    if item_meta is not None and output_dim > 1:
        items = self._iter_meta(items, item_meta, output_dim)
    elif output_dim > 1:
        # integer padding (as for users) so the item ids are not cast to float.
        items = np.c_[items, np.zeros((len(items), output_dim - 1), dtype=int)]
    else:
        items = items.reshape(-1, 1)

    # ratings stay scalar: each yielded element is (user_vec, item_vec, rating).
    yield from zip(users, items, ratings)
def make_blobs(centers=5, center_box=(-10., 10.), cluster_std=1.,
               contamination=0.02, n_features=25, n_samples=500,
               random_state=None, shuffle=True):
    """Generate isotropic Gaussian blobs with outliers.

    Inliers are drawn as Gaussian blobs; outliers are drawn uniformly from a
    box that covers both ``center_box`` and the per-feature range of the
    generated inliers.

    Parameters
    ----------
    centers : int or array-like of shape (n_centers, n_features), default 5
        Number of centers to generate, or the fixed center locations.

    center_box : pair of floats (min, max), default (-10.0, 10.0)
        Bounding box for each cluster center when centers are generated at
        random.

    cluster_std : float or array-like of shape (n_centers,), default 1.0
        Standard deviation of the clusters.

    contamination : float, default 0.02
        Proportion of outliers in the data set.

    n_features : int, default 25
        Number of features for each sample.

    n_samples : int, default 500
        Number of samples.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Generated data.

    y : array-like of shape (n_samples,)
        ``POS_LABEL`` for inliers and ``NEG_LABEL`` for outliers
        (documented as +1 and -1 respectively).

    References
    ----------
    .. [#kriegel08] Kriegel, H.-P., Schubert, M., and Zimek, A.,
        "Angle-based outlier detection in high-dimensional data,"
        In Proceedings of SIGKDD, pp. 444-452, 2008.

    .. [#sugiyama13] Sugiyama, M., and Borgwardt, K.,
        "Rapid distance-based outlier detection via sampling,"
        Advances in NIPS, pp. 467-475, 2013.

    Examples
    --------
    >>> from kenchi.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=10, n_features=2, contamination=0.1)
    >>> X.shape
    (10, 2)
    >>> y.shape
    (10,)
    """
    check_contamination(contamination)
    rnd = check_random_state(random_state)

    # Split the requested sample count between inliers and outliers.
    inlier_fraction = 1. - contamination
    n_inliers = int(np.round(inlier_fraction * n_samples))
    n_outliers = n_samples - n_inliers

    X_inlier, _ = _make_blobs(
        centers=centers,
        center_box=center_box,
        cluster_std=cluster_std,
        n_features=n_features,
        n_samples=n_inliers,
        random_state=rnd,
        shuffle=False,
    )

    # Per-feature sampling box: never tighter than center_box, widened
    # wherever the inliers reach beyond it.
    lower = np.minimum(center_box[0], np.min(X_inlier, axis=0))
    upper = np.maximum(center_box[1], np.max(X_inlier, axis=0))
    X_outlier = rnd.uniform(
        low=lower, high=upper, size=(n_outliers, n_features)
    )

    X = np.concatenate([X_inlier, X_outlier])

    # Inliers occupy the first n_inliers rows, outliers the remainder.
    y = np.full(n_samples, NEG_LABEL, dtype=int)
    y[:n_inliers] = POS_LABEL

    if not shuffle:
        return X, y

    X, y = _shuffle(X, y, random_state=rnd)
    return X, y