def test_empty():
    """one_hot of an empty sequence yields an empty 2d array."""
    # Without an explicit dimension the result has zero columns
    encoded = one_hot([])
    np.testing.assert_array_equal(encoded, np.zeros((0, 0)))
    # With `dim` given, the column count is honoured even with no rows
    encoded = one_hot([], dim=2)
    np.testing.assert_array_equal(encoded, np.zeros((0, 2)))
def one_hot_probs(value):
    """Turn predicted values into one-hot probability estimates.

    For a single target this is plain one-hot encoding. For multiple
    targets, each column of `value` is encoded separately into a 3d
    array whose last axis is padded to the cardinality of the largest
    class variable.
    """
    if not multitarget:
        return one_hot(value)
    class_vars = self.domain.class_vars
    max_card = max(len(var.values) for var in class_vars)
    probs = np.zeros(value.shape + (max_card,), float)
    for col in range(len(class_vars)):
        probs[:, col, :] = one_hot(value[:, col])
    return probs
def _get_bin_distributions(self, bin_indices):
    """Compute the distribution of instances within bins.

    Parameters
    ----------
    bin_indices : np.ndarray
        An array with same shape as `x` but containing the bin index of
        the instance.

    Returns
    -------
    np.ndarray
        A 2d array; the first dimension represents different bins, the
        second - the counts of different target values.

    """
    if self.target_var and self.target_var.is_discrete:
        y = self.y
        # TODO This probably also isn't the best handling of sparse data...
        if sp.issparse(y):
            y = np.squeeze(np.array(y.todense()))

        # y can contain missing values; drop them together with their
        # corresponding bin indices, otherwise one_hot would choke on
        # the NaNs and the counts would be wrong
        y_nan_mask = np.isnan(y)
        y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

        y = one_hot(y)
        # If the largest discrete value never appears in y, one-hot
        # encoding produces too few columns; pad with zeros so the
        # distribution has exactly one column per target value
        if y.shape[1] != len(self.target_var.values):
            n_missing_columns = len(self.target_var.values) - y.shape[1]
            y = np.hstack((y, np.zeros((y.shape[0], n_missing_columns))))

        bins = np.arange(self.n_bins)[:, np.newaxis]
        # Broadcast: one boolean row per bin selecting its instances
        mask = bin_indices == bins
        distributions = np.zeros((self.n_bins, y.shape[1]))
        for bin_idx in range(self.n_bins):
            distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
    else:
        distributions, _ = ut.bincount(bin_indices.astype(np.int64))
        # To keep things consistent across different variable types, we
        # want to return a 2d array where the first dim represent different
        # bins, and the second the distributions.
        distributions = distributions[:, np.newaxis]

    return distributions
def test_dim(self):
    """An explicit `dim` pads the encoding with extra zero columns."""
    expected = [[1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 0, 1, 0],
                [0, 1, 0, 0]]
    np.testing.assert_array_equal(one_hot(self.values, dim=4), expected)
def test_one_hot(self):
    """Each value maps to a row with a single one at its own index."""
    expected = [[1, 0, 0],
                [0, 1, 0],
                [0, 0, 1],
                [0, 1, 0]]
    np.testing.assert_array_equal(one_hot(self.values), expected)
def _get_bin_distributions(self, bin_indices):
    """Compute the distribution of instances within bins.

    Parameters
    ----------
    bin_indices : np.ndarray
        An array with same shape as `x` but containing the bin index of
        the instance.

    Returns
    -------
    np.ndarray
        A 2d array; the first dimension represents different bins, the
        second - the counts of different target values.

    """
    if self.target_var and self.target_var.is_discrete:
        y = self.y
        # TODO This probably also isn't the best handling of sparse data...
        if sp.issparse(y):
            y = np.squeeze(np.array(y.todense()))

        # Since y can contain missing values, we need to filter them out as
        # well as their corresponding `x` values
        y_nan_mask = np.isnan(y)
        y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

        y = one_hot(y)
        # In the event that y does not take up all the values and the
        # largest discrete value does not appear at all, one hot encoding
        # will produce too few columns. This causes problems, so we need to
        # pad y with zeros to properly compute the distribution
        if y.shape[1] != len(self.target_var.values):
            n_missing_columns = len(self.target_var.values) - y.shape[1]
            y = np.hstack((y, np.zeros((y.shape[0], n_missing_columns))))

        # Broadcasting: `mask[b]` is a boolean row selecting the instances
        # that fall into bin `b`
        bins = np.arange(self.n_bins)[:, np.newaxis]
        mask = bin_indices == bins
        distributions = np.zeros((self.n_bins, y.shape[1]))
        for bin_idx in range(self.n_bins):
            # Column-wise sum of one-hot rows gives per-class counts
            distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
    else:
        distributions, _ = ut.bincount(bin_indices.astype(np.int64))
        # To keep things consistent across different variable types, we
        # want to return a 2d array where the first dim represent different
        # bins, and the second the distributions.
        distributions = distributions[:, np.newaxis]

    return distributions
def __call__(self, data, ret=Value):
    """Predict on `data` and return values, probabilities, or both.

    Parameters
    ----------
    data : np.ndarray, csr_matrix, Table, Instance, list or tuple
        Data to predict on; non-Table inputs are converted/transformed
        into the model's domain first.
    ret : int
        Model.Value, Model.Probs or Model.ValueProbs.

    Raises
    ------
    ValueError
        If `ret` is out of range, or distributions are requested for a
        continuous target.
    DomainTransformationError
        If transforming into the model's domain produces no defined values.
    TypeError
        For unrecognized `data` or an unexpected prediction shape.
    """
    def fix_dim(x):
        # Strip the artificial leading axis added for 1d / single-instance input
        return x[0] if one_d else x

    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    one_d = False
    if isinstance(data, np.ndarray):
        one_d = data.ndim == 1
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, (Table, Instance)):
        if isinstance(data, Instance):
            data = Table(data.domain, [data])
            one_d = True
        if data.domain != self.domain:
            # Only go through the original domain (which may trigger slow
            # imputation) when the attributes actually differ and there is
            # some defined data to transform
            if self.original_domain.attributes != data.domain.attributes \
                    and data.X.size \
                    and not np.isnan(data.X).all():
                data = data.transform(self.original_domain)
                if np.isnan(data.X).all():
                    raise DomainTransformationError(
                        "domain transformation produced no defined values")
            data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    elif isinstance(data, (list, tuple)):
        if not isinstance(data[0], (list, tuple)):
            data = [data]
            one_d = True
        data = Table.from_list(self.original_domain, data)
        data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')"
                        .format(type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the dimension was previously passed as a second argument
        # to TypeError, so the "%i" placeholder was never interpolated
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            for i in range(len(self.domain.class_vars)):
                probs[:, i, :] = one_hot(value[:, i])
        else:
            probs = one_hot(value)
        if ret == Model.ValueProbs:
            return fix_dim(value), fix_dim(probs)
        else:
            return fix_dim(probs)

    # Return what we need to
    if ret == Model.Probs:
        return fix_dim(probs)
    if isinstance(data, Instance) and not multitarget:
        value = Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return fix_dim(value)
    else:  # ret == Model.ValueProbs
        return fix_dim(value), fix_dim(probs)
def __call__(self, data, ret=Value):
    """Predict on `data` and return values, probabilities, or both.

    Parameters
    ----------
    data : np.ndarray, csr_matrix, Table, Instance, list or tuple
        Data to predict on; non-Table inputs are converted/transformed
        into the model's domain first.
    ret : int
        Model.Value, Model.Probs or Model.ValueProbs.

    Raises
    ------
    ValueError
        If `ret` is out of range, or distributions are requested for a
        continuous target.
    TypeError
        For unrecognized `data` or an unexpected prediction shape.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, (Table, Instance)):
        if isinstance(data, Instance):
            data = Table(data.domain, [data])
        if data.domain != self.domain:
            data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    elif isinstance(data, (list, tuple)):
        if not isinstance(data[0], (list, tuple)):
            data = [data]
        data = Table(self.original_domain, data)
        data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')".format(
            type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the dimension was previously passed as a second argument
        # to TypeError, so the "%i" placeholder was never interpolated
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card, ), float)
            # (the class variable itself is not needed, only its index)
            for i in range(len(self.domain.class_vars)):
                probs[:, i, :] = one_hot(value[:, i])
        else:
            probs = one_hot(value)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Instance) and not multitarget:
        value = Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs
def __call__(self, data, ret=Value):
    """Predict on `data` and return values, probabilities, or both.

    Parameters
    ----------
    data : np.ndarray, csr_matrix, Table, Instance, list or tuple
        Data to predict on; non-Table inputs are converted/transformed
        into the model's domain first.
    ret : int
        Model.Value, Model.Probs or Model.ValueProbs.

    Raises
    ------
    ValueError
        If `ret` is out of range, or distributions are requested for a
        continuous target.
    TypeError
        For unrecognized `data` or an unexpected prediction shape.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, Instance):
        if data.domain != self.domain:
            data = Instance(self.domain, data)
        data = Table(data.domain, [data])
        prediction = self.predict_storage(data)
    elif isinstance(data, Table):
        if data.domain != self.domain:
            data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    elif isinstance(data, (list, tuple)):
        if not isinstance(data[0], (list, tuple)):
            data = [data]
        data = Table(self.original_domain, data)
        data = data.transform(self.domain)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')"
                        .format(type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the dimension was previously passed as a second argument
        # to TypeError, so the "%i" placeholder was never interpolated
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            # (the class variable itself is not needed, only its index)
            for i in range(len(self.domain.class_vars)):
                probs[:, i, :] = one_hot(value[:, i])
        else:
            probs = one_hot(value)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Instance) and not multitarget:
        value = Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs
def test_one_hot(self):
    """Integer labels map to rows of an identity-like int matrix."""
    expected = [[1, 0, 0],
                [0, 1, 0],
                [0, 0, 1],
                [0, 1, 0]]
    np.testing.assert_equal(one_hot([0, 1, 2, 1], int), expected)
def test_one_hot(self):
    """Integer labels map to one-hot rows; empty input yields (0, 0)."""
    expected = [[1, 0, 0],
                [0, 1, 0],
                [0, 0, 1],
                [0, 1, 0]]
    np.testing.assert_equal(one_hot([0, 1, 2, 1], int), expected)
    # Degenerate case: no values at all
    np.testing.assert_equal(one_hot([], int), np.zeros((0, 0), dtype=int))
def test_dim_too_low(self):
    """Requesting fewer columns than distinct values raises ValueError."""
    self.assertRaises(ValueError, one_hot, self.values, dim=2)
def test_dtype(self):
    """The result dtype defaults to float and follows `dtype`."""
    self.assertEqual(one_hot(self.values).dtype, float)
    self.assertEqual(one_hot(self.values, dtype=int).dtype, int)