def _compute_distributions(self, columns=None):
    """Compute value distributions for the given columns.

    Parameters
    ----------
    columns : iterable of variables/indices or None
        Columns to compute distributions for; None means all variables
        in the domain.

    Returns
    -------
    list of (distribution, unknowns) pairs, one per requested column.
    """
    def _get_matrix(M, cachedM, col):
        # Return (column data, weights or None, cached CSC matrix or None).
        nonlocal single_column
        if not sp.issparse(M):
            return M[:, col], self.W if self.has_weights() else None, None
        if cachedM is None:
            if single_column:
                # BUG FIX: warnings.warn takes (message, category); the
                # original call had the two arguments swapped, which raises
                # TypeError ("category must be a Warning subclass").
                warn("computing distributions on sparse data "
                     "for a single column is inefficient", ResourceWarning)
            # Convert once to CSC so per-column slicing is cheap.
            cachedM = sp.csc_matrix(self.X)
        data = cachedM.data[cachedM.indptr[col]:cachedM.indptr[col + 1]]
        if self.has_weights():
            weights = self.W[
                cachedM.indices[cachedM.indptr[col]:cachedM.indptr[col + 1]]]
        else:
            weights = None
        return data, weights, cachedM

    if columns is None:
        columns = range(len(self.domain.variables))
        single_column = False
    else:
        columns = [self.domain.index(var) for var in columns]
        single_column = len(columns) == 1 and len(self.domain) > 1
    distributions = []
    Xcsc = Ycsc = None
    for col in columns:
        var = self.domain[col]
        # Columns beyond X belong to the class array Y.
        if col < self.X.shape[1]:
            m, W, Xcsc = _get_matrix(self.X, Xcsc, col)
        else:
            m, W, Ycsc = _get_matrix(self.Y, Ycsc, col - self.X.shape[1])
        if isinstance(var, DiscreteVariable):
            if W is not None:
                W = W.ravel()
            dist, unknowns = bn.bincount(m, len(var.values) - 1, W)
        elif not len(m):
            dist, unknowns = np.zeros((2, 0)), 0
        else:
            if W is not None:
                ranks = np.argsort(m)
                vals = np.vstack((m[ranks], W[ranks]))
                unknowns = bn.countnans(m, W)
            else:
                # Second row of ones acts as unit weights for valuecount.
                vals = np.ones((2, m.shape[0]))
                vals[0, :] = m
                vals[0, :].sort()
                unknowns = bn.countnans(m)
            dist = np.array(_valuecount.valuecount(vals))
        distributions.append((dist, unknowns))
    return distributions
def __determine_density(data):
    """Classify *data*'s storage density.

    Returns Storage.Missing for None, Storage.SPARSE_BOOL for a sparse
    matrix whose stored values are all 1, Storage.SPARSE for other sparse
    matrices, and Storage.DENSE otherwise.
    """
    if data is None:
        return Storage.Missing
    # BUG FIX (cleanup): the original re-tested `data is not None` here,
    # which is always true after the early return above; it also bound the
    # caught exception to an unused name.
    if sp.issparse(data):
        try:
            # bincount over {0, 1}: a zero count in bin 0 means every
            # stored value equals 1, i.e. the matrix is boolean.
            if bn.bincount(data.data, 1)[0][0] == 0:
                return Storage.SPARSE_BOOL
        except ValueError:
            # Values outside bincount's supported range: treat as plain
            # sparse rather than failing (best-effort classification).
            pass
        return Storage.SPARSE
    return Storage.DENSE
def _compute_contingency(self, col_vars=None, row_var=None):
    """Compute contingency tables of the given columns against a row variable.

    Parameters
    ----------
    col_vars : iterable of variables/indices or None
        Columns for which contingencies are computed; None means all
        variables in the domain.
    row_var : variable or None
        The (discrete) row variable; defaults to the domain's class_var.

    Returns
    -------
    list with one entry per column: for discrete columns, the result of
    bn.contingency; for continuous columns, a (list of per-class value
    distributions, per-class nan counts) pair.

    Raises
    ------
    ValueError   if there is no row variable, a column is neither discrete
                 nor continuous, or row data contains missing values while
                 continuous columns are requested.
    TypeError    if the row variable is not discrete.
    """
    n_atts = self.X.shape[1]
    if col_vars is None:
        col_vars = range(len(self.domain.variables))
    else:
        col_vars = [self.domain.index(var) for var in col_vars]
    if row_var is None:
        row_var = self.domain.class_var
        if row_var is None:
            raise ValueError("No row variable")

    row_desc = self.domain[row_var]
    if not isinstance(row_desc, DiscreteVariable):
        raise TypeError("Row variable must be discrete")
    row_indi = self.domain.index(row_var)
    n_rows = len(row_desc.values)
    # Negative indices address meta attributes; indices >= n_atts address Y.
    if 0 <= row_indi < n_atts:
        row_data = self.X[:, row_indi]
    elif row_indi < 0:
        row_data = self.metas[:, -1 - row_indi]
    else:
        row_data = self.Y[:, row_indi - n_atts]

    W = self.W if self.has_weights() else None

    col_desc = [self.domain[var] for var in col_vars]
    col_indi = [self.domain.index(var) for var in col_vars]

    if any(not isinstance(var, (ContinuousVariable, DiscreteVariable))
           for var in col_desc):
        raise ValueError("contingency can be computed only for discrete "
                         "and continuous values")

    if any(isinstance(var, ContinuousVariable) for var in col_desc):
        # Pre-sort rows by class so each class forms a contiguous group.
        dep_indices = np.argsort(row_data)
        dep_sizes, nans = bn.bincount(row_data, n_rows - 1)
        dep_sizes = dep_sizes.astype(int, copy=False)
        if nans:
            raise ValueError("cannot compute contigencies with missing "
                             "row data")
    else:
        dep_indices = dep_sizes = None

    contingencies = [None] * len(col_desc)
    # Process columns from X, Y, and metas with per-array index mapping.
    for arr, f_cond, f_ind in (
            (self.X, lambda i: 0 <= i < n_atts, lambda i: i),
            (self.Y, lambda i: i >= n_atts, lambda i: i - n_atts),
            (self.metas, lambda i: i < 0, lambda i: -1 - i)):
        arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]
        vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
        disc_vars = [v for v in vars if isinstance(v[2], DiscreteVariable)]
        if disc_vars:
            if sp.issparse(arr):
                # Sparse: one masked contingency call for all discrete cols.
                max_vals = max(len(v[2].values) for v in disc_vars)
                disc_indi = {i for _, i, _ in disc_vars}
                mask = [i in disc_indi for i in range(arr.shape[1])]
                conts, nans = bn.contingency(
                    arr, row_data, max_vals - 1, n_rows - 1, W, mask)
                for col_i, arr_i, _ in disc_vars:
                    contingencies[col_i] = (conts[arr_i], nans[arr_i])
            else:
                for col_i, arr_i, var in disc_vars:
                    contingencies[col_i] = bn.contingency(
                        arr[:, arr_i], row_data,
                        len(var.values) - 1, n_rows - 1, W)
        cont_vars = [v for v in vars if isinstance(v[2], ContinuousVariable)]
        if cont_vars:
            for col_i, _, _ in cont_vars:
                contingencies[col_i] = ([], np.empty(n_rows))
            fr = 0
            for clsi, cs in enumerate(dep_sizes):
                to = fr + cs
                grp_rows = dep_indices[fr:to]
                grp_data = arr[grp_rows, :]
                # BUG FIX: the original `W and W[grp_rows]` evaluated the
                # truth value of a numpy array when weights are present,
                # which raises "truth value of an array ... is ambiguous".
                grp_W = W[grp_rows] if W is not None else None
                if sp.issparse(grp_data):
                    grp_data = sp.csc_matrix(grp_data)
                for col_i, arr_i, _ in cont_vars:
                    if sp.issparse(grp_data):
                        col_data = grp_data.data[
                            grp_data.indptr[arr_i]:grp_data.indptr[arr_i + 1]]
                    else:
                        col_data = grp_data[:, arr_i]
                    if W is not None:
                        ranks = np.argsort(col_data)
                        vals = np.vstack((col_data[ranks], grp_W[ranks]))
                        nans = bn.countnans(col_data, grp_W)
                    else:
                        col_data.sort()
                        # Second row of ones acts as unit weights.
                        vals = np.ones((2, len(col_data)))
                        vals[0, :] = col_data
                        nans = bn.countnans(col_data)
                    dist = np.array(_valuecount.valuecount(vals))
                    contingencies[col_i][0].append(dist)
                    contingencies[col_i][1][clsi] = nans
                fr = to
    return contingencies
def __call__(self, data, ret=Value):
    """Predict values and/or probabilities for *data*.

    Parameters
    ----------
    data : ndarray, csr_matrix, Instance, or Table
        Data to predict on; Instances/Tables are converted to this
        model's domain when needed.
    ret : int
        Model.Value, Model.Probs, or Model.ValueProbs.

    Raises
    ------
    ValueError  for an invalid `ret` or when distributions are requested
                for continuous class variables.
    TypeError   for unsupported data types or malformed predictor output.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if (ret > 0
            and any(isinstance(v, Orange_data.ContinuousVariable)
                    for v in self.domain.class_vars)):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, Orange_data.Instance):
        if data.domain != self.domain:
            data = Orange_data.Instance(self.domain, data)
        prediction = self.predict(np.atleast_2d(data.x))
    elif isinstance(data, Orange_data.Table):
        if data.domain != self.domain:
            data = Orange_data.Table.from_table(self.domain, data)
        prediction = self.predict(data.X)
    else:
        # BUG FIX: the type name was passed as an extra TypeError argument
        # instead of being formatted into the message.
        raise TypeError("Unrecognized argument (instance of '%s')"
                        % type(data).__name__)

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: same unformatted-message problem as above.
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            for i, cvar in enumerate(self.domain.class_vars):
                probs[:, i, :], _ = bn.bincount(
                    np.atleast_2d(value[:, i]), max_card - 1)
        else:
            probs, _ = bn.bincount(
                np.atleast_2d(value),
                len(self.domain.class_var.values) - 1)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Expand probability predictions for class values which are not present
    if ret != self.Value:
        n_class = len(self.domain.class_vars)
        used_vals = [np.unique(y) for y in self.Y.T]
        max_values = max(len(cv.values) for cv in self.domain.class_vars)
        if max_values != probs.shape[-1]:
            if not self.supports_multiclass:
                probs = probs[:, np.newaxis, :]
            probs_ext = np.zeros((len(probs), n_class, max_values))
            for c in range(n_class):
                i = 0
                class_values = len(self.domain.class_vars[c].values)
                for cv in range(class_values):
                    # Copy a column only for class values seen in training.
                    if i < len(used_vals[c]) and cv == used_vals[c][i]:
                        probs_ext[:, c, cv] = probs[:, c, i]
                        i += 1
            if self.supports_multiclass:
                probs = probs_ext
            else:
                probs = probs_ext[:, 0, :]

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Orange_data.Instance) and not multitarget:
        value = Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs
def __call__(self, data, ret=Value):
    """Predict values and/or probabilities for *data*.

    Parameters
    ----------
    data : ndarray, csr_matrix, Instance, or Table
        Data to predict on; Instances/Tables are converted to this
        model's domain when needed and routed through predict_storage.
    ret : int
        Model.Value, Model.Probs, or Model.ValueProbs.

    Raises
    ------
    ValueError  for an invalid `ret` or when distributions are requested
                for continuous class variables.
    TypeError   for unsupported data types or malformed predictor output.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if (ret > 0
            and any(isinstance(v, Orange.data.ContinuousVariable)
                    for v in self.domain.class_vars)):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, Orange.data.Instance):
        if data.domain != self.domain:
            data = Orange.data.Instance(self.domain, data)
        prediction = self.predict_storage(data)
    elif isinstance(data, Orange.data.Table):
        if data.domain != self.domain:
            data = data.from_table(self.domain, data)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')".format(
            type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the dimension was passed as an extra TypeError argument
        # instead of being formatted into the message.
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            for i, cvar in enumerate(self.domain.class_vars):
                probs[:, i, :], _ = bn.bincount(
                    np.atleast_2d(value[:, i]), max_card - 1)
        else:
            probs, _ = bn.bincount(
                np.atleast_2d(value),
                len(self.domain.class_var.values) - 1)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Orange.data.Instance) and not multitarget:
        value = Orange.data.Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs
def __call__(self, data, ret=Value):
    """Predict values and/or probabilities for *data*.

    Parameters
    ----------
    data : ndarray, csr_matrix, Instance, or Table
        Data to predict on; Instances/Tables are converted to this
        model's domain when needed and routed through predict_storage.
    ret : int
        Model.Value, Model.Probs, or Model.ValueProbs.

    Raises
    ------
    ValueError  for an invalid `ret` or when distributions are requested
                for continuous class variables.
    TypeError   for unsupported data types or malformed predictor output.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if (ret > 0
            and any(isinstance(v, Orange.data.ContinuousVariable)
                    for v in self.domain.class_vars)):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, Orange.data.Instance):
        if data.domain != self.domain:
            data = Orange.data.Instance(self.domain, data)
        prediction = self.predict_storage(data)
    elif isinstance(data, Orange.data.Table):
        if data.domain != self.domain:
            data = data.from_table(self.domain, data)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')".format(
            type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the dimension was passed as an extra TypeError argument
        # instead of being formatted into the message.
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card, ), float)
            for i, cvar in enumerate(self.domain.class_vars):
                probs[:, i, :], _ = bn.bincount(
                    np.atleast_2d(value[:, i]), max_card - 1)
        else:
            probs, _ = bn.bincount(
                np.atleast_2d(value),
                len(self.domain.class_var.values) - 1)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Orange.data.Instance) and not multitarget:
        value = Orange.data.Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs