def get_distributions_for_columns(data, columns):
    """Return one distribution for each requested column of `data`.

    Parameters
    ----------
    data : data.Table
        Data whose `domain` is used to resolve the requested columns.
    columns : list
        Columns to process. Each entry is either an :class:`int` index
        into `data.domain` or an instance of `Orange.data.Variable`.

    Returns
    -------
    list
        The results of `get_distribution`, in the order of `columns`.
    """
    domain = data.domain

    # Normalize every entry to an int index into the domain.
    indices = []
    for column in columns:
        if not isinstance(column, int):
            column = domain.index(column)
        indices.append(column)

    try:
        # Optimized path: ask the table/storage for all columns at once.
        computed = data._compute_distributions(indices)
    except NotImplementedError:
        # No bulk implementation available; use the slow(er) default,
        # one column at a time.
        return [get_distribution(data, index) for index in indices]

    # `computed` is a list of (values, unknowns) pairs, one per column.
    results = []
    for index, (values, unknowns) in zip(indices, computed):
        results.append(get_distribution(values, domain[index], unknowns))
    return results
def from_data(cls, variable, data):
    """Create the distribution of `variable` over `data`.

    The storage's own `_compute_distributions` is tried first. If it
    raises `NotImplementedError`, the distribution is assembled here: the
    column's values are paired with their weights (`data.W`, or 1 for
    every row when the data has no weights), ordered by value and handed
    to `_orange.valuecount`.

    Note the parameter order: `variable` comes before `data`.

    Parameters
    ----------
    variable
        Anything `_get_variable(data, variable)` accepts.
    data
        Object providing `_compute_distributions`, `has_weights`, `W`
        and column selection via `data[:, variable]`.

    Returns
    -------
    A new instance of `cls` shaped like the computed distribution, with
    the `unknowns` and `variable` attributes set.
    """
    variable = _get_variable(data, variable)
    try:
        dist, unknowns = data._compute_distributions([variable])[0]
    except NotImplementedError:
        col = data[:, variable]
        dtype = col.dtype
        if data.has_weights():
            weights = np.asarray(data.W)
            # Float weights must not be truncated into a non-float column
            # dtype. The original check compared the column's dtype with
            # itself and therefore never promoted anything.
            if "float" not in dtype.name and "float" in weights.dtype.name:
                dtype = weights.dtype
            dist = np.empty((2, len(col)), dtype=dtype)
            dist[0, :] = col
            dist[1, :] = weights
        else:
            dist = np.ones((2, len(col)), dtype=dtype)
            dist[0, :] = col
        # Order the columns by value while keeping every value paired with
        # its weight. `dist.sort(axis=0)` would instead sort the two
        # entries *within* each column and mix values with weights.
        # Assigning into `dist[:]` keeps the array's original memory layout.
        order = np.argsort(dist[0])
        dist[:] = dist[:, order]
        dist = np.array(_orange.valuecount(dist))
        # NOTE(review): this is "rows minus columns returned by
        # valuecount"; whether that equals the number of unknown values
        # depends on valuecount's output -- TODO confirm.
        unknowns = len(col) - dist.shape[1]
    self = super().__new__(cls, dist.shape)
    self[:] = dist
    self.unknowns = unknowns
    self.variable = variable
    return self
def from_data(cls, data, variable):
    """Create the distribution of `variable` over `data`.

    The storage's `_compute_distributions` is used when available. If it
    raises `NotImplementedError`, the instances are tallied one by one
    into an array with one slot per entry of `variable.values`; values
    that are NaN are accumulated in `unknowns` instead.

    Parameters
    ----------
    data
        Iterable of instances providing `_compute_distributions`,
        `has_weights` and `W`.
    variable
        Anything `_get_variable(data, variable)` accepts.

    Returns
    -------
    A new instance of `cls` with the `unknowns` and `variable`
    attributes set.
    """
    variable = _get_variable(data, variable)
    try:
        # Preferred route: the storage computes the counts itself.
        counts, missing = data._compute_distributions([variable])[0]
        self = super().__new__(cls, len(counts))
        self[:] = counts
        self.unknowns = missing
    except NotImplementedError:
        # Fallback: start from all-zero counts and tally every instance.
        n_values = len(variable.values)
        self = super().__new__(cls, n_values)
        self[:] = np.zeros(n_values)
        self.unknowns = 0
        if not data.has_weights():
            for row in data:
                value = row[variable]
                if value == value:  # false for NaN, i.e. a missing value
                    self[int(value)] += 1
                else:
                    self.unknowns += 1
        else:
            # Each instance contributes its weight rather than 1.
            for row, weight in zip(data, data.W):
                value = row[variable]
                if np.isnan(value):
                    self.unknowns += weight
                else:
                    self[int(value)] += weight
    self.variable = variable
    return self
def from_data(cls, data, variable):
    """Create the distribution of `variable` over `data`.

    The storage's `_compute_distributions` is used when available. If it
    raises `NotImplementedError`, the instances are tallied one by one
    into an array with one slot per entry of `variable.values`; values
    that are NaN are accumulated in `unknowns` instead.

    Parameters
    ----------
    data
        Iterable of instances providing `_compute_distributions`,
        `has_weights` and `W`.
    variable
        Anything `_get_variable(data, variable)` accepts.

    Returns
    -------
    A new instance of `cls` with the `unknowns` and `variable`
    attributes set.
    """
    variable = _get_variable(data, variable)
    try:
        dist, unknowns = data._compute_distributions([variable])[0]
        self = super().__new__(cls, len(dist))
        self[:] = dist
        self.unknowns = unknowns
    except NotImplementedError:
        n_values = len(variable.values)
        # Allocate through the class, exactly like the fast path does. A
        # bare `np.zeros(...)` is a plain ndarray: it rejects the
        # attribute assignments below and is not an instance of `cls`.
        self = super().__new__(cls, n_values)
        self[:] = np.zeros(n_values)
        self.unknowns = 0
        if data.has_weights():
            # Walk the instances themselves (as the unweighted branch
            # does) and read the value of `variable` from each one.
            for inst, w in zip(data, data.W):
                val = inst[variable]
                if not np.isnan(val):
                    # Values are used as positions, so cast them to int.
                    self[int(val)] += w
                else:
                    self.unknowns += w
        else:
            for inst in data:
                val = inst[variable]
                if val == val:  # false for NaN, i.e. a missing value
                    self[int(val)] += 1
                else:
                    self.unknowns += 1
    self.variable = variable
    return self
def from_data(cls, variable, data):
    """Create the distribution of `variable` over `data`.

    The storage's own `_compute_distributions` is tried first. If it
    raises `NotImplementedError`, the distribution is assembled here: the
    column's values are paired with their weights (`data.W`, or 1 for
    every row when the data has no weights), ordered by value and handed
    to `_orange.valuecount`.

    Note the parameter order: `variable` comes before `data`.

    Parameters
    ----------
    variable
        Anything `_get_variable(data, variable)` accepts.
    data
        Object providing `_compute_distributions`, `has_weights`, `W`
        and column selection via `data[:, variable]`.

    Returns
    -------
    A new instance of `cls` shaped like the computed distribution, with
    the `unknowns` and `variable` attributes set.
    """
    variable = _get_variable(data, variable)
    try:
        dist, unknowns = data._compute_distributions([variable])[0]
    except NotImplementedError:
        col = data[:, variable]
        dtype = col.dtype
        if data.has_weights():
            weights = np.asarray(data.W)
            # Float weights must not be truncated into a non-float column
            # dtype. The original check compared the column's dtype with
            # itself and therefore never promoted anything.
            if "float" not in dtype.name and "float" in weights.dtype.name:
                dtype = weights.dtype
            dist = np.empty((2, len(col)), dtype=dtype)
            dist[0, :] = col
            dist[1, :] = weights
        else:
            dist = np.ones((2, len(col)), dtype=dtype)
            dist[0, :] = col
        # Order the columns by value while keeping every value paired with
        # its weight. `dist.sort(axis=0)` would instead sort the two
        # entries *within* each column and mix values with weights.
        # Assigning into `dist[:]` keeps the array's original memory layout.
        order = np.argsort(dist[0])
        dist[:] = dist[:, order]
        dist = np.array(_orange.valuecount(dist))
        # NOTE(review): this is "rows minus columns returned by
        # valuecount"; whether that equals the number of unknown values
        # depends on valuecount's output -- TODO confirm.
        unknowns = len(col) - dist.shape[1]
    self = super().__new__(cls, dist.shape)
    self[:] = dist
    self.unknowns = unknowns
    self.variable = variable
    return self
def get_distributions_for_columns(data, columns):
    """Return one distribution for each requested column of `data`.

    Parameters
    ----------
    data : data.Table
        Data whose `domain` is used to resolve the requested columns.
    columns : list
        Columns to process. Each entry is either an :class:`int` index
        into `data.domain` or an instance of `Orange.data.Variable`.

    Returns
    -------
    list
        The results of `get_distribution`, in the order of `columns`.
    """
    domain = data.domain

    def as_index(column):
        # Ints are already domain indices; anything else is looked up.
        return column if isinstance(column, int) else domain.index(column)

    positions = list(map(as_index, columns))

    try:
        # Optimized path: query the table/storage directly.
        pairs = data._compute_distributions(positions)
    except NotImplementedError:
        # Fall back to the default slow(er) per-column implementation.
        return [get_distribution(data, position) for position in positions]

    # `pairs` is a list of (values, unknowns) tuples, one per column.
    return [
        get_distribution(values, domain[position], unknowns)
        for position, (values, unknowns) in zip(positions, pairs)
    ]