def compute_box_data(self):
    """Recompute per-group box-plot statistics for the chosen attribute.

    Fills ``self.dist``/``self.conts`` (distribution or contingency,
    depending on whether a grouping variable is set), ``self.stats``
    (``BoxData`` per group, continuous attributes only) and the label
    lists, dropping groups that contain no data.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    if dataset is None:
        self.stats = self.dist = self.conts = []
        return
    self.is_continuous = attr.is_continuous
    if self.group_var:
        self.dist = []
        self.conts = datacaching.getCached(
            dataset, contingency.get_contingency,
            (dataset, attr, self.group_var))
        if self.is_continuous:
            self.stats = [BoxData(cont) for cont in self.conts]
        self.label_txts_all = self.group_var.values
    else:
        self.dist = datacaching.getCached(
            dataset, distribution.get_distribution, (dataset, attr))
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist)]
        self.label_txts_all = [""]
    # Filter labels and stats together so they stay aligned.
    self.label_txts = [label
                       for box, label in zip(self.stats, self.label_txts_all)
                       if box.n > 0]
    self.stats = [box for box in self.stats if box.n > 0]
def compute_box_data(self):
    """Build distributions/contingencies and box statistics for the
    attribute selected in ``self.attributes_select``, optionally grouped
    by the variable selected in ``self.grouping_select``.
    """
    dataset = self.dataset
    if dataset is None:
        self.stats = self.dist = self.conts = []
        return
    attr = self.attributes[self.attributes_select[0]][0]
    attr = dataset.domain[attr]
    self.is_continuous = attr.is_continuous
    group_by = self.grouping_select[0]
    if group_by:
        group = self.grouping[group_by][0]
        self.dist = []
        self.conts = datacaching.getCached(
            dataset, contingency.get_contingency, (dataset, attr, group))
        if self.is_continuous:
            self.stats = [BoxData(cont) for cont in self.conts]
        self.label_txts_all = dataset.domain[group].values
    else:
        self.dist = datacaching.getCached(
            dataset, distribution.get_distribution, (dataset, attr))
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist)]
        self.label_txts_all = [""]
    # Drop empty groups; keep labels in lockstep with the kept stats.
    self.label_txts = [label
                       for box, label in zip(self.stats, self.label_txts_all)
                       if box.N > 0]
    self.stats = [box for box in self.stats if box.N > 0]
def compute_box_data(self):
    """Recompute box-plot data (distributions/contingencies and stats)
    for the currently selected attribute and grouping.
    """
    dataset = self.dataset
    if dataset is None:
        self.stats = self.dist = self.conts = []
        return
    attr_name = self.attributes[self.attributes_select[0]][0]
    attr = dataset.domain[attr_name]
    self.is_continuous = attr.is_continuous
    group_by = self.grouping_select[0]
    if not group_by:
        # No grouping: one distribution, one (optional) box.
        self.dist = datacaching.getCached(
            dataset, distribution.get_distribution, (dataset, attr))
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist)]
        self.label_txts_all = [""]
    else:
        group = self.grouping[group_by][0]
        self.dist = []
        self.conts = datacaching.getCached(
            dataset, contingency.get_contingency, (dataset, attr, group))
        if self.is_continuous:
            self.stats = [BoxData(cont) for cont in self.conts]
        self.label_txts_all = dataset.domain[group].values
    # Keep only non-empty groups, labels and stats filtered together.
    pairs = [(box, label)
             for box, label in zip(self.stats, self.label_txts_all)
             if box.N > 0]
    self.label_txts = [label for _, label in pairs]
    self.stats = [box for box, _ in pairs]
def compute_box_data(self):
    """Compute the data needed to draw the box plot: distribution or
    contingency for ``self.attribute`` (grouped by ``self.group_var`` if
    set), ``BoxData`` statistics for continuous attributes, and labels
    for the non-empty groups.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    if dataset is None:
        self.stats = self.dist = self.conts = []
        return
    self.is_continuous = attr.is_continuous
    if not self.group_var:
        self.dist = datacaching.getCached(
            dataset, distribution.get_distribution, (dataset, attr))
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist)]
        self.label_txts_all = [""]
    else:
        self.dist = []
        self.conts = datacaching.getCached(
            dataset, contingency.get_contingency,
            (dataset, attr, self.group_var))
        if self.is_continuous:
            self.stats = [BoxData(cont) for cont in self.conts]
        self.label_txts_all = self.group_var.values
    # Remove empty groups while keeping labels aligned with stats.
    kept = [(box, label)
            for box, label in zip(self.stats, self.label_txts_all)
            if box.n > 0]
    self.label_txts = [label for _, label in kept]
    self.stats = [box for box, _ in kept]
def compute_box_data(self):
    """Recompute box-plot statistics for the selected attribute/grouping.

    BUG FIX: the original filtered ``self.stats`` by ``N > 0`` but left
    ``self.label_txts`` untouched, so whenever an empty group was
    dropped, every subsequent box was drawn with the wrong label.  The
    labels are now filtered in lockstep with the stats (the same scheme
    the zip-based siblings of this function use).
    """
    dataset = self.ddataset
    if dataset is None:
        self.stats = self.dist = self.conts = []
        return
    attr_ind = self.attributes_select[0]
    attr = dataset.domain[attr_ind]
    self.is_continuous = isinstance(attr, ContinuousVariable)
    group_by = self.grouping_select[0]
    if group_by:
        group_attr = self.grouping[group_by][0]
        group_ind = dataset.domain.index(group_attr)
        self.dist = []
        self.conts = datacaching.getCached(
            dataset, contingency.get_contingency,
            (dataset, attr_ind, group_ind))
        if self.is_continuous:
            self.stats = [BoxData(cont) for cont in self.conts]
        self.label_txts = dataset.domain[group_ind].values
    else:
        self.dist = datacaching.getCached(
            dataset, distribution.get_distribution, (dataset, attr_ind))
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist)]
        self.label_txts = [""]
    # Drop empty groups from both lists together so labels stay aligned.
    self.label_txts = [label
                       for stat, label in zip(self.stats, self.label_txts)
                       if stat.N > 0]
    self.stats = [stat for stat in self.stats if stat.N > 0]
def _compute_scaled_data(self):
    """Scale every column of ``self.data`` into [0, 1].

    The (original, scaled, validity) triple is cached on the table so
    other visualization widgets can reuse it instead of recomputing.
    """
    data = self.data
    cached = getCached(data, "visualizationData")
    if cached:
        self.original_data, self.scaled_data, self.valid_data_array = cached
        return
    Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
    self.original_data = np.hstack((data.X, Y)).T
    self.scaled_data = no_jit = self.original_data.copy()
    self.valid_data_array = ~np.isnan(no_jit)
    for idx, attr in enumerate(data.domain):
        column = no_jit[idx]  # row view into the transposed array
        if attr.is_discrete:
            # Map value i to the bin centre (2 * i + 1) / (2 * n).
            column *= 2
            column += 1
            column /= 2 * len(attr.values)
        else:
            stats = self.domain_data_stat[idx]
            column -= stats.min
            if stats.max != stats.min:
                column /= stats.max - stats.min
    setCached(data, "visualizationData",
              (self.original_data, self.scaled_data, self.valid_data_array))
def _compute_scaled_data(self):
    """Scale attributes, class vars and metas of ``self.data`` to [0, 1].

    Results are cached on the table ("visualizationData") so that other
    widgets showing the same data can share them.
    """
    data = self.data
    cached = getCached(data, "visualizationData")
    if cached:
        self.data, self.scaled_data, self.valid_data_array = cached
        return
    Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
    stacked = np.hstack((data.X, Y, data.metas)).T
    self.scaled_data = self.data.copy()
    self.valid_data_array = np.isfinite(stacked)
    domain = self.domain
    for attr in chain(domain.attributes, domain.class_vars, domain.metas):
        col = self.scaled_data.get_column_view(attr)[0]
        if attr.is_discrete:
            # Shift value i to the bin centre (i + 0.5) / n.
            col += 0.5
            col /= len(attr.values)
        else:
            stats = self.domain_data_stat[attr]
            col -= stats.min
            if stats.max != stats.min:
                col /= stats.max - stats.min
    setCached(data, "visualizationData",
              (self.data, self.scaled_data, self.valid_data_array))
def __compute_density(self, data):
    """Describe missing values / density of the X, Y and metas parts.

    Returns a list of three strings (one per part); returns the shared
    ``self.__no_missing`` sentinel when no part has missing values.
    """
    def describe(part, frm, to):
        nans = sum(dist[i].nans for i in range(frm, to))
        non_nans = sum(dist[i].non_nans for i in range(frm, to))
        total = nans + non_nans
        if total == 0:
            return ""
        density = getattr(data, part + "_density")()
        if density == Storage.DENSE:
            dp = "%.1f%%" % (100 * nans / total) if nans > 0 else "no"
            return " (%s missing values)" % dp
        s = " (sparse" if density == Storage.SPARSE else " (tags"
        return s + ", density %.2f %%)" % (100 * non_nans / total)

    dist = datacaching.getCached(
        data, basic_stats.DomainBasicStats, (data, True))
    domain = data.domain
    n_attrs = len(domain.attributes)
    spans = (("X", 0, n_attrs),
             ("Y", n_attrs, len(domain)),
             ("metas", len(domain), len(domain) + len(domain.metas)))
    descriptions = [describe(part, frm, to) for part, frm, to in spans]
    if all(not d or d == " (no missing values)" for d in descriptions):
        descriptions = self.__no_missing
    return descriptions
def _compute_scaled_data(self):
    """Normalize all columns of ``self.data`` into [0, 1], caching the
    (original, scaled, validity-mask) triple on the table for reuse by
    other widgets.
    """
    data = self.data
    cached = getCached(data, "visualizationData")
    if cached:
        self.original_data, self.scaled_data, self.valid_data_array = cached
        return
    Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
    self.original_data = np.hstack((data.X, Y)).T
    self.scaled_data = no_jit = self.original_data.copy()
    self.valid_data_array = ~np.isnan(no_jit)
    n_vars = len(data.domain)
    for index in range(n_vars):
        attr = data.domain[index]
        if not attr.is_discrete:
            stats = self.domain_data_stat[index]
            no_jit[index] -= stats.min
            span = stats.max - stats.min
            if span != 0:
                no_jit[index] /= span
        else:
            # Discrete value i becomes (2 * i + 1) / (2 * n_values).
            no_jit[index] *= 2
            no_jit[index] += 1
            no_jit[index] /= 2 * len(attr.values)
    setCached(data, "visualizationData",
              (self.original_data, self.scaled_data, self.valid_data_array))
def data(self, index, role):
    """Qt item-model data() for the table view.

    Dispatches on the column's part (attributes / class vars / metas):
    handles background colors, sparse-row display text, plain value
    display, distribution-bar ratios and the custom value/variable roles.

    BUG FIX: the metas branch paired ``example.sparse_metas`` with
    ``self.domain.class_vars`` (copy-paste from the Y branch), so sparse
    meta columns were labelled with class variables; it now uses
    ``self.domain.metas``.
    """
    row, col = self.sorted_map[index.row()], index.column()
    example = self.examples[row]

    if role == gui.TableClassValueRole:
        return example.get_class()

    # Check whether we have sparse columns and resolve the background
    # color role while we are at it.
    sp_data = attributes = None
    if col < self.n_attr_cols:
        if role == QtCore.Qt.BackgroundRole:
            return
        density = self.X_density
        if density != Storage.DENSE:
            sp_data, attributes = example.sparse_x, self.domain.attributes
    elif col < self.n_attr_class_cols:
        if role == QtCore.Qt.BackgroundRole:
            return self.cls_color
        density = self.Y_density
        if density != Storage.DENSE:
            sp_data, attributes = example.sparse_y, self.domain.class_vars
    else:
        if role == QtCore.Qt.BackgroundRole:
            return self.meta_color
        density = self.metas_density
        if density != Storage.DENSE:
            sp_data, attributes = example.sparse_metas, self.domain.metas

    if sp_data is not None:
        if role == QtCore.Qt.DisplayRole:
            if density == Storage.SPARSE:
                return ", ".join(
                    "{}={}".format(attributes[i].name,
                                   attributes[i].repr_val(v))
                    for i, v in zip(sp_data.indices, sp_data.data))
            else:
                return ", ".join(
                    attributes[i].name for i in sp_data.indices)
    else:  # not sparse
        attr = self.all_attrs[col]
        val = example[attr]
        if role == QtCore.Qt.DisplayRole:
            return str(val)
        elif (role == gui.TableBarItem.BarRole
              and isinstance(attr, ContinuousVariable)
              and not isnan(val)):
            if self.dist is None:
                self.dist = datacaching.getCached(
                    self.examples, basic_stats.DomainBasicStats,
                    (self.examples, True))
            dist = self.dist[col]
            # `or 1` guards against a zero-width range.
            return (val - dist.min) / (dist.max - dist.min or 1)
        elif role == gui.TableValueRole:
            return val
        elif role == gui.TableVariable:
            return val.variable

    return self._other_data.get((index.row(), index.column(), role), None)
def __compute_density(self, data):
    """Return missing-value/density descriptions for X, Y and metas.

    Falls back to the shared ``self.__no_missing`` sentinel when no part
    contains missing values.
    """
    def desc(part, frm, to):
        nans = non_nans = 0
        for i in range(frm, to):
            nans += dist[i].nans
            non_nans += dist[i].non_nans
        tot = nans + non_nans
        if tot == 0:
            return ""
        density = getattr(data, part + "_density")()
        if density == Storage.DENSE:
            dp = "%.1f%%" % (100 * nans / tot) if nans > 0 else "no"
            return " (%s missing values)" % dp
        s = " (sparse" if density == Storage.SPARSE else " (tags"
        return s + ", density %.2f %%)" % (100 * non_nans / tot)

    dist = datacaching.getCached(
        data, basic_stats.DomainBasicStats, (data, True))
    domain = data.domain
    descriptions = [desc(part, frm, to) for part, frm, to in [
        ("X", 0, len(domain.attributes)),
        ("Y", len(domain.attributes), len(domain)),
        ("metas", len(domain), len(domain) + len(domain.metas))]]
    if all(not d or d == " (no missing values)" for d in descriptions):
        descriptions = self.__no_missing
    return descriptions
def _compute_domain_data_stat(self):
    """Fetch cached basic statistics and record each variable's range
    (``[0, n_values]`` for discrete, ``[min, max]`` for continuous) in
    ``self.attr_values``.
    """
    stats = self.domain_data_stat = getCached(
        self.data, DomainBasicStats, (self.data,))
    for idx, attr in enumerate(self.domain):
        if attr.is_discrete:
            self.attr_values[attr] = [0, len(attr.values)]
        elif attr.is_continuous:
            self.attr_values[attr] = [stats[idx].min, stats[idx].max]
def _compute_domain_data_stat(self):
    """Fetch cached basic statistics (including metas) and record the
    value range of every variable in ``self.attr_values``.
    """
    stats = self.domain_data_stat = getCached(
        self.data, DomainBasicStats, (self.data, True))
    domain = self.domain
    for attr in chain(domain.variables, domain.metas):
        if attr.is_continuous:
            self.attr_values[attr] = [stats[attr].min, stats[attr].max]
        elif attr.is_discrete:
            self.attr_values[attr] = [0, len(attr.values)]
def set_info(self, data):
    """Update the info labels (instances, features, metas, class)."""
    def sp(n):
        # (count-string, plural-suffix) pair for "%s thing%s" templates.
        if n == 0:
            return "No", "s"
        elif n == 1:
            return str(n), ''
        else:
            return str(n), 's'

    if data is None:
        self.info_ex.setText('No data on input.')
        self.info_attr.setText('')
        self.info_meta.setText('')
        self.info_class.setText('')
        return

    if isinstance(data, SqlTable):
        descriptions = ['', '', '']
    else:
        descriptions = datacaching.getCached(
            data, self.__compute_density, (data, ))
    out_i = "~%s instance%s" % sp(data.approx_len())
    if descriptions is self.__no_missing:
        out_i += " (no missing values)"
    self.info_ex.setText(out_i)

    def update_num_inst():
        # len() may be slow (e.g. SQL count); recompute off the main thread.
        out_i = "%s instance%s" % sp(len(data))
        if descriptions is self.__no_missing:
            out_i += " (no missing values)"
        self.info_ex.setText(out_i)

    threading.Thread(target=update_num_inst).start()

    self.info_attr.setText(
        "%s feature%s" % sp(len(data.domain.attributes)) + descriptions[0])
    self.info_meta.setText(
        "%s meta attribute%s" % sp(len(data.domain.metas)) + descriptions[2])
    if not data.domain.class_vars:
        out_c = 'No target variable.'
    else:
        if len(data.domain.class_vars) > 1:
            out_c = "%s outcome%s" % sp(len(data.domain.class_vars))
        elif isinstance(data.domain.class_var, ContinuousVariable):
            out_c = 'Continuous target variable'
        else:
            out_c = 'Discrete class with %s value%s' % sp(
                len(data.domain.class_var.values))
        out_c += descriptions[1]
    self.info_class.setText(out_c)
def set_info(self, data):
    """Updates data info."""
    def sp(n):
        # Returns (count, plural suffix) for "%s thing%s" formatting.
        return ("No", "s") if n == 0 else (str(n), "" if n == 1 else "s")

    if data is None:
        self.info_ex.setText('No data on input.')
        self.info_attr.setText('')
        self.info_meta.setText('')
        self.info_class.setText('')
    else:
        descriptions = (
            ['', '', ''] if isinstance(data, SqlTable)
            else datacaching.getCached(
                data, self.__compute_density, (data, )))
        no_missing = descriptions is self.__no_missing
        out_i = "~%s instance%s" % sp(data.approx_len())
        if no_missing:
            out_i += " (no missing values)"
        self.info_ex.setText(out_i)

        def update_num_inst():
            # Exact count may be slow; compute it in the background.
            text = "%s instance%s" % sp(len(data))
            if no_missing:
                text += " (no missing values)"
            self.info_ex.setText(text)

        threading.Thread(target=update_num_inst).start()

        domain = data.domain
        self.info_attr.setText(
            "%s feature%s" % sp(len(domain.attributes)) + descriptions[0])
        self.info_meta.setText(
            "%s meta attribute%s" % sp(len(domain.metas)) + descriptions[2])
        if not domain.class_vars:
            out_c = 'No target variable.'
        else:
            if len(domain.class_vars) > 1:
                out_c = "%s outcome%s" % sp(len(domain.class_vars))
            elif isinstance(domain.class_var, ContinuousVariable):
                out_c = 'Continuous target variable'
            else:
                out_c = 'Discrete class with %s value%s' % sp(
                    len(domain.class_var.values))
            out_c += descriptions[1]
        self.info_class.setText(out_c)
def table_summary(table):
    """Summarize `table`: instance counts plus per-part (X/Y/metas)
    missing-value statistics; SQL tables get an approximate summary with
    the exact length resolved in a background thread.
    """
    if isinstance(table, SqlTable):
        approx_len = table.approx_len()
        len_future = concurrent.futures.Future()

        def _len():
            len_future.set_result(len(table))

        threading.Thread(target=_len).start()  # KILL ME !!!
        return ApproxSummary(approx_len, len_future, table.domain,
                             NotAvailable(), NotAvailable(), NotAvailable())

    domain = table.domain
    n_instances = len(table)
    bstats = datacaching.getCached(
        table, basic_stats.DomainBasicStats, (table, True))
    dist = bstats.stats
    X_dist, Y_dist, M_dist = numpy.split(
        dist, numpy.cumsum([len(domain.attributes), len(domain.class_vars)]))

    def parts(array, density, col_dist):
        array = numpy.atleast_2d(array)
        nans = sum(d.nans for d in col_dist)
        non_nans = sum(d.non_nans for d in col_dist)
        if density == Storage.DENSE:
            return DenseArray(nans, non_nans, col_dist)
        if density == Storage.SPARSE:
            return SparseArray(nans, non_nans, col_dist)
        if density == Storage.SPARSE_BOOL:
            return SparseBoolArray(nans, non_nans, col_dist)
        if density == Storage.MISSING:
            return NotAvailable()
        assert False

    return Summary(n_instances, domain,
                   parts(table.X, table.X_density(), X_dist),
                   parts(table.Y, table.Y_density(), Y_dist),
                   parts(table.metas, table.metas_density(), M_dist))
def _stats_for_column(self, column):
    """
    Return BasicStats for `column` index (None for basket columns).
    """
    desc = self.columns[column]
    if isinstance(desc, TableModel.Basket):
        return None
    if self.__stats is None:
        # Lazily fetch (and cache) stats for the whole source table.
        self.__stats = datacaching.getCached(
            self.source, basic_stats.DomainBasicStats, (self.source, True))
    return self.__stats[desc.var]
def _stats_for_column(self, column):
    """
    Return BasicStats for `column` index.

    Basket columns have no single variable, so they yield None.
    """
    coldesc = self.columns[column]
    if isinstance(coldesc, TableModel.Basket):
        return None
    stats = self.__stats
    if stats is None:
        stats = self.__stats = datacaching.getCached(
            self.source, basic_stats.DomainBasicStats, (self.source, True))
    return stats[coldesc.var]
def table_summary(table):
    """Build a Summary (or ApproxSummary for SQL tables) describing the
    table's size and the missing-value statistics of its three parts.
    """
    if isinstance(table, SqlTable):
        approx_len = table.approx_len()
        len_future = concurrent.futures.Future()

        def _len():
            len_future.set_result(len(table))

        threading.Thread(target=_len).start()  # KILL ME !!!
        return ApproxSummary(approx_len, len_future, table.domain,
                             NotAvailable(), NotAvailable(), NotAvailable())
    else:
        domain = table.domain
        n_instances = len(table)
        bstats = datacaching.getCached(
            table, basic_stats.DomainBasicStats, (table, True))
        dist = bstats.stats
        # pylint: disable=unbalanced-tuple-unpacking
        X_dist, Y_dist, M_dist = numpy.split(
            dist,
            numpy.cumsum([len(domain.attributes), len(domain.class_vars)]))

        def parts(array, density, col_dist):
            array = numpy.atleast_2d(array)
            nans = sum(d.nans for d in col_dist)
            non_nans = sum(d.non_nans for d in col_dist)
            if density == Storage.MISSING:
                return NotAvailable()
            wrapper = {Storage.DENSE: DenseArray,
                       Storage.SPARSE: SparseArray,
                       Storage.SPARSE_BOOL: SparseBoolArray}.get(density)
            assert wrapper is not None
            return wrapper(nans, non_nans, col_dist)

        X_part = parts(table.X, table.X_density(), X_dist)
        Y_part = parts(table.Y, table.Y_density(), Y_dist)
        M_part = parts(table.metas, table.metas_density(), M_dist)
        return Summary(n_instances, domain, X_part, Y_part, M_part)
def set_data(self, data, subset_data=None, **args):
    """Set (and scale/jitter) the data and optional subset data.

    BUG FIXES:
    * ``valid_data_array`` was computed as ``no_jittering_data != np.NaN``,
      which is *always* True because NaN compares unequal to everything
      (itself included) — every missing value was marked valid.  It now
      uses ``~np.isnan(...)``.  (``np.NaN`` is also gone in NumPy 2.0.)
    * ``subset_data == None`` replaced with ``subset_data is None``.
    * Deprecated ``RandomState.random_integers(0, m - 1)`` replaced by the
      documented exact equivalent ``randint(0, m)`` (same values for the
      same seed).
    """
    if args.get("skipIfSame", 1):
        if checksum(data) == checksum(self.raw_data) and \
                checksum(subset_data) == checksum(self.raw_subset_data):
            return

    # Reset all derived state.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = self.original_subset_data = None
    self.scaled_data = self.scaled_subset_data = None
    self.no_jittering_scaled_data = None
    self.no_jittering_scaled_subset_data = None
    self.valid_data_array = self.valid_subset_data_array = None

    self.raw_data = None
    self.raw_subset_data = None
    self.have_data = False
    self.have_subset_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return
    full_data = self.merge_data_sets(data, subset_data)

    self.raw_data = data
    self.raw_subset_data = subset_data

    len_data = data and len(data) or 0

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = dict(
        [(full_data.domain[i].name, i)
         for i in range(len(full_data.domain))])
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = \
        isinstance(full_data.domain.class_var, ContinuousVariable)
    self.data_has_discrete_class = \
        isinstance(full_data.domain.class_var, DiscreteVariable)
    self.data_class_name = (self.data_has_class and
                            full_data.domain.class_var.name)
    if self.data_has_class:
        self.data_class_index = \
            self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
    self.have_subset_data = bool(self.raw_subset_data and
                                 len(self.raw_subset_data) > 0)

    self.domain_data_stat = getCached(
        full_data, DomainBasicStats, (full_data,))

    sort_values_for_discrete_attrs = \
        args.get("sort_values_for_discrete_attrs", 1)

    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if isinstance(attr, DiscreteVariable):
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif isinstance(attr, ContinuousVariable):
            self.attr_values[attr.name] = [
                self.domain_data_stat[index].min,
                self.domain_data_stat[index].max]

    # original_data, no_jittering_scaled_data and valid_data_array can be
    # cached for other widgets; scaled_data cannot (it depends on each
    # widget's jitter_continuous / jitter_size).
    if getCached(data, "visualizationData") and subset_data is None:
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
        self.original_subset_data = self.no_jittering_scaled_subset_data = \
            self.valid_subset_data_array = \
            np.array([]).reshape([len(self.original_data), 0])
    else:
        no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
        # FIX: was `no_jittering_data != np.NaN`, which is always True.
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if isinstance(attr, DiscreteVariable):
                # See if the values for discrete attributes have to be
                # resorted.
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # Work on a contiguous copy, otherwise putmask fails.
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line
                    original_data[index] = line  # reorder original too
                no_jittering_data[index] = (
                    (no_jittering_data[index] * 2.0 + 1.0)
                    / float(2 * len(attr.values)))
            elif isinstance(attr, ContinuousVariable):
                # `or 1` prevents division by zero for constant columns.
                diff = (self.domain_data_stat[index].max -
                        self.domain_data_stat[index].min) or 1
                no_jittering_data[index] = (
                    no_jittering_data[index] -
                    self.domain_data_stat[index].min) / diff

        self.original_data = original_data[:, :len_data]
        self.original_subset_data = original_data[:, len_data:]
        self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
        self.no_jittering_scaled_subset_data = \
            no_jittering_data[:, len_data:]
        self.valid_data_array = valid_data_array[:, :len_data]
        self.valid_subset_data_array = valid_data_array[:, len_data:]

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))
    if subset_data:
        setCached(subset_data, "visualizationData",
                  (self.original_subset_data,
                   self.no_jittering_scaled_subset_data,
                   self.valid_subset_data_array))

    # Compute the scaled_data arrays (with jittering).
    scaled_data = np.concatenate(
        [self.no_jittering_scaled_data,
         self.no_jittering_scaled_subset_data], axis=1)

    # A distinct RNG seed per feature, derived from jitter_seed.
    random = np.random.RandomState(seed=self.jitter_seed)
    # randint(0, m) is the documented equivalent of the deprecated
    # random_integers(0, m - 1) and yields identical values.
    rand_seeds = random.randint(0, sys.maxsize, size=len(data.domain))
    for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if isinstance(attr, DiscreteVariable):
            scaled_data[index] += \
                (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                (random.rand(len(full_data)) - 0.5)
        elif isinstance(attr, ContinuousVariable) and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * \
                (0.5 - random.rand(len(full_data)))
            # Reflect out-of-range values back into [0, 1].
            scaled_data[index] = np.absolute(scaled_data[index])
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    if self.have_subset_data:
        # Subset instances that also appear in the main data must get the
        # same jittered coordinates.
        ids_to_indices = dict((inst.id, i)
                              for i, inst in enumerate(self.raw_data))
        subset_ids_map = [[i, ids_to_indices[s.id]]
                          for i, s in enumerate(self.raw_subset_data)
                          if s.id in ids_to_indices]
        if len(subset_ids_map):
            subset_ids_map = np.array(subset_ids_map)
            subset_ids_map[:, 0] += len_data
            scaled_data[:, subset_ids_map[:, 0]] = \
                scaled_data[:, subset_ids_map[:, 1]]

    self.scaled_data = scaled_data[:, :len_data]
    self.scaled_subset_data = scaled_data[:, len_data:]
def data(self, index, role):
    """Qt item-model data() for table cells.

    Resolves the column's part (attributes / class vars / metas) to pick
    background colors and, for non-dense storage, the sparse row slice;
    otherwise serves display text, distribution-bar ratios and the
    custom value/variable roles.

    BUG FIX: the metas branch used ``self.domain.class_vars`` as the
    variable list for ``example.sparse_metas`` (copy-paste from the Y
    branch); sparse meta indices must address ``self.domain.metas``.
    """
    row, col = self.sorted_map[index.row()], index.column()
    example = self.examples[row]

    if role == gui.TableClassValueRole:
        return example.get_class()

    # Determine sparse data / variable list and the background color.
    sp_data = attributes = None
    if col < self.n_attr_cols:
        if role == QtCore.Qt.BackgroundRole:
            return
        density = self.X_density
        if density != Storage.DENSE:
            sp_data, attributes = example.sparse_x, self.domain.attributes
    elif col < self.n_attr_class_cols:
        if role == QtCore.Qt.BackgroundRole:
            return self.cls_color
        density = self.Y_density
        if density != Storage.DENSE:
            sp_data, attributes = example.sparse_y, self.domain.class_vars
    else:
        if role == QtCore.Qt.BackgroundRole:
            return self.meta_color
        density = self.metas_density
        if density != Storage.DENSE:
            sp_data, attributes = \
                example.sparse_metas, self.domain.metas

    if sp_data is not None:
        if role == QtCore.Qt.DisplayRole:
            if density == Storage.SPARSE:
                return ", ".join(
                    "{}={}".format(attributes[i].name,
                                   attributes[i].repr_val(v))
                    for i, v in zip(sp_data.indices, sp_data.data))
            else:
                return ", ".join(
                    attributes[i].name for i in sp_data.indices)
    else:  # not sparse
        attr = self.all_attrs[col]
        val = example[attr]
        if role == QtCore.Qt.DisplayRole:
            return str(val)
        elif (role == gui.TableBarItem.BarRole
              and isinstance(attr, ContinuousVariable)
              and not isnan(val)):
            if self.dist is None:
                self.dist = datacaching.getCached(
                    self.examples, basic_stats.DomainBasicStats,
                    (self.examples, True))
            dist = self.dist[col]
            # `or 1` avoids dividing by a zero-width value range.
            return (val - dist.min) / (dist.max - dist.min or 1)
        elif role == gui.TableValueRole:
            return val
        elif role == gui.TableVariable:
            return val.variable

    return self._other_data.get((index.row(), index.column(), role), None)
def set_data(self, data, subset_data=None, **args):
    """Set (and scale/jitter) the data and optional subset data
    (VarTypes-based variant).

    BUG FIXES:
    * ``valid_data_array`` was computed as ``no_jittering_data != np.NaN``.
      NaN compares unequal to everything, so the expression is always
      True and missing values were flagged as valid; it now uses
      ``~np.isnan(...)`` (``np.NaN`` was also removed in NumPy 2.0).
    * ``subset_data == None`` replaced with ``subset_data is None``.
    * Deprecated ``RandomState.random_integers(0, m - 1)`` replaced by its
      documented exact equivalent ``randint(0, m)``.
    """
    if args.get("skipIfSame", 1):
        if checksum(data) == checksum(self.raw_data) and \
                checksum(subset_data) == checksum(self.raw_subset_data):
            return

    # Reset all derived state.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = self.original_subset_data = None
    self.scaled_data = self.scaled_subset_data = None
    self.no_jittering_scaled_data = None
    self.no_jittering_scaled_subset_data = None
    self.valid_data_array = self.valid_subset_data_array = None

    self.raw_data = None
    self.raw_subset_data = None
    self.have_data = False
    self.have_subset_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return
    full_data = self.merge_data_sets(data, subset_data)

    self.raw_data = data
    self.raw_subset_data = subset_data

    len_data = data and len(data) or 0

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = dict(
        [(full_data.domain[i].name, i)
         for i in range(len(full_data.domain))])
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = bool(
        self.data_has_class and
        full_data.domain.class_var.var_type == VarTypes.Continuous)
    self.data_has_discrete_class = bool(
        self.data_has_class and
        full_data.domain.class_var.var_type == VarTypes.Discrete)
    self.data_class_name = (self.data_has_class and
                            full_data.domain.class_var.name)
    if self.data_has_class:
        self.data_class_index = \
            self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
    self.have_subset_data = bool(self.raw_subset_data and
                                 len(self.raw_subset_data) > 0)

    self.domain_data_stat = getCached(
        full_data, DomainBasicStats, (full_data,))

    sort_values_for_discrete_attrs = \
        args.get("sort_values_for_discrete_attrs", 1)

    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if attr.var_type == VarTypes.Discrete:
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif attr.var_type == VarTypes.Continuous:
            self.attr_values[attr.name] = [
                self.domain_data_stat[index].min,
                self.domain_data_stat[index].max]

    # original_data, no_jittering_scaled_data and valid_data_array can be
    # cached for other widgets; scaled_data depends on this widget's
    # jitter_continuous / jitter_size, so it cannot.
    if getCached(data, "visualizationData") and subset_data is None:
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
        self.original_subset_data = self.no_jittering_scaled_subset_data = \
            self.valid_subset_data_array = \
            np.array([]).reshape([len(self.original_data), 0])
    else:
        no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
        # FIX: was `no_jittering_data != np.NaN`, which is always True.
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.var_type == VarTypes.Discrete:
                # See if the values for discrete attributes have to be
                # resorted.
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # Work on a contiguous copy, otherwise putmask fails.
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line
                    original_data[index] = line  # reorder original too
                no_jittering_data[index] = (
                    (no_jittering_data[index] * 2.0 + 1.0)
                    / float(2 * len(attr.values)))
            elif attr.var_type == VarTypes.Continuous:
                # `or 1` prevents division by zero for constant columns.
                diff = (self.domain_data_stat[index].max -
                        self.domain_data_stat[index].min) or 1
                no_jittering_data[index] = (
                    no_jittering_data[index] -
                    self.domain_data_stat[index].min) / diff

        self.original_data = original_data[:, :len_data]
        self.original_subset_data = original_data[:, len_data:]
        self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
        self.no_jittering_scaled_subset_data = \
            no_jittering_data[:, len_data:]
        self.valid_data_array = valid_data_array[:, :len_data]
        self.valid_subset_data_array = valid_data_array[:, len_data:]

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))
    if subset_data:
        setCached(subset_data, "visualizationData",
                  (self.original_subset_data,
                   self.no_jittering_scaled_subset_data,
                   self.valid_subset_data_array))

    # Compute the scaled_data arrays (with jittering).
    scaled_data = np.concatenate(
        [self.no_jittering_scaled_data,
         self.no_jittering_scaled_subset_data], axis=1)

    # A distinct RNG seed per feature, derived from jitter_seed.
    random = np.random.RandomState(seed=self.jitter_seed)
    # randint(0, m) == deprecated random_integers(0, m - 1), same values.
    rand_seeds = random.randint(0, sys.maxsize, size=len(data.domain))
    for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if attr.var_type == VarTypes.Discrete:
            scaled_data[index] += \
                (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                (random.rand(len(full_data)) - 0.5)
        elif (attr.var_type == VarTypes.Continuous and
              self.jitter_continuous):
            scaled_data[index] += self.jitter_size / 50.0 * \
                (0.5 - random.rand(len(full_data)))
            # Reflect out-of-range values back into [0, 1].
            scaled_data[index] = np.absolute(scaled_data[index])
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    if self.have_subset_data:
        # Subset instances that also appear in the main data must get
        # the same jittered coordinates.
        ids_to_indices = dict((inst.id, i)
                              for i, inst in enumerate(self.raw_data))
        subset_ids_map = [[i, ids_to_indices[s.id]]
                          for i, s in enumerate(self.raw_subset_data)
                          if s.id in ids_to_indices]
        if len(subset_ids_map):
            subset_ids_map = np.array(subset_ids_map)
            subset_ids_map[:, 0] += len_data
            scaled_data[:, subset_ids_map[:, 0]] = \
                scaled_data[:, subset_ids_map[:, 1]]

    self.scaled_data = scaled_data[:, :len_data]
    self.scaled_subset_data = scaled_data[:, len_data:]
def set_data(self, data, **args):
    """Attach *data* (an Orange ``Table`` — assumed; confirm against callers)
    and precompute the scaled / jittered representations used for plotting.

    Parameters
    ----------
    data : Orange data table or None
        The dataset to visualize.  ``None`` clears all cached state.
    **args :
        ``skipIfSame`` (default 1) — skip the work when the checksum matches
        the currently held data; ``sort_values_for_discrete_attrs``
        (default 1) — resort discrete values via
        ``get_variable_value_indices``; ``no_data`` — stop after computing
        per-attribute value ranges.

    Side effects: populates ``self.original_data``,
    ``self.no_jittering_scaled_data``, ``self.valid_data_array`` (cached on
    the table under the key ``"visualizationData"``) and ``self.scaled_data``
    (jittered; recomputed per widget because jitter settings differ).
    """
    if args.get("skipIfSame", 1):
        # Same data as before — nothing to recompute.
        if checksum(data) == checksum(self.raw_data):
            return

    # Reset all derived state before (re)filling it.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = None
    self.scaled_data = None
    self.no_jittering_scaled_data = None
    self.valid_data_array = None

    self.raw_data = None
    self.have_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return

    full_data = data
    self.raw_data = data
    len_data = len(data)  # data is known non-None here

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = {
        full_data.domain[i].name: i for i in range(len(full_data.domain))
    }
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = full_data.domain.has_continuous_class
    self.data_has_discrete_class = full_data.domain.has_discrete_class
    # NOTE: evaluates to False (not None) when there is no class variable,
    # preserving the original `and` idiom's result.
    self.data_class_name = self.data_has_class and full_data.domain.class_var.name
    if self.data_has_class:
        self.data_class_index = self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)

    self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                      (full_data,))

    sort_values_for_discrete_attrs = args.get(
        "sort_values_for_discrete_attrs", 1)

    # Per-attribute value ranges: [0, n_values) for discrete, [min, max]
    # for continuous attributes.
    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if attr.is_discrete:
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif attr.is_continuous:
            self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                           self.domain_data_stat[index].max]

    if 'no_data' in args:
        return

    # original_data, no_jittering_scaled_data and valid_data_array are
    # cached on the table so other visualization widgets can reuse them.
    # scaled_data must be recomputed per widget because jitter_continuous
    # and jitter_size differ between widgets.
    if getCached(data, "visualizationData"):
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
    else:
        # Attributes are rows, instances are columns (hence the transpose).
        no_jittering_data = np.c_[full_data.X, full_data.Y].T
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.is_discrete:
                # See if the values of the discrete attribute have to be
                # resorted into the order given by
                # get_variable_value_indices.
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # Work on a contiguous copy, otherwise np.putmask
                    # does not work.
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line  # save the changed array
                    original_data[index] = line  # reorder values in the original data too
                # Map discrete value i to the center of its 1/n-wide bin.
                no_jittering_data[index] = (
                    (no_jittering_data[index] * 2.0 + 1.0) /
                    float(2 * len(attr.values)))
            elif attr.is_continuous:
                # `or 1` prevents division by zero when all values are equal.
                diff = self.domain_data_stat[index].max - \
                    self.domain_data_stat[index].min or 1
                no_jittering_data[index] = (
                    no_jittering_data[index] -
                    self.domain_data_stat[index].min) / diff

        self.original_data = original_data
        self.no_jittering_scaled_data = no_jittering_data
        self.valid_data_array = valid_data_array

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))

    # Compute the scaled_data array.  Copy: the in-place jitter below must
    # not mutate the cached no_jittering_scaled_data (aliasing it would
    # corrupt the "visualizationData" cache shared with other widgets).
    scaled_data = self.no_jittering_scaled_data.copy()

    # Random generators for jittering.  random_integers is removed in
    # NumPy >= 1.25; randint(0, 2**30) draws from the same range.
    random = np.random.RandomState(seed=self.jitter_seed)
    rand_seeds = random.randint(0, 2 ** 30, size=len(data.domain))
    for index, rseed in zip(range(len(data.domain)), rand_seeds):
        # Need to use a different seed for each feature.
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if attr.is_discrete:
            scaled_data[index] += (self.jitter_size /
                                   (50.0 * max(1, len(attr.values)))) * \
                (random.rand(len(full_data)) - 0.5)
        elif attr.is_continuous and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * \
                (0.5 - random.rand(len(full_data)))
            # Reflect jittered values back into [0, 1].
            scaled_data[index] = np.absolute(scaled_data[index])  # fix values below zero
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            # fix values above 1
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    self.scaled_data = scaled_data[:, :len_data]
def set_data(self, data, **args):
    """Attach *data* (presumably an Orange ``Table`` — verify with callers)
    and build the normalized and jittered arrays used by the plot.

    Parameters
    ----------
    data : Orange data table or None
        Dataset to visualize; ``None`` only resets the cached state.
    **args :
        ``skipIfSame`` (default 1) — return early when the checksum equals
        that of the data already held; ``sort_values_for_discrete_attrs``
        (default 1) — forwarded to ``get_variable_value_indices``;
        ``no_data`` — stop after the per-attribute range computation.

    Side effects: sets ``self.original_data``,
    ``self.no_jittering_scaled_data`` and ``self.valid_data_array`` (cached
    on the table as ``"visualizationData"``) plus the widget-specific
    ``self.scaled_data``.
    """
    if args.get("skipIfSame", 1):
        # Unchanged data — keep the existing precomputed arrays.
        if checksum(data) == checksum(self.raw_data):
            return

    # Clear all derived attributes before repopulating them.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = None
    self.scaled_data = None
    self.no_jittering_scaled_data = None
    self.valid_data_array = None

    self.raw_data = None
    self.have_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return

    full_data = data
    self.raw_data = data
    len_data = len(data)  # data cannot be None at this point

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = {
        full_data.domain[i].name: i
        for i in range(len(full_data.domain))
    }
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = full_data.domain.has_continuous_class
    self.data_has_discrete_class = full_data.domain.has_discrete_class
    # Stays False (not None) when there is no class variable, matching the
    # original `and` idiom.
    self.data_class_name = self.data_has_class and full_data.domain.class_var.name
    if self.data_has_class:
        self.data_class_index = \
            self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)

    self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                      (full_data,))

    sort_values_for_discrete_attrs = args.get(
        "sort_values_for_discrete_attrs", 1)

    # Record per-attribute value ranges: [0, n_values) for discrete
    # attributes, [min, max] for continuous ones.
    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if attr.is_discrete:
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif attr.is_continuous:
            self.attr_values[attr.name] = [
                self.domain_data_stat[index].min,
                self.domain_data_stat[index].max
            ]

    if 'no_data' in args:
        return

    # original_data, no_jittering_scaled_data and valid_data_array are
    # cached on the table so other visualization widgets need not recompute
    # them.  scaled_data, however, depends on per-widget jitter_continuous
    # and jitter_size settings, so it is built below for this widget only.
    if getCached(data, "visualizationData"):
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
    else:
        # Transposed so that attributes are rows and instances are columns.
        no_jittering_data = np.c_[full_data.X, full_data.Y].T
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.is_discrete:
                # Check whether the discrete values must be resorted into
                # the order prescribed by get_variable_value_indices.
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # Use a contiguous copy; np.putmask otherwise fails.
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line  # save the changed array
                    original_data[index] = line  # also reorder the original data
                # Place discrete value i at the center of its 1/n bin.
                no_jittering_data[index] = (
                    (no_jittering_data[index] * 2.0 + 1.0) /
                    float(2 * len(attr.values)))
            elif attr.is_continuous:
                # `or 1` guards against division by zero when the column
                # is constant.
                diff = self.domain_data_stat[index].max - \
                    self.domain_data_stat[index].min or 1
                no_jittering_data[index] = (
                    no_jittering_data[index] -
                    self.domain_data_stat[index].min) / diff

        self.original_data = original_data
        self.no_jittering_scaled_data = no_jittering_data
        self.valid_data_array = valid_data_array

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))

    # Build scaled_data from a copy: the += jitter below is in-place, and
    # aliasing no_jittering_scaled_data would silently corrupt the
    # "visualizationData" cache shared with other widgets.
    scaled_data = self.no_jittering_scaled_data.copy()

    # Jitter seeds.  RandomState.random_integers was removed in
    # NumPy 1.25; randint(0, 2**30) covers the identical range.
    random = np.random.RandomState(seed=self.jitter_seed)
    rand_seeds = random.randint(0, 2 ** 30, size=len(data.domain))
    for index, rseed in zip(range(len(data.domain)), rand_seeds):
        # A distinct seed per feature keeps the jitter independent.
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if attr.is_discrete:
            scaled_data[index] += (self.jitter_size /
                                   (50.0 * max(1, len(attr.values)))) * \
                (random.rand(len(full_data)) - 0.5)
        elif attr.is_continuous and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * \
                (0.5 - random.rand(len(full_data)))
            # Reflect out-of-range values back into [0, 1].
            scaled_data[index] = np.absolute(scaled_data[index])  # fix values below zero
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            # fix values above 1
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    self.scaled_data = scaled_data[:, :len_data]