def _compute_scaled_data(self):
    """Fill self.original_data, self.scaled_data and self.valid_data_array.

    The triple is cached on the data table so that other visualization
    widgets working on the same table can reuse it instead of recomputing.
    """
    data = self.data
    cached = getCached(data, "visualizationData")
    if cached:
        # Another widget already computed the arrays for this table.
        self.original_data, self.scaled_data, self.valid_data_array = cached
        return

    # Force the class column(s) into a 2-d shape before stacking with X.
    class_cols = np.atleast_2d(data.Y).T if data.Y.ndim != 2 else data.Y
    original = np.hstack((data.X, class_cols)).T
    scaled = original.copy()

    self.original_data = original
    self.scaled_data = scaled
    # An entry is valid when it is not NaN.
    self.valid_data_array = ~np.isnan(scaled)

    for index, row in enumerate(scaled):
        # `row` is a view into `scaled`, so the in-place ops below
        # update self.scaled_data directly.
        attr = data.domain[index]
        if attr.is_discrete:
            # Value index k becomes the bin midpoint (2k + 1) / (2 * n).
            row *= 2
            row += 1
            row /= 2 * len(attr.values)
        else:
            # Min-max rescale; a constant column is only shifted to zero
            # to avoid division by zero.
            stats = self.domain_data_stat[index]
            row -= stats.min
            if stats.max != stats.min:
                row /= stats.max - stats.min

    setCached(data, "visualizationData",
              (self.original_data, self.scaled_data, self.valid_data_array))
def _compute_scaled_data(self): data = self.data # We cache scaled_data and validArray to share them between widgets cached = getCached(data, "visualizationData") if cached: self.data, self.scaled_data, self.valid_data_array = cached return Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T all_data = np.hstack((data.X, Y, data.metas)).T self.scaled_data = self.data.copy() self.valid_data_array = np.isfinite(all_data) domain = self.domain for attr in chain(domain.attributes, domain.class_vars, domain.metas): c = self.scaled_data.get_column_view(attr)[0] if attr.is_discrete: c += 0.5 c /= len(attr.values) else: dstat = self.domain_data_stat[attr] c -= dstat.min if dstat.max != dstat.min: c /= dstat.max - dstat.min setCached(data, "visualizationData", (self.data, self.scaled_data, self.valid_data_array))
def _compute_scaled_data(self): data = self.data # We cache scaled_data and validArray to share them between widgets cached = getCached(data, "visualizationData") if cached: self.original_data, self.scaled_data, self.valid_data_array = cached return Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T self.original_data = np.hstack((data.X, Y)).T self.scaled_data = no_jit = self.original_data.copy() self.valid_data_array = ~np.isnan(no_jit) for index in range(len(data.domain)): attr = data.domain[index] if attr.is_discrete: no_jit[index] *= 2 no_jit[index] += 1 no_jit[index] /= 2 * len(attr.values) else: dstat = self.domain_data_stat[index] no_jit[index] -= dstat.min if dstat.max != dstat.min: no_jit[index] /= dstat.max - dstat.min setCached( data, "visualizationData", (self.original_data, self.scaled_data, self.valid_data_array))
def set_data(self, data, subset_data=None, **args):
    """Set the main and subset data tables and (re)compute all derived
    arrays: per-attribute statistics, normalized ("no jittering") data,
    validity masks, and the jittered scaled data.

    Args:
        data: main data table (or None to clear all state).
        subset_data: optional subset table merged with ``data``.
        **args: ``skipIfSame`` (default 1) skips work when both tables
            are unchanged; ``sort_values_for_discrete_attrs`` (default 1)
            controls reordering of discrete values.
    """
    if args.get("skipIfSame", 1):
        if checksum(data) == checksum(self.raw_data) and \
                checksum(subset_data) == checksum(self.raw_subset_data):
            return

    # Reset all derived state before (re)filling it.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = self.original_subset_data = None
    self.scaled_data = self.scaled_subset_data = None
    self.no_jittering_scaled_data = self.no_jittering_scaled_subset_data = None
    self.valid_data_array = self.valid_subset_data_array = None

    self.raw_data = None
    self.raw_subset_data = None
    self.have_data = False
    self.have_subset_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return
    full_data = self.merge_data_sets(data, subset_data)

    self.raw_data = data
    self.raw_subset_data = subset_data

    len_data = data and len(data) or 0

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = dict([(full_data.domain[i].name, i)
                                      for i in range(len(full_data.domain))])
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = bool(
        self.data_has_class and
        full_data.domain.class_var.var_type == VarTypes.Continuous)
    self.data_has_discrete_class = bool(
        self.data_has_class and
        full_data.domain.class_var.var_type == VarTypes.Discrete)
    self.data_class_name = self.data_has_class and full_data.domain.class_var.name
    if self.data_has_class:
        self.data_class_index = self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
    self.have_subset_data = bool(self.raw_subset_data and
                                 len(self.raw_subset_data) > 0)

    self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                      (full_data,))

    sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs",
                                              1)

    # Record per-attribute value ranges used by the widgets.
    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if attr.var_type == VarTypes.Discrete:
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif attr.var_type == VarTypes.Continuous:
            self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                           self.domain_data_stat[index].max]

    # the original_data, no_jittering_scaled_data and validArray are arrays
    # that we can cache so that other visualization widgets don't need to
    # compute it. The scaled_data on the other hand has to be computed for
    # each widget separately because of different
    # jitter_continuous and jitter_size values
    if getCached(data, "visualizationData") and subset_data is None:
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
        self.original_subset_data = self.no_jittering_scaled_subset_data = \
            self.valid_subset_data_array = np.array([]).reshape(
                [len(self.original_data), 0])
    else:
        no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
        # FIX: the previous `no_jittering_data != np.NaN` is element-wise
        # True for every entry (NaN compares unequal to everything with !=),
        # so missing values were never flagged as invalid.
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.var_type == VarTypes.Discrete:
                # see if the values for discrete attributes have to be resorted
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # make the array contiguous, otherwise the putmask
                    # function does not work
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line  # save the changed array
                    original_data[index] = line  # reorder the original too
                # Map value index k to bin midpoint (2k + 1) / (2 * n).
                no_jittering_data[index] = ((no_jittering_data[index] * 2.0
                                             + 1.0)
                                            / float(2 * len(attr.values)))
            elif attr.var_type == VarTypes.Continuous:
                # if all values are the same then prevent division by zero
                diff = self.domain_data_stat[index].max - \
                       self.domain_data_stat[index].min or 1
                no_jittering_data[index] = (no_jittering_data[index] -
                                            self.domain_data_stat[index].min
                                            ) / diff

        # Split the merged arrays back into main-data and subset columns.
        self.original_data = original_data[:, :len_data]
        self.original_subset_data = original_data[:, len_data:]
        self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
        self.no_jittering_scaled_subset_data = no_jittering_data[:, len_data:]
        self.valid_data_array = valid_data_array[:, :len_data]
        self.valid_subset_data_array = valid_data_array[:, len_data:]

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))
    if subset_data:
        setCached(subset_data, "visualizationData",
                  (self.original_subset_data,
                   self.no_jittering_scaled_subset_data,
                   self.valid_subset_data_array))

    # compute the scaled_data arrays (np.concatenate copies, so the jitter
    # below cannot corrupt the cached no-jittering arrays)
    scaled_data = np.concatenate([self.no_jittering_scaled_data,
                                  self.no_jittering_scaled_subset_data],
                                 axis=1)

    # Random generators for jittering
    random = np.random.RandomState(seed=self.jitter_seed)
    rand_seeds = random.random_integers(0, sys.maxsize - 1,
                                        size=len(data.domain))
    for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
        # Need to use a different seed for each feature
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if attr.var_type == VarTypes.Discrete:
            scaled_data[index] += (self.jitter_size /
                                   (50.0 * max(1, len(attr.values)))) * \
                                  (random.rand(len(full_data)) - 0.5)
        elif attr.var_type == VarTypes.Continuous and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * \
                                  (0.5 - random.rand(len(full_data)))
            # fix values below zero
            scaled_data[index] = np.absolute(scaled_data[index])
            # fix values above 1 by reflecting them back into [0, 1]
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    if self.have_subset_data:
        # Fix all subset instances which are also in the main data
        # to have the same jittered values
        ids_to_indices = dict((inst.id, i)
                              for i, inst in enumerate(self.raw_data))
        subset_ids_map = [[i, ids_to_indices[s.id]]
                          for i, s in enumerate(self.raw_subset_data)
                          if s.id in ids_to_indices]
        if len(subset_ids_map):
            subset_ids_map = np.array(subset_ids_map)
            subset_ids_map[:, 0] += len_data
            scaled_data[:, subset_ids_map[:, 0]] = \
                scaled_data[:, subset_ids_map[:, 1]]

    self.scaled_data = scaled_data[:, :len_data]
    self.scaled_subset_data = scaled_data[:, len_data:]
def set_data(self, data, **args): if args.get("skipIfSame", 1): if checksum(data) == checksum(self.raw_data): return self.domain_data_stat = [] self.attr_values = {} self.original_data = None self.scaled_data = None self.no_jittering_scaled_data = None self.valid_data_array = None self.raw_data = None self.have_data = False self.data_has_class = False self.data_has_continuous_class = False self.data_has_discrete_class = False self.data_class_name = None self.data_domain = None self.data_class_index = None if data is None: return full_data = data self.raw_data = data len_data = data and len(data) or 0 self.attribute_names = [attr.name for attr in full_data.domain] self.attribute_name_index = dict([(full_data.domain[i].name, i) for i in range(len(full_data.domain))]) self.attribute_flip_info = {} self.data_domain = full_data.domain self.data_has_class = bool(full_data.domain.class_var) self.data_has_continuous_class = full_data.domain.has_continuous_class self.data_has_discrete_class = full_data.domain.has_discrete_class self.data_class_name = self.data_has_class and full_data.domain.class_var.name if self.data_has_class: self.data_class_index = self.attribute_name_index[self.data_class_name] self.have_data = bool(self.raw_data and len(self.raw_data) > 0) self.domain_data_stat = getCached(full_data, DomainBasicStats, (full_data,)) sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs", 1) for index in range(len(full_data.domain)): attr = full_data.domain[index] if attr.is_discrete: self.attr_values[attr.name] = [0, len(attr.values)] elif attr.is_continuous: self.attr_values[attr.name] = [self.domain_data_stat[index].min, self.domain_data_stat[index].max] if 'no_data' in args: return # the original_data, no_jittering_scaled_data and validArray are arrays # that we can cache so that other visualization widgets don't need to # compute it. 
The scaled_data on the other hand has to be computed for # each widget separately because of different # jitter_continuous and jitter_size values if getCached(data, "visualizationData"): self.original_data, self.no_jittering_scaled_data, self.valid_data_array = getCached(data, "visualizationData") else: no_jittering_data = np.c_[full_data.X, full_data.Y].T valid_data_array = ~np.isnan(no_jittering_data) original_data = no_jittering_data.copy() for index in range(len(data.domain)): attr = data.domain[index] if attr.is_discrete: # see if the values for discrete attributes have to be resorted variable_value_indices = get_variable_value_indices(data.domain[index], sort_values_for_discrete_attrs) if 0 in [i == variable_value_indices[attr.values[i]] for i in range(len(attr.values))]: # make the array a contiguous, otherwise the putmask # function does not work line = no_jittering_data[index].copy() indices = [np.where(line == val, 1, 0) for val in range(len(attr.values))] for i in range(len(attr.values)): np.putmask(line, indices[i], variable_value_indices[attr.values[i]]) no_jittering_data[index] = line # save the changed array original_data[index] = line # reorder also the values in the original data no_jittering_data[index] = ((no_jittering_data[index] * 2.0 + 1.0) / float(2 * len(attr.values))) elif attr.is_continuous: diff = self.domain_data_stat[index].max - self.domain_data_stat[ index].min or 1 # if all values are the same then prevent division by zero no_jittering_data[index] = (no_jittering_data[index] - self.domain_data_stat[index].min) / diff self.original_data = original_data self.no_jittering_scaled_data = no_jittering_data self.valid_data_array = valid_data_array if data: setCached(data, "visualizationData", (self.original_data, self.no_jittering_scaled_data, self.valid_data_array)) # compute the scaled_data arrays scaled_data = self.no_jittering_scaled_data # Random generators for jittering random = np.random.RandomState(seed=self.jitter_seed) rand_seeds 
= random.random_integers(0, 2 ** 30 - 1, size=len(data.domain)) for index, rseed in zip(list(range(len(data.domain))), rand_seeds): # Need to use a different seed for each feature random = np.random.RandomState(seed=rseed) attr = data.domain[index] if attr.is_discrete: scaled_data[index] += (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \ (random.rand(len(full_data)) - 0.5) elif attr.is_continuous and self.jitter_continuous: scaled_data[index] += self.jitter_size / 50.0 * (0.5 - random.rand(len(full_data))) scaled_data[index] = np.absolute(scaled_data[index]) # fix values below zero ind = np.where(scaled_data[index] > 1.0, 1, 0) # fix values above 1 np.putmask(scaled_data[index], ind, 2.0 - np.compress(ind, scaled_data[index])) self.scaled_data = scaled_data[:, :len_data]
def set_data(self, data, **args):
    """Set the data table and (re)compute derived arrays: attribute
    statistics and ranges, normalized ("no jittering") data, the validity
    mask, and the jittered scaled data.

    ``args``: ``skipIfSame`` (default 1) skips work when the table is
    unchanged; ``sort_values_for_discrete_attrs`` (default 1) controls
    reordering of discrete values; presence of ``no_data`` stops after
    computing the attribute statistics.
    """
    if args.get("skipIfSame", 1):
        if checksum(data) == checksum(self.raw_data):
            return

    # Reset all derived state before (re)filling it.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = None
    self.scaled_data = None
    self.no_jittering_scaled_data = None
    self.valid_data_array = None
    self.raw_data = None
    self.have_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return
    full_data = data
    self.raw_data = data

    len_data = data and len(data) or 0

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = dict([
        (full_data.domain[i].name, i) for i in range(len(full_data.domain))
    ])
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = full_data.domain.has_continuous_class
    self.data_has_discrete_class = full_data.domain.has_discrete_class
    self.data_class_name = self.data_has_class and full_data.domain.class_var.name
    if self.data_has_class:
        self.data_class_index = self.attribute_name_index[
            self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)

    self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                      (full_data, ))

    sort_values_for_discrete_attrs = args.get(
        "sort_values_for_discrete_attrs", 1)

    # Record per-attribute value ranges used by the widgets.
    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if attr.is_discrete:
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif attr.is_continuous:
            self.attr_values[attr.name] = [
                self.domain_data_stat[index].min,
                self.domain_data_stat[index].max
            ]

    if 'no_data' in args:
        return

    # the original_data, no_jittering_scaled_data and validArray are arrays
    # that we can cache so that other visualization widgets don't need to
    # compute it. The scaled_data on the other hand has to be computed for
    # each widget separately because of different
    # jitter_continuous and jitter_size values
    if getCached(data, "visualizationData"):
        self.original_data, self.no_jittering_scaled_data, self.valid_data_array = getCached(
            data, "visualizationData")
    else:
        no_jittering_data = np.c_[full_data.X, full_data.Y].T
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.is_discrete:
                # see if the values for discrete attributes have to be resorted
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [
                        i == variable_value_indices[attr.values[i]]
                        for i in range(len(attr.values))
                ]:
                    # make the array a contiguous, otherwise the putmask
                    # function does not work
                    line = no_jittering_data[index].copy()
                    indices = [
                        np.where(line == val, 1, 0)
                        for val in range(len(attr.values))
                    ]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[
                        index] = line  # save the changed array
                    original_data[
                        index] = line  # reorder also the values in the original data
                # Map value index k to bin midpoint (2k + 1) / (2 * n).
                no_jittering_data[index] = (
                    (no_jittering_data[index] * 2.0 + 1.0) /
                    float(2 * len(attr.values)))
            elif attr.is_continuous:
                diff = self.domain_data_stat[
                    index].max - self.domain_data_stat[
                        index].min or 1  # if all values are the same then prevent division by zero
                no_jittering_data[index] = (
                    no_jittering_data[index] -
                    self.domain_data_stat[index].min) / diff

        self.original_data = original_data
        self.no_jittering_scaled_data = no_jittering_data
        self.valid_data_array = valid_data_array

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))

    # compute the scaled_data arrays
    # NOTE(review): this binds scaled_data to the SAME array object as
    # self.no_jittering_scaled_data (which was just stored in the cache
    # above); the in-place `+=` jitter below therefore also mutates the
    # cached no-jittering array. Consider taking a .copy() here.
    scaled_data = self.no_jittering_scaled_data

    # Random generators for jittering
    random = np.random.RandomState(seed=self.jitter_seed)
    rand_seeds = random.random_integers(0, 2**30 - 1, size=len(data.domain))
    for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
        # Need to use a different seed for each feature
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if attr.is_discrete:
            scaled_data[index] += (self.jitter_size /
                                   (50.0 * max(1, len(attr.values)))) * \
                                  (random.rand(len(full_data)) - 0.5)
        elif attr.is_continuous and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * (
                0.5 - random.rand(len(full_data)))
            scaled_data[index] = np.absolute(
                scaled_data[index])  # fix values below zero
            ind = np.where(scaled_data[index] > 1.0, 1,
                           0)  # fix values above 1
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    self.scaled_data = scaled_data[:, :len_data]
def set_data(self, data, subset_data=None, **args):
    """Set the main and subset data tables and (re)compute all derived
    arrays: per-attribute statistics, normalized ("no jittering") data,
    validity masks, and the jittered scaled data.

    Args:
        data: main data table (or None to clear all state).
        subset_data: optional subset table merged with ``data``.
        **args: ``skipIfSame`` (default 1) skips work when both tables
            are unchanged; ``sort_values_for_discrete_attrs`` (default 1)
            controls reordering of discrete values.
    """
    if args.get("skipIfSame", 1):
        if checksum(data) == checksum(self.raw_data) and \
                checksum(subset_data) == checksum(self.raw_subset_data):
            return

    # Reset all derived state before (re)filling it.
    self.domain_data_stat = []
    self.attr_values = {}
    self.original_data = self.original_subset_data = None
    self.scaled_data = self.scaled_subset_data = None
    self.no_jittering_scaled_data = self.no_jittering_scaled_subset_data = None
    self.valid_data_array = self.valid_subset_data_array = None

    self.raw_data = None
    self.raw_subset_data = None
    self.have_data = False
    self.have_subset_data = False
    self.data_has_class = False
    self.data_has_continuous_class = False
    self.data_has_discrete_class = False
    self.data_class_name = None
    self.data_domain = None
    self.data_class_index = None

    if data is None:
        return
    full_data = self.merge_data_sets(data, subset_data)

    self.raw_data = data
    self.raw_subset_data = subset_data

    len_data = data and len(data) or 0

    self.attribute_names = [attr.name for attr in full_data.domain]
    self.attribute_name_index = dict([(full_data.domain[i].name, i)
                                      for i in range(len(full_data.domain))])
    self.attribute_flip_info = {}

    self.data_domain = full_data.domain
    self.data_has_class = bool(full_data.domain.class_var)
    self.data_has_continuous_class = \
        isinstance(full_data.domain.class_var, ContinuousVariable)
    self.data_has_discrete_class = \
        isinstance(full_data.domain.class_var, DiscreteVariable)
    self.data_class_name = self.data_has_class and full_data.domain.class_var.name
    if self.data_has_class:
        self.data_class_index = self.attribute_name_index[self.data_class_name]
    self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
    self.have_subset_data = bool(self.raw_subset_data and
                                 len(self.raw_subset_data) > 0)

    self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                      (full_data,))

    sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs",
                                              1)

    # Record per-attribute value ranges used by the widgets.
    for index in range(len(full_data.domain)):
        attr = full_data.domain[index]
        if isinstance(attr, DiscreteVariable):
            self.attr_values[attr.name] = [0, len(attr.values)]
        elif isinstance(attr, ContinuousVariable):
            self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                           self.domain_data_stat[index].max]

    # the original_data, no_jittering_scaled_data and validArray are arrays
    # that we can cache so that other visualization widgets don't need to
    # compute it. The scaled_data on the other hand has to be computed for
    # each widget separately because of different
    # jitter_continuous and jitter_size values
    if getCached(data, "visualizationData") and subset_data is None:
        self.original_data, self.no_jittering_scaled_data, \
            self.valid_data_array = getCached(data, "visualizationData")
        self.original_subset_data = self.no_jittering_scaled_subset_data = \
            self.valid_subset_data_array = np.array([]).reshape(
                [len(self.original_data), 0])
    else:
        no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
        # FIX: the previous `no_jittering_data != np.NaN` is element-wise
        # True for every entry (NaN compares unequal to everything with !=),
        # so missing values were never flagged as invalid.
        valid_data_array = ~np.isnan(no_jittering_data)
        original_data = no_jittering_data.copy()

        for index in range(len(data.domain)):
            attr = data.domain[index]
            if isinstance(attr, DiscreteVariable):
                # see if the values for discrete attributes have to be resorted
                variable_value_indices = get_variable_value_indices(
                    data.domain[index], sort_values_for_discrete_attrs)
                if 0 in [i == variable_value_indices[attr.values[i]]
                         for i in range(len(attr.values))]:
                    # make the array contiguous, otherwise the putmask
                    # function does not work
                    line = no_jittering_data[index].copy()
                    indices = [np.where(line == val, 1, 0)
                               for val in range(len(attr.values))]
                    for i in range(len(attr.values)):
                        np.putmask(line, indices[i],
                                   variable_value_indices[attr.values[i]])
                    no_jittering_data[index] = line  # save the changed array
                    original_data[index] = line  # reorder the original too
                # Map value index k to bin midpoint (2k + 1) / (2 * n).
                no_jittering_data[index] = ((no_jittering_data[index] * 2.0
                                             + 1.0)
                                            / float(2 * len(attr.values)))
            elif isinstance(attr, ContinuousVariable):
                # if all values are the same then prevent division by zero
                diff = self.domain_data_stat[index].max - \
                       self.domain_data_stat[index].min or 1
                no_jittering_data[index] = (no_jittering_data[index] -
                                            self.domain_data_stat[index].min
                                            ) / diff

        # Split the merged arrays back into main-data and subset columns.
        self.original_data = original_data[:, :len_data]
        self.original_subset_data = original_data[:, len_data:]
        self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
        self.no_jittering_scaled_subset_data = no_jittering_data[:, len_data:]
        self.valid_data_array = valid_data_array[:, :len_data]
        self.valid_subset_data_array = valid_data_array[:, len_data:]

    if data:
        setCached(data, "visualizationData",
                  (self.original_data, self.no_jittering_scaled_data,
                   self.valid_data_array))
    if subset_data:
        setCached(subset_data, "visualizationData",
                  (self.original_subset_data,
                   self.no_jittering_scaled_subset_data,
                   self.valid_subset_data_array))

    # compute the scaled_data arrays (np.concatenate copies, so the jitter
    # below cannot corrupt the cached no-jittering arrays)
    scaled_data = np.concatenate([self.no_jittering_scaled_data,
                                  self.no_jittering_scaled_subset_data],
                                 axis=1)

    # Random generators for jittering
    random = np.random.RandomState(seed=self.jitter_seed)
    rand_seeds = random.random_integers(0, sys.maxsize - 1,
                                        size=len(data.domain))
    for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
        # Need to use a different seed for each feature
        random = np.random.RandomState(seed=rseed)
        attr = data.domain[index]
        if isinstance(attr, DiscreteVariable):
            scaled_data[index] += (self.jitter_size /
                                   (50.0 * max(1, len(attr.values)))) * \
                                  (random.rand(len(full_data)) - 0.5)
        elif isinstance(attr, ContinuousVariable) and self.jitter_continuous:
            scaled_data[index] += self.jitter_size / 50.0 * \
                                  (0.5 - random.rand(len(full_data)))
            # fix values below zero
            scaled_data[index] = np.absolute(scaled_data[index])
            # fix values above 1 by reflecting them back into [0, 1]
            ind = np.where(scaled_data[index] > 1.0, 1, 0)
            np.putmask(scaled_data[index], ind,
                       2.0 - np.compress(ind, scaled_data[index]))

    if self.have_subset_data:
        # Fix all subset instances which are also in the main data
        # to have the same jittered values
        ids_to_indices = dict((inst.id, i)
                              for i, inst in enumerate(self.raw_data))
        subset_ids_map = [[i, ids_to_indices[s.id]]
                          for i, s in enumerate(self.raw_subset_data)
                          if s.id in ids_to_indices]
        if len(subset_ids_map):
            subset_ids_map = np.array(subset_ids_map)
            subset_ids_map[:, 0] += len_data
            scaled_data[:, subset_ids_map[:, 0]] = \
                scaled_data[:, subset_ids_map[:, 1]]

    self.scaled_data = scaled_data[:, :len_data]
    self.scaled_subset_data = scaled_data[:, len_data:]