class LogarithmicScale(Preprocess):
    """Replace every value ``x`` in ``data.X`` with ``log(x + 1)``.

    The base of the logarithm is chosen with one of the ``Base`` members,
    which are also exposed as class attributes: ``BinaryLog`` (base 2),
    ``NaturalLog`` (base e) and ``CommonLog`` (base 10).
    """

    Base = Enum("LogarithmicScale", ("BinaryLog", "NaturalLog", "CommonLog"),
                qualname="LogarithmicScale.Base")
    BinaryLog, NaturalLog, CommonLog = Base

    def __init__(self, base=BinaryLog):
        """Remember which logarithm base to apply.

        Args:
            base: one of ``BinaryLog``, ``NaturalLog`` or ``CommonLog``.
        """
        self.base = base

    def __call__(self, data: Table) -> Table:
        """Return a copy of ``data`` whose ``X`` holds ``log(X + 1)``.

        For a sparse ``X`` only the explicitly stored entries are
        transformed; the transformation is written into the copy's buffer.

        Raises:
            ValueError: if ``self.base`` is not a supported base.
        """
        # Resolve the function first so that an unsupported base is reported
        # clearly (and before any data is copied) instead of surfacing as an
        # unbound local variable further down.
        if self.base == LogarithmicScale.BinaryLog:
            def func(x, *args, **kwargs):
                return np.log2(x + 1, *args, **kwargs)
        elif self.base == LogarithmicScale.CommonLog:
            def func(x, *args, **kwargs):
                return np.log10(x + 1, *args, **kwargs)
        elif self.base == LogarithmicScale.NaturalLog:
            func = np.log1p
        else:
            raise ValueError(
                "Unsupported logarithm base: {!r}".format(self.base))

        new_data = data.copy()
        if sp.issparse(new_data.X):
            # Operate on the stored values only.
            func(new_data.X.data, out=new_data.X.data)
        else:
            func(new_data.X, out=new_data.X)
        return new_data
class Normalize(Preprocess):
    """Interface of a normalization preprocessor.

    Only the selected method is stored here; both ``__call__`` and
    ``normalize`` raise ``NotImplementedError``.
    """

    Method = Enum("Normalize", ("CPM", "Median"),
                  qualname="Normalize.Method")
    CPM = Method.CPM
    Median = Method.Median

    def __init__(self, method=CPM):
        """Store the normalization ``method`` (``CPM`` or ``Median``)."""
        self.method = method

    def __call__(self, *args):
        """Not implemented in this class."""
        raise NotImplementedError

    def normalize(self, *args):
        """Not implemented in this class."""
        raise NotImplementedError
class SelectMostVariableGenes(Preprocess):
    """Keep the columns with the highest within-group z-scores.

    Columns are grouped by the percentile of their mean. Inside every group
    a statistic chosen by ``method`` (dispersion, variance or mean) is
    z-scored, and the ``n_genes`` columns with the largest z-scores are kept.
    """

    Method = Enum("SelectMostVariableGenes",
                  ("Dispersion", "Variance", "Mean"),
                  qualname="SelectMostVariableGenes.Method")
    Dispersion, Variance, Mean = Method

    def __init__(self, method=Dispersion, n_genes=1000, n_groups=20):
        """Store the settings; a falsy or sub-2 ``n_groups`` becomes 1."""
        self.method = method
        self.n_genes = n_genes
        if n_groups and n_groups > 1:
            self.n_groups = n_groups
        else:
            self.n_groups = 1

    def __call__(self, data: Table) -> Table:
        """Return ``data`` restricted to the selected columns."""
        group_count = min(self.n_groups, len(data.domain.attributes))
        col_mean = ut.nanmean(data.X, axis=0)
        col_var = ut.nanvar(data.X, axis=0)

        # Assign every column to a group according to the percentile of
        # its mean.
        ranks = [percentileofscore(col_mean, value) for value in col_mean]
        edges = np.histogram(ranks, group_count)[1]
        membership = np.digitize(ranks, edges, True)
        # Right limit is treated differently in histogram and digitize
        # See https://github.com/numpy/numpy/issues/4217
        membership[membership == 0] = 1

        scores = np.zeros_like(col_mean)
        for label in range(1, group_count + 1):
            (members,) = np.where(membership == label)
            if self.method == SelectMostVariableGenes.Dispersion:
                denominator = col_mean[members]
                # A zero mean yields a dispersion of zero.
                raw = np.divide(col_var[members], denominator,
                                out=np.zeros_like(denominator),
                                where=denominator != 0)
            elif self.method == SelectMostVariableGenes.Variance:
                raw = col_var[members]
            elif self.method == SelectMostVariableGenes.Mean:
                raw = col_mean[members]

            with np.errstate(invalid="ignore"):
                scores[members] = zscore(raw)

        ordering = np.argsort(np.nan_to_num(scores))
        return self._filter_columns(data, ordering[-self.n_genes:])

    @staticmethod
    def _filter_columns(data, indices):
        """Transform ``data`` to a domain with only the given attributes.

        The attributes are kept in ascending index order; class variables
        and metas are carried over unchanged.
        """
        source = data.domain
        kept = tuple(np.array(source.attributes)[sorted(indices)])
        return data.transform(Domain(kept, source.class_vars, source.metas))
class Binarize(Preprocess):
    """Turn ``data.X`` into a boolean matrix by thresholding.

    ``GreaterOrEqual`` marks values ``>= threshold``; ``Greater`` marks
    values ``> threshold``.
    """

    Condition = Enum("Binarize", ("GreaterOrEqual", "Greater"),
                     qualname="Binarize.Condition")
    GreaterOrEqual = Condition.GreaterOrEqual
    Greater = Condition.Greater

    def __init__(self, condition=GreaterOrEqual, threshold=1):
        """Store the comparison ``condition`` and the ``threshold``."""
        self.condition, self.threshold = condition, threshold

    def __call__(self, data: Table) -> Table:
        """Return a copy of ``data`` with ``X`` replaced by the comparison.

        The copy is returned unchanged when ``condition`` is neither of the
        two known members.
        """
        result = data.copy()
        mode = self.condition
        if mode == Binarize.GreaterOrEqual:
            result.X = result.X >= self.threshold
            return result
        if mode == Binarize.Greater:
            result.X = result.X > self.threshold
        return result
class LogarithmicScale(Preprocess):
    """Replace ``data.X`` with ``log(1 + X)`` in the selected base."""

    Base = Enum("LogarithmicScale", ("BinaryLog", "NaturalLog", "CommonLog"),
                qualname="LogarithmicScale.Base")
    BinaryLog, NaturalLog, CommonLog = Base

    def __init__(self, base=BinaryLog):
        """Store the logarithm ``base`` (a member of ``Base``)."""
        self.base = base

    def __call__(self, data):
        """Return a copy of ``data`` with the transformed ``X``.

        The copy is left untouched when ``base`` is not a known member.
        """
        transformed = data.copy()
        # Ordered (member, function) pairs; the first matching member wins.
        candidates = (
            (LogarithmicScale.BinaryLog, np.log2),
            (LogarithmicScale.NaturalLog, np.log),
            (LogarithmicScale.CommonLog, np.log10),
        )
        for member, log_fn in candidates:
            if self.base == member:
                transformed.X = log_fn(1 + data.X)
                break
        return transformed
class Continuize(Preprocess):
    """Transform data to a domain built by ``DomainContinuizer``.

    The class attributes below name the available multinomial treatments.
    """

    (Indicators, FirstAsBase, FrequentAsBase, Remove, RemoveMultinomial,
     ReportError, AsOrdinal, AsNormalizedOrdinal, Leave) = Enum(
         "Continuize",
         "Indicators, FirstAsBase, FrequentAsBase,"
         "Remove, RemoveMultinomial, ReportError, AsOrdinal,"
         "AsNormalizedOrdinal, Leave")

    def __init__(self, zero_based=True, multinomial_treatment=Indicators):
        """Store the options that are handed to ``DomainContinuizer``."""
        self.zero_based, self.multinomial_treatment = \
            zero_based, multinomial_treatment

    def __call__(self, data):
        """Return ``data`` transformed to the continuized domain."""
        # Imported here, as in the rest of this class's history, rather
        # than at module level.
        from . import continuize

        build_domain = continuize.DomainContinuizer(
            zero_based=self.zero_based,
            multinomial_treatment=self.multinomial_treatment)
        return data.transform(build_domain(data))
class FilterString(ValueFilter):
    """
    Subfilter for string variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterString.Equal`, `NotEqual`, `Less`,
        `LessEqual`, `Greater`, `GreaterEqual`, `Between`, `Outside`,
        `Contains`, `StartsWith`, `EndsWith` or `IsDefined`.

    .. attribute:: case_sensitive

        Tells whether the comparisons are case sensitive
    """
    Type = Enum(
        'FilterString',
        'Equal, NotEqual, Less, LessEqual, Greater,'
        'GreaterEqual, Between, Outside, Contains,'
        'StartsWith, EndsWith, IsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual,
     Between, Outside, Contains, StartsWith, EndsWith, IsDefined) = Type

    def __init__(self, position, oper, ref=None, max=None,
                 case_sensitive=True, **a):
        """Create the filter.

        Args:
            position: the column the filter applies to.
            oper: one of the operators listed on the class.
            ref: the reference value.
            max: the upper threshold for `Between` and `Outside`.
            case_sensitive: whether comparisons are case sensitive.
            **a: only ``min`` is accepted, as an alias for ``ref``.

        Raises:
            TypeError: if any keyword other than ``min`` is passed.
        """
        super().__init__(position)
        if a:
            if len(a) != 1 or "min" not in a:
                # The message used to name FilterContinuous by mistake.
                raise TypeError(
                    "FilterString got unexpected keyword arguments")
            ref = a["min"]
        self.ref = ref
        self.max = max
        self.oper = oper
        self.case_sensitive = case_sensitive
        self.position = position

    @property
    def min(self):
        """Alias of :attr:`ref`."""
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """Return whether ``inst`` satisfies the filter.

        Raises:
            ValueError: if ``oper`` is not a known operator.
        """
        # the function is a large 'switch'; pylint: disable=too-many-branches
        value = inst[inst.domain.index(self.column)]
        if self.oper == self.IsDefined:
            return not np.isnan(value)
        if self.case_sensitive:
            value = str(value)
            refval = str(self.ref)
        else:
            value = str(value).lower()
            refval = str(self.ref).lower()
        if self.oper == self.Equal:
            return value == refval
        if self.oper == self.NotEqual:
            return value != refval
        if self.oper == self.Less:
            return value < refval
        if self.oper == self.LessEqual:
            return value <= refval
        if self.oper == self.Greater:
            return value > refval
        if self.oper == self.GreaterEqual:
            return value >= refval
        if self.oper == self.Contains:
            return refval in value
        if self.oper == self.StartsWith:
            return value.startswith(refval)
        if self.oper == self.EndsWith:
            return value.endswith(refval)
        if self.oper in (self.Between, self.Outside):
            # The upper bound is prepared only for the two range operators,
            # so that an invalid operator reaches the ValueError below even
            # when `max` was never set.
            high = self.max if self.case_sensitive else self.max.lower()
            inside = refval <= value <= high
            return inside if self.oper == self.Between else not inside
        raise ValueError("invalid operator")
class FilterContinuous(ValueFilter):
    """
    Subfilter for continuous variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterContinuous.Equal`, `NotEqual`,
        `Less`, `LessEqual`, `Greater`, `GreaterEqual`, `Between`,
        `Outside` or `IsDefined`.
    """
    Type = Enum(
        'FilterContinuous',
        'Equal, NotEqual, Less, LessEqual, Greater,'
        'GreaterEqual, Between, Outside, IsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual,
     Between, Outside, IsDefined) = Type

    def __init__(self, position, oper, ref=None, max=None, min=None):
        """Create the filter; a non-``None`` ``min`` overrides ``ref``."""
        super().__init__(position)
        if min is None:
            self.ref = ref
        else:
            self.ref = min
        self.max = max
        self.oper = oper
        self.position = position

    @property
    def min(self):
        """Alias of :attr:`ref`."""
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """Return whether ``inst`` satisfies the filter.

        A missing (NaN) value only matches `Equal` against a NaN reference.

        Raises:
            ValueError: if ``oper`` is not a known operator.
        """
        value = inst[inst.domain.index(self.column)]
        oper = self.oper
        ref = self.ref

        if isnan(value):
            return oper == self.Equal and isnan(ref)

        if oper == self.Equal:
            return value == ref
        if oper == self.NotEqual:
            return value != ref
        if oper == self.Less:
            return value < ref
        if oper == self.LessEqual:
            return value <= ref
        if oper == self.Greater:
            return value > ref
        if oper == self.GreaterEqual:
            return value >= ref
        if oper == self.Between:
            return ref <= value <= self.max
        if oper == self.Outside:
            return not (ref <= value <= self.max)
        if oper == self.IsDefined:
            return True
        raise ValueError("invalid operator")

    def __eq__(self, other):
        """Filters are equal when column, operator and both bounds match."""
        if not isinstance(other, FilterContinuous):
            return False
        return (self.column == other.column and self.oper == other.oper
                and self.ref == other.ref and self.max == other.max)

    def __str__(self):
        """Return a human-readable description of the condition."""
        target = self.column
        if isinstance(target, str):
            label = target
        elif isinstance(target, Variable):
            label = target.name
        else:
            label = "feature({})".format(target)

        symbols = {
            self.Equal: "=",
            self.NotEqual: "≠",
            self.Less: "<",
            self.LessEqual: "≤",
            self.Greater: ">",
            self.GreaterEqual: "≥",
        }
        oper = self.oper
        if oper in symbols:
            return "{} {} {}".format(label, symbols[oper], self.ref)
        if oper == self.Between:
            return "{} ≤ {} ≤ {}".format(self.min, label, self.max)
        if oper == self.Outside:
            return "not {} ≤ {} ≤ {}".format(self.min, label, self.max)
        if oper == self.IsDefined:
            return "{} is defined".format(label)
        return "invalid operator"
class OWLinearProjection(widget.OWWidget):
    """Widget that plots data projected onto a two-dimensional plane.

    The projection axes come from one of the ``Placement`` options:
    circular placement, linear discriminant analysis, principal component
    analysis, or a projection table received on the "Projection" input.
    """

    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    selection_indices = settings.Setting(None, schema_only=True)

    class Inputs:
        data = Input("Data", Table, default=True)
        data_subset = Input("Data Subset", Table)
        projection = Input("Projection", Table)

    class Outputs:
        selected_data = Output("Selected Data", Table, default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
        components = Output("Components", Table)

    Placement = Enum("Placement",
                     dict(Circular=0, LDA=1, PCA=2, Projection=3),
                     type=int, qualname="OWLinearProjection.Placement")

    # Prefix of the component labels written to the "Components" output.
    Component_name = {
        Placement.Circular: "C",
        Placement.LDA: "LD",
        Placement.PCA: "PC"
    }
    # Prefix of the names of the two generated coordinate variables.
    Variable_name = {
        Placement.Circular: "circular",
        Placement.LDA: "lda",
        Placement.PCA: "pca",
        Placement.Projection: "projection"
    }

    jitter_sizes = [0, 0.1, 0.5, 1.0, 2.0]

    settings_version = 3
    settingsHandler = settings.DomainContextHandler()

    variable_state = settings.ContextSetting({})
    placement = settings.Setting(Placement.Circular)
    radius = settings.Setting(0)
    auto_commit = settings.Setting(True)

    resolution = 256

    graph = settings.SettingProvider(OWLinProjGraph)
    ReplotRequest = QEvent.registerEventType()
    vizrank = settings.SettingProvider(LinearProjectionVizRank)
    graph_name = "graph.plot_widget.plotItem"

    class Warning(widget.OWWidget.Warning):
        no_cont_features = widget.Msg("Plotting requires numeric features")
        not_enough_components = widget.Msg(
            "Input projection has less than 2 components")
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        proj_and_domain_match = widget.Msg(
            "Projection and Data domains do not match")
        no_valid_data = widget.Msg("No projection due to invalid data")

    def __init__(self):
        """Build the plot, the variable lists and the control area."""
        super().__init__()

        self.data = None
        self.projection = None
        self.subset_data = None
        self._subset_mask = None
        self._selection = None
        self.__replot_requested = False
        self.n_cont_var = 0
        #: Remember the saved state to restore
        self.__pending_selection_restore = self.selection_indices
        self.selection_indices = None

        self.variable_x = None
        self.variable_y = None

        box = gui.vBox(self.mainArea, True, margin=0)
        self.graph = OWLinProjGraph(self, box, "Plot",
                                    view_box=LinProjInteractiveViewBox)
        box.layout().addWidget(self.graph.plot_widget)
        plot = self.graph.plot_widget

        SIZE_POLICY = (QSizePolicy.Minimum, QSizePolicy.Maximum)

        self.variables_selection = VariablesSelection()
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_other = VariableListModel(enable_dnd=True)
        self.variables_selection(self, self.model_selected, self.model_other)

        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            self.controlArea, self, "Suggest Features", self._vizrank)
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank)

        box = gui.widgetBox(self.controlArea, "Placement",
                            sizePolicy=SIZE_POLICY)
        self.radio_placement = gui.radioButtonsInBox(
            box, self, "placement",
            btnLabels=[
                "Circular Placement",
                "Linear Discriminant Analysis",
                "Principal Component Analysis",
                "Use input projection"
            ],
            callback=self._change_placement)

        self.viewbox = plot.getViewBox()
        self.replot = None

        g = self.graph.gui
        box = g.point_properties_box(self.controlArea)
        self.models = g.points_models
        g.add_widget(g.JitterSizeSlider, box)
        box.setSizePolicy(*SIZE_POLICY)

        box = gui.widgetBox(self.controlArea, "Hide axes",
                            sizePolicy=SIZE_POLICY)
        self.rslider = gui.hSlider(box, self, "radius", minValue=0,
                                   maxValue=100, step=5, label="Radius",
                                   createLabel=False, ticks=True,
                                   callback=self.update_radius)
        self.rslider.setTickInterval(0)
        self.rslider.setPageStep(10)

        box = gui.vBox(self.controlArea, "Plot Properties")
        box.setSizePolicy(*SIZE_POLICY)

        g.add_widgets([
            g.ShowLegend,
            g.ToolTipShowsAll,
            g.ClassDensity,
            g.LabelOnlySelected
        ], box)

        box = self.graph.box_zoom_select(self.controlArea)
        box.setSizePolicy(*SIZE_POLICY)

        self.icons = gui.attributeIconDict

        p = self.graph.plot_widget.palette()
        self.graph.set_palette(p)

        gui.auto_commit(self.controlArea, self, "auto_commit",
                        "Send Selection", auto_label="Send Automatically")
        self.graph.zoom_actions(self)

        self._new_plotdata()
        self._change_placement()
        self.graph.jitter_continuous = True

    def reset_graph_data(self):
        """Rescale the graph data and redraw with a reset view."""
        if self.data is not None:
            self.graph.rescale_data()
            self._update_graph(reset_view=True)

    def keyPressEvent(self, event):
        """Forward the event and refresh the tooltip for the modifiers."""
        super().keyPressEvent(event)
        self.graph.update_tooltip(event.modifiers())

    def keyReleaseEvent(self, event):
        """Forward the event and refresh the tooltip for the modifiers."""
        super().keyReleaseEvent(event)
        self.graph.update_tooltip(event.modifiers())

    def _vizrank(self, attrs):
        """Make ``attrs`` the selected variables (VizRank callback)."""
        self.variables_selection.display_none()
        self.model_selected[:] = attrs[:]
        self.model_other[:] = [
            var for var in self.model_other if var not in attrs
        ]

    def _change_placement(self):
        """React to a change of placement: update controls and replot."""
        placement = self.placement
        p_Circular = self.Placement.Circular
        p_LDA = self.Placement.LDA
        self.variables_selection.set_enabled(placement in [p_Circular, p_LDA])
        self._vizrank_color_change()
        self.rslider.setEnabled(placement != p_Circular)
        self._setup_plot()
        self.commit()

    def _get_min_radius(self):
        """Return the length below which an axis is hidden.

        It is ``radius`` percent of the longest axis, plus a small epsilon.
        """
        return self.radius * np.max(
            np.linalg.norm(self.plotdata.axes, axis=1)) / 100 + 1e-5

    def update_radius(self):
        """Update axis visibility and the hide circle for ``radius``."""
        # Update the anchor/axes visibility
        pd = self.plotdata
        assert pd is not None
        if pd.hidecircle is None:
            return
        min_radius = self._get_min_radius()
        for anchor, item in zip(pd.axes, pd.axisitems):
            item.setVisible(np.linalg.norm(anchor) > min_radius)
        pd.hidecircle.setRect(
            QRectF(-min_radius, -min_radius,
                   2 * min_radius, 2 * min_radius))

    def _new_plotdata(self):
        """Reset ``self.plotdata`` to an empty namespace."""
        self.plotdata = namespace(
            valid_mask=None,
            embedding_coords=None,
            axisitems=[],
            axes=[],
            variables=[],
            data=None,
            hidecircle=None
        )

    def _anchor_circle(self, variables):
        """Add an anchor item per axis and, unless circular, a hide circle."""
        # minimum visible anchor radius (radius)
        min_radius = self._get_min_radius()
        axisitems = []
        for anchor, var in zip(self.plotdata.axes, variables[:]):
            axitem = AnchorItem(
                line=QLineF(0, 0, *anchor), text=var.name,
            )
            axitem.setVisible(np.linalg.norm(anchor) > min_radius)
            axitem.setPen(pg.mkPen((100, 100, 100)))
            axitem.setArrowVisible(True)
            self.viewbox.addItem(axitem)
            axisitems.append(axitem)

        self.plotdata.axisitems = axisitems
        if self.placement == self.Placement.Circular:
            return

        hidecircle = QGraphicsEllipseItem()
        hidecircle.setRect(
            QRectF(-min_radius, -min_radius,
                   2 * min_radius, 2 * min_radius))

        _pen = QPen(Qt.lightGray, 1)
        _pen.setCosmetic(True)
        hidecircle.setPen(_pen)

        self.viewbox.addItem(hidecircle)
        self.plotdata.hidecircle = hidecircle

    def update_colors(self):
        """Re-evaluate the VizRank button after a color change."""
        self._vizrank_color_change()

    def clear(self):
        """Reset the data, the variable lists, the plot and the selection."""
        # Clear/reset the widget state
        self.data = None
        self.model_selected.clear()
        self.model_other.clear()
        self._clear_plot()
        self.selection_indices = None

    def _clear_plot(self):
        """Remove the axis items and hide circle and reset the plot data."""
        self.Warning.trivial_components.clear()
        for axisitem in self.plotdata.axisitems:
            self.viewbox.removeItem(axisitem)
        if self.plotdata.hidecircle:
            self.viewbox.removeItem(self.plotdata.hidecircle)
        self._new_plotdata()
        self.graph.hide_axes()

    def invalidate_plot(self):
        """
        Schedule a delayed replot.
        """
        if not self.__replot_requested:
            self.__replot_requested = True
            QApplication.postEvent(self, QEvent(self.ReplotRequest),
                                   Qt.LowEventPriority - 10)

    def init_attr_values(self):
        """Pass the current data to the graph's ``set_domain``."""
        self.graph.set_domain(self.data)

    def _vizrank_color_change(self):
        """Enable or disable the VizRank button and set its tooltip."""
        is_enabled = False
        if self.data is None:
            self.btn_vizrank.setToolTip("There is no data.")
            return
        candidates = [
            v for v in chain(self.data.domain.variables,
                             self.data.domain.metas)
            if v.is_primitive and v is not self.graph.attr_color
        ]
        self.n_cont_var = len(candidates)
        if self.placement not in [self.Placement.Circular,
                                  self.Placement.LDA]:
            msg = "Suggest Features works only for Circular and " \
                  "Linear Discriminant Analysis Projection"
        elif self.graph.attr_color is None:
            msg = "Color variable has to be selected"
        elif self.graph.attr_color.is_continuous and \
                self.placement == self.Placement.LDA:
            msg = "Suggest Features does not work for Linear " \
                  "Discriminant Analysis Projection " \
                  "when continuous color variable is selected."
        elif len(candidates) < 3:
            msg = "Not enough available continuous variables"
        else:
            is_enabled = True
            msg = ""
        self.btn_vizrank.setToolTip(msg)
        self.btn_vizrank.setEnabled(is_enabled)
        self.vizrank.stop_and_reset(is_enabled)

    @Inputs.projection
    def set_projection(self, projection):
        """Set the input projection; it must have at least two rows."""
        self.Warning.not_enough_components.clear()
        if projection and len(projection) < 2:
            self.Warning.not_enough_components()
            projection = None
        if projection is not None:
            self.placement = self.Placement.Projection
        self.projection = projection

    @Inputs.data
    def set_data(self, data):
        """
        Set the input dataset.

        Args:
            data (Orange.data.table): data instances
        """
        def sql(data):
            # Download SQL tables; large ones are sampled first.
            if isinstance(data, SqlTable):
                if data.approx_len() < 4000:
                    data = Table(data)
                else:
                    self.information("Data has been sampled")
                    data_sample = data.sample_time(1, no_cache=True)
                    data_sample.download_data(2000, partial=True)
                    data = Table(data_sample)
            return data

        def settings(data):
            # get the default encoded state, replacing the position with Inf
            state = VariablesSelection.encode_var_state(
                [list(self.model_selected), list(self.model_other)])
            state = {
                key: (source_ind, np.inf)
                for key, (source_ind, _) in state.items()
            }

            self.openContext(data.domain)
            if self.__pending_selection_restore is not None:
                self._selection = np.array(
                    self.__pending_selection_restore, dtype=int)
                self.__pending_selection_restore = None

            # update the defaults state (the encoded state must contain
            # all variables in the input domain)
            state.update(self.variable_state)
            # ... and restore it with saved positions taking precedence over
            # the defaults
            selected, other = VariablesSelection.decode_var_state(
                state, [list(self.model_selected), list(self.model_other)])
            return selected, other

        self.closeContext()
        self.clear()
        self.Warning.no_cont_features.clear()
        self.information()
        data = sql(data)
        if data is not None:
            domain = data.domain
            cont_vars = [
                var for var in chain(domain.variables, domain.metas)
                if var.is_continuous
            ]
            if not len(cont_vars):
                self.Warning.no_cont_features()
                data = None
        self.data = data
        self.init_attr_values()
        if data is not None and len(data):
            self._initialize(data)
            self.model_selected[:], self.model_other[:] = settings(data)
            self.vizrank.stop_and_reset()
            self.vizrank.attrs = self.data.domain.attributes \
                if self.data is not None else []

    def _check_possible_opt(self):
        """Enable only the placements usable with the current inputs."""
        def set_enabled(is_enabled):
            for btn in self.radio_placement.buttons:
                btn.setEnabled(is_enabled)
            self.variables_selection.set_enabled(is_enabled)

        p_Circular = self.Placement.Circular
        p_LDA = self.Placement.LDA
        p_Input = self.Placement.Projection
        if self.data:
            set_enabled(True)
            domain = self.data.domain
            # LDA needs a discrete class with at least two values.
            if not domain.has_discrete_class or len(
                    domain.class_var.values) < 2:
                self.radio_placement.buttons[p_LDA].setEnabled(False)
                if self.placement == p_LDA:
                    self.placement = p_Circular
            if not self.projection:
                self.radio_placement.buttons[p_Input].setEnabled(False)
                if self.placement == p_Input:
                    self.placement = p_Circular
            self._setup_plot()
        else:
            self.graph.new_data(None)
            self.rslider.setEnabled(False)
            set_enabled(False)
        self.commit()

    @Inputs.data_subset
    def set_subset_data(self, subset):
        """
        Set the supplementary input subset dataset.

        Args:
            subset (Orange.data.table): subset of data instances
        """
        self.subset_data = subset
        self._subset_mask = None
        self.controls.graph.alpha_value.setEnabled(subset is None)

    def handleNewSignals(self):
        """Recompute the subset mask and refresh the plot and outputs."""
        if self.data is not None and self.subset_data is not None:
            # Update the plot's highlight items
            dataids = self.data.ids.ravel()
            subsetids = np.unique(self.subset_data.ids)
            self._subset_mask = np.in1d(
                dataids, subsetids, assume_unique=True)
        self._check_possible_opt()
        self._change_placement()
        self.commit()

    def customEvent(self, event):
        """Handle the delayed replot request posted by invalidate_plot."""
        if event.type() == OWLinearProjection.ReplotRequest:
            self.__replot_requested = False
            self._setup_plot()
            self.commit()
        else:
            super().customEvent(event)

    def closeContext(self):
        """Store the encoded variable lists before closing the context."""
        self.variable_state = VariablesSelection.encode_var_state(
            [list(self.model_selected), list(self.model_other)]
        )
        super().closeContext()

    def _initialize(self, data):
        """Select the first three continuous variables, list the rest."""
        # Initialize the GUI controls from data's domain.
        cont_vars = [
            v for v in chain(data.domain.metas, data.domain.attributes)
            if v.is_continuous
        ]
        self.model_other[:] = cont_vars[3:]
        self.model_selected[:] = cont_vars[:3]

    def prepare_plot_data(self, variables):
        """Project the columns of ``variables`` onto the placement's axes.

        Returns:
            ``(valid_mask, coordinates, axes)``, or ``(None, None, None)``
            when no axes are available for the current placement.
        """
        def projection(variables):
            # Match the projection's attributes by identity, then by name.
            if set(self.projection.domain.attributes).issuperset(variables):
                axes = self.projection[:2, variables].X
            elif set(f.name for f in
                     self.projection.domain.attributes).issuperset(
                         f.name for f in variables):
                axes = self.projection[:2, [f.name for f in variables]].X
            else:
                self.Error.proj_and_domain_match()
                axes = None
            return axes

        def get_axes(variables):
            self.Error.proj_and_domain_match.clear()
            axes = None
            if self.placement == self.Placement.Circular:
                axes = LinProj.defaultaxes(len(variables))
            elif self.placement == self.Placement.LDA:
                axes = self._get_lda(self.data, variables)
            elif self.placement == self.Placement.Projection \
                    and self.projection:
                axes = projection(variables)
            return axes

        coords = [
            column_data(self.data, var, dtype=float) for var in variables
        ]
        coords = np.vstack(coords)
        p, N = coords.shape
        # Both dimensions are checked; previously the second comparison was
        # parsed as the assertion message and never evaluated as a check.
        assert N == len(self.data) and p == len(variables)

        axes = get_axes(variables)
        if axes is None:
            return None, None, None
        assert axes.shape == (2, p)

        # Rows with any missing value are excluded from the projection.
        valid_mask = ~np.isnan(coords).any(axis=0)
        coords = coords[:, valid_mask]

        X, Y = np.dot(axes, coords)
        if X.size and Y.size:
            X = normalized(X)
            Y = normalized(Y)

        return valid_mask, np.stack((X, Y), axis=1), axes.T

    def _setup_plot(self):
        """Recompute the projection for the current placement and plot it."""
        self._clear_plot()
        if self.data is None:
            return
        self.__replot_requested = False
        names = get_unique_names(
            [
                v.name for v in chain(self.data.domain.variables,
                                      self.data.domain.metas)
            ],
            [
                "{}-x".format(self.Variable_name[self.placement]),
                "{}-y".format(self.Variable_name[self.placement])
            ]
        )
        self.variable_x = ContinuousVariable(names[0])
        self.variable_y = ContinuousVariable(names[1])
        if self.placement in [self.Placement.Circular, self.Placement.LDA]:
            variables = list(self.model_selected)
        elif self.placement == self.Placement.Projection:
            variables = self.model_selected[:] + self.model_other[:]
        elif self.placement == self.Placement.PCA:
            variables = [
                var for var in self.data.domain.attributes
                if var.is_continuous
            ]
        if not variables:
            self.graph.new_data(None)
            return
        if self.placement == self.Placement.PCA:
            valid_mask, ec, axes = self._get_pca()
            variables = self._pca.orig_domain.attributes
        else:
            valid_mask, ec, axes = self.prepare_plot_data(variables)

        self.plotdata.variables = variables
        self.plotdata.valid_mask = valid_mask
        self.plotdata.embedding_coords = ec
        self.plotdata.axes = axes
        if any(e is None for e in (valid_mask, ec, axes)):
            return

        if not sum(valid_mask):
            self.Error.no_valid_data()
            self.graph.new_data(None, None)
            return
        self.Error.no_valid_data.clear()

        self._anchor_circle(variables=variables)
        self._plot()

    def _plot(self):
        """Append the coordinates as metas and hand the data to the graph."""
        domain = self.data.domain
        new_metas = domain.metas + (self.variable_x, self.variable_y)
        domain = Domain(attributes=domain.attributes,
                        class_vars=domain.class_vars,
                        metas=new_metas)
        valid_mask = self.plotdata.valid_mask
        # The builtin float is used because the np.float alias was removed
        # from NumPy.
        array = np.zeros((len(self.data), 2), dtype=float)
        array[valid_mask] = self.plotdata.embedding_coords
        self.plotdata.data = data = self.data.transform(domain)
        data[:, self.variable_x] = array[:, 0].reshape(-1, 1)
        data[:, self.variable_y] = array[:, 1].reshape(-1, 1)
        subset_data = data[self._subset_mask & valid_mask]\
            if self._subset_mask is not None and len(self._subset_mask) \
            else None
        self.plotdata.data = data
        self.graph.new_data(data[valid_mask], subset_data)
        if self._selection is not None:
            self.graph.selection = self._selection[valid_mask]
        self.graph.update_data(self.variable_x, self.variable_y, False)

    def _get_lda(self, data, variables):
        """Return the first two LDA scaling vectors as rows."""
        domain = Domain(attributes=variables,
                        class_vars=data.domain.class_vars)
        data = data.transform(domain)
        lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
        lda.fit(data.X, data.Y)
        scalings = lda.scalings_[:, :2].T
        if scalings.shape == (1, 1):
            scalings = np.array([[1.], [0.]])
        return scalings

    def _get_pca(self):
        """Fit a two-component PCA on the data.

        Returns:
            ``(valid_mask, coords, axes)``; the fitted model is stored in
            ``self._pca``.
        """
        data = self.data
        MAX_COMPONENTS = 2
        ncomponents = 2
        DECOMPOSITIONS = [PCA]  # TruncatedSVD
        cls = DECOMPOSITIONS[0]
        pca_projector = cls(n_components=MAX_COMPONENTS)
        pca_projector.component = ncomponents
        pca_projector.preprocessors = cls.preprocessors + [Normalize()]

        pca = pca_projector(data)
        variance_ratio = pca.explained_variance_ratio_
        cumulative = np.cumsum(variance_ratio)

        self._pca = pca
        if not np.isfinite(cumulative[-1]):
            self.Warning.trivial_components()

        coords = pca(data).X
        valid_mask = ~np.isnan(coords).any(axis=1)
        # scale axes
        max_radius = np.min(
            [np.abs(np.min(coords, axis=0)), np.max(coords, axis=0)])
        axes = pca.components_.T.copy()
        axes *= max_radius / np.max(np.linalg.norm(axes, axis=1))
        return valid_mask, coords, axes

    def _update_graph(self, reset_view=False):
        """Clear the zoom stack and redraw the graph if it has data."""
        self.graph.zoomStack = []
        if self.graph.data is None:
            return
        self.graph.update_data(self.variable_x, self.variable_y, reset_view)

    def update_density(self):
        """Redraw the graph without resetting the view."""
        self._update_graph(reset_view=False)

    def selection_changed(self):
        """Store the graph's selection per data row and commit."""
        if self.graph.selection is not None:
            self._selection = np.zeros(len(self.data), dtype=np.uint8)
            self._selection[self.plotdata.valid_mask] = self.graph.selection
            self.selection_indices = self._selection.tolist()
        else:
            self._selection = self.selection_indices = None
        self.commit()

    def prepare_data(self):
        """Do nothing."""
        pass

    def commit(self):
        """Send the selected data, annotated data and components."""
        def prepare_components():
            if self.placement in [self.Placement.Circular,
                                  self.Placement.LDA]:
                attrs = list(self.model_selected[:])
                axes = self.plotdata.axes
            elif self.placement == self.Placement.PCA:
                axes = self._pca.components_.T
                attrs = list(self._pca.orig_domain.attributes)
            if self.placement != self.Placement.Projection:
                domain = Domain(
                    [
                        ContinuousVariable(a.name,
                                           compute_value=lambda _: None)
                        for a in attrs
                    ],
                    metas=[StringVariable(name='component')]
                )
                metas = np.array(
                    [[
                        "{}{}".format(self.Component_name[self.placement],
                                      i + 1)
                        for i in range(axes.shape[1])
                    ]],
                    dtype=object
                ).T
                components = Table(domain, axes.T, metas=metas)
                components.name = 'components'
            else:
                # An input projection is passed through unchanged.
                components = self.projection
            return components

        selected = annotated = components = None
        if self.data is not None and self.plotdata.data is not None:
            components = prepare_components()

            graph = self.graph
            # Expand the selection over the valid rows to all data rows.
            mask = self.plotdata.valid_mask.astype(int)
            mask[mask == 1] = graph.selection \
                if graph.selection is not None else 0
            selection = np.flatnonzero(mask)
            name = self.data.name
            data = self.plotdata.data
            if len(selection):
                selected = data[selection]
                selected.name = name + ": selected"
                selected.attributes = self.data.attributes
            if graph.selection is not None and np.max(graph.selection) > 1:
                annotated = create_groups_table(data, mask)
            else:
                annotated = create_annotated_table(data, selection)
            annotated.attributes = self.data.attributes
            annotated.name = name + ": annotated"

        self.Outputs.selected_data.send(selected)
        self.Outputs.annotated_data.send(annotated)
        self.Outputs.components.send(components)

    def send_report(self):
        """Report the plot with a caption describing its settings."""
        if self.data is None:
            return

        def name(var):
            return var and var.name

        def projection_name():
            name = ("Circular Placement",
                    "Linear Discriminant Analysis",
                    "Principal Component Analysis",
                    "Input projection")
            return name[self.placement]

        caption = report.render_items_vert((
            ("Projection", projection_name()),
            ("Color", name(self.graph.attr_color)),
            ("Label", name(self.graph.attr_label)),
            ("Shape", name(self.graph.attr_shape)),
            ("Size", name(self.graph.attr_size)),
            ("Jittering", self.graph.jitter_size != 0 and
             "{} %".format(self.graph.jitter_size))))
        self.report_plot()
        if caption:
            self.report_caption(caption)

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Migrate stored settings to the layout of settings version 3."""
        if version < 2:
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            # Graph-related settings are moved under the "graph" key.
            settings_graph = {}
            settings_graph["jitter_size"] = settings_["jitter_value"]
            settings_graph["point_width"] = settings_["point_width"]
            settings_graph["alpha_value"] = settings_["alpha_value"]
            settings_graph["class_density"] = settings_["class_density"]
            settings_["graph"] = settings_graph

    @classmethod
    def migrate_context(cls, context, version):
        """Migrate a stored context to the layout of settings version 3."""
        if version < 2:
            domain = context.ordered_domain
            c_domain = [t for t in context.ordered_domain if t[1] == 2]
            d_domain = [t for t in context.ordered_domain if t[1] == 1]
            for d, old_val, new_val in ((domain, "color_index", "attr_color"),
                                        (d_domain, "shape_index",
                                         "attr_shape"),
                                        (c_domain, "size_index",
                                         "attr_size")):
                index = context.values[old_val][0] - 1
                context.values[new_val] = \
                    (d[index][0], d[index][1] + 100) \
                    if 0 <= index < len(d) else None
        if version < 3:
            context.values["graph"] = {
                "attr_color": context.values["attr_color"],
                "attr_shape": context.values["attr_shape"],
                "attr_size": context.values["attr_size"]
            }
if self.always_show_axes: self.plot_widget.removeItem(self.circle_item) self.circle_item = None if self.circle_item is not None: points, _ = self.master.get_anchors() if points is None: return r = self.scaled_radius * np.max(np.linalg.norm(points, axis=1)) self.circle_item.setRect(QRectF(-r, -r, 2 * r, 2 * r)) pen = pg.mkPen(QColor(Qt.lightGray), width=1, cosmetic=True) self.circle_item.setPen(pen) Placement = Enum("Placement", dict(Circular=0, LDA=1, PCA=2), type=int, qualname="Placement") class OWLinearProjection(OWAnchorProjectionWidget): name = "Linear Projection" description = "A multi-axis projection of data onto " \ "a two-dimensional plane." icon = "icons/LinearProjection.svg" priority = 240 keywords = [] Projection_name = {Placement.Circular: "Circular Placement", Placement.LDA: "Linear Discriminant Analysis", Placement.PCA: "Principal Component Analysis"} settings_version = 6
class OWLinearProjection(OWAnchorProjectionWidget):
    """Widget showing a linear projection of numeric variables onto a plane.

    The user picks the variables to project and one of the placements in
    ``Placement`` (circular, LDA or PCA); the matching projector is created
    in ``init_projection`` and the plot is rebuilt whenever the variable
    selection or the placement changes.
    """
    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    # ``qualname`` lets pickle locate the enum as an attribute of the widget.
    Placement = Enum("Placement", dict(Circular=0, LDA=1, PCA=2),
                     type=int, qualname="OWLinearProjection.Placement")

    # Human-readable labels; used for the radio buttons and the report.
    Projection_name = {
        Placement.Circular: "Circular Placement",
        Placement.LDA: "Linear Discriminant Analysis",
        Placement.PCA: "Principal Component Analysis"
    }

    settings_version = 5

    placement = Setting(Placement.Circular)
    selected_vars = ContextSetting([])
    vizrank = SettingProvider(LinearProjectionVizRank)
    GRAPH_CLASS = OWLinProjGraph
    graph = SettingProvider(OWLinProjGraph)
    left_side_scrolling = True

    class Error(OWAnchorProjectionWidget.Error):
        no_cont_features = Msg("Plotting requires numeric features")

    def __init__(self):
        """Create the variable list models and the VizRank button."""
        # The models and the vizrank button are created before
        # super().__init__() because _add_controls(), which uses them, is
        # presumably invoked from the base constructor -- TODO confirm.
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_selected.removed.connect(self.__model_selected_changed)
        self.model_other = VariableListModel(enable_dnd=True)

        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            None, self, "Suggest Features", self.__vizrank_set_attrs)
        super().__init__()

    def _add_controls(self):
        """Build the control area: variables, placement, base controls and
        the "Hide radius" slider."""
        self._add_controls_variables()
        self._add_controls_placement()
        super()._add_controls()
        self.gui.add_control(
            self._effects_box, gui.hSlider, "Hide radius:", master=self.graph,
            value="hide_radius", minValue=0, maxValue=100, step=10,
            createLabel=False, callback=self.__radius_slider_changed)
        # Detach the stretch widget from the control area layout.
        self.controlArea.layout().removeWidget(self.control_area_stretch)
        self.control_area_stretch.setParent(None)

    def _add_controls_variables(self):
        """Add the selected/other variable lists and the VizRank button."""
        self.variables_selection = VariablesSelection(
            self, self.model_selected, self.model_other, self.controlArea)
        self.variables_selection.added.connect(self.__model_selected_changed)
        self.variables_selection.removed.connect(self.__model_selected_changed)
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank)

    def _add_controls_placement(self):
        """Add one radio button per ``Placement`` member."""
        box = gui.widgetBox(
            self.controlArea, True,
            sizePolicy=(QSizePolicy.Minimum, QSizePolicy.Maximum))
        self.radio_placement = gui.radioButtonsInBox(
            box, self, "placement",
            btnLabels=[self.Projection_name[x] for x in self.Placement],
            callback=self.__placement_radio_changed)

    @property
    def continuous_variables(self):
        """Continuous variables and metas of the data; ``[]`` without data."""
        if self.data is None or self.data.domain is None:
            return []
        dom = self.data.domain
        return [v for v in chain(dom.variables, dom.metas) if v.is_continuous]

    @property
    def effective_variables(self):
        """A copy of the currently selected variables."""
        return self.model_selected[:]

    def __vizrank_set_attrs(self, attrs):
        """Apply a VizRank suggestion; an empty suggestion is ignored."""
        if not attrs:
            return
        self.model_selected[:] = attrs[:]
        self.model_other[:] = [var for var in self.continuous_variables
                               if var not in attrs]
        self.__model_selected_changed()

    def __model_selected_changed(self):
        """Store the selection as a setting and rebuild projection and plot."""
        self.selected_vars = [(var.name, vartype(var)) for var
                              in self.model_selected]
        self.projection = None
        self._check_options()
        self.init_projection()
        self.setup_plot()
        self.commit()

    def __placement_radio_changed(self):
        """React to a new placement: reset the projector and replot."""
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)
        self.projection = self.projector = None
        self._init_vizrank()
        self.init_projection()
        self.setup_plot()
        self.commit()

    def __radius_slider_changed(self):
        self.graph.update_radius()

    def colors_changed(self):
        """Re-evaluate VizRank availability after the base class handling."""
        super().colors_changed()
        self._init_vizrank()

    def set_data(self, data):
        """Accept new data, then refresh options, VizRank and projector."""
        super().set_data(data)
        self._check_options()
        self._init_vizrank()
        self.init_projection()

    def use_context(self):
        """Fill the variable lists from ``selected_vars`` or from defaults.

        Without a stored selection the first three continuous variables are
        selected and the remaining ones go to the "other" list.
        """
        self.model_selected.clear()
        self.model_other.clear()
        if self.data is not None and len(self.selected_vars):
            d, selected = self.data.domain, [v[0] for v in self.selected_vars]
            self.model_selected[:] = [d[attr] for attr in selected]
            self.model_other[:] = [d[attr.name] for attr in
                                   self.continuous_variables
                                   if attr.name not in selected]
        elif self.data is not None:
            self.model_selected[:] = self.continuous_variables[:3]
            self.model_other[:] = self.continuous_variables[3:]

    def _check_options(self):
        """Enable the placement buttons that are valid for the data.

        LDA is disabled (and the placement falls back to Circular) when the
        class is not discrete or ``Y`` holds fewer than three distinct values.
        """
        buttons = self.radio_placement.buttons
        for btn in buttons:
            btn.setEnabled(True)
        if self.data is not None:
            has_discrete_class = self.data.domain.has_discrete_class
            if not has_discrete_class or len(np.unique(self.data.Y)) < 3:
                buttons[self.Placement.LDA].setEnabled(False)
                if self.placement == self.Placement.LDA:
                    self.placement = self.Placement.Circular

        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)

    def _init_vizrank(self):
        """Enable or disable the VizRank button and set its tooltip.

        The first failing precondition supplies the tooltip message; when all
        hold, the button is enabled unless the color column is entirely NaN.
        """
        is_enabled, msg = False, ""
        if self.data is None:
            msg = "There is no data."
        elif self.attr_color is None:
            msg = "Color variable has to be selected"
        elif self.attr_color.is_continuous and \
                self.placement == self.Placement.LDA:
            msg = "Suggest Features does not work for Linear " \
                  "Discriminant Analysis Projection when " \
                  "continuous color variable is selected."
        elif len([v for v in self.continuous_variables
                  if v is not self.attr_color]) < 3:
            msg = "Not enough available continuous variables"
        elif np.sum(np.all(np.isfinite(self.data.X), axis=1)) < 2:
            msg = "Not enough valid data instances"
        else:
            is_enabled = not np.isnan(self.data.get_column_view(
                self.attr_color)[0].astype(float)).all()
        self.btn_vizrank.setToolTip(msg)
        self.btn_vizrank.setEnabled(is_enabled)
        if is_enabled:
            self.vizrank.initialize()

    def check_data(self):
        """Reject data without any continuous variable."""
        def error(err):
            # Show the message and drop the data.
            err()
            self.data = None

        super().check_data()
        if self.data is not None:
            if not len(self.continuous_variables):
                error(self.Error.no_cont_features)

    def init_attr_values(self):
        super().init_attr_values()
        self.selected_vars = []

    def init_projection(self):
        """Instantiate the projector matching the current placement."""
        if self.placement == self.Placement.Circular:
            self.projector = CircularPlacement()
        elif self.placement == self.Placement.LDA:
            self.projector = LDA(solver="eigen", n_components=2)
        elif self.placement == self.Placement.PCA:
            self.projector = PCA(n_components=2)
            self.projector.component = 2
            self.projector.preprocessors = PCA.preprocessors + [Normalize()]

        super().init_projection()

    def get_coordinates_data(self):
        """Return the x and y coordinates of the valid rows.

        Each embedding column is centred on its mean and divided by its span
        (a zero span is replaced by 1).  A single-column embedding gets a
        zero-filled y coordinate.  Returns ``(None, None)`` when there is no
        embedding.
        """
        def normalized(a):
            span = np.max(a, axis=0) - np.min(a, axis=0)
            span[span == 0] = 1  # avoid division by zero for constant columns
            return (a - np.mean(a, axis=0)) / span

        embedding = self.get_embedding()
        if embedding is None:
            return None, None
        norm_emb = normalized(embedding[self.valid_data])
        return (norm_emb.ravel(), np.zeros(len(norm_emb), dtype=float)) \
            if embedding.shape[1] == 1 else norm_emb.T

    def _get_send_report_caption(self):
        """Render the report caption describing the current view."""
        def projection_name():
            return self.Projection_name[self.placement]

        return report.render_items_vert((
            ("Projection", projection_name()),
            ("Color", self._get_caption_var_name(self.attr_color)),
            ("Label", self._get_caption_var_name(self.attr_label)),
            ("Shape", self._get_caption_var_name(self.attr_shape)),
            ("Size", self._get_caption_var_name(self.attr_size)),
            ("Jittering", self.graph.jitter_size != 0 and
             "{} %".format(self.graph.jitter_size))))

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Upgrade a stored settings dictionary in place.

        Parameters
        ----------
        settings_ : dict
            Stored settings; modified in place.
        version : int
            Version under which the settings were stored.
        """
        if version < 2:
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            settings_graph = {}
            settings_graph["jitter_size"] = settings_["jitter_value"]
            settings_graph["point_width"] = settings_["point_width"]
            settings_graph["alpha_value"] = settings_["alpha_value"]
            settings_graph["class_density"] = settings_["class_density"]
            settings_["graph"] = settings_graph
        if version < 4:
            if "radius" in settings_:
                settings_["graph"]["hide_radius"] = settings_["radius"]
            if "selection_indices" in settings_ and \
                    settings_["selection_indices"] is not None:
                selection = settings_["selection_indices"]
                settings_["selection"] = [(i, 1) for i, selected
                                          in enumerate(selection) if selected]
        if version < 5:
            if "placement" in settings_:
                # Look the value up instead of using ``in cls.Placement``:
                # the containment test raises TypeError for plain ints on
                # Python 3.8-3.11 and treats them as non-members before 3.8.
                try:
                    cls.Placement(settings_["placement"])
                except ValueError:
                    settings_["placement"] = cls.Placement.Circular

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored context in place.

        Parameters
        ----------
        context
            Stored context; its ``values`` mapping is modified in place.
        version : int
            Version under which the context was stored.
        """
        if version < 2:
            domain = context.ordered_domain
            c_domain = [t for t in context.ordered_domain if t[1] == 2]
            d_domain = [t for t in context.ordered_domain if t[1] == 1]
            for d, old_val, new_val in ((domain, "color_index", "attr_color"),
                                        (d_domain, "shape_index", "attr_shape"),
                                        (c_domain, "size_index", "attr_size")):
                # Stored indices are shifted by one; anything out of range
                # maps to None.
                index = context.values[old_val][0] - 1
                context.values[new_val] = (d[index][0], d[index][1] + 100) \
                    if 0 <= index < len(d) else None
        if version < 3:
            context.values["graph"] = {
                "attr_color": context.values["attr_color"],
                "attr_shape": context.values["attr_shape"],
                "attr_size": context.values["attr_size"]
            }
        if version == 3:
            values = context.values
            values["attr_color"] = values["graph"]["attr_color"]
            values["attr_size"] = values["graph"]["attr_size"]
            values["attr_shape"] = values["graph"]["attr_shape"]
            values["attr_label"] = values["graph"]["attr_label"]
class Randomize(Preprocess):
    """
    Preprocessor that shuffles classes, attributes and/or metas.

    Calling the preprocessor on a data table returns a new table in which
    the chosen parts are shuffled column by column.

    Parameters
    ----------
    rand_type : Randomize.Type (default: Randomize.RandomizeClasses)
        What to shuffle. The members are bit flags and can be combined:
        Randomize.RandomizeClasses shuffles the classes,
        Randomize.RandomizeAttributes the attributes and
        Randomize.RandomizeMetas the metas.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    Type = Enum("Randomize",
                dict(RandomizeClasses=1,
                     RandomizeAttributes=2,
                     RandomizeMetas=4),
                type=int,
                qualname="Randomize.Type")
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Return a shuffled copy of the given data.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.
        """
        shuffled = data.copy()
        # Draw one seed per table part so that X, Y and metas are never
        # shuffled with the same seed.
        seeds = np.random.RandomState(self.rand_seed).randint(
            0, 2**32 - 1, size=3, dtype=np.int64)
        parts = (
            (Randomize.RandomizeClasses, "Y"),
            (Randomize.RandomizeAttributes, "X"),
            (Randomize.RandomizeMetas, "metas"),
        )
        for (flag, part), seed in zip(parts, seeds):
            if self.rand_type & flag:
                setattr(shuffled, part,
                        self.randomize(getattr(shuffled, part), seed))
        return shuffled

    def randomize(self, table, rand_state=None):
        """
        Shuffle every column of `table` independently and return the result.

        Dense arrays are shuffled in place. Sparse input is converted with
        ``tocsc()`` and the row indices of each column's stored entries are
        remapped through a fresh permutation.

        Parameters
        ----------
        table : np.ndarray or scipy.sparse matrix
            One- or two-dimensional data to shuffle.
        rand_state : int (optional)
            Seed for the ``RandomState`` used for shuffling.
        """
        rng = np.random.RandomState(rand_state)

        if sp.issparse(table):
            table = table.tocsc()  # type: sp.spmatrix
            n_rows = table.shape[0]
            for col in range(table.shape[1]):
                row_map = rng.permutation(n_rows)
                start, stop = table.indptr[col], table.indptr[col + 1]
                stored_rows = table.indices[start:stop]
                stored_rows[:] = row_map[stored_rows]
            return table

        if len(table.shape) > 1:
            for col in range(table.shape[1]):
                rng.shuffle(table[:, col])
            return table

        rng.shuffle(table)
        return table
class Normalize(Preprocess):
    """
    Preprocessor that normalizes features.

    Calling the preprocessor on a data table returns a new table in which
    the continuous attributes are normalized.

    Parameters
    ----------
    zero_based : bool (default=True)
        Selects the "low" value of the normalized variable, i.e. whether
        continuous variables end up in [-1, 1] or in [0, 1].

    norm_type : Normalize.Type (default: Normalize.NormalizeBySD)
        Normalization type. With Normalize.NormalizeBySD the values are
        standardized: the average is subtracted and the result divided by
        the standard deviation; zero_based has no effect in this case.
        With Normalize.NormalizeBySpan the minimum is subtracted and the
        result divided by the span (max - min).

    transform_class : bool (default=False)
        If True the class is normalized as well.

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Normalize
    >>> data = Table("iris")
    >>> normalizer = Normalize(norm_type=Normalize.NormalizeBySpan)
    >>> normalized_data = normalizer(data)
    """
    Type = Enum("Normalize", ("NormalizeBySpan", "NormalizeBySD"),
                qualname="Normalize.Type")
    NormalizeBySpan, NormalizeBySD = Type

    def __init__(self,
                 zero_based=True,
                 norm_type=NormalizeBySD,
                 transform_class=False):
        self.zero_based = zero_based
        self.norm_type = norm_type
        self.transform_class = transform_class

    def __call__(self, data):
        """
        Normalize the given data and return the result as a new table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be normalized.

        Returns
        -------
        data : Orange.data.Table
            Normalized data table.
        """
        from . import normalize

        # When every continuous feature is flagged as already normalized,
        # hand the data back untouched. SVMs use this preprocessor by default
        # and need it on sparse data (e.g. bag of words): normalizing would
        # densify the matrix and take too much memory.
        continuous = (attr for attr in data.domain.attributes
                      if attr.is_continuous)
        needs_normalization = any(
            not attr.attributes.get('skip-normalization', False)
            for attr in continuous)
        if not needs_normalization:
            return data

        return normalize.Normalizer(
            zero_based=self.zero_based,
            norm_type=self.norm_type,
            transform_class=self.transform_class)(data)
class Randomize(Preprocess):
    """
    Construct a preprocessor for randomization of classes,
    attributes and/or metas.
    Given a data table, preprocessor returns a new table in
    which the data is shuffled.

    Parameters
    ----------
    rand_type : Randomize.Type (default: Randomize.RandomizeClasses)
        Randomization type. The members are bit flags and may be combined.
        If Randomize.RandomizeClasses, classes are shuffled.
        If Randomize.RandomizeAttributes, attributes are shuffled.
        If Randomize.RandomizeMetas, metas are shuffled.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    # ``qualname`` points pickle at ``Randomize.Type``; without it the enum
    # would be looked up under the name "Randomize", which is this class.
    Type = Enum("Randomize",
                dict(RandomizeClasses=1,
                     RandomizeAttributes=2,
                     RandomizeMetas=4),
                type=int,
                qualname="Randomize.Type")
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.
        """
        new_data = data.copy()
        rstate = np.random.RandomState(self.rand_seed)
        # Derive a separate seed for each part. Shuffling X, Y and metas with
        # the same seed would apply the same permutation to all of them and
        # leave their rows aligned.
        r1, r2, r3 = rstate.randint(0, 2**32 - 1, size=3, dtype=np.int64)
        if self.rand_type & Randomize.RandomizeClasses:
            new_data.Y = self.randomize(new_data.Y, r1)
        if self.rand_type & Randomize.RandomizeAttributes:
            new_data.X = self.randomize(new_data.X, r2)
        if self.rand_type & Randomize.RandomizeMetas:
            new_data.metas = self.randomize(new_data.metas, r3)
        return new_data

    def randomize(self, table, rand_state=None):
        """
        Return `table` shuffled with ``skl_shuffle``.

        Parameters
        ----------
        table : array-like
            Data to shuffle.
        rand_state : int (optional)
            Seed for this shuffle. When omitted, ``self.rand_seed`` is used,
            which keeps one-argument calls working as before.
        """
        if rand_state is None:
            rand_state = self.rand_seed
        return skl_shuffle(table, random_state=rand_state)
class OWLinearProjection(OWAnchorProjectionWidget):
    """Widget showing a linear projection of numeric variables onto a plane.

    Besides the circular, LDA and PCA placements, the widget accepts an
    externally computed projection on its "Projection" input and can use it
    instead of computing its own axes.
    """
    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    class Inputs(OWAnchorProjectionWidget.Inputs):
        projection_input = Input("Projection", Table)

    # ``qualname`` lets pickle locate the enum as an attribute of the widget.
    Placement = Enum("Placement",
                     dict(Circular=0,
                          LDA=1,
                          PCA=2,
                          Projection=3),
                     type=int,
                     qualname="OWLinearProjection.Placement")

    # Prefix of the component labels written by send_components().
    Component_name = {Placement.Circular: "C", Placement.LDA: "LD",
                      Placement.PCA: "PC"}
    # Prefix of the embedding variable names ("<prefix>-x", "<prefix>-y").
    Variable_name = {Placement.Circular: "circular",
                     Placement.LDA: "lda",
                     Placement.PCA: "pca",
                     Placement.Projection: "projection"}
    # Human-readable labels; used for the radio buttons and the report.
    Projection_name = {Placement.Circular: "Circular Placement",
                       Placement.LDA: "Linear Discriminant Analysis",
                       Placement.PCA: "Principal Component Analysis",
                       Placement.Projection: "Use input projection"}

    settings_version = 4

    placement = Setting(Placement.Circular)
    selected_vars = ContextSetting([])
    vizrank = SettingProvider(LinearProjectionVizRank)
    GRAPH_CLASS = OWLinProjGraph
    graph = SettingProvider(OWLinProjGraph)

    class Warning(OWAnchorProjectionWidget.Warning):
        not_enough_comp = Msg("Input projection has less than two components")
        trivial_components = Msg(
            "All components of the PCA are trivial (explain zero variance). "
            "Input data is constant (or near constant).")

    class Error(OWAnchorProjectionWidget.Error):
        no_cont_features = Msg("Plotting requires numeric features")
        proj_and_domain_match = Msg("Projection and Data domains do not match")

    def __init__(self):
        """Create the variable list models and the VizRank button."""
        # The models and the vizrank button are created before
        # super().__init__() because _add_controls(), which uses them, is
        # presumably invoked from the base constructor -- TODO confirm.
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_selected.rowsInserted.connect(self.__model_selected_changed)
        self.model_selected.rowsRemoved.connect(self.__model_selected_changed)
        self.model_other = VariableListModel(enable_dnd=True)

        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            None, self, "Suggest Features", self.__vizrank_set_attrs)
        super().__init__()
        self.projection_input = None
        self.variables = None

    def _add_controls(self):
        """Build the control area: variables, placement, base controls and
        the "Hide radius" slider."""
        self._add_controls_variables()
        self._add_controls_placement()
        super()._add_controls()
        self.graph.gui.add_control(
            self._effects_box, gui.hSlider, "Hide radius:", master=self.graph,
            value="hide_radius", minValue=0, maxValue=100, step=10,
            createLabel=False, callback=self.__radius_slider_changed
        )
        # Detach the stretch widget from the control area layout.
        self.controlArea.layout().removeWidget(self.control_area_stretch)
        self.control_area_stretch.setParent(None)

    def _add_controls_variables(self):
        """Add the selected/other variable lists and the VizRank button."""
        self.variables_selection = VariablesSelection(
            self, self.model_selected, self.model_other, self.controlArea
        )
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank
        )

    def _add_controls_placement(self):
        """Add one radio button per ``Placement`` member."""
        box = gui.widgetBox(
            self.controlArea, True,
            sizePolicy=(QSizePolicy.Minimum, QSizePolicy.Maximum)
        )
        self.radio_placement = gui.radioButtonsInBox(
            box, self, "placement",
            btnLabels=[self.Projection_name[x] for x in self.Placement],
            callback=self.__placement_radio_changed
        )

    @property
    def continuous_variables(self):
        """Continuous variables and metas of the data; ``[]`` without data."""
        if self.data is None or self.data.domain is None:
            return []
        dom = self.data.domain
        return [v for v in chain(dom.variables, dom.metas) if v.is_continuous]

    def __vizrank_set_attrs(self, attrs):
        """Apply a VizRank suggestion; an empty suggestion is ignored."""
        if not attrs:
            return
        self.model_selected[:] = attrs[:]
        self.model_other[:] = [var for var in self.continuous_variables
                               if var not in attrs]

    def __model_selected_changed(self):
        """Store the selection as a setting and rebuild the plot."""
        self.selected_vars = [(var.name, vartype(var)) for var
                              in self.model_selected]
        self.projection = None
        self.variables = None
        self._check_options()
        self.setup_plot()
        self.commit()

    def __placement_radio_changed(self):
        """React to a new placement: reset cached projection and replot."""
        self.variables_selection.set_enabled(
            self.placement in [self.Placement.Circular, self.Placement.LDA])
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)
        self.projection = None
        self.variables = None
        self._init_vizrank()
        self.setup_plot()
        self.commit()

    def __radius_slider_changed(self):
        self.graph.update_radius()

    def colors_changed(self):
        """Re-evaluate VizRank availability after the base class handling."""
        super().colors_changed()
        self._init_vizrank()

    def set_data(self, data):
        """Accept new data and fill the variable lists.

        A stored selection (``selected_vars``) is restored when present;
        otherwise the first three continuous variables are selected.
        """
        super().set_data(data)
        if self.data is not None and len(self.selected_vars):
            d, selected = self.data.domain, [v[0] for v in self.selected_vars]
            self.model_selected[:] = [d[attr] for attr in selected]
            self.model_other[:] = [d[attr.name] for attr in
                                   self.continuous_variables
                                   if attr.name not in selected]
        elif self.data is not None:
            self.model_selected[:] = self.continuous_variables[:3]
            self.model_other[:] = self.continuous_variables[3:]

        self._check_options()
        self._init_vizrank()

    def _check_options(self):
        """Enable the placement buttons that are valid for the inputs.

        LDA is disabled when the class is not discrete or ``Y`` holds fewer
        than two distinct values; "Use input projection" is disabled without
        an input projection.  A placement that gets disabled falls back to
        Circular.
        """
        buttons = self.radio_placement.buttons
        for btn in buttons:
            btn.setEnabled(True)
        if self.data is not None:
            has_discrete_class = self.data.domain.has_discrete_class
            if not has_discrete_class or len(np.unique(self.data.Y)) < 2:
                buttons[self.Placement.LDA].setEnabled(False)
                if self.placement == self.Placement.LDA:
                    self.placement = self.Placement.Circular
            if not self.projection_input:
                buttons[self.Placement.Projection].setEnabled(False)
                if self.placement == self.Placement.Projection:
                    self.placement = self.Placement.Circular

        self.variables_selection.set_enabled(
            self.placement in [self.Placement.Circular, self.Placement.LDA])
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)

    def _init_vizrank(self):
        """Enable or disable the VizRank button and set its tooltip.

        The first failing precondition supplies the tooltip message; when all
        hold, the button is enabled unless the color column is entirely NaN.
        """
        is_enabled, msg = False, ""
        if self.data is None:
            msg = "There is no data."
        elif self.placement not in [self.Placement.Circular,
                                    self.Placement.LDA]:
            msg = "Suggest Features works only for Circular and " \
                  "Linear Discriminant Analysis Projection"
        elif self.attr_color is None:
            msg = "Color variable has to be selected"
        elif self.attr_color.is_continuous and \
                self.placement == self.Placement.LDA:
            msg = "Suggest Features does not work for Linear " \
                  "Discriminant Analysis Projection when " \
                  "continuous color variable is selected."
        elif len([v for v in self.continuous_variables
                  if v is not self.attr_color]) < 3:
            msg = "Not enough available continuous variables"
        elif len(self.data[self.valid_data]) < 2:
            msg = "Not enough valid data instances"
        else:
            is_enabled = not np.isnan(self.data.get_column_view(
                self.attr_color)[0].astype(float)).all()
        self.btn_vizrank.setToolTip(msg)
        self.btn_vizrank.setEnabled(is_enabled)
        if is_enabled:
            self.vizrank.initialize()

    def check_data(self):
        """Reject data without any continuous variable."""
        def error(err):
            # Show the message and drop the data.
            err()
            self.data = None

        super().check_data()
        if self.data is not None:
            if not len(self.continuous_variables):
                error(self.Error.no_cont_features)

    def init_attr_values(self):
        super().init_attr_values()
        self.selected_vars = []

    @Inputs.projection_input
    def set_projection(self, projection):
        """Handle the "Projection" input.

        A projection with fewer than two rows is rejected with a warning; a
        usable one switches the placement to ``Placement.Projection``.
        """
        self.Warning.not_enough_comp.clear()
        if projection and len(projection) < 2:
            self.Warning.not_enough_comp()
            projection = None
        if projection is not None:
            self.placement = self.Placement.Projection
        self.projection_input = projection
        self._check_options()

    def get_embedding(self):
        """Compute the two-dimensional embedding of the data.

        Returns
        -------
        np.ndarray or None
            Array of shape ``(len(self.data), 2)`` with zeros in the rows
            that are not valid, or ``None`` when there is no data, no
            variables, or no valid projection (in which case
            ``Error.no_valid_data`` is shown).
        """
        self.valid_data = None
        if self.data is None or not self.variables:
            return None

        if self.placement == self.Placement.PCA:
            self.valid_data, ec, self.projection = self._get_pca()
            self.variables = self._pca.orig_domain.attributes
        else:
            self.valid_data, ec, self.projection = \
                self.prepare_projection_data(self.variables)

        self.Error.no_valid_data.clear()
        if self.valid_data is None or not sum(self.valid_data) or \
                self.projection is None or ec is None:
            self.Error.no_valid_data()
            return None

        # ``float`` rather than the ``np.float`` alias, which was removed
        # from NumPy.
        embedding = np.zeros((len(self.data), 2), dtype=float)
        embedding[self.valid_data] = ec
        return embedding

    def prepare_projection_data(self, variables):
        """Project the data onto the axes of the current placement.

        Parameters
        ----------
        variables : sequence of variables
            Variables whose columns are projected.

        Returns
        -------
        tuple
            ``(valid_mask, coordinates, axes.T)``, or ``(None, None, None)``
            when no axes are available.  ``valid_mask`` marks the instances
            without NaN in any of the projected columns.
        """
        def projection(_vars):
            # Take the first two rows of the input projection, matching the
            # variables either directly or, failing that, by name.
            attrs = self.projection_input.domain.attributes
            if set(attrs).issuperset(_vars):
                return self.projection_input[:2, _vars].X
            elif set(f.name for f in attrs).issuperset(f.name for f in _vars):
                return self.projection_input[:2, [f.name for f in _vars]].X
            else:
                self.Error.proj_and_domain_match()
                return None

        def get_axes(_vars):
            self.Error.proj_and_domain_match.clear()
            if self.placement == self.Placement.Circular:
                return LinProj.defaultaxes(len(_vars))
            elif self.placement == self.Placement.LDA:
                return self._get_lda(self.data, _vars)
            elif self.placement == self.Placement.Projection and \
                    self.projection_input is not None:
                return projection(_vars)
            else:
                return None

        # np.vstack needs a real sequence; passing a generator is deprecated
        # and rejected by current NumPy.
        coords = np.vstack(
            [column_data(self.data, v, float) for v in variables])
        axes = get_axes(variables)
        if axes is None:
            return None, None, None

        valid_mask = ~np.isnan(coords).any(axis=0)
        X, Y = np.dot(axes, coords[:, valid_mask])
        if X.size and Y.size:
            X = normalized(X)
            Y = normalized(Y)
        return valid_mask, np.stack((X, Y), axis=1), axes.T

    def get_anchors(self):
        """Return the projection axes and the names of their variables."""
        if self.projection is None:
            return None, None
        return self.projection, [v.name for v in self.variables]

    def setup_plot(self):
        self.init_projection_variables()
        super().setup_plot()

    def init_projection_variables(self):
        """Choose the variables to project for the current placement."""
        self.variables = None
        if self.data is None:
            return

        if self.placement in [self.Placement.Circular, self.Placement.LDA]:
            self.variables = self.model_selected[:]
        elif self.placement == self.Placement.Projection:
            self.variables = self.model_selected[:] + self.model_other[:]
        elif self.placement == self.Placement.PCA:
            self.variables = [var for var in self.data.domain.attributes
                              if var.is_continuous]

    def _get_lda(self, data, variables):
        """Fit LDA on the given variables and return its first two scalings.

        A ``(1, 1)`` result is replaced by the fixed axes ``[[1.], [0.]]``.
        """
        data = data.transform(Domain(variables, data.domain.class_vars))
        lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
        lda.fit(data.X, data.Y)
        scalings = lda.scalings_[:, :2].T
        if scalings.shape == (1, 1):
            scalings = np.array([[1.], [0.]])
        return scalings

    def _get_pca(self):
        """Fit a two-component PCA on the data.

        Returns
        -------
        tuple
            ``(valid_mask, coords, axes)``: the mask of rows without NaN
            coordinates, the projected coordinates, and the component axes
            rescaled to the extent of the coordinates.  The fitted model is
            kept in ``self._pca``.
        """
        pca_projector = PCA(n_components=2)
        pca_projector.component = 2
        pca_projector.preprocessors = PCA.preprocessors + [Normalize()]

        pca = pca_projector(self.data)
        variance_ratio = pca.explained_variance_ratio_
        cumulative = np.cumsum(variance_ratio)

        self._pca = pca
        if not np.isfinite(cumulative[-1]):
            self.Warning.trivial_components()

        coords = pca(self.data).X
        valid_mask = ~np.isnan(coords).any(axis=1)
        # scale axes
        max_radius = np.min([np.abs(np.min(coords, axis=0)),
                             np.max(coords, axis=0)])
        axes = pca.components_.T.copy()
        axes *= max_radius / np.max(np.linalg.norm(axes, axis=1))
        return valid_mask, coords, axes

    def send_components(self):
        """Send the projection axes as a table on the components output.

        With ``Placement.Projection`` the input projection is passed on
        unchanged; ``None`` is sent when there is nothing to output.
        """
        components = None
        if self.data is not None and self.valid_data is not None and \
                self.projection is not None:
            if self.placement in [self.Placement.Circular,
                                  self.Placement.LDA]:
                axes = self.projection
                attrs = self.model_selected
            elif self.placement == self.Placement.PCA:
                axes = self._pca.components_.T
                attrs = self._pca.orig_domain.attributes
            if self.placement != self.Placement.Projection:
                meta_attrs = [StringVariable(name='component')]
                # One label per component, e.g. "PC1", "PC2".
                metas = np.array(
                    [["{}{}".format(self.Component_name[self.placement], i + 1)
                      for i in range(axes.shape[1])]], dtype=object).T
                components = Table(Domain(attrs, metas=meta_attrs),
                                   axes.T, metas=metas)
                components.name = self.data.name
            else:
                components = self.projection_input
        self.Outputs.components.send(components)

    def _get_projection_variables(self):
        """Name the embedding variables after the current placement."""
        pn = self.Variable_name[self.placement]
        self.embedding_variables_names = ("{}-x".format(pn), "{}-y".format(pn))
        return super()._get_projection_variables()

    def _get_send_report_caption(self):
        """Render the report caption describing the current view."""
        def projection_name():
            return self.Projection_name[self.placement]

        return report.render_items_vert((
            ("Projection", projection_name()),
            ("Color", self._get_caption_var_name(self.attr_color)),
            ("Label", self._get_caption_var_name(self.attr_label)),
            ("Shape", self._get_caption_var_name(self.attr_shape)),
            ("Size", self._get_caption_var_name(self.attr_size)),
            ("Jittering", self.graph.jitter_size != 0 and
             "{} %".format(self.graph.jitter_size))))

    def clear(self):
        """Forget the projected variables and empty both variable lists."""
        self.variables = None
        if self.model_selected:
            self.model_selected.clear()
        if self.model_other:
            self.model_other.clear()
        super().clear()

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Upgrade a stored settings dictionary in place.

        Parameters
        ----------
        settings_ : dict
            Stored settings; modified in place.
        version : int
            Version under which the settings were stored.
        """
        if version < 2:
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            settings_graph = {}
            settings_graph["jitter_size"] = settings_["jitter_value"]
            settings_graph["point_width"] = settings_["point_width"]
            settings_graph["alpha_value"] = settings_["alpha_value"]
            settings_graph["class_density"] = settings_["class_density"]
            settings_["graph"] = settings_graph
        if version < 4:
            if "radius" in settings_:
                settings_["graph"]["hide_radius"] = settings_["radius"]
            if "selection_indices" in settings_ and \
                    settings_["selection_indices"] is not None:
                selection = settings_["selection_indices"]
                settings_["selection"] = [(i, 1) for i, selected
                                          in enumerate(selection) if selected]

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored context in place.

        Parameters
        ----------
        context
            Stored context; its ``values`` mapping is modified in place.
        version : int
            Version under which the context was stored.
        """
        if version < 2:
            domain = context.ordered_domain
            c_domain = [t for t in context.ordered_domain if t[1] == 2]
            d_domain = [t for t in context.ordered_domain if t[1] == 1]
            for d, old_val, new_val in ((domain, "color_index", "attr_color"),
                                        (d_domain, "shape_index", "attr_shape"),
                                        (c_domain, "size_index", "attr_size")):
                # Stored indices are shifted by one; anything out of range
                # maps to None.
                index = context.values[old_val][0] - 1
                context.values[new_val] = (d[index][0], d[index][1] + 100) \
                    if 0 <= index < len(d) else None
        if version < 3:
            context.values["graph"] = {
                "attr_color": context.values["attr_color"],
                "attr_shape": context.values["attr_shape"],
                "attr_size": context.values["attr_size"]
            }
        if version == 3:
            values = context.values
            values["attr_color"] = values["graph"]["attr_color"]
            values["attr_size"] = values["graph"]["attr_size"]
            values["attr_shape"] = values["graph"]["attr_shape"]
            values["attr_label"] = values["graph"]["attr_label"]
class Randomize(Preprocess):
    """
    Construct a preprocessor for randomization of classes,
    attributes or metas.
    Given a data table, preprocessor returns a new table in
    which the data is shuffled.

    Parameters
    ----------
    rand_type : Randomize.Type (default: Randomize.RandomizeClasses)
        Randomization type. Exactly one of the following.
        If Randomize.RandomizeClasses, classes are shuffled.
        If Randomize.RandomizeAttributes, attributes are shuffled.
        If Randomize.RandomizeMetas, metas are shuffled.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    # ``qualname`` points pickle at ``Randomize.Type``; without it the enum
    # would be looked up under the name "Randomize", which is this class.
    Type = Enum("Randomize",
                "RandomizeClasses, RandomizeAttributes, RandomizeMetas",
                qualname="Randomize.Type")
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.

        Raises
        ------
        TypeError
            If ``rand_type`` is not one of the three supported types.
        """
        new_data = Table(data)
        new_data.ensure_copy()

        if self.rand_type == Randomize.RandomizeClasses:
            self.randomize(new_data.Y)
        elif self.rand_type == Randomize.RandomizeAttributes:
            self.randomize(new_data.X)
        elif self.rand_type == Randomize.RandomizeMetas:
            self.randomize(new_data.metas)
        else:
            raise TypeError('Unsupported type')

        return new_data

    def randomize(self, table):
        """
        Shuffle `table` in place, each column independently.

        Parameters
        ----------
        table : np.ndarray
            One- or two-dimensional array to shuffle.
        """
        # A private generator yields the same permutations as seeding the
        # global one, without resetting NumPy's global random state.
        rstate = np.random.RandomState(self.rand_seed)
        if len(table.shape) > 1:
            for i in range(table.shape[1]):
                rstate.shuffle(table[:, i])
        else:
            rstate.shuffle(table)