class MockWidget(OWBaseWidget):
    """Bare-bones widget that hosts a single ``DomainEditor``.

    The class attribute ``domain_editor`` registers a ``SettingProvider``
    for ``DomainEditor``; each instance then replaces it with an actual
    editor bound to that instance.
    """

    name = "mock"
    domain_editor = SettingProvider(DomainEditor)

    def __init__(self):
        # The instance attribute shadows the class-level provider.
        editor = DomainEditor(self)
        self.domain_editor = editor
class OWOutliers(OWWidget, ConcurrentWidgetMixin):
    """Outlier-detection widget whose computation is started through
    ``ConcurrentWidgetMixin`` (``start`` / ``cancel`` / ``shutdown``).

    The user picks one of four methods; every method has its own parameter
    editor, and only the editor of the selected method is shown.
    """

    name = "Outliers"
    description = "Detect outliers."
    icon = "icons/Outliers.svg"
    priority = 3000
    category = "Data"
    keywords = ["inlier"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        inliers = Output("Inliers", Table)
        outliers = Output("Outliers", Table)
        data = Output("Data", Table)

    want_main_area = False
    resizing_enabled = False

    # Method indices; they index both METHODS and self.editors.
    OneClassSVM, Covariance, LOF, IsolationForest = range(4)
    METHODS = (OneClassSVMLearner, EllipticEnvelopeLearner,
               LocalOutlierFactorLearner, IsolationForestLearner)
    svm_editor = SettingProvider(SVMEditor)
    cov_editor = SettingProvider(CovarianceEditor)
    lof_editor = SettingProvider(LocalOutlierFactorEditor)
    isf_editor = SettingProvider(IsolationForestEditor)

    settings_version = 2
    outlier_method = Setting(LOF)
    auto_commit = Setting(True)

    # Above this number of attributes the covariance method is disabled.
    MAX_FEATURES = 1500

    class Warning(OWWidget.Warning):
        disabled_cov = Msg("Too many features for covariance estimation.")

    class Error(OWWidget.Error):
        singular_cov = Msg("Singular covariance matrix.")
        memory_error = Msg("Not enough memory")

    def __init__(self):
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self.data = None  # type: Table
        self.n_inliers = None  # type: int
        self.n_outliers = None  # type: int
        self.editors = None  # type: Tuple[ParametersEditor]
        self.current_editor = None  # type: ParametersEditor
        self.method_combo = None  # type: QComboBox
        self.init_gui()

    def init_gui(self):
        """Build the method combo box, the parameter editors and the
        auto-apply control."""
        box = gui.vBox(self.controlArea, "Method")
        self.method_combo = gui.comboBox(box, self, "outlier_method",
                                         items=[m.name for m in self.METHODS],
                                         callback=self.__method_changed)

        self._init_editors()

        gui.auto_apply(self.buttonsArea, self, "auto_commit")

    def _init_editors(self):
        """Create one editor per method, add them all (hidden) to the
        "Parameters" box and show the one for the current method."""
        self.svm_editor = SVMEditor(self)
        self.cov_editor = CovarianceEditor(self)
        self.lof_editor = LocalOutlierFactorEditor(self)
        self.isf_editor = IsolationForestEditor(self)

        box = gui.vBox(self.controlArea, "Parameters")
        # Order must match the method indices defined on the class.
        self.editors = (self.svm_editor, self.cov_editor,
                        self.lof_editor, self.isf_editor)
        for editor in self.editors:
            # NOTE(review): the lambda defers the lookup of self.commit to
            # call time - presumably because commit is rebound later (e.g. by
            # gui.auto_apply); confirm before simplifying.
            editor.param_changed.connect(lambda: self.commit())
            box.layout().addWidget(editor)
            editor.hide()

        self.set_current_editor()

    def __method_changed(self):
        self.set_current_editor()
        self.commit()

    def set_current_editor(self):
        """Hide the editor shown so far and show the one that belongs to
        ``self.outlier_method``."""
        if self.current_editor:
            self.current_editor.hide()
        self.current_editor = self.editors[self.outlier_method]
        self.current_editor.show()

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """Input handler: cancel the running task, store ``data``, update
        the controls and commit."""
        self.cancel()
        self.clear_messages()
        self.data = data
        self.enable_controls()
        # NOTE(review): unconditional_commit is not defined in this class;
        # presumably it is provided by gui.auto_apply - confirm.
        self.unconditional_commit()

    def enable_controls(self):
        """Enable the covariance method, then disable it again (switching
        the widget to LOF and showing a warning) when the data has more than
        ``MAX_FEATURES`` attributes."""
        self.method_combo.model().item(self.Covariance).setEnabled(True)
        if self.data and len(self.data.domain.attributes) > self.MAX_FEATURES:
            self.outlier_method = self.LOF
            self.set_current_editor()
            self.method_combo.model().item(self.Covariance).setEnabled(False)
            self.Warning.disabled_cov()

    def commit(self):
        """Clear the detection errors and counts, build a learner from the
        current editor's parameters and start the task."""
        self.Error.singular_cov.clear()
        self.Error.memory_error.clear()
        self.n_inliers = self.n_outliers = None

        learner_class = self.METHODS[self.outlier_method]
        kwargs = self.current_editor.get_parameters()
        learner = learner_class(**kwargs)

        self.start(run, self.data, learner)

    def on_partial_result(self, _):
        pass

    def on_done(self, result: Results):
        """Store the inlier/outlier counts and send all three outputs."""
        inliers, outliers = result.inliers, result.outliers
        # Test against None rather than truthiness: an empty table is a valid
        # result with a count of 0, and must not be recorded as "no result"
        # (which would make send_report skip the report altogether).
        self.n_inliers = len(inliers) if inliers is not None else None
        self.n_outliers = len(outliers) if outliers is not None else None

        self.Outputs.inliers.send(inliers)
        self.Outputs.outliers.send(outliers)
        self.Outputs.data.send(result.annotated_data)

    def on_exception(self, ex):
        """Show ``ValueError`` and ``MemoryError`` as widget errors;
        re-raise anything else."""
        if isinstance(ex, ValueError):
            self.Error.singular_cov(ex)
        elif isinstance(ex, MemoryError):
            self.Error.memory_error()
        else:
            raise ex

    def onDeleteWidget(self):
        self.shutdown()
        super().onDeleteWidget()

    def send_report(self):
        """Report instance counts and the parameters of the selected method.

        Nothing is reported while the counts are unknown.
        """
        if self.n_outliers is None or self.n_inliers is None:
            return
        self.report_items("Data",
                          (("Input instances", len(self.data)),
                           ("Inliers", self.n_inliers),
                           ("Outliers", self.n_outliers)))

        params = self.current_editor.get_parameters()
        if self.outlier_method == self.OneClassSVM:
            self.report_items(
                "Detection",
                (("Detection method",
                  "One class SVM with non-linear kernel (RBF)"),
                 ("Regularization (nu)", params["nu"]),
                 ("Kernel coefficient", params["gamma"])))
        elif self.outlier_method == self.Covariance:
            self.report_items(
                "Detection",
                (("Detection method", "Covariance estimator"),
                 ("Contamination", params["contamination"]),
                 ("Support fraction", params["support_fraction"])))
        elif self.outlier_method == self.LOF:
            self.report_items(
                "Detection",
                (("Detection method", "Local Outlier Factor"),
                 ("Contamination", params["contamination"]),
                 ("Number of neighbors", params["n_neighbors"]),
                 ("Metric", params["metric"])))
        elif self.outlier_method == self.IsolationForest:
            self.report_items(
                "Detection",
                (("Detection method", "Isolation Forest"),
                 ("Contamination", params["contamination"])))
        else:
            raise NotImplementedError

    @classmethod
    def migrate_settings(cls, settings: Dict, version: int):
        """Move the flat pre-version-2 settings into the per-editor
        dictionaries ``svm_editor`` and ``cov_editor`` (modifies ``settings``
        in place)."""
        if version is None or version < 2:
            settings["svm_editor"] = {"nu": settings.get("nu", 50),
                                      "gamma": settings.get("gamma", 0.01)}
            ec, sf = "empirical_covariance", "support_fraction"
            settings["cov_editor"] = {"cont": settings.get("cont", 10),
                                      ec: settings.get(ec, False),
                                      sf: settings.get(sf, 1)}
def setUp(self):
    """Rebind the module-level ``default_provider`` to a new
    ``SettingProvider`` created for ``Widget``."""
    global default_provider
    provider = SettingProvider(Widget)
    default_provider = provider
class OWOutliers(OWWidget):
    """Outlier-detection widget that runs the detection synchronously
    inside ``commit``.

    The user picks one of four methods; every method has its own parameter
    editor, and only the editor of the selected method is shown.
    """

    name = "Outliers"
    description = "Detect outliers."
    icon = "icons/Outliers.svg"
    priority = 3000
    category = "Data"
    keywords = ["inlier"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        inliers = Output("Inliers", Table)
        outliers = Output("Outliers", Table)
        data = Output("Data", Table)

    want_main_area = False
    resizing_enabled = False

    # Method indices; they index both METHODS and self.editors.
    OneClassSVM, Covariance, LOF, IsolationForest = range(4)
    METHODS = (OneClassSVMLearner, EllipticEnvelopeLearner,
               LocalOutlierFactorLearner, IsolationForestLearner)
    svm_editor = SettingProvider(SVMEditor)
    cov_editor = SettingProvider(CovarianceEditor)
    lof_editor = SettingProvider(LocalOutlierFactorEditor)
    isf_editor = SettingProvider(IsolationForestEditor)

    settings_version = 2
    outlier_method = Setting(LOF)
    auto_commit = Setting(True)

    # Above this number of attributes the covariance method is disabled.
    MAX_FEATURES = 1500

    class Warning(OWWidget.Warning):
        disabled_cov = Msg("Too many features for covariance estimation.")

    class Error(OWWidget.Error):
        singular_cov = Msg("Singular covariance matrix.")
        memory_error = Msg("Not enough memory")

    def __init__(self):
        super().__init__()
        self.data = None  # type: Table
        self.n_inliers = None  # type: int
        self.n_outliers = None  # type: int
        self.editors = None  # type: Tuple[ParametersEditor]
        self.current_editor = None  # type: ParametersEditor
        self.method_combo = None  # type: QComboBox
        self.init_gui()

    def init_gui(self):
        """Build the method combo box, the parameter editors and the
        auto-send control, and initialise both summaries."""
        box = gui.vBox(self.controlArea, "Method")
        self.method_combo = gui.comboBox(box, self, "outlier_method",
                                         items=[m.name for m in self.METHODS],
                                         callback=self.__method_changed)

        self._init_editors()

        gui.auto_send(self.controlArea, self, "auto_commit")

        self.info.set_input_summary(self.info.NoInput)
        self.info.set_output_summary(self.info.NoOutput)

    def _init_editors(self):
        """Create one editor per method, add them all (hidden) to the
        "Parameters" box and show the one for the current method."""
        self.svm_editor = SVMEditor(self)
        self.cov_editor = CovarianceEditor(self)
        self.lof_editor = LocalOutlierFactorEditor(self)
        self.isf_editor = IsolationForestEditor(self)

        box = gui.vBox(self.controlArea, "Parameters")
        # Order must match the method indices defined on the class.
        self.editors = (self.svm_editor, self.cov_editor,
                        self.lof_editor, self.isf_editor)
        for editor in self.editors:
            # NOTE(review): the lambda defers the lookup of self.commit to
            # call time - presumably because commit is rebound later (e.g. by
            # gui.auto_send); confirm before simplifying.
            editor.param_changed.connect(lambda: self.commit())
            box.layout().addWidget(editor)
            editor.hide()

        self.set_current_editor()

    def __method_changed(self):
        self.set_current_editor()
        self.commit()

    def set_current_editor(self):
        """Hide the editor shown so far and show the one that belongs to
        ``self.outlier_method``."""
        if self.current_editor:
            self.current_editor.hide()
        self.current_editor = self.editors[self.outlier_method]
        self.current_editor.show()

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """Input handler: store ``data``, update the input summary and the
        controls, then commit."""
        self.clear_messages()
        self.data = data
        # An absent input must use the *input* placeholder (NoInput), the
        # same one init_gui sets; NoOutput belongs to the output summary.
        self.info.set_input_summary(len(data) if data else self.info.NoInput)
        self.enable_controls()
        # NOTE(review): unconditional_commit is not defined in this class;
        # presumably it is provided by gui.auto_send - confirm.
        self.unconditional_commit()

    def enable_controls(self):
        """Enable the covariance method, then disable it again (switching
        the widget to LOF and showing a warning) when the data has more than
        ``MAX_FEATURES`` attributes."""
        self.method_combo.model().item(self.Covariance).setEnabled(True)
        if self.data and len(self.data.domain.attributes) > self.MAX_FEATURES:
            self.outlier_method = self.LOF
            self.set_current_editor()
            self.method_combo.model().item(self.Covariance).setEnabled(False)
            self.Warning.disabled_cov()

    def _get_outliers(self) -> Tuple[Table, Table, Table]:
        """Run the detection and split the data by predicted label.

        Returns ``(inliers, outliers, annotated_data)``. When detection
        raises ``ValueError`` or ``MemoryError`` the matching widget error is
        shown and ``(None, None, None)`` is returned.
        """
        self.Error.singular_cov.clear()
        self.Error.memory_error.clear()
        try:
            y_pred, amended_data = self.detect_outliers()
        except ValueError:
            self.Error.singular_cov()
            return None, None, None
        except MemoryError:
            self.Error.memory_error()
            return None, None, None
        else:
            # Rows predicted 1 go to inliers, rows predicted -1 to outliers.
            inliers_ind = np.where(y_pred == 1)[0]
            outliers_ind = np.where(y_pred == -1)[0]
            inliers = amended_data[inliers_ind]
            outliers = amended_data[outliers_ind]
            self.n_inliers = len(inliers)
            self.n_outliers = len(outliers)
            return inliers, outliers, self.annotated_data(amended_data, y_pred)

    def commit(self):
        """Compute the outputs (all ``None`` without data), update the
        output summary and send them."""
        inliers = outliers = data = None
        self.n_inliers = self.n_outliers = None
        if self.data:
            inliers, outliers, data = self._get_outliers()

        summary = len(inliers) if inliers else self.info.NoOutput
        self.info.set_output_summary(summary)
        self.Outputs.inliers.send(inliers)
        self.Outputs.outliers.send(outliers)
        self.Outputs.data.send(data)

    def detect_outliers(self) -> Tuple[np.ndarray, Table]:
        """Fit the selected learner on ``self.data`` and predict on the same
        data; return the predictions and the (possibly amended) data."""
        learner_class = self.METHODS[self.outlier_method]
        kwargs = self.current_editor.get_parameters()
        learner = learner_class(**kwargs)
        model = learner(self.data)
        y_pred = model(self.data)
        amended_data = self.amended_data(model)
        return np.array(y_pred), amended_data

    def amended_data(self, model: Model) -> Table:
        """For the covariance method, return the data with a "Mahalanobis"
        meta column appended; for every other method return ``self.data``
        unchanged."""
        if self.outlier_method != self.Covariance:
            return self.data
        mahal = model.mahalanobis(self.data.X)
        # Column vector, so it can be stacked next to the existing metas.
        mahal = mahal.reshape(len(self.data), 1)
        attrs = self.data.domain.attributes
        classes = self.data.domain.class_vars
        new_metas = list(self.data.domain.metas) + \
            [ContinuousVariable(name="Mahalanobis")]
        new_domain = Domain(attrs, classes, new_metas)
        amended_data = self.data.transform(new_domain)
        amended_data.metas = np.hstack((self.data.metas, mahal))
        return amended_data

    @staticmethod
    def annotated_data(data: Table, labels: np.ndarray) -> Table:
        """Return ``data`` with a discrete "Outlier" meta column appended.

        ``labels`` holds 1 or -1 per row; -1 is stored as value index 0
        ("Yes") and 1 as value index 1 ("No"). The column name is made
        unique against the existing variable names. ``labels`` itself is
        left unmodified.
        """
        domain = data.domain
        names = [v.name for v in domain.variables + domain.metas]
        name = get_unique_names(names, "Outlier")

        outlier_var = DiscreteVariable(name, values=["Yes", "No"])
        metas = domain.metas + (outlier_var,)
        domain = Domain(domain.attributes, domain.class_vars, metas)
        data = data.transform(domain)

        # Remap on a new array instead of assigning into ``labels``, so the
        # caller's predictions keep their -1 entries.
        data.metas[:, -1] = np.where(labels == -1, 0, labels)
        return data

    def send_report(self):
        """Report instance counts and the parameters of the selected method.

        Nothing is reported while the counts are unknown.
        """
        if self.n_outliers is None or self.n_inliers is None:
            return
        self.report_items("Data",
                          (("Input instances", len(self.data)),
                           ("Inliers", self.n_inliers),
                           ("Outliers", self.n_outliers)))

        params = self.current_editor.get_parameters()
        if self.outlier_method == self.OneClassSVM:
            self.report_items(
                "Detection",
                (("Detection method",
                  "One class SVM with non-linear kernel (RBF)"),
                 ("Regularization (nu)", params["nu"]),
                 ("Kernel coefficient", params["gamma"])))
        elif self.outlier_method == self.Covariance:
            self.report_items(
                "Detection",
                (("Detection method", "Covariance estimator"),
                 ("Contamination", params["contamination"]),
                 ("Support fraction", params["support_fraction"])))
        elif self.outlier_method == self.LOF:
            self.report_items(
                "Detection",
                (("Detection method", "Local Outlier Factor"),
                 ("Contamination", params["contamination"]),
                 ("Number of neighbors", params["n_neighbors"]),
                 ("Metric", params["metric"])))
        elif self.outlier_method == self.IsolationForest:
            self.report_items(
                "Detection",
                (("Detection method", "Isolation Forest"),
                 ("Contamination", params["contamination"])))
        else:
            raise NotImplementedError

    @classmethod
    def migrate_settings(cls, settings: Dict, version: int):
        """Move the flat pre-version-2 settings into the per-editor
        dictionaries ``svm_editor`` and ``cov_editor`` (modifies ``settings``
        in place)."""
        if version is None or version < 2:
            settings["svm_editor"] = {"nu": settings.get("nu", 50),
                                      "gamma": settings.get("gamma", 0.01)}
            ec, sf = "empirical_covariance", "support_fraction"
            settings["cov_editor"] = {"cont": settings.get("cont", 10),
                                      ec: settings.get(ec, False),
                                      sf: settings.get(sf, 1)}
class OWLDAvis(OWWidget):
    """Widget that plots, for one selected LDA topic, the terms with the
    highest relevance together with their topic and overall frequencies."""

    name = "LDAvis"
    description = "Interactive exploration of LDA topics."
    priority = 410
    icon = "icons/LDAvis.svg"

    selected_topic = Setting(0, schema_only=True)
    relevance = Setting(0.5)
    visual_settings = Setting({}, schema_only=True)

    graph = SettingProvider(BarPlotGraph)
    graph_name = "graph.plotItem"

    class Inputs:
        topics = Input("Topics", Topics)

    class Error(OWWidget.Error):
        # Relevant Terms cannot work with LSI or HDP, because it expects
        # topic-term probabilities.
        wrong_model = Msg("Relevant Terms only accepts output from LDA.")

    def __init__(self):
        OWWidget.__init__(self)
        self.data = None
        self.topic_list = []
        self.term_topic_matrix = None
        self.term_frequency = None
        self.num_tokens = None

        # should be used later for bar chart
        self.graph: Optional[BarPlotGraph] = None
        self._create_layout()

        VisualSettingsDialog(self, self.graph.parameter_setter.initial_settings)

    def _create_layout(self):
        """Create the graph, the relevance slider and the topic list."""
        self._add_graph()
        box = gui.widgetBox(self.controlArea, "Relevance")
        self.rel_slider = gui.hSlider(
            box,
            self,
            "relevance",
            minValue=0,
            maxValue=1,
            step=0.1,
            intOnly=False,
            labelFormat="%.1f",
            callback_finished=self.on_params_change,
            createLabel=True,
        )

        self.topic_box = gui.listBox(
            self.controlArea,
            self,
            "selected_topic",
            "topic_list",
            box="Topics",
            callback=self.on_params_change,
        )

    def _add_graph(self):
        self.graph = BarPlotGraph(self)
        self.mainArea.layout().addWidget(self.graph)

    def compute_relevance(self, topic: np.ndarray) -> np.ndarray:
        """
        Compute the relevance of every term for one topic.

        Relevance is defined as
        lambda * log(topic_probability)
        + (1 - lambda) * log(topic_probability / marginal_probability),
        where lambda is ``self.relevance``.
        https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

        Entries where the topic probability or the term frequency is not
        positive are left at 0, since the logarithm is undefined there.
        """
        # NOTE(review): the 0 placeholder can rank above genuinely computed
        # (negative) scores in on_params_change - confirm this is intended.
        nonzero = (topic > 0) & (self.term_frequency > 0)
        tp, mp = topic[nonzero], self.term_frequency[nonzero]
        adj_prob = np.zeros(topic.shape)
        rel = self.relevance
        adj_prob[nonzero] = rel * np.log(tp) + (1 - rel) * np.log(tp / mp)
        return adj_prob

    @staticmethod
    def compute_distributions(data: Topics) -> np.ndarray:
        """
        Compute how likely the term is in each topic.

        Each row of ``data.X`` is multiplied by the corresponding value of
        the "Marginal Topic Probability" column.
        """
        topic_frequency = data.get_column_view("Marginal Topic Probability")[0]
        return data.X * topic_frequency[:, None].astype(float)

    def on_params_change(self):
        """Recompute the best-ranked terms for the selected topic and
        redraw the graph; does nothing without data."""
        if self.data is None:
            return
        topic = self.data.X[:, self.selected_topic]
        adj_prob = self.compute_relevance(topic)

        # Indices of the N_BEST_PLOTTED highest relevance scores, best first.
        idx = np.argsort(adj_prob, axis=None)[::-1][:N_BEST_PLOTTED]

        words = self.data.metas[:, 0][idx]
        term_topic_freq = self.term_topic_matrix[self.selected_topic].T[idx]
        marg_prob = self.term_frequency[idx]

        # convert to absolute frequencies
        term_topic_freq = term_topic_freq * self.num_tokens
        marg_prob = marg_prob * self.num_tokens

        self.graph.update_graph(words, term_topic_freq, marg_prob)

    @Inputs.topics
    def set_data(self, data: Optional[Topics]):
        """Input handler: reset the widget, validate that ``data`` comes
        from LDA, then store the derived matrices and redraw.

        The previously selected topic is kept when it still exists in the
        new data; otherwise the first topic is selected.
        """
        prev_topic = self.selected_topic
        self.clear()
        if data is None:
            return
        if data.attributes.get("Model", "") != "Latent Dirichlet Allocation":
            self.Error.wrong_model()
            return

        self.data = Table.transpose(data, "Topics", "Words")
        self.topic_list = [var.name for var in self.data.domain.attributes]
        # NOTE(review): the "" fallback cannot be multiplied with the
        # frequency arrays in on_params_change - confirm the attribute is
        # always present or choose a numeric default.
        self.num_tokens = data.attributes.get("Number of tokens", "")
        self.term_topic_matrix = self.compute_distributions(data)
        self.term_frequency = np.sum(self.term_topic_matrix, axis=0)

        self.selected_topic = prev_topic if prev_topic < len(
            self.topic_list) else 0
        self.on_params_change()

    def set_visual_settings(self, key: KeyType, value: ValueType):
        self.graph.parameter_setter.set_parameter(key, value)
        self.visual_settings[key] = value

    def clear(self):
        """Clear errors, the graph and all data-derived state."""
        self.Error.clear()
        self.graph.clear_all()
        self.data = None
        self.topic_list = []
        self.term_topic_matrix = None
        self.term_frequency = None
        self.num_tokens = None

    def send_report(self):
        """Report the relevance, the shown topic and the plot.

        Nothing is reported without data: ``topic_list`` is empty then and
        indexing it with ``selected_topic`` would raise ``IndexError``.
        """
        if self.data is None or not self.topic_list:
            return
        self.report_items((
            ("Relevance", self.relevance),
            ("Shown topic", self.topic_list[self.selected_topic]),
        ))
        self.report_plot()