Ejemplo n.º 1
0
class MockWidget(OWBaseWidget):
    """Minimal widget whose only content is a ``DomainEditor``."""

    name = "mock"
    # Class-level declaration of the editor as a settings provider; the
    # instance attribute of the same name is assigned in ``__init__``.
    domain_editor = SettingProvider(DomainEditor)

    def __init__(self):
        # The editor is constructed with this widget as its argument and then
        # stored on the instance. No base-class initialiser is called here.
        editor = DomainEditor(self)
        self.domain_editor = editor
Ejemplo n.º 2
0
class OWOutliers(OWWidget, ConcurrentWidgetMixin):
    """Outlier-detection widget whose computation is launched via ``start``.

    The user picks one of the learners in ``METHODS``; each learner has its
    own parameter editor. ``commit`` builds the learner from the current
    editor's parameters and hands ``run``, the data and the learner to
    ``self.start``. Results are sent to the three outputs in ``on_done``.
    """

    name = "Outliers"
    description = "Detect outliers."
    icon = "icons/Outliers.svg"
    priority = 3000
    category = "Data"
    keywords = ["inlier"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        inliers = Output("Inliers", Table)
        outliers = Output("Outliers", Table)
        data = Output("Data", Table)

    want_main_area = False
    resizing_enabled = False

    # Method indices; they index both METHODS and the ``editors`` tuple
    # built in ``_init_editors``, so the three orderings must stay aligned.
    OneClassSVM, Covariance, LOF, IsolationForest = range(4)
    METHODS = (OneClassSVMLearner, EllipticEnvelopeLearner,
               LocalOutlierFactorLearner, IsolationForestLearner)
    svm_editor = SettingProvider(SVMEditor)
    cov_editor = SettingProvider(CovarianceEditor)
    lof_editor = SettingProvider(LocalOutlierFactorEditor)
    isf_editor = SettingProvider(IsolationForestEditor)

    settings_version = 2
    outlier_method = Setting(LOF)
    auto_commit = Setting(True)

    # Above this number of attributes the covariance method is disabled
    # (see ``enable_controls``).
    MAX_FEATURES = 1500

    class Warning(OWWidget.Warning):
        disabled_cov = Msg("Too many features for covariance estimation.")

    class Error(OWWidget.Error):
        singular_cov = Msg("Singular covariance matrix.")
        memory_error = Msg("Not enough memory")

    def __init__(self):
        """Initialise both base classes, reset state and build the GUI."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self.data = None  # type: Table
        self.n_inliers = None  # type: int
        self.n_outliers = None  # type: int
        self.editors = None  # type: Tuple[ParametersEditor]
        self.current_editor = None  # type: ParametersEditor
        self.method_combo = None  # type: QComboBox
        self.init_gui()

    def init_gui(self):
        """Create the method combo box, the parameter editors and the
        auto-commit control."""
        box = gui.vBox(self.controlArea, "Method")
        self.method_combo = gui.comboBox(box, self, "outlier_method",
                                         items=[m.name for m in self.METHODS],
                                         callback=self.__method_changed)

        self._init_editors()

        gui.auto_apply(self.buttonsArea, self, "auto_commit")

    def _init_editors(self):
        """Instantiate one editor per method, add them (hidden) to the
        "Parameters" box and show the one matching ``outlier_method``."""
        self.svm_editor = SVMEditor(self)
        self.cov_editor = CovarianceEditor(self)
        self.lof_editor = LocalOutlierFactorEditor(self)
        self.isf_editor = IsolationForestEditor(self)

        box = gui.vBox(self.controlArea, "Parameters")
        # Order must match the method indices defined on the class.
        self.editors = (self.svm_editor, self.cov_editor,
                        self.lof_editor, self.isf_editor)
        for editor in self.editors:
            # The lambda discards whatever arguments the signal carries and
            # looks ``commit`` up at call time.
            editor.param_changed.connect(lambda: self.commit())
            box.layout().addWidget(editor)
            editor.hide()

        self.set_current_editor()

    def __method_changed(self):
        """Combo-box callback: swap the visible editor and commit."""
        self.set_current_editor()
        self.commit()

    def set_current_editor(self):
        """Hide the previously shown editor (if any) and show the editor
        selected by ``outlier_method``."""
        if self.current_editor:
            self.current_editor.hide()
        self.current_editor = self.editors[self.outlier_method]
        self.current_editor.show()

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """Handle new input: cancel the current task, clear messages, store
        the data, update the enabled methods and commit unconditionally."""
        self.cancel()
        self.clear_messages()
        self.data = data
        self.enable_controls()
        self.unconditional_commit()

    def enable_controls(self):
        """Re-enable the covariance method, then disable it again (switching
        to LOF and showing a warning) when the data has more than
        ``MAX_FEATURES`` attributes."""
        self.method_combo.model().item(self.Covariance).setEnabled(True)
        if self.data and len(self.data.domain.attributes) > self.MAX_FEATURES:
            self.outlier_method = self.LOF
            self.set_current_editor()
            self.method_combo.model().item(self.Covariance).setEnabled(False)
            self.Warning.disabled_cov()

    def commit(self):
        """Clear previous errors and counts, build the selected learner from
        the current editor's parameters and start ``run`` on the data."""
        self.Error.singular_cov.clear()
        self.Error.memory_error.clear()
        self.n_inliers = self.n_outliers = None

        learner_class = self.METHODS[self.outlier_method]
        kwargs = self.current_editor.get_parameters()
        learner = learner_class(**kwargs)

        self.start(run, self.data, learner)

    def on_partial_result(self, _):
        """Partial results are ignored."""
        pass

    def on_done(self, result: Results):
        """Store the inlier/outlier counts and send all three outputs.

        The counts are ``None`` only when the corresponding result is
        ``None``. An empty result yields a count of 0, so ``send_report``
        still produces a report when one of the groups is empty.
        """
        inliers, outliers = result.inliers, result.outliers
        # Compare against None explicitly: an empty table is falsy, and
        # treating it as "no result" would suppress the report.
        self.n_inliers = len(inliers) if inliers is not None else None
        self.n_outliers = len(outliers) if outliers is not None else None

        self.Outputs.inliers.send(inliers)
        self.Outputs.outliers.send(outliers)
        self.Outputs.data.send(result.annotated_data)

    def on_exception(self, ex):
        """Show ``ValueError`` as the singular-covariance error (with the
        exception as message argument) and ``MemoryError`` as the memory
        error; re-raise anything else."""
        if isinstance(ex, ValueError):
            self.Error.singular_cov(ex)
        elif isinstance(ex, MemoryError):
            self.Error.memory_error()
        else:
            raise ex

    def onDeleteWidget(self):
        """Call ``shutdown`` before delegating to the base implementation."""
        self.shutdown()
        super().onDeleteWidget()

    def send_report(self):
        """Report instance counts and the parameters of the selected method.

        Nothing is reported while either count is ``None``.

        Raises:
            NotImplementedError: if ``outlier_method`` is not one of the four
                known method indices.
        """
        if self.n_outliers is None or self.n_inliers is None:
            return
        self.report_items("Data",
                          (("Input instances", len(self.data)),
                           ("Inliers", self.n_inliers),
                           ("Outliers", self.n_outliers)))

        params = self.current_editor.get_parameters()
        if self.outlier_method == self.OneClassSVM:
            self.report_items(
                "Detection",
                (("Detection method",
                  "One class SVM with non-linear kernel (RBF)"),
                 ("Regularization (nu)", params["nu"]),
                 ("Kernel coefficient", params["gamma"])))
        elif self.outlier_method == self.Covariance:
            self.report_items(
                "Detection",
                (("Detection method", "Covariance estimator"),
                 ("Contamination", params["contamination"]),
                 ("Support fraction", params["support_fraction"])))
        elif self.outlier_method == self.LOF:
            self.report_items(
                "Detection",
                (("Detection method", "Local Outlier Factor"),
                 ("Contamination", params["contamination"]),
                 ("Number of neighbors", params["n_neighbors"]),
                 ("Metric", params["metric"])))
        elif self.outlier_method == self.IsolationForest:
            self.report_items(
                "Detection",
                (("Detection method", "Isolation Forest"),
                 ("Contamination", params["contamination"])))
        else:
            raise NotImplementedError

    @classmethod
    def migrate_settings(cls, settings: Dict, version: int):
        """Move flat pre-version-2 settings into per-editor dictionaries.

        When ``version`` is ``None`` or below 2, ``settings`` is updated in
        place: ``nu`` and ``gamma`` are collected under ``"svm_editor"``, and
        ``cont``, ``empirical_covariance`` and ``support_fraction`` under
        ``"cov_editor"``. Missing keys fall back to the defaults given below.
        """
        if version is None or version < 2:
            settings["svm_editor"] = {"nu": settings.get("nu", 50),
                                      "gamma": settings.get("gamma", 0.01)}
            ec, sf = "empirical_covariance", "support_fraction"
            settings["cov_editor"] = {"cont": settings.get("cont", 10),
                                      ec: settings.get(ec, False),
                                      sf: settings.get(sf, 1)}
 def setUp(self):
     """Rebind the module-level ``default_provider`` to a new
     ``SettingProvider`` built for ``Widget``."""
     global default_provider
     provider = SettingProvider(Widget)
     default_provider = provider
Ejemplo n.º 4
0
class OWOutliers(OWWidget):
    """Outlier-detection widget that computes its results in ``commit``.

    The user picks one of the learners in ``METHODS``; each learner has its
    own parameter editor. On commit the learner is fitted to the input data,
    the predictions split the data into inliers and outliers, and a copy of
    the data annotated with an "Outlier" meta column is sent as well.
    """

    name = "Outliers"
    description = "Detect outliers."
    icon = "icons/Outliers.svg"
    priority = 3000
    category = "Data"
    keywords = ["inlier"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        inliers = Output("Inliers", Table)
        outliers = Output("Outliers", Table)
        data = Output("Data", Table)

    want_main_area = False
    resizing_enabled = False

    # Method indices; they index both METHODS and the ``editors`` tuple
    # built in ``_init_editors``, so the three orderings must stay aligned.
    OneClassSVM, Covariance, LOF, IsolationForest = range(4)
    METHODS = (OneClassSVMLearner, EllipticEnvelopeLearner,
               LocalOutlierFactorLearner, IsolationForestLearner)
    svm_editor = SettingProvider(SVMEditor)
    cov_editor = SettingProvider(CovarianceEditor)
    lof_editor = SettingProvider(LocalOutlierFactorEditor)
    isf_editor = SettingProvider(IsolationForestEditor)

    settings_version = 2
    outlier_method = Setting(LOF)
    auto_commit = Setting(True)

    # Above this number of attributes the covariance method is disabled
    # (see ``enable_controls``).
    MAX_FEATURES = 1500

    class Warning(OWWidget.Warning):
        disabled_cov = Msg("Too many features for covariance estimation.")

    class Error(OWWidget.Error):
        singular_cov = Msg("Singular covariance matrix.")
        memory_error = Msg("Not enough memory")

    def __init__(self):
        """Reset the widget state and build the GUI."""
        super().__init__()
        self.data = None  # type: Table
        self.n_inliers = None  # type: int
        self.n_outliers = None  # type: int
        self.editors = None  # type: Tuple[ParametersEditor]
        self.current_editor = None  # type: ParametersEditor
        self.method_combo = None  # type: QComboBox
        self.init_gui()

    def init_gui(self):
        """Create the method combo box, the parameter editors and the
        auto-send control, and initialise both info summaries."""
        box = gui.vBox(self.controlArea, "Method")
        self.method_combo = gui.comboBox(box, self, "outlier_method",
                                         items=[m.name for m in self.METHODS],
                                         callback=self.__method_changed)

        self._init_editors()

        gui.auto_send(self.controlArea, self, "auto_commit")

        self.info.set_input_summary(self.info.NoInput)
        self.info.set_output_summary(self.info.NoOutput)

    def _init_editors(self):
        """Instantiate one editor per method, add them (hidden) to the
        "Parameters" box and show the one matching ``outlier_method``."""
        self.svm_editor = SVMEditor(self)
        self.cov_editor = CovarianceEditor(self)
        self.lof_editor = LocalOutlierFactorEditor(self)
        self.isf_editor = IsolationForestEditor(self)

        box = gui.vBox(self.controlArea, "Parameters")
        # Order must match the method indices defined on the class.
        self.editors = (self.svm_editor, self.cov_editor,
                        self.lof_editor, self.isf_editor)
        for editor in self.editors:
            # The lambda discards whatever arguments the signal carries and
            # looks ``commit`` up at call time.
            editor.param_changed.connect(lambda: self.commit())
            box.layout().addWidget(editor)
            editor.hide()

        self.set_current_editor()

    def __method_changed(self):
        """Combo-box callback: swap the visible editor and commit."""
        self.set_current_editor()
        self.commit()

    def set_current_editor(self):
        """Hide the previously shown editor (if any) and show the editor
        selected by ``outlier_method``."""
        if self.current_editor:
            self.current_editor.hide()
        self.current_editor = self.editors[self.outlier_method]
        self.current_editor.show()

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """Handle new input: clear messages, store the data, update the
        input summary and the enabled methods, then commit unconditionally."""
        self.clear_messages()
        self.data = data
        # The input summary uses NoInput, matching its initialisation in
        # ``init_gui``; NoOutput belongs to the output summary only.
        self.info.set_input_summary(len(data) if data else self.info.NoInput)
        self.enable_controls()
        self.unconditional_commit()

    def enable_controls(self):
        """Re-enable the covariance method, then disable it again (switching
        to LOF and showing a warning) when the data has more than
        ``MAX_FEATURES`` attributes."""
        self.method_combo.model().item(self.Covariance).setEnabled(True)
        if self.data and len(self.data.domain.attributes) > self.MAX_FEATURES:
            self.outlier_method = self.LOF
            self.set_current_editor()
            self.method_combo.model().item(self.Covariance).setEnabled(False)
            self.Warning.disabled_cov()

    def _get_outliers(self) -> Tuple[Table, Table, Table]:
        """Run the detection and split the data by predicted label.

        Returns:
            ``(inliers, outliers, annotated_data)``. Rows predicted as 1 are
            inliers and rows predicted as -1 are outliers. When detection
            raises ``ValueError`` or ``MemoryError`` the matching error
            message is shown and ``(None, None, None)`` is returned.
        """
        self.Error.singular_cov.clear()
        self.Error.memory_error.clear()
        try:
            y_pred, amended_data = self.detect_outliers()
        except ValueError:
            self.Error.singular_cov()
            return None, None, None
        except MemoryError:
            self.Error.memory_error()
            return None, None, None
        else:
            inliers_ind = np.where(y_pred == 1)[0]
            outliers_ind = np.where(y_pred == -1)[0]
            inliers = amended_data[inliers_ind]
            outliers = amended_data[outliers_ind]
            self.n_inliers = len(inliers)
            self.n_outliers = len(outliers)
            return inliers, outliers, self.annotated_data(amended_data, y_pred)

    def commit(self):
        """Recompute the results (when data is present), update the output
        summary and send all three outputs; ``None`` is sent when there is
        no data or detection failed."""
        inliers = outliers = data = None
        self.n_inliers = self.n_outliers = None
        if self.data:
            inliers, outliers, data = self._get_outliers()

        summary = len(inliers) if inliers else self.info.NoOutput
        self.info.set_output_summary(summary)
        self.Outputs.inliers.send(inliers)
        self.Outputs.outliers.send(outliers)
        self.Outputs.data.send(data)

    def detect_outliers(self) -> Tuple[np.ndarray, Table]:
        """Fit the selected learner on the data and predict on the same data.

        Returns:
            The predictions as a NumPy array and the (possibly amended) data
            produced by ``amended_data``.
        """
        learner_class = self.METHODS[self.outlier_method]
        kwargs = self.current_editor.get_parameters()
        learner = learner_class(**kwargs)
        model = learner(self.data)
        y_pred = model(self.data)
        amended_data = self.amended_data(model)
        return np.array(y_pred), amended_data

    def amended_data(self, model: Model) -> Table:
        """Return the data, extended with a "Mahalanobis" meta column when
        the covariance method is selected; otherwise return it unchanged."""
        if self.outlier_method != self.Covariance:
            return self.data
        mahal = model.mahalanobis(self.data.X)
        # One value per row, shaped as a column so it can be appended to metas.
        mahal = mahal.reshape(len(self.data), 1)
        attrs = self.data.domain.attributes
        classes = self.data.domain.class_vars
        new_metas = list(self.data.domain.metas) + \
                    [ContinuousVariable(name="Mahalanobis")]
        new_domain = Domain(attrs, classes, new_metas)
        amended_data = self.data.transform(new_domain)
        amended_data.metas = np.hstack((self.data.metas, mahal))
        return amended_data

    @staticmethod
    def annotated_data(data: Table, labels: np.ndarray) -> Table:
        """Return ``data`` with an extra discrete "Outlier" meta column.

        The column name is made unique against the existing variable names.
        Label -1 is stored as value index 0 ("Yes") and label 1 as value
        index 1 ("No"). ``labels`` is not modified.
        """
        domain = data.domain
        names = [v.name for v in domain.variables + domain.metas]
        name = get_unique_names(names, "Outlier")

        outlier_var = DiscreteVariable(name, values=["Yes", "No"])
        metas = domain.metas + (outlier_var,)
        domain = Domain(domain.attributes, domain.class_vars, metas)
        data = data.transform(domain)

        # Remap -1 -> 0 on a new array instead of overwriting the caller's
        # predictions in place.
        labels = np.asarray(labels)
        data.metas[:, -1] = np.where(labels == -1, 0, labels)
        return data

    def send_report(self):
        """Report instance counts and the parameters of the selected method.

        Nothing is reported while either count is ``None``.

        Raises:
            NotImplementedError: if ``outlier_method`` is not one of the four
                known method indices.
        """
        if self.n_outliers is None or self.n_inliers is None:
            return
        self.report_items("Data",
                          (("Input instances", len(self.data)),
                           ("Inliers", self.n_inliers),
                           ("Outliers", self.n_outliers)))

        params = self.current_editor.get_parameters()
        if self.outlier_method == self.OneClassSVM:
            self.report_items(
                "Detection",
                (("Detection method",
                  "One class SVM with non-linear kernel (RBF)"),
                 ("Regularization (nu)", params["nu"]),
                 ("Kernel coefficient", params["gamma"])))
        elif self.outlier_method == self.Covariance:
            self.report_items(
                "Detection",
                (("Detection method", "Covariance estimator"),
                 ("Contamination", params["contamination"]),
                 ("Support fraction", params["support_fraction"])))
        elif self.outlier_method == self.LOF:
            self.report_items(
                "Detection",
                (("Detection method", "Local Outlier Factor"),
                 ("Contamination", params["contamination"]),
                 ("Number of neighbors", params["n_neighbors"]),
                 ("Metric", params["metric"])))
        elif self.outlier_method == self.IsolationForest:
            self.report_items(
                "Detection",
                (("Detection method", "Isolation Forest"),
                 ("Contamination", params["contamination"])))
        else:
            raise NotImplementedError

    @classmethod
    def migrate_settings(cls, settings: Dict, version: int):
        """Move flat pre-version-2 settings into per-editor dictionaries.

        When ``version`` is ``None`` or below 2, ``settings`` is updated in
        place: ``nu`` and ``gamma`` are collected under ``"svm_editor"``, and
        ``cont``, ``empirical_covariance`` and ``support_fraction`` under
        ``"cov_editor"``. Missing keys fall back to the defaults given below.
        """
        if version is None or version < 2:
            settings["svm_editor"] = {"nu": settings.get("nu", 50),
                                      "gamma": settings.get("gamma", 0.01)}
            ec, sf = "empirical_covariance", "support_fraction"
            settings["cov_editor"] = {"cont": settings.get("cont", 10),
                                      ec: settings.get(ec, False),
                                      sf: settings.get(sf, 1)}
Ejemplo n.º 5
0
class OWLDAvis(OWWidget):
    """Widget that plots the most relevant terms of a selected LDA topic.

    Terms are ranked by the relevance score computed in
    ``compute_relevance``; the ``N_BEST_PLOTTED`` highest-ranked terms are
    passed to the bar plot together with their per-topic and overall
    frequencies.
    """

    name = "LDAvis"
    description = "Interactive exploration of LDA topics."
    priority = 410
    icon = "icons/LDAvis.svg"

    selected_topic = Setting(0, schema_only=True)
    relevance = Setting(0.5)
    visual_settings = Setting({}, schema_only=True)

    graph = SettingProvider(BarPlotGraph)
    graph_name = "graph.plotItem"

    class Inputs:
        topics = Input("Topics", Topics)

    class Error(OWWidget.Error):
        # Relevant Terms cannot work with LSI or HDP, because it expects
        # topic-term probabilities.
        wrong_model = Msg("Relevant Terms only accepts output from LDA.")

    def __init__(self):
        """Reset the widget state, build the layout and create the visual
        settings dialog."""
        OWWidget.__init__(self)
        self.data = None
        self.topic_list = []
        self.term_topic_matrix = None
        self.term_frequency = None
        self.num_tokens = None
        # Assigned in ``_add_graph`` (called from ``_create_layout``).
        self.graph: Optional[BarPlotGraph] = None
        self._create_layout()

        VisualSettingsDialog(self,
                             self.graph.parameter_setter.initial_settings)

    def _create_layout(self):
        """Add the graph, the relevance slider and the topic list box."""
        self._add_graph()
        box = gui.widgetBox(self.controlArea, "Relevance")
        self.rel_slider = gui.hSlider(
            box,
            self,
            "relevance",
            minValue=0,
            maxValue=1,
            step=0.1,
            intOnly=False,
            labelFormat="%.1f",
            callback_finished=self.on_params_change,
            createLabel=True,
        )

        self.topic_box = gui.listBox(
            self.controlArea,
            self,
            "selected_topic",
            "topic_list",
            box="Topics",
            callback=self.on_params_change,
        )

    def _add_graph(self):
        """Create the bar plot and place it in the main area."""
        self.graph = BarPlotGraph(self)
        self.mainArea.layout().addWidget(self.graph)

    def compute_relevance(self, topic: np.ndarray) -> np.ndarray:
        """
        Compute the relevance of every term for one topic.

        Relevance is defined as lambda*log(topic_probability) + (
        1-lambda)*log(topic_probability/marginal_probability), where lambda
        is ``self.relevance``.
        https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

        Entries where the topic probability or the term frequency is not
        positive are left at 0.0, which avoids taking the log of zero.
        """
        nonzero = (topic > 0) & (self.term_frequency > 0)
        tp, mp = topic[nonzero], self.term_frequency[nonzero]
        # NOTE(review): masked-out entries keep 0.0, which can rank above
        # negative log-based scores - confirm this is intended.
        adj_prob = np.zeros(topic.shape)
        rel = self.relevance
        adj_prob[nonzero] = rel * np.log(tp) + (1 - rel) * np.log(tp / mp)
        return adj_prob

    @staticmethod
    def compute_distributions(data: Topics) -> np.ndarray:
        """
        Weight the rows of ``data.X`` by the marginal topic probability.

        Each row of ``data.X`` is multiplied by the matching entry of the
        "Marginal Topic Probability" column (cast to float).
        """
        topic_frequency = data.get_column_view("Marginal Topic Probability")[0]
        return data.X * topic_frequency[:, None].astype(float)

    def on_params_change(self):
        """Recompute the ranking for the selected topic and update the graph.

        Does nothing while no data is set.
        """
        if self.data is None:
            return
        # ``self.data`` is the transposed input, so a column is one topic.
        topic = self.data.X[:, self.selected_topic]
        adj_prob = self.compute_relevance(topic)

        # Indices of the highest-relevance terms, best first.
        idx = np.argsort(adj_prob, axis=None)[::-1][:N_BEST_PLOTTED]

        words = self.data.metas[:, 0][idx]
        term_topic_freq = self.term_topic_matrix[self.selected_topic].T[idx]
        marg_prob = self.term_frequency[idx]

        # convert to absolute frequencies
        term_topic_freq = term_topic_freq * self.num_tokens
        marg_prob = marg_prob * self.num_tokens

        self.graph.update_graph(words, term_topic_freq, marg_prob)

    @Inputs.topics
    def set_data(self, data: Optional[Topics]):
        """Handle new input.

        The widget is cleared first. ``None`` input leaves it cleared; input
        whose "Model" attribute is not "Latent Dirichlet Allocation" shows
        the ``wrong_model`` error. Otherwise the derived matrices are
        computed and the graph is refreshed. The previously selected topic
        is kept when it is still a valid index, else topic 0 is selected.
        """
        prev_topic = self.selected_topic
        self.clear()
        if data is None:
            return
        if data.attributes.get("Model", "") != "Latent Dirichlet Allocation":
            self.Error.wrong_model()
            return

        self.data = Table.transpose(data, "Topics", "Words")
        self.topic_list = [var.name for var in self.data.domain.attributes]
        # NOTE(review): the "" fallback is later multiplied with numeric
        # arrays in ``on_params_change`` - confirm the attribute is always
        # present on LDA output.
        self.num_tokens = data.attributes.get("Number of tokens", "")
        self.term_topic_matrix = self.compute_distributions(data)
        self.term_frequency = np.sum(self.term_topic_matrix, axis=0)

        self.selected_topic = prev_topic if prev_topic < len(
            self.topic_list) else 0
        self.on_params_change()

    def set_visual_settings(self, key: KeyType, value: ValueType):
        """Apply one visual setting to the graph and remember it."""
        self.graph.parameter_setter.set_parameter(key, value)
        self.visual_settings[key] = value

    def clear(self):
        """Clear error messages and the graph and reset all derived state."""
        self.Error.clear()
        self.graph.clear_all()
        self.data = None
        self.topic_list = []
        self.term_topic_matrix = None
        self.term_frequency = None
        self.num_tokens = None

    def send_report(self):
        """Report the relevance, the shown topic and the plot.

        Nothing is reported while no topics are loaded, because there is no
        topic name to look up.
        """
        # ``topic_list`` is empty before any valid input arrives and after
        # ``clear``; indexing it would raise IndexError.
        if not self.topic_list:
            return
        self.report_items((
            ("Relevance", self.relevance),
            ("Shown topic", self.topic_list[self.selected_topic]),
        ))
        self.report_plot()