Ejemplo n.º 1
0
class OWDBSCAN(widget.OWWidget):
    name = "DBSCAN"
    description = "Density-based spatial clustering."
    icon = "icons/DBSCAN.svg"
    priority = 2150

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)

    class Error(widget.OWWidget.Error):
        not_enough_instances = Msg("Not enough unique data instances. "
                                   "At least two are required.")

    METRICS = [
        ("Euclidean", "euclidean"),
        ("Manhattan", "cityblock"),
        ("Cosine", "cosine")
    ]

    min_samples = Setting(4)
    eps = Setting(0.5)
    metric_idx = Setting(0)
    normalize = Setting(True)
    auto_commit = Setting(True)
    k_distances = None
    cut_point = None

    def __init__(self):
        super().__init__()

        self.data = None
        self.data_normalized = None
        self.db = None
        self.model = None

        box = gui.widgetBox(self.controlArea, "Parameters")
        gui.spin(box, self, "min_samples", 1, 100, 1,
                 callback=self._min_samples_changed,
                 label="Core point neighbors")
        gui.doubleSpin(box, self, "eps", EPS_BOTTOM_LIMIT, 1000, 0.01,
                       callback=self._eps_changed,
                       label="Neighborhood distance")

        box = gui.widgetBox(self.controlArea, self.tr("Distance Metric"))
        gui.comboBox(box, self, "metric_idx",
                     items=list(zip(*self.METRICS))[0],
                     callback=self._metirc_changed)
        gui.checkBox(box, self, "normalize", "Normalize features",
                     callback=self._on_normalize_changed)

        gui.auto_apply(self.buttonsArea, self, "auto_commit")
        gui.rubber(self.controlArea)

        self.controlArea.layout().addStretch()

        self.plot = SliderGraph(
            x_axis_label="Data items sorted by score",
            y_axis_label="Distance to the k-th nearest neighbour",
            callback=self._on_cut_changed
        )

        self.mainArea.layout().addWidget(self.plot)

    def check_data_size(self, data):
        if data is None:
            return False
        if len(data) < 2:
            self.Error.not_enough_instances()
            return False
        return True

    def commit(self):
        self.cluster()

    def cluster(self):
        if not self.check_data_size(self.data):
            return
        self.model = DBSCAN(
            eps=self.eps,
            min_samples=self.min_samples,
            metric=self.METRICS[self.metric_idx][1]
        ).get_model(self.data_normalized)
        self.send_data()

    def _compute_and_plot(self, cut_point=None):
        self._compute_kdistances()
        if cut_point is None:
            self._compute_cut_point()
        self._plot_graph()

    def _plot_graph(self):
        nonzero = np.sum(self.k_distances > EPS_BOTTOM_LIMIT)
        self.plot.update(np.arange(len(self.k_distances)),
                         [self.k_distances],
                         colors=[QColor('red')],
                         cutpoint_x=self.cut_point,
                         selection_limit=(0, nonzero - 1))

    def _compute_kdistances(self):
        self.k_distances = get_kth_distances(
            self.data_normalized, metric=self.METRICS[self.metric_idx][1],
            k=self.min_samples
        )

    def _compute_cut_point(self):
        self.cut_point = int(DEFAULT_CUT_POINT * len(self.k_distances))
        self.eps = self.k_distances[self.cut_point]

        mask = self.k_distances >= EPS_BOTTOM_LIMIT
        if self.eps < EPS_BOTTOM_LIMIT and sum(mask):
            self.eps = np.min(self.k_distances[mask])
            self.cut_point = self._find_nearest_dist(self.eps)

    @Inputs.data
    def set_data(self, data):
        self.Error.clear()
        if not self.check_data_size(data):
            data = None
        self.data = self.data_normalized = data
        if self.data is None:
            self.Outputs.annotated_data.send(None)
            self.plot.clear_plot()
            return

        if self.data is None:
            return

        self._preprocess_data()

        self._compute_and_plot()
        self.unconditional_commit()

    def _preprocess_data(self):
        self.data_normalized = self.data
        for pp in PREPROCESSORS:
            if isinstance(pp, Normalize) and not self.normalize:
                continue
            self.data_normalized = pp(self.data_normalized)

    def send_data(self):
        model = self.model

        clusters = [c if c >= 0 else np.nan for c in model.labels]
        k = len(set(clusters) - {np.nan})
        clusters = np.array(clusters)
        core_samples = set(model.projector.core_sample_indices_)
        in_core = np.array([1 if (i in core_samples) else 0
                            for i in range(len(self.data))])

        domain = self.data.domain
        attributes, classes = domain.attributes, domain.class_vars
        meta_attrs = domain.metas
        names = [var.name for var in chain(attributes, classes, meta_attrs) if var]

        u_clust_var = get_unique_names(names, "Cluster")
        clust_var = DiscreteVariable(
            u_clust_var, values=["C%d" % (x + 1) for x in range(k)])

        u_in_core = get_unique_names(names + [u_clust_var], "DBSCAN Core")
        in_core_var = DiscreteVariable(u_in_core, values=("0", "1"))

        new_table = self.data.add_column(clust_var, clusters, to_metas=True)
        new_table = new_table.add_column(in_core_var, in_core, to_metas=True)

        self.Outputs.annotated_data.send(new_table)

    def _invalidate(self):
        self.commit()

    def _find_nearest_dist(self, value):
        array = np.asarray(self.k_distances)
        idx = (np.abs(array - value)).argmin()
        return idx

    def _eps_changed(self):
        # find the closest value to eps
        if self.data is None:
            return
        self.cut_point = self._find_nearest_dist(self.eps)
        self.plot.set_cut_point(self.cut_point)
        self._invalidate()

    def _metirc_changed(self):
        if self.data is not None:
            self._compute_and_plot()
            self._invalidate()

    def _on_cut_changed(self, value):
        # cut changed by means of a cut line over the scree plot.
        self.cut_point = value
        self.eps = self.k_distances[value]

        self.commit()

    def _min_samples_changed(self):
        if self.data is None:
            return
        self._compute_and_plot(cut_point=self.cut_point)
        self._invalidate()

    def _on_normalize_changed(self):
        if not self.data:
            return
        self._preprocess_data()
        self._compute_and_plot()
        self._invalidate()
Ejemplo n.º 2
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050
    keywords = ["principal component analysis", "linear transformation"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed Data",
                                  Table,
                                  replaces=["Transformed data"])
        data = Output("Data", Table, default=True)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    auto_commit = settings.Setting(True)
    normalize = settings.Setting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            1,
            MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Explained variance:", self.variance_spin)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(self.options_box,
                                          self,
                                          "normalize",
                                          "Normalize variables",
                                          callback=self._update_normalize)

        self.maxp_spin = gui.spin(self.options_box,
                                  self,
                                  "maxp",
                                  1,
                                  MAX_COMPONENTS,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_apply(self.controlArea, self, "auto_commit")

        self.plot = SliderGraph("Principal Components",
                                "Proportion of variance", self._on_cut_changed)

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    @Inputs.data
    def set_data(self, data):
        self.clear_messages()
        self.clear()
        self.information()
        self.data = None
        if not data:
            self.clear_outputs()
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        if isinstance(data, Table):
            if not data.domain.attributes:
                self.Error.no_features()
                self.clear_outputs()
                return
            if not data:
                self.Error.no_instances()
                self.clear_outputs()
                return

        self._init_projector()

        self.data = data
        self.fit()

    def fit(self):
        self.clear()
        self.Warning.trivial_components.clear()
        if self.data is None:
            return

        data = self.data

        if self.normalize:
            self._pca_projector.preprocessors = \
                self._pca_preprocessors + [preprocess.Normalize(center=False)]
        else:
            self._pca_projector.preprocessors = self._pca_preprocessors

        if not isinstance(data, SqlTable):
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)

            if numpy.isfinite(cumulative[-1]):
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()
            else:
                self.Warning.trivial_components()

            self.unconditional_commit()

    def clear(self):
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self.plot.clear_plot()

    def clear_outputs(self):
        self.Outputs.transformed_data.send(None)
        self.Outputs.data.send(None)
        self.Outputs.components.send(None)
        self.Outputs.pca.send(self._pca_projector)

    def _setup_plot(self):
        if self._pca is None:
            self.plot.clear_plot()
            return

        explained_ratio = self._variance_ratio
        explained = self._cumulative
        cutpos = self._nselected_components()
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.update(numpy.arange(1, p + 1),
                         [explained_ratio[:p], explained[:p]],
                         [Qt.red, Qt.darkYellow],
                         cutpoint_x=cutpos,
                         names=LINE_NAMES)

        self._update_axis()

    def _on_cut_changed(self, components):
        if components == self.ncomponents \
                or self.ncomponents == 0 \
                or self._pca is not None \
                and components == len(self._variance_ratio):
            return

        self.ncomponents = components
        if self._pca is not None:
            var = self._cumulative[components - 1]
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        self.plot.set_cut_point(cut)
        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        self.plot.set_cut_point(cut)
        self._invalidate_selection()

    def _update_normalize(self):
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        self._pca_projector = PCA(n_components=MAX_COMPONENTS, random_state=0)
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = PCA.preprocessors

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i)) for i in range(1, p + 1, d)]])

    def commit(self):
        transformed = data = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components) once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(transformed.domain.attributes[:self.ncomponents],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            # prevent caching new features by defining compute_value
            proposed = [a.name for a in self._pca.orig_domain.attributes]
            meta_name = get_unique_names(proposed, 'components')
            dom = Domain([
                ContinuousVariable(name, compute_value=lambda _: None)
                for name in proposed
            ],
                         metas=[StringVariable(name=meta_name)])
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(self.ncomponents)]],
                dtype=object).T
            components = Table(dom,
                               self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

            data_dom = Domain(self.data.domain.attributes,
                              self.data.domain.class_vars,
                              self.data.domain.metas + domain.attributes)
            data = Table.from_numpy(data_dom,
                                    self.data.X,
                                    self.data.Y,
                                    numpy.hstack(
                                        (self.data.metas, transformed.X)),
                                    ids=self.data.ids)

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.data.send(data)
        self.Outputs.pca.send(self._pca_projector)

    def send_report(self):
        if self.data is None:
            return
        self.report_items(
            (("Normalize data", str(self.normalize)), ("Selected components",
                                                       self.ncomponents),
             ("Explained variance", "{:.3f} %".format(self.variance_covered))))
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        if "variance_covered" in settings:
            # Due to the error in gh-1896 the variance_covered was persisted
            # as a NaN value, causing a TypeError in the widgets `__init__`.
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                if numpy.isfinite(vc):
                    vc = int(vc)
                else:
                    vc = 100
                settings["variance_covered"] = vc
        if settings.get("ncomponents", 0) > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS

        # Remove old `decomposition_idx` when SVD was still included
        settings.pop("decomposition_idx", None)

        # Remove RemotePCA settings
        settings.pop("batch_size", None)
        settings.pop("address", None)
        settings.pop("auto_update", None)
Ejemplo n.º 3
0
class OWDBSCAN(widget.OWWidget):
    name = "DBSCAN"
    description = "基于密度的空间聚类."
    icon = "icons/DBSCAN.svg"
    priority = 2150

    class Inputs:
        data = Input("数据(Data)", Table, replaces=['Data'])

    class Outputs:
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_Chinese_NAME, Table, replaces=['Data'])

    class Error(widget.OWWidget.Error):
        not_enough_instances = Msg("Not enough unique data instances. "
                                   "At least two are required.")

    METRICS = [
        ("欧几里得", "euclidean"),
        ("曼哈顿", "cityblock"),
        ("余弦", "cosine")
    ]

    min_samples = Setting(4)
    eps = Setting(0.5)
    metric_idx = Setting(0)
    auto_commit = Setting(True)
    k_distances = None
    cut_point = None

    def __init__(self):
        super().__init__()

        self.data = None
        self.data_normalized = None
        self.db = None
        self.model = None

        box = gui.widgetBox(self.controlArea, "参数")
        gui.spin(box, self, "min_samples", 1, 100, 1,
                 callback=self._min_samples_changed,
                 label="核心店邻近数(Core point neighbors)")
        gui.doubleSpin(box, self, "eps", EPS_BOTTOM_LIMIT, 1000, 0.01,
                       callback=self._eps_changed,
                       label="临近点距离")

        box = gui.widgetBox(self.controlArea, self.tr("距离度量"))
        gui.comboBox(box, self, "metric_idx",
                     items=list(zip(*self.METRICS))[0],
                     callback=self._metirc_changed)

        gui.auto_apply(self.controlArea, self, "auto_commit")
        gui.rubber(self.controlArea)

        self.controlArea.layout().addStretch()

        self.plot = SliderGraph(
            x_axis_label="根据评分排序的数据",
            y_axis_label="到第 k 个最邻近点的距离",
            callback=self._on_cut_changed
        )

        self.mainArea.layout().addWidget(self.plot)

    def check_data_size(self, data):
        if data is None:
            return False
        if len(data) < 2:
            self.Error.not_enough_instances()
            return False
        return True

    def commit(self):
        self.cluster()

    def cluster(self):
        if not self.check_data_size(self.data):
            return
        self.model = DBSCAN(
            eps=self.eps,
            min_samples=self.min_samples,
            metric=self.METRICS[self.metric_idx][1]
        ).get_model(self.data_normalized)
        self.send_data()

    def _compute_and_plot(self, cut_point=None):
        self._compute_kdistances()
        if cut_point is None:
            self._compute_cut_point()
        self._plot_graph()

    def _plot_graph(self):
        nonzero = np.sum(self.k_distances > EPS_BOTTOM_LIMIT)
        self.plot.update(np.arange(len(self.k_distances)),
                         [self.k_distances],
                         colors=[QColor('red')],
                         cutpoint_x=self.cut_point,
                         selection_limit=(0, nonzero - 1))

    def _compute_kdistances(self):
        self.k_distances = get_kth_distances(
            self.data_normalized, metric=self.METRICS[self.metric_idx][1],
            k=self.min_samples
        )

    def _compute_cut_point(self):
        self.cut_point = int(DEFAULT_CUT_POINT * len(self.k_distances))
        self.eps = self.k_distances[self.cut_point]

        if self.eps < EPS_BOTTOM_LIMIT:
            self.eps = np.min(
                self.k_distances[self.k_distances >= EPS_BOTTOM_LIMIT])
            self.cut_point = self._find_nearest_dist(self.eps)

    @Inputs.data
    def set_data(self, data):
        self.Error.clear()
        if not self.check_data_size(data):
            data = None
        self.data = self.data_normalized = data
        if self.data is None:
            self.Outputs.annotated_data.send(None)
            self.plot.clear_plot()
            return

        if self.data is None:
            return

        # preprocess data
        for pp in PREPROCESSORS:
            self.data_normalized = pp(self.data_normalized)

        self._compute_and_plot()
        self.unconditional_commit()

    def send_data(self):
        model = self.model

        clusters = [c if c >= 0 else np.nan for c in model.labels]
        k = len(set(clusters) - {np.nan})
        clusters = np.array(clusters).reshape(len(self.data), 1)
        core_samples = set(model.projector.core_sample_indices_)
        in_core = np.array([1 if (i in core_samples) else 0
                            for i in range(len(self.data))])
        in_core = in_core.reshape(len(self.data), 1)

        clust_var = DiscreteVariable(
            "Cluster", values=["C%d" % (x + 1) for x in range(k)])
        in_core_var = DiscreteVariable("DBSCAN Core", values=["0", "1"])

        domain = self.data.domain
        attributes, classes = domain.attributes, domain.class_vars
        meta_attrs = domain.metas
        x, y, metas = self.data.X, self.data.Y, self.data.metas

        meta_attrs += (clust_var, )
        metas = np.hstack((metas, clusters))
        meta_attrs += (in_core_var, )
        metas = np.hstack((metas, in_core))

        domain = Domain(attributes, classes, meta_attrs)
        new_table = Table(domain, x, y, metas, self.data.W)

        self.Outputs.annotated_data.send(new_table)

    def _invalidate(self):
        self.commit()

    def _find_nearest_dist(self, value):
        array = np.asarray(self.k_distances)
        idx = (np.abs(array - value)).argmin()
        return idx

    def _eps_changed(self):
        # find the closest value to eps
        if self.data is None:
            return
        self.cut_point = self._find_nearest_dist(self.eps)
        self.plot.set_cut_point(self.cut_point)
        self._invalidate()

    def _metirc_changed(self):
        if self.data is not None:
            self._compute_and_plot()
            self._invalidate()

    def _on_cut_changed(self, value):
        # cut changed by means of a cut line over the scree plot.
        self.cut_point = value
        self.eps = self.k_distances[value]

        self.commit()

    def _min_samples_changed(self):
        if self.data is None:
            return
        self._compute_and_plot(cut_point=self.cut_point)
        self._invalidate()