Ejemplo n.º 1
0
 def start(self):
     if 'Abort' in self.start_button.text():
         self.rpca.abort()
         self.__timer.stop()
         self.start_button.setText("Start remote computation")
     else:
         self.address = self.addresstext.text()
         with remote.server(self.address):
             from Orange.projection.pca import RemotePCA
             maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
             self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
         self.update_model()
         self.start_button.setText("Abort remote computation")
Ejemplo n.º 2
0
 def test_PCA(self):
     iris_v = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']
     table = SqlTable(self.conn,
                      self.iris,
                      type_hints=Domain([],
                                        DiscreteVariable("iris",
                                                         values=iris_v)))
     for batch_size in (50, 500):
         rpca = RemotePCA(table, batch_size, 20)
         self.assertEqual(rpca.components_.shape, (4, 4))
Ejemplo n.º 3
0
 def start(self):
     if 'Abort' in self.start_button.text():
         self.rpca.abort()
         self.__timer.stop()
         self.start_button.setText("Start remote computation")
     else:
         self.address = self.addresstext.text()
         with remote.server(self.address):
             from Orange.projection.pca import RemotePCA
             maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
             self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
         self.update_model()
         self.start_button.setText("Abort remote computation")
Ejemplo n.º 4
0
 def test_PCA(self):
     table = SqlTable(connection_params(),
                      'iris',
                      type_hints=Domain([],
                                        DiscreteVariable(
                                            "iris",
                                            values=[
                                                'Iris-setosa',
                                                'Iris-virginica',
                                                'Iris-versicolor'
                                            ])))
     for batch_size in (50, 500):
         rpca = RemotePCA(table, batch_size, 10)
         self.assertEqual(rpca.components_.shape, (4, 4))
Ejemplo n.º 5
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Table, "set_data")]
    outputs = [("Transformed data", Table),
               ("Components", Table),
               ("PCA", PCA)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.Setting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        self._pca_projector = PCA()
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = PCA.preprocessors

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False
        )
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False
        )
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(
            self.sampling_box, self, "batch_size", 50, 100000, step=50,
            keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        gui.checkBox(self.options_box, self, "normalize", "Normalize data",
                     callback=self._update_normalize)
        self.maxp_spin = gui.spin(
            self.options_box, self, "maxp", 1, 100,
            label="Show only first", callback=self._setup_plot,
            keyboardTracking=False
        )

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        self.information(0)
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information(0, "Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        self.data = data
        self.fit()

    def fit(self):
        self.clear()
        self.start_button.setEnabled(False)
        if self.data is None:
            return
        data = self.data
        self._transformed = None
        if isinstance(data, SqlTable): # data was big and remote available
            self.sampling_box.setVisible(True)
            self.start_button.setText("Start remote computation")
            self.start_button.setEnabled(True)
        else:
            self.sampling_box.setVisible(False)
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)
            self.components_spin.setRange(0, len(cumulative))

            self._pca = pca
            self._variance_ratio = variance_ratio
            self._cumulative = cumulative
            self._setup_plot()

            self.unconditional_commit()

    def clear(self):
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot_horlabels = []
        self.plot_horlines = []
        self.plot.clear()

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        self.plot.clear()
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p), explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(
            angle=90, pos=cutpos, movable=True, bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        self.plot_horlines = (
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)),
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)))
        self.plot_horlabels = (
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 0)),
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(self.plot_horlines, self.plot_horlabels,
                                      (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            self.variance_covered = self._cumulative[components - 1] * 100

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        self.variance_covered = self._cumulative[cut - 1] * 100

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            self.variance_covered = var_max * 100
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p-1)//(self.axis_labels-1), 1)
        axis.setTicks([[(i, str(i+1)) for i in range(0, p, d)]])

    def commit(self):
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(
                transformed.domain.attributes[:self.ncomponents],
                self.data.domain.class_vars,
                self.data.domain.metas
            )
            transformed = transformed.from_table(domain, transformed)
            dom = Domain(self._pca.orig_domain.attributes,
                         metas=[StringVariable(name='component')])
            metas = numpy.array([['PC{}'.format(i + 1)
                                  for i in range(self.ncomponents)]],
                                dtype=object).T
            components = Table(dom, self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.send("Transformed data", transformed)
        self.send("Components", components)
        self.send("PCA", self._pca_projector)

    def send_report(self):
        if self.data is None:
            return
        self.report_items((
            ("Selected components", self.ncomponents),
            ("Explained variance", "{:.3f} %".format(self.variance_covered))
        ))
        self.report_plot()
Ejemplo n.º 6
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed data", Table)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)

    settingsHandler = settings.DomainContextHandler()

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.ContextSetting(True)
    decomposition_idx = settings.ContextSetting(0)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")
        sparse_data = widget.Msg("Sparse data is not supported")

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            1,
            MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box,
                                   self,
                                   "batch_size",
                                   50,
                                   100000,
                                   step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box,
            self,
            "Start remote computation",
            callback=self.start,
            autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box,
                     self,
                     "auto_update",
                     "Periodically fetch model",
                     callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Decomposition
        self.decomposition_box = gui.radioButtons(
            self.controlArea,
            self,
            "decomposition_idx", [d.name for d in DECOMPOSITIONS],
            box="Decomposition",
            callback=self._update_decomposition)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(self.options_box,
                                          self,
                                          "normalize",
                                          "Normalize data",
                                          callback=self._update_normalize)

        self.maxp_spin = gui.spin(self.options_box,
                                  self,
                                  "maxp",
                                  1,
                                  MAX_COMPONENTS,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Apply",
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def update_buttons(self, sparse_data=False):
        if sparse_data:
            self.normalize = False

        buttons = self.decomposition_box.buttons
        for cls, button in zip(DECOMPOSITIONS, buttons):
            button.setDisabled(sparse_data and not cls.supports_sparse)

        if not buttons[self.decomposition_idx].isEnabled():
            # Set decomposition index to first sparse-enabled decomposition
            for i, cls in enumerate(DECOMPOSITIONS):
                if cls.supports_sparse:
                    self.decomposition_idx = i
                    break

        self._init_projector()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    @Inputs.data
    def set_data(self, data):
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.start_button.setEnabled(False)
        self.information()
        self.data = None
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
            else:  # data was big and remote available
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
        if not isinstance(data, SqlTable):
            self.sampling_box.setVisible(False)

        if isinstance(data, Table):
            if len(data.domain.attributes) == 0:
                self.Error.no_features()
                self.clear_outputs()
                return
            if len(data) == 0:
                self.Error.no_instances()
                self.clear_outputs()
                return

        self.openContext(data)
        sparse_data = data is not None and data.is_sparse()
        self.normalize_box.setDisabled(sparse_data)
        self.update_buttons(sparse_data=sparse_data)

        self.data = data
        self.fit()

    def fit(self):
        self.clear()
        self.Warning.trivial_components.clear()
        if self.data is None:
            return
        data = self.data
        if not isinstance(data, SqlTable):
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)

            if numpy.isfinite(cumulative[-1]):
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()
            else:
                self.Warning.trivial_components()

            self.unconditional_commit()

    def clear(self):
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot_horlabels = []
        self.plot_horlines = []
        self.plot.clear()

    def clear_outputs(self):
        self.Outputs.transformed_data.send(None)
        self.Outputs.components.send(None)
        self.Outputs.pca.send(self._pca_projector)

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        self.plot.clear()
        if self._pca is None:
            return

        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p),
                       explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p),
                       explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(angle=90,
                                     pos=cutpos,
                                     movable=True,
                                     bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        self.plot_horlines = (
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)),
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)))
        self.plot_horlabels = (pg.TextItem(color=QColor(Qt.black),
                                           anchor=(1, 0)),
                               pg.TextItem(color=QColor(Qt.black),
                                           anchor=(1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(
                self.plot_horlines, self.plot_horlabels,
            (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0
                and components == len(self._variance_ratio)):
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            var = self._cumulative[components - 1]
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        cls = DECOMPOSITIONS[self.decomposition_idx]
        self._pca_projector = cls(n_components=MAX_COMPONENTS)
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = cls.preprocessors

    def _update_decomposition(self):
        self._init_projector()
        self._update_normalize()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(transformed.domain.attributes[:self.ncomponents],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            dom = Domain([
                ContinuousVariable(a.name)
                for a in self._pca.orig_domain.attributes
            ],
                         metas=[StringVariable(name='component')])
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(self.ncomponents)]],
                dtype=object).T
            components = Table(dom,
                               self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.pca.send(self._pca_projector)

    def send_report(self):
        if self.data is None:
            return
        self.report_items(
            (("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name),
             ("Normalize data", str(self.normalize)), ("Selected components",
                                                       self.ncomponents),
             ("Explained variance", "{:.3f} %".format(self.variance_covered))))
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        if "variance_covered" in settings:
            # Due to the error in gh-1896 the variance_covered was persisted
            # as a NaN value, causing a TypeError in the widgets `__init__`.
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                if numpy.isfinite(vc):
                    vc = int(vc)
                else:
                    vc = 100
                settings["variance_covered"] = vc
        if settings["ncomponents"] > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS
Ejemplo n.º 7
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050
    keywords = ["principal component analysis", "linear transformation"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed data", Table)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)
        preprocessor = Output("Preprocessor", Preprocess)

    settingsHandler = settings.DomainContextHandler()

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.ContextSetting(True)
    decomposition_idx = settings.ContextSetting(0)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")
        sparse_data = widget.Msg("Sparse data is not supported")

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 1, MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False
        )
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False
        )
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(
            self.sampling_box, self, "batch_size", 50, 100000, step=50,
            keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Decomposition
        self.decomposition_box = gui.radioButtons(
            self.controlArea, self,
            "decomposition_idx", [d.name for d in DECOMPOSITIONS],
            box="Decomposition", callback=self._update_decomposition
        )

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(
            self.options_box, self, "normalize",
            "Normalize data", callback=self._update_normalize
        )

        self.maxp_spin = gui.spin(
            self.options_box, self, "maxp", 1, MAX_COMPONENTS,
            label="Show only first", callback=self._setup_plot,
            keyboardTracking=False
        )

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def update_buttons(self, sparse_data=False):
        if sparse_data:
            self.normalize = False

        buttons = self.decomposition_box.buttons
        for cls, button in zip(DECOMPOSITIONS, buttons):
            button.setDisabled(sparse_data and not cls.supports_sparse)

        if not buttons[self.decomposition_idx].isEnabled():
            # Set decomposition index to first sparse-enabled decomposition
            for i, cls in enumerate(DECOMPOSITIONS):
                if cls.supports_sparse:
                    self.decomposition_idx = i
                    break

        self._init_projector()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    @Inputs.data
    def set_data(self, data):
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.start_button.setEnabled(False)
        self.information()
        self.data = None
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
            else:       # data was big and remote available
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
        if not isinstance(data, SqlTable):
            self.sampling_box.setVisible(False)

        if isinstance(data, Table):
            if len(data.domain.attributes) == 0:
                self.Error.no_features()
                self.clear_outputs()
                return
            if len(data) == 0:
                self.Error.no_instances()
                self.clear_outputs()
                return

        self.openContext(data)
        sparse_data = data is not None and data.is_sparse()
        self.normalize_box.setDisabled(sparse_data)
        self.update_buttons(sparse_data=sparse_data)

        self.data = data
        self.fit()

    def fit(self):
        self.clear()
        self.Warning.trivial_components.clear()
        if self.data is None:
            return
        data = self.data
        self._pca_projector.preprocessors = \
            self._pca_preprocessors + ([Normalize()] if self.normalize else [])
        if not isinstance(data, SqlTable):
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)

            if numpy.isfinite(cumulative[-1]):
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()
            else:
                self.Warning.trivial_components()

            self.unconditional_commit()

    def clear(self):
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot_horlabels = []
        self.plot_horlines = []
        self.plot.clear()

    def clear_outputs(self):
        self.Outputs.transformed_data.send(None)
        self.Outputs.components.send(None)
        self.Outputs.pca.send(self._pca_projector)
        self.Outputs.preprocessor.send(None)

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        self.plot.clear()
        if self._pca is None:
            return

        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p), explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(
            angle=90, pos=cutpos, movable=True, bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        self.plot_horlines = (
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)),
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)))
        self.plot_horlabels = (
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 0)),
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(self.plot_horlines, self.plot_horlabels,
                                      (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            var = self._cumulative[components - 1]
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        cls = DECOMPOSITIONS[self.decomposition_idx]
        self._pca_projector = cls(n_components=MAX_COMPONENTS)
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = cls.preprocessors

    def _update_decomposition(self):
        self._init_projector()
        self._update_normalize()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p-1)//(self.axis_labels-1), 1)
        axis.setTicks([[(i, str(i+1)) for i in range(0, p, d)]])

    def commit(self):
        transformed = components = pp = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(
                transformed.domain.attributes[:self.ncomponents],
                self.data.domain.class_vars,
                self.data.domain.metas
            )
            transformed = transformed.from_table(domain, transformed)
            # prevent caching new features by defining compute_value
            dom = Domain([ContinuousVariable(a.name, compute_value=lambda _: None)
                          for a in self._pca.orig_domain.attributes],
                         metas=[StringVariable(name='component')])
            metas = numpy.array([['PC{}'.format(i + 1)
                                  for i in range(self.ncomponents)]],
                                dtype=object).T
            components = Table(dom, self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

            pp = ApplyDomain(domain, "PCA")

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.pca.send(self._pca_projector)
        self.Outputs.preprocessor.send(pp)

    def send_report(self):
        if self.data is None:
            return
        self.report_items((
            ("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name),
            ("Normalize data", str(self.normalize)),
            ("Selected components", self.ncomponents),
            ("Explained variance", "{:.3f} %".format(self.variance_covered))
        ))
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        if "variance_covered" in settings:
            # Due to the error in gh-1896 the variance_covered was persisted
            # as a NaN value, causing a TypeError in the widgets `__init__`.
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                if numpy.isfinite(vc):
                    vc = int(vc)
                else:
                    vc = 100
                settings["variance_covered"] = vc
        if settings.get("ncomponents", 0) > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS
Ejemplo n.º 8
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Table, "set_data")]
    outputs = [("Transformed data", Table), ("Components", Table),
               ("PCA", PCA)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.Setting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        self._pca_projector = PCA()
        self._pca_projector.component = 0
        self._pca_preprocessors = PCA.preprocessors

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            0,
            1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box,
                                   self,
                                   "batch_size",
                                   50,
                                   100000,
                                   step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box,
            self,
            "Start remote computation",
            callback=self.start,
            autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box,
                     self,
                     "auto_update",
                     "Periodically fetch model",
                     callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        gui.checkBox(self.options_box,
                     self,
                     "normalize",
                     "Normalize data",
                     callback=self._update_normalize)
        self.maxp_spin = gui.spin(self.options_box,
                                  self,
                                  "maxp",
                                  1,
                                  100,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        self.information(0)
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information(0, "Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        self.data = data
        self.fit()

    def fit(self):
        self.clear()
        self.start_button.setEnabled(False)
        if self.data is None:
            return
        data = self.data
        self._transformed = None
        if isinstance(data, SqlTable):  # data was big and remote available
            self.sampling_box.setVisible(True)
            self.start_button.setText("Start remote computation")
            self.start_button.setEnabled(True)
        else:
            self.sampling_box.setVisible(False)
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)
            self.components_spin.setRange(0, len(cumulative))

            self._pca = pca
            self._variance_ratio = variance_ratio
            self._cumulative = cumulative
            self._setup_plot()

            self.unconditional_commit()

    def clear(self):
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        self.plot.clear()
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p),
                       explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p),
                       explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(angle=90,
                                     pos=self._nselected_components() - 1,
                                     movable=True,
                                     bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = line.value()
        self._line.setValue(round(value))
        current = self._nselected_components()
        components = int(numpy.floor(value)) + 1

        if not (self.ncomponents == 0
                and components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            self.variance_covered = self._cumulative[components - 1] * 100

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        self.variance_covered = self._cumulative[cut - 1] * 100

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0)
        self.ncomponents = cut + 1
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            self.variance_covered = var_max * 100
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(transformed.domain.attributes[:self.ncomponents],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            dom = Domain(self._pca.orig_domain.attributes,
                         metas=[StringVariable(name='component')])
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(self.ncomponents)]],
                dtype=object).T
            components = Table(dom,
                               self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)
        self.send("PCA", self._pca_projector)

    def send_report(self):
        if self.data is None:
            return
        self.report_items(
            (("Selected components", self.ncomponents),
             ("Explained variance", "{:.3f} %".format(self.variance_covered))))
        self.report_plot()
Ejemplo n.º 9
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis"
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Orange.data.Table, "set_data")]
    outputs = [("Transformed data", Orange.data.Table),
               ("Components", Orange.data.Table)]
    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('localhost:9465')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False

        box = gui.widgetBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            0,
            1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        self.sampling_box = gui.widgetBox(self.controlArea,
                                          "Incremental learning")

        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box,
                                   self,
                                   "batch_size",
                                   50,
                                   100000,
                                   step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box,
            self,
            "Start remote computation",
            callback=self.start,
            autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box,
                     self,
                     "auto_update",
                     "Periodically fetch model",
                     callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)
        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.address, self.batch_size,
                                      int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        self.clear()
        self.data = data

        self.start_button.setEnabled(False)
        if data is not None:
            self._transformed = None
            if remotely and isinstance(data, SqlTable):
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
            else:
                self.sampling_box.setVisible(False)
                pca = Orange.projection.PCA()
                pca = pca(self.data)
                variance_ratio = pca.explained_variance_ratio_
                cumulative = numpy.cumsum(variance_ratio)
                self.components_spin.setRange(0, len(cumulative))

                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()

                self.unconditional_commit()

    def clear(self):
        self.data = None
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self.plot.clear()
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        (p, ) = explained.shape

        self.plot.plot(numpy.arange(p),
                       explained_ratio,
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p),
                       explained,
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(angle=90,
                                     pos=self._nselected_components() - 1,
                                     movable=True,
                                     bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=1.5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        axis = self.plot.getAxis("bottom")
        axis.setTicks([[(i, "C{}".format(i + 1)) for i in range(p)]])

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = line.value()
        current = self._nselected_components()
        components = int(numpy.floor(value)) + 1

        if not (self.ncomponents == 0
                and components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            self.variance_covered = self._cumulative[components - 1] * 100

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        self.variance_covered = self._cumulative[cut - 1] * 100

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0)
        self.ncomponents = cut + 1
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            self.variance_covered = var_max * 100
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def commit(self):
        transformed = components = None
        if self._pca is not None:
            components = self._pca.components_
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Orange.data.Domain(
                transformed.domain.attributes[:self.ncomponents],
                self.data.domain.class_vars, self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            components = Orange.data.Table.from_numpy(None, components)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)
Ejemplo n.º 10
0
class OWPCA(widget.OWWidget):
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Orange.data.Table, "set_data")]
    outputs = [("Transformed data", Orange.data.Table),
               ("Components", Orange.data.Table)]
    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False

        box = gui.widgetBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False
        )
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False
        )
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        self.sampling_box = gui.widgetBox(self.controlArea,
                                          "Incremental learning")

        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(
            self.sampling_box, self, "batch_size", 50, 100000, step=50,
            keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)
        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)

    def update_model(self):
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.address,
                                      self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        self.clear()
        self.data = data

        self.start_button.setEnabled(False)
        if data is not None:
            self._transformed = None
            if remotely and isinstance(data, SqlTable):
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
            else:
                self.sampling_box.setVisible(False)
                pca = Orange.projection.PCA()
                pca = pca(self.data)
                variance_ratio = pca.explained_variance_ratio_
                cumulative = numpy.cumsum(variance_ratio)
                self.components_spin.setRange(0, len(cumulative))

                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()

                self.unconditional_commit()

    def clear(self):
        self.data = None
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self.plot.clear()
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        (p, ) = explained.shape

        self.plot.plot(numpy.arange(p), explained_ratio,
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained,
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(
            angle=90, pos=self._nselected_components() - 1, movable=True,
            bounds=(0, p - 1)
        )
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=1.5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        axis = self.plot.getAxis("bottom")
        axis.setTicks([[(i, "C{}".format(i + 1)) for i in range(p)]])

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = line.value()
        current = self._nselected_components()
        components = int(numpy.floor(value)) + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            self.variance_covered = self._cumulative[components - 1] * 100

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        self.variance_covered = self._cumulative[cut - 1] * 100

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0)
        self.ncomponents = cut + 1
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            self.variance_covered = var_max * 100
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def commit(self):
        transformed = components = None
        if self._pca is not None:
            components = self._pca.components_
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Orange.data.Domain(
                transformed.domain.attributes[:self.ncomponents],
                self.data.domain.class_vars,
                self.data.domain.metas
            )
            transformed = transformed.from_table(domain, transformed)
            components = Orange.data.Table.from_numpy(None, components)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)