def start(self):
    """Toggle the remote computation from the start/abort button.

    When the button label contains 'Abort', the running ``self.rpca`` is
    aborted, the polling timer is stopped and the label is reset.
    Otherwise the address typed into ``self.addresstext`` is stored, a new
    ``RemotePCA`` is created inside the ``remote.server`` context, the
    model is fetched once via ``update_model`` and the label switches to
    the abort caption.
    """
    button = self.start_button
    if 'Abort' not in button.text():
        self.address = self.addresstext.text()
        with remote.server(self.address):
            # Imported here, inside the server context, as in the original.
            from Orange.projection.pca import RemotePCA
            # Iteration budget derived from the approximate row count and
            # the batch size; truncated to int when passed on.
            iterations = \
                (1e5 + self.data.approx_len()) / self.batch_size * 3
            self.rpca = RemotePCA(
                self.data, self.batch_size, int(iterations))
        self.update_model()
        button.setText("Abort remote computation")
        return

    # A computation is in progress: cancel it and stop polling.
    self.rpca.abort()
    self.__timer.stop()
    button.setText("Start remote computation")
def test_PCA(self):
    """RemotePCA on the SQL iris table yields a 4x4 components matrix.

    The check is repeated for a batch size of 50 and of 500, each with 20
    as the third RemotePCA argument.
    """
    class_values = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']
    connection, table_name = self.conn, self.iris
    hints = Domain([], DiscreteVariable("iris", values=class_values))
    table = SqlTable(connection, table_name, type_hints=hints)

    expected_shape = (4, 4)
    for size in (50, 500):
        model = RemotePCA(table, size, 20)
        self.assertEqual(model.components_.shape, expected_shape)
def test_PCA(self):
    """RemotePCA on the SQL 'iris' table yields a 4x4 components matrix.

    The check is repeated for a batch size of 50 and of 500, each with 10
    as the third RemotePCA argument.
    """
    params = connection_params()
    class_values = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']
    hints = Domain([], DiscreteVariable("iris", values=class_values))
    table = SqlTable(params, 'iris', type_hints=hints)

    expected_shape = (4, 4)
    for size in (50, 500):
        model = RemotePCA(table, size, 10)
        self.assertEqual(model.components_.shape, expected_shape)
class OWPCA(widget.OWWidget):
    """PCA widget with a scree plot and an interactive component cut.

    Receives a data table on the "Data" input, fits ``PCA`` (optionally
    preceded by ``Normalize``), plots the explained and cumulative variance
    ratios and sends the transformed data, the component loadings and the
    projector.  For ``SqlTable`` inputs that are not downloaded locally, a
    ``RemotePCA`` can be started on the address entered by the user and is
    polled with a timer.
    """

    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Table, "set_data")]
    outputs = [("Transformed data", Table),
               ("Components", Table),
               ("PCA", PCA)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.Setting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    def __init__(self):
        """Create the controls and the scree plot, then run an empty fit."""
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        # Remote model handle; created by start().  Must exist up front
        # because update_model()/get_model() read it from GUI callbacks.
        self.rpca = None
        self._pca_projector = PCA()
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = PCA.preprocessors

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False
        )
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False
        )
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(
            self.sampling_box, self, "batch_size", 50, 100000, step=50,
            keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        gui.checkBox(self.options_box, self, "normalize", "Normalize data",
                     callback=self._update_normalize)

        self.maxp_spin = gui.spin(
            self.options_box, self, "maxp", 1, 100,
            label="Show only first", callback=self._setup_plot,
            keyboardTracking=False
        )

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        """Fetch the remote model now and (re)arm or stop the poll timer."""
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        """Abort the running remote computation or start a new one.

        The current mode is read from the start button's label.
        """
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        """Input handler: store ``data`` and refit.

        An ``SqlTable`` with fewer than ``AUTO_DL_LIMIT`` approximate rows
        is converted to a ``Table``; a larger one is sampled and partially
        downloaded when ``remotely`` is false, and is kept as an
        ``SqlTable`` otherwise.
        """
        self.information(0)
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information(0, "Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        self.data = data
        self.fit()

    def fit(self):
        """Reset the state and fit the projector on the current data.

        For ``SqlTable`` data nothing is fitted locally; the remote
        controls are shown and enabled instead.
        """
        self.clear()
        self.start_button.setEnabled(False)
        if self.data is None:
            return
        data = self.data
        self._transformed = None
        if isinstance(data, SqlTable):  # data was big and remote available
            self.sampling_box.setVisible(True)
            self.start_button.setText("Start remote computation")
            self.start_button.setEnabled(True)
        else:
            self.sampling_box.setVisible(False)
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)
            self.components_spin.setRange(0, len(cumulative))

            self._pca = pca
            self._variance_ratio = variance_ratio
            self._cumulative = cumulative
            self._setup_plot()

        self.unconditional_commit()

    def clear(self):
        """Drop the fitted model, cached transform and all plot items."""
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot_horlabels = []
        self.plot_horlines = []
        self.plot.clear()

    def get_model(self):
        """Pull the current state from the remote PCA, replot and commit.

        Does nothing when no remote computation has been started.
        """
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Redraw the variance curves, the cut line and its guide lines."""
        self.plot.clear()
        if self._pca is None:
            # Also reached as the "maxp" spin callback before any model
            # exists; there is nothing to draw then.
            return
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p), explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(
            angle=90, pos=cutpos, movable=True, bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        self.plot_horlines = (
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)),
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)))
        self.plot_horlabels = (
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 0)),
            pg.TextItem(color=QColor(Qt.black), anchor=(1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        """Move the two horizontal guides and labels to the current cut."""
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(self.plot_horlines, self.plot_horlabels,
                                      (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            var = self._cumulative[components - 1]
            # Only store finite, integral percentages in the setting.
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        # Only store finite, integral percentages in the setting.
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        # searchsorted may return len(cumulative) when the requested
        # variance is above the last cumulative value; keep cut in range.
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        """Rebuild the projector's preprocessors and refit."""
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            # Only store finite, integral percentages in the setting.
            if numpy.isfinite(var_max):
                self.variance_covered = int(var_max * 100)
        else:
            # Clamp so that cut - 1 is always a valid component index.
            self.ncomponents = cut = min(
                numpy.searchsorted(
                    self._cumulative, self.variance_covered / 100.0) + 1,
                len(self._cumulative))
        return cut

    def _invalidate_selection(self):
        """Re-send the outputs after the component selection changed."""
        self.commit()

    def _update_axis(self):
        """Set bottom-axis ticks, thinning them to about ``axis_labels``."""
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        """Send the transformed data, the components table and the projector.

        The two table outputs are ``None`` when no model is fitted.
        """
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(
                transformed.domain.attributes[:self.ncomponents],
                self.data.domain.class_vars,
                self.data.domain.metas
            )
            transformed = transformed.from_table(domain, transformed)
            dom = Domain(self._pca.orig_domain.attributes,
                         metas=[StringVariable(name='component')])
            metas = numpy.array([['PC{}'.format(i + 1)
                                  for i in range(self.ncomponents)]],
                                dtype=object).T
            components = Table(dom, self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.send("Transformed data", transformed)
        self.send("Components", components)
        self.send("PCA", self._pca_projector)

    def send_report(self):
        """Add the selection summary and the scree plot to the report."""
        if self.data is None:
            return
        self.report_items((
            ("Selected components", self.ncomponents),
            ("Explained variance", "{:.3f} %".format(self.variance_covered))
        ))
        self.report_plot()
class OWPCA(widget.OWWidget):
    """PCA widget with a scree plot and an interactive component cut.

    Receives a data table, fits the decomposition selected from
    ``DECOMPOSITIONS`` (optionally preceded by ``Normalize``), plots the
    explained and cumulative variance ratios and sends the transformed
    data, the component loadings and the projector.  For ``SqlTable``
    inputs that are not downloaded locally, a ``RemotePCA`` can be started
    on the address entered by the user and is polled with a timer.
    """

    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed data", Table)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)

    settingsHandler = settings.DomainContextHandler()

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.ContextSetting(True)
    decomposition_idx = settings.ContextSetting(0)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")
        sparse_data = widget.Msg("Sparse data is not supported")

    def __init__(self):
        """Create the controls and the scree plot, then run an empty fit."""
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        # Remote model handle; created by start().  Must exist up front
        # because update_model()/get_model() read it from GUI callbacks.
        self.rpca = None
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 1, MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box, self, "batch_size",
                                   50, 100000, step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Decomposition
        self.decomposition_box = gui.radioButtons(
            self.controlArea, self, "decomposition_idx",
            [d.name for d in DECOMPOSITIONS],
            box="Decomposition", callback=self._update_decomposition)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(self.options_box, self, "normalize",
                                          "Normalize data",
                                          callback=self._update_normalize)

        self.maxp_spin = gui.spin(self.options_box, self, "maxp",
                                  1, MAX_COMPONENTS,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        """Fetch the remote model now and (re)arm or stop the poll timer."""
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def update_buttons(self, sparse_data=False):
        """Enable only the decompositions usable for the current data.

        For sparse data, normalization is switched off, decompositions
        without ``supports_sparse`` are disabled and, if the selected one
        became disabled, the first sparse-capable one is selected.  The
        projector is re-created afterwards.
        """
        if sparse_data:
            self.normalize = False

        buttons = self.decomposition_box.buttons
        for cls, button in zip(DECOMPOSITIONS, buttons):
            button.setDisabled(sparse_data and not cls.supports_sparse)

        if not buttons[self.decomposition_idx].isEnabled():
            # Set decomposition index to first sparse-enabled decomposition
            for i, cls in enumerate(DECOMPOSITIONS):
                if cls.supports_sparse:
                    self.decomposition_idx = i
                    break

        self._init_projector()

    def start(self):
        """Abort the running remote computation or start a new one.

        The current mode is read from the start button's label.
        """
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    @Inputs.data
    def set_data(self, data):
        """Input handler: validate ``data``, restore context and refit.

        An ``SqlTable`` with fewer than ``AUTO_DL_LIMIT`` approximate rows
        is converted to a ``Table``; a larger one is sampled and partially
        downloaded when ``remotely`` is false, otherwise the remote
        controls are enabled.  Tables without features or without
        instances raise the matching error message and clear the outputs.
        """
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.start_button.setEnabled(False)
        self.information()
        self.data = None
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
            else:  # data was big and remote available
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
        if not isinstance(data, SqlTable):
            self.sampling_box.setVisible(False)

        if isinstance(data, Table):
            if len(data.domain.attributes) == 0:
                self.Error.no_features()
                self.clear_outputs()
                return
            if len(data) == 0:
                self.Error.no_instances()
                self.clear_outputs()
                return

        self.openContext(data)
        sparse_data = data is not None and data.is_sparse()
        self.normalize_box.setDisabled(sparse_data)
        self.update_buttons(sparse_data=sparse_data)

        self.data = data
        self.fit()

    def fit(self):
        """Reset the state and fit the projector on the current data.

        Nothing is fitted locally for ``SqlTable`` data.  When the
        cumulative explained variance is not finite, the
        ``trivial_components`` warning is shown and no model is kept.
        """
        self.clear()
        self.Warning.trivial_components.clear()
        if self.data is None:
            return
        data = self.data
        if not isinstance(data, SqlTable):
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)

            if numpy.isfinite(cumulative[-1]):
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()
            else:
                self.Warning.trivial_components()

            self.unconditional_commit()

    def clear(self):
        """Drop the fitted model, cached transform and all plot items."""
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot_horlabels = []
        self.plot_horlines = []
        self.plot.clear()

    def clear_outputs(self):
        """Send ``None`` on the table outputs and the bare projector."""
        self.Outputs.transformed_data.send(None)
        self.Outputs.components.send(None)
        self.Outputs.pca.send(self._pca_projector)

    def get_model(self):
        """Pull the current state from the remote PCA, replot and commit.

        Does nothing when no remote computation has been started.
        """
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Redraw the variance curves, the cut line and its guide lines."""
        self.plot.clear()
        if self._pca is None:
            return

        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p), explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(angle=90, pos=cutpos, movable=True,
                                     bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        self.plot_horlines = (
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)),
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)))
        self.plot_horlabels = (pg.TextItem(color=QColor(Qt.black),
                                           anchor=(1, 0)),
                               pg.TextItem(color=QColor(Qt.black),
                                           anchor=(1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        """Move the two horizontal guides and labels to the current cut."""
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(
                self.plot_horlines, self.plot_horlabels,
                (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            var = self._cumulative[components - 1]
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        """Rebuild the projector's preprocessors and refit."""
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        """Instantiate the selected decomposition as the projector."""
        cls = DECOMPOSITIONS[self.decomposition_idx]
        self._pca_projector = cls(n_components=MAX_COMPONENTS)
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = cls.preprocessors

    def _update_decomposition(self):
        """Radio-button callback: switch the decomposition and refit."""
        self._init_projector()
        self._update_normalize()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            self.variance_covered = int(var_max * 100)
        else:
            # Clamp as in _update_selection_variance_spin so that cut - 1
            # is always a valid component index.
            self.ncomponents = cut = min(
                numpy.searchsorted(
                    self._cumulative, self.variance_covered / 100.0) + 1,
                len(self._cumulative))
        return cut

    def _invalidate_selection(self):
        """Re-send the outputs after the component selection changed."""
        self.commit()

    def _update_axis(self):
        """Set bottom-axis ticks, thinning them to about ``axis_labels``."""
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        """Send the transformed data, the components table and the projector.

        The two table outputs are ``None`` when no model is fitted.
        """
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components)
                # only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            domain = Domain(transformed.domain.attributes[:self.ncomponents],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            dom = Domain(
                [ContinuousVariable(a.name)
                 for a in self._pca.orig_domain.attributes],
                metas=[StringVariable(name='component')])
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(self.ncomponents)]],
                dtype=object).T
            components = Table(dom, self._pca.components_[:self.ncomponents],
                               metas=metas)
            components.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.pca.send(self._pca_projector)

    def send_report(self):
        """Add the settings summary and the scree plot to the report."""
        if self.data is None:
            return
        self.report_items(
            (("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name),
             ("Normalize data", str(self.normalize)),
             ("Selected components", self.ncomponents),
             ("Explained variance",
              "{:.3f} %".format(self.variance_covered))))
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        """Repair stored settings in place.

        A non-finite ``variance_covered`` is reset to 100 and a finite one
        is truncated to int; ``ncomponents`` is capped at
        ``MAX_COMPONENTS`` when it is present.
        """
        if "variance_covered" in settings:
            # Due to the error in gh-1896 the variance_covered was persisted
            # as a NaN value, causing a TypeError in the widgets `__init__`.
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                if numpy.isfinite(vc):
                    vc = int(vc)
                else:
                    vc = 100
                settings["variance_covered"] = vc
        # Stored settings need not contain "ncomponents"; indexing
        # unconditionally would raise KeyError.
        if settings.get("ncomponents", 0) > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS
class OWPCA(widget.OWWidget): name = "PCA" description = "Principal component analysis with a scree-diagram." icon = "icons/PCA.svg" priority = 3050 keywords = ["principal component analysis", "linear transformation"] class Inputs: data = Input("Data", Table) class Outputs: transformed_data = Output("Transformed data", Table) components = Output("Components", Table) pca = Output("PCA", PCA, dynamic=False) preprocessor = Output("Preprocessor", Preprocess) settingsHandler = settings.DomainContextHandler() ncomponents = settings.Setting(2) variance_covered = settings.Setting(100) batch_size = settings.Setting(100) address = settings.Setting('') auto_update = settings.Setting(True) auto_commit = settings.Setting(True) normalize = settings.ContextSetting(True) decomposition_idx = settings.ContextSetting(0) maxp = settings.Setting(20) axis_labels = settings.Setting(10) graph_name = "plot.plotItem" class Warning(widget.OWWidget.Warning): trivial_components = widget.Msg( "All components of the PCA are trivial (explain 0 variance). 
" "Input data is constant (or near constant).") class Error(widget.OWWidget.Error): no_features = widget.Msg("At least 1 feature is required") no_instances = widget.Msg("At least 1 data instance is required") sparse_data = widget.Msg("Sparse data is not supported") def __init__(self): super().__init__() self.data = None self._pca = None self._transformed = None self._variance_ratio = None self._cumulative = None self._line = False self._init_projector() # Components Selection box = gui.vBox(self.controlArea, "Components Selection") form = QFormLayout() box.layout().addLayout(form) self.components_spin = gui.spin( box, self, "ncomponents", 1, MAX_COMPONENTS, callback=self._update_selection_component_spin, keyboardTracking=False ) self.components_spin.setSpecialValueText("All") self.variance_spin = gui.spin( box, self, "variance_covered", 1, 100, callback=self._update_selection_variance_spin, keyboardTracking=False ) self.variance_spin.setSuffix("%") form.addRow("Components:", self.components_spin) form.addRow("Variance covered:", self.variance_spin) # Incremental learning self.sampling_box = gui.vBox(self.controlArea, "Incremental learning") self.addresstext = QLineEdit(box) self.addresstext.setPlaceholderText('Remote server') if self.address: self.addresstext.setText(self.address) self.sampling_box.layout().addWidget(self.addresstext) form = QFormLayout() self.sampling_box.layout().addLayout(form) self.batch_spin = gui.spin( self.sampling_box, self, "batch_size", 50, 100000, step=50, keyboardTracking=False) form.addRow("Batch size ~ ", self.batch_spin) self.start_button = gui.button( self.sampling_box, self, "Start remote computation", callback=self.start, autoDefault=False, tooltip="Start/abort computation on the server") self.start_button.setEnabled(False) gui.checkBox(self.sampling_box, self, "auto_update", "Periodically fetch model", callback=self.update_model) self.__timer = QTimer(self, interval=2000) self.__timer.timeout.connect(self.get_model) 
self.sampling_box.setVisible(remotely) # Decomposition self.decomposition_box = gui.radioButtons( self.controlArea, self, "decomposition_idx", [d.name for d in DECOMPOSITIONS], box="Decomposition", callback=self._update_decomposition ) # Options self.options_box = gui.vBox(self.controlArea, "Options") self.normalize_box = gui.checkBox( self.options_box, self, "normalize", "Normalize data", callback=self._update_normalize ) self.maxp_spin = gui.spin( self.options_box, self, "maxp", 1, MAX_COMPONENTS, label="Show only first", callback=self._setup_plot, keyboardTracking=False ) self.controlArea.layout().addStretch() gui.auto_commit(self.controlArea, self, "auto_commit", "Apply", checkbox_label="Apply automatically") self.plot = pg.PlotWidget(background="w") axis = self.plot.getAxis("bottom") axis.setLabel("Principal Components") axis = self.plot.getAxis("left") axis.setLabel("Proportion of variance") self.plot_horlabels = [] self.plot_horlines = [] self.plot.getViewBox().setMenuEnabled(False) self.plot.getViewBox().setMouseEnabled(False, False) self.plot.showGrid(True, True, alpha=0.5) self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0)) self.mainArea.layout().addWidget(self.plot) self._update_normalize() def update_model(self): self.get_model() if self.auto_update and self.rpca and not self.rpca.ready(): self.__timer.start(2000) else: self.__timer.stop() def update_buttons(self, sparse_data=False): if sparse_data: self.normalize = False buttons = self.decomposition_box.buttons for cls, button in zip(DECOMPOSITIONS, buttons): button.setDisabled(sparse_data and not cls.supports_sparse) if not buttons[self.decomposition_idx].isEnabled(): # Set decomposition index to first sparse-enabled decomposition for i, cls in enumerate(DECOMPOSITIONS): if cls.supports_sparse: self.decomposition_idx = i break self._init_projector() def start(self): if 'Abort' in self.start_button.text(): self.rpca.abort() self.__timer.stop() self.start_button.setText("Start remote computation") 
else: self.address = self.addresstext.text() with remote.server(self.address): from Orange.projection.pca import RemotePCA maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3 self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter)) self.update_model() self.start_button.setText("Abort remote computation") @Inputs.data def set_data(self, data): self.closeContext() self.clear_messages() self.clear() self.start_button.setEnabled(False) self.information() self.data = None if isinstance(data, SqlTable): if data.approx_len() < AUTO_DL_LIMIT: data = Table(data) elif not remotely: self.information("Data has been sampled") data_sample = data.sample_time(1, no_cache=True) data_sample.download_data(2000, partial=True) data = Table(data_sample) else: # data was big and remote available self.sampling_box.setVisible(True) self.start_button.setText("Start remote computation") self.start_button.setEnabled(True) if not isinstance(data, SqlTable): self.sampling_box.setVisible(False) if isinstance(data, Table): if len(data.domain.attributes) == 0: self.Error.no_features() self.clear_outputs() return if len(data) == 0: self.Error.no_instances() self.clear_outputs() return self.openContext(data) sparse_data = data is not None and data.is_sparse() self.normalize_box.setDisabled(sparse_data) self.update_buttons(sparse_data=sparse_data) self.data = data self.fit() def fit(self): self.clear() self.Warning.trivial_components.clear() if self.data is None: return data = self.data self._pca_projector.preprocessors = \ self._pca_preprocessors + ([Normalize()] if self.normalize else []) if not isinstance(data, SqlTable): pca = self._pca_projector(data) variance_ratio = pca.explained_variance_ratio_ cumulative = numpy.cumsum(variance_ratio) if numpy.isfinite(cumulative[-1]): self.components_spin.setRange(0, len(cumulative)) self._pca = pca self._variance_ratio = variance_ratio self._cumulative = cumulative self._setup_plot() else: self.Warning.trivial_components() 
self.unconditional_commit() def clear(self): self._pca = None self._transformed = None self._variance_ratio = None self._cumulative = None self._line = None self.plot_horlabels = [] self.plot_horlines = [] self.plot.clear() def clear_outputs(self): self.Outputs.transformed_data.send(None) self.Outputs.components.send(None) self.Outputs.pca.send(self._pca_projector) self.Outputs.preprocessor.send(None) def get_model(self): if self.rpca is None: return if self.rpca.ready(): self.__timer.stop() self.start_button.setText("Restart (finished)") self._pca = self.rpca.get_state() if self._pca is None: return self._variance_ratio = self._pca.explained_variance_ratio_ self._cumulative = numpy.cumsum(self._variance_ratio) self._setup_plot() self._transformed = None self.commit() def _setup_plot(self): self.plot.clear() if self._pca is None: return explained_ratio = self._variance_ratio explained = self._cumulative p = min(len(self._variance_ratio), self.maxp) self.plot.plot(numpy.arange(p), explained_ratio[:p], pen=pg.mkPen(QColor(Qt.red), width=2), antialias=True, name="Variance") self.plot.plot(numpy.arange(p), explained[:p], pen=pg.mkPen(QColor(Qt.darkYellow), width=2), antialias=True, name="Cumulative Variance") cutpos = self._nselected_components() - 1 self._line = pg.InfiniteLine( angle=90, pos=cutpos, movable=True, bounds=(0, p - 1)) self._line.setCursor(Qt.SizeHorCursor) self._line.setPen(pg.mkPen(QColor(Qt.black), width=2)) self._line.sigPositionChanged.connect(self._on_cut_changed) self.plot.addItem(self._line) self.plot_horlines = ( pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine)), pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine))) self.plot_horlabels = ( pg.TextItem(color=QColor(Qt.black), anchor=(1, 0)), pg.TextItem(color=QColor(Qt.black), anchor=(1, 1))) for item in self.plot_horlabels + self.plot_horlines: self.plot.addItem(item) self._set_horline_pos() self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0)) 
self._update_axis() def _set_horline_pos(self): cutidx = self.ncomponents - 1 for line, label, curve in zip(self.plot_horlines, self.plot_horlabels, (self._variance_ratio, self._cumulative)): y = curve[cutidx] line.setData([-1, cutidx], 2 * [y]) label.setPos(cutidx, y) label.setPlainText("{:.3f}".format(y)) def _on_cut_changed(self, line): # cut changed by means of a cut line over the scree plot. value = int(round(line.value())) self._line.setValue(value) current = self._nselected_components() components = value + 1 if not (self.ncomponents == 0 and components == len(self._variance_ratio)): self.ncomponents = components self._set_horline_pos() if self._pca is not None: var = self._cumulative[components - 1] if numpy.isfinite(var): self.variance_covered = int(var * 100) if current != self._nselected_components(): self._invalidate_selection() def _update_selection_component_spin(self): # cut changed by "ncomponents" spin. if self._pca is None: self._invalidate_selection() return if self.ncomponents == 0: # Special "All" value cut = len(self._variance_ratio) else: cut = self.ncomponents var = self._cumulative[cut - 1] if numpy.isfinite(var): self.variance_covered = int(var * 100) if numpy.floor(self._line.value()) + 1 != cut: self._line.setValue(cut - 1) self._invalidate_selection() def _update_selection_variance_spin(self): # cut changed by "max variance" spin. 
if self._pca is None: return cut = numpy.searchsorted(self._cumulative, self.variance_covered / 100.0) + 1 cut = min(cut, len(self._cumulative)) self.ncomponents = cut if numpy.floor(self._line.value()) + 1 != cut: self._line.setValue(cut - 1) self._invalidate_selection() def _update_normalize(self): self.fit() if self.data is None: self._invalidate_selection() def _init_projector(self): cls = DECOMPOSITIONS[self.decomposition_idx] self._pca_projector = cls(n_components=MAX_COMPONENTS) self._pca_projector.component = self.ncomponents self._pca_preprocessors = cls.preprocessors def _update_decomposition(self): self._init_projector() self._update_normalize() def _nselected_components(self): """Return the number of selected components.""" if self._pca is None: return 0 if self.ncomponents == 0: # Special "All" value max_comp = len(self._variance_ratio) else: max_comp = self.ncomponents var_max = self._cumulative[max_comp - 1] if var_max != numpy.floor(self.variance_covered / 100.0): cut = max_comp assert numpy.isfinite(var_max) self.variance_covered = int(var_max * 100) else: self.ncomponents = cut = numpy.searchsorted( self._cumulative, self.variance_covered / 100.0) + 1 return cut def _invalidate_selection(self): self.commit() def _update_axis(self): p = min(len(self._variance_ratio), self.maxp) axis = self.plot.getAxis("bottom") d = max((p-1)//(self.axis_labels-1), 1) axis.setTicks([[(i, str(i+1)) for i in range(0, p, d)]]) def commit(self): transformed = components = pp = None if self._pca is not None: if self._transformed is None: # Compute the full transform (MAX_COMPONENTS components) only once. 
self._transformed = self._pca(self.data) transformed = self._transformed domain = Domain( transformed.domain.attributes[:self.ncomponents], self.data.domain.class_vars, self.data.domain.metas ) transformed = transformed.from_table(domain, transformed) # prevent caching new features by defining compute_value dom = Domain([ContinuousVariable(a.name, compute_value=lambda _: None) for a in self._pca.orig_domain.attributes], metas=[StringVariable(name='component')]) metas = numpy.array([['PC{}'.format(i + 1) for i in range(self.ncomponents)]], dtype=object).T components = Table(dom, self._pca.components_[:self.ncomponents], metas=metas) components.name = 'components' pp = ApplyDomain(domain, "PCA") self._pca_projector.component = self.ncomponents self.Outputs.transformed_data.send(transformed) self.Outputs.components.send(components) self.Outputs.pca.send(self._pca_projector) self.Outputs.preprocessor.send(pp) def send_report(self): if self.data is None: return self.report_items(( ("Decomposition", DECOMPOSITIONS[self.decomposition_idx].name), ("Normalize data", str(self.normalize)), ("Selected components", self.ncomponents), ("Explained variance", "{:.3f} %".format(self.variance_covered)) )) self.report_plot() @classmethod def migrate_settings(cls, settings, version): if "variance_covered" in settings: # Due to the error in gh-1896 the variance_covered was persisted # as a NaN value, causing a TypeError in the widgets `__init__`. vc = settings["variance_covered"] if isinstance(vc, numbers.Real): if numpy.isfinite(vc): vc = int(vc) else: vc = 100 settings["variance_covered"] = vc if settings.get("ncomponents", 0) > MAX_COMPONENTS: settings["ncomponents"] = MAX_COMPONENTS
class OWPCA(widget.OWWidget):
    """Widget that fits a PCA model and shows a scree diagram.

    The number of output components is chosen with the "Components" spin
    (0 is shown as "All"), the "Variance covered" spin, or by dragging the
    vertical cut line over the plot.  For ``SqlTable`` input that is not
    downloaded locally, the model is fetched from a ``RemotePCA`` object
    started with the "Start remote computation" button.
    """
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Table, "set_data")]
    outputs = [("Transformed data", Table),
               ("Components", Table),
               ("PCA", PCA)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    normalize = settings.Setting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    def __init__(self):
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        # Handle of the remote computation; get_model()/update_model() test
        # it against None, so it must exist before the GUI callbacks can fire.
        self.rpca = None
        self._pca_projector = PCA()
        self._pca_projector.component = 0
        self._pca_preprocessors = PCA.preprocessors

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box, self, "batch_size",
                                   50, 100000, step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        gui.checkBox(self.options_box, self, "normalize", "Normalize data",
                     callback=self._update_normalize)
        self.maxp_spin = gui.spin(self.options_box, self, "maxp", 1, 100,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        """Fetch the remote model once and (re)arm or stop the poll timer."""
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        """Toggle the remote computation: abort if running, else start it."""
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        """Accept new input data and refit.

        A ``SqlTable`` shorter than ``AUTO_DL_LIMIT`` is downloaded; a longer
        one is sampled when remote computation is unavailable, otherwise it
        is kept as is and handled in :meth:`fit`.
        """
        self.information(0)
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif not remotely:
                self.information(0, "Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        self.data = data
        self.fit()

    def fit(self):
        """Fit the projector on local data, or enable the remote controls."""
        self.clear()
        self.start_button.setEnabled(False)
        if self.data is None:
            return
        data = self.data
        self._transformed = None
        if isinstance(data, SqlTable):  # data was big and remote available
            self.sampling_box.setVisible(True)
            self.start_button.setText("Start remote computation")
            self.start_button.setEnabled(True)
        else:
            self.sampling_box.setVisible(False)
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)
            self.components_spin.setRange(0, len(cumulative))
            self._pca = pca
            self._variance_ratio = variance_ratio
            self._cumulative = cumulative
            self._setup_plot()

        self.unconditional_commit()

    def clear(self):
        """Drop the fitted model, its cached statistics and the plot."""
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        """Pull the current state of the remote computation, if any."""
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Redraw the scree plot and the movable cut line."""
        self.plot.clear()
        if self._pca is None:
            # Also used as the "Show only first" spin callback, which can
            # fire while no model is fitted.
            return
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.plot(numpy.arange(p), explained_ratio[:p],
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained[:p],
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(
            angle=90, pos=self._nselected_components() - 1, movable=True,
            bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        # Snap to the nearest component and derive the count from the same
        # snapped value so the line and the spin always agree.
        value = int(round(line.value()))
        self._line.setValue(value)
        current = self._nselected_components()
        components = value + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            var = self._cumulative[components - 1]
            # variance_covered backs an integer spin box; never store a
            # float or NaN in it.
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        # searchsorted gives a 0-based index; the component count is one
        # more, clamped because the cumulative sum may fall just short of
        # the requested fraction due to rounding.
        cut = int(numpy.searchsorted(self._cumulative,
                                     self.variance_covered / 100.0)) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        """Rebuild the projector's preprocessor list and refit."""
        if self.normalize:
            pp = self._pca_preprocessors + [Normalize()]
        else:
            pp = self._pca_preprocessors
        self._pca_projector.preprocessors = pp
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            if numpy.isfinite(var_max):
                self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def _update_axis(self):
        """Label the bottom axis with at most about ``axis_labels`` ticks."""
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        """Send the transformed data, the component table and the projector."""
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed
            # ncomponents == 0 is the "All" special value; a stop of None
            # keeps every component instead of slicing down to nothing.
            stop = self.ncomponents or None
            domain = Domain(transformed.domain.attributes[:stop],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            dom = Domain(self._pca.orig_domain.attributes,
                         metas=[StringVariable(name='component')])
            rows = self._pca.components_[:stop]
            # One label per row actually output, so labels and rows always
            # have matching lengths.
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(len(rows))]],
                dtype=object).T
            components = Table(dom, rows, metas=metas)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)
        self.send("PCA", self._pca_projector)

    def send_report(self):
        """Add the current selection and the scree plot to the report."""
        if self.data is None:
            return
        self.report_items((
            ("Selected components", self.ncomponents),
            ("Explained variance", "{:.3f} %".format(self.variance_covered))
        ))
        self.report_plot()
class OWPCA(widget.OWWidget):
    """Widget that fits a PCA model and shows a scree diagram.

    The number of components in the transformed output is chosen with the
    "Components" spin (0 is shown as "All"), the "Variance covered" spin,
    or by dragging the vertical cut line over the plot.
    """
    name = "PCA"
    description = "Principal component analysis"
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Orange.data.Table, "set_data")]
    outputs = [("Transformed data", Orange.data.Table),
               ("Components", Orange.data.Table)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('localhost:9465')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        # Handle of the remote computation; get_model()/update_model() test
        # it against None, so it must exist before the GUI callbacks can fire.
        self.rpca = None

        box = gui.widgetBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        self.sampling_box = gui.widgetBox(self.controlArea,
                                          "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box, self, "batch_size",
                                   50, 100000, step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)

    def update_model(self):
        """Fetch the remote model once and (re)arm or stop the poll timer."""
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        """Toggle the remote computation: abort if running, else start it."""
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.address,
                                      self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        """Accept new input data; fit locally or enable the remote controls."""
        self.clear()
        self.data = data
        self.start_button.setEnabled(False)
        if data is not None:
            self._transformed = None
            if remotely and isinstance(data, SqlTable):
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
            else:
                self.sampling_box.setVisible(False)
                pca = Orange.projection.PCA()
                pca = pca(self.data)
                variance_ratio = pca.explained_variance_ratio_
                cumulative = numpy.cumsum(variance_ratio)
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()

        self.unconditional_commit()

    def clear(self):
        """Drop the data, the fitted model, its statistics and the plot."""
        self.data = None
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        """Pull the current state of the remote computation, if any."""
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self.plot.clear()
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Draw the variance curves and the movable cut line."""
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        (p, ) = explained.shape

        self.plot.plot(numpy.arange(p), explained_ratio,
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained,
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(
            angle=90, pos=self._nselected_components() - 1, movable=True,
            bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=1.5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        axis = self.plot.getAxis("bottom")
        axis.setTicks([[(i, "C{}".format(i + 1)) for i in range(p)]])

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = line.value()
        current = self._nselected_components()
        components = int(numpy.floor(value)) + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            var = self._cumulative[components - 1]
            # variance_covered backs an integer spin box; never store a
            # float or NaN in it.
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        # searchsorted gives a 0-based index; the component count is one
        # more, clamped because the cumulative sum may fall just short of
        # the requested fraction due to rounding.
        cut = int(numpy.searchsorted(self._cumulative,
                                     self.variance_covered / 100.0)) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            if numpy.isfinite(var_max):
                self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def commit(self):
        """Send the transformed data and the component matrix."""
        transformed = components = None
        if self._pca is not None:
            components = self._pca.components_
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed
            # ncomponents == 0 is the "All" special value; a stop of None
            # keeps every component instead of slicing down to nothing.
            stop = self.ncomponents or None
            domain = Orange.data.Domain(
                transformed.domain.attributes[:stop],
                self.data.domain.class_vars,
                self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            components = Orange.data.Table.from_numpy(None, components)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)
class OWPCA(widget.OWWidget):
    """Widget that fits a PCA model and shows a scree diagram.

    The number of components in the transformed output is chosen with the
    "Components" spin (0 is shown as "All"), the "Variance covered" spin,
    or by dragging the vertical cut line over the plot.
    """
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050

    inputs = [("Data", Orange.data.Table, "set_data")]
    outputs = [("Transformed data", Orange.data.Table),
               ("Components", Orange.data.Table)]

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        # Handle of the remote computation; get_model()/update_model() test
        # it against None, so it must exist before the GUI callbacks can fire.
        self.rpca = None

        box = gui.widgetBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box, self, "ncomponents", 0, 1000,
            callback=self._update_selection_component_spin,
            keyboardTracking=False
        )
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box, self, "variance_covered", 1, 100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False
        )
        self.variance_spin.setSuffix("%")

        form.addRow("Components", self.components_spin)
        form.addRow("Variance covered", self.variance_spin)

        self.sampling_box = gui.widgetBox(self.controlArea,
                                          "Incremental learning")
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(
            self.sampling_box, self, "batch_size", 50, 100000, step=50,
            keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box, self, "Start remote computation",
            callback=self.start, autoDefault=False,
            tooltip="Start/abort computation on the server")
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box, self, "auto_update",
                     "Periodically fetch model", callback=self.update_model)
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Send data",
                        checkbox_label="Auto send on change")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)

    def update_model(self):
        """Fetch the remote model once and (re)arm or stop the poll timer."""
        self.get_model()
        if self.auto_update and self.rpca and not self.rpca.ready():
            self.__timer.start(2000)
        else:
            self.__timer.stop()

    def start(self):
        """Toggle the remote computation: abort if running, else start it."""
        if 'Abort' in self.start_button.text():
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
        else:
            self.address = self.addresstext.text()
            with remote.server(self.address):
                from Orange.projection.pca import RemotePCA
                maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
                self.rpca = RemotePCA(self.data, self.address,
                                      self.batch_size, int(maxiter))
            self.update_model()
            self.start_button.setText("Abort remote computation")

    def set_data(self, data):
        """Accept new input data; fit locally or enable the remote controls."""
        self.clear()
        self.data = data
        self.start_button.setEnabled(False)
        if data is not None:
            self._transformed = None
            if remotely and isinstance(data, SqlTable):
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
            else:
                self.sampling_box.setVisible(False)
                pca = Orange.projection.PCA()
                pca = pca(self.data)
                variance_ratio = pca.explained_variance_ratio_
                cumulative = numpy.cumsum(variance_ratio)
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()

        self.unconditional_commit()

    def clear(self):
        """Drop the data, the fitted model, its statistics and the plot."""
        self.data = None
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = None
        self.plot.clear()

    def get_model(self):
        """Pull the current state of the remote computation, if any."""
        if self.rpca is None:
            return
        if self.rpca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")
        self._pca = self.rpca.get_state()
        if self._pca is None:
            return
        self._variance_ratio = self._pca.explained_variance_ratio_
        self._cumulative = numpy.cumsum(self._variance_ratio)
        self.plot.clear()
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Draw the variance curves and the movable cut line."""
        explained_ratio = self._variance_ratio
        explained = self._cumulative
        (p, ) = explained.shape

        self.plot.plot(numpy.arange(p), explained_ratio,
                       pen=pg.mkPen(QColor(Qt.red), width=2),
                       antialias=True,
                       name="Variance")
        self.plot.plot(numpy.arange(p), explained,
                       pen=pg.mkPen(QColor(Qt.darkYellow), width=2),
                       antialias=True,
                       name="Cumulative Variance")

        self._line = pg.InfiniteLine(
            angle=90, pos=self._nselected_components() - 1, movable=True,
            bounds=(0, p - 1)
        )
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.darkGray), width=1.5))
        self._line.sigPositionChanged.connect(self._on_cut_changed)

        self.plot.addItem(self._line)
        self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0))
        axis = self.plot.getAxis("bottom")
        axis.setTicks([[(i, "C{}".format(i + 1)) for i in range(p)]])

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = line.value()
        current = self._nselected_components()
        components = int(numpy.floor(value)) + 1

        if not (self.ncomponents == 0 and
                components == len(self._variance_ratio)):
            self.ncomponents = components

        if self._pca is not None:
            var = self._cumulative[components - 1]
            # variance_covered backs an integer spin box; never store a
            # float or NaN in it.
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        if current != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents
        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        # searchsorted gives a 0-based index; the component count is one
        # more, clamped because the cumulative sum may fall just short of
        # the requested fraction due to rounding.
        cut = int(numpy.searchsorted(self._cumulative,
                                     self.variance_covered / 100.0)) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            if numpy.isfinite(var_max):
                self.variance_covered = int(var_max * 100)
        else:
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        self.commit()

    def commit(self):
        """Send the transformed data and the component matrix."""
        transformed = components = None
        if self._pca is not None:
            components = self._pca.components_
            if self._transformed is None:
                # Compute the full transform (all components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed
            # ncomponents == 0 is the "All" special value; a stop of None
            # keeps every component instead of slicing down to nothing.
            stop = self.ncomponents or None
            domain = Orange.data.Domain(
                transformed.domain.attributes[:stop],
                self.data.domain.class_vars,
                self.data.domain.metas
            )
            transformed = transformed.from_table(domain, transformed)
            components = Orange.data.Table.from_numpy(None, components)
            components.name = 'components'

        self.send("Transformed data", transformed)
        self.send("Components", components)