Esempio n. 1
0
class OWRank(widget.OWWidget):
    """Widget that scores the features of the input data and outputs the
    data restricted to the features selected in the score table."""
    name = "Rank"
    description = "Rank and filter data features by their relevance."
    icon = "icons/Rank.svg"
    priority = 1102

    # "Data" is handled by `setData`; "Scorer" accepts multiple connections
    # and is handled by `set_learner`.
    inputs = [("Data", Orange.data.Table, "setData"),
              ("Scorer", score.Scorer, "set_learner", widget.Multiple)]
    outputs = [("Reduced Data", Orange.data.Table)]

    # Identifiers of the feature-selection methods; they double as the ids
    # of the radio buttons created in `__init__`.
    SelectNone, SelectAll, SelectManual, SelectNBest = range(4)

    # Currently active selection method (one of the constants above).
    selectMethod = settings.Setting(SelectNBest)
    # Number of top rows selected when `selectMethod` is `SelectNBest`.
    nSelected = settings.Setting(5)
    # Backing setting for the "Commit on any change" check box.
    auto_apply = settings.Setting(True)

    # Saved header state of the (discrete, continuous) score tables
    headerState = settings.Setting((None, None))

    def __init__(self):
        """
        Build the widget.

        The control area gets the selection-method radio buttons and the
        commit controls; the main area gets two stacked score tables, one
        used for a discrete and one for a continuous target variable.
        """
        super().__init__()
        self.out_domain_desc = None

        self.all_measures = SCORES

        # Measure name -> "is its column shown"; the default measures start
        # out shown, the remaining ones hidden.
        self.selectedMeasures = dict(
            [(name, True) for name in _DEFAULT_SELECTED] +
            [(m.name, False)
             for m in self.all_measures[len(_DEFAULT_SELECTED):]]
        )
        # Discrete (0) or continuous (1) class mode
        self.rankMode = 0

        self.data = None

        self.discMeasures = [m for m in self.all_measures
                             if issubclass(DiscreteVariable, m.score.class_type)]
        self.contMeasures = [m for m in self.all_measures
                             if issubclass(ContinuousVariable, m.score.class_type)]

        selMethBox = gui.widgetBox(
            self.controlArea, "Select attributes", addSpace=True)

        grid = QtGui.QGridLayout()
        grid.setContentsMargins(0, 0, 0, 0)
        self.selectButtons = QtGui.QButtonGroup()
        self.selectButtons.buttonClicked[int].connect(self.setSelectMethod)

        def button(text, buttonid, toolTip=None):
            # Create a radio button registered in the group under `buttonid`.
            b = QtGui.QRadioButton(text)
            self.selectButtons.addButton(b, buttonid)
            if toolTip is not None:
                b.setToolTip(toolTip)
            return b

        b1 = button(self.tr("None"), OWRank.SelectNone)
        b2 = button(self.tr("All"), OWRank.SelectAll)
        b3 = button(self.tr("Manual"), OWRank.SelectManual)
        b4 = button(self.tr("Best ranked"), OWRank.SelectNBest)

        s = gui.spin(selMethBox, self, "nSelected", 1, 100,
                     callback=self.nSelectedChanged)

        grid.addWidget(b1, 0, 0)
        grid.addWidget(b2, 1, 0)
        grid.addWidget(b3, 2, 0)
        grid.addWidget(b4, 3, 0)
        grid.addWidget(s, 3, 1)

        self.selectButtons.button(self.selectMethod).setChecked(True)

        selMethBox.layout().addLayout(grid)

        gui.auto_commit(self.controlArea, self, "auto_apply", "Commit",
                        checkbox_label="Commit on any change")

        gui.rubber(self.controlArea)

        # Discrete and continuous table views are stacked
        self.ranksViewStack = QtGui.QStackedLayout()
        self.mainArea.layout().addLayout(self.ranksViewStack)

        self.discRanksView = QtGui.QTableView()
        self.ranksViewStack.addWidget(self.discRanksView)
        self.discRanksView.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.discRanksView.setSelectionMode(QtGui.QTableView.MultiSelection)
        self.discRanksView.setSortingEnabled(True)

        self.discRanksLabels = ["#"] + [m.shortname for m in self.discMeasures]
        self.discRanksModel = QtGui.QStandardItemModel(self)
        self.discRanksModel.setHorizontalHeaderLabels(self.discRanksLabels)

        self.discRanksProxyModel = MySortProxyModel(self)
        self.discRanksProxyModel.setSourceModel(self.discRanksModel)
        self.discRanksView.setModel(self.discRanksProxyModel)

        self.discRanksView.setColumnWidth(0, 20)
        self.discRanksView.sortByColumn(1, Qt.DescendingOrder)
        self.discRanksView.selectionModel().selectionChanged.connect(
            self.commit
        )
        self.discRanksView.pressed.connect(self.onSelectItem)
        self.discRanksView.horizontalHeader().sectionClicked.connect(
            self.headerClick
        )

        if self.headerState[0] is not None:
            self.discRanksView.horizontalHeader().restoreState(
                self.headerState[0]
            )

        self.contRanksView = QtGui.QTableView()
        self.ranksViewStack.addWidget(self.contRanksView)
        self.contRanksView.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.contRanksView.setSelectionMode(QtGui.QTableView.MultiSelection)
        self.contRanksView.setSortingEnabled(True)

        self.contRanksLabels = ["#"] + [m.shortname for m in self.contMeasures]
        self.contRanksModel = QtGui.QStandardItemModel(self)
        self.contRanksModel.setHorizontalHeaderLabels(self.contRanksLabels)

        self.contRanksProxyModel = MySortProxyModel(self)
        self.contRanksProxyModel.setSourceModel(self.contRanksModel)
        self.contRanksView.setModel(self.contRanksProxyModel)

        # Narrow "#" column of the continuous table (this used to resize
        # the discrete view a second time by mistake).
        self.contRanksView.setColumnWidth(0, 20)
        self.contRanksView.sortByColumn(1, Qt.DescendingOrder)
        self.contRanksView.selectionModel().selectionChanged.connect(
            self.commit
        )
        self.contRanksView.pressed.connect(self.onSelectItem)
        self.contRanksView.horizontalHeader().sectionClicked.connect(
            self.headerClick
        )
        if self.headerState[1] is not None:
            self.contRanksView.horizontalHeader().restoreState(
                self.headerState[1]
            )

        # Switch the current view to Discrete
        self.switchRanksMode(0)
        self.resetInternals()
        self.updateDelegates()
        self.updateVisibleScoreColumns()

        self.resize(690, 500)

        # One row of scores per measure of the active mode; no attributes yet.
        self.measure_scores = table((len(self.measures), 0), None)
        # Connected scorers, keyed by the id of their input connection.
        self.learners = {}

    def switchRanksMode(self, index):
        """
        Make the discrete (index 0) or continuous (any other index) score
        table the active one and refresh which score columns are visible.
        """
        self.rankMode = index
        self.ranksViewStack.setCurrentIndex(index)

        if index == 0:
            active = (self.discRanksView, self.discRanksModel,
                      self.discRanksProxyModel, self.discMeasures)
        else:
            active = (self.contRanksView, self.contRanksModel,
                      self.contRanksProxyModel, self.contMeasures)

        # The "current" aliases used by the rest of the widget.
        (self.ranksView, self.ranksModel,
         self.ranksProxyModel, self.measures) = active

        self.updateVisibleScoreColumns()

    @check_sql_input
    def setData(self, data):
        """
        Handler of the "Data" input.

        Data without a target variable is rejected with error 100. Otherwise
        the table matching the target type is activated, one row per
        attribute is created and the scores are recomputed. The selection is
        refreshed and the output committed in every case.
        """
        self.error([0, 100])
        self.resetInternals()

        if data is not None and not data.domain.class_var:
            self.error(100, "Data does not have a target variable")
            data = None

        self.data = data
        if self.data is not None:
            domain = self.data.domain
            attributes = domain.attributes
            self.usefulAttributes = [
                var for var in attributes
                if var.is_discrete or var.is_continuous]

            if domain.has_continuous_class:
                self.switchRanksMode(1)
            elif domain.has_discrete_class:
                self.switchRanksMode(0)
            else:
                # String or other.
                self.error(0, "Cannot handle class variable type %r" %
                           type(domain.class_var).__name__)

            self.ranksModel.setRowCount(len(attributes))
            for row, var in enumerate(attributes):
                # First column: number of values, or "C" for non-discrete.
                shown = len(var.values) if var.is_discrete else "C"
                count_item = ScoreValueItem()
                count_item.setData(shown, Qt.DisplayRole)
                self.ranksModel.setItem(row, 0, count_item)

                # Row header: attribute name decorated with its type icon.
                header_item = QtGui.QStandardItem(var.name)
                header_item.setData(gui.attributeIconDict[var],
                                    Qt.DecorationRole)
                self.ranksModel.setVerticalHeaderItem(row, header_item)

            n_scorers = len(self.measures) + len(self.learners)
            self.measure_scores = table((n_scorers, len(attributes)), None)
            self.updateScores()

        self.selectMethodChanged()
        self.commit()

    def set_learner(self, learner, lid=None):
        """
        Handler of the "Scorer" input.

        `learner` is the connected scorer, or None when the connection
        identified by `lid` delivers no scorer. The score table is
        re-created, the header labels of both rank models are extended with
        the scorers' names, and scores and output are refreshed.
        """
        if learner is None:
            if lid is not None:
                # Tolerate a removal for an id that was never registered
                # instead of raising KeyError.
                self.learners.pop(lid, None)
        else:
            self.learners[lid] = score_meta(
                learner.name,
                learner.name,
                learner
            )

        attrs_len = 0 if not self.data else len(self.data.domain.attributes)
        shape = (len(self.measures) + len(self.learners), attrs_len)
        self.measure_scores = table(shape, None)

        labels = [meta.shortname for meta in self.learners.values()]
        self.contRanksModel.setHorizontalHeaderLabels(
            self.contRanksLabels + labels
        )
        self.discRanksModel.setHorizontalHeaderLabels(
            self.discRanksLabels + labels
        )
        self.updateScores()
        self.commit()

    def updateScores(self, measuresMask=None):
        """
        Update the current computed scores.

        If `measuresMask` is given it must be a list of bool values
        indicating which measures should be recomputed. Entries are paired
        positionally with the built-in measures of the active mode followed
        by the connected scorers; measures beyond the end of a shorter mask
        are left untouched. When omitted, every built-in measure marked in
        `selectedMeasures` and every connected scorer is recomputed.

        Does nothing when there is no data.
        """
        if not self.data:
            return

        learner_metas = list(self.learners.values())
        measures = self.measures + learner_metas
        # Invalidate all warnings
        self.warning(range(max(len(self.discMeasures),
                               len(self.contMeasures))))

        if measuresMask is None:
            # Update all selected measures
            measuresMask = [self.selectedMeasures.get(m.name)
                            for m in self.measures]
            # Connected scorers are always recomputed. A mask built from
            # their names would skip a scorer whose name is empty.
            measuresMask = measuresMask + [True] * len(learner_metas)

        data = self.data
        builtin_count = len(self.measures)
        self.error(1)
        for index, (meas, mask) in enumerate(zip(measures, measuresMask)):
            if not mask:
                continue
            if index < builtin_count:
                # Built-in measure: `score` is a class to instantiate.
                estimator = meas.score()
                self.measure_scores[index] = estimator(data)
            else:
                # Connected scorer: `score` is the scorer object itself.
                learner = meas.score
                if isinstance(learner, Learner) and \
                        not learner.check_learner_adequacy(data.domain):
                    self.error(1, learner.learner_adequacy_err_msg)
                else:
                    self.measure_scores[index] = learner.score_data(data)

        self.updateRankModel(measuresMask)
        self.ranksProxyModel.invalidate()
        self.selectMethodChanged()

    def updateRankModel(self, measuresMask=None):
        """
        Copy `measure_scores` into the active rank model.

        Columns beyond the current number of score rows are removed, every
        score is written with the display role, and each non-None score also
        gets a bar ratio in [0, 1] relative to the minimum and maximum of
        its measure. `measuresMask` is accepted for call compatibility and
        is not used.
        """
        # Drop columns left over from scorers that are no longer present.
        # Removing from the right keeps the remaining indices valid; an
        # ascending loop would skip every other stale column because the
        # columns shift left after each removal.
        first_stale = len(self.measure_scores) + 1
        for column in reversed(range(first_stale,
                                     self.ranksModel.columnCount())):
            self.ranksModel.removeColumn(column)

        for i, scores in enumerate(self.measure_scores):
            column = i + 1  # column 0 holds the "#" information
            for row, value in enumerate(scores):
                item = self.ranksModel.item(row, column)
                if not item:
                    item = ScoreValueItem()
                    self.ranksModel.setItem(row, column, item)
                item.setData(value, Qt.DisplayRole)

            valid_vals = [v for v in scores if v is not None]
            if not valid_vals:
                continue
            vmin, vmax = min(valid_vals), max(valid_vals)
            # Avoid dividing by zero when all scores are equal.
            span = (vmax - vmin) or 1
            for row, value in enumerate(scores):
                if value is not None:
                    # Set the bar ratio role for i-th measure.
                    ratio = float((value - vmin) / span)
                    self.ranksModel.item(row, column).setData(
                        ratio, gui.BarRatioRole)

        self.ranksView.setColumnWidth(0, 20)
        self.ranksView.resizeRowsToContents()

    def resetInternals(self):
        """Forget the current data and remove all rows of the active model."""
        self.data, self.usefulAttributes = None, []
        self.ranksModel.setRowCount(0)

    def onSelectItem(self, index):
        """
        Slot for a press on a table item: switch to manual selection,
        check the matching radio button and commit.
        """
        manual = OWRank.SelectManual
        self.selectMethod = manual
        self.selectButtons.button(manual).setChecked(True)
        self.commit()

    def setSelectMethod(self, method):
        """Activate selection `method`; a no-op when it is already active."""
        if self.selectMethod == method:
            return
        self.selectMethod = method
        self.selectButtons.button(method).setChecked(True)
        self.selectMethodChanged()

    def selectMethodChanged(self):
        """Re-apply the selection for every method except the manual one."""
        automatic = (OWRank.SelectNone, OWRank.SelectAll, OWRank.SelectNBest)
        if self.selectMethod in automatic:
            self.autoSelection()

    def nSelectedChanged(self):
        """Callback of the spin box: switch to "Best ranked" and reselect."""
        best = OWRank.SelectNBest
        self.selectMethod = best
        self.selectButtons.button(best).setChecked(True)
        self.selectMethodChanged()

    def autoSelection(self):
        """
        Replace the view's selection according to `selectMethod`.

        SelectAll selects every row, SelectNBest the first `nSelected` rows
        in the current sort order, and any other method clears the
        selection.
        """
        selModel = self.ranksView.selectionModel()
        rowCount = self.ranksModel.rowCount()
        columnCount = self.ranksModel.columnCount()
        # Indices are taken from the proxy so that "first rows" means the
        # first rows as currently sorted in the view.
        model = self.ranksProxyModel

        if self.selectMethod == OWRank.SelectAll:
            n_rows = rowCount
        elif self.selectMethod == OWRank.SelectNBest:
            n_rows = min(self.nSelected, rowCount)
        else:
            n_rows = 0

        if n_rows > 0 and columnCount > 0:
            selection = QtGui.QItemSelection(
                model.index(0, 0),
                model.index(n_rows - 1, columnCount - 1)
            )
        else:
            # Nothing to select; do not build a range from invalid indexes.
            selection = QtGui.QItemSelection()

        # Applied exactly once (the "All" case used to select twice).
        selModel.select(selection, QtGui.QItemSelectionModel.ClearAndSelect)

    def headerClick(self, index):
        """
        Slot for a click on header section `index`: reselect the top rows
        when in "Best ranked" mode, then save both header states.
        """
        reselect = index >= 1 and self.selectMethod == OWRank.SelectNBest
        if reselect:
            # Reselect the top ranked attributes
            self.autoSelection()

        # Store the header states as (discrete, continuous)
        self.headerState = tuple(
            bytes(view.horizontalHeader().saveState())
            for view in (self.discRanksView, self.contRanksView)
        )

    def measuresSelectionChanged(self, measure=None):
        """
        React to a change of the shown measures.

        Without `measure` all scores are updated. With it, only that measure
        is recomputed, and only if it is shown and has no scores yet.
        Column visibility is refreshed afterwards.
        """
        if measure is None:
            # Update all scores
            mask = None
        else:
            is_shown = self.selectedMeasures.get(measure.name, False)
            position = self.measures.index(measure)
            not_computed = all(
                s is None for s in self.measure_scores[position])
            if not_computed and is_shown:
                mask = [m == measure for m in self.measures]
            else:
                mask = [False] * len(self.measures)

        self.updateScores(mask)
        self.updateVisibleScoreColumns()

    def updateVisibleScoreColumns(self):
        """
        Hide the columns of measures that are not marked in
        `selectedMeasures` and show the others.
        """
        is_selected = self.selectedMeasures.get
        # Column 0 is the "#" column, measures start at column 1.
        for column, measure in enumerate(self.measures, start=1):
            self.ranksView.setColumnHidden(
                column, not is_selected(measure.name))

    def updateDelegates(self):
        """Give each of the two score tables its own bar item delegate."""
        for view in (self.contRanksView, self.discRanksView):
            view.setItemDelegate(gui.ColoredBarItemDelegate(self))

    def send_report(self):
        """
        Report the input domain, the rank table and, when something was
        output, the description of the output domain. No report without
        data.
        """
        if self.data:
            self.report_domain("Input", self.data.domain)
            self.report_table("Ranks", self.ranksView, num_format="{:.3f}")
            output_desc = self.out_domain_desc
            if output_desc is not None:
                self.report_items("Output", output_desc)

    def commit(self):
        """
        Send the data restricted to the selected attributes (class variable
        and metas are kept), or None when there is no data or no selection.
        """
        chosen = self.selectedAttrs()
        if self.data and chosen:
            source_domain = self.data.domain
            reduced_domain = Orange.data.Domain(
                chosen, source_domain.class_var, metas=source_domain.metas)
            reduced = Orange.data.Table(reduced_domain, self.data)
            self.send("Reduced Data", reduced)
            self.out_domain_desc = report.describe_domain(reduced.domain)
        else:
            self.send("Reduced Data", None)
            self.out_domain_desc = None

    def selectedAttrs(self):
        """
        Return the attributes of the rows selected in the active view, or
        an empty list when there is no data.
        """
        if not self.data:
            return []

        attributes = self.data.domain.attributes
        to_source = self.ranksProxyModel.mapToSource
        # Selected rows are proxy (sorted) indexes; map them back to rows
        # of the source model, which follow the attribute order.
        selected = self.ranksView.selectionModel().selectedRows(0)
        return [attributes[to_source(index).row()] for index in selected]
Esempio n. 2
0
class OWTableToTimeseries(widget.OWWidget):
    """Widget that reinterprets an input data table as a time series."""
    name = 'As Timeseries'
    description = ('Reinterpret data table as a time series object.')
    icon = 'icons/TableToTimeseries.svg'
    priority = 10

    # The "Data" input is handled by `set_data`.
    inputs = [("Data", Table, 'set_data')]
    outputs = [(Output.TIMESERIES, Timeseries)]

    want_main_area = False
    resizing_enabled = False

    # Value of the radio button group; `commit` adds a generated sequence
    # attribute when it is non-zero and sorts by `selected_attr` otherwise.
    radio_sequential = settings.Setting(0)
    # Name of the attribute chosen in the combo box.
    selected_attr = settings.Setting('')
    # Backing setting for the auto-commit control.
    autocommit = settings.Setting(True)

    class Error(widget.OWWidget.Error):
        # Shown by `commit` when the chosen attribute contains NaN values;
        # the placeholder receives the attribute's name.
        nan_times = widget.Msg('Some values of chosen sequential attribute '
                               '"{}" are NaN, which makes the values '
                               'impossible to sort')

    def __init__(self):
        """Build the 'Sequence' box with its two radio options, the
        attribute combo box and the auto-commit control."""
        # NOTE(review): super().__init__() is not called here — presumably
        # the base widget is set up elsewhere; confirm.
        self.data = None
        box = gui.vBox(self.controlArea, 'Sequence')
        group = gui.radioButtons(box,
                                 self,
                                 'radio_sequential',
                                 callback=self.on_changed)
        hbox = gui.hBox(box)
        # First option: order the rows by an existing attribute.
        gui.appendRadioButton(group, 'Sequential attribute:', insertInto=hbox)

        # Model holding the candidate attributes; filled in `set_data`.
        attrs_model = self.attrs_model = VariableListModel()
        combo_attrs = self.combo_attrs = gui.comboBox(hbox,
                                                      self,
                                                      'selected_attr',
                                                      callback=self.on_changed,
                                                      sendSelectedValue=True)
        combo_attrs.setModel(attrs_model)

        # Second option: keep the row order and generate a sequence.
        gui.appendRadioButton(group,
                              'Sequence is implied by instance order',
                              insertInto=box)

        gui.auto_commit(self.controlArea, self, 'autocommit', '&Apply')
        # TODO: seasonally adjust data (select attributes & season cycle length (e.g. 12 if you have monthly data))

    def set_data(self, data):
        """
        Handler of the "Data" input.

        Refills the attribute model with the time variables followed by the
        other continuous variables of the domain, preselects the data's
        `time_variable` if it has one (otherwise the first candidate), and
        commits.
        """
        self.data = data
        self.attrs_model.clear()
        if self.data is None:
            self.commit()
            return

        if data.domain.has_continuous_attributes():
            time_vars = [var for var in data.domain
                         if isinstance(var, TimeVariable)]
            other_continuous = [
                var for var in data.domain
                if var.is_continuous and not isinstance(var, TimeVariable)]
            candidates = time_vars + other_continuous
            self.attrs_model.wrap(candidates)

            if getattr(data, 'time_variable', False):
                self.selected_attr = data.time_variable.name
            else:
                self.selected_attr = candidates[0].name
        self.on_changed()

    def on_changed(self):
        """Callback of the radio buttons and the combo box; commits."""
        self.commit()

    def commit(self):
        """
        Build the time series from the current data and settings and send it.

        When `radio_sequential` is non-zero, a new continuous attribute
        counting rows from 1 is prepended and used as the time variable.
        Otherwise the rows are sorted by the attribute named
        `selected_attr`, which becomes the time variable.

        None is sent when there is no data, when the selected attribute is
        not in the domain, or when it contains NaN values (in which case
        the `nan_times` error is also shown).
        """
        data = self.data
        self.Error.clear()
        if data is None or self.selected_attr not in data.domain:
            self.send(Output.TIMESERIES, None)
            return

        attrs = data.domain.attributes
        cvars = data.domain.class_vars
        metas = data.domain.metas
        X = data.X
        Y = np.column_stack((data.Y, ))  # make 2d
        M = data.metas

        # Set sequence attribute
        if self.radio_sequential:
            # First of '__seq__', '__seq__0', ..., '__seq__9' not yet used
            # in the domain (the last candidate is used if all are taken).
            for i in chain(('', ), range(10)):
                name = '__seq__' + str(i)
                if name not in data.domain:
                    break
            time_var = ContinuousVariable(name)
            attrs = attrs.__class__((time_var, )) + attrs
            X = np.column_stack((np.arange(1, len(data) + 1), X))
            data = Table(Domain(attrs, cvars, metas), X, Y, M)
        else:
            # Or make a sequence attribute one of the existing attributes
            # and sort all values according to it
            time_var = data.domain[self.selected_attr]
            values = Table.from_table(Domain([], [], [time_var]),
                                      source=data).metas.ravel()
            if np.isnan(values).any():
                self.Error.nan_times(time_var.name)
                # Clear the output so that a previously sent series does
                # not linger next to the error.
                self.send(Output.TIMESERIES, None)
                return
            ordered = np.argsort(values)
            # Reorder only if the rows are not already sorted.
            if (ordered != np.arange(len(ordered))).any():
                data = data[ordered]

        ts = Timeseries(data.domain, data)
        # TODO: ensure equidistant
        ts.time_variable = time_var
        self.send(Output.TIMESERIES, ts)
Esempio n. 3
0
class OWDistanceMap(widget.OWWidget):
    """Widget that displays a distance matrix and outputs the data rows or
    features that correspond to the selected cells."""
    name = "Distance Map"
    description = "Visualize a distance matrix."
    icon = "icons/DistanceMatrix.svg"
    priority = 1200

    # The "Distances" input is handled by `set_distances`.
    inputs = [("Distances", Orange.misc.DistMatrix, "set_distances")]
    outputs = [("Data", Orange.data.Table), ("Features", widget.AttributeList)]

    # Element ordering: 0 none, 1 clustering, 2 clustering with ordered
    # leaves (indices of the combo box items created in `__init__`).
    sorting = settings.Setting(0)

    # Index into the list of palettes loaded in `__init__`.
    colormap = settings.Setting(0)
    color_gamma = settings.Setting(0.0)
    # Low/high thresholds of the color mapping, set by sliders in [0, 1].
    color_low = settings.Setting(0.0)
    color_high = settings.Setting(1.0)

    # Index of the selected entry in the annotations combo box.
    annotation_idx = settings.Setting(0)

    # Backing setting for the auto-commit control.
    autocommit = settings.Setting(True)

    def __init__(self, parent=None):
        """Build the controls (sorting, colors, annotations, commit) and the
        graphics scene: a grid with the matrix view box in the middle,
        dendrograms on the left and top, and labels on the right and
        bottom."""
        super().__init__(parent)

        self.matrix = None
        # Cached clustering results, computed lazily.
        self._tree = None
        self._ordered_tree = None
        # Matrix values in display order and the permutation used for it.
        self._sorted_matrix = None
        self._sort_indices = None
        self._selection = None

        box = gui.widgetBox(self.controlArea, "Element sorting", margin=0)
        gui.comboBox(
            box,
            self,
            "sorting",
            items=["None", "Clustering", "Clustering with ordered leaves"],
            callback=self._invalidate_ordering)

        box = gui.widgetBox(self.controlArea, "Colors")

        self.colormap_cb = gui.comboBox(box,
                                        self,
                                        "colormap",
                                        callback=self._update_color)
        self.colormap_cb.setIconSize(QSize(64, 16))
        self.palettes = list(sorted(load_default_palettes()))
        init_color_combo(self.colormap_cb, self.palettes, QSize(64, 16))
        self.colormap_cb.setCurrentIndex(self.colormap)

        form = QFormLayout(formAlignment=Qt.AlignLeft,
                           labelAlignment=Qt.AlignLeft,
                           fieldGrowthPolicy=QFormLayout.AllNonFixedFieldsGrow)
        #         form.addRow(
        #             "Gamma",
        #             gui.hSlider(box, self, "color_gamma", minValue=0.0, maxValue=1.0,
        #                         step=0.05, ticks=True, intOnly=False,
        #                         createLabel=False, callback=self._update_color)
        #         )
        form.addRow(
            "Low",
            gui.hSlider(box,
                        self,
                        "color_low",
                        minValue=0.0,
                        maxValue=1.0,
                        step=0.05,
                        ticks=True,
                        intOnly=False,
                        createLabel=False,
                        callback=self._update_color))
        form.addRow(
            "High",
            gui.hSlider(box,
                        self,
                        "color_high",
                        minValue=0.0,
                        maxValue=1.0,
                        step=0.05,
                        ticks=True,
                        intOnly=False,
                        createLabel=False,
                        callback=self._update_color))
        box.layout().addLayout(form)

        box = gui.widgetBox(self.controlArea, "Annotations")
        self.annot_combo = gui.comboBox(box,
                                        self,
                                        "annotation_idx",
                                        callback=self._invalidate_annotations)
        # The model's content is replaced in `set_items` to match the input.
        self.annot_combo.setModel(itemmodels.VariableListModel())
        self.annot_combo.model()[:] = ["None", "Enumeration"]
        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea, self, "autocommit", "Send data",
                        "Auto send is on")

        self.view = pg.GraphicsView(background="w")
        self.mainArea.layout().addWidget(self.view)

        self.grid_widget = pg.GraphicsWidget()
        self.grid = QGraphicsGridLayout()
        self.grid_widget.setLayout(self.grid)

        # Center cell of the grid: the view box that will hold the matrix.
        self.viewbox = pg.ViewBox(enableMouse=False)
        self.viewbox.setAcceptedMouseButtons(Qt.NoButton)
        self.viewbox.setAcceptHoverEvents(False)
        self.grid.addItem(self.viewbox, 1, 1)

        self.left_dendrogram = DendrogramWidget(
            self.grid_widget, orientation=DendrogramWidget.Left)
        self.left_dendrogram.setAcceptedMouseButtons(Qt.NoButton)
        self.left_dendrogram.setAcceptHoverEvents(False)

        self.top_dendrogram = DendrogramWidget(
            self.grid_widget, orientation=DendrogramWidget.Top)
        self.top_dendrogram.setAcceptedMouseButtons(Qt.NoButton)
        self.top_dendrogram.setAcceptHoverEvents(False)

        self.grid.addItem(self.left_dendrogram, 1, 0)
        self.grid.addItem(self.top_dendrogram, 0, 1)

        self.right_labels = TextList(alignment=Qt.AlignLeft)

        self.bottom_labels = TextList(orientation=Qt.Horizontal,
                                      alignment=Qt.AlignRight)

        self.grid.addItem(self.right_labels, 1, 2)
        self.grid.addItem(self.bottom_labels, 2, 1)

        self.view.setCentralItem(self.grid_widget)

        # Dendrograms and labels stay hidden until there is something to show.
        self.left_dendrogram.hide()
        self.top_dendrogram.hide()
        self.right_labels.hide()
        self.bottom_labels.hide()

        self.matrix_item = None
        self.dendrogram = None

        self.grid_widget.scene().installEventFilter(self)

    def set_distances(self, matrix):
        """
        Handler of the "Distances" input.

        Clears the widget state, rejects matrices with fewer than two rows
        (error 0) and stores the matrix together with its row items.
        """
        self.clear()
        self.error(0)

        if matrix is not None:
            n_rows, _ = matrix.X.shape
            if n_rows < 2:
                self.error(0, "Empty distance matrix.")
                matrix = None

        self.matrix = matrix
        if matrix is None:
            self.set_items(None)
        else:
            self.set_items(matrix.row_items, matrix.axis)

    def set_items(self, items, axis=1):
        """
        Store `items` and rebuild the annotation choices to match them.

        "None" and "Enumeration" are always offered. A false `axis` adds
        "Attribute names" (and selects it), a data table adds its domain's
        variables, and a list of variables adds "Name". The current
        annotation index is clamped to the new list afterwards.
        """
        self.items = items
        model = self.annot_combo.model()

        forced_index = None
        if items is None:
            extra = []
        elif not axis:
            extra = ["Attribute names"]
            forced_index = 2
        elif isinstance(items, Orange.data.Table):
            extra = list(items.domain)
        elif isinstance(items, list) and \
                all(isinstance(item, Orange.data.Variable) for item in items):
            extra = ["Name"]
        else:
            extra = []

        model[:] = ["None", "Enumeration"] + extra
        if forced_index is not None:
            self.annotation_idx = forced_index
        self.annotation_idx = min(self.annotation_idx, len(model) - 1)

    def clear(self):
        """Drop the matrix, the cached clusterings and the selection, then
        clear the plot."""
        self.matrix = self.cluster = None
        self._tree = self._ordered_tree = None
        self._sorted_matrix = None
        self._selection = []
        self._clear_plot()

    def handleNewSignals(self):
        """Rebuild ordering, scene and labels if there is a matrix, then
        commit unconditionally."""
        if self.matrix is not None:
            for step in (self._update_ordering,
                         self._setup_scene,
                         self._update_labels):
                step()
        self.unconditional_commit()

    def _clear_plot(self):
        """Remove the matrix item from the scene, hide both dendrograms and
        clear the labels."""
        item = self.matrix_item
        if item:
            # Detach from the parent first, then take it out of the scene.
            item.setParentItem(None)
            item.scene().removeItem(item)
            self.matrix_item = None

        for dendrogram in (self.top_dendrogram, self.left_dendrogram):
            dendrogram.hide()

        self._set_labels(None)

    def _cluster_tree(self):
        """Return the clustering of the matrix, computing it on first use."""
        tree = self._tree
        if tree is None:
            tree = self._tree = \
                hierarchical.dist_matrix_clustering(self.matrix)
        return tree

    def _ordered_cluster_tree(self):
        """Return the clustering with optimally ordered leaves, computing
        it on first use."""
        ordered = self._ordered_tree
        if ordered is None:
            base_tree = self._cluster_tree()
            ordered = self._ordered_tree = \
                hierarchical.optimal_leaf_ordering(base_tree, self.matrix)
        return ordered

    def _setup_scene(self):
        """Create the matrix item for the sorted matrix, fit the view box to
        it, show the dendrogram matching the sorting mode and apply the
        colors."""
        sorted_matrix = self._sorted_matrix
        item = DistanceMapItem(sorted_matrix)
        self.matrix_item = item
        # Scale the y axis to compensate for pg.ViewBox's y axis invert
        item.scale(1, -1)
        self.viewbox.addItem(item)

        # Set fixed view box range.
        height, width = sorted_matrix.shape
        self.viewbox.setRange(QRectF(0, -height, width, height), padding=0)

        item.selectionChanged.connect(self._invalidate_selection)

        if self.sorting == 0:
            root = None
        elif self.sorting == 1:
            root = self._cluster_tree()
        else:
            root = self._ordered_cluster_tree()
        self._set_displayed_dendrogram(root)

        self._update_color()

    def _set_displayed_dendrogram(self, root):
        """Show `root` in both dendrograms, or hide and collapse them when
        `root` is None."""
        has_tree = root is not None
        for dendrogram in (self.left_dendrogram, self.top_dendrogram):
            dendrogram.set_root(root)
        self.left_dendrogram.setVisible(has_tree)
        self.top_dendrogram.setVisible(has_tree)

        # A maximum size of 0 collapses the widget; -1 is used otherwise.
        limit = -1 if has_tree else 0
        self.left_dendrogram.setMaximumWidth(limit)
        self.top_dendrogram.setMaximumHeight(limit)

    def _invalidate_ordering(self):
        """
        Callback of the sorting combo box: rebuild the display for the
        newly chosen ordering.
        """
        self._sorted_matrix = None
        if self.matrix is None:
            return
        # Remove the previous matrix item first; otherwise every change of
        # the sorting would stack another item in the view box.
        self._clear_plot()
        self._update_ordering()
        self._setup_scene()
        # Labels are displayed in sort order, so they must follow the new
        # ordering (and `_clear_plot` has just cleared them).
        self._update_labels()

    def _update_ordering(self):
        """Compute the matrix in display order and the permutation used,
        according to the sorting mode."""
        if self.sorting == 0:
            # No sorting: show the matrix as given.
            self._sorted_matrix = self.matrix.X
            self._sort_indices = None
            return

        if self.sorting == 1:
            tree = self._cluster_tree()
        elif self.sorting == 2:
            tree = self._ordered_cluster_tree()

        order = numpy.array(
            [leaf.value.index for leaf in hierarchical.leaves(tree)])
        # Permute rows and columns with the same leaf order.
        self._sorted_matrix = self.matrix.X[order[:, None], order[None, :]]
        self._sort_indices = order

    def _invalidate_annotations(self):
        """Callback of the annotations combo box: refresh the labels."""
        if self.matrix is None:
            return
        self._update_labels()

    def _update_labels(self):
        """
        Compute the labels for the selected annotation and display them.

        Index 0 shows no labels and index 1 enumerates the elements from 1.
        "Attribute names" uses the attributes of the matrix' row items, the
        third entry of an attribute list uses the variables' names, and for
        a data table the values of the selected variable are used. Any other
        combination shows no labels.
        """
        model = self.annot_combo.model()
        idx = self.annotation_idx

        if idx == 0:
            labels = None
        elif idx == 1:
            labels = [str(i + 1) for i in range(self.matrix.dim[0])]
        elif model[idx] == "Attribute names":
            attr = self.matrix.row_items.domain.attributes
            labels = [str(attr[i]) for i in range(self.matrix.dim[0])]
        elif idx == 2 and isinstance(self.items, widget.AttributeList):
            labels = [v.name for v in self.items]
        elif isinstance(self.items, Orange.data.Table):
            var = model[idx]
            column, _ = self.items.get_column_view(var)
            labels = [var.repr_val(value) for value in column]
        else:
            # No branch applies (e.g. items of an unexpected type); without
            # this fallback `labels` would be unbound below.
            labels = None

        self._set_labels(labels)

    def _set_labels(self, labels):
        """Remember `labels` and show them, in sort order, in both label
        lists; empty or None labels hide and collapse the lists."""
        self._labels = labels

        if labels and self.sorting:
            # Display labels in the same order as the sorted matrix.
            labels = [labels[i] for i in self._sort_indices]

        shown = labels or []
        visible = bool(labels)
        for textlist in (self.right_labels, self.bottom_labels):
            textlist.set_labels(shown)
            textlist.setVisible(visible)

        limit = -1 if labels else 0
        self.right_labels.setMaximumWidth(limit)
        self.bottom_labels.setMaximumHeight(limit)

    def _update_color(self):
        """Rebuild the matrix item's lookup table from the chosen palette
        and the low/high settings; does nothing without a matrix item."""
        if not self.matrix_item:
            return

        _, palette = self.palettes[self.colormap]
        # Take the palette entry with the largest key.
        n, colors = max(palette.items())
        colors = numpy.array(colors, dtype=numpy.ubyte)

        low = self.color_low * 255
        high = self.color_high * 255
        points = numpy.linspace(low, high, n)
        space = numpy.linspace(0, 255, 255)

        # Interpolate each of the R, G, B channels over `space`.
        r, g, b = (
            numpy.interp(space, points, colors[:, channel],
                         left=255, right=0)
            for channel in range(3)
        )
        self.matrix_item.setLookupTable(numpy.c_[r, g, b])

    def _invalidate_selection(self):
        """Slot for a change of the matrix item's selection: store the
        selected indices (in original matrix order) and commit."""
        selections = self.matrix_item.selections()
        # Flatten twice: selections -> ranges -> individual indices.
        ranges = [rng for group in selections for rng in group]
        indices = [index for rng in ranges for index in rng]
        if self.sorting:
            # Map displayed positions back to positions in the input matrix.
            order = self._sort_indices
            indices = [order[i] for i in indices]
        self._selection = sorted(set(indices))
        self.commit()

    def commit(self):
        """
        Send the selection on both outputs.

        For table items, "Data" gets the selected rows (axis 1) or the table
        restricted to the selected columns (axis 0). For an attribute list,
        "Features" gets the selected attributes. Outputs that do not apply
        are sent None.
        """
        datasubset = featuresubset = None
        selection = self._selection
        items = self.items

        if selection:
            if isinstance(items, Orange.data.Table):
                axis = self.matrix.axis
                if axis == 1:
                    datasubset = items.from_table_rows(items, selection)
                elif axis == 0:
                    source_domain = items.domain
                    domain = Orange.data.Domain(
                        [source_domain[i] for i in selection],
                        source_domain.class_vars, source_domain.metas)
                    datasubset = Orange.data.Table.from_table(domain, items)
            elif isinstance(items, widget.AttributeList):
                featuresubset = widget.AttributeList(
                    [items[i] for i in selection])

        self.send("Data", datasubset)
        self.send("Features", featuresubset)
Esempio n. 4
0
class OWRandomForest(OWBaseLearner):
    """Learner widget that configures a random forest.

    The number of trees is always passed to the learner.  The number of
    features per split, the tree depth, the minimum subset size to split
    and a fixed random seed are optional and only passed on when their
    checkbox is ticked.
    """
    name = "随机森林(Random Forest)"
    description = "使用一组决策树进行预测。"
    icon = "icons/RandomForest.svg"
    replaces = [
        "Orange.widgets.classify.owrandomforest.OWRandomForest",
        "Orange.widgets.regression.owrandomforestregression.OWRandomForestRegression",
    ]
    priority = 40
    keywords = []

    LEARNER = RandomForestLearner

    # Persisted settings; every ``use_*`` flag gates the value next to it.
    n_estimators = settings.Setting(10)
    max_features = settings.Setting(5)
    use_max_features = settings.Setting(False)
    use_random_state = settings.Setting(False)
    max_depth = settings.Setting(3)
    use_max_depth = settings.Setting(False)
    min_samples_split = settings.Setting(5)
    use_min_samples_split = settings.Setting(True)
    index_output = settings.Setting(0)

    class Error(OWBaseLearner.Error):
        not_enough_features = Msg("Insufficient number of attributes ({})")

    def add_main_layout(self):
        """Build the two control boxes with the forest's parameters."""
        # this is part of init, pylint: disable=attribute-defined-outside-init
        on_change = self.settings_changed
        # Keyword arguments shared by every spin box in this widget.
        spin_style = {
            "controlWidth": 80,
            "alignment": Qt.AlignRight,
            "callback": on_change,
        }

        basic_box = gui.vBox(self.controlArea, "基本特性")
        self.n_estimators_spin = gui.spin(
            basic_box, self, "n_estimators",
            minv=1, maxv=10000,
            label="树的数量: ",
            **spin_style)
        self.max_features_spin = gui.spin(
            basic_box, self, "max_features",
            minv=2, maxv=50,
            label="每次拆分时考虑的属性数: ",
            checked="use_max_features",
            checkCallback=on_change,
            **spin_style)
        self.random_state = gui.checkBox(
            basic_box, self, "use_random_state",
            label="可重复的训练",
            callback=on_change)

        growth_box = gui.vBox(self.controlArea, "生长控制")
        self.max_depth_spin = gui.spin(
            growth_box, self, "max_depth",
            minv=1, maxv=50,
            label="单个树的极限深度: ",
            checked="use_max_depth",
            checkCallback=on_change,
            **spin_style)
        self.min_samples_split_spin = gui.spin(
            growth_box, self, "min_samples_split",
            minv=2, maxv=1000,
            label="小于...不要拆分: ",
            checked="use_min_samples_split",
            checkCallback=on_change,
            **spin_style)

    def create_learner(self):
        """Instantiate ``LEARNER`` with the currently enabled parameters."""
        # (enabled flag, keyword name, value); only enabled entries are used.
        optional = (
            (self.use_max_features, "max_features", self.max_features),
            (self.use_random_state, "random_state", 0),
            (self.use_max_depth, "max_depth", self.max_depth),
            (self.use_min_samples_split,
             "min_samples_split", self.min_samples_split),
        )
        learner_kwargs = {"n_estimators": self.n_estimators}
        for enabled, keyword, value in optional:
            if enabled:
                learner_kwargs[keyword] = value

        return self.LEARNER(preprocessors=self.preprocessors,
                            **learner_kwargs)

    def check_data(self):
        """Validate the data; reject it when it has too few attributes.

        On top of the base class checks, the data is invalid when the
        features-per-split limit is enabled and exceeds the number of
        attributes in the data.
        """
        self.Error.not_enough_features.clear()
        if not super().check_data():
            return self.valid_data

        attribute_count = len(self.data.domain.attributes)
        if self.use_max_features and self.max_features > attribute_count:
            self.Error.not_enough_features(attribute_count)
            self.valid_data = False
        return self.valid_data

    def get_learner_parameters(self):
        """Return (label, value) pairs describing the learner for the report."""
        unlimited = "unlimited"
        considered = self.max_features if self.use_max_features else unlimited
        depth = self.max_depth if self.use_max_depth else unlimited
        if self.use_min_samples_split:
            smallest_split = self.min_samples_split
        else:
            smallest_split = unlimited
        replicable = ("No", "Yes")[self.use_random_state]

        return (
            ("Number of trees", self.n_estimators),
            ("Maximal number of considered features", considered),
            ("Replicable training", replicable),
            ("Maximal tree depth", depth),
            ("Stop splitting nodes with maximum instances", smallest_split),
        )
Esempio n. 5
0
class OWImpute(OWWidget):
    """Widget that imputes missing values in a data table.

    One default method applies to every variable; individual variables can
    be given their own method (and, for the "value" method, a fixed
    replacement value) through the list view.  Per-variable choices are
    kept in ``variable_methods`` keyed by ``variable_key(var)`` and stored
    as a context setting.  The imputed table is sent on the "Data" output.
    """
    name = "Impute"
    description = "Impute missing values in the data table."
    icon = "icons/Impute.svg"
    priority = 2130

    inputs = [("Data", Orange.data.Table, "set_data"),
              ("Learner", Learner, "set_learner")]
    outputs = [("Data", Orange.data.Table)]

    METHODS = METHODS

    settingsHandler = settings.DomainContextHandler()

    # Index into METHODS of the method used for variables without their own.
    default_method = settings.Setting(1)
    # Maps variable_key(var) -> State(method, params).
    variable_methods = settings.ContextSetting({})
    autocommit = settings.Setting(True)

    want_main_area = False
    resizing_enabled = False

    def __init__(self):
        """Build the GUI: default method box, per-variable box, commit row."""
        super().__init__()
        self.modified = False

        box = group_box(self.tr("Default method"), layout=layout(Qt.Vertical))
        self.controlArea.layout().addWidget(box)

        bgroup = QButtonGroup()

        # The first entry (used below as the "use default" marker) and the
        # last entry (the fixed "value" method) are not offered as defaults.
        for i, m in enumerate(self.METHODS[1:-1], 1):
            b = radio_button(m.name,
                             checked=i == self.default_method,
                             group=bgroup,
                             group_id=i)
            box.layout().addWidget(b)

        self.defbggroup = bgroup

        bgroup.buttonClicked[int].connect(self.set_default_method)
        box = group_box(self.tr("Individual attribute settings"),
                        layout=layout(Qt.Horizontal))
        self.controlArea.layout().addWidget(box)

        self.varview = QtGui.QListView(
            selectionMode=QtGui.QListView.ExtendedSelection)
        self.varview.setItemDelegate(DisplayFormatDelegate())
        self.varmodel = itemmodels.VariableListModel()
        self.varview.setModel(self.varmodel)
        self.varview.selectionModel().selectionChanged.connect(
            self._on_var_selection_changed)
        self.selection = self.varview.selectionModel()

        box.layout().addWidget(self.varview)

        method_layout = layout(Qt.Vertical, margins=0)
        box.layout().addLayout(method_layout)

        methodbox = group_box(layout=layout(Qt.Vertical))

        # Per-variable button ids are indices into METHODS.
        bgroup = QButtonGroup()
        for i, m in enumerate(self.METHODS):
            b = radio_button(m.name, group=bgroup, group_id=i)
            methodbox.layout().addWidget(b)

        assert self.METHODS[-1].short == "value"

        # Editor for the fixed value: page 0 is a combo box (discrete
        # variables), page 1 a line edit restricted to numbers.
        self.value_stack = value_stack = QStackedLayout()
        self.value_combo = QComboBox(
            minimumContentsLength=8,
            sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLength,
            activated=self._on_value_changed)
        self.value_line = QLineEdit(editingFinished=self._on_value_changed)
        self.value_line.setValidator(QDoubleValidator())
        value_stack.addWidget(self.value_combo)
        value_stack.addWidget(self.value_line)
        methodbox.layout().addLayout(value_stack)

        bgroup.buttonClicked[int].connect(
            self.set_method_for_current_selection)
        reset_button = push_button("Restore all to default",
                                   clicked=self.reset_var_methods,
                                   default=False,
                                   autoDefault=False)

        method_layout.addWidget(methodbox)
        method_layout.addStretch(2)
        method_layout.addWidget(reset_button)
        self.varmethodbox = methodbox
        self.varbgroup = bgroup

        box = gui.auto_commit(self.controlArea,
                              self,
                              "autocommit",
                              "Commit",
                              orientation="horizontal",
                              checkbox_label="Commit on any change")
        box.layout().insertSpacing(0, 80)
        box.layout().insertWidget(0, self.report_button)
        self.data = None
        self.learner = None

    def set_default_method(self, index):
        """
        Set the current selected default imputation method.
        """
        if self.default_method != index:
            self.default_method = index
            self.defbggroup.button(index).setChecked(True)
            self._invalidate()

    @check_sql_input
    def set_data(self, data):
        """Handle new input data: reset state, restore context, commit."""
        self.closeContext()
        self.clear()
        self.data = data
        if data is not None:
            self.varmodel[:] = data.domain.variables
            self.openContext(data.domain)
            # Push the per-variable states restored by the context into the
            # list model.
            self.restore_state(self.variable_methods)
            itemmodels.select_row(self.varview, 0)
        self.unconditional_commit()

    def set_learner(self, learner):
        """Store the input learner; recommit if any column uses "model"."""
        self.learner = learner

        if self.data is not None and \
                any(state.method.short == "model" for state in
                    map(self.state_for_column, range(len(self.data.domain)))):
            self.commit()

    def restore_state(self, state):
        """Copy the states in the mapping `state` into the variable model.

        :param dict state: maps ``variable_key(var)`` to a State.
        """
        for i, var in enumerate(self.varmodel):
            key = variable_key(var)
            if key in state:
                index = self.varmodel.index(i)
                self.varmodel.setData(index, state[key], Qt.UserRole)

    def clear(self):
        """Drop the data, the variable list and all per-variable methods."""
        self.varmodel[:] = []
        self.variable_methods = {}
        self.data = None
        self.modified = False

    def state_for_column(self, column):
        """
        #:: int -> State
        Return the effective imputation state for `column`.

        A variable with no stored state, or one whose stored method is
        ``METHODS[0]``, gets the current default method with no parameters.

        :param int column:
        :rtype State:

        """
        var = self.varmodel[column]

        state = self.variable_methods.get(variable_key(var), None)
        if state is None or state.method == METHODS[0]:
            state = State(METHODS[self.default_method], ())
        return state

    def imputed_vars_for_column(self, column):
        """Return the variable(s) replacing the one at index `column`.

        "leave" and "drop" return the original variable; the remaining
        methods return whatever the matching ``impute`` callable produces
        for the data and variable.

        :param int column: index into the data domain.
        """
        state = self.state_for_column(column)
        data = self.data
        var = data.domain[column]
        method, params = state
        if method.short == "leave":
            return var
        elif method.short == "drop":
            return var
        elif method.short == "avg":
            return impute.Average()(data, var)
        elif method.short == "model":
            # Fall back to a simple tree when no learner is on the input.
            learner = (self.learner if self.learner is not None else
                       Orange.classification.SimpleTreeLearner())
            return impute.Model(learner)(data, var)
        elif method.short == "random":
            return impute.Random()(data, var)
        elif method.short == "value":
            # params[0] is the line-edit text or the combo index.
            return impute.Default(float(params[0]))(data, var)
        elif method.short == "as_value":
            return impute.AsValue()(data, var)
        else:
            assert False

    def commit(self):
        """Build the imputed table and send it on the "Data" output.

        Columns with the "drop" method are kept as they are, and rows
        with undefined values in those columns are filtered out at the end.
        """
        if self.data is not None:
            varstates = [(var, self.state_for_column(i))
                         for i, var in enumerate(self.varmodel)]
            attrs = []
            class_vars = []
            filter_columns = []
            for i, (var, state) in enumerate(varstates):
                if state.method.short == "drop":
                    imputedvars = [var]
                    filter_columns.append(i)
                elif state.method.short == "leave":
                    imputedvars = [var]
                else:
                    imputedvars = self.imputed_vars_for_column(i)
                    # Normalise the result to a list of variables.
                    if imputedvars is None:
                        imputedvars = []
                    elif isinstance(imputedvars, Orange.data.Variable):
                        imputedvars = [imputedvars]

                # The model lists attributes first, then class variables.
                if i < len(self.data.domain.attributes):
                    attrs.extend(imputedvars)
                else:
                    class_vars.extend(imputedvars)

            domain = Orange.data.Domain(attrs, class_vars,
                                        self.data.domain.metas)

            data = self.data.from_table(domain, self.data)

            if filter_columns:
                filter_ = Orange.data.filter.IsDefined(filter_columns)
                data = filter_(data)
        else:
            data = None

        self.send("Data", data)
        self.modified = False

    def send_report(self):
        """Report the default method and any per-variable overrides."""
        specific = []
        for var in self.varmodel:
            state = self.variable_methods.get(variable_key(var), None)
            if state is not None and state.method.short:
                if state.method.short == "value":
                    if var.is_continuous:
                        specific.append("{} (impute value {})".format(
                            var.name, float(state.params[0])))
                    else:
                        # For discrete variables the parameter is an index
                        # into the variable's values.
                        specific.append("{} (impute value '{}')".format(
                            var.name, var.values[state.params[0]]))
                else:
                    specific.append("{} ({})".format(
                        var.name, state.method.name.lower()))
        default = self.METHODS[self.default_method].name
        if specific:
            self.report_items((("Default method", default),
                               ("Specific imputers", ", ".join(specific))))
        else:
            self.report_items((("Method", default), ))

    def _invalidate(self):
        """Mark the widget as modified and commit."""
        self.modified = True
        self.commit()

    def _on_var_selection_changed(self):
        """Sync the method buttons and value editor with the selection.

        If all selected variables share one state, its method button is
        checked; otherwise no button is checked.  The value editor shows
        the line edit when all selected variables are continuous, and the
        combo box (filled with the variable's values) when exactly one
        discrete variable is selected; otherwise it is disabled.
        """
        indexes = self.selection.selectedIndexes()

        selected_vars = [self.varmodel[index.row()] for index in indexes]
        defstate = State(METHODS[0], ())
        states = [
            self.variable_methods.get(variable_key(var), defstate)
            for var in selected_vars
        ]
        all_cont = all(var.is_continuous for var in selected_vars)
        states = list(unique(states))
        method = None
        params = ()
        state = None
        if len(states) == 1:
            state = states[0]
            method, params = state
            mindex = METHODS.index(method)
            self.varbgroup.button(mindex).setChecked(True)
        elif self.varbgroup.checkedButton() is not None:
            # An exclusive group cannot be unchecked directly, so
            # exclusivity is lifted while the button is cleared.
            self.varbgroup.setExclusive(False)
            self.varbgroup.checkedButton().setChecked(False)
            self.varbgroup.setExclusive(True)

        values, enabled, stack_index = [], False, 0
        value, value_index = "0.0", 0
        if all_cont:
            enabled, stack_index = True, 1
            if method is not None and method.short == "value":
                value = params[0]

        elif len(selected_vars) == 1 and selected_vars[0].is_discrete:
            values, enabled, stack_index = selected_vars[0].values, True, 0
            if method is not None and method.short == "value":
                stored = params[0]
                if isinstance(stored, int) and 0 <= stored < len(values):
                    # set_method_for_indexes stores the combo box index for
                    # discrete variables, so it can be used directly.
                    value_index = stored
                else:
                    # Otherwise try to look the parameter up as a value;
                    # a miss raises ValueError and leaves the first item
                    # selected.
                    try:
                        value_index = values.index(stored)
                    except ValueError:
                        pass

        self.value_stack.setCurrentIndex(stack_index)
        self.value_stack.setEnabled(enabled)

        if stack_index == 0:
            self.value_combo.clear()
            self.value_combo.addItems(values)
            self.value_combo.setCurrentIndex(value_index)
        else:
            self.value_line.setText(value)

    def _on_value_changed(self):
        """Re-apply the checked method after the fixed value was edited."""
        # The "fixed" value in the widget has been changed by the user.
        index = self.varbgroup.checkedId()
        self.set_method_for_current_selection(index)

    def set_method_for_current_selection(self, methodindex):
        """Apply ``METHODS[methodindex]`` to the selected variables."""
        indexes = self.selection.selectedIndexes()
        self.set_method_for_indexes(indexes, methodindex)

    def set_method_for_indexes(self, indexes, methodindex):
        """Apply ``METHODS[methodindex]`` to the variables at `indexes`.

        For the "value" method the parameter is read from the visible value
        editor: the combo box index or the line edit text.

        :param indexes: iterable of model indexes into ``varmodel``.
        :param int methodindex: index into METHODS.
        """
        method = METHODS[methodindex]
        params = (None, )
        if method.short == "value":
            if self.value_stack.currentIndex() == 0:
                value = self.value_combo.currentIndex()
            else:
                value = self.value_line.text()
            params = (value, )
        elif method.short == "model":
            params = ("model", )
        state = State(method, params)

        for index in indexes:
            self.varmodel.setData(index, state, Qt.UserRole)
            var = self.varmodel[index.row()]
            self.variable_methods[variable_key(var)] = state

        self._invalidate()

    def reset_var_methods(self):
        """Reset every variable to ``METHODS[0]``."""
        indexes = map(self.varmodel.index, range(len(self.varmodel)))
        self.set_method_for_indexes(indexes, 0)
Esempio n. 6
0
class OWCalibrationPlot(widget.OWWidget):
    """Widget that plots calibration and threshold-dependent metric curves.

    It takes evaluation results, draws one curve (or set of curves) per
    selected classifier for the chosen entry of ``Metrics``, and can output
    a single model wrapped either in a calibrated learner (when the first
    metric is selected) or in a threshold classifier (any other metric).
    """
    name = "Calibration Plot"
    description = "Calibration plot based on evaluation of classifiers."
    icon = "icons/CalibrationPlot.svg"
    priority = 1030
    keywords = []

    class Inputs:
        evaluation_results = Input("Evaluation Results", Results)

    class Outputs:
        calibrated_model = Output("Calibrated Model", Model)

    class Error(widget.OWWidget.Error):
        non_discrete_target = Msg("Calibration plot requires a categorical "
                                  "target variable.")
        empty_input = widget.Msg("Empty result on input. Nothing to display.")
        nan_classes = \
            widget.Msg("Remove test data instances with unknown classes.")
        all_target_class = widget.Msg(
            "All data instances belong to target class.")
        no_target_class = widget.Msg(
            "No data instances belong to target class.")

    class Warning(widget.OWWidget.Warning):
        omitted_folds = widget.Msg(
            "Test folds where all data belongs to (non)-target are not shown.")
        omitted_nan_prob_points = widget.Msg(
            "Instance for which the model couldn't compute probabilities are "
            "skipped.")
        no_valid_data = widget.Msg("No valid data for model(s) {}")

    class Information(widget.OWWidget.Information):
        no_output = Msg("Can't output a model: {}")

    settingsHandler = EvaluationResultsContextHandler()
    target_index = settings.ContextSetting(0)
    selected_classifiers = settings.ContextSetting([])
    # Index into Metrics; 0 selects the calibration curve.
    score = settings.Setting(0)
    # Index of the calibration method (0 sigmoid, 1 isotonic, matching the
    # order of the radio buttons).
    output_calibration = settings.Setting(0)
    fold_curves = settings.Setting(False)
    display_rug = settings.Setting(True)
    threshold = settings.Setting(0.5)
    visual_settings = settings.Setting({}, schema_only=True)
    auto_commit = settings.Setting(True)

    graph_name = "plot"

    def __init__(self):
        """Build the control area and the plot."""
        super().__init__()

        self.results = None
        # List of (classifier name, (x values, tuple of curves)) filled by
        # _setup_plot; None until the first plot.
        self.scores = None
        self.classifier_names = []
        self.colors = []
        # Movable threshold line; only created for non-calibration metrics.
        self.line = None

        self._last_score_value = -1

        box = gui.vBox(self.controlArea, box="Settings")
        self.target_cb = gui.comboBox(box,
                                      self,
                                      "target_index",
                                      label="Target:",
                                      orientation=Qt.Horizontal,
                                      callback=self.target_index_changed,
                                      contentsLength=8,
                                      searchable=True)
        gui.checkBox(box,
                     self,
                     "display_rug",
                     "Show rug",
                     callback=self._on_display_rug_changed)
        gui.checkBox(box,
                     self,
                     "fold_curves",
                     "Curves for individual folds",
                     callback=self._replot)

        self.classifiers_list_box = gui.listBox(
            self.controlArea,
            self,
            "selected_classifiers",
            "classifier_names",
            box="Classifier",
            selectionMode=QListWidget.ExtendedSelection,
            sizePolicy=(QSizePolicy.Preferred, QSizePolicy.Preferred),
            sizeHint=QSize(150, 40),
            callback=self._on_selection_changed)

        box = gui.vBox(self.controlArea, "Metrics")
        combo = gui.comboBox(box,
                             self,
                             "score",
                             items=(metric.name for metric in Metrics),
                             callback=self.score_changed)

        # Explanation label is kept as wide as the combo and uses a
        # slightly smaller font.
        self.explanation = gui.widgetLabel(box,
                                           wordWrap=True,
                                           fixedWidth=combo.sizeHint().width())
        self.explanation.setContentsMargins(8, 8, 0, 0)
        font = self.explanation.font()
        font.setPointSizeF(0.85 * font.pointSizeF())
        self.explanation.setFont(font)

        gui.radioButtons(box,
                         self,
                         value="output_calibration",
                         btnLabels=("Sigmoid calibration",
                                    "Isotonic calibration"),
                         label="Output model calibration",
                         callback=self.apply)

        self.info_box = gui.widgetBox(self.controlArea, "Info")
        self.info_label = gui.widgetLabel(self.info_box)

        gui.auto_apply(self.buttonsArea,
                       self,
                       "auto_commit",
                       commit=self.apply)

        self.plotview = pg.GraphicsView(background="w")
        axes = {
            "bottom": AxisItem(orientation="bottom"),
            "left": AxisItem(orientation="left")
        }
        self.plot = pg.PlotItem(enableMenu=False, axisItems=axes)
        self.plot.parameter_setter = ParameterSetter(self.plot)
        self.plot.setMouseEnabled(False, False)
        self.plot.hideButtons()

        for axis_name in ("bottom", "left"):
            axis = self.plot.getAxis(axis_name)
            axis.setPen(pg.mkPen(color=0.0))
            # Remove the condition (that is, allow setting this for bottom
            # axis) when pyqtgraph is fixed
            # Issue: https://github.com/pyqtgraph/pyqtgraph/issues/930
            # Pull request: https://github.com/pyqtgraph/pyqtgraph/pull/932
            if axis_name != "bottom":  # remove if when pyqtgraph is fixed
                axis.setStyle(stopAxisAtTick=(True, True))

        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05)
        self.plotview.setCentralItem(self.plot)

        self.mainArea.layout().addWidget(self.plotview)
        self._set_explanation()

        VisualSettingsDialog(self, self.plot.parameter_setter.initial_settings)

    @Inputs.evaluation_results
    def set_results(self, results):
        """Accept new evaluation results, validate them and replot.

        Results are rejected (with an error shown) when the class is not
        discrete, when there are no actual values, or when any actual value
        is NaN.
        """
        self.closeContext()
        self.clear()
        self.Error.clear()
        self.Information.clear()

        self.results = None
        if results is not None:
            if not results.domain.has_discrete_class:
                self.Error.non_discrete_target()
            elif not results.actual.size:
                self.Error.empty_input()
            elif np.any(np.isnan(results.actual)):
                self.Error.nan_classes()
            else:
                self.results = results
                self._initialize(results)
                class_var = self.results.domain.class_var
                # Default target: index 1 for a two-valued class, else 0;
                # a matching context may override it below.
                self.target_index = int(len(class_var.values) == 2)
                self.openContext(class_var, self.classifier_names)
                self._replot()

        self.apply()

    def clear(self):
        """Reset the plot and everything derived from the results."""
        self.plot.clear()
        self.results = None
        self.classifier_names = []
        self.selected_classifiers = []
        self.target_cb.clear()
        self.colors = []

    def target_index_changed(self):
        """React to a new target class; mirror the threshold if binary."""
        if len(self.results.domain.class_var.values) == 2:
            self.threshold = 1 - self.threshold
        self._set_explanation()
        self._replot()
        self.apply()

    def score_changed(self):
        """React to a new metric; re-apply only if it actually changed."""
        self._set_explanation()
        self._replot()
        if self._last_score_value != self.score:
            self.apply()
            self._last_score_value = self.score

    def _set_explanation(self):
        """Update explanation text, control visibility and axis labels."""
        explanation = Metrics[self.score].explanation
        if explanation:
            self.explanation.setText(explanation)
            self.explanation.show()
        else:
            self.explanation.hide()

        # Calibration options are shown for the first metric, the info box
        # (threshold and scores) for all others.
        if self.score == 0:
            self.controls.output_calibration.show()
            self.info_box.hide()
        else:
            self.controls.output_calibration.hide()
            self.info_box.show()

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Predicted probability" if self.score ==
                      0 else "Threshold probability to classify as positive")

        axis = self.plot.getAxis("left")
        axis.setLabel(Metrics[self.score].name)

    def _initialize(self, results):
        """Fill classifier names, colors and target values from `results`."""
        n = len(results.predicted)
        names = getattr(results, "learner_names", None)
        if names is None:
            names = ["#{}".format(i + 1) for i in range(n)]

        self.classifier_names = names
        self.colors = colorpalettes.get_default_curve_colors(n)

        for i in range(n):
            item = self.classifiers_list_box.item(i)
            item.setIcon(colorpalettes.ColorIcon(self.colors[i]))

        self.selected_classifiers = list(range(n))
        self.target_cb.addItems(results.domain.class_var.values)
        self.target_index = 0

    def _rug(self, data, pen_args):
        """Draw short ticks at the predicted probabilities.

        Ticks for target instances hang from y=1, ticks for the others
        stand on y=0; each tick is ``rh`` high.
        """
        color = pen_args["pen"].color()
        rh = 0.025
        # Each probability is doubled so that consecutive pairs of points
        # form the two ends of one tick (see connect="pairs").
        rug_x = np.c_[data.probs[:-1], data.probs[:-1]]
        rug_x_true = rug_x[data.ytrue].ravel()
        rug_x_false = rug_x[~data.ytrue].ravel()

        rug_y_true = np.ones_like(rug_x_true)
        rug_y_true[1::2] = 1 - rh
        rug_y_false = np.zeros_like(rug_x_false)
        rug_y_false[1::2] = rh

        self.plot.plot(rug_x_false,
                       rug_y_false,
                       pen=color,
                       connect="pairs",
                       antialias=True)
        self.plot.plot(rug_x_true,
                       rug_y_true,
                       pen=color,
                       connect="pairs",
                       antialias=True)

    def plot_metrics(self, data, metrics, pen_args):
        """Plot the curves for `data` and return ``(x, curves)``.

        With ``metrics`` set to None the smoothed calibration curve is
        drawn; otherwise each function in `metrics` is evaluated on `data`
        and plotted against ``data.probs``.
        """
        if metrics is None:
            return self._prob_curve(data.ytrue, data.probs[:-1], pen_args)
        ys = [metric(data) for metric in metrics]
        for y in ys:
            self.plot.plot(data.probs, y, **pen_args)
        return data.probs, ys

    def _prob_curve(self, ytrue, probs, pen_args):
        """Plot the smoothed calibration curve; return ``(x, (y,))``.

        The curve is evaluated at 100 points between the smallest and the
        largest probability.  If all probabilities are equal, the curve is
        constant at that value.
        """
        xmin, xmax = probs.min(), probs.max()
        x = np.linspace(xmin, xmax, 100)
        if xmax != xmin:
            f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin))
            y = f(x)
        else:
            y = np.full(100, xmax)

        self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args)
        return x, (y, )

    def _setup_plot(self):
        """Draw curves for all selected classifiers and fill ``scores``."""
        target = self.target_index
        results = self.results
        metrics = Metrics[self.score].functions
        plot_folds = self.fold_curves and results.folds is not None
        self.scores = []

        if not self._check_class_presence(results.actual == target):
            return

        self.Warning.omitted_folds.clear()
        self.Warning.omitted_nan_prob_points.clear()
        no_valid_models = []
        # The shadow is wider when fold curves are drawn as well.
        shadow_width = 4 + 4 * plot_folds
        for clsf in self.selected_classifiers:
            data = Curves.from_results(results, target, clsf)
            if data.tot == 0:  # all probabilities are nan
                no_valid_models.append(clsf)
                continue
            if data.tot != results.probabilities.shape[1]:  # some are nan
                self.Warning.omitted_nan_prob_points()

            color = self.colors[clsf]
            # NOTE(review): the key is spelled `antiAlias` here but
            # `antialias` in the other plot() calls of this class; confirm
            # which spelling pyqtgraph honours.
            pen_args = dict(pen=pg.mkPen(color, width=1),
                            antiAlias=True,
                            shadowPen=pg.mkPen(color.lighter(160),
                                               width=shadow_width))
            self.scores.append((self.classifier_names[clsf],
                                self.plot_metrics(data, metrics, pen_args)))

            if self.display_rug:
                self._rug(data, pen_args)

            if plot_folds:
                pen_args = dict(pen=pg.mkPen(color, width=1,
                                             style=Qt.DashLine),
                                antiAlias=True)
                for fold in range(len(results.folds)):
                    fold_results = results.get_fold(fold)
                    fold_curve = Curves.from_results(fold_results, target,
                                                     clsf)
                    # Can't check this before: p and n can be 0 because of
                    # nan probabilities
                    # NOTE(review): the warning says such folds are not
                    # shown, yet the curve is still plotted below; confirm
                    # whether a `continue` is intended.
                    if fold_curve.p * fold_curve.n == 0:
                        self.Warning.omitted_folds()
                    self.plot_metrics(fold_curve, metrics, pen_args)

        if no_valid_models:
            self.Warning.no_valid_data(", ".join(self.classifier_names[i]
                                                 for i in no_valid_models))

        if self.score == 0:
            # Diagonal reference line for the calibration curve.
            self.plot.plot([0, 1], [0, 1], antialias=True)
        else:
            self.line = pg.InfiniteLine(
                pos=self.threshold,
                movable=True,
                pen=pg.mkPen(color="k", style=Qt.DashLine, width=2),
                hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3),
                bounds=(0, 1),
            )
            self.line.sigPositionChanged.connect(self.threshold_change)
            self.line.sigPositionChangeFinished.connect(
                self.threshold_change_done)
            self.plot.addItem(self.line)

    def _check_class_presence(self, ytrue):
        """Return True if `ytrue` contains both target and non-target.

        Shows the corresponding error and returns False otherwise.
        """
        self.Error.all_target_class.clear()
        self.Error.no_target_class.clear()
        if np.max(ytrue) == 0:
            self.Error.no_target_class()
            return False
        if np.min(ytrue) == 1:
            self.Error.all_target_class()
            return False
        return True

    def _replot(self):
        """Clear the plot, redraw it if there are results, update info."""
        self.plot.clear()
        if self.results is not None:
            self._setup_plot()
        self._update_info()

    def _on_display_rug_changed(self):
        """Redraw after the rug checkbox was toggled."""
        self._replot()

    def _on_selection_changed(self):
        """Redraw and re-apply after the classifier selection changed."""
        self._replot()
        self.apply()

    def threshold_change(self):
        """Snap the dragged threshold line to two decimals; update info."""
        self.threshold = round(self.line.pos().x(), 2)
        self.line.setPos(self.threshold)
        self._update_info()

    def get_info_text(self, short):
        """Return an HTML table with the threshold and scores at it.

        :param bool short: if true, classifier names longer than 20
            characters are elided and a more compact header is used.
        :return: the HTML string, or None if nothing has been plotted yet.
        """
        if short:

            def elided(s):
                return s[:17] + "..." if len(s) > 20 else s

            text = f"""<table>
                            <tr>
                                <th align='right'>Threshold: p=</th>
                                <td colspan='4'>{self.threshold:.2f}<br/></td>
                            </tr>"""

        else:

            def elided(s):
                return s

            text = f"""<table>
                            <tr>
                                <th align='right'>Threshold:</th>
                                <td colspan='4'>p = {self.threshold:.2f}<br/>
                                </td>
                                <tr/>
                            </tr>"""

        if self.scores is not None:
            short_names = Metrics[self.score].short_names
            if short_names:
                text += f"""<tr>
                                <th></th>
                                {"<td></td>".join(f"<td align='right'>{n}</td>"
                                                  for n in short_names)}
                            </tr>"""
            for name, (probs, curves) in self.scores:
                # Position of the threshold on the curve's x axis, clipped
                # to the last point.
                ind = min(np.searchsorted(probs, self.threshold),
                          len(probs) - 1)
                text += f"<tr><th align='right'>{elided(name)}:</th>"
                text += "<td>/</td>".join(f'<td>{curve[ind]:.3f}</td>'
                                          for curve in curves)
                text += "</tr>"
            # Close the table opened at the top of the text.
            text += "</table>"
            return text
        return None

    def _update_info(self):
        """Show the short info text in the info label."""
        self.info_label.setText(self.get_info_text(short=True))

    def threshold_change_done(self):
        """Re-apply once the user has finished dragging the threshold."""
        self.apply()

    def apply(self):
        """Send the wrapped model, or None with an explanation.

        A model is output only when the results hold stored models from at
        most one fold, exactly one classifier is selected, and, for
        threshold-based metrics, the class is binary.  Otherwise the
        reasons are shown as an information message.
        """
        self.Information.no_output.clear()
        wrapped = None
        results = self.results
        if results is not None:
            problems = [
                msg for condition, msg in (
                    # folds may be None (see _setup_plot), so guard len().
                    (results.folds is not None and len(results.folds) > 1,
                     "each training data sample produces a different model"),
                    (results.models is None,
                     "test results do not contain stored models - try testing "
                     "on separate data or on training data"),
                    (len(self.selected_classifiers) != 1,
                     "select a single model - the widget can output only one"),
                    (self.score != 0
                     and len(results.domain.class_var.values) != 2,
                     "cannot calibrate non-binary classes")) if condition
            ]
            if len(problems) == 1:
                self.Information.no_output(problems[0])
            elif problems:
                self.Information.no_output("".join(f"\n - {problem}"
                                                   for problem in problems))
            else:
                clsf_idx = self.selected_classifiers[0]
                model = results.models[0, clsf_idx]
                if self.score == 0:
                    cal_learner = CalibratedLearner(None,
                                                    self.output_calibration)
                    wrapped = cal_learner.get_model(
                        model, results.actual, results.probabilities[clsf_idx])
                else:
                    # The threshold is mirrored when the target is class 0.
                    threshold = [1 - self.threshold,
                                 self.threshold][self.target_index]
                    wrapped = ThresholdClassifier(model, threshold)

        self.Outputs.calibrated_model.send(wrapped)

    def send_report(self):
        """Add settings, the plot and (for thresholds) scores to the report."""
        if self.results is None:
            return
        self.report_items(
            (("Target class", self.target_cb.currentText()),
             ("Output model calibration", self.score == 0
              and ("Sigmoid calibration",
                   "Isotonic calibration")[self.output_calibration])))
        caption = report.list_legend(self.classifiers_list_box,
                                     self.selected_classifiers)
        self.report_plot()
        self.report_caption(caption)
        self.report_caption(self.controls.score.currentText())

        if self.score != 0:
            self.report_raw(self.get_info_text(short=False))

    def set_visual_settings(self, key, value):
        """Apply one visual setting to the plot and remember it."""
        self.plot.parameter_setter.set_parameter(key, value)
        self.visual_settings[key] = value
Esempio n. 7
0
class OWROCAnalysis(widget.OWWidget):
    """Widget displaying ROC curves for evaluated classifiers.

    Takes evaluation results on input (handled by `set_results`) and plots
    one or more ROC curves per selected classifier for the chosen target
    class.
    """
    # Widget metadata.
    name = "ROC Analysis"
    description = "Display the Receiver Operating Characteristics curve " \
                  "based on the evaluation of classifiers."
    icon = "icons/ROCAnalysis.svg"
    priority = 1010
    # (signal name, signal type, name of the handler method)
    inputs = [("Evaluation Results", Orange.evaluation.Results, "set_results")]

    #: Index of the target class (bound to the target combo box)
    target_index = settings.Setting(0)
    #: Indices of the selected classifiers (bound to the classifier list box)
    selected_classifiers = []

    #: Show the iso-performance line / the default threshold point
    display_perf_line = settings.Setting(True)
    display_def_threshold = settings.Setting(True)

    #: Misclassification costs and the target class prior (in percent) used
    #: for the iso-performance line
    fp_cost = settings.Setting(500)
    fn_cost = settings.Setting(500)
    target_prior = settings.Setting(50.0)

    #: ROC Averaging Types
    Merge, Vertical, Threshold, NoAveraging = 0, 1, 2, 3
    roc_averaging = settings.Setting(Merge)

    #: Show the convex hull over all curves / the per-curve convex curves
    display_convex_hull = settings.Setting(False)
    display_convex_curve = settings.Setting(False)

    # Name of the attribute holding the plot (`self.plot`).
    # NOTE(review): presumably consumed by the base class when saving or
    # reporting the graph -- confirm against OWWidget.
    graph_name = "plot"

    def __init__(self):
        """Initialize the widget state and build the control and plot areas."""
        super().__init__()

        # Input results and the state derived from them; reset in `clear`.
        self.results = None
        self.classifier_names = []
        # NOTE(review): `perf_line` is never assigned anything but None in the
        # visible class code; the line item itself is kept in `_perf_line`.
        self.perf_line = None
        self.colors = []
        # Caches keyed by (target, classifier index).
        self._curve_data = {}
        self._plot_curves = {}
        # Convex hull of the merged curves and the iso-performance line item.
        self._rocch = None
        self._perf_line = None

        # --- "Plot" box: target class, classifiers, averaging, hulls ---
        box = gui.vBox(self.controlArea, "Plot")
        tbox = gui.vBox(box, "Target Class")
        tbox.setFlat(True)

        self.target_cb = gui.comboBox(tbox,
                                      self,
                                      "target_index",
                                      callback=self._on_target_changed,
                                      contentsLength=8)

        cbox = gui.vBox(box, "Classifiers")
        cbox.setFlat(True)
        self.classifiers_list_box = gui.listBox(
            cbox,
            self,
            "selected_classifiers",
            "classifier_names",
            selectionMode=QListView.MultiSelection,
            callback=self._on_classifiers_changed)

        abox = gui.vBox(box, "Combine ROC Curves From Folds")
        abox.setFlat(True)
        # Item order matches the Merge, Vertical, Threshold, NoAveraging
        # constants (0..3).
        gui.comboBox(abox,
                     self,
                     "roc_averaging",
                     items=[
                         "Merge Predictions from Folds", "Mean TP Rate",
                         "Mean TP and FP at Threshold",
                         "Show Individual Curves"
                     ],
                     callback=self._replot)

        hbox = gui.vBox(box, "ROC Convex Hull")
        hbox.setFlat(True)
        gui.checkBox(hbox,
                     self,
                     "display_convex_curve",
                     "Show convex ROC curves",
                     callback=self._replot)
        gui.checkBox(hbox,
                     self,
                     "display_convex_hull",
                     "Show ROC convex hull",
                     callback=self._replot)

        # --- "Analysis" box: default threshold point and performance line ---
        box = gui.vBox(self.controlArea, "Analysis")

        gui.checkBox(box,
                     self,
                     "display_def_threshold",
                     "Default threshold (0.5) point",
                     callback=self._on_display_def_threshold_changed)

        gui.checkBox(box,
                     self,
                     "display_perf_line",
                     "Show performance line",
                     callback=self._on_display_perf_line_changed)
        # The cost/prior spin boxes are laid out in a grid inside an
        # indented box below the check box.
        grid = QGridLayout()
        ibox = gui.indentedBox(box, orientation=grid)

        sp = gui.spin(box,
                      self,
                      "fp_cost",
                      1,
                      1000,
                      10,
                      callback=self._on_display_perf_line_changed)
        grid.addWidget(QLabel("FP Cost:"), 0, 0)
        grid.addWidget(sp, 0, 1)

        sp = gui.spin(box,
                      self,
                      "fn_cost",
                      1,
                      1000,
                      10,
                      callback=self._on_display_perf_line_changed)
        # No explicit cell is given for this label.
        # NOTE(review): presumably the layout puts it in the next free cell
        # -- confirm.
        grid.addWidget(QLabel("FN Cost:"))
        grid.addWidget(sp, 1, 1)
        sp = gui.spin(box,
                      self,
                      "target_prior",
                      1,
                      99,
                      callback=self._on_display_perf_line_changed)
        sp.setSuffix("%")
        # NOTE(review): the "Auto" action is not connected to anything here.
        sp.addAction(QAction("Auto", sp))
        grid.addWidget(QLabel("Prior target class probability:"))
        grid.addWidget(sp, 2, 1)

        # --- Main area: the ROC plot ---
        self.plotview = pg.GraphicsView(background="w")
        self.plotview.setFrameStyle(QFrame.StyledPanel)

        self.plot = pg.PlotItem()
        # The plot is static: no context menu and no mouse pan/zoom.
        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)

        pen = QPen(self.palette().color(QPalette.Text))

        # Tick labels use 2/3 of the widget font's pixel size, at least 11.
        tickfont = QFont(self.font())
        tickfont.setPixelSize(max(int(tickfont.pixelSize() * 2 // 3), 11))

        axis = self.plot.getAxis("bottom")
        axis.setTickFont(tickfont)
        axis.setPen(pen)
        axis.setLabel("FP Rate (1-Specificity)")

        axis = self.plot.getAxis("left")
        axis.setTickFont(tickfont)
        axis.setPen(pen)
        axis.setLabel("TP Rate (Sensitivity)")

        self.plot.showGrid(True, True, alpha=0.1)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.plotview.setCentralItem(self.plot)
        self.mainArea.layout().addWidget(self.plotview)

    def set_results(self, results):
        """Handle new evaluation results arriving on the input.

        The widget is cleared first.  The results are then validated with
        `check_results_adequacy`; if that yields None, the widget stays
        empty, otherwise the controls are initialized and the plot is drawn.
        """
        self.clear()
        checked = check_results_adequacy(results, self.Error)
        self.results = checked
        if checked is None:
            return
        self._initialize(results)
        self._setup_plot()

    def clear(self):
        """Reset the widget to its empty state.

        Drops the results, empties the plot and the classifier/target
        controls, and discards all cached curve data and plot items.
        """
        self.results = None
        self.plot.clear()

        # Controls and the attributes bound to them.
        self.classifier_names = []
        self.selected_classifiers = []
        self.target_cb.clear()
        self.target_index = 0

        # Derived state and caches.
        self.colors = []
        self._curve_data, self._plot_curves = {}, {}
        self._rocch = self._perf_line = None

    def _initialize(self, results):
        """Fill the classifier list and the target combo box from `results`.

        Classifier names come from `results.learner_names` when present;
        otherwise they are numbered "#1", "#2", ... by position in
        `results.predicted`.  Every classifier gets a color from the "Dark2"
        scheme, shown as an icon on its list item, and all classifiers start
        out selected.
        """
        names = getattr(results, "learner_names", None)
        if names is None:
            count = len(results.predicted)
            names = ["#{}".format(number) for number in range(1, count + 1)]

        scheme = colorbrewer.colorSchemes["qualitative"]["Dark2"]
        self.colors = colorpalette.ColorPaletteGenerator(len(names), scheme)

        self.classifier_names = names
        self.selected_classifiers = list(range(len(names)))
        for index, _ in enumerate(names):
            item = self.classifiers_list_box.item(index)
            item.setIcon(colorpalette.ColorPixmap(self.colors[index]))

        target_var = results.data.domain.class_var
        self.target_cb.addItems(target_var.values)

    def curve_data(self, target, clf_idx):
        """Return the `ROCData` for a target class and a classifier.

        The data is computed from `self.results` on first request and cached
        under the `(target, clf_idx)` key for subsequent calls.
        """
        key = (target, clf_idx)
        if key in self._curve_data:
            return self._curve_data[key]

        computed = ROCData.from_results(self.results, clf_idx, target)
        self._curve_data[key] = computed
        return computed

    def plot_curves(self, target, clf_idx):
        """Return the `plot_curves` bundle for a target class and classifier.

        The bundle holds four callables (`merge`, `folds`, `avg_vertical`,
        `avg_threshold`), each building the plot items for one averaging
        mode.  Bundles are cached under the `(target, clf_idx)` key.
        """
        key = (target, clf_idx)
        # Looked up on every call, so the ROC data for this pair is always
        # computed and cached, even when the bundle itself is already cached.
        data = self.curve_data(target, clf_idx)

        if key in self._plot_curves:
            return self._plot_curves[key]

        # A thin cosmetic pen in the classifier's color plus a wider,
        # lighter "shadow" pen derived from it.
        pen = QPen(self.colors[clf_idx], 1)
        pen.setCosmetic(True)
        shadow_pen = QPen(pen.color().lighter(160), 2.5)
        shadow_pen.setCosmetic(True)

        name = self.classifier_names[clf_idx]

        # NOTE(review): `once` is defined elsewhere; presumably it evaluates
        # the wrapped function on first call and reuses the result -- confirm.
        @once
        def merged():
            return plot_curve(
                data.merged, pen=pen, shadow_pen=shadow_pen, name=name)

        @once
        def folds():
            items = []
            for fold in data.folds:
                items.append(
                    plot_curve(fold, pen=pen, shadow_pen=shadow_pen))
            return items

        @once
        def avg_vert():
            return plot_avg_curve(
                data.avg_vertical, pen=pen, shadow_pen=shadow_pen, name=name)

        @once
        def avg_thres():
            return plot_avg_curve(
                data.avg_threshold, pen=pen, shadow_pen=shadow_pen, name=name)

        bundle = plot_curves(merge=merged,
                             folds=folds,
                             avg_vertical=avg_vert,
                             avg_threshold=avg_thres)
        self._plot_curves[key] = bundle
        return bundle

    def _setup_plot(self):
        """Add all items for the current settings to the plot.

        Draws the curves of the selected classifiers for the current target
        class according to the averaging mode, then the optional overall
        convex hull and the dashed diagonal.  In `Merge` mode it also marks
        the point closest to threshold 0.5 (when enabled) and creates the
        iso-performance line.  The plot is not cleared here.
        """
        target = self.target_index
        classifier_ids = self.selected_classifiers

        generators = [self.plot_curves(target, i) for i in classifier_ids]
        roc_data = [self.curve_data(target, i) for i in classifier_ids]
        averaging = self.roc_averaging

        if averaging == OWROCAnalysis.Merge:
            for entry in generators:
                graphics = entry.merge()
                self.plot.addItem(graphics.curve_item)

                if self.display_convex_curve:
                    self.plot.addItem(graphics.hull_item)

                if self.display_def_threshold:
                    # Label the curve point whose threshold is nearest 0.5.
                    points = graphics.curve.points
                    thresholds = points.thresholds
                    ind = numpy.argmin(numpy.abs(thresholds - 0.5))
                    label = pg.TextItem(
                        text="{:.3f}".format(thresholds[ind]))
                    label.setPos(points.fpr[ind], points.tpr[ind])
                    self.plot.addItem(label)

            hull_curves = [data.merged.hull for data in roc_data]
            if hull_curves:
                # The iso-performance line is positioned on this hull by
                # `_update_perf_line`.
                self._rocch = convex_hull(hull_curves)
                iso_pen = QPen(QColor(Qt.black), 1)
                iso_pen.setCosmetic(True)
                self._perf_line = InfiniteLine(pen=iso_pen, antialias=True)
                self.plot.addItem(self._perf_line)

        elif averaging == OWROCAnalysis.Vertical:
            for entry in generators:
                graphics = entry.avg_vertical()
                self.plot.addItem(graphics.curve_item)
                self.plot.addItem(graphics.confint_item)

            hull_curves = [data.avg_vertical.hull for data in roc_data]

        elif averaging == OWROCAnalysis.Threshold:
            for entry in generators:
                graphics = entry.avg_threshold()
                self.plot.addItem(graphics.curve_item)
                self.plot.addItem(graphics.confint_item)

            hull_curves = [data.avg_threshold.hull for data in roc_data]

        elif averaging == OWROCAnalysis.NoAveraging:
            for entry in generators:
                for fold_graphics in entry.folds():
                    self.plot.addItem(fold_graphics.curve_item)
                    if self.display_convex_curve:
                        self.plot.addItem(fold_graphics.hull_item)

            hull_curves = []
            for data in roc_data:
                for fold in data.folds:
                    hull_curves.append(fold.hull)

        if self.display_convex_hull and hull_curves:
            # Filled convex hull over all shown curves, kept behind them.
            hull = convex_hull(hull_curves)
            hull_pen = QPen(QColor(200, 200, 200, 100), 2)
            hull_pen.setCosmetic(True)
            hull_item = self.plot.plot(
                hull.fpr,
                hull.tpr,
                pen=hull_pen,
                brush=QBrush(QColor(200, 200, 200, 50)),
                fillLevel=0)
            hull_item.setZValue(-10000)

        # Dashed diagonal from (0, 0) to (1, 1).
        diagonal_pen = QPen(QColor(100, 100, 100, 100), 1, Qt.DashLine)
        diagonal_pen.setCosmetic(True)
        self.plot.plot([0, 1], [0, 1], pen=diagonal_pen, antialias=True)

        if averaging == OWROCAnalysis.Merge:
            self._update_perf_line()

    def _on_target_changed(self):
        """Redraw the plot after the target class selection has changed.

        The plot is only rebuilt when results are present, which matches the
        other replotting callbacks of this widget.
        """
        self.plot.clear()
        if self.results is not None:
            self._setup_plot()

    def _on_classifiers_changed(self):
        """Redraw the plot after the classifier selection has changed."""
        self.plot.clear()
        if self.results is None:
            return
        self._setup_plot()

    def _on_display_perf_line_changed(self):
        """Update the iso-performance line after one of its settings changed.

        Called when the "Show performance line" check box, either cost or
        the target prior changes.  The line exists only in `Merge` averaging
        mode; `_update_perf_line` sets both its visibility and its position.
        """
        # The former extra branch tested `self.perf_line`, which is never set
        # to anything but None, so it could not run.  The actual line item is
        # `self._perf_line` and is fully handled by `_update_perf_line`.
        if self.roc_averaging == OWROCAnalysis.Merge:
            self._update_perf_line()

    def _on_display_def_threshold_changed(self):
        """Redraw the plot after the default threshold option was toggled."""
        # The threshold label is created while the plot is built, so toggling
        # it requires a full replot.
        self._replot()

    def _replot(self):
        """Clear the plot and, if results are present, draw it again."""
        self.plot.clear()
        if self.results is None:
            return
        self._setup_plot()

    def _update_perf_line(self):
        """Show or hide the iso-performance line and position it if shown.

        The slope is computed from the FP/FN costs and the target prior
        (given in percent); the line is anchored at the point of the stored
        convex hull returned by `roc_iso_performance_line`.  Does nothing if
        the line item does not exist.
        """
        line = self._perf_line
        if line is None:
            return

        line.setVisible(self.display_perf_line)
        if not self.display_perf_line:
            return

        slope = roc_iso_performance_slope(self.fp_cost, self.fn_cost,
                                          self.target_prior / 100.0)
        hull = self._rocch
        ind = roc_iso_performance_line(slope, hull)
        angle = numpy.arctan2(slope, 1)  # in radians
        line.setAngle(angle * 180 / numpy.pi)
        first = ind[0]
        line.setPos((hull.fpr[first], hull.tpr[first]))

    def onDeleteWidget(self):
        """Release the widget's state when it is removed from the canvas."""
        self.clear()
        # Chain to the base class so that its own cleanup runs as well.
        super().onDeleteWidget()

    def send_report(self):
        """Write the widget's current state into the report.

        Reports the target class and, when the performance line is shown,
        the costs and the target probability; then the plot and a legend of
        the selected classifiers.  Does nothing when there are no results.
        """
        if self.results is None:
            return

        entries = [("Target class", self.target_cb.currentText())]
        if self.display_perf_line:
            costs = "FP = {}, FN = {}".format(self.fp_cost, self.fn_cost)
            entries.append(("Costs", costs))
            entries.append(
                ("Target probability", "{} %".format(self.target_prior)))
        items = OrderedDict(entries)

        legend = report.list_legend(self.classifiers_list_box,
                                    self.selected_classifiers)
        self.report_items(items)
        self.report_plot()
        self.report_caption(legend)
class OWSilhouettePlot(widget.OWWidget):
    """Widget showing per-instance silhouette scores for a cluster labeling.

    Distances are computed from the input data with the selected metric;
    the cluster labels come from a discrete variable of the data.  The
    instances selected in the plot are sent to the output.
    """
    # Widget metadata.
    name = "Silhouette Plot"
    description = "Visually assess cluster quality and " \
                  "the degree of cluster membership."

    icon = "icons/SilhouettePlot.svg"
    priority = 300
    keywords = []

    class Inputs:
        data = Input("Data", Orange.data.Table)

    class Outputs:
        selected_data = Output("Selected Data",
                               Orange.data.Table,
                               default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Orange.data.Table)

    # Qualified names of widget classes this one replaces.
    replaces = [
        "orangecontrib.prototypes.widgets.owsilhouetteplot.OWSilhouettePlot",
        "Orange.widgets.unsupervised.owsilhouetteplot.OWSilhouettePlot"
    ]

    settingsHandler = settings.PerfectDomainContextHandler()

    #: Distance metric index
    distance_idx = settings.Setting(0)
    #: Group/cluster variable index
    cluster_var_idx = settings.ContextSetting(0)
    #: Annotation variable index
    annotation_var_idx = settings.ContextSetting(0)
    #: Group the (displayed) silhouettes by cluster
    group_by_cluster = settings.Setting(True)
    #: A fixed size for an instance bar
    bar_size = settings.Setting(3)
    #: Add silhouette scores to output data
    add_scores = settings.Setting(False)
    #: Commit automatically on change
    auto_commit = settings.Setting(True)

    #: Available distance metrics as (display name, metric) pairs;
    #: indexed by `distance_idx`
    Distances = [("Euclidean", Orange.distance.Euclidean),
                 ("Manhattan", Orange.distance.Manhattan),
                 ("Cosine", Orange.distance.Cosine)]

    # Name of the attribute holding the graphics scene (`self.scene`).
    graph_name = "scene"
    buttons_area_orientation = Qt.Vertical

    class Error(widget.OWWidget.Error):
        need_two_clusters = Msg("Need at least two non-empty clusters")
        singleton_clusters_all = Msg("All clusters are singletons")
        memory_error = Msg("Not enough memory")
        value_error = Msg("Distances could not be computed: '{}'")

    class Warning(widget.OWWidget.Warning):
        missing_cluster_assignment = Msg(
            "{} instance{s} omitted (missing cluster assignment)")
        nan_distances = Msg("{} instance{s} omitted (undefined distances)")
        ignoring_categorical = Msg("Ignoring categorical features")

    def __init__(self):
        """Initialize the widget state and build the controls and the view."""
        super().__init__()
        #: The input data
        self.data = None  # type: Optional[Orange.data.Table]
        #: Distance matrix computed from data
        self._matrix = None  # type: Optional[Orange.misc.DistMatrix]
        #: A bool mask (size == len(data)) indicating missing group/cluster
        #: assignments
        self._mask = None  # type: Optional[np.ndarray]
        #: An array of cluster/group labels for instances with valid group
        #: assignment
        self._labels = None  # type: Optional[np.ndarray]
        #: An array of silhouette scores for instances with valid group
        #: assignment
        self._silhouette = None  # type: Optional[np.ndarray]
        #: The plot item currently shown in the scene (None if there is none)
        self._silplot = None  # type: Optional[SilhouettePlot]

        # Distance metric selection; a change recomputes the distances.
        gui.comboBox(self.controlArea,
                     self,
                     "distance_idx",
                     box="Distance",
                     items=[name for name, _ in OWSilhouettePlot.Distances],
                     orientation=Qt.Horizontal,
                     callback=self._invalidate_distances)

        # Cluster label variable; a change recomputes the scores only.
        box = gui.vBox(self.controlArea, "Cluster Label")
        self.cluster_var_cb = gui.comboBox(box,
                                           self,
                                           "cluster_var_idx",
                                           contentsLength=14,
                                           addSpace=4,
                                           callback=self._invalidate_scores)
        gui.checkBox(box,
                     self,
                     "group_by_cluster",
                     "Group by cluster",
                     callback=self._replot)
        self.cluster_var_model = itemmodels.VariableListModel(parent=self)
        self.cluster_var_cb.setModel(self.cluster_var_model)

        # Bar width and row annotations.
        box = gui.vBox(self.controlArea, "Bars")
        gui.widgetLabel(box, "Bar width:")
        gui.hSlider(box,
                    self,
                    "bar_size",
                    minValue=1,
                    maxValue=10,
                    step=1,
                    callback=self._update_bar_size,
                    addSpace=6)
        gui.widgetLabel(box, "Annotations:")
        self.annotation_cb = gui.comboBox(box,
                                          self,
                                          "annotation_var_idx",
                                          contentsLength=14,
                                          callback=self._update_annotations)
        self.annotation_var_model = itemmodels.VariableListModel(parent=self)
        # Index 0 is the "None" placeholder, i.e. no annotation.
        self.annotation_var_model[:] = ["None"]
        self.annotation_cb.setModel(self.annotation_var_model)
        ibox = gui.indentedBox(box, 5)
        # Hint shown when an annotation is selected but the bars are too
        # narrow for row names to be displayed.
        self.ann_hidden_warning = warning = gui.widgetLabel(
            ibox, "(increase the width to show)")
        # Fix the width while the label is still visible.
        # NOTE(review): presumably so that toggling the hint does not change
        # the layout width -- confirm.
        ibox.setFixedWidth(ibox.sizeHint().width())
        warning.setVisible(False)

        gui.rubber(self.controlArea)

        gui.separator(self.buttonsArea)
        box = gui.vBox(self.buttonsArea, "Output")
        # Thunk the call to commit to call conditional commit
        gui.checkBox(box,
                     self,
                     "add_scores",
                     "Add silhouette scores",
                     callback=lambda: self.commit())
        gui.auto_commit(box,
                        self,
                        "auto_commit",
                        "Commit",
                        auto_label="Auto commit",
                        box=False)
        # Ensure that the controlArea is not narrower than buttonsArea
        self.controlArea.layout().addWidget(self.buttonsArea)

        # Main area: a graphics scene/view pair holding the silhouette plot.
        self.scene = QGraphicsScene()
        self.view = QGraphicsView(self.scene)
        self.view.setRenderHint(QPainter.Antialiasing, True)
        self.view.setAlignment(Qt.AlignTop | Qt.AlignLeft)
        self.mainArea.layout().addWidget(self.view)

    def sizeHint(self):
        """Return the control area's size hint, expanded to at least 600x720."""
        minimum = QSize(600, 720)
        return self.controlArea.sizeHint().expandedTo(minimum)

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """
        Handle a new input dataset (or None).

        The previous context is closed and the widget cleared.  Discrete
        variables (attributes, class variables and metas) with at least two
        values become the candidate cluster labels; if there are none, the
        data is rejected with an error.  Otherwise the class variable is
        preselected when it is a candidate, string metas are offered as
        annotations, and the context for the candidate variables is opened.
        """
        self.closeContext()
        self.clear()
        error_msg = ""
        warning_msg = ""
        candidatevars = []

        if data is not None:
            for var in data.domain.variables + data.domain.metas:
                if var.is_discrete and len(var.values) >= 2:
                    candidatevars.append(var)
            if not candidatevars:
                error_msg = "Input does not have any suitable labels."
                data = None

        self.data = data
        if data is not None:
            self.cluster_var_model[:] = candidatevars
            class_var = data.domain.class_var
            self.cluster_var_idx = (candidatevars.index(class_var)
                                    if class_var in candidatevars else 0)

            annotvars = []
            for var in data.domain.metas:
                if var.is_string:
                    annotvars.append(var)
            self.annotation_var_model[:] = ["None"] + annotvars
            # Preselect the first string meta, if there is any.
            self.annotation_var_idx = 1 if annotvars else 0
            self.openContext(Orange.data.Domain(candidatevars))

        self.error(error_msg)
        self.warning(warning_msg)

    def handleNewSignals(self):
        """Recompute and redraw for the new input, then commit the output."""
        has_data = self.data is not None
        if has_data:
            self._update()
            self._replot()

        # The output is sent even without data.
        self.unconditional_commit()

    def clear(self):
        """
        Reset the widget to its empty state.

        Drops the data and everything computed from it, empties the variable
        models and the scene, and clears all error and warning messages.
        """
        self.data = None
        self._matrix = self._mask = None
        self._silhouette = self._labels = None

        self.cluster_var_model[:] = []
        self.annotation_var_model[:] = ["None"]
        self._clear_scene()

        for messages in (self.Error, self.Warning):
            messages.clear()

    def _clear_scene(self):
        """Remove everything from the graphics scene and forget the plot."""
        scene = self.scene
        scene.clear()
        # Reset the scene rectangle to a null rect.
        scene.setSceneRect(QRectF())
        self._silplot = None

    def _invalidate_distances(self):
        """Discard the distance matrix and recompute everything from it."""
        self._matrix = None
        # Scores depend on the distances, so they are recomputed as well.
        self._invalidate_scores()

    def _invalidate_scores(self):
        """Discard the labels, scores and mask; recompute, redraw, commit."""
        self._mask = None
        self._silhouette = None
        self._labels = None

        self._update()
        self._replot()
        if self.data is None:
            return
        self.commit()

    def _update(self):
        """Recompute the distance matrix (if missing) and the scores.

        Messages are cleared first.  Without data, or with empty data, all
        computed state is reset.  When the metric does not support discrete
        features and the data has any, they are removed and a warning is
        shown.  If the distance computation fails with a MemoryError or a
        ValueError, the matching error is shown and the labels and scores
        are left untouched.
        """
        self._clear_messages()

        if self.data is None or not len(self.data):
            self._reset_all()
            return

        if self._matrix is None:
            metric = self.Distances[self.distance_idx][1]
            data = self.data
            if not metric.supports_discrete and any(
                    attr.is_discrete for attr in data.domain.attributes):
                self.Warning.ignoring_categorical()
                data = Orange.distance.remove_discrete_features(data)

            try:
                matrix = np.asarray(metric(data))
            except MemoryError:
                self.Error.memory_error()
                return
            except ValueError as err:
                self.Error.value_error(str(err))
                return
            self._matrix = matrix

        self._update_labels()

    def _reset_all(self):
        """Drop all computed state (mask, scores, labels, distances, scene)."""
        self._mask = self._silhouette = None
        self._labels = self._matrix = None
        self._clear_scene()

    def _clear_messages(self):
        """Clear all error and warning messages of the widget."""
        for messages in (self.Error, self.Warning):
            messages.clear()

    def _update_labels(self):
        """Compute the cluster labels, the row mask and the silhouette scores.

        Rows are masked out when their cluster label is missing (NaN) or when
        all of their distances are NaN.  With fewer than two distinct labels
        left, or when every remaining row is its own cluster, an error is
        shown and `_mask`, `_labels` and `_silhouette` are set to None.
        Otherwise the scores are computed on the unmasked part of the
        distance matrix, and warnings report how many rows were omitted.
        """
        labelvar = self.cluster_var_model[self.cluster_var_idx]
        labels, _ = self.data.get_column_view(labelvar)
        labels = np.asarray(labels, dtype=float)
        cluster_mask = np.isnan(labels)
        dist_mask = np.isnan(self._matrix).all(axis=0)
        mask = cluster_mask | dist_mask
        # Drop the masked rows *before* the integer cast: casting NaN to int
        # is undefined and triggers a RuntimeWarning in recent NumPy.
        labels = labels[~mask].astype(int)

        labels_unq = np.unique(labels)

        if len(labels_unq) < 2:
            self.Error.need_two_clusters()
            labels = silhouette = mask = None
        elif len(labels_unq) == len(labels):
            self.Error.singleton_clusters_all()
            labels = silhouette = mask = None
        else:
            silhouette = sklearn.metrics.silhouette_samples(
                self._matrix[~mask, :][:, ~mask], labels, metric="precomputed")
        self._mask = mask
        self._labels = labels
        self._silhouette = silhouette

        if mask is not None:
            count_missing = np.count_nonzero(cluster_mask)
            if count_missing:
                self.Warning.missing_cluster_assignment(
                    count_missing, s="s" if count_missing > 1 else "")
            count_nandist = np.count_nonzero(dist_mask)
            if count_nandist:
                self.Warning.nan_distances(count_nandist,
                                           s="s" if count_nandist > 1 else "")

    def _set_bar_height(self):
        """Apply `bar_size` to the plot and toggle the row names accordingly.

        Row names are shown only for bar sizes of 5 or more; otherwise the
        "increase the width" hint is shown if an annotation is selected.
        """
        wide_enough = self.bar_size >= 5
        silplot = self._silplot
        silplot.setBarHeight(self.bar_size)
        silplot.setRowNamesVisible(wide_enough)
        show_hint = not wide_enough and self.annotation_var_idx > 0
        self.ann_hidden_warning.setVisible(show_hint)

    def _replot(self):
        """Rebuild the silhouette plot in the scene from the current scores.

        The scene is always cleared; a new plot is created only when both
        the scores and the labels are available.
        """
        self._clear_scene()
        if self._silhouette is None or self._labels is None:
            return

        var = self.cluster_var_model[self.cluster_var_idx]
        silplot = SilhouettePlot()
        self._silplot = silplot
        self._set_bar_height()

        if self.group_by_cluster:
            groups, names, colors = self._labels, var.values, var.colors
        else:
            # A single unnamed group with one fixed color.
            groups = np.zeros(len(self._silhouette), dtype=int)
            names, colors = [""], np.array([[63, 207, 207]])
        silplot.setScores(self._silhouette, groups, names, colors)

        self.scene.addItem(silplot)
        self._update_annotations()
        silplot.selectionChanged.connect(self.commit)
        silplot.layout().activate()
        self._update_scene_rect()
        silplot.geometryChanged.connect(self._update_scene_rect)

    def _update_bar_size(self):
        """Apply a changed bar size to the plot, if there is one."""
        if self._silplot is None:
            return
        self._set_bar_height()

    def _update_annotations(self):
        """Update the row names of the plot from the selected annotation.

        Index 0 (or an out-of-range index) means no annotation.  The hint
        label is shown when an annotation is selected but the bars are
        narrower than 5.  Row names are taken from the annotation column,
        restricted to the unmasked rows.
        """
        annot_var = None
        if 0 < self.annotation_var_idx < len(self.annotation_var_model):
            annot_var = self.annotation_var_model[self.annotation_var_idx]
        has_annotation = annot_var is not None

        self.ann_hidden_warning.setVisible(self.bar_size < 5
                                           and has_annotation)

        silplot = self._silplot
        if silplot is None:
            return
        if not has_annotation:
            silplot.setRowNames(None)
            return

        column, _ = self.data.get_column_view(annot_var)
        if self._mask is not None:
            assert column.shape == self._mask.shape
            column = column[~self._mask]
        names = [annot_var.str_val(value) for value in column]
        silplot.setRowNames(names)

    def _update_scene_rect(self):
        """Set the scene rectangle to the plot's current geometry."""
        geometry = self._silplot.geometry()
        self.scene.setSceneRect(geometry)

    def commit(self):
        """
        Commit/send the current selection to the output.

        Sends the selected rows on `selected_data` (None if nothing is
        selected) and the annotated table on `annotated_data`.  When
        `add_scores` is set, a "Silhouette (...)" meta column is appended to
        both outputs; rows without a score get NaN.
        """
        selected = indices = data = None
        if self.data is not None:
            selectedmask = np.full(len(self.data), False, dtype=bool)
            if self._silplot is not None:
                indices = self._silplot.selection()
                assert (np.diff(indices) > 0).all(), "strictly increasing"
                if self._mask is not None:
                    # Map plot indices back to row indices of the input data.
                    indices = np.flatnonzero(~self._mask)[indices]
                selectedmask[indices] = True

            if self._silhouette is None:
                # No scores could be computed (e.g. fewer than two clusters):
                # use an all-NaN column instead of writing None to the table.
                scores = np.full(shape=selectedmask.shape, fill_value=np.nan)
            elif self._mask is not None:
                scores = np.full(shape=selectedmask.shape, fill_value=np.nan)
                scores[~self._mask] = self._silhouette
            else:
                scores = self._silhouette

            silhouette_var = None
            if self.add_scores:
                var = self.cluster_var_model[self.cluster_var_idx]
                silhouette_var = Orange.data.ContinuousVariable(
                    "Silhouette ({})".format(escape(var.name)))
                domain = Orange.data.Domain(
                    self.data.domain.attributes, self.data.domain.class_vars,
                    self.data.domain.metas + (silhouette_var, ))
                data = self.data.transform(domain)
            else:
                domain = self.data.domain
                data = self.data

            if np.count_nonzero(selectedmask):
                selected = self.data.from_table(domain, self.data,
                                                np.flatnonzero(selectedmask))

            if self.add_scores:
                if selected is not None:
                    selected[:, silhouette_var] = np.c_[scores[selectedmask]]
                data[:, silhouette_var] = np.c_[scores]

        self.Outputs.selected_data.send(selected)
        self.Outputs.annotated_data.send(create_annotated_table(data, indices))

    def send_report(self):
        """Write the plot and a descriptive caption into the report.

        Nothing is reported when there are no cluster label candidates.  The
        caption names the distance and the cluster variable; the annotation
        is mentioned only when one is selected and a plot showing its row
        names exists.
        """
        if not len(self.cluster_var_model):
            return

        self.report_plot()
        caption = "Silhouette plot ({} distance), clustered by '{}'".format(
            self.Distances[self.distance_idx][0],
            self.cluster_var_model[self.cluster_var_idx])
        # `_silplot` is None when no scores could be computed, so it must be
        # checked before asking it whether row names are visible.
        if (self.annotation_var_idx
                and self._silplot is not None
                and self._silplot.rowNamesVisible()):
            caption += ", annotated with '{}'".format(
                self.annotation_var_model[self.annotation_var_idx])
        self.report_caption(caption)

    def onDeleteWidget(self):
        """Release the widget's state, then run the base class cleanup."""
        # Own state first, base class afterwards.
        self.clear()
        super().onDeleteWidget()
Esempio n. 9
0
class OWDiscretize(widget.OWWidget):
    """Widget for discretizing the numeric features of a data table."""
    name = "Discretize"
    description = "Discretize the numeric data features."
    icon = "icons/Discretize.svg"

    class Inputs:
        data = Input("Data", Orange.data.Table, doc="Input data table")

    class Outputs:
        data = Output("Data", Orange.data.Table, doc="Table with discretized features")

    settingsHandler = settings.DomainContextHandler()
    # Per-variable discretization states, stored as a context setting.
    saved_var_states = settings.ContextSetting({})

    # Index of the selected default method; the code adds one to it before
    # comparing it with the method constants below.
    default_method = settings.Setting(2)
    # Number of intervals used by the default equal-frequency/width methods.
    default_k = settings.Setting(3)
    # Whether changes are sent to the output automatically.
    autosend = settings.Setting(True)

    #: Discretization methods
    Default, Leave, MDL, EqualFreq, EqualWidth, Remove, Custom = range(7)

    want_main_area = False
    resizing_enabled = False

    def  __init__(self):
        """Build the widget GUI and initialise the empty widget state.

        The control area gets three parts: the radio group with the default
        discretization method, the list of variables with the per-variable
        method controls, and the apply/auto-apply box.
        """
        super().__init__()

        #: input data
        self.data = None
        #: Current variable discretization state
        self.var_state = {}
        #: Saved variable discretization settings (context setting)
        self.saved_var_states = {}

        # Method index and interval count edited by the per-variable controls.
        self.method = 0
        self.k = 5

        box = gui.vBox(self.controlArea, self.tr("Default Discretization"))
        self.default_bbox = rbox = gui.radioButtons(
            box, self, "default_method", callback=self._default_disc_changed)
        # The default-method buttons are laid out in two columns.
        rb = gui.hBox(rbox)
        self.left = gui.vBox(rb)
        right = gui.vBox(rb)
        rb.layout().setStretch(0, 1)
        rb.layout().setStretch(1, 1)
        options = self.options = [
            self.tr("Default"),
            self.tr("Leave numeric"),
            self.tr("Entropy-MDL discretization"),
            self.tr("Equal-frequency discretization"),
            self.tr("Equal-width discretization"),
            self.tr("Remove numeric variables")
        ]

        # "Default" itself is not offered in the default-method group, hence
        # the slice starting at 1.
        for opt in options[1:]:
            t = gui.appendRadioButton(rbox, opt)
            # This condition is ugly, but it keeps the same order of
            # options for backward compatibility of saved schemata
            [right, self.left][opt.startswith("Equal")].layout().addWidget(t)
        gui.separator(right, 18, 18)

        def _intbox(widget, attr, callback):
            # Build an indented spin box (2..10) bound to `attr` and return
            # the box that contains it.
            box = gui.indentedBox(widget)
            s = gui.spin(
                box, self, attr, minv=2, maxv=10, label="Num. of intervals:",
                callback=callback)
            s.setMaximumWidth(60)
            s.setAlignment(Qt.AlignRight)
            gui.rubber(s.box)
            return box.box

        self.k_general = _intbox(self.left, "default_k",
                                 self._default_disc_changed)
        self.k_general.layout().setContentsMargins(0, 0, 0, 0)
        # NOTE: despite its name, `vlayout` is a horizontal layout.
        vlayout = QHBoxLayout()
        box = gui.widgetBox(
            self.controlArea, "Individual Attribute Settings",
            orientation=vlayout, spacing=8
        )

        # List view with all attributes
        self.varview = QListView(selectionMode=QListView.ExtendedSelection)
        self.varview.setItemDelegate(DiscDelegate())
        self.varmodel = itemmodels.VariableListModel()
        self.varview.setModel(self.varmodel)
        self.varview.selectionModel().selectionChanged.connect(
            self._var_selection_changed
        )

        vlayout.addWidget(self.varview)
        # Controls for individual attr settings
        self.bbox = controlbox = gui.radioButtons(
            box, self, "method", callback=self._disc_method_changed
        )
        vlayout.addWidget(controlbox)

        # The first five options are shared with the default group; the
        # "remove" button gets its own per-variable wording below.
        for opt in options[:5]:
            gui.appendRadioButton(controlbox, opt)

        self.k_specific = _intbox(controlbox, "k", self._disc_method_changed)

        gui.appendRadioButton(controlbox, "Remove attribute")

        gui.rubber(controlbox)
        # Disabled until a variable is selected (see _var_selection_changed).
        controlbox.setEnabled(False)

        self.controlbox = controlbox

        box = gui.auto_commit(
            self.controlArea, self, "autosend", "Apply",
            orientation=Qt.Horizontal,
            checkbox_label="Apply automatically")
        box.layout().insertSpacing(0, 20)
        box.layout().insertWidget(0, self.report_button)
        self._update_spin_positions()


    @Inputs.data
    def set_data(self, data):
        """Accept a new input table, or ``None`` when the input is removed.

        The current context is closed first. For a table the variable states
        are initialised, the context is opened, the saved per-variable
        settings are restored and the cut points are induced. Without data
        the widget is cleared. The output is always committed at the end.
        """
        self.closeContext()
        self.data = data
        if self.data is None:
            self._clear()
        else:
            self._initialize(data)
            self.openContext(data)
            # Saved per-variable settings take precedence over the defaults
            # set up by _initialize.
            self._restore(self.saved_var_states)
            # Induce cut points for every state that does not have them yet.
            self._update_points()
        self.unconditional_commit()

    def _initialize(self, data):
        # Initialize the default variable states for new data.
        self.class_var = data.domain.class_var
        cvars = [var for var in data.domain if var.is_continuous]
        self.varmodel[:] = cvars

        class_var = data.domain.class_var
        has_disc_class = data.domain.has_discrete_class

        self.default_bbox.buttons[self.MDL - 1].setEnabled(has_disc_class)
        self.bbox.buttons[self.MDL].setEnabled(has_disc_class)

        # If the newly disabled MDL button is checked then change it
        if not has_disc_class and self.default_method == self.MDL - 1:
            self.default_method = 0
        if not has_disc_class and self.method == self.MDL:
            self.method = 0

        # Reset (initialize) the variable discretization states.
        self._reset()

    def _restore(self, saved_state):
        """Apply the states stored in `saved_state` to the listed variables.

        `saved_state` maps ``variable_key(var)`` to a state. Variables without
        an entry keep their current state. A saved ``Default`` method is
        re-bound to the currently selected default method.
        """
        def_method = self._current_default_method()
        for row, var in enumerate(self.varmodel):
            key = variable_key(var)
            if key not in saved_state:
                continue
            state = saved_state[key]
            if isinstance(state.method, Default):
                state = DState(Default(def_method), None, None)
            self._set_var_state(row, state)

    def _reset(self):
        """Put every listed variable back on the current default method."""
        def_method = self._current_default_method()
        self.var_state = {}
        for row, _ in enumerate(self.varmodel):
            self._set_var_state(row, DState(Default(def_method), None, None))

    def _set_var_state(self, index, state):
        """Store `state` for the variable at row `index`.

        The state is kept in ``var_state`` and also written to the list
        model under ``Qt.UserRole``.
        """
        self.var_state[index] = state
        model_index = self.varmodel.index(index)
        self.varmodel.setData(model_index, state, Qt.UserRole)

    def _clear(self):
        self.data = None
        self.varmodel[:] = []
        self.var_state = {}
        self.saved_var_states = {}
        self.default_bbox.buttons[self.MDL - 1].setEnabled(True)
        self.bbox.buttons[self.MDL].setEnabled(True)

    def _update_points(self):
        """
        Update the induced cut points.
        """
        if self.data is None or not len(self.data):
            return

        def induce_cuts(method, data, var):
            dvar = _dispatch[type(method)](method, data, var)
            if dvar is None:
                # removed
                return [], None
            elif dvar is var:
                # no transformation took place
                return None, var
            elif is_discretized(dvar):
                return dvar.compute_value.points, dvar
            else:
                assert False
        for i, var in enumerate(self.varmodel):
            state = self.var_state[i]
            if state.points is None and state.disc_var is None:
                points, dvar = induce_cuts(state.method, self.data, var)
                new_state = state._replace(points=points, disc_var=dvar)
                self._set_var_state(i, new_state)

    def _method_index(self, method):
        """Return the position of `method`'s type in ``METHODS``."""
        key = (type(method), )
        return METHODS.index(key)

    def _current_default_method(self):
        method = self.default_method + 1
        k = self.default_k
        if method == OWDiscretize.Leave:
            def_method = Leave()
        elif method == OWDiscretize.MDL:
            def_method = MDL()
        elif method == OWDiscretize.EqualFreq:
            def_method = EqualFreq(k)
        elif method == OWDiscretize.EqualWidth:
            def_method = EqualWidth(k)
        elif method == OWDiscretize.Remove:
            def_method = Remove()
        else:
            assert False
        return def_method

    def _current_method(self):
        if self.method == OWDiscretize.Default:
            method = Default(self._current_default_method())
        elif self.method == OWDiscretize.Leave:
            method = Leave()
        elif self.method == OWDiscretize.MDL:
            method = MDL()
        elif self.method == OWDiscretize.EqualFreq:
            method = EqualFreq(self.k)
        elif self.method == OWDiscretize.EqualWidth:
            method = EqualWidth(self.k)
        elif self.method == OWDiscretize.Remove:
            method = Remove()
        elif self.method == OWDiscretize.Custom:
            method = Custom(self.cutpoints)
        else:
            assert False
        return method

    def _update_spin_positions(self):
        self.k_general.setDisabled(self.default_method not in [2, 3])
        if self.default_method == 2:
            self.left.layout().insertWidget(1, self.k_general)
        elif self.default_method == 3:
            self.left.layout().insertWidget(2, self.k_general)

        self.k_specific.setDisabled(self.method not in [3, 4])
        if self.method == 3:
            self.bbox.layout().insertWidget(4, self.k_specific)
        elif self.method == 4:
            self.bbox.layout().insertWidget(5, self.k_specific)

    def _default_disc_changed(self):
        """React to a change of the default method or its interval count.

        Every variable that currently uses the ``Default`` method gets a
        fresh state bound to the new default; cut points are then induced
        again and the output is committed.
        """
        self._update_spin_positions()
        state = DState(Default(self._current_default_method()), None, None)
        for row, _ in enumerate(self.varmodel):
            if not isinstance(self.var_state[row].method, Default):
                continue
            self._set_var_state(row, state)
        self._update_points()
        self.commit()

    def _disc_method_changed(self):
        """Apply the chosen per-variable method to all selected variables.

        The selected variables get a fresh state without cut points; the
        points are then induced again and the output is committed.
        """
        self._update_spin_positions()
        selected_rows = self.selected_indices()
        new_state = DState(self._current_method(), None, None)
        for row in selected_rows:
            self._set_var_state(row, new_state)
        self._update_points()
        self.commit()

    def _var_selection_changed(self, *args):
        """Sync the per-variable controls with the variable list selection.

        The control box is enabled when anything is selected. If all selected
        variables share one method, that method (and its interval count or
        cut points) is shown; otherwise the radio group is deselected.
        """
        # set of all methods for the current selection
        distinct = {self.var_state[row].method
                    for row in self.selected_indices()}
        self.controlbox.setEnabled(bool(distinct))
        if len(distinct) != 1:
            # deselect the current button
            self.method = -1
            button_group_reset(self.controlbox.group)
        else:
            (method,) = distinct
            self.method = self._method_index(method)
            if isinstance(method, (EqualFreq, EqualWidth)):
                self.k = method.k
            elif isinstance(method, Custom):
                self.cutpoints = method.points
        self._update_spin_positions()

    def selected_indices(self):
        rows = self.varview.selectionModel().selectedRows()
        return [index.row() for index in rows]

    def discretized_var(self, source):
        index = list(self.varmodel).index(source)
        state = self.var_state[index]
        if state.disc_var is None:
            return None
        elif state.disc_var is source:
            return source
        elif state.points == []:
            return None
        else:
            return state.disc_var

    def discretized_domain(self):
        """
        Return the current effective discretized domain.
        """
        if self.data is None:
            return None

        def disc_var(source):
            if source and source.is_continuous:
                return self.discretized_var(source)
            else:
                return source

        attributes = [disc_var(v) for v in self.data.domain.attributes]
        attributes = [v for v in attributes if v is not None]

        class_var = disc_var(self.data.domain.class_var)

        domain = Orange.data.Domain(
            attributes, class_var,
            metas=self.data.domain.metas
        )
        return domain

    def commit(self):
        output = None
        if self.data is not None and len(self.data):
            domain = self.discretized_domain()
            output = self.data.transform(domain)
        self.Outputs.data.send(output)

    def storeSpecificSettings(self):
        """Save the per-variable states into the context setting.

        The states are stored under ``variable_key(var)`` with their cut
        points and discretized variable stripped.
        """
        super().storeSpecificSettings()
        snapshot = {}
        for row, var in enumerate(self.varmodel):
            stripped = self.var_state[row]._replace(points=None, disc_var=None)
            snapshot[variable_key(var)] = stripped
        self.saved_var_states = snapshot

    def send_report(self):
        self.report_items((
            ("Default method", self.options[self.default_method + 1]),))
        if self.varmodel:
            self.report_items("Thresholds", [
                (var.name,
                 DiscDelegate.cutsText(self.var_state[i]) or "leave numeric")
                for i, var in enumerate(self.varmodel)])
Esempio n. 10
0
class OWKEGGPathwayBrowser(widget.OWWidget):
    """Widget for browsing KEGG pathways that include the input genes."""
    name = "KEGG Pathways"
    description = "Browse KEGG pathways that include an input set of genes."
    icon = "../widgets/icons/OWKEGGPathwayBrowser.svg"
    priority = 8

    # Each input names the method that handles it (SetData / SetRefData).
    inputs = [("Data", Orange.data.Table, "SetData", widget.Default),
              ("Reference", Orange.data.Table, "SetRefData")]
    outputs = [("Selected Data", Orange.data.Table, widget.Default),
               ("Unselected Data", Orange.data.Table)]

    settingsHandler = settings.DomainContextHandler()

    # Context settings: selected organism, gene variable, and whether the
    # variable names are used as gene names.
    organismIndex = settings.ContextSetting(0)
    geneAttrIndex = settings.ContextSetting(0)
    useAttrNames = settings.ContextSetting(False)

    autoCommit = settings.Setting(False)
    autoResize = settings.Setting(True)
    useReference = settings.Setting(False)
    showOrthology = settings.Setting(True)

    # Run states of the widget.
    Ready, Initializing, Running = 0, 1, 2

    def __init__(self, parent=None):
        """Build the GUI and start fetching the organism definitions.

        The widget is disabled and blocked until the genome task submitted at
        the end has finished (see ``__initialize_finish``).
        """
        super().__init__(parent)

        self.organismCodes = []
        self._changedFlag = False
        # Set by the input handlers; consumed in handleNewSignals.
        self.__invalidated = False
        self.__runstate = OWKEGGPathwayBrowser.Initializing
        # Re-entrancy guard for setProgress.
        self.__in_setProgress = False

        self.controlArea.setMaximumWidth(250)
        box = gui.widgetBox(self.controlArea, "Info")
        self.infoLabel = gui.widgetLabel(box, "No data on input\n")

        # Organism selection.
        box = gui.widgetBox(self.controlArea, "Organism")
        self.organismComboBox = gui.comboBox(
            box,
            self,
            "organismIndex",
            items=[],
            callback=self.Update,
            addSpace=True,
            tooltip="Select the organism of the input genes")

        # Selection of genes attribute
        box = gui.widgetBox(self.controlArea, "Gene attribute")
        self.geneAttrCandidates = itemmodels.VariableListModel(parent=self)
        self.geneAttrCombo = gui.comboBox(box,
                                          self,
                                          "geneAttrIndex",
                                          callback=self.Update)
        self.geneAttrCombo.setModel(self.geneAttrCandidates)

        gui.checkBox(box,
                     self,
                     "useAttrNames",
                     "Use variable names",
                     disables=[(-1, self.geneAttrCombo)],
                     callback=self.Update)

        # Set the initial enabled state of the combo box to match the
        # stored setting.
        self.geneAttrCombo.setDisabled(bool(self.useAttrNames))

        gui.separator(self.controlArea)

        gui.checkBox(self.controlArea,
                     self,
                     "useReference",
                     "From signal",
                     box="Reference",
                     callback=self.Update)

        gui.separator(self.controlArea)

        gui.checkBox(self.controlArea,
                     self,
                     "showOrthology",
                     "Show pathways in full orthology",
                     box="Orthology",
                     callback=self.UpdateListView)

        gui.checkBox(self.controlArea,
                     self,
                     "autoResize",
                     "Resize to fit",
                     box="Image",
                     callback=self.UpdatePathwayViewTransform)

        box = gui.widgetBox(self.controlArea, "Cache Control")

        gui.button(box,
                   self,
                   "Clear cache",
                   callback=self.ClearCache,
                   tooltip="Clear all locally cached KEGG data.",
                   default=False,
                   autoDefault=False)

        gui.separator(self.controlArea)

        gui.auto_commit(self.controlArea, self, "autoCommit", "Commit")

        gui.rubber(self.controlArea)

        # Main area: pathway view and pathway list in a vertical splitter.
        spliter = QSplitter(Qt.Vertical, self.mainArea)
        self.pathwayView = PathwayView(self, spliter)
        self.pathwayView.scene().selectionChanged.connect(
            self._onSelectionChanged)
        self.mainArea.layout().addWidget(spliter)

        self.listView = QTreeWidget(allColumnsShowFocus=True,
                                    selectionMode=QTreeWidget.SingleSelection,
                                    sortingEnabled=True,
                                    maximumHeight=200)

        spliter.addWidget(self.listView)

        self.listView.setColumnCount(4)
        self.listView.setHeaderLabels(
            ["Pathway", "P value", "Genes", "Reference"])

        self.listView.itemSelectionChanged.connect(self.UpdatePathwayView)

        select = QAction("Select All", self, shortcut=QKeySequence.SelectAll)
        select.triggered.connect(self.selectAll)
        self.addAction(select)

        self.data = None
        self.refData = None

        self._executor = concurrent.ThreadExecutor()
        # Keep the widget unusable until the organism list is available.
        self.setEnabled(False)
        self.setBlocking(True)
        # Progress callback that invokes self.setProgress through
        # concurrent.methodinvoke.
        progress = concurrent.methodinvoke(self, "setProgress", (float, ))

        def get_genome():
            """Return a KEGGGenome with the common org entries precached."""
            genome = kegg.KEGGGenome()

            essential = genome.essential_organisms()
            common = genome.common_organisms()
            # Remove duplicates of essential from common.
            # (essential + common list as defined here will be used in the
            # GUI.)
            common = [c for c in common if c not in essential]

            # TODO: Add option to specify additional organisms not
            # in the common list.

            keys = list(map(genome.org_code_to_entry_key, essential + common))

            genome.pre_cache(keys, progress_callback=progress)
            return (keys, genome)

        # get_genome is wrapped in a task and submitted to the executor;
        # __initialize_finish is connected to the task's finished signal.
        self._genomeTask = task = concurrent.Task(function=get_genome)
        task.finished.connect(self.__initialize_finish)

        self.progressBarInit()
        self.infoLabel.setText("Fetching organism definitions\n")
        self._executor.submit(task)

    def __initialize_finish(self):
        if self.__runstate != OWKEGGPathwayBrowser.Initializing:
            return

        try:
            keys, genome = self._genomeTask.result()
        except Exception as err:
            self.error(0, str(err))
            raise

        self.progressBarFinished()
        self.setEnabled(True)
        self.setBlocking(False)

        entries = [genome[key] for key in keys]
        items = [entry.definition for entry in entries]
        codes = [entry.organism_code for entry in entries]

        self.organismCodes = codes
        self.organismComboBox.clear()
        self.organismComboBox.addItems(items)
        self.organismComboBox.setCurrentIndex(self.organismIndex)

        self.infoLabel.setText("No data on input\n")

    def Clear(self):
        """
        Clear the widget state.
        """
        self.queryGenes = []
        self.referenceGenes = []
        self.genes = {}
        self.uniqueGenesDict = {}
        self.revUniqueGenesDict = {}
        self.pathways = {}
        self.org = None
        self.geneAttrCandidates[:] = []

        self.infoLabel.setText("No data on input\n")
        self.listView.clear()
        self.pathwayView.SetPathway(None)

        self.send("Selected Data", None)
        self.send("Unselected Data", None)

    def SetData(self, data=None):
        """Handle the "Data" input.

        For a table, the string variables become the gene variable
        candidates, the gene variable and organism are guessed from the data
        and its attributes, and the widget is marked for an update. Without
        data the widget is cleared.
        """
        # Data can arrive before the genome task's `finished` signal.
        if self.__runstate == OWKEGGPathwayBrowser.Initializing:
            self.__initialize_finish()

        self.data = data
        self.warning(0)
        self.error(0)
        self.information(0)

        if data is None:
            self.Clear()
        else:
            domain = data.domain
            string_vars = [
                var for var in domain.variables + domain.metas
                if isinstance(var, Orange.data.StringVariable)
            ]
            self.geneAttrCandidates[:] = string_vars

            # Try to guess the gene name variable: an exact "gene" name
            # ranks above a name that merely contains "gene".
            if string_vars:
                lowered = [var.name.lower() for var in string_vars]
                scores = [(name == "gene", "gene" in name)
                          for name in lowered]
                best, _ = max(enumerate(scores), key=itemgetter(1))
            else:
                best = -1

            self.geneAttrIndex = best

            taxid = str(data.attributes.get(TAX_ID, ''))

            if taxid:
                # Best effort: an unknown taxid leaves the organism as is.
                try:
                    code = kegg.from_taxid(taxid)
                    self.organismIndex = self.organismCodes.index(code)
                except Exception as ex:
                    print(ex, taxid)

            self.useAttrNames = data.attributes.get(GENE_AS_ATTRIBUTE_NAME,
                                                    self.useAttrNames)

            candidate_count = len(self.geneAttrCandidates)
            if candidate_count == 0:
                self.useAttrNames = True
                self.geneAttrIndex = -1
            else:
                self.geneAttrIndex = min(self.geneAttrIndex,
                                         candidate_count - 1)

        self.__invalidated = True

    def SetRefData(self, data=None):
        self.refData = data
        self.information(1)

        if data is not None and self.useReference:
            self.__invalidated = True

    def handleNewSignals(self):
        if self.__invalidated:
            self.Update()
            self.__invalidated = False

    def UpdateListView(self):
        self.bestPValueItem = None
        self.listView.clear()
        if not self.data:
            return

        allPathways = self.org.pathways()
        allRefPathways = kegg.pathways("map")

        items = []
        kegg_pathways = kegg.KEGGPathways()

        org_code = self.organismCodes[min(self.organismIndex,
                                          len(self.organismCodes) - 1)]

        if self.showOrthology:
            self.koOrthology = kegg.KEGGBrite("ko00001")
            self.listView.setRootIsDecorated(True)
            path_ids = set([s[-5:] for s in self.pathways.keys()])

            def _walkCollect(koEntry):
                num = koEntry.title[:5] if koEntry.title else None
                if num in path_ids:
                    return ([koEntry] +
                            reduce(lambda li, c: li + _walkCollect(c),
                                   [child for child in koEntry.entries], []))
                else:
                    c = reduce(lambda li, c: li + _walkCollect(c),
                               [child for child in koEntry.entries], [])
                    return c + (c and [koEntry] or [])

            allClasses = reduce(lambda li1, li2: li1 + li2,
                                [_walkCollect(c) for c in self.koOrthology],
                                [])

            def _walkCreate(koEntry, lvItem):
                item = QTreeWidgetItem(lvItem)
                id = "path:" + org_code + koEntry.title[:5]

                if koEntry.title[:5] in path_ids:
                    p = kegg_pathways.get_entry(id)
                    if p is None:
                        # In case the genesets still have obsolete entries
                        name = koEntry.title
                    else:
                        name = p.name
                    genes, p_value, ref = self.pathways[id]
                    item.setText(0, name)
                    item.setText(1, "%.5f" % p_value)
                    item.setText(2, "%i of %i" % (len(genes), len(self.genes)))
                    item.setText(3,
                                 "%i of %i" % (ref, len(self.referenceGenes)))
                    item.pathway_id = id if p is not None else None
                else:
                    if id in allPathways:
                        text = kegg_pathways.get_entry(id).name
                    else:
                        text = koEntry.title
                    item.setText(0, text)

                    if id in allPathways:
                        item.pathway_id = id
                    elif "path:map" + koEntry.title[:5] in allRefPathways:
                        item.pathway_id = "path:map" + koEntry.title[:5]
                    else:
                        item.pathway_id = None

                for child in koEntry.entries:
                    if child in allClasses:
                        _walkCreate(child, item)

            for koEntry in self.koOrthology:
                if koEntry in allClasses:
                    _walkCreate(koEntry, self.listView)

            self.listView.update()
        else:
            self.listView.setRootIsDecorated(False)
            pathways = self.pathways.items()
            pathways = sorted(pathways, key=lambda item: item[1][1])

            for id, (genes, p_value, ref) in pathways:
                item = QTreeWidgetItem(self.listView)
                item.setText(0, kegg_pathways.get_entry(id).name)
                item.setText(1, "%.5f" % p_value)
                item.setText(2, "%i of %i" % (len(genes), len(self.genes)))
                item.setText(3, "%i of %i" % (ref, len(self.referenceGenes)))
                item.pathway_id = id
                items.append(item)

        self.bestPValueItem = items and items[0] or None
        self.listView.expandAll()
        for i in range(4):
            self.listView.resizeColumnToContents(i)

        if self.bestPValueItem:
            index = self.listView.indexFromItem(self.bestPValueItem)
            self.listView.selectionModel().select(
                index, QItemSelectionModel.ClearAndSelect)

    def UpdatePathwayView(self):
        items = self.listView.selectedItems()

        if len(items) > 0:
            item = items[0]
        else:
            item = None

        self.commit()
        item = item or self.bestPValueItem
        if not item or not item.pathway_id:
            self.pathwayView.SetPathway(None)
            return

        def get_kgml_and_image(pathway_id):
            """Return an initialized KEGGPathway with pre-cached data"""
            p = kegg.KEGGPathway(pathway_id)
            p._get_kgml()  # makes sure the kgml file is downloaded
            p._get_image_filename()  # makes sure the image is downloaded
            return (pathway_id, p)

        self.setEnabled(False)
        self._pathwayTask = concurrent.Task(
            function=lambda: get_kgml_and_image(item.pathway_id))
        self._pathwayTask.finished.connect(self._onPathwayTaskFinshed)
        self._executor.submit(self._pathwayTask)

    def _onPathwayTaskFinshed(self):
        self.setEnabled(True)
        pathway_id, self.pathway = self._pathwayTask.result()
        self.pathwayView.SetPathway(self.pathway,
                                    self.pathways.get(pathway_id, [[]])[0])

    def UpdatePathwayViewTransform(self):
        """Ask the pathway view to update its transform.

        Used as the callback of the "Resize to fit" check box.
        """
        self.pathwayView.updateTransform()

    def Update(self):
        """
        Update (recompute enriched pathways) the widget state.
        """
        if not self.data:
            return

        self.error(0)
        self.information(0)

        # XXX: Check data in setData, do not even allow this to be executed if
        # data has no genes
        try:
            genes = self.GeneNamesFromData(self.data)
        except ValueError:
            self.error(0, "Cannot extract gene names from input.")
            genes = []

        if not self.useAttrNames and any("," in gene for gene in genes):
            genes = reduce(add, (split_and_strip(gene, ",") for gene in genes),
                           [])
            self.information(
                0, "Separators detected in input gene names. "
                "Assuming multiple genes per instance.")

        self.queryGenes = genes

        self.information(1)
        reference = None
        if self.useReference and self.refData:
            reference = self.GeneNamesFromData(self.refData)
            if not self.useAttrNames \
                    and any("," in gene for gene in reference):
                reference = reduce(add, (split_and_strip(gene, ",")
                                         for gene in reference), [])
                self.information(
                    1, "Separators detected in reference gene "
                    "names. Assuming multiple genes per "
                    "instance.")

        org_code = self.SelectedOrganismCode()

        gm = GeneMatcher(kegg.to_taxid(org_code))
        gm.genes = genes
        gm.run_matcher()
        mapped_genes = {
            gene: str(ncbi_id)
            for gene, ncbi_id in gm.map_input_to_ncbi().items()
        }

        def run_enrichment(org_code, genes, reference=None, progress=None):
            org = kegg.KEGGOrganism(org_code)
            if reference is None:
                reference = org.get_ncbi_ids()

            # This is here just to keep widget working without any major changes.
            # map not needed, geneMatcher will not work on widget level.
            unique_genes = genes
            unique_ref_genes = dict([(gene, gene) for gene in set(reference)])

            taxid = kegg.to_taxid(org.org_code)
            # Map the taxid back to standard 'common' taxids
            # (as used by 'geneset') if applicable
            r_tax_map = dict(
                (v, k) for k, v in kegg.KEGGGenome.TAXID_MAP.items())
            if taxid in r_tax_map:
                taxid = r_tax_map[taxid]

            # We use the kegg pathway gene sets provided by 'geneset' for
            # the enrichment calculation.

            kegg_api = kegg.api.CachedKeggApi()
            linkmap = kegg_api.link(org.org_code, "pathway")
            converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
            kegg_sets = relation_list_to_multimap(
                linkmap,
                dict((gene.upper(), ncbi.split(':')[-1])
                     for ncbi, gene in converted_ids))

            kegg_sets = geneset.GeneSets(sets=[
                geneset.GeneSet(gs_id=ddi, genes=set(genes))
                for ddi, genes in kegg_sets.items()
            ])

            pathways = pathway_enrichment(kegg_sets,
                                          unique_genes.values(),
                                          unique_ref_genes.keys(),
                                          callback=progress)
            # Ensure that pathway entries are pre-cached for later use in the
            # list/tree view
            kegg_pathways = kegg.KEGGPathways()
            kegg_pathways.pre_cache(pathways.keys(),
                                    progress_callback=progress)

            return pathways, org, unique_genes, unique_ref_genes

        self.progressBarInit()
        self.setEnabled(False)
        self.infoLabel.setText("Retrieving...\n")

        progress = concurrent.methodinvoke(self, "setProgress", (float, ))

        self._enrichTask = concurrent.Task(function=lambda: run_enrichment(
            org_code, mapped_genes, reference, progress))
        self._enrichTask.finished.connect(self._onEnrichTaskFinished)
        self._executor.submit(self._enrichTask)

    def _onEnrichTaskFinished(self):
        """
        Handle completion of the background enrichment task.

        Re-enables the widget, stores the task results (organism, matched
        query genes, matched reference genes and enriched pathways) on the
        widget, updates the warning and the info label, and refreshes the
        list view.

        Any exception raised by the task propagates to the caller; the
        progress bar is closed in that case too.
        """
        self.setEnabled(True)
        self.setBlocking(False)
        try:
            pathways, org, unique_genes, unique_ref_genes = \
                self._enrichTask.result()
        finally:
            # Close the progress bar even when the task failed, so that a
            # failed run does not leave it open.
            self.progressBarFinished()

        self.org = org
        self.genes = unique_genes.keys()
        # Inverse of `unique_genes`: value -> key.
        self.uniqueGenesDict = {
            ncbi_id: input_name
            for input_name, ncbi_id in unique_genes.items()
        }
        # Inverse of the inverse; differs from `unique_genes` only when
        # several keys shared one value (the last one wins above).
        self.revUniqueGenesDict = {
            val: key for key, val in self.uniqueGenesDict.items()
        }
        self.referenceGenes = unique_ref_genes.keys()
        self.pathways = pathways

        if not self.pathways:
            self.warning(0, "No enriched pathways found.")
        else:
            self.warning(0)

        count = len(set(self.queryGenes))
        matched = len(unique_genes)
        # Guard against division by zero when there are no query genes.
        percent = 100.0 * matched / count if count else 0.0
        self.infoLabel.setText("%i unique gene names on input\n"
                               "%i (%.1f%%) genes names matched" %
                               (count, matched, percent))

        self.UpdateListView()

    @Slot(float)
    def setProgress(self, value):
        """
        Set the widget's progress bar to `value`.

        A call that arrives while a previous call is still running is
        dropped (re-entrancy guard).
        """
        if self.__in_setProgress:
            return

        self.__in_setProgress = True
        try:
            self.progressBarSet(value)
        finally:
            # Always release the guard; otherwise a single failing update
            # would silently disable every later progress update.
            self.__in_setProgress = False

    def GeneNamesFromData(self, data):
        """
        Collect the gene names found in `data`.

        With `useAttrNames` set, the stripped names of the domain attributes
        are returned. Otherwise the values of the currently selected gene
        attribute candidate are returned as strings, skipping rows for which
        `numpy.isnan` reports the value as NaN.

        Raises ValueError when neither source of gene names is available.
        """
        if self.useAttrNames:
            return [str(var.name).strip() for var in data.domain.attributes]

        if not self.geneAttrCandidates:
            raise ValueError("No gene names in data.")

        assert 0 <= self.geneAttrIndex < len(self.geneAttrCandidates)
        column = self.geneAttrCandidates[self.geneAttrIndex]
        names = []
        for row in data:
            if numpy.isnan(row[column]):
                continue
            names.append(str(row[column]))
        return names

    def SelectedOrganismCode(self):
        """
        Return the code of the currently selected organism.

        The stored index is capped at the last entry of `organismCodes`.
        """
        codes = self.organismCodes
        last = len(codes) - 1
        index = self.organismIndex if self.organismIndex <= last else last
        return codes[index]

    def selectAll(self):
        """
        Select every selectable item in the pathway view's scene.

        The scene's `selectionChanged` signal is disconnected from
        `_onSelectionChanged` while items are being selected; the handler is
        then invoked once, and only if at least one item changed state.
        """
        scene = self.pathwayView.scene()
        anyChanged = False
        with disconnected(scene.selectionChanged, self._onSelectionChanged):
            for graphicsItem in scene.items():
                if not (graphicsItem.flags() & QGraphicsItem.ItemIsSelectable):
                    continue
                if graphicsItem.isSelected():
                    continue
                graphicsItem.setSelected(True)
                anyChanged = True
        if anyChanged:
            self._onSelectionChanged()

    def _onSelectionChanged(self):
        """
        React to a change of the item selection in the pathway view/scene
        by committing the outputs.
        """
        self.commit()

    def commit(self):
        """
        Send the data matching the genes marked by the selected scene items.

        Without data, None is sent on both "Selected Data" and
        "Unselected Data". With `useAttrNames` set, the columns of the
        selected genes are sent on "Selected Data". Otherwise, when gene
        attribute candidates exist, rows are split by whether any of their
        comma-separated gene names maps to a selected gene; each part is
        sent on its channel, or None when that part is empty.
        """
        if not self.data:
            self.send("Selected Data", None)
            self.send("Unselected Data", None)
            return

        markedSets = [
            item.marked_objects
            for item in self.pathwayView.scene().selectedItems()
        ]
        selectedGenes = set().union(*markedSets)

        if self.useAttrNames:
            columns = [
                self.data.domain[self.uniqueGenesDict[gene]]
                for gene in selectedGenes
            ]
            self.send("Selected Data", self.data[:, columns])
        elif self.geneAttrCandidates:
            assert 0 <= self.geneAttrIndex < len(self.geneAttrCandidates)
            geneAttr = self.geneAttrCandidates[self.geneAttrIndex]
            matching = []
            remaining = []
            for rowIndex, row in enumerate(self.data):
                mapped = [
                    self.revUniqueGenesDict.get(rawName, None)
                    for rawName in split_and_strip(str(row[geneAttr]), ",")
                ]
                isHit = any(m and m in selectedGenes for m in mapped)
                (matching if isHit else remaining).append(rowIndex)

            selected = self.data[matching] if matching else None
            other = self.data[remaining] if remaining else None

            self.send("Selected Data", selected)
            self.send("Unselected Data", other)

    def ClearCache(self):
        """
        Clear the KEGG cache by delegating to `kegg.caching.clear_cache`.
        """
        kegg.caching.clear_cache()

    def onDeleteWidget(self):
        """
        Called before the widget is removed from the canvas.

        Drops the reference to the current organism and shuts the task
        executor down without waiting for pending tasks.
        """
        super().onDeleteWidget()

        self.org = None
        self._executor.shutdown(wait=False)
        # NOTE(review): the reason for forcing a collection here is not
        # documented (the original author flagged it as unknown too);
        # confirm before removing.
        gc.collect()  # Force collection (WHY?)

    def sizeHint(self):
        """Return the size hint of the widget: 1024 x 720."""
        return QSize(1024, 720)
Esempio n. 11
0
class OWSVMClassification(widget.OWWidget):
    """
    Widget for configuring a support vector machine learner.

    The widget always outputs the configured learner. When input data is
    present and suitable, it also fits the learner and outputs the
    classifier together with the data rows used as support vectors.
    """
    name = "SVM"
    description = "Support vector machines classifier with standard " \
                  "selection of kernels."
    icon = "icons/SVM.svg"

    inputs = [("Data", Table, "set_data"),
              ("Preprocessor", Preprocess, "set_preprocessor")]
    outputs = [("Learner", SVMLearner, widget.Default),
               ("Classifier", SVMClassifier), ("Support vectors", Table)]

    want_main_area = False
    resizing_enabled = False

    learner_name = settings.Setting("SVM Learner")

    # 0: c_svc, 1: nu_svc
    svmtype = settings.Setting(0)
    C = settings.Setting(1.0)
    nu = settings.Setting(0.5)
    # 0: Linear, 1: Poly, 2: RBF, 3: Sigmoid
    kernel_type = settings.Setting(0)
    degree = settings.Setting(3)
    gamma = settings.Setting(0.0)
    coef0 = settings.Setting(0.0)
    # A trailing comma here used to turn this attribute into a 1-tuple
    # holding a Setting instead of a Setting.
    shrinking = settings.Setting(True)
    # NOTE(review): `apply` currently passes probability=True unconditionally
    # and does not read this setting -- confirm whether that is intended.
    probability = settings.Setting(False)
    tol = settings.Setting(0.001)
    max_iter = settings.Setting(100)
    limit_iter = settings.Setting(True)

    def __init__(self):
        """Build the control area and send the initial learner."""
        super().__init__()

        self.data = None
        self.preprocessors = None

        box = gui.widgetBox(self.controlArea, self.tr("Name"))
        gui.lineEdit(box, self, "learner_name")

        # SVM type: radio buttons laid out in a grid next to their
        # respective parameter spin boxes.
        form = QtGui.QGridLayout()
        typebox = gui.radioButtonsInBox(
            self.controlArea,
            self,
            "svmtype",
            [],
            box=self.tr("SVM Type"),
            orientation=form,
        )

        c_svm = gui.appendRadioButton(typebox, "C-SVM", addToLayout=False)
        form.addWidget(c_svm, 0, 0, Qt.AlignLeft)
        form.addWidget(QtGui.QLabel(self.tr("Cost (C)")), 0, 1, Qt.AlignRight)
        c_spin = gui.doubleSpin(typebox,
                                self,
                                "C",
                                1e-3,
                                1000.0,
                                0.1,
                                decimals=3,
                                addToLayout=False)

        form.addWidget(c_spin, 0, 2)

        nu_svm = gui.appendRadioButton(typebox, "ν-SVM", addToLayout=False)
        form.addWidget(nu_svm, 1, 0, Qt.AlignLeft)

        form.addWidget(QtGui.QLabel(self.trUtf8("Complexity bound (\u03bd)")),
                       1, 1, Qt.AlignRight)

        nu_spin = gui.doubleSpin(typebox,
                                 self,
                                 "nu",
                                 0.05,
                                 1.0,
                                 0.05,
                                 decimals=2,
                                 addToLayout=False)
        form.addWidget(nu_spin, 1, 2)

        # Kernel selection and kernel parameters.
        box = gui.widgetBox(self.controlArea, self.tr("Kernel"))
        buttonbox = gui.radioButtonsInBox(box,
                                          self,
                                          "kernel_type",
                                          btnLabels=[
                                              "Linear,   x∙y",
                                              "Polynomial,   (g x∙y + c)^d",
                                              "RBF,   exp(-g|x-y|²)",
                                              "Sigmoid,   tanh(g x∙y + c)"
                                          ],
                                          callback=self._on_kernel_changed)
        parambox = gui.widgetBox(box, orientation="horizontal")
        gamma = gui.doubleSpin(parambox,
                               self,
                               "gamma",
                               0.0,
                               10.0,
                               0.0001,
                               label=" g: ",
                               orientation="horizontal",
                               alignment=Qt.AlignRight)
        coef0 = gui.doubleSpin(parambox,
                               self,
                               "coef0",
                               0.0,
                               10.0,
                               0.0001,
                               label=" c: ",
                               orientation="horizontal",
                               alignment=Qt.AlignRight)
        degree = gui.doubleSpin(parambox,
                                self,
                                "degree",
                                0.0,
                                10.0,
                                0.5,
                                label=" d: ",
                                orientation="horizontal",
                                alignment=Qt.AlignRight)
        # Order matters: _on_kernel_changed enables these by position.
        self._kernel_params = [gamma, coef0, degree]

        # Optimization parameters.
        box = gui.widgetBox(self.controlArea, "Optimization parameters")
        gui.doubleSpin(box,
                       self,
                       "tol",
                       1e-7,
                       1.0,
                       5e-7,
                       label="Numerical Tolerance")
        gui.spin(box,
                 self,
                 "max_iter",
                 0,
                 1e6,
                 100,
                 label="Iteration Limit",
                 checked="limit_iter")

        gui.button(self.controlArea,
                   self,
                   "&Apply",
                   callback=self.apply,
                   default=True)

        self._on_kernel_changed()

        self.apply()

    @check_sql_input
    def set_data(self, data):
        """Set the input train data set."""
        self.data = data
        if data is not None:
            self.apply()

    def set_preprocessor(self, preproc):
        """Set (or clear, with None) the input preprocessor and re-apply."""
        if preproc is None:
            self.preprocessors = None
        else:
            self.preprocessors = (preproc, )
        self.apply()

    def apply(self):
        """
        Build the learner from the current settings and send the outputs.

        The learner is always sent. The classifier and the support vectors
        are sent as None unless data is present, the learner accepts the
        data's domain and the target has at least two distinct values.
        """
        kernel = ["linear", "poly", "rbf", "sigmoid"][self.kernel_type]
        common_args = dict(kernel=kernel,
                           degree=self.degree,
                           gamma=self.gamma,
                           coef0=self.coef0,
                           tol=self.tol,
                           # -1 is passed when the iteration limit is off
                           max_iter=self.max_iter if self.limit_iter else -1,
                           probability=True,
                           preprocessors=self.preprocessors)
        if self.svmtype == 0:
            learner = SVMLearner(C=self.C, **common_args)
        else:
            learner = NuSVMLearner(nu=self.nu, **common_args)
        learner.name = self.learner_name
        classifier = None
        sv = None
        # Clear both error slots on every run so that a message raised for
        # earlier data does not linger once that data is gone.
        self.error([0, 1])
        if self.data is not None:
            if not learner.check_learner_adequacy(self.data.domain):
                self.error(0, learner.learner_adequacy_err_msg)
            elif len(np.unique(self.data.Y)) < 2:
                self.error(1, "Data contains only one target value.")
            else:
                classifier = learner(self.data)
                classifier.name = self.learner_name
                sv = self.data[classifier.skl_model.support_]

        self.send("Learner", learner)
        self.send("Classifier", classifier)
        self.send("Support vectors", sv)

    def _on_kernel_changed(self):
        """Enable only the parameter spin boxes used by the chosen kernel."""
        # Rows follow the kernel_type index; columns follow
        # self._kernel_params, i.e. [gamma, coef0, degree].
        enabled_by_kernel = [
            [False, False, False],  # linear
            [True, True, True],  # poly
            [True, False, False],  # rbf
            [True, True, False],  # sigmoid
        ]

        mask = enabled_by_kernel[self.kernel_type]
        for spin, is_enabled in zip(self._kernel_params, mask):
            spin.setEnabled(is_enabled)
Esempio n. 12
0
class OWClusterAnalysis(OWWidget):
    name = "Cluster Analysis"
    description = (
        "The widget displays differentially expressed genes that characterize the cluster, "
        "and corresponding gene terms that describe differentially expressed genes"
    )
    icon = "../widgets/icons/OWClusterAnalysis.svg"
    priority = 110

    class Inputs:
        # Input signals of the widget; both carry a Table.
        data_table = Input('Data', Table)
        custom_sets = Input('Custom Gene Sets', Table)

    class Outputs:
        # Output signals of the widget; all carry a Table.
        selected_data = Output('Selected Data', Table)
        gene_scores = Output('Gene Scores', Table)
        gene_set_scores = Output('Gene Set Scores', Table)

    class Information(OWWidget.Information):
        # No widget-specific information messages are defined.
        pass

    class Warning(OWWidget.Warning):
        # Two-placeholder message; __gene_enrichment fills it with the error
        # text and a note about the p-values.
        gene_enrichment = Msg('{}, {}.')
        # Shown by __gene_sets_enrichment when gene sets are available but
        # none is selected.
        no_selected_gene_sets = Msg(
            'No gene set selected, select them from Gene Sets box.')

    class Error(OWWidget.Error):
        # Raised by handle_input when the cluster indicator model is empty.
        no_cluster_indicator = Msg('No cluster indicator in the input data')
        # Raised by handle_input when the data is not flagged as having
        # genes as attribute (column) names.
        gene_as_attributes = Msg(
            'Genes, in the input data, are expected as column names')
        organism_mismatch = Msg(
            'Organism in input data and custom gene sets does not match')
        # Raised by __set_batch when batch and cluster variables coincide.
        cluster_batch_conflict = Msg(
            'Cluster and batch must not be the same variable')

    # Context handler for the ContextSetting values below; contexts are
    # opened and closed in the input handlers.
    settingsHandler = ClusterAnalysisContextHandler()
    cluster_indicators = ContextSetting([])
    batch_indicator = ContextSetting(None)
    stored_gene_sets_selection = ContextSetting(())

    # attribute names handed to GeneScoringWidget in __init__
    scoring_method_selection = ContextSetting(0)
    scoring_method_design = ContextSetting(0)
    scoring_test_type = ContextSetting(0)

    # genes filter
    # Each threshold is paired with a `use_*` flag; the threshold is only
    # passed on by filter_genes when its flag is set.
    max_gene_count = Setting(20)
    use_gene_count_filter = Setting(True)

    max_gene_p_value = Setting(0.1)
    use_gene_pval_filter = Setting(False)

    max_gene_fdr = Setting(0.1)
    use_gene_fdr_filter = Setting(True)

    # gene sets filter
    # Same threshold/flag pairing, consumed by filter_gene_sets.
    min_gs_count = Setting(5)
    use_gs_count_filter = Setting(True)

    max_gs_p_value = Setting(0.1)
    use_gs_pval_filter = Setting(False)

    max_gs_fdr = Setting(0.1)
    use_gs_max_fdr = Setting(True)

    # auto commit results
    auto_commit = settings.Setting(False)

    # variable of the custom gene sets input selected in the combo box
    custom_gene_set_indicator = settings.Setting(None)

    def __init__(self):
        """
        Initialise the widget state and build the GUI.

        The control area holds the info label, the cluster and batch
        indicator selectors, the gene scoring options, the gene set
        selection, the custom gene set selector and the commit button. The
        main area holds the gene and gene set filters and the table view
        with the cluster information.
        """
        super().__init__()

        # widget attributes
        self.input_data = None
        self.store_input_domain = None
        self.input_genes_names = []
        self.input_genes_ids = []

        self.tax_id = None
        self.use_attr_names = None
        self.gene_id_attribute = None

        # custom gene set input
        self.feature_model = itemmodels.DomainModel(
            valid_types=(DiscreteVariable, StringVariable))
        self.custom_data = None
        self.custom_tax_id = None
        self.custom_use_attr_names = None
        self.custom_gene_id_attribute = None
        self.custom_gene_id_column = None
        self.num_of_custom_sets = None

        self.rows_by_cluster = None
        self.rows_by_batch = None
        self.clusters = []
        self.new_cluster_profile = []

        # data model
        self.cluster_info_model = None

        # Info
        info_box = vBox(self.controlArea, 'Info')
        self.input_info = widgetLabel(info_box)

        # Cluster selection
        self.cluster_indicator_model = itemmodels.DomainModel(
            valid_types=(DiscreteVariable, ), separators=False)
        self.cluster_indicator_box = widgetBox(self.controlArea,
                                               'Cluster Indicator')

        self.cluster_indicator_view = listView(
            self.cluster_indicator_box,
            self,
            'cluster_indicators',
            model=self.cluster_indicator_model,
            selectionMode=QListWidget.MultiSelection,
            callback=self.invalidate,
            sizeHint=QSize(256, 70),
        )

        # Batch selection
        self.batch_indicator_model = itemmodels.DomainModel(
            valid_types=(DiscreteVariable, ), separators=False, placeholder="")
        box = widgetBox(self.controlArea, 'Batch Indicator')
        self.batch_indicator_combobox = comboBox(
            box,
            self,
            'batch_indicator',
            model=self.batch_indicator_model,
            sendSelectedValue=True,
            callback=self.batch_indicator_changed,
        )

        # Gene scoring
        box = widgetBox(self.controlArea, 'Gene Scoring')
        self.gene_scoring = GeneScoringWidget(box, self)
        self.gene_scoring.set_method_selection_area('scoring_method_selection')
        self.gene_scoring.set_method_design_area('scoring_method_design')
        self.gene_scoring.set_test_type('scoring_test_type')

        # Gene Sets widget
        gene_sets_box = widgetBox(self.controlArea, "Gene Sets")
        self.gs_widget = GeneSetsSelection(gene_sets_box, self,
                                           'stored_gene_sets_selection')
        self.gs_widget.hierarchy_tree_widget.itemClicked.connect(
            self.__gene_sets_enrichment)

        # custom gene sets area
        box = vBox(self.controlArea, "Custom Gene Sets")

        # Drop a stored indicator that the (still empty) model cannot offer.
        if self.custom_gene_set_indicator not in self.feature_model:
            self.custom_gene_set_indicator = None

        self.gs_label_combobox = comboBox(
            box,
            self,
            "custom_gene_set_indicator",
            sendSelectedValue=True,
            model=self.feature_model,
            callback=self.handle_custom_gene_sets,
        )
        self.gs_label_combobox.setDisabled(True)

        # main area
        splitter = QSplitter(Qt.Horizontal, self.mainArea)
        self.mainArea.layout().addWidget(splitter)

        genes_filter = widgetBox(splitter,
                                 'Filter Genes',
                                 orientation=QHBoxLayout())
        spin(
            genes_filter,
            self,
            'max_gene_count',
            0,
            10000,
            label='Count',
            # This spin edits `max_gene_count`; the tooltip used to say
            # "Minimum", copied from the gene sets filter below.
            tooltip='Maximum genes count',
            checked='use_gene_count_filter',
            callback=self.filter_genes,
            callbackOnReturn=True,
            checkCallback=self.filter_genes,
        )

        doubleSpin(
            genes_filter,
            self,
            'max_gene_p_value',
            0.0,
            1.0,
            0.0001,
            label='p-value',
            tooltip='Maximum p-value of the enrichment score',
            checked='use_gene_pval_filter',
            callback=self.filter_genes,
            callbackOnReturn=True,
            checkCallback=self.filter_genes,
        )

        doubleSpin(
            genes_filter,
            self,
            'max_gene_fdr',
            0.0,
            1.0,
            0.0001,
            label='FDR',
            tooltip='Maximum false discovery rate',
            checked='use_gene_fdr_filter',
            callback=self.filter_genes,
            callbackOnReturn=True,
            checkCallback=self.filter_genes,
        )

        gene_sets_filter = widgetBox(splitter,
                                     'Filter Gene Sets',
                                     orientation=QHBoxLayout())
        spin(
            gene_sets_filter,
            self,
            'min_gs_count',
            0,
            DISPLAY_GENE_SETS_COUNT,
            label='Count',
            tooltip='Minimum genes count',
            checked='use_gs_count_filter',
            callback=self.filter_gene_sets,
            callbackOnReturn=True,
            checkCallback=self.filter_gene_sets,
        )

        doubleSpin(
            gene_sets_filter,
            self,
            'max_gs_p_value',
            0.0,
            1.0,
            0.0001,
            label='p-value',
            tooltip='Maximum p-value of the enrichment score',
            checked='use_gs_pval_filter',
            callback=self.filter_gene_sets,
            callbackOnReturn=True,
            checkCallback=self.filter_gene_sets,
        )

        doubleSpin(
            gene_sets_filter,
            self,
            'max_gs_fdr',
            0.0,
            1.0,
            0.0001,
            label='FDR',
            tooltip='Maximum false discovery rate',
            checked='use_gs_max_fdr',
            callback=self.filter_gene_sets,
            callbackOnReturn=True,
            checkCallback=self.filter_gene_sets,
        )

        # Table view with the cluster information; its model is installed
        # later by __set_cluster_info_model.
        self.cluster_info_view = QTableView()
        self.cluster_info_view.verticalHeader().setVisible(False)
        self.cluster_info_view.setItemDelegate(HTMLDelegate())
        self.cluster_info_view.horizontalHeader().hide()
        self.cluster_info_view.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)

        auto_commit(self.controlArea,
                    self,
                    "auto_commit",
                    "&Commit",
                    box=False)

        self.mainArea.layout().addWidget(self.cluster_info_view)

    def sizeHint(self):
        """Return the size hint of the widget: 800 x 600."""
        return QSize(800, 600)

    def __update_info_box(self):
        """
        Rebuild the text of the info label.

        Reports the number of samples, clusters ('?' when there are none)
        and input genes, or that no genes are on input, followed by a line
        about the custom gene sets when custom data is present.
        """
        lines = []
        if self.input_genes_ids:
            n_samples = self.input_data.X.shape[0]
            n_clusters = len(self.clusters) if self.clusters else '?'
            lines.append('{} samples, {} clusters\n'.format(
                n_samples, n_clusters))
            lines.append('{:,d} unique genes\n'.format(
                len(self.input_genes_ids)))
        else:
            lines.append('No genes on input.\n')

        if self.custom_data:
            lines.append('{} marker genes in {} sets\n'.format(
                self.custom_data.X.shape[0], self.num_of_custom_sets))

        self.input_info.setText(''.join(lines))

    def __set_cluster_info_model(self):
        """
        Replace the model of the cluster info view with a fresh
        ClusterModel holding the current clusters, and commit whenever the
        selection in the view changes.
        """
        view = self.cluster_info_view
        view.setModel(None)

        model = ClusterModel(self)
        self.cluster_info_model = model
        model.add_rows(self.clusters)

        # install the model, then let the rows adapt to their contents
        view.setModel(model)
        view.resizeRowsToContents()
        view.selectionModel().selectionChanged.connect(self.commit)

    def __create_temp_class_var(self):
        """
        Combine the selected cluster indicators into one discrete variable.

        Every combination of the indicators' values becomes one value of a
        new variable named 'Cluster indicators'. The variable is appended to
        the metas of `self.input_data` (which is replaced by the transformed
        table) and filled, per row, with the index of the combination the
        row belongs to. The index profile of every combination is appended
        to `self.new_cluster_profile`.

        Returns the new variable.
        """
        cluster_indicator_name = 'Cluster indicators'
        indicators = list(self.cluster_indicators)
        new_cluster_values = []

        # Work with per-variable value indices. A single value -> index
        # lookup shared by all variables gives wrong indices as soon as two
        # indicators share a value name.
        index_ranges = [range(len(var.values)) for var in indicators]
        for index_comb in itertools.product(*index_ranges):
            new_cluster_values.append(', '.join(
                var.values[idx] for var, idx in zip(indicators, index_comb)))
            self.new_cluster_profile.append(list(index_comb))

        # index profile of a combination -> index of the combined value
        row_profile_lookup = {
            tuple(profile): indx
            for indx, (profile, _) in enumerate(
                zip(self.new_cluster_profile, new_cluster_values))
        }

        # One row per indicator, one column per data row.
        row_profile = np.vstack([
            np.asarray(self.input_data.get_column_view(var)[0], dtype=int)
            for var in indicators
        ])

        ca_ind = DiscreteVariable.make(
            cluster_indicator_name,
            values=list(new_cluster_values),
            ordered=True)

        domain = Domain(
            self.input_data.domain.attributes,
            self.input_data.domain.class_vars,
            self.input_data.domain.metas + (ca_ind, ),
        )

        table = self.input_data.transform(domain)
        table[:, ca_ind] = np.array(
            [[row_profile_lookup[tuple(column)]] for column in row_profile.T])
        self.input_data = table
        return ca_ind

    def __set_clusters(self):
        """
        Rebuild the cluster variable, the per-row cluster indices and the
        list of Cluster objects from the selected cluster indicators.

        With more than one selected indicator a combined variable is
        created; otherwise the first selected indicator is used. Nothing is
        built when there is no indicator or no input data.
        """
        self.clusters = []
        self.new_cluster_profile = []
        self.cluster_var = None

        if not (self.cluster_indicators and self.input_data):
            return

        indicators = self.cluster_indicators
        if isinstance(indicators, list) and len(indicators) > 1:
            self.cluster_var = self.__create_temp_class_var()
        else:
            self.cluster_var = indicators[0]

        column = self.input_data.get_column_view(self.cluster_var)[0]
        self.rows_by_cluster = np.asarray(column, dtype=int)

        for position, label in enumerate(self.cluster_var.values):
            new_cluster = Cluster(label, position)
            self.clusters.append(new_cluster)
            new_cluster.set_genes(self.input_genes_names,
                                  self.input_genes_ids)

    def __set_batch(self):
        """
        Refresh the per-row batch indices from the selected batch indicator.

        Raises the cluster/batch conflict error, and leaves the batch
        indices unset, when the batch indicator is the cluster variable.
        """
        self.Error.cluster_batch_conflict.clear()
        self.rows_by_batch = None

        # Compare only when a batch indicator is selected: with no batch
        # and no cluster variable both sides are None, which used to be
        # reported as a conflict.
        if (self.batch_indicator is not None
                and self.batch_indicator == self.cluster_var):
            self.Error.cluster_batch_conflict()
            return
        if self.batch_indicator and self.input_data:
            self.rows_by_batch = np.asarray(self.input_data.get_column_view(
                self.batch_indicator)[0],
                                            dtype=int)

    def __set_genes(self):
        """
        Refresh the gene name and gene id lists from the input data.

        Both lists stay empty unless `use_attr_names` is set. Ids are read
        from each attribute's `attributes` dict under `gene_id_attribute`,
        with NaN as the fallback, and stored as strings.
        """
        names = []
        ids = []

        if self.use_attr_names:
            variables = self.input_data.domain.attributes
            names = [str(var.name) for var in variables]
            ids = [
                str(var.attributes.get(self.gene_id_attribute, np.nan))
                for var in variables
            ]

        self.input_genes_names = names
        self.input_genes_ids = ids

    def filter_genes(self):
        """
        Apply the enabled gene filters to the cluster model.

        Disabled filters are passed as None. Afterwards the gene set
        enrichment is recalculated, the view rows are resized and the
        outputs are committed. Does nothing without a cluster model.
        """
        if not self.cluster_info_model:
            return

        p_value = self.max_gene_p_value if self.use_gene_pval_filter else None
        fdr = self.max_gene_fdr if self.use_gene_fdr_filter else None
        count = self.max_gene_count if self.use_gene_count_filter else None
        self.cluster_info_model.apply_gene_filters(p_value, fdr, count)

        # the set of genes changed, so gene set enrichment must be redone
        self.__gene_sets_enrichment()
        # let the rows adapt to the new contents
        self.cluster_info_view.resizeRowsToContents()

        # propagate the filtered result
        self.commit()

    def filter_gene_sets(self):
        """
        Apply the enabled gene set filters to the cluster model and resize
        the view rows. Disabled filters are passed as None. Does nothing
        without a cluster model.
        """
        if not self.cluster_info_model:
            return

        p_value = self.max_gs_p_value if self.use_gs_pval_filter else None
        fdr = self.max_gs_fdr if self.use_gs_max_fdr else None
        count = self.min_gs_count if self.use_gs_count_filter else None
        self.cluster_info_model.apply_gene_sets_filters(p_value, fdr, count)

        # let the rows adapt to the new contents
        self.cluster_info_view.resizeRowsToContents()

    def __gene_enrichment(self):
        """
        Score the genes of every cluster with the selected method.

        For the hypergeometric test the data must consist of exactly two
        distinct values, one of them 0. A ValueError raised by that check or
        by the scoring is shown as the gene enrichment warning.
        """
        # truthy: cluster vs. cluster, falsy: cluster vs. rest
        cluster_vs_cluster = bool(self.gene_scoring.get_selected_desig())
        alternative = self.gene_scoring.get_selected_test_type()
        scorer = self.gene_scoring.get_selected_method()
        try:
            if scorer.score_function == score_hypergeometric_test:
                distinct = set(np.unique(self.input_data.X))
                if not (0 in distinct and len(distinct) == 2):
                    raise ValueError('Binary data expected (use Preprocess)')

            self.cluster_info_model.score_genes(
                design=cluster_vs_cluster,
                table_x=self.input_data.X,
                rows_by_cluster=self.rows_by_cluster,
                rows_by_batch=self.rows_by_batch,
                method=scorer,
                alternative=alternative,
            )
        except ValueError as err:
            self.Warning.gene_enrichment(str(err), 'p-values are set to 1')

    def __gene_sets_enrichment(self):
        """
        Recalculate gene set enrichment for the selected gene sets.

        Warns when gene sets are available but none is selected, stores the
        current selection in the settings, runs the enrichment on the
        cluster model with the input gene ids as reference, and re-applies
        the gene set filters. Does nothing without input data.

        Exceptions raised by the enrichment propagate to the caller.
        """
        if not self.input_data:
            return

        self.Warning.no_selected_gene_sets.clear()
        all_sets = self.gs_widget.get_hierarchies()
        selected_sets = self.gs_widget.get_hierarchies(only_selected=True)

        if len(selected_sets) == 0 and len(all_sets) > 0:
            self.Warning.no_selected_gene_sets()

        # save setting on selected hierarchies
        self.stored_gene_sets_selection = tuple(selected_sets)
        ref_genes = set(self.input_genes_ids)

        # TODO: decide which exceptions to handle here. The previous
        # handler caught Exception only to re-raise it unchanged, so it
        # was dropped.
        self.cluster_info_model.gene_sets_enrichment(
            self.gs_widget.gs_object, selected_sets, ref_genes)

        self.filter_gene_sets()

    def invalidate(self, cluster_init=True):
        """
        Recompute everything that depends on the current selections.

        Cancels the existing cluster model (if any), refreshes genes,
        clusters (only when `cluster_init` is true) and batch, installs a
        new cluster model, scores the genes and updates the info box. Does
        nothing while the input data or the taxonomy id is missing.
        """
        if self.input_data is None or self.tax_id is None:
            return

        self.Warning.gene_enrichment.clear()

        previous_model = self.cluster_info_model
        if previous_model is not None:
            previous_model.cancel()

        self.__set_genes()
        if cluster_init:
            self.__set_clusters()
        self.__set_batch()
        self.__set_cluster_info_model()

        # note: gene set enrichment is calculated automatically as part of
        #       self.__gene_enrichment; no need to call
        #       self.__gene_sets_enrichment here
        self.__gene_enrichment()
        self.__update_info_box()

    def batch_indicator_changed(self):
        """
        Callback of the batch indicator combo box: recompute the results
        while keeping the current clusters (`cluster_init=False`).
        """
        self.invalidate(cluster_init=False)

    @Inputs.data_table
    def handle_input(self, data):
        """
        Handler of the 'Data' input.

        Closes the current context, clears all messages and resets every
        piece of state derived from the previous data. When `data` is
        truthy, the indicator models are populated from its domain, the
        gene annotations are read from the table attributes, the context is
        opened, gene sets are loaded for the table's taxonomy id and the
        results are recomputed. Processing stops with an error message when
        the data offers no cluster indicator or is not annotated as having
        genes as attribute names.
        """
        self.closeContext()
        self.Warning.clear()
        self.Error.clear()

        # reset the state derived from the previous input
        self.input_data = None
        self.store_input_domain = None
        self.stored_gene_sets_selection = ()
        self.input_genes_names = []
        self.input_genes_ids = []
        self.tax_id = None
        self.use_attr_names = None
        self.gene_id_attribute = None
        self.clusters = None

        self.gs_widget.clear()
        self.gs_widget.clear_gene_sets()
        self.cluster_info_view.setModel(None)

        self.cluster_indicators = []
        self.cluster_var = None
        self.batch_indicator = None
        self.cluster_indicator_model.set_domain(None)
        self.batch_indicator_model.set_domain(None)

        self.__update_info_box()

        if data:
            self.input_data = data

            self.cluster_indicator_model.set_domain(self.input_data.domain)
            self.batch_indicator_model.set_domain(self.input_data.domain)

            # For Cluster Indicator do not use categorical variables that contain only one value.
            self.cluster_indicator_model.wrap([
                item for item in self.cluster_indicator_model
                if len(item.values) > 1
            ])
            # First value in batch indicator model is a NoneType,
            # we can skip it when we validate categorical variables
            self.batch_indicator_model.wrap(self.batch_indicator_model[:1] + [
                item for item in self.batch_indicator_model[1:]
                if len(item.values) > 1
            ])

            # gene annotations stored in the table attributes
            self.tax_id = self.input_data.attributes.get(TAX_ID, None)
            self.use_attr_names = self.input_data.attributes.get(
                GENE_AS_ATTRIBUTE_NAME, None)
            self.gene_id_attribute = self.input_data.attributes.get(
                GENE_ID_ATTRIBUTE, None)

            # NOTE: on these early returns input_data stays set and no
            # context is opened.
            if not self.cluster_indicator_model:
                self.Error.no_cluster_indicator()
                return
            elif not self.use_attr_names:
                self.Error.gene_as_attributes()
                return

            self.openContext(self.input_data.domain)

            self.gs_widget.load_gene_sets(self.tax_id)
            # fall back to the first entry of each model when the context
            # did not restore a selection
            if self.cluster_indicator_model and len(
                    self.cluster_indicators) < 1:
                self.cluster_indicators = [self.cluster_indicator_model[0]]
            if self.batch_indicator_model and self.batch_indicator is None:
                self.batch_indicator = self.batch_indicator_model[0]

            self.invalidate()

            # re-apply custom gene sets that arrived before this data
            if self.custom_data:
                self.refresh_custom_gene_sets()
                # NOTE(review): `_handle_future_model` is defined outside
                # this view; the name may be a typo for a "feature model"
                # handler -- confirm.
                self._handle_future_model()
                self.handle_custom_gene_sets()

    @Inputs.custom_sets
    def handle_custom_input(self, data):
        """Handle a new table arriving on the custom gene sets input.

        State derived from the previous custom table is reset. If ``data`` is
        truthy its table-level annotations are read and the gene-set
        indicator is re-bound to the new feature model. Afterwards the
        context is re-opened (when a data table is present) and the custom
        gene sets are rebuilt.

        :param data: the incoming table, or a falsy value to clear the input
        """
        self.Error.clear()
        self.Warning.clear()
        self.closeContext()
        self.custom_data = None
        self.custom_tax_id = None
        self.custom_use_attr_names = None
        self.custom_gene_id_attribute = None
        self.custom_gene_id_column = None
        self.num_of_custom_sets = None
        self.feature_model.set_domain(None)

        if data:
            self.custom_data = data
            self.feature_model.set_domain(self.custom_data.domain)

            attributes = self.custom_data.attributes
            # Keep a missing tax id as None. Converting it unconditionally
            # would yield the string 'None', which the organism check would
            # then treat as a known (and mismatching) organism.
            tax_id = attributes.get(TAX_ID, None)
            self.custom_tax_id = None if tax_id is None else str(tax_id)
            self.custom_use_attr_names = attributes.get(
                GENE_AS_ATTRIBUTE_NAME, None)
            self.custom_gene_id_attribute = attributes.get(
                GENE_ID_ATTRIBUTE, None)
            self.custom_gene_id_column = attributes.get(GENE_ID_COLUMN, None)

            self._handle_future_model()

        if self.input_data:
            self.openContext(self.input_data.domain)

        # The label combo box stays disabled until handle_custom_gene_sets
        # has successfully added the custom sets.
        self.gs_label_combobox.setDisabled(True)
        self.refresh_custom_gene_sets()
        self.handle_custom_gene_sets(select_customs_flag=True)

    def __check_organism_mismatch(self):
        """ Tell whether the two inputs are annotated with different organisms.

        No mismatch is reported while either taxonomy id is ``None``.

        :return: True if there is a mismatch
        """
        if self.tax_id is None or self.custom_tax_id is None:
            return False
        return self.tax_id != self.custom_tax_id

    def _handle_future_model(self):
        """Re-bind ``custom_gene_set_indicator`` to an entry of the feature model.

        The current indicator is kept (replaced by the model's own instance)
        when the model contains it; otherwise the first model entry is used,
        or ``None`` when the model is empty.
        """
        model = self.feature_model
        current = self.custom_gene_set_indicator

        if current in model:
            self.custom_gene_set_indicator = model[model.indexOf(current)]
        elif model:
            self.custom_gene_set_indicator = model[0]
        else:
            self.custom_gene_set_indicator = None

    def handle_custom_gene_sets(self, select_customs_flag=False):
        """Register the custom gene sets with the gene-set widget.

        Nothing is registered unless a gene-set indicator is selected. When
        the custom table or its gene id column is missing, only the gene-set
        hierarchy is refreshed. On an organism mismatch the error is shown,
        the hierarchy refreshed, enrichment recomputed and the method returns
        without updating the info box.

        :param select_customs_flag: forwarded to ``gs_widget.add_custom_sets``
        """
        indicator = self.custom_gene_set_indicator

        if indicator:
            if self.custom_data is None or self.custom_gene_id_column is None:
                self.gs_widget.update_gs_hierarchy()
            else:
                if self.__check_organism_mismatch():
                    self.gs_label_combobox.setDisabled(True)
                    self.Error.organism_mismatch()
                    self.gs_widget.update_gs_hierarchy()
                    self.__gene_sets_enrichment()
                    return

                # Discrete indicators store value indices; map them back to
                # their labels. Other indicators are used as they are.
                set_column = self.custom_data.get_column_view(indicator)[0]
                if isinstance(indicator, DiscreteVariable):
                    value_labels = indicator.values
                    gene_sets_names = [
                        value_labels[int(position)] for position in set_column
                    ]
                else:
                    gene_sets_names = set_column

                self.num_of_custom_sets = len(set(gene_sets_names))
                gene_names = self.custom_data.get_column_view(
                    self.custom_gene_id_column)[0]
                # The hierarchy is a one-element tuple holding the title.
                hierarchy_title = (self.custom_data.name or 'Custom sets', )

                try:
                    self.gs_widget.add_custom_sets(
                        gene_sets_names,
                        gene_names,
                        hierarchy_title=hierarchy_title,
                        select_customs_flag=select_customs_flag,
                    )
                except GeneSetException:
                    pass
                self.gs_label_combobox.setDisabled(False)

        self.__gene_sets_enrichment()
        self.__update_info_box()

    def refresh_custom_gene_sets(self):
        """Remove the custom gene sets currently held by the gene-set widget."""
        self.gs_widget.clear_custom_sets()
        # NOTE(review): the hierarchy refresh below is disabled; callers in
        # this file follow up with handle_custom_gene_sets.
        # self.gs_widget.update_gs_hierarchy()

    def gene_scores_output(self, selected_clusters):
        """Build the per-gene score table and send it to ``gene_scores``.

        One row is produced for every filtered gene of every selected
        cluster. Each row starts with the cluster index, followed (when a
        cluster profile is available) by the cluster's profile values, and
        then gene name, gene id, rank, score, p-value and FDR.

        :param selected_clusters: clusters whose filtered genes are exported
        """
        score_metas = [
            StringVariable('Gene'),
            StringVariable(ENTREZ_ID),
            StringVariable('Rank'),
            ContinuousVariable('Statistic score'),
            ContinuousVariable('P-value'),
            ContinuousVariable('FDR'),
        ]

        # note: order is important
        if len(self.new_cluster_profile):
            metas = self.cluster_indicators + score_metas
        else:
            metas = score_metas

        domain = Domain([], metas=metas, class_vars=self.cluster_var)

        rows = []
        for cluster in selected_clusters:
            gene_count = len(cluster.filtered_genes)

            scores = [gene.score for gene in cluster.filtered_genes]
            p_vals = [gene.p_val for gene in cluster.filtered_genes]
            fdr_vals = [gene.fdr for gene in cluster.filtered_genes]
            gene_names = [
                gene.input_identifier for gene in cluster.filtered_genes
            ]
            gene_ids = [gene.gene_id for gene in cluster.filtered_genes]
            rank = rankdata(p_vals, method='min')

            # Leading columns: the cluster index, then one constant column
            # per profile value of this cluster.
            columns = [[cluster.index] * gene_count]
            if len(self.new_cluster_profile):
                columns.extend(
                    [value] * gene_count
                    for value in self.new_cluster_profile[cluster.index])
            columns += [gene_names, gene_ids, rank, scores, p_vals, fdr_vals]

            rows.extend(list(row) for row in zip(*columns))

        out_data = Table(domain, rows)
        out_data.attributes[TAX_ID] = self.tax_id
        out_data.attributes[GENE_AS_ATTRIBUTE_NAME] = False
        out_data.attributes[GENE_ID_COLUMN] = ENTREZ_ID
        self.Outputs.gene_scores.send(out_data)

    def gene_set_scores_output(self, selected_clusters):
        """Build the per-gene-set score table and send it to ``gene_set_scores``.

        One row is produced for every filtered gene set of every selected
        cluster. Each row starts with the cluster index, followed (when a
        cluster profile is available) by the cluster's profile values, and
        then term name, term id, rank, p-value and FDR.

        :param selected_clusters: clusters whose filtered gene sets are exported
        """
        score_metas = [
            StringVariable('Term'),
            StringVariable('Term ID'),
            StringVariable('Rank'),
            ContinuousVariable('P-value'),
            ContinuousVariable('FDR'),
        ]

        # note: order is important
        if len(self.new_cluster_profile):
            metas = self.cluster_indicators + score_metas
        else:
            metas = score_metas

        domain = Domain([], metas=metas, class_vars=self.cluster_var)

        rows = []
        for cluster in selected_clusters:
            set_count = len(cluster.filtered_gene_sets)

            p_vals = [gs.p_val for gs in cluster.filtered_gene_sets]
            fdr_vals = [gs.fdr for gs in cluster.filtered_gene_sets]
            gs_names = [gs.name for gs in cluster.filtered_gene_sets]
            gs_ids = [gs.gs_id for gs in cluster.filtered_gene_sets]
            rank = rankdata(p_vals, method='min')

            # Leading columns: the cluster index, then one constant column
            # per profile value of this cluster.
            columns = [[cluster.index] * set_count]
            if len(self.new_cluster_profile):
                columns.extend(
                    [value] * set_count
                    for value in self.new_cluster_profile[cluster.index])
            columns += [gs_names, gs_ids, rank, p_vals, fdr_vals]

            rows.extend(list(row) for row in zip(*columns))

        self.Outputs.gene_set_scores.send(Table(domain, rows))

    def commit(self):
        """Send the data, gene scores and gene-set scores of selected clusters.

        Without input data or without a selection, ``None`` is sent on all
        three outputs so that no stale table remains downstream. Otherwise
        the input table is reduced to the columns whose gene id belongs to a
        filtered gene of a selected cluster and to the rows assigned to a
        selected cluster, and the two score tables are rebuilt.
        """
        selected_indexes = self.cluster_info_view.selectionModel().selectedRows()

        if not self.input_data or not selected_indexes:
            # Clear every output, not only the selected data; otherwise the
            # score outputs keep the tables of the previous selection.
            self.Outputs.selected_data.send(None)
            self.Outputs.gene_scores.send(None)
            self.Outputs.gene_set_scores.send(None)
            return

        selected_clusters = [index.data() for index in selected_indexes]
        selected_cluster_indexes = {
            cluster.index for cluster in selected_clusters
        }
        selected_cluster_genes = {
            gene.gene_id
            for cluster in selected_clusters
            for gene in cluster.filtered_genes
        }

        # get columns of selected clusters
        selected_columns = [
            column for column in self.input_data.domain.attributes
            if self.gene_id_attribute in column.attributes
            and str(column.attributes[
                self.gene_id_attribute]) in selected_cluster_genes
        ]

        domain = Domain(selected_columns, self.input_data.domain.class_vars,
                        self.input_data.domain.metas)
        output_data = self.input_data.from_table(domain, self.input_data)

        # get rows of selected clusters
        row_indices = [
            row_index
            for row_index, cluster_index in enumerate(self.rows_by_cluster)
            if cluster_index in selected_cluster_indexes
        ]

        # send to output signal
        self.Outputs.selected_data.send(output_data[row_indices])
        self.gene_scores_output(selected_clusters)
        self.gene_set_scores_output(selected_clusters)
Esempio n. 13
0
class OWSVMClassification(OWBaseSVM):
    """SVM classification widget offering C-SVM and ν-SVM learners.

    Extends the shared SVM controls with an SVM type selector and an
    optional iteration limit.
    """
    name = "SVM"
    description = "Support Vector Machines map inputs to higher-dimensional " \
                  "feature spaces that best separate different classes. "
    icon = "icons/SVM.svg"
    priority = 50

    LEARNER = SVMLearner

    outputs = [("Support vectors", Table)]

    # 0: c_svc, 1: nu_svc
    svmtype = settings.Setting(0)
    C = settings.Setting(1.0)
    nu = settings.Setting(0.5)
    # No trailing comma here: it would turn the attribute into a one-element
    # tuple instead of a Setting.
    shrinking = settings.Setting(True)
    probability = settings.Setting(False)
    max_iter = settings.Setting(100)
    limit_iter = settings.Setting(True)

    def _add_type_box(self):
        """Build the "SVM Type" box with one row per SVM type.

        Each row holds the radio button, a label and the spin box for the
        parameter of that type (C or ν).
        """
        form = QtGui.QGridLayout()
        self.type_box = box = gui.radioButtonsInBox(
            self.controlArea, self, "svmtype", [], box="SVM Type",
            orientation=form, callback=self.settings_changed)

        form.addWidget(gui.appendRadioButton(box, "C-SVM", addToLayout=False),
                       0, 0, Qt.AlignLeft)
        form.addWidget(QtGui.QLabel("Cost (C):"),
                       0, 1, Qt.AlignRight)
        form.addWidget(gui.doubleSpin(box, self, "C", 1e-3, 1000.0, 0.1,
                                      decimals=3, alignment=Qt.AlignRight,
                                      controlWidth=80, addToLayout=False,
                                      callback=self.settings_changed),
                       0, 2)

        form.addWidget(gui.appendRadioButton(box, "ν-SVM", addToLayout=False),
                       1, 0, Qt.AlignLeft)
        form.addWidget(QtGui.QLabel("Complexity (ν):"),
                       1, 1, Qt.AlignRight)
        form.addWidget(gui.doubleSpin(box, self, "nu", 0.05, 1.0, 0.05,
                                      decimals=2, alignment=Qt.AlignRight,
                                      controlWidth=80, addToLayout=False,
                                      callback=self.settings_changed),
                       1, 2)

    def _add_optimization_box(self):
        """Extend the inherited optimization box with the iteration limit."""
        super()._add_optimization_box()
        gui.spin(self.optimization_box, self, "max_iter", 50, 1e6, 50,
                 label="Iteration limit:", checked="limit_iter",
                 alignment=Qt.AlignRight, controlWidth=100,
                 callback=self.settings_changed)

    def create_learner(self):
        """Return a learner configured from the current settings.

        ``SVMLearner`` is used for C-SVM (``svmtype == 0``) and
        ``NuSVMLearner`` otherwise. ``max_iter`` is passed as -1 when the
        iteration limit is unchecked. Probability estimates are always
        requested, independently of the ``probability`` setting.
        """
        kernel = ["linear", "poly", "rbf", "sigmoid"][self.kernel_type]
        common_args = dict(
            kernel=kernel,
            degree=self.degree,
            gamma=self.gamma,
            coef0=self.coef0,
            tol=self.tol,
            max_iter=self.max_iter if self.limit_iter else -1,
            probability=True,
            preprocessors=self.preprocessors
        )
        if self.svmtype == 0:
            return SVMLearner(C=self.C, **common_args)
        else:
            return NuSVMLearner(nu=self.nu, **common_args)

    def get_learner_parameters(self):
        """Return an ordered mapping of report labels to parameter values."""
        items = OrderedDict()
        if self.svmtype == 0:
            items["SVM type"] = "C-SVM, C={}".format(self.C)
        else:
            items["SVM type"] = "ν-SVM, ν={}".format(self.nu)
        self._report_kernel_parameters(items)
        items["Numerical tolerance"] = "{:.6}".format(self.tol)
        items["Iteration limit"] = \
            self.max_iter if self.limit_iter else "unlimited"
        return items
Esempio n. 14
0
class OWBaseSVM(OWBaseLearner):
    """Shared kernel and optimization controls for SVM learner widgets.

    Subclasses are expected to provide ``_add_type_box``, which
    ``add_main_layout`` calls before building the kernel box.
    """
    #: Kernel types
    Linear, Poly, RBF, Sigmoid = 0, 1, 2, 3
    #: Selected kernel type
    kernel_type = settings.Setting(RBF)
    #: kernel degree
    degree = settings.Setting(3)
    #: gamma
    gamma = settings.Setting(1.0)
    #: coef0 (additive constant)
    coef0 = settings.Setting(0.0)

    #: numerical tolerance
    tol = settings.Setting(0.001)

    #: (button label, displayed equation) for each kernel type
    kernels = (("Linear", "x⋅y"),
               ("Polynomial", "(g x⋅y + c)<sup>d</sup>"),
               ("RBF", "exp(-g|x-y|²)"),
               ("Sigmoid", "tanh(g x⋅y + c)"))

    def _add_kernel_box(self):
        """Build the "Kernel" box: kernel radio buttons plus parameter spins."""
        # Initialize with the widest label to measure max width
        self.kernel_eq = self.kernels[-1][1]

        kernel_box = gui.hBox(self.controlArea, "Kernel")

        kernel_labels = [label for label, _ in self.kernels]
        self.kernel_box = radio_box = gui.radioButtonsInBox(
            kernel_box, self, "kernel_type", btnLabels=kernel_labels,
            callback=self._on_kernel_changed, addSpace=20)
        radio_box.layout().setSpacing(10)
        gui.rubber(radio_box)

        param_box = gui.vBox(kernel_box)
        gui.label(param_box, self, "Kernel: %(kernel_eq)s")
        spin_options = {
            "orientation": Qt.Horizontal,
            "callback": self.settings_changed,
            "alignment": Qt.AlignRight,
            "controlWidth": 80,
        }
        spin_row = gui.hBox(param_box)
        gui.rubber(spin_row)
        spin_column = gui.vBox(spin_row)
        gamma_spin = gui.doubleSpin(
            spin_column, self, "gamma", 0.0, 10.0, 0.01, label=" g: ",
            **spin_options)
        coef0_spin = gui.doubleSpin(
            spin_column, self, "coef0", 0.0, 10.0, 0.01, label=" c: ",
            **spin_options)
        degree_spin = gui.doubleSpin(
            spin_column, self, "degree", 0.0, 10.0, 0.5, label=" d: ",
            **spin_options)
        # Order matters: _show_right_kernel pairs these with its mask columns.
        self._kernel_params = [gamma_spin, coef0_spin, degree_spin]
        gui.rubber(param_box)

        # This is the maximal height (all double spins are visible)
        # and the maximal width (the label is initialized to the widest one)
        kernel_box.layout().activate()
        kernel_box.setFixedHeight(kernel_box.sizeHint().height())
        kernel_box.setMinimumWidth(kernel_box.sizeHint().width())

    def _add_optimization_box(self):
        """Build the "Optimization Parameters" box with the tolerance spin."""
        self.optimization_box = gui.vBox(
            self.controlArea, "Optimization Parameters")
        gui.doubleSpin(
            self.optimization_box, self, "tol", 1e-6, 1.0, 1e-5,
            label="Numerical tolerance:",
            decimals=6, alignment=Qt.AlignRight, controlWidth=100,
            callback=self.settings_changed)

    def add_main_layout(self):
        """Assemble the type, kernel and optimization boxes in that order."""
        self._add_type_box()
        self._add_kernel_box()
        self._add_optimization_box()
        self._show_right_kernel()

    def _show_right_kernel(self):
        """Show the equation and parameter spins of the selected kernel."""
        # One row per kernel type; columns follow self._kernel_params
        # (gamma, coef0, degree).
        visibility = ((False, False, False),  # linear
                      (True, True, True),  # poly
                      (True, False, False),  # rbf
                      (True, True, False))  # sigmoid

        self.kernel_eq = self.kernels[self.kernel_type][1]
        shown = visibility[self.kernel_type]
        for spin, visible in zip(self._kernel_params, shown):
            if visible:
                spin.box.show()
            else:
                spin.box.hide()

    def _on_kernel_changed(self):
        """Refresh the parameter spins, then signal a settings change."""
        self._show_right_kernel()
        self.settings_changed()

    def _report_kernel_parameters(self, items):
        """Store a description of the selected kernel under ``items["Kernel"]``."""
        kernel = self.kernel_type
        if kernel == self.Linear:
            description = "Linear"
        elif kernel == self.Poly:
            description = \
                "Polynomial, ({g:.4} x⋅y + {c:.4})<sup>{d}</sup>".format(
                    g=self.gamma, c=self.coef0, d=self.degree)
        elif kernel == self.RBF:
            description = "RBF, exp(-{:.4}|x-y|²)".format(self.gamma)
        else:
            description = "Sigmoid, tanh({g:.4} x⋅y + {c:.4})".format(
                g=self.gamma, c=self.coef0)
        items["Kernel"] = description

    def update_model(self):
        """Update the model, then send its support vectors (or ``None``)."""
        super().update_model()

        support_vectors = (
            self.data[self.model.skl_model.support_]
            if self.valid_data else None)
        self.send("Support vectors", support_vectors)
Esempio n. 15
0
class OWLogisticRegression(OWBaseLearner):
    """Logistic regression learner widget with L1 or L2 regularization.

    The regularization strength is chosen with a slider whose position
    indexes into the fixed list of C values ``C_s``.
    """
    name = "Logistic Regression"
    description = "The logistic regression classification algorithm with " \
                  "LASSO (L1) or ridge (L2) regularization."
    icon = "icons/LogisticRegression.svg"
    priority = 60

    LEARNER = LogisticRegressionLearner

    outputs = [("Coefficients", Table)]

    # Index into penalty_types
    penalty_type = settings.Setting(1)
    # Index into C_s
    C_index = settings.Setting(61)

    # Selectable C values in decreasing order, with finer steps for small C.
    C_s = list(chain(range(1000, 200, -50),
                     range(200, 100, -10),
                     range(100, 20, -5),
                     range(20, 0, -1),
                     [x / 10 for x in range(9, 2, -1)],
                     [x / 100 for x in range(20, 2, -1)],
                     [x / 1000 for x in range(20, 0, -1)]))
    # Fixed learner arguments that have no control in the widget.
    dual = False
    tol = 0.0001
    fit_intercept = True
    intercept_scaling = 1.0

    penalty_types = ("Lasso (L1)", "Ridge (L2)")

    def add_main_layout(self):
        """Build the regularization type combo and the strength slider."""
        main_box = gui.widgetBox(self.controlArea, box=True)
        gui.comboBox(main_box, self, "penalty_type",
                     label="Regularization type: ",
                     items=self.penalty_types, orientation=Qt.Horizontal,
                     addSpace=4, callback=self.settings_changed)
        gui.widgetLabel(main_box, "Strength:")

        slider_row = gui.hBox(gui.indentedBox(main_box))
        gui.widgetLabel(slider_row, "Weak").setStyleSheet("margin-top:6px")
        gui.hSlider(slider_row, self, "C_index",
                    minValue=0, maxValue=len(self.C_s) - 1,
                    callback=self.set_c, createLabel=False)
        gui.widgetLabel(slider_row, "Strong").setStyleSheet("margin-top:6px")

        label_row = gui.hBox(main_box)
        label_row.layout().setAlignment(Qt.AlignCenter)
        self.c_label = gui.widgetLabel(label_row)
        self.set_c()

    def set_c(self):
        """Derive ``C`` from the slider position and refresh its label."""
        self.C = self.C_s[self.C_index]
        # Values of at least 1 are shown as they are, smaller ones with
        # three decimals.
        if self.C >= 1:
            text = "C={}".format(self.C)
        else:
            text = "C={:.3f}".format(self.C)
        self.c_label.setText(text)
        self.settings_changed()

    def create_learner(self):
        """Return a ``LEARNER`` instance configured from the current state."""
        learner_args = {
            "penalty": ["l1", "l2"][self.penalty_type],
            "dual": self.dual,
            "tol": self.tol,
            "C": self.C,
            "fit_intercept": self.fit_intercept,
            "intercept_scaling": self.intercept_scaling,
            "preprocessors": self.preprocessors,
        }
        return self.LEARNER(**learner_args)

    def update_model(self):
        """Update the model, then send its coefficient table (or ``None``)."""
        super().update_model()
        coefficients = (
            create_coef_table(self.model) if self.valid_data else None)
        self.send("Coefficients", coefficients)

    def get_learner_parameters(self):
        """Return the report entries describing the regularization."""
        penalty_name = self.penalty_types[self.penalty_type]
        strength = self.C_s[self.C_index]
        summary = "{}, C={}".format(penalty_name, strength)
        return (("Regularization", summary),)
class OWParallelCoordinates(widget.OWWidget):
    """Widget showing a parallel-coordinates plot of the input table.

    Rows are selected through value intervals (``constraint_range``) on the
    displayed axes. The order of the selected dimensions can be optimized by
    maximizing the cumulative Kendall rank correlation of neighbouring axes.
    """
    name = "Parallel Coordinates"
    description = "Parallel coordinates display of multi-dimensional data."
    icon = "icons/ParallelCoordinates.svg"
    priority = 900
    inputs = [("Data", Table, 'set_data', widget.Default),
              ("Features", widget.AttributeList, 'set_shown_attributes')]
    outputs = [("Selected Data", Table, widget.Default),
               ("Annotated Data", Table), ("Features", widget.AttributeList)]

    graph_name = 'graph'
    settingsHandler = settings.DomainContextHandler()

    autocommit = settings.Setting(True)
    selected_attrs = settings.ContextSetting([])
    color_attr = settings.ContextSetting('')
    constraint_range = settings.ContextSetting({})

    UserAdviceMessages = [
        widget.Message(
            'You can select subsets of data based on value intervals '
            'by dragging on the corresponding dimensions\' axes.\n\n'
            'You can reset the selection by clicking somewhere '
            'outside the selected interval on the axis.', 'subset-selection')
    ]

    class Warning(widget.OWWidget.Warning):
        too_many_selected_dimensions = widget.Msg(
            'Too many dimensions selected ({}). Only first {} shown.')

    class Information(widget.OWWidget.Information):
        dataset_sampled = widget.Msg('Showing a random sample of your data.')

    # Inclusive (min, max) number of dimensions the optimizer accepts.
    OPTIMIZATION_N_DIMS = (3, 9)
    # Maximal number of dimensions that are plotted.
    MAX_N_DIMS = 20

    def __init__(self):
        super().__init__()
        # Initialised here so that callbacks firing before the first
        # set_data call (e.g. the Apply button) find the attributes.
        self.data = None
        self.sample = None

        self.graph = ParallelCoordinates(self)
        self.mainArea.layout().addWidget(self.graph)

        self.model = DomainModel(separators=False,
                                 valid_types=DomainModel.PRIMITIVE)
        self.colormodel = DomainModel(valid_types=DomainModel.PRIMITIVE)

        box = gui.vBox(self.controlArea, 'Lines')
        combo = gui.comboBox(box,
                             self,
                             'color_attr',
                             sendSelectedValue=True,
                             label='Color:',
                             orientation=Qt.Horizontal,
                             callback=self.update_plot)
        combo.setModel(self.colormodel)

        box = gui.vBox(self.controlArea, 'Dimensions')
        view = gui.listView(box,
                            self,
                            'selected_attrs',
                            model=self.model,
                            callback=self.update_plot)
        view.setSelectionMode(view.ExtendedSelection)
        # Prevent drag selection. Otherwise, each new addition to selection
        # the mouse passes over triggers a webview redraw. Sending lots of data
        # around multiple times on large datasets results in stalling and crashes.
        view.mouseMoveEvent = (
            lambda event: None
            if view.state() == view.DragSelectingState else super(
                view.__class__, view).mouseMoveEvent(event))

        self.optimize_button = gui.button(
            box,
            self,
            'Optimize Selected Dimensions',
            callback=self.optimize,
            tooltip='Optimize visualized dimensions by maximizing cumulative '
            'Kendall rank correlation coefficient.')

        gui.auto_commit(self.controlArea, self, 'autocommit', '&Apply')

    def set_data(self, data):
        """Handle a new table on the "Data" input.

        Resets the plot and the selection, fills both domain models,
        preselects the first dimensions and the first class variable as
        colour, then re-opens the context and updates plot and outputs.

        :param data: the incoming table, or ``None`` when the input is removed
        """
        self.data = data
        self.graph.clear()

        self.closeContext()

        model = self.model
        colormodel = self.colormodel

        self.sample = None
        # An empty list rather than None: the selection is measured with
        # len() and wrapped in an AttributeList even when there is no data.
        self.selected_attrs = []
        self.color_attr = None

        N_SAMPLE = 2000

        if data is not None and len(data) and len(data.domain):
            # Plot everything, or a random subset of large tables.
            self.sample = slice(
                None) if len(data) <= N_SAMPLE else np.random.choice(
                    np.arange(len(data)), N_SAMPLE, replace=False)
            model.set_domain(data.domain)
            colormodel.set_domain(data.domain)
            self.color_attr = try_(lambda: data.domain.class_vars[0].name,
                                   None)
            selected_attrs = (model.data(model.index(i, 0)) for i in range(
                min(self.OPTIMIZATION_N_DIMS[1], model.rowCount())))
            self.selected_attrs = [
                attr for attr in selected_attrs if isinstance(attr, str)
            ]
        else:
            model.set_domain(None)
            colormodel.set_domain(None)

        self.Information.dataset_sampled(
            shown=False if data is None else len(data) > N_SAMPLE)

        # There is no domain to open a context for once the input is removed.
        if data is not None:
            self.openContext(data.domain)

        self.update_plot()
        self.commit()

    def clear(self):
        """Clear the plot and refresh the outputs."""
        self.graph.clear()
        self.commit()

    def update_plot(self):
        """Redraw the plot from the selected dimensions and colour attribute.

        With no data (or an empty table) the plot is cleared instead. At most
        ``MAX_N_DIMS`` dimensions are drawn; a warning is shown when more are
        selected.
        """
        data = self.data
        if data is None or not len(data):
            self.clear()
            return

        self.optimize_button.setDisabled(not self.is_optimization_valid())

        self.Warning.too_many_selected_dimensions(
            len(self.selected_attrs),
            self.MAX_N_DIMS,
            shown=len(self.selected_attrs) > self.MAX_N_DIMS)
        selected_attrs = self.selected_attrs[:self.MAX_N_DIMS]

        sample = self.sample

        dimensions = []
        for attr in selected_attrs:
            attr = data.domain[attr]
            values = data.get_column_view(attr)[0][sample]
            dim = dict(label=attr.name,
                       values=values,
                       constraintrange=self.constraint_range.get(attr.name))
            if attr.is_discrete:
                dim.update(tickvals=np.arange(len(attr.values)),
                           ticktext=attr.values)
            elif isinstance(attr, TimeVariable):
                # Label time axes at the minimum, median and maximum.
                tickvals = [
                    np.nanmin(values),
                    np.nanmedian(values),
                    np.nanmax(values)
                ]
                ticktext = [attr.repr_val(i) for i in tickvals]
                dim.update(tickvals=tickvals, ticktext=ticktext)
            dimensions.append(dim)

        # Compute color legend
        line = dict()
        padding_right = 40
        if self.color_attr:
            attr = data.domain[self.color_attr]
            values = data.get_column_view(attr)[0][sample]
            line.update(color=values, showscale=True)
            title = '<br>'.join(
                textwrap.wrap(attr.name.strip(),
                              width=7,
                              max_lines=4,
                              placeholder='…'))
            if attr.is_discrete:
                padding_right = 90
                colors = [color_to_hex(i) for i in attr.colors]
                values_short = [
                    textwrap.fill(value, width=9, max_lines=1, placeholder='…')
                    for value in attr.values
                ]
                self.graph.exposeObject(
                    'discrete_colorbar',
                    dict(colors=colors,
                         title=title,
                         values=attr.values,
                         values_short=values_short))
                line.update(showscale=False,
                            colorscale=list(
                                zip(np.linspace(0, 1, len(attr.values)),
                                    colors)))
            else:
                padding_right = 0
                self.graph.exposeObject('discrete_colorbar', {})
                line.update(colorscale=list(
                    zip((0, 1), (color_to_hex(i) for i in attr.colors[:-1]))),
                            colorbar=dict(title=title))
                if isinstance(attr, TimeVariable):
                    tickvals = [
                        np.nanmin(values),
                        np.nanmedian(values),
                        np.nanmax(values)
                    ]
                    ticktext = [attr.repr_val(i) for i in tickvals]
                    line.update(colorbar=dict(title=title,
                                              tickangle=-90,
                                              tickvals=tickvals,
                                              ticktext=ticktext))
        self.graph.plot([Parcoords(line=line, dimensions=dimensions)],
                        padding_right=padding_right)

    def set_shown_attributes(self, attrs):
        """Handle the "Features" input by replacing the selected dimensions.

        :param attrs: the attributes to show, or ``None`` when the input is
            removed (treated as an empty selection)
        """
        self.selected_attrs = attrs if attrs is not None else []
        self.update_plot()

    def commit(self):
        """Send selected data, annotated data and the shown features.

        A row is selected when its value lies inside the constraint interval
        of every constrained attribute. Without data both table outputs
        receive ``None``.
        """
        selected_data, annotated_data = None, None
        data = self.data
        if data is not None and len(data):

            mask = np.ones(len(data), dtype=bool)
            for attr, (low, high) in self.constraint_range.items():
                values = data.get_column_view(attr)[0]
                mask &= (values >= low) & (values <= high)

            selected_data = data[mask]
            annotated_data = create_annotated_table(data, mask)

        self.send('Selected Data', selected_data)
        self.send('Annotated Data', annotated_data)
        self.send('Features', widget.AttributeList(self.selected_attrs))

    def is_optimization_valid(self):
        """Return True when the number of selected dimensions can be optimized."""
        return (self.OPTIMIZATION_N_DIMS[0] <= len(self.selected_attrs) <=
                self.OPTIMIZATION_N_DIMS[1])

    def optimize(self):
        """ Optimizes the order of selected dimensions. """
        data = self.data
        if data is None or not len(data):
            return

        if not self.is_optimization_valid():
            QMessageBox(
                QMessageBox.Warning, "Parallel Coordinates Optimization",
                "Can only optimize when the number of selected dimensions "
                "is between {} and {}. "
                "Sorry.".format(*self.OPTIMIZATION_N_DIMS), QMessageBox.Abort,
                self).exec()
            return

        self.optimize_button.blockSignals(True)
        try:
            R = {}
            Rc = {}
            # Correlations are estimated on at most 300 random rows.
            sample = slice(None) if len(data) < 300 else np.random.choice(
                np.arange(len(data)), 300, replace=False)

            for attr1 in self.selected_attrs:
                if self.color_attr:
                    Rc[attr1] = kendalltau(
                        data.get_column_view(attr1)[0][sample],
                        data.get_column_view(self.color_attr)[0][sample],
                        nan_policy='omit')[0]
                for attr2 in self.selected_attrs:
                    if (attr1, attr2) in R or attr1 == attr2:
                        continue
                    R[(attr1, attr2)] = R[(attr2, attr1)] = \
                        kendalltau(data.get_column_view(attr1)[0][sample],
                                   data.get_column_view(attr2)[0][sample],
                                   nan_policy='omit')[0]

            # First dimension is the one with the highest correlation with the
            # color attribute; the last dimension the one with the lowest
            # correlation with the first dimension.
            # If there is no color attribute, first and last are the two
            # dimensions with the lowest correlation.
            # In either case, the rest are filled in in the order of maximal
            # cumulative correlation.
            if self.color_attr:
                head = max(Rc.items(), key=lambda i: i[1])[0]
                tail = min(
                    ((key, value) for key, value in R.items()
                     if key[0] == head),
                    key=lambda i: i[1])[0][1]
            else:
                head, tail = min(R.items(), key=lambda i: i[1])[0]

            def cumsum(permutation):
                return sum(R[(attr1, attr2)]
                           for attr1, attr2 in pairwise((head, ) +
                                                        permutation +
                                                        (tail, )))

            body = max(itertools.permutations(
                set(self.selected_attrs) - {head, tail}),
                       key=cumsum)

            self.selected_attrs = (head, ) + body + (tail, )
            self.update_plot()
        finally:
            # Always unblock, even if the computation above raised.
            self.optimize_button.blockSignals(False)

    def send_report(self):
        """Add the shown dimensions, the colour attribute and the plot to the report."""
        self.report_items((('Dimensions', list(self.selected_attrs)),
                           ('Color', self.color_attr)))
        self.report_plot()
Esempio n. 17
0
class OWSilhouettePlot(widget.OWWidget):
    """
    Widget that computes per-instance silhouette scores for a chosen
    discrete cluster variable and displays them in a `SilhouettePlot`.

    Instances selected in the plot are sent on "Selected Data", all other
    instances on "Other Data". When `add_scores` is enabled the scores are
    appended to both outputs as a continuous meta column.
    """
    name = "Silhouette Plot"
    description = "Silhouette Plot"

    icon = "icons/Silhouette.svg"

    inputs = [("Data", Orange.data.Table, "set_data")]
    outputs = [("Selected Data", Orange.data.Table, widget.Default),
               ("Other Data", Orange.data.Table)]

    replaces = ["orangecontrib.prototypes.widgets.owsilhouetteplot.OWSilhouettePlot"]

    settingsHandler = settings.PerfectDomainContextHandler()

    #: Distance metric index
    distance_idx = settings.Setting(0)
    #: Group/cluster variable index
    cluster_var_idx = settings.ContextSetting(0)
    #: Annotation variable index
    annotation_var_idx = settings.ContextSetting(0)
    #: Group the silhouettes by cluster
    group_by_cluster = settings.Setting(True)
    #: A fixed size for an instance bar
    bar_size = settings.Setting(3)
    #: Add silhouette scores to output data
    add_scores = settings.Setting(False)
    auto_commit = settings.Setting(False)

    #: Available distance metrics as (display name, distance constructor)
    Distances = [("Euclidean", Orange.distance.Euclidean),
                 ("Manhattan", Orange.distance.Manhattan)]

    def __init__(self):
        """Initialise the widget state and build the control and plot areas."""
        super().__init__()

        # Input data and everything derived from it; all reset by `clear`.
        self.data = None
        self._effective_data = None
        self._matrix = None
        self._silhouette = None
        self._labels = None
        self._silplot = None

        box = gui.widgetBox(self.controlArea, "Settings",)
        gui.comboBox(box, self, "distance_idx", label="Distance",
                     items=[name for name, _ in OWSilhouettePlot.Distances],
                     callback=self._invalidate_distances)
        self.cluster_var_cb = gui.comboBox(
            box, self, "cluster_var_idx", label="Cluster",
            callback=self._invalidate_scores)
        self.cluster_var_model = itemmodels.VariableListModel(parent=self)
        self.cluster_var_cb.setModel(self.cluster_var_model)

        gui.spin(box, self, "bar_size", minv=1, maxv=10, label="Bar Size",
                 callback=self._update_bar_size)

        gui.checkBox(box, self, "group_by_cluster", "Group by cluster",
                     callback=self._replot)

        self.annotation_cb = gui.comboBox(
            box, self, "annotation_var_idx", label="Annotations",
            callback=self._update_annotations)
        self.annotation_var_model = itemmodels.VariableListModel(parent=self)
        # Index 0 is always the "None" placeholder (no annotations).
        self.annotation_var_model[:] = ["None"]
        self.annotation_cb.setModel(self.annotation_var_model)

        gui.rubber(self.controlArea)

        box = gui.widgetBox(self.controlArea, "Output")
        gui.checkBox(box, self, "add_scores", "Add silhouette scores",)
        gui.auto_commit(box, self, "auto_commit", "Commit", box=False)

        self.scene = QtGui.QGraphicsScene()
        self.view = QtGui.QGraphicsView(self.scene)
        self.view.setRenderHint(QtGui.QPainter.Antialiasing, True)
        self.view.setAlignment(Qt.AlignTop | Qt.AlignLeft)
        self.mainArea.layout().addWidget(self.view)

    def sizeHint(self):
        """Return the control area's size hint expanded to at least 600x720."""
        sh = self.controlArea.sizeHint()
        return sh.expandedTo(QtCore.QSize(600, 720))

    @check_sql_input
    def set_data(self, data):
        """
        Set the input data set.

        The input is rejected (treated as None, with an error message) when
        it has no discrete variable with at least two values to serve as
        cluster labels, or when it has no continuous attributes. A warning is
        shown when some attributes are discrete.
        """
        self.closeContext()
        self.clear()
        error_msg = ""
        warning_msg = ""
        candidatevars = []
        if data is not None:
            candidatevars = [v for v in data.domain.variables + data.domain.metas
                             if v.is_discrete and len(v.values) >= 2]
            if not candidatevars:
                error_msg = "Input does not have any suitable cluster labels."
                data = None

        if data is not None:
            ncont = sum(v.is_continuous for v in data.domain.attributes)
            ndiscrete = len(data.domain.attributes) - ncont
            if ncont == 0:
                data = None
                error_msg = "No continuous columns"
            elif ncont < len(data.domain.attributes):
                warning_msg = "{0} discrete columns will not be used for " \
                              "distance computation".format(ndiscrete)

        self.data = data
        if data is not None:
            self.cluster_var_model[:] = candidatevars
            # Default to the class variable when it is a usable label.
            if data.domain.class_var in candidatevars:
                self.cluster_var_idx = candidatevars.index(data.domain.class_var)
            else:
                self.cluster_var_idx = 0

            annotvars = [var for var in data.domain.metas if var.is_string]
            self.annotation_var_model[:] = ["None"] + annotvars
            # Index 0 is "None"; pick the first string meta when there is one.
            self.annotation_var_idx = 1 if len(annotvars) else 0
            self._effective_data = Orange.distance._preprocess(data)
            self.openContext(Orange.data.Domain(candidatevars))

        self.error(0, error_msg)
        self.warning(0, warning_msg)

    def handleNewSignals(self):
        """Recompute and redraw for the new input, then commit the output."""
        if self._effective_data is not None:
            self._update()
            self._replot()

        self.unconditional_commit()

    def clear(self):
        """
        Clear the widget state.
        """
        self.data = None
        self._effective_data = None
        self._matrix = None
        self._silhouette = None
        self._labels = None
        self.cluster_var_model[:] = []
        self.annotation_var_model[:] = ["None"]
        self._clear_scene()

    def _clear_scene(self):
        # Clear the graphics scene and associated objects
        self.scene.clear()
        self.scene.setSceneRect(QRectF())
        self._silplot = None

    def _invalidate_distances(self):
        # Invalidate the computed distance matrix and recompute the silhouette.
        self._matrix = None
        self._invalidate_scores()

    def _invalidate_scores(self):
        # Invalidate and recompute the current silhouette scores.
        self._labels = self._silhouette = None
        self._update()
        self._replot()
        if self.data is not None:
            self.commit()

    def _update(self):
        # Update/recompute the distances/scores as required
        if self.data is None:
            self._silhouette = None
            self._labels = None
            self._matrix = None
            self._clear_scene()
            return

        # The distance matrix is cached until `_invalidate_distances`.
        if self._matrix is None and self._effective_data is not None:
            _, metric = self.Distances[self.distance_idx]
            self._matrix = numpy.asarray(metric(self._effective_data))

        labelvar = self.cluster_var_model[self.cluster_var_idx]
        labels, _ = self.data.get_column_view(labelvar)
        # NOTE(review): missing labels (NaN) would be cast to arbitrary
        # integers here -- confirm the label column is always complete.
        labels = labels.astype(int)
        _, counts = numpy.unique(labels, return_counts=True)
        if numpy.count_nonzero(counts) >= 2:
            self.error(1, "")
            silhouette = sklearn.metrics.silhouette_samples(
                self._matrix, labels, metric="precomputed")
        else:
            self.error(1, "Need at least 2 clusters with non zero counts")
            labels = silhouette = None

        self._labels = labels
        self._silhouette = silhouette

    def _replot(self):
        # Clear and replot/initialize the scene
        self._clear_scene()
        if self._silhouette is not None and self._labels is not None:
            var = self.cluster_var_model[self.cluster_var_idx]
            silplot = SilhouettePlot()
            silplot.setBarHeight(self.bar_size)
            silplot.setRowNamesVisible(self.bar_size >= 5)

            if self.group_by_cluster:
                silplot.setScores(self._silhouette, self._labels, var.values)
            else:
                # Ungrouped: put every instance into one unnamed group.
                silplot.setScores(
                    self._silhouette,
                    numpy.zeros(len(self._silhouette), dtype=int),
                    [""]
                )

            self.scene.addItem(silplot)
            self._silplot = silplot
            self._update_annotations()

            silplot.resize(silplot.effectiveSizeHint(Qt.PreferredSize))
            silplot.selectionChanged.connect(self.commit)

            self.scene.setSceneRect(
                QRectF(QtCore.QPointF(0, 0),
                       self._silplot.effectiveSizeHint(Qt.PreferredSize)))

    def _update_bar_size(self):
        # Apply the new bar size to the existing plot and refit the scene.
        if self._silplot is not None:
            self._silplot.setBarHeight(self.bar_size)
            self._silplot.setRowNamesVisible(self.bar_size >= 5)

            self.scene.setSceneRect(
                QRectF(QtCore.QPointF(0, 0),
                       self._silplot.effectiveSizeHint(Qt.PreferredSize)))

    def _update_annotations(self):
        # Set the plot's row names from the chosen annotation variable;
        # index 0 ("None") or an out-of-range index clears them.
        if 0 < self.annotation_var_idx < len(self.annotation_var_model):
            annot_var = self.annotation_var_model[self.annotation_var_idx]
        else:
            annot_var = None

        if self._silplot is not None:
            if annot_var is not None:
                column, _ = self.data.get_column_view(annot_var)
                self._silplot.setRowNames(
                    [annot_var.str_val(value) for value in column])
            else:
                self._silplot.setRowNames(None)

    def commit(self):
        """
        Commit/send the current selection to the output.

        With `add_scores` enabled, a "Silhouette (<cluster var>)" meta
        column is added to both outputs. When no scores are available
        (`_silhouette` is None, e.g. fewer than two clusters) the column is
        still part of the output domain but is left as created by
        `from_table` (presumably unknown values) instead of raising.
        """
        selected = other = None
        if self.data is not None:
            selectedmask = numpy.full(len(self.data), False, dtype=bool)
            if self._silplot is not None:
                indices = self._silplot.selection()
                selectedmask[indices] = True
            scores = self._silhouette
            silhouette_var = None
            if self.add_scores:
                var = self.cluster_var_model[self.cluster_var_idx]
                silhouette_var = Orange.data.ContinuousVariable(
                    "Silhouette ({})".format(escape(var.name)))
                domain = Orange.data.Domain(
                    self.data.domain.attributes,
                    self.data.domain.class_vars,
                    self.data.domain.metas + (silhouette_var, ))
            else:
                domain = self.data.domain

            if numpy.count_nonzero(selectedmask):
                selected = self.data.from_table(
                    domain, self.data, numpy.flatnonzero(selectedmask))

            if numpy.count_nonzero(~selectedmask):
                other = self.data.from_table(
                    domain, self.data, numpy.flatnonzero(~selectedmask))

            # `scores` is None when `_update` could not compute a
            # silhouette; indexing it would raise TypeError.
            if self.add_scores and scores is not None:
                if selected is not None:
                    selected[:, silhouette_var] = numpy.c_[scores[selectedmask]]
                if other is not None:
                    other[:, silhouette_var] = numpy.c_[scores[~selectedmask]]

        self.send("Selected Data", selected)
        self.send("Other Data", other)

    def onDeleteWidget(self):
        """Release the data and scene items before the widget is deleted."""
        self.clear()
        super().onDeleteWidget()
Esempio n. 18
0
class OWDistributions(OWWidget):
    """
    Widget that plots the value distribution of one variable, optionally
    split by a discrete variable and overlaid with a fitted distribution.
    """
    name = "Distributions"
    description = "Display value distributions of a data feature in a graph."
    icon = "icons/Distribution.svg"
    priority = 120
    keywords = []

    class Inputs:
        data = Input("Data", Table, doc="Set the input dataset")

    class Outputs:
        selected_data = Output("Selected Data", Table, default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
        histogram_data = Output("Histogram Data", Table)

    class Error(OWWidget.Error):
        # Raised by set_valid_data when no finite values remain.
        no_defined_values_var = \
            Msg("Variable '{}' does not have any defined values")
        no_defined_values_pair = \
            Msg("No data instances with '{}' and '{}' defined")

    class Warning(OWWidget.Warning):
        # Raised by set_valid_data when some, but not all, rows are dropped.
        ignored_nans = Msg("Data instances with missing values are ignored")

    settingsHandler = settings.DomainContextHandler()
    # Plotted variable and the optional "split by" variable.
    var = settings.ContextSetting(None)
    cvar = settings.ContextSetting(None)
    selection = settings.ContextSetting(set(), schema_only=True)
    # number_of_bins must be a context setting because selection depends on it
    number_of_bins = settings.ContextSetting(5, schema_only=True)

    # Index into Fitters; 0 ("None") means no fitted curve.
    fitted_distribution = settings.Setting(0)
    hide_bars = settings.Setting(False)
    show_probs = settings.Setting(False)
    stacked_columns = settings.Setting(False)
    cumulative_distr = settings.Setting(False)
    kde_smoothing = settings.Setting(10)

    auto_apply = settings.Setting(True)

    graph_name = "plot"

    # Each entry is (display name, distribution object, names of the values
    # returned by its fit(), labels used to describe those values).
    # In _fit_approximation, labels starting with "-" are listed in
    # parentheses (without the dash) and empty labels are omitted.
    Fitters = (("None", None, (),
                ()), ("Normal", norm, ("loc", "scale"),
                      ("μ", "σ²")), ("Beta", beta, ("a", "b", "loc", "scale"),
                                     ("α", "β", "-loc", "-scale")),
               ("Gamma", gamma, ("a", "loc", "scale"), ("α", "β", "-loc",
                                                        "-scale")),
               ("Rayleigh", rayleigh, ("loc", "scale"),
                ("-loc", "σ²")), ("Pareto", pareto, ("b", "loc", "scale"),
                                  ("α", "-loc", "-scale")),
               ("Exponential", expon, ("loc", "scale"),
                ("-loc", "λ")), ("Kernel density", AshCurve, ("a", ), ("", )))

    # Values for self.drag_operation.
    DragNone, DragAdd, DragRemove = range(3)

    def __init__(self):
        """Initialise the widget state and build the controls and plots."""
        super().__init__()
        self.data = None
        # Finite values of `var` (and matching values of `cvar`), filled in
        # by set_valid_data; None when there is nothing to plot.
        self.valid_data = self.valid_group_data = None
        self.bar_items = []
        self.curve_items = []
        self.curve_descriptions = None
        self.binnings = []

        self.last_click_idx = None
        self.drag_operation = self.DragNone
        self.key_operation = None
        # Bin choice remembered per variable (see _on_bin_slider_released).
        self._user_var_bins = {}

        gui.listView(self.controlArea,
                     self,
                     "var",
                     box="Variable",
                     model=DomainModel(valid_types=DomainModel.PRIMITIVE,
                                       separators=False),
                     callback=self._on_var_changed)

        box = self.continuous_box = gui.vBox(self.controlArea, "Distribution")
        # self.binnings is still empty here, so the slider starts with a
        # maximum of 1.
        slider = gui.hSlider(box,
                             self,
                             "number_of_bins",
                             label="Bin width",
                             orientation=Qt.Horizontal,
                             minValue=0,
                             maxValue=max(1,
                                          len(self.binnings) - 1),
                             createLabel=False,
                             callback=self._on_bins_changed)
        self.bin_width_label = gui.widgetLabel(slider.box)
        self.bin_width_label.setFixedWidth(35)
        self.bin_width_label.setAlignment(Qt.AlignRight)
        # The slider callback fires while dragging; outputs are only
        # recomputed when the slider is released.
        slider.sliderReleased.connect(self._on_bin_slider_released)
        gui.comboBox(box,
                     self,
                     "fitted_distribution",
                     label="Fitted distribution",
                     orientation=Qt.Horizontal,
                     items=(name[0] for name in self.Fitters),
                     callback=self._on_fitted_dist_changed)
        self.smoothing_box = gui.indentedBox(box, 40)
        gui.hSlider(self.smoothing_box,
                    self,
                    "kde_smoothing",
                    label="Smoothing",
                    orientation=Qt.Horizontal,
                    minValue=2,
                    maxValue=20,
                    callback=self.replot)
        # Hiding bars is only available while a distribution is fitted.
        gui.checkBox(box,
                     self,
                     "hide_bars",
                     "Hide bars",
                     stateWhenDisabled=False,
                     callback=self._on_hide_bars_changed,
                     disabled=not self.fitted_distribution)

        box = gui.vBox(self.controlArea, "Columns")
        # NOTE(review): (DiscreteVariable) is a parenthesised class, not a
        # one-element tuple -- confirm DomainModel accepts a bare type.
        gui.comboBox(box,
                     self,
                     "cvar",
                     label="Split by",
                     orientation=Qt.Horizontal,
                     model=DomainModel(
                         placeholder="(None)",
                         valid_types=(DiscreteVariable),
                     ),
                     callback=self._on_cvar_changed,
                     contentsLength=18)
        gui.checkBox(box,
                     self,
                     "stacked_columns",
                     "Stack columns",
                     callback=self.replot)
        gui.checkBox(box,
                     self,
                     "show_probs",
                     "Show probabilities",
                     callback=self._on_show_probabilities_changed)
        gui.checkBox(box,
                     self,
                     "cumulative_distr",
                     "Show cumulative distribution",
                     callback=self.replot)

        gui.auto_apply(self.controlArea, self, commit=self.apply)

        self._set_smoothing_visibility()
        self._setup_plots()
        self._setup_legend()

    def _setup_plots(self):
        """
        Create the plot widget, the main plot item and two extra view boxes
        (`plot_pdf` and `plot_mark`) that share the main plot's x axis.
        """
        def add_new_plot(zvalue):
            # Extra ViewBox added to the main plot's scene, x-linked to the
            # main plot item and stacked at the given z value.
            plot = pg.ViewBox(enableMouse=False, enableMenu=False)
            self.ploti.scene().addItem(plot)
            pg.AxisItem("right").linkToView(plot)
            plot.setXLink(self.ploti)
            plot.setZValue(zvalue)
            return plot

        self.plotview = DistributionWidget(background=None)
        self.plotview.item_clicked.connect(self._on_item_clicked)
        self.plotview.blank_clicked.connect(self._on_blank_clicked)
        self.plotview.mouse_released.connect(self._on_end_selecting)
        self.plotview.setRenderHint(QPainter.Antialiasing)
        self.mainArea.layout().addWidget(self.plotview)
        self.ploti = pg.PlotItem(
            enableMenu=False,
            enableMouse=False,
            axisItems={"bottom": ElidedAxisNoUnits("bottom")})
        # self.plot is the main plot item's own view box.
        self.plot = self.ploti.vb
        self.plot.setMouseEnabled(False, False)
        self.ploti.hideButtons()
        self.plotview.setCentralItem(self.ploti)

        # z = 10 places plot_pdf above, z = -10 places plot_mark below.
        self.plot_pdf = add_new_plot(10)
        self.plot_mark = add_new_plot(-10)
        self.plot_mark.setYRange(0, 1)
        # Keep the extra view boxes in sync with the main one on resize.
        self.ploti.vb.sigResized.connect(self.update_views)
        self.update_views()

        # Draw the axes with the palette's text color.
        pen = QPen(self.palette().color(QPalette.Text))
        self.ploti.getAxis("bottom").setPen(pen)
        left = self.ploti.getAxis("left")
        left.setPen(pen)
        left.setStyle(stopAxisAtTick=(True, True))

    def _setup_legend(self):
        """Create the legend as a hidden child of the `plot_pdf` view box."""
        self._legend = legend = LegendItem()
        legend.setParentItem(self.plot_pdf)
        legend.hide()
        legend.anchor((1, 0), (1, 0))

    # -----------------------------
    # Event and signal handlers

    def update_views(self):
        """Resize the extra view boxes to the main view box and relink x."""
        main_view = self.plot
        for overlay in (self.plot_pdf, self.plot_mark):
            overlay.setGeometry(main_view.sceneBoundingRect())
            overlay.linkedViewChanged(main_view, overlay.XAxis)

    def onDeleteWidget(self):
        """Clear all three view boxes, then let the base class clean up."""
        for view_box in (self.plot, self.plot_pdf, self.plot_mark):
            view_box.clear()
        super().onDeleteWidget()

    @Inputs.data
    def set_data(self, data):
        """
        Handle a new input table: reset the variable choices and selection,
        restore the context for the new domain, then recompute and redraw.
        """
        self.closeContext()
        self.var = None
        self.cvar = None
        self.data = data

        domain = None
        if self.data:
            domain = self.data.domain

        var_model = self.controls.var.model()
        group_model = self.controls.cvar.model()
        var_model.set_domain(domain)
        group_model.set_domain(domain)

        if var_model:
            # Default choice: the entry just past the class variables,
            # clamped to the last entry of the model.
            default_idx = min(len(domain.class_vars), len(var_model) - 1)
            self.var = var_model[default_idx]
        if domain is not None and domain.has_discrete_class:
            self.cvar = domain.class_var

        self.reset_select()
        self._user_var_bins.clear()
        self.openContext(domain)

        self.set_valid_data()
        self.recompute_binnings()
        self.replot()
        self.apply()

    def _on_var_changed(self):
        """Reset the selection and rebuild everything for the new variable."""
        pipeline = (
            self.reset_select,
            self.set_valid_data,
            self.recompute_binnings,
            self.replot,
            self.apply,
        )
        for step in pipeline:
            step()

    def _on_cvar_changed(self):
        """Refilter the data for the new split variable, redraw and apply."""
        for step in (self.set_valid_data, self.replot, self.apply):
            step()

    def _on_bins_changed(self):
        """
        Redraw for the new bin setting.

        This fires continuously while the slider is dragged, so `apply` is
        deliberately not called here; it runs on `sliderReleased` instead.
        """
        self.reset_select()
        self._set_bin_width_slider_label()
        self.replot()

    def _on_bin_slider_released(self):
        """Remember the bin choice for the current variable and apply."""
        chosen_bins = self.number_of_bins
        self._user_var_bins[self.var] = chosen_bins
        self.apply()

    def _on_fitted_dist_changed(self):
        """Update dependent controls for the new fitter and redraw."""
        nothing_fitted = not self.fitted_distribution
        self.controls.hide_bars.setDisabled(nothing_fitted)
        self._set_smoothing_visibility()
        self.replot()

    def _on_hide_bars_changed(self):
        """Show or hide the existing bars and refresh the curve fills."""
        for bar_item in self.bar_items:
            bar_item.setHidden(self.hide_bars)
        self._set_curve_brushes()
        self.plot.update()

    def _set_smoothing_visibility(self):
        """Show the smoothing slider only for the kernel density fitter."""
        fitter = self.Fitters[self.fitted_distribution][1]
        uses_kde = fitter is AshCurve
        self.smoothing_box.setVisible(uses_kde)

    def _set_bin_width_slider_label(self):
        """
        Show the current binning's width label next to the slider, with the
        replacements from `short_time_units` applied; blank when the bin
        index is outside the available binnings.
        """
        text = ""
        if self.number_of_bins < len(self.binnings):
            text = self.binnings[self.number_of_bins].width_label
            for long_form, short_form in short_time_units.items():
                text = text.replace(long_form, short_form)
        self.bin_width_label.setText(text)

    def _on_show_probabilities_changed(self):
        """Relabel the fitter combo box for probability mode and redraw."""
        if self.show_probs:
            caption = "Fitted probability"
            hint = (
                "Chosen distribution is used to compute Bayesian probabilities"
            )
        else:
            caption = "Fitted distribution"
            hint = ""
        combo_label = self.controls.fitted_distribution.label
        combo_label.setText(caption)
        combo_label.setToolTip(hint)
        self.replot()

    @property
    def is_valid(self):
        """Whether `valid_data` currently holds values to plot."""
        has_values = self.valid_data is not None
        return has_values

    def set_valid_data(self):
        """
        Fill `valid_data` (and `valid_group_data` when a split variable is
        set) with the rows whose values are finite.

        Both are left as None, with an error shown, when no row qualifies;
        a warning is shown when only some rows are dropped.
        """
        self.Error.no_defined_values_var.clear()
        self.Error.no_defined_values_pair.clear()
        self.Warning.ignored_nans.clear()

        self.valid_data = self.valid_group_data = None
        if self.var is None:
            return

        values = self.data.get_column_view(self.var)[0].astype(float)
        keep = np.isfinite(values)
        if not keep.any():
            self.Error.no_defined_values_var(self.var.name)
            return

        if self.cvar:
            groups = self.data.get_column_view(self.cvar)[0].astype(float)
            # A row is kept only when both columns are finite.
            keep &= np.isfinite(groups)
            if not keep.any():
                self.Error.no_defined_values_pair(self.var.name,
                                                  self.cvar.name)
                return
            self.valid_group_data = groups[keep]

        if not keep.all():
            self.Warning.ignored_nans()
        self.valid_data = values[keep]

    # -----------------------------
    # Plotting

    def replot(self):
        """Clear the plot, redraw it when data is valid, restore selection."""
        self._clear_plot()
        if self.is_valid:
            stages = (
                self._set_axis_names,
                self._update_controls_state,
                self._call_plotting,
                self._display_legend,
            )
            for stage in stages:
                stage()
        self.show_selection()

    def _clear_plot(self):
        """Remove all plotted items and empty and hide the legend."""
        for view_box in (self.plot, self.plot_pdf, self.plot_mark):
            view_box.clear()
        self.bar_items = []
        self.curve_items = []
        legend = self._legend
        legend.clear()
        legend.hide()

    def _set_axis_names(self):
        """Label the bottom axis with the variable and the left axis with
        either "Frequency" or the conditional probability caption."""
        assert self.is_valid  # only replot calls this, after checking data
        x_axis = self.ploti.getAxis("bottom")
        x_axis.setLabel(self.var and self.var.name)
        is_time = self.var and self.var.is_time
        x_axis.setShowUnit(not is_time)

        y_axis = self.ploti.getAxis("left")
        if self.show_probs and self.cvar:
            caption = \
                f"Probability of '{self.cvar.name}' at given '{self.var.name}'"
        else:
            caption = "Frequency"
        y_axis.setLabel(caption)
        y_axis.resizeEvent()

    def _update_controls_state(self):
        """Disable the controls that do not apply to the current choice."""
        assert self.is_valid  # only replot calls this, after checking data
        self.continuous_box.setDisabled(self.var.is_discrete)
        no_split = self.cvar is None
        controls = self.controls
        controls.show_probs.setDisabled(no_split)
        controls.stacked_columns.setDisabled(no_split)

    def _call_plotting(self):
        """Run the plotting routine matching the variable type and split."""
        assert self.is_valid  # only replot calls this, after checking data
        self.curve_descriptions = None
        if self.var.is_discrete:
            draw = self._disc_split_plot if self.cvar else self._disc_plot
        else:
            draw = self._cont_split_plot if self.cvar else self._cont_plot
        draw()
        self.plot.autoRange()

    def _add_bar(self,
                 x,
                 width,
                 padding,
                 freqs,
                 colors,
                 stacked,
                 expanded,
                 tooltip,
                 hidden=False):
        """
        Create a `DistributionBarItem` from the given arguments, add it to
        the main view box and record it in `bar_items`.
        """
        bar_item = DistributionBarItem(
            x, width, padding, freqs, colors,
            stacked, expanded, tooltip, hidden)
        self.plot.addItem(bar_item)
        self.bar_items.append(bar_item)

    def _disc_plot(self):
        """Draw one bar per value of a discrete variable (no split)."""
        var = self.var
        value_ticks = list(enumerate(var.values))
        self.ploti.getAxis("bottom").setTicks([value_ticks])
        bar_colors = [QColor(0, 128, 255)]
        frequencies = distribution.get_distribution(self.data, self.var)
        for idx, freq in enumerate(frequencies):
            # Percentage is relative to the number of valid instances.
            share = 100 * freq / len(self.valid_data)
            tooltip = (
                "<p style='white-space:pre;'>"
                f"<b>{escape(var.values[idx])}</b>: {int(freq)} "
                f"({share:.2f} %) "
            )
            self._add_bar(idx - 0.5, 1, 0.1, [freq], bar_colors,
                          stacked=False, expanded=False, tooltip=tooltip)

    def _disc_split_plot(self):
        """
        Draw one bar group per value of the discrete variable, split by the
        values of the grouping variable `cvar`.

        Tooltip percentages are computed against the number of instances
        with both variables defined (`len(self.valid_data)`), the same total
        used by `_disc_plot` and `_cont_split_plot`.
        """
        var = self.var
        self.ploti.getAxis("bottom").setTicks([list(enumerate(var.values))])
        gcolors = [QColor(*col) for col in self.cvar.colors]
        gvalues = self.cvar.values
        conts = contingency.get_contingency(self.data, self.cvar, self.var)
        # Rows with missing values are dropped by set_valid_data (and the
        # user is warned they are ignored), so they must not count toward
        # the total; len(self.data) would understate every percentage.
        total = len(self.valid_data)
        for i, freqs in enumerate(conts):
            self._add_bar(i - 0.5,
                          1,
                          0.1,
                          freqs,
                          gcolors,
                          stacked=self.stacked_columns,
                          expanded=self.show_probs,
                          tooltip=self._split_tooltip(var.values[i],
                                                      np.sum(freqs), total,
                                                      gvalues, freqs))

    def _cont_plot(self):
        """
        Draw a histogram of a continuous variable (no split) using the
        current binning, plus the fitted curve when a fitter is chosen.
        """
        self._set_cont_ticks()
        values = self.valid_data
        thresholds = self.binnings[self.number_of_bins].thresholds
        counts, edges = np.histogram(values, bins=thresholds)
        n_values = len(values)

        bar_colors = [QColor(0, 128, 255)]
        if self.fitted_distribution:
            # Lighter bars when a curve is drawn over them.
            bar_colors[0] = bar_colors[0].lighter(130)

        running_total = 0
        last_idx = len(counts) - 1
        bins = zip(edges, edges[1:], counts)
        for idx, (left, right, freq) in enumerate(bins):
            running_total += freq
            interval = self.str_int(left, right, not idx, idx == last_idx)
            tooltip = (
                "<p style='white-space:pre;'>"
                f"<b>{escape(interval)}</b>: "
                f"{freq} ({100 * freq / n_values:.2f} %)</p>"
            )
            height = running_total if self.cumulative_distr else freq
            self._add_bar(left, right - left, 0, [height], bar_colors,
                          stacked=False, expanded=False, tooltip=tooltip,
                          hidden=self.hide_bars)

        if self.fitted_distribution:
            fitters = [self._fit_approximation(values)]
            self._plot_approximations(edges[0], edges[-1], fitters,
                                      [QColor(0, 0, 0)], (1, ))

    def _cont_split_plot(self):
        """
        Draw per-group histograms of a continuous variable split by `cvar`,
        plus one fitted curve per group when a fitter is chosen.
        """
        self._set_cont_ticks()
        values = self.valid_data
        thresholds = self.binnings[self.number_of_bins].thresholds
        _, edges = np.histogram(values, bins=thresholds)

        group_names = self.cvar.values
        curve_colors = [QColor(*rgb) for rgb in self.cvar.colors]
        bar_colors = curve_colors
        if self.fitted_distribution:
            # Lighter bars when curves are drawn over them.
            bar_colors = [color.lighter(130) for color in curve_colors]

        histograms = []
        fitters = []
        group_sizes = []
        for group_idx in range(len(group_names)):
            members = values[self.valid_group_data == group_idx]
            group_sizes.append(len(members))
            histograms.append(np.histogram(members, edges)[0])
            if self.fitted_distribution:
                fitters.append(self._fit_approximation(members))

        n_values = len(values)
        group_sizes = np.array(group_sizes)
        running_totals = np.zeros(len(histograms))
        last_idx = len(histograms[0]) - 1
        bar_padding = 0 if self.stacked_columns else 0.1

        per_bin = zip(edges, edges[1:], zip(*histograms))
        for idx, (left, right, counts) in enumerate(per_bin):
            running_totals += counts
            if self.cumulative_distr:
                # Copy: running_totals keeps being updated in place.
                shown = running_totals.copy()
            else:
                shown = counts
            tooltip = self._split_tooltip(
                self.str_int(left, right, not idx, idx == last_idx),
                np.sum(shown), n_values, group_names, shown)
            self._add_bar(left, right - left, bar_padding, shown, bar_colors,
                          stacked=self.stacked_columns,
                          expanded=self.show_probs,
                          hidden=self.hide_bars,
                          tooltip=tooltip)

        if fitters:
            self._plot_approximations(edges[0], edges[-1], fitters,
                                      curve_colors,
                                      group_sizes / len(values))

    def _set_cont_ticks(self):
        """
        Set bottom-axis ticks: default ticks for ordinary variables, and
        ticks built from the binning's short labels for time variables.
        """
        axis = self.ploti.getAxis("bottom")
        if not (self.var and self.var.is_time):
            axis.setTicks(None)
            return

        binning = self.binnings[self.number_of_bins]
        labels = np.array(binning.short_labels)
        thresholds = np.array(binning.thresholds)
        lengths = np.array([len(label) for label in labels])
        distinct_lengths = set(lengths)

        if len(distinct_lengths) == 1:
            # Equal-length labels: alternate them between two tick levels.
            ticks = [list(zip(thresholds[start::2], labels[start::2]))
                     for start in (0, 1)]
        else:
            # Otherwise one tick level per label length, longest first.
            ticks = []
            for length in sorted(distinct_lengths, reverse=True):
                chosen = lengths == length
                ticks.append(list(zip(thresholds[chosen], labels[chosen])))
        axis.setTicks(ticks)

    def _fit_approximation(self, y):
        """
        Fit the currently chosen distribution to `y`.

        Returns a pair (pdf, description): `pdf` is the distribution's `pdf`
        with the fitted parameters bound, `description` lists the fitted
        values by label, with labels that start with "-" moved into
        parentheses. Returns (None, None) when `y` is empty.
        """
        if not y.size:
            return None, None

        _, dist, names, str_names = self.Fitters[self.fitted_distribution]
        fitted = dist.fit(y)
        params = dict(zip(names, fitted))
        density = partial(dist.pdf, **params)

        format_value = self.var.str_val
        # Entries with an empty label are not described at all.
        labelled = [(label, value)
                    for label, value in zip(str_names, fitted) if label]
        primary = ", ".join(
            f"{label}={format_value(value)}"
            for label, value in labelled if label[0] != "-")
        secondary = ", ".join(
            f"{label[1:]}={format_value(value)}"
            for label, value in labelled if label[0] == "-")

        description = primary
        if secondary:
            description += f" ({secondary})"
        return density, description

    def _plot_approximations(self, x0, x1, fitters, colors, prior_probs):
        """
        Draw one curve per fitted distribution between `x0` and `x1`.

        `fitters` is a sequence of (callable, description) pairs as returned
        by `_fit_approximation`; entries whose callable is None keep an
        all-zero curve. `colors` and `prior_probs` are matched to `fitters`
        by position; groups with a zero prior are not drawn.
        """
        # Evaluate every fitter on 100 evenly spaced points.
        x = np.linspace(x0, x1, 100)
        ys = np.zeros((len(fitters), 100))
        self.curve_descriptions = [s for _, s in fitters]
        for y, (fitter, _) in zip(ys, fitters):
            if fitter is None:
                continue
            if self.Fitters[self.fitted_distribution][1] is AshCurve:
                # A larger kde_smoothing setting yields a smaller sigma.
                y[:] = fitter(x, sigma=(22 - self.kde_smoothing) / 40)
            else:
                y[:] = fitter(x)
            if self.cumulative_distr:
                y[:] = np.cumsum(y)
        # Sum over all groups at each sample point.
        tots = np.sum(ys, axis=0)

        # Probability curves go on the main plot item; plain density curves
        # go on the separate plot_pdf view box.
        show_probs = self.show_probs and self.cvar is not None
        plot = self.ploti if show_probs else self.plot_pdf

        for y, prior_prob, color in zip(ys, prior_probs, colors):
            if not prior_prob:
                continue
            if show_probs:
                # Weigh this group's curve by its prior against the other
                # groups' combined curve weighted by (1 - prior).
                y_p = y * prior_prob
                tot = (y_p + (tots - y) * (1 - prior_prob))
                # Avoid division by zero where all curves are zero.
                tot[tot == 0] = 1
                y = y_p / tot
            curve = pg.PlotCurveItem(x=x,
                                     y=y,
                                     fillLevel=0,
                                     pen=pg.mkPen(width=5, color=color),
                                     shadowPen=pg.mkPen(
                                         width=8, color=color.darker(120)))
            plot.addItem(curve)
            self.curve_items.append(curve)
        if not show_probs:
            self.plot_pdf.autoRange()
        self._set_curve_brushes()

    def _set_curve_brushes(self):
        """Fill the fitted curves when bars are hidden, else remove fill.

        The fill is a lightened, half-transparent version of the color of
        each curve's pen.
        """
        for curve in self.curve_items:
            brush = None
            if self.hide_bars:
                fill = curve.opts['pen'].color().lighter(160)
                fill.setAlpha(128)
                brush = pg.mkBrush(fill)
            curve.setBrush(brush)

    @staticmethod
    def _split_tooltip(valname, tot_group, total, gvalues, freqs):
        """Return an HTML table describing one bar split into groups.

        The first row shows `valname` with the bar's total count
        (`tot_group`) and its share of `total`. It is followed by one row
        per `(value, freq)` pair from `gvalues` and `freqs`, giving the
        count, the share within the bar and the share of `total`.

        Both `valname` and the group values are escaped, so that labels
        containing `<`, `>` or `&` cannot break the markup. A zero
        `tot_group` or `total` is replaced by 1 in the denominators.
        """
        div_group = tot_group or 1
        div_total = total or 1
        cs = "white-space:pre; text-align: right;"
        s = f"style='{cs} padding-left: 1em'"
        snp = f"style='{cs}'"
        header = (
            "<table style='border-collapse: collapse'>"
            f"<tr><th {s}>{escape(valname)}:</th>"
            f"<td {snp}><b>{int(tot_group)}</b></td>"
            "<td/>"
            f"<td {s}><b>{100 * tot_group / div_total:.2f} %</b></td></tr>"
            f"<tr><td/><td/><td {s}>(in group)</td><td {s}>(overall)</td>"
            "</tr>")
        rows = "".join(
            "<tr>"
            f"<th {s}>{escape(str(value))}:</th>"
            f"<td {snp}><b>{int(freq)}</b></td>"
            f"<td {s}>{100 * freq / div_group:.2f} %</td>"
            f"<td {s}>{100 * freq / div_total:.2f} %</td>"
            "</tr>"
            for value, freq in zip(gvalues, freqs))
        return header + rows + "</table>"

    def _display_legend(self):
        """Fill and show the legend, or hide it when there is nothing to show.

        With a split variable, one square marker is added per value, labelled
        with the value name and, when available, the description of its
        fitted curve. Without it, a single entry with the first curve
        description is added; if there is no such description, the legend
        is hidden instead.
        """
        assert self.is_valid  # called only from replot, so assumes data is OK
        legend = self._legend
        if self.cvar is not None:
            cvar_values = self.cvar.values
            colors = [QColor(*col) for col in self.cvar.colors]
            descriptions = self.curve_descriptions or repeat(None)
            for color, name, desc in zip(colors, cvar_values, descriptions):
                marker = ScatterPlotItem(
                    pen=color, brush=color, size=10, shape="s")
                suffix = f" ({desc})" if desc else ""
                legend.addItem(marker, escape(name + suffix))
        else:
            if not self.curve_descriptions or not self.curve_descriptions[0]:
                legend.hide()
                return
            sample = pg.PlotCurveItem(pen=pg.mkPen(width=5, color=0.0))
            legend.addItem(sample, self.curve_descriptions[0])
        legend.show()

    # -----------------------------
    # Bins

    def recompute_binnings(self):
        """Recompute the candidate binnings for the current variable.

        Binnings are computed only for a valid, continuous variable whose
        column contains at least one finite value; time variables use
        `time_binnings`, others `decimal_binnings`. In every other case,
        including a continuous column with no finite values, the list of
        binnings is emptied and the maximal number of bins is 0.

        The bin-count slider maximum, `self.number_of_bins` (clipped to the
        new maximum, preferring the value stored for this variable in
        `self._user_var_bins`) and the slider label are updated.
        """
        # Defaults are used when there is nothing to bin; previously the
        # "no finite values" case left `max_bins` unbound.
        binnings, max_bins = [], 0
        if self.is_valid and self.var.is_continuous:
            # binning is computed on valid var data, ignoring any cvar nans
            column = self.data.get_column_view(self.var)[0].astype(float)
            if np.any(np.isfinite(column)):
                if self.var.is_time:
                    binnings = time_binnings(column, min_unique=5)
                    label_width = 45
                else:
                    binnings = decimal_binnings(
                        column,
                        min_width=self.min_var_resolution(self.var),
                        add_unique=10,
                        min_unique=5)
                    label_width = 35
                self.bin_width_label.setFixedWidth(label_width)
                max_bins = len(binnings) - 1
        self.binnings = binnings

        self.controls.number_of_bins.setMaximum(max_bins)
        self.number_of_bins = min(
            max_bins, self._user_var_bins.get(self.var, self.number_of_bins))
        self._set_bin_width_slider_label()

    @staticmethod
    def min_var_resolution(var):
        """Return the smallest step implied by the variable's decimals.

        The result is `10 ** -number_of_decimals` for variables whose type
        is exactly `ContinuousVariable`; for any other type (subclasses
        included) it is 0.
        """
        # pylint: disable=unidiomatic-typecheck
        is_plain_continuous = type(var) is ContinuousVariable
        return 10**-var.number_of_decimals if is_plain_continuous else 0

    def str_int(self, x0, x1, first, last):
        """Describe the interval [x0, x1) of the current variable as text.

        `first` and `last` tell whether the interval is the first and/or the
        last one; such intervals are described as open-ended (or as a
        single value when both flags are set). In cumulative mode, only the
        upper bound is reported. Intervals whose bounds are printed the same
        or that are not wider than the variable's resolution are described
        as a single value.
        """
        var = self.var
        name = var.name
        low, high = var.repr_val(x0), var.repr_val(x1)

        if self.cumulative_distr:
            return f"{name} < {high}"
        if first:
            return f"{name} = {low}" if last else f"{name} < {high}"
        if last:
            return f"{name} ≥ {low}"
        if low == high or x1 - x0 <= self.min_var_resolution(var):
            return f"{name} = {low}"
        return f"{low} ≤ {name} < {high}"

    # -----------------------------
    # Selection

    def _on_item_clicked(self, item, modifiers, drag):
        """Update the selection after a click or a drag over a bar.

        `item` is the clicked bar (an element of `self.bar_items`) or None;
        None resets the selection. `modifiers` are the keyboard modifiers
        and `drag` tells whether the event comes from dragging.

        - Dragging and Shift-click add or remove the range of bars between
          the previously clicked bar and this one, depending on the current
          drag operation.
        - Ctrl-click toggles the clicked bar.
        - A plain click selects only the clicked bar, or deselects it if it
          was the only selected bar.

        The clicked index is remembered in `self.last_click_idx` and the
        selection markers are redrawn.
        """
        def add_or_remove(idx, add):
            # Remember the operation so that a following drag continues it
            self.drag_operation = [self.DragRemove, self.DragAdd][add]
            if add:
                self.selection.add(idx)
            else:
                if idx in self.selection:
                    # This can be False when removing with dragging and the
                    # mouse crosses unselected items
                    self.selection.remove(idx)

        def add_range(add):
            # `idx` is the clicked bar, bound in the enclosing scope below
            if self.last_click_idx is None:
                # No anchor yet: the range is just the clicked bar, and it
                # is always added
                add = True
                idx_range = {idx}
            else:
                from_idx, to_idx = sorted((self.last_click_idx, idx))
                idx_range = set(range(from_idx, to_idx + 1))
            self.drag_operation = [self.DragRemove, self.DragAdd][add]
            if add:
                self.selection |= idx_range
            else:
                self.selection -= idx_range

        # Any mouse action ends a keyboard (Shift + arrow) operation
        self.key_operation = None
        if item is None:
            self.reset_select()
            return

        idx = self.bar_items.index(item)
        if drag:
            # Dragging has to add a range, otherwise fast dragging skips bars
            add_range(self.drag_operation == self.DragAdd)
        else:
            if modifiers & Qt.ShiftModifier:
                add_range(self.drag_operation == self.DragAdd)
            elif modifiers & Qt.ControlModifier:
                add_or_remove(idx, add=idx not in self.selection)
            else:
                if self.selection == {idx}:
                    # Clicking on a single selected bar deselects it,
                    # but dragging from here will select
                    add_or_remove(idx, add=False)
                    self.drag_operation = self.DragAdd
                else:
                    self.selection.clear()
                    add_or_remove(idx, add=True)
        self.last_click_idx = idx

        self.show_selection()

    def _on_blank_clicked(self):
        """Handle a click outside the bars by resetting the selection."""
        self.reset_select()

    def reset_select(self):
        """Empty the selection, forget click/drag/key state and redraw."""
        self.selection.clear()
        self.last_click_idx = self.drag_operation = self.key_operation = None
        self.show_selection()

    def _on_end_selecting(self):
        """Send the outputs once the user finishes selecting."""
        self.apply()

    def show_selection(self):
        """Redraw the rectangles that mark the selected groups of bars.

        Existing markers are removed first. One rectangle is drawn per group
        of adjacent selected bars, padded towards the neighbouring bars. For
        continuous variables the rectangle gets a tooltip with the covered
        interval, the number of instances in it and their share of the
        valid data.
        """
        self.plot_mark.clear()
        if not self.is_valid:  # though if it's not, selection is empty anyway
            return

        blue = QColor(Qt.blue)
        outline = QPen(QBrush(blue), 3)
        outline.setCosmetic(True)
        fill = QBrush(blue.lighter(190))

        for group in self.grouped_selection():
            indices = list(group)
            left_idx = indices[0]
            right_idx = indices[-1]
            left_pad, right_pad = self._determine_padding(left_idx, right_idx)
            x0 = self.bar_items[left_idx].x0 - left_pad
            x1 = self.bar_items[right_idx].x1 + right_pad
            marker = QGraphicsRectItem(x0, 0, x1 - x0, 1)
            marker.setPen(outline)
            marker.setBrush(fill)
            if self.var.is_continuous:
                is_first = not left_idx
                is_last = right_idx == len(self.bar_items) - 1
                valname = self.str_int(x0, x1, is_first, is_last)
                inside = sum(
                    np.sum(self.bar_items[i].freqs) for i in indices)
                total = len(self.valid_data)
                tooltip = ("<p style='white-space:pre;'>"
                           f"<b>{escape(valname)}</b>: "
                           f"{inside} ({100 * inside / total:.2f} %)")
                marker.setToolTip(tooltip)
            self.plot_mark.addItem(marker)

    def _determine_padding(self, left_idx, right_idx):
        """Return `(left, right)` padding for marking bars left_idx..right_idx.

        The padding on each side is half of the gap to the neighbouring bar.
        On a side without a neighbour, the padding of the other side is
        used. A selection covering all bars uses half of the first gap on
        both sides, and a single existing bar gets a fixed padding of 6.
        """
        bars = self.bar_items
        last = len(bars) - 1

        def half_gap(i):
            # Half of the empty space between bar i and bar i + 1
            return (bars[i + 1].x0 - bars[i].x1) / 2

        if len(bars) == 1:
            return 6, 6
        if left_idx == 0 and right_idx == last:
            pad = half_gap(0)
            return pad, pad
        if left_idx == 0:
            # No left neighbour: mirror the right-hand padding
            pad = half_gap(right_idx)
            return pad, pad

        left_pad = half_gap(left_idx - 1)
        # No right neighbour: mirror the left-hand padding
        right_pad = half_gap(right_idx) if right_idx < last else left_pad
        return left_pad, right_pad

    def grouped_selection(self):
        """Return the selected indices as sorted lists of adjacent indices."""
        ordered = sorted(self.selection)
        # Within a run of consecutive integers, value - position is constant
        runs = groupby(enumerate(ordered), key=lambda pair: pair[1] - pair[0])
        return [[value for _, value in run] for _, run in runs]

    def keyPressEvent(self, e):
        """Move or extend the selection with the left and right arrow keys.

        Other keys, and any key when the widget has no valid data or no
        bars, are passed to the base class. Without Shift, the selection
        becomes the single bar next to the current selection (or the
        last/first bar when nothing is selected). With Shift, the selection
        is extended in the direction of the key, or shrunk if it was
        previously extended in the opposite direction.

        If the selection changed, the markers are redrawn and the outputs
        are sent.
        """
        def on_nothing_selected():
            # Left arrow starts from the last bar, any other from the first
            if e.key() == Qt.Key_Left:
                self.last_click_idx = len(self.bar_items) - 1
            else:
                self.last_click_idx = 0
            self.selection.add(self.last_click_idx)

        def on_key_left():
            # `first` and `last` are bound in the enclosing scope below
            if e.modifiers() & Qt.ShiftModifier:
                if self.key_operation == Qt.Key_Right and first != last:
                    # Undo one step of a previous extension to the right
                    self.selection.remove(last)
                    self.last_click_idx = last - 1
                elif first:
                    # Extend to the left unless already at the first bar
                    self.key_operation = Qt.Key_Left
                    self.selection.add(first - 1)
                    self.last_click_idx = first - 1
            else:
                self.selection.clear()
                self.last_click_idx = max(first - 1, 0)
                self.selection.add(self.last_click_idx)

        def on_key_right():
            if e.modifiers() & Qt.ShiftModifier:
                if self.key_operation == Qt.Key_Left and first != last:
                    # Undo one step of a previous extension to the left
                    self.selection.remove(first)
                    self.last_click_idx = first + 1
                elif not self._is_last_bar(last):
                    # Extend to the right unless already at the last bar
                    self.key_operation = Qt.Key_Right
                    self.selection.add(last + 1)
                    self.last_click_idx = last + 1
            else:
                self.selection.clear()
                self.last_click_idx = min(last + 1, len(self.bar_items) - 1)
                self.selection.add(self.last_click_idx)

        if not self.is_valid or not self.bar_items \
                or e.key() not in (Qt.Key_Left, Qt.Key_Right):
            super().keyPressEvent(e)
            return

        # Keep a copy to detect whether the key actually changed anything
        prev_selection = self.selection.copy()
        if not self.selection:
            on_nothing_selected()
        else:
            first, last = min(self.selection), max(self.selection)
            if e.key() == Qt.Key_Left:
                on_key_left()
            else:
                on_key_right()

        if self.selection != prev_selection:
            self.drag_operation = self.DragAdd
            self.show_selection()
            self.apply()

    def keyReleaseEvent(self, ev):
        """Forget the direction of keyboard selection when Shift is released.

        The event is then passed on to the base class.
        """
        if ev.key() == Qt.Key_Shift:
            self.key_operation = None
        super().keyReleaseEvent(ev)

    # -----------------------------
    # Output

    def apply(self):
        """Compute and send the selected, annotated and histogram tables.

        All three outputs are None when the widget has no valid data. The
        histogram table is built only for continuous variables; selected and
        annotated tables only when at least one instance is selected.
        """
        data = self.data
        selected_data = None
        annotated_data = None
        histogram_data = None

        if self.is_valid:
            if self.var.is_discrete:
                group_indices, values = self._get_output_indices_disc()
            else:
                group_indices, values = self._get_output_indices_cont()
                hist_indices, hist_values = self._get_histogram_indices()
                histogram_data = create_groups_table(
                    data, hist_indices, values=hist_values)
            # Group index 0 marks instances outside the selection
            selected = np.nonzero(group_indices)[0]
            if selected.size:
                selected_data = create_groups_table(
                    data, group_indices,
                    include_unselected=False, values=values)
                annotated_data = create_annotated_table(data, selected)

        outputs = self.Outputs
        outputs.selected_data.send(selected_data)
        outputs.annotated_data.send(annotated_data)
        outputs.histogram_data.send(histogram_data)

    def _get_output_indices_disc(self):
        """Return group indices and group names for a discrete variable.

        The first element is an int32 array with one entry per data
        instance: 0 for unselected instances and k for instances whose value
        is the k-th selected value (counting from 1). The second element
        lists the names of the selected values in the same order.
        """
        group_indices = np.zeros(len(self.data), dtype=np.int32)
        col = self.data.get_column_view(self.var)[0].astype(float)
        values = []
        for group_idx, val_idx in enumerate(self.selection, start=1):
            group_indices[col == val_idx] = group_idx
            values.append(self.var.values[val_idx])
        return group_indices, values

    def _get_output_indices_cont(self):
        """Return group indices and group names for a continuous variable.

        Each group of adjacent selected bars gets an index, starting with 1;
        instances outside the selection get 0. The group name is the textual
        description of the interval from the lower bound of the group's
        first bar to the upper bound of its last bar.
        """
        group_indices = np.zeros(len(self.data), dtype=np.int32)
        col = self.data.get_column_view(self.var)[0].astype(float)
        values = []
        for group_idx, group in enumerate(self.grouped_selection(), start=1):
            lower = upper = None
            for position, bar_idx in enumerate(group):
                minx, maxx, mask = self._get_cont_baritem_indices(col, bar_idx)
                if position == 0:
                    lower = minx
                upper = maxx
                group_indices[mask] = group_idx
            # The first/last flags are derived from the last bar of the group
            # pylint: disable=undefined-loop-variable
            is_first = not bar_idx
            is_last = self._is_last_bar(bar_idx)
            values.append(self.str_int(lower, upper, is_first, is_last))
        return group_indices, values

    def _get_histogram_indices(self):
        """Return bar membership and bar names for all data instances.

        The first element is an int32 array in which instances falling into
        the k-th bar (counting from 1) have value k and all others 0. The
        second element holds the textual description of each bar's interval.
        """
        group_indices = np.zeros(len(self.data), dtype=np.int32)
        col = self.data.get_column_view(self.var)[0].astype(float)
        values = []
        n_bars = len(self.bar_items)
        for bar_idx in range(n_bars):
            lower, upper, mask = self._get_cont_baritem_indices(col, bar_idx)
            group_indices[mask] = bar_idx + 1
            label = self.str_int(
                lower, upper, not bar_idx, self._is_last_bar(bar_idx))
            values.append(label)
        return group_indices, values

    def _get_cont_baritem_indices(self, col, bar_idx):
        """Return `(lower, upper, mask)` for the bar with the given index.

        `mask` marks elements of `col` within [lower, upper). For the last
        bar the upper bound is increased by 1, so that values equal to the
        bar's upper edge are included. NaNs are never included.
        """
        bar = self.bar_items[bar_idx]
        is_last = bar_idx == len(self.bar_items) - 1
        lower = bar.x0
        upper = bar.x1 + is_last
        # Comparisons with NaN are intentionally silent
        with np.errstate(invalid="ignore"):
            mask = (col >= lower) & (col < upper)
            return lower, upper, mask

    def _is_last_bar(self, idx):
        """Return True if `idx` is the index of the last bar."""
        return idx == len(self.bar_items) - 1

    # -----------------------------
    # Report

    def get_widget_name_extension(self):
        """Return the variable currently shown by the widget (`self.var`)."""
        return self.var

    def send_report(self):
        """Add the plot and a caption describing it to the report.

        Nothing is reported when the widget has no valid data. The caption
        names the shown variable, tells whether the distribution is
        cumulative and names the variable used for splitting, if any.
        """
        self.plotview.scene().setSceneRect(self.plotview.sceneRect())
        if not self.is_valid:
            return
        self.report_plot()
        # The caption used to be misspelled as "Cummulative"
        kind = "Cumulative distribution" if self.cumulative_distr \
            else "Distribution"
        text = f"{kind} of '{self.var.name}'"
        if self.cvar:
            text += f" with columns split by '{self.cvar.name}'"
        self.report_caption(text)
Esempio n. 19
0
class OWPIPAx(widget.OWWidget):
    """Widget for accessing data from the PIPA RNA-Seq database.

    The widget has no inputs and a single "Data" output.
    """
    name = "PIPAx"
    description = "Access data from PIPA RNA-Seq database."
    icon = "../widgets/icons/PIPA.svg"
    priority = 35

    inputs = []
    outputs = [("Data", Orange.data.Table)]

    # Credentials used for connecting to the database
    username = settings.Setting("")
    password = settings.Setting("")

    # Options applied to the retrieved data
    log2 = settings.Setting(False)
    rtypei = settings.Setting(5)  # hardcoded rpkm mapability polya
    excludeconstant = settings.Setting(False)
    joinreplicates = settings.Setting(False)
    #: The stored current selection (in experiments view)
    #: SelectionByKey | None
    currentSelection = settings.Setting(None)
    #: Stored selections (presets)
    #: list of SelectionByKey
    storedSelections = settings.Setting([])
    #: Stored column sort keys (from Sort view)
    #: list of strings
    storedSortingOrder = settings.Setting(
        ["Strain", "Experiment", "Genotype", "Timepoint"])

    # Maps header labels (up to and including the ID column) to whether the
    # column is hidden; by default none of them is
    experimentsHeaderState = settings.Setting(
        {name: False
         for _, name in HEADER[:ID_INDEX + 1]})

    def __init__(self, parent=None, signalManager=None, name="PIPAx"):
        """Build the widget's GUI and schedule loading of experiments.

        `signalManager` and `name` are accepted but not used in this method.
        """
        super().__init__(parent)

        self.selectedExperiments = []
        self.buffer = dicty.CacheSQLite(bufferfile)

        self.searchString = ""

        self.result_types = []
        self.mappings = {}

        # The control area has a fixed width
        self.controlArea.setMaximumWidth(250)
        self.controlArea.setMinimumWidth(250)

        gui.button(self.controlArea, self, "Reload", callback=self.Reload)
        gui.button(self.controlArea,
                   self,
                   "Clear cache",
                   callback=self.clear_cache)

        b = gui.widgetBox(self.controlArea, "Experiment Sets")
        self.selectionSetsWidget = SelectionSetsWidget(self)
        self.selectionSetsWidget.setSizePolicy(QSizePolicy.Preferred,
                                               QSizePolicy.Maximum)

        def store_selections(modified):
            # Copy the selection sets into settings when the signal reports
            # that they are not modified
            if not modified:
                self.storedSelections = self.selectionSetsWidget.selections

        self.selectionSetsWidget.selectionModified.connect(store_selections)
        b.layout().addWidget(self.selectionSetsWidget)

        gui.separator(self.controlArea)

        b = gui.widgetBox(self.controlArea, "Sort output columns")
        self.columnsSortingWidget = SortedListWidget(self)
        self.columnsSortingWidget.setSizePolicy(QSizePolicy.Preferred,
                                                QSizePolicy.Maximum)

        def store_sort_order():
            # Keep the setting in sync with the sorting widget
            self.storedSortingOrder = self.columnsSortingWidget.sortingOrder

        self.columnsSortingWidget.sortingOrderChanged.connect(store_sort_order)
        b.layout().addWidget(self.columnsSortingWidget)
        sorting_model = QStringListModel(SORTING_MODEL_LIST)
        self.columnsSortingWidget.setModel(sorting_model)

        gui.separator(self.controlArea)

        box = gui.widgetBox(self.controlArea, 'Expression Type')
        # The combo is filled later, in UpdateExperimentTypes
        self.expressionTypesCB = gui.comboBox(box,
                                              self,
                                              "rtypei",
                                              items=[],
                                              callback=self.UpdateResultsList)

        gui.checkBox(self.controlArea, self, "excludeconstant",
                     "Exclude labels with constant values")

        gui.checkBox(self.controlArea, self, "joinreplicates",
                     "Average replicates (use median)")

        gui.checkBox(self.controlArea, self, "log2",
                     "Logarithmic (base 2) transformation")

        self.commit_button = gui.button(self.controlArea,
                                        self,
                                        "&Commit",
                                        callback=self.Commit)
        self.commit_button.setDisabled(True)

        gui.rubber(self.controlArea)

        box = gui.widgetBox(self.controlArea, "Authentication")

        # NOTE(review): the statement below is damaged -- the "******" runs
        # look like an automated credential-scrubbing artifact and the code
        # does not parse as is. It presumably consisted of separate line
        # edits for the username, the password (stored as `self.passf`,
        # which AuthSet uses) and the search string; restore it from the
        # original source -- TODO confirm.
        gui.lineEdit(box,
                     self,
                     "username",
                     "Username:"******"password",
                                  "Password:"******"searchString",
                     "Search",
                     callbackOnType=True,
                     callback=self.SearchUpdate)

        self.headerLabels = [t[1] for t in HEADER]

        self.experimentsWidget = QTreeWidget()
        self.experimentsWidget.setHeaderLabels(self.headerLabels)
        self.experimentsWidget.setSelectionMode(QTreeWidget.ExtendedSelection)
        self.experimentsWidget.setRootIsDecorated(False)
        self.experimentsWidget.setSortingEnabled(True)

        contextEventFilter = gui.VisibleHeaderSectionContextEventFilter(
            self.experimentsWidget, self.experimentsWidget)

        self.experimentsWidget.header().installEventFilter(contextEventFilter)
        self.experimentsWidget.setItemDelegateForColumn(
            0, gui.IndicatorItemDelegate(self, role=Qt.DisplayRole))

        self.experimentsWidget.setAlternatingRowColors(True)

        self.experimentsWidget.selectionModel().selectionChanged.connect(
            self.onSelectionChanged)

        self.selectionSetsWidget.setSelectionModel(
            self.experimentsWidget.selectionModel())

        self.mainArea.layout().addWidget(self.experimentsWidget)

        # Restore the selection states from the stored settings
        self.selectionSetsWidget.selections = self.storedSelections
        self.columnsSortingWidget.sortingOrder = self.storedSortingOrder

        self.restoreHeaderState()

        self.experimentsWidget.header().geometriesChanged.connect(
            self.saveHeaderState)

        # The database connection is created on demand (see Connect)
        self.dbc = None

        self.AuthSet()

        # Load the experiments shortly after the constructor returns
        QTimer.singleShot(100, self.UpdateExperiments)

    def sizeHint(self):
        """Return the suggested widget size, 800 x 600."""
        return QSize(800, 600)

    def AuthSet(self):
        """Enable the password field only when a username is entered."""
        has_username = bool(len(self.username))
        self.passf.setDisabled(not has_username)

    def AuthChanged(self):
        """Update the password field state, then reconnect and reload."""
        self.AuthSet()
        self.ConnectAndUpdate()

    def ConnectAndUpdate(self):
        """Create a new connection and reload the list of experiments."""
        self.Connect()
        self.UpdateExperiments(reload=True)

    def Connect(self):
        """Create the database connection from the stored credentials.

        Error and warning with id 1 are cleared first. When a username is
        given, the credentials are checked by reloading the mappings:

        - on authentication failure an error is shown and `self.dbc` is
          set to None;
        - on any other failure the exception is printed and passed to
          `sys.excepthook`; if the mappings are still available without
          reloading, a warning is shown, otherwise an error is shown and
          `self.dbc` is set to None.
        """
        self.error(1)
        self.warning(1)

        def en(x):
            # An empty string means "not given"
            return x if len(x) else None

        username = en(self.username)
        self.dbc = dicty.PIPAx(cache=self.buffer,
                               username=username,
                               password=self.password)

        # check password
        if username is None:
            return
        try:
            self.dbc.mappings(reload=True)
        except dicty.AuthenticationError:
            self.error(1, "Wrong username or password")
            self.dbc = None
        except Exception as ex:  # pylint: disable=broad-except
            print("Error when contacting the PIPA database", ex)
            sys.excepthook(*sys.exc_info())
            try:  # maybe cached?
                self.dbc.mappings()
            except Exception:  # pylint: disable=broad-except
                self.dbc = None
                self.error(1, "Can not access database.")
            else:
                self.warning(
                    1, "Can not access database - using cached data.")

    def Reload(self):
        """Reload the list of experiments."""
        self.UpdateExperiments(reload=True)

    def clear_cache(self):
        """Clear the local cache and reload the list of experiments."""
        self.buffer.clear()
        self.Reload()

    def rtype(self):
        """Return the id of the selected result type.

        This is the first element of the entry at index `self.rtypei` in
        `self.result_types`, or "-1" when no result types are known.
        """
        if not self.result_types:
            return "-1"
        selected = self.result_types[self.rtypei]
        return selected[0]

    def UpdateExperimentTypes(self):
        """Refill the expression type combo from `self.result_types`.

        The selected index `self.rtypei` is afterwards clipped to the valid
        range (and set to 0 when there are no result types).
        """
        combo = self.expressionTypesCB
        combo.clear()
        combo.addItems([desc for _, desc in self.result_types])
        highest = len(self.result_types) - 1
        self.rtypei = max(0, min(self.rtypei, highest))

    def UpdateExperiments(self, reload=False):
        """Reload mappings and result types and rebuild the experiment list.

        The data is first requested with the given `reload` flag. If that
        fails, the request is repeated without the flag; on success a
        warning that cached data is being used is shown, and on failure an
        error is shown and empty mappings/result types are used (unless the
        first attempt already retrieved some of them).

        Afterwards the expression types and the list of results are
        updated, the stored selection is re-applied and the commit button
        is enabled or disabled.
        """
        self.experimentsWidget.clear()
        self.items = []

        self.progressBarInit()

        if not self.dbc:
            self.Connect()

        mappings = {}
        result_types = []

        try:
            mappings = self.dbc.mappings(reload=reload)
            result_types = self.dbc.result_types(reload=reload)
        except Exception:  # pylint: disable=broad-except
            try:
                mappings = self.dbc.mappings()
                result_types = self.dbc.result_types()
            except Exception:  # pylint: disable=broad-except
                self.error(0, "Can not access database.")
            else:
                # Keep the warning visible: it used to be cleared right
                # after being set
                self.error(0)
                self.warning(0, "Can not access database - using cached data.")
        else:
            self.warning(0)
            self.error(0)

        self.mappings = mappings
        self.result_types = result_types

        self.UpdateExperimentTypes()
        self.UpdateResultsList(reload=reload)

        self.progressBarFinished()

        if self.currentSelection:
            self.currentSelection.select(
                self.experimentsWidget.selectionModel())

        self.handle_commit_button()

    def UpdateResultsList(self, reload=False):
        """Update the tree of experiments for the selected result type.

        The list of results is requested with the given `reload` flag and,
        if that fails, once more without it; if both fail, an error is shown
        and the list is treated as empty. Rows that are no longer in the
        list are removed from the tree and new rows are added, so that the
        remaining rows (and their selection) are left in place. Finally,
        columns are resized and the cache indicators updated.
        """

        results_list = {}
        try:
            results_list = self.dbc.results_list(self.rtype(), reload=reload)
        except Exception as ex:
            try:
                results_list = self.dbc.results_list(self.rtype())
            except Exception as ex:
                self.error(0, "Can not access database.")

        self.results_list = results_list
        # Maps (data_id, id) pairs of mappings to the keys of self.mappings
        mappings_key_dict = dict(((m["data_id"], m["id"]), key) \
                                 for key, m in self.mappings.items())

        def mapping_unique_id(annot):
            """Map annotations dict from results_list to unique
            `mappings` ids.
            """
            data_id, mappings_id = annot["data_id"], annot["mappings_id"]
            return mappings_key_dict[data_id, mappings_id]

        # NOTE(review): `elements` is filled below but never read
        elements = []

        # softly change the view so that the selection stays the same

        # Unique ids of the currently shown rows (read from column 10),
        # mapped to their positions in self.items
        items_shown = {}
        for i, item in enumerate(self.items):
            c = str(item.text(10))
            items_shown[c] = i

        items_to_show = dict((mapping_unique_id(annot), annot)
                             for annot in self.results_list.values())

        add_items = set(items_to_show) - set(items_shown)
        delete_items = set(items_shown) - set(items_to_show)

        # Remove obsolete rows; the index advances only when nothing is
        # removed, because removal shifts the following rows
        i = 0
        while i < self.experimentsWidget.topLevelItemCount():
            it = self.experimentsWidget.topLevelItem(i)
            if str(it.text(10)) in delete_items:
                self.experimentsWidget.takeTopLevelItem(i)
            else:
                i += 1

        delete_ind = set([items_shown[i] for i in delete_items])
        self.items = [
            it for i, it in enumerate(self.items) if i not in delete_ind
        ]

        for r_annot in [items_to_show[i] for i in add_items]:
            d = defaultdict(lambda: "?", r_annot)
            # The first column is left empty; UpdateCached sets its
            # display value
            row_items = [""] + [d.get(key, "?") for key, _ in HEADER[1:]]
            try:
                time_dict = literal_eval(row_items[DATE_INDEX])
                date_rna = date(
                    time_dict["fullYearUTC"],
                    time_dict["monthUTC"] + 1,  # Why is month 0 based?
                    time_dict["dateUTC"])
                row_items[DATE_INDEX] = date_rna.strftime("%x")
            except Exception:
                # Dates that cannot be parsed are shown as empty
                row_items[DATE_INDEX] = ''

            row_items[ID_INDEX] = mapping_unique_id(r_annot)
            elements.append(row_items)

            ci = MyTreeWidgetItem(self.experimentsWidget, row_items)

            self.items.append(ci)

        for i in range(len(self.headerLabels)):
            self.experimentsWidget.resizeColumnToContents(i)

        # which is the ok buffer version
        # FIXME: what attribute to use for version?
        # NOTE(review): this first assignment has no effect; it is
        # overwritten by the constant function right below
        self.wantbufver = \
            lambda x, ad=self.results_list: \
            defaultdict(lambda: "?", ad[x])["date"]

        self.wantbufver = lambda x: "0"

        self.UpdateCached()

    def UpdateCached(self):
        """Update the first column of each row to show whether it is cached.

        A row gets " " when the buffered version of its data equals the
        wanted version and "" otherwise. Nothing is done without a database
        connection or a wanted-version function.
        """
        if not (self.wantbufver and self.dbc):
            return

        fn = self.dbc.download_key_function()
        # Maps (data_id, mappings_id) pairs to the keys of the results list
        result_id_key = {
            (m["data_id"], m["mappings_id"]): key
            for key, m in self.results_list.items()
        }

        for item in self.items:
            unique_id = str(item.text(10))
            mapping = self.mappings[unique_id]
            r_id = result_id_key[mapping["data_id"], mapping["id"]]
            # Get the buffered version
            buffered = self.dbc.inBuffer(fn(r_id))
            is_current = buffered == self.wantbufver(r_id)
            item.setData(0, Qt.DisplayRole, " " if is_current else "")

    def SearchUpdate(self, string=""):
        """Hide the rows that do not contain all the search terms.

        The terms are the whitespace-separated parts of
        `self.searchString`; the `string` argument is not used.
        """
        terms = self.searchString.split()
        for item in self.items:
            is_missing_term = any(term not in item for term in terms)
            item.setHidden(is_missing_term)

    def Commit(self):
        """Retrieve the selected experiments and send them to the output.

        Nothing is sent when no experiment is selected. The labels of the
        visible header columns are passed to the database as allowed
        labels. The data is optionally log-transformed and replicates are
        optionally joined; the attributes are then sorted according to the
        chosen sort order, comparing a label's values as numbers when all
        its non-empty values are numeric.

        The progress bar is finished even if retrieval raises an exception.
        """
        if not self.dbc:
            self.Connect()

        pb = gui.ProgressBar(self, iterations=100)
        try:
            ids = []
            for item in self.experimentsWidget.selectedItems():
                unique_id = str(item.text(10))
                annots = self.mappings[unique_id]
                ids.append((annots["data_id"], annots["id"]))

            if not ids:
                return

            transfn = None
            if self.log2:
                transfn = lambda x: math.log(x + 1.0, 2)

            reverse_header_dict = dict((name, key) for key, name in HEADER)

            # Labels of the visible columns, skipping the first column
            hview = self.experimentsWidget.header()
            shownHeaders = [label for i, label in
                            list(enumerate(self.headerLabels))[1:]
                            if not hview.isSectionHidden(i)]

            allowed_labels = [reverse_header_dict.get(label, label)
                              for label in shownHeaders]

            if self.joinreplicates and "id" not in allowed_labels:
                # need 'id' labels in join_replicates for attribute names
                allowed_labels.append("id")

            table = self.dbc.get_data(
                ids=ids,
                result_type=self.rtype(),
                callback=pb.advance,
                exclude_constant_labels=self.excludeconstant,
                transform=transfn,
                allowed_labels=allowed_labels)

            if self.joinreplicates:
                table = dicty.join_replicates(table,
                                              ignorenames=[
                                                  "replicate", "data_id",
                                                  "mappings_id", "data_name",
                                                  "id", "unique_id"
                                              ],
                                              namefn=None,
                                              avg=dicty.median)

            # Sort attributes
            sortOrder = self.columnsSortingWidget.sortingOrder

            all_values = defaultdict(set)
            for at in table.domain.attributes:
                atts = at.attributes
                for name in sortOrder:
                    all_values[name].add(
                        atts.get(reverse_header_dict[name], ""))

            def all_numeric(values):
                # True if every given value can be converted to float
                try:
                    for value in values:
                        float(value)
                except (TypeError, ValueError, OverflowError):
                    return False
                return True

            # Empty values are ignored when deciding whether a label is
            # numeric
            isnum = {at: all_numeric(filter(None, vals))
                     for at, vals in all_values.items()}

            def optfloat(x, at):
                if x == "":
                    return ""
                return float(x) if isnum[at] else x

            def sorting_key(attr):
                atts = attr.attributes
                return tuple(
                    optfloat(atts.get(reverse_header_dict[name], ""), name)
                    for name in sortOrder)

            attributes = sorted(table.domain.attributes, key=sorting_key)

            domain = Orange.data.Domain(attributes, table.domain.class_var,
                                        table.domain.metas)
            table = table.from_table(domain, table)

            data_hints.set_hint(table, "taxid", "352472")
            data_hints.set_hint(table, "genesinrows", False)

            self.send("Data", table)

            self.UpdateCached()
        finally:
            pb.finish()

    def onSelectionChanged(self, selected, deselected):
        """Update the commit button when the selection of rows changes.

        The `selected` and `deselected` arguments are not used.
        """
        self.handle_commit_button()

    def handle_commit_button(self):
        """Store the current selection and enable or disable the commit button.

        The button is disabled when the stored selection is empty.
        """
        # The selection is stored by key columns (1, 2, 3 and 10)
        self.currentSelection = \
            SelectionByKey(self.experimentsWidget.selectionModel().selection(),
                           key=(1, 2, 3, 10))
        self.commit_button.setDisabled(not len(self.currentSelection))

    def saveHeaderState(self):
        """Store, for every header label, whether its column is hidden."""
        header = self.experimentsWidget.header()
        for column, label in enumerate(self.headerLabels):
            is_hidden = header.isSectionHidden(column)
            self.experimentsHeaderState[label] = is_hidden

    def restoreHeaderState(self):
        """Hide or show columns according to the stored header state.

        Columns whose label is not in the stored state are hidden. Every
        column is also resized to its contents.
        """
        tree = self.experimentsWidget
        header = tree.header()
        hidden_by_label = self.experimentsHeaderState
        for column, label in enumerate(self.headerLabels):
            header.setSectionHidden(column, hidden_by_label.get(label, True))
            self.experimentsWidget.resizeColumnToContents(column)
Esempio n. 20
0
class OWConfusionMatrix(widget.OWWidget):
    """Show the confusion matrix of evaluation results and output selections.

    The matrix is laid out in a ``QStandardItemModel``: rows and columns 0-1
    hold the "Actual"/"Predicted" captions and the class labels, the cells
    from (2, 2) onward hold the matrix itself (row = actual class,
    column = predicted class), and the last row/column hold the sums.
    """
    name = "Confusion Matrix"
    description = "Display confusion matrix constructed from results " \
                  "of evaluation of classifiers."
    icon = "icons/ConfusionMatrix.svg"
    priority = 1001

    inputs = [("Evaluation Results", Orange.evaluation.Results, "set_results")]
    outputs = [("Selected Data", Orange.data.Table)]

    # Labels of the "Show" combo box; `selected_quantity` indexes this list.
    quantities = ["Number of instances",
                  "Proportion of predicted",
                  "Proportion of actual"]

    selected_learner = settings.Setting([])
    selected_quantity = settings.Setting(0)
    append_predictions = settings.Setting(True)
    append_probabilities = settings.Setting(False)
    autocommit = settings.Setting(True)

    UserAdviceMessages = [
        widget.Message(
                "Clicking on cells or in headers outputs the corresponding "
                "data instances",
                "click_cell")]

    def __init__(self):
        """Build the control area (learners, quantity, selection and output
        options) and the table view that displays the matrix."""
        super().__init__()

        self.data = None
        self.results = None
        self.learners = []
        self.headers = []

        box = gui.widgetBox(self.controlArea, "Learners")

        self.learners_box = gui.listBox(
            box, self, "selected_learner", "learners",
            callback=self._learner_changed
        )
        box = gui.widgetBox(self.controlArea, "Show")

        gui.comboBox(box, self, "selected_quantity", items=self.quantities,
                     callback=self._update)

        box = gui.widgetBox(self.controlArea, "Select")

        gui.button(box, self, "Correct",
                   callback=self.select_correct, autoDefault=False)
        gui.button(box, self, "Misclassified",
                   callback=self.select_wrong, autoDefault=False)
        gui.button(box, self, "None",
                   callback=self.select_none, autoDefault=False)

        self.outputbox = box = gui.widgetBox(self.controlArea, "Output")
        gui.checkBox(box, self, "append_predictions",
                     "Predictions", callback=self._invalidate)
        gui.checkBox(box, self, "append_probabilities",
                     "Probabilities",
                     callback=self._invalidate)

        gui.auto_commit(self.controlArea, self, "autocommit",
                        "Send Data", "Auto send is on")

        grid = QGridLayout()

        self.tablemodel = QStandardItemModel(self)
        view = self.tableview = QTableView(
            editTriggers=QTableView.NoEditTriggers)
        view.setModel(self.tablemodel)
        view.horizontalHeader().hide()
        view.verticalHeader().hide()
        view.horizontalHeader().setMinimumSectionSize(60)
        view.selectionModel().selectionChanged.connect(self._invalidate)
        view.setShowGrid(False)
        view.clicked.connect(self.cell_clicked)
        grid.addWidget(view, 0, 0)
        self.mainArea.layout().addLayout(grid)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(750, 490)

    def _item(self, i, j):
        """Return the model item at (i, j), or a fresh item if none exists."""
        return self.tablemodel.item(i, j) or QStandardItem()

    def _set_item(self, i, j, item):
        """Place `item` into the model at row `i`, column `j`."""
        self.tablemodel.setItem(i, j, item)

    def set_results(self, results):
        """Set the input results."""

        self.clear()
        self.warning([0, 1])

        data = None
        if results is not None:
            if results.data is not None:
                data = results.data

        if data is not None and not data.domain.has_discrete_class:
            data = None
            results = None
            self.warning(
                0, "Confusion Matrix cannot be used for regression results.")

        self.results = results
        self.data = data

        if data is not None:
            class_values = data.domain.class_var.values
        elif results is not None:
            # Results that carry no data are not supported.
            raise NotImplementedError

        if results is not None:
            nmodels, ntests = results.predicted.shape
            # One header per class value plus a trailing summation sign.
            self.headers = class_values + \
                           [unicodedata.lookup("N-ARY SUMMATION")]

            # NOTE: The 'learner_names' is set in 'Test Learners' widget.
            if hasattr(results, "learner_names"):
                self.learners = results.learner_names
            else:
                self.learners = ["Learner #%i" % (i + 1)
                                 for i in range(nmodels)]

            item = self._item(0, 2)
            item.setData("Predicted", Qt.DisplayRole)
            item.setTextAlignment(Qt.AlignCenter)
            item.setFlags(Qt.NoItemFlags)

            self._set_item(0, 2, item)
            item = self._item(2, 0)
            item.setData("Actual", Qt.DisplayRole)
            item.setTextAlignment(Qt.AlignHCenter | Qt.AlignBottom)
            item.setFlags(Qt.NoItemFlags)
            self.tableview.setItemDelegateForColumn(
                0, gui.VerticalItemDelegate())
            self._set_item(2, 0, item)
            self.tableview.setSpan(0, 2, 1, len(class_values))
            self.tableview.setSpan(2, 0, len(class_values), 1)

            # The four top-left corner cells are inert.
            for i in (0, 1):
                for j in (0, 1):
                    item = self._item(i, j)
                    item.setFlags(Qt.NoItemFlags)
                    self._set_item(i, j, item)

            # Write each label both into header row 1 and header column 1.
            for p, label in enumerate(self.headers):
                for i, j in ((1, p + 2), (p + 2, 1)):
                    item = self._item(i, j)
                    item.setData(label, Qt.DisplayRole)
                    item.setData(QBrush(QColor(208, 208, 208)),
                                 Qt.BackgroundColorRole)
                    item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    item.setFlags(Qt.ItemIsEnabled)
                    self._set_item(i, j, item)

            hor_header = self.tableview.horizontalHeader()
            if len(' '.join(self.headers)) < 120:
                hor_header.setResizeMode(QHeaderView.ResizeToContents)
            else:
                hor_header.setDefaultSectionSize(60)
            self.tablemodel.setRowCount(len(class_values) + 3)
            self.tablemodel.setColumnCount(len(class_values) + 3)
            self.selected_learner = [0]
            self._update()

    def clear(self):
        """Drop the stored results and data and empty the table model."""
        self.results = None
        self.data = None
        self.tablemodel.clear()
        self.headers = []
        # Clear learners last. This action will invoke `_learner_changed`
        # method
        self.learners = []

    def select_correct(self):
        """Select the diagonal cells of the table (from index 2 onward)."""
        selection = QItemSelection()
        n = self.tablemodel.rowCount()
        for i in range(2, n):
            index = self.tablemodel.index(i, i)
            selection.select(index, index)

        self.tableview.selectionModel().select(
            selection, QItemSelectionModel.ClearAndSelect
        )

    def select_wrong(self):
        """Select every off-diagonal cell of the table (from index 2 onward)."""
        selection = QItemSelection()
        n = self.tablemodel.rowCount()

        for i in range(2, n):
            for j in range(i + 1, n):
                # Select the cell and its mirror image across the diagonal.
                index = self.tablemodel.index(i, j)
                selection.select(index, index)
                index = self.tablemodel.index(j, i)
                selection.select(index, index)

        self.tableview.selectionModel().select(
            selection, QItemSelectionModel.ClearAndSelect
        )

    def select_none(self):
        """Clear the table selection."""
        self.tableview.selectionModel().clear()

    def cell_clicked(self, model_index):
        """Expand a click on a header or sum cell into a block selection.

        Clicking a corner (header/header or sum/sum) selects everything from
        (2, 2) to the last cell; clicking in the header or sum row selects
        that column; clicking in the header or sum column selects that row.
        Clicks in row 0 or column 0 are ignored.
        """
        i, j = model_index.row(), model_index.column()
        if not i or not j:
            return
        n = self.tablemodel.rowCount()
        index = self.tablemodel.index
        selection = None
        if i == j == 1 or i == j == n - 1:
            selection = QItemSelection(index(2, 2), index(n - 1, n - 1))
        elif i in (1, n - 1):
            selection = QItemSelection(index(2, j), index(n - 1, j))
        elif j in (1, n - 1):
            selection = QItemSelection(index(i, 2), index(i, n - 1))

        if selection is not None:
            self.tableview.selectionModel().select(
                selection, QItemSelectionModel.ClearAndSelect
            )

    def commit(self):
        """Send the instances that fall into the selected matrix cells.

        An instance is selected when its (actual, predicted) pair for the
        chosen learner matches a selected cell. Predictions and class
        probabilities are appended as meta columns when the corresponding
        options are on. ``None`` is sent when there are no results, no data
        or no selected learner.
        """
        if self.results is not None and self.data is not None \
                and self.selected_learner:
            indices = self.tableview.selectedIndexes()
            # Shift by the two header rows/columns to get class indices.
            indices = {(ind.row() - 2, ind.column() - 2) for ind in indices}
            actual = self.results.actual
            selected_learner = self.selected_learner[0]
            learner_name = self.learners[selected_learner]
            predicted = self.results.predicted[selected_learner]
            selected = [i for i, t in enumerate(zip(actual, predicted))
                        if t in indices]
            row_indices = self.results.row_indices[selected]

            extra = []
            class_var = self.data.domain.class_var
            metas = self.data.domain.metas

            if self.append_predictions:
                predicted = numpy.array(predicted[selected], dtype=object)
                extra.append(predicted.reshape(-1, 1))
                var = Orange.data.DiscreteVariable(
                    "{}({})".format(class_var.name, learner_name),
                    class_var.values
                )
                metas = metas + (var,)

            if self.append_probabilities and \
                    self.results.probabilities is not None:
                probs = self.results.probabilities[selected_learner, selected]
                extra.append(numpy.array(probs, dtype=object))
                pvars = [Orange.data.ContinuousVariable("p({})".format(value))
                         for value in class_var.values]
                metas = metas + tuple(pvars)

            X = self.data.X[row_indices]
            Y = self.data.Y[row_indices]
            M = self.data.metas[row_indices]
            row_ids = self.data.ids[row_indices]

            M = numpy.hstack((M,) + tuple(extra))
            domain = Orange.data.Domain(
                self.data.domain.attributes,
                self.data.domain.class_vars,
                metas
            )
            data = Orange.data.Table.from_numpy(domain, X, Y, M)
            data.ids = row_ids
            data.name = learner_name

        else:
            data = None

        self.send("Selected Data", data)

    def _invalidate(self):
        """Recompute and send the output after a selection/option change."""
        self.commit()

    def _learner_changed(self):
        """Redraw the matrix for the newly selected learner.

        The table selection is captured before the redraw and re-applied
        afterwards, then the output is committed.
        """
        # The selected learner has changed
        indices = self.tableview.selectedIndexes()
        self._update()
        selection = QItemSelection()
        for sel in indices:
            selection.select(sel, sel)
        self.tableview.selectionModel().select(
            selection, QItemSelectionModel.ClearAndSelect
        )
        self.commit()

    def _update(self):
        """Fill the table with the confusion matrix of the selected learner.

        Cell (i, j) of the matrix describes instances of actual class ``i``
        predicted as class ``j``. Depending on `selected_quantity` a cell
        shows the raw count, its share of the column total (proportion of
        predicted) or its share of the row total (proportion of actual).
        """
        # Update the displayed confusion matrix
        if self.results is not None and self.selected_learner:
            index = self.selected_learner[0]
            cmatrix = confusion_matrix(self.results, index)
            colsum = cmatrix.sum(axis=0)
            rowsum = cmatrix.sum(axis=1)
            total = rowsum.sum()

            if self.selected_quantity == 0:
                value = lambda i, j: int(cmatrix[i, j])
            elif self.selected_quantity == 1:
                # Proportion of predicted: normalize by the total of the
                # predicted class, i.e. the column `j` the cell lies in.
                value = lambda i, j: \
                    ("{:2.1f} %".format(100 * cmatrix[i, j] / colsum[j])
                     if colsum[j] else "N/A")
            elif self.selected_quantity == 2:
                # Proportion of actual: normalize by the total of the actual
                # class (row `i`); guard on that same divisor.
                value = lambda i, j: \
                    ("{:2.1f} %".format(100 * cmatrix[i, j] / rowsum[i])
                     if rowsum[i] else "N/A")
            else:
                assert False

            for i, row in enumerate(cmatrix):
                for j, _ in enumerate(row):
                    item = self._item(i + 2, j + 2)
                    item.setData(value(i, j), Qt.DisplayRole)
                    item.setToolTip("actual: {}\npredicted: {}".format(
                        self.headers[i], self.headers[j]))
                    item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    item.setFlags(Qt.ItemIsEnabled | Qt.ItemIsSelectable)
                    self._set_item(i + 2, j + 2, item)

            model = self.tablemodel
            font = model.invisibleRootItem().font()
            bold_font = QFont(font)
            bold_font.setBold(True)

            def sum_item(value):
                # Bold, enabled but not selectable cell holding a total.
                item = QStandardItem()
                item.setData(value, Qt.DisplayRole)
                item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                item.setFlags(Qt.ItemIsEnabled)
                item.setFont(bold_font)
                return item

            N = len(colsum)
            for i in range(N):
                model.setItem(N + 2, i + 2, sum_item(int(colsum[i])))
                model.setItem(i + 2, N + 2, sum_item(int(rowsum[i])))

            model.setItem(N + 2, N + 2, sum_item(int(total)))
Esempio n. 21
0
class OWDuplicates(widget.OWWidget):
    """Detect duplicate documents by cutting a hierarchical clustering.

    The widget takes a distance matrix, builds a linkage from it and merges
    documents whose linkage distance does not exceed the chosen threshold.
    It outputs the corpus annotated with cluster ids, the corpus with one
    document kept per cluster, and the documents of the selected cluster.
    """
    name = '重复文档检测'
    description = '检测和删除语料库中的重复文档'
    icon = 'icons/Duplicates.svg'
    priority = 700

    class Inputs:
        distances = Input("Distances", DistMatrix)

    class Outputs:
        corpus_without_duplicates = Output("Corpus Without Duplicates", Corpus)
        duplicates = Output("Duplicates Cluster", Corpus)
        corpus = Output("Corpus", Corpus)

    resizing_enabled = True

    class Error(OWWidget.Error):
        dist_matrix_invalid_shape = Msg('Duplicate detection only supports '
                                        'distances calculated between rows.')
        too_little_documents = Msg('More than one document is required.')

    # `linkage_method` indexes this list.
    LINKAGE = ['Single', 'Average', 'Complete', 'Weighted', 'Ward']
    linkage_method = settings.Setting(1)

    threshold = settings.Setting(.0)

    # Cluster variable domain role
    AttributeRole, ClassRole, MetaRole = 0, 1, 2
    CLUSTER_ROLES = ["Attributes", "Class", "Metas"]
    cluster_role = settings.Setting(2)

    def __init__(self):
        """Build the info box, histogram, cluster table and controls."""
        super().__init__()
        self.corpus = None  # corpus taken from distances
        self.linkage = None  # hierarchical clustering linkage as returned by Orange
        self.distances = None  # DistMatrix on input
        self.clustering_mask = None  # 1D array of clusters for self.corpus
        self.threshold_spin = None

        # Info
        self.n_documents = ''
        self.n_unique = ''
        self.n_duplicates = ''
        info_box = gui.widgetBox(self.controlArea, box='Info')
        gui.label(info_box, self, 'Documents: %(n_documents)s')
        gui.label(info_box, self, '  ◦ unique: %(n_unique)s')
        gui.label(info_box, self, '  ◦ duplicates: %(n_duplicates)s')

        # Threshold Histogram & Cluster View
        self.histogram = Histogram(self)
        self.table_view = gui.TableView(
            selectionMode=QListView.SingleSelection)
        self.table_model = PyTableModel()
        self.table_model.setHorizontalHeaderLabels(['Cluster', 'Size'])
        self.table_view.setModel(self.table_model)
        self.table_view.selectionModel().selectionChanged.connect(
            self.send_duplicates)

        # Add to main area
        height = 300
        main_area = gui.hBox(self.mainArea)
        self.histogram.setMinimumWidth(500)
        self.histogram.setMinimumHeight(height)
        self.table_view.setFixedWidth(140)
        main_area.layout().addWidget(self.histogram)
        main_area.layout().addWidget(self.table_view)

        # Controls
        gui.comboBox(self.controlArea,
                     self,
                     'linkage_method',
                     items=self.LINKAGE,
                     box='Linkage',
                     callback=self.recalculate_linkage,
                     orientation=Qt.Horizontal)
        self.threshold_spin = gui.doubleSpin(self.controlArea,
                                             self,
                                             'threshold',
                                             0,
                                             float('inf'),
                                             0.01,
                                             decimals=2,
                                             label='Distance threshold',
                                             box='Distances',
                                             callback=self.threshold_changed,
                                             keyboardTracking=False,
                                             controlWidth=60)
        self.histogram.region.sigRegionChangeFinished.connect(
            self.threshold_from_histogram_region)
        self.threshold_spin.setEnabled(False)
        gui.rubber(self.controlArea)

        # Output
        gui.comboBox(self.controlArea,
                     self,
                     "cluster_role",
                     box='Output',
                     label='Append Cluster IDs to:',
                     callback=self.send_corpus,
                     items=self.CLUSTER_ROLES)

    def reset(self):
        """Return the widget to its empty state and clear all outputs."""
        self.corpus = None
        self.linkage = None
        self.distances = None
        self.clustering_mask = None
        self.n_documents = ''
        self.n_unique = ''
        self.n_duplicates = ''
        self.threshold = 0
        self.threshold_spin.setEnabled(False)
        self.table_model.clear()
        self.histogram.setValues([])
        # Without this, downstream widgets would keep the results computed
        # from an input that has since been removed or rejected.
        self.Outputs.corpus.send(None)
        self.Outputs.corpus_without_duplicates.send(None)
        self.Outputs.duplicates.send(None)

    @Inputs.distances
    def set_distances(self, distances):
        """Handle a new distance matrix.

        The widget is reset when `distances` is ``None``, when fewer than two
        documents are present, or when the matrix is not square over the
        documents; otherwise the linkage is recalculated.
        """
        self.Error.clear()
        self.distances = distances
        if distances is None:
            self.reset()
            return

        self.corpus = self.distances.row_items
        self.n_documents = len(self.corpus)
        if self.n_documents < 2:
            self.Error.too_little_documents()
            self.reset()
            return
        if distances.shape != (self.n_documents, self.n_documents):
            self.Error.dist_matrix_invalid_shape()
            self.reset()
            return
        self.threshold_spin.setEnabled(True)
        self.recalculate_linkage()

    def threshold_from_histogram_region(self):
        """Take the threshold from the upper edge of the histogram region."""
        _, self.threshold = self.histogram.getRegion()
        self.threshold_changed()

    def threshold_changed(self):
        """Clamp the threshold to the histogram bounds and re-detect."""
        self.threshold = np.clip(self.threshold, *self.histogram.boundary())
        self.histogram.setRegion(0, self.threshold)
        self.detect_duplicates()

    def recalculate_linkage(self):
        """Recompute the linkage for the current method and refresh the view.

        Does nothing when there is no distance matrix.
        """
        if self.distances is not None:
            self.linkage = dist_matrix_linkage(
                self.distances, self.LINKAGE[self.linkage_method].lower())

            # Magnitude of the spinbox's step is data-dependent
            vals = sorted(self.linkage[:, 2])
            low, up = vals[0], vals[-1]
            step = (up - low) / 20

            self.threshold_spin.setSingleStep(step)
            self.threshold = np.clip(self.threshold, low, up)
            self.histogram.setValues(
                [])  # without this range breaks when changing linkages
            self.histogram.setValues(vals)
            self.histogram.setRegion(0, self.threshold)

            self.detect_duplicates()

    def detect_duplicates(self):
        """Cluster the documents and refresh the outputs and cluster table."""
        if self.distances is not None:
            self.cluster_linkage()
            self.send_corpus()
            self.send_corpus_without_duplicates()
            self.fill_cluster_view()

    def cluster_linkage(self):
        """Cut the linkage at the threshold and build the cluster mask.

        Linkage rows are applied in order until one exceeds the threshold.
        Afterwards ``self.clustering_mask[d]`` holds the cluster id
        (0 .. number of clusters - 1) of document ``d``.
        """
        # cluster documents
        n = int(self.n_documents)
        clusters = {j: [j] for j in range(n)}
        for i, (c1, c2, dist, _size) in enumerate(self.linkage):
            if dist > self.threshold:
                break
            # Row i of the linkage creates cluster n + i from c1 and c2.
            clusters[n + i] = clusters[c1] + clusters[c2]
            del clusters[c1]
            del clusters[c2]

        self.n_unique = len(clusters)
        self.n_duplicates = n - self.n_unique

        # create mask
        self.clustering_mask = np.empty(n, dtype=int)
        for i, c in enumerate(clusters.values()):
            self.clustering_mask[c] = i

    def fill_cluster_view(self):
        """Rebuild the cluster table, sorted by size, and select its top row."""
        self.table_model.clear()
        c = Counter(self.clustering_mask)
        for id_, count in c.items():
            self.table_model.append([Cluster(id_), count])
        self.table_view.sortByColumn(1, Qt.DescendingOrder)
        self.table_view.selectRow(0)

    def send_corpus(self):
        """Send the corpus with a cluster-id variable in the chosen role.

        Sends ``None`` when no clustering is available.
        """
        if self.clustering_mask is not None:
            # The mask is written into the column as value indices, so the
            # value list must be in ascending cluster-id order; np.unique
            # guarantees that, whereas set iteration order does not.
            cluster_var = DiscreteVariable(
                'Duplicates Cluster',
                values=[
                    str(Cluster(v))
                    for v in np.unique(self.clustering_mask)
                ])
            corpus, domain = self.corpus, self.corpus.domain
            attrs = domain.attributes
            class_ = domain.class_vars
            metas = domain.metas

            if self.cluster_role == self.AttributeRole:
                attrs = attrs + (cluster_var, )
            elif self.cluster_role == self.ClassRole:
                class_ = class_ + (cluster_var, )
            elif self.cluster_role == self.MetaRole:
                metas = metas + (cluster_var, )

            domain = Domain(attrs, class_, metas)
            corpus = corpus.from_table(domain, corpus)
            corpus.get_column_view(cluster_var)[0][:] = self.clustering_mask
            self.Outputs.corpus.send(corpus)
        else:
            self.Outputs.corpus.send(None)

    def send_corpus_without_duplicates(self):
        """Send the corpus reduced to the first document of every cluster.

        Sends ``None`` when no clustering is available.
        """
        if self.clustering_mask is not None:
            # TODO make this more general, currently we just take the first document
            # One pass over the mask: `first_rows` holds, per cluster id in
            # ascending order, the index of its first occurrence.
            _, first_rows = np.unique(self.clustering_mask, return_index=True)
            mask = list(first_rows)
            c = self.corpus[mask]
            c.name = '{} (Without Duplicates)'.format(self.corpus.name)
            self.Outputs.corpus_without_duplicates.send(c)
        else:
            self.Outputs.corpus_without_duplicates.send(None)

    def send_duplicates(self):
        """Send the documents of the cluster selected in the table.

        Sends ``None`` when nothing is selected or no clustering exists.
        """
        c = None
        indices = self.table_view.selectionModel().selectedIndexes()
        # Guard against a selection signal arriving after the state was reset.
        if indices and self.corpus is not None \
                and self.clustering_mask is not None:
            cluster = self.table_view.model().data(indices[0], Qt.EditRole)
            mask = np.flatnonzero(self.clustering_mask == cluster.id)
            c = self.corpus[mask]
            c.name = '{} {}'.format(self.Outputs.duplicates.name, cluster)
        self.Outputs.duplicates.send(c)

    def send_report(self):
        """Report the linkage, threshold and document counts."""
        self.report_items([
            ('Linkage', self.LINKAGE[self.linkage_method]),
            ('Distance threshold', '{:.2f}'.format(self.threshold)),
            ('Documents', self.n_documents),
            ('Unique', self.n_unique),
            ('Duplicates', self.n_duplicates),
        ])
Esempio n. 22
0
class OWPCA(widget.OWWidget):
    """Principal component analysis widget with a scree plot.

    The number of output components is chosen either directly
    (`ncomponents`, where 0 is the spin box's special "All" value) or through
    the share of explained variance (`variance_covered`, in percent).
    """
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050
    keywords = ["principal component analysis", "linear transformation"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed Data",
                                  Table,
                                  replaces=["Transformed data"])
        data = Output("Data", Table, default=True)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)

    settingsHandler = settings.DomainContextHandler()

    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    auto_commit = settings.Setting(True)
    normalize = settings.ContextSetting(True)
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")

    def __init__(self):
        """Build the component-selection controls, options and scree plot."""
        super().__init__()
        self.data = None

        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            1,
            MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Explained variance:", self.variance_spin)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(self.options_box,
                                          self,
                                          "normalize",
                                          "Normalize variables",
                                          callback=self._update_normalize)

        self.maxp_spin = gui.spin(self.options_box,
                                  self,
                                  "maxp",
                                  1,
                                  MAX_COMPONENTS,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_apply(self.controlArea, self, "auto_commit")

        self.plot = SliderGraph("Principal Components",
                                "Proportion of variance", self._on_cut_changed)

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    @Inputs.data
    def set_data(self, data):
        """Handle new input data.

        An ``SqlTable`` is downloaded (or sampled when it is large) into a
        ``Table``. Data without features or without instances raises the
        matching error and clears the outputs; otherwise the PCA is fitted.
        """
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.information()
        self.data = None
        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        if isinstance(data, Table):
            if not data.domain.attributes:
                self.Error.no_features()
                self.clear_outputs()
                return
            if not data:
                self.Error.no_instances()
                self.clear_outputs()
                return

        self.openContext(data)
        self._init_projector()

        self.data = data
        self.fit()

    def fit(self):
        """Fit the PCA on the current data and commit the result.

        When the cumulative explained variance is not finite the
        `trivial_components` warning is shown and no model is kept.
        """
        self.clear()
        self.Warning.trivial_components.clear()
        if self.data is None:
            return

        data = self.data

        if self.normalize:
            self._pca_projector.preprocessors = \
                self._pca_preprocessors + [preprocess.Normalize(center=False)]
        else:
            self._pca_projector.preprocessors = self._pca_preprocessors

        if not isinstance(data, SqlTable):
            pca = self._pca_projector(data)
            variance_ratio = pca.explained_variance_ratio_
            cumulative = numpy.cumsum(variance_ratio)

            if numpy.isfinite(cumulative[-1]):
                # 0 becomes available as the special "All" value.
                self.components_spin.setRange(0, len(cumulative))
                self._pca = pca
                self._variance_ratio = variance_ratio
                self._cumulative = cumulative
                self._setup_plot()
            else:
                self.Warning.trivial_components()

            self.unconditional_commit()

    def clear(self):
        """Forget the fitted model and cached transform; clear the plot."""
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self.plot.clear_plot()

    def clear_outputs(self):
        """Send ``None`` on the data outputs and the bare projector on PCA."""
        self.Outputs.transformed_data.send(None)
        self.Outputs.data.send(None)
        self.Outputs.components.send(None)
        self.Outputs.pca.send(self._pca_projector)

    def _setup_plot(self):
        """Draw the per-component and cumulative variance curves."""
        if self._pca is None:
            self.plot.clear_plot()
            return

        explained_ratio = self._variance_ratio
        explained = self._cumulative
        cutpos = self._nselected_components()
        p = min(len(self._variance_ratio), self.maxp)

        self.plot.update(numpy.arange(1, p + 1),
                         [explained_ratio[:p], explained[:p]],
                         [Qt.red, Qt.darkYellow],
                         cutpoint_x=cutpos,
                         names=LINE_NAMES)

        self._update_axis()

    def _on_cut_changed(self, components):
        """Update the selection after the cut line in the plot was moved.

        Nothing happens when the value is unchanged, when "All" (0) is
        selected, or when a model exists and the cut is at the last
        component.
        """
        if components == self.ncomponents \
                or self.ncomponents == 0 \
                or self._pca is not None \
                and components == len(self._variance_ratio):
            return

        self.ncomponents = components
        if self._pca is not None:
            var = self._cumulative[components - 1]
            if numpy.isfinite(var):
                self.variance_covered = int(var * 100)

        self._invalidate_selection()

    def _update_selection_component_spin(self):
        """Synchronize variance and cut line with the components spin."""
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        self.plot.set_cut_point(cut)
        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        """Synchronize component count and cut line with the variance spin."""
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        # Store a plain int in the setting rather than a numpy integer.
        cut = int(min(cut, len(self._cumulative)))
        self.ncomponents = cut
        self.plot.set_cut_point(cut)
        self._invalidate_selection()

    def _update_normalize(self):
        """Refit after the normalization option changed."""
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        """Create a fresh PCA projector configured for this widget."""
        self._pca_projector = PCA(n_components=MAX_COMPONENTS, random_state=0)
        self._pca_projector.component = self.ncomponents
        self._pca_preprocessors = PCA.preprocessors

    def _nselected_components(self):
        """Return the number of selected components."""
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            self.variance_covered = int(var_max * 100)
        else:
            # Store a plain int in the setting rather than a numpy integer.
            self.ncomponents = cut = int(numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1)
        return cut

    def _invalidate_selection(self):
        """Commit the outputs after the selection changed."""
        self.commit()

    def _update_axis(self):
        """Place at most roughly `axis_labels` ticks on the bottom axis."""
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        # Guard the divisor so that axis_labels == 1 cannot divide by zero.
        d = max((p - 1) // max(self.axis_labels - 1, 1), 1)
        axis.setTicks([[(i, str(i)) for i in range(1, p + 1, d)]])

    def commit(self):
        """Send the transformed data, components, augmented data and model.

        All data outputs are ``None`` when no PCA has been fitted; the
        projector is sent in every case.
        """
        transformed = data = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components) once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            # 0 is the special "All" value of the spin box; slicing with it
            # directly would output no components at all.
            n_components = self.ncomponents or len(self._variance_ratio)

            domain = Domain(transformed.domain.attributes[:n_components],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            # prevent caching new features by defining compute_value
            dom = Domain([
                ContinuousVariable(a.name, compute_value=lambda _: None)
                for a in self._pca.orig_domain.attributes
            ],
                         metas=[StringVariable(name='component')])
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(n_components)]],
                dtype=object).T
            components = Table(dom,
                               self._pca.components_[:n_components],
                               metas=metas)
            components.name = 'components'

            # Original data with the selected components appended as metas.
            data_dom = Domain(self.data.domain.attributes,
                              self.data.domain.class_vars,
                              self.data.domain.metas + domain.attributes)
            data = Table.from_numpy(
                data_dom, self.data.X, self.data.Y,
                numpy.hstack((self.data.metas, transformed.X)))

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.data.send(data)
        self.Outputs.pca.send(self._pca_projector)

    def send_report(self):
        """Report the settings and the scree plot; no-op without data."""
        if self.data is None:
            return
        self.report_items(
            (("Normalize data", str(self.normalize)), ("Selected components",
                                                       self.ncomponents),
             ("Explained variance", "{:.3f} %".format(self.variance_covered))))
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        """Repair and prune settings saved by older versions of the widget."""
        if "variance_covered" in settings:
            # Due to the error in gh-1896 the variance_covered was persisted
            # as a NaN value, causing a TypeError in the widgets `__init__`.
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                if numpy.isfinite(vc):
                    vc = int(vc)
                else:
                    vc = 100
                settings["variance_covered"] = vc
        if settings.get("ncomponents", 0) > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS

        # Remove old `decomposition_idx` when SVD was still included
        settings.pop("decomposition_idx", None)

        # Remove RemotePCA settings
        settings.pop("batch_size", None)
        settings.pop("address", None)
        settings.pop("auto_update", None)
Esempio n. 23
0
class OWFiltering(widget.OWWidget):
    """
    Widget that runs a digital filter over every audio clip of the input table

    The filter design, band type and numeric parameters are picked in the
    control area. Pressing "Filter" writes each filtered clip as a WAV file
    into a "filtered-<id>" directory created beside the path stored in the
    clip's second meta column, and sends a table pointing to the new files.
    """
    name = "Filtering"
    description = "Filter audio clips"
    priority = 2
    icon = "icons/filtering.png"

    inputs = [("Data", Orange.data.Table, "set_data")]
    filter_design_id = settings.Setting(0)
    band_type_id = settings.Setting(0)

    first_cutoff = settings.Setting(1000)
    second_cutoff = settings.Setting(1000)
    filter_order = settings.Setting(10)
    maximum_ripple = settings.Setting(10)
    minimum_attenuation = settings.Setting(10)

    outputs = [("Filtered data", Orange.data.Table)]

    want_main_area = False

    data = None

    def __init__(self):
        super().__init__()

        # Suffix of the per-widget output directory (fractional part of the
        # current timestamp) and the directories created so far.
        self.tmp_dir_id = str(time.time()).split(".")[-1]
        self.new_tmp_dirs = []

        info_box = gui.widgetBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(
            info_box, 'No data on input yet, waiting to get something.')

        self.filter_designs_combo = gui.comboBox(
            self.controlArea,
            self,
            "filter_design_id",
            box="Filter designs",
            items=list(filter_designs),
        )
        self.filter_designs_combo.activated.connect(self.onDesignChange)

        self.band_types_combo = gui.comboBox(
            self.controlArea,
            self,
            "band_type_id",
            box="Band types",
            items=list(band_types),
        )
        self.band_types_combo.activated.connect(self.onTypeChange)

        parameters_box = gui.widgetBox(self.controlArea, 'Parameters')

        def parameter_spin(attribute, label, **extra):
            # All parameter spin boxes share range, width and alignment.
            return gui.spin(
                parameters_box,
                self,
                attribute,
                minv=1,
                maxv=10000,
                controlWidth=80,
                alignment=Qt.AlignRight,
                label=label,
                **extra)

        self.first_cutoff_spin = parameter_spin(
            "first_cutoff", "First cutoff frequency [Hz]: ",
            spinType=float, decimals=2)
        self.second_cutoff_spin = parameter_spin(
            "second_cutoff", "Second cutoff frequency [Hz]: ",
            spinType=float, decimals=2)
        self.filter_order_spin = parameter_spin(
            "filter_order", "Order: ")
        self.maximum_ripple_spin = parameter_spin(
            "maximum_ripple", "Maximum ripple [dB]: ",
            spinType=float, decimals=2)
        self.minimum_attenuation_spin = parameter_spin(
            "minimum_attenuation", "Minimum attenuation [dB]: ",
            spinType=float, decimals=2)

        self.filter_button = gui.button(
            self.controlArea,
            self,
            "Filter",
            callback=lambda: self.call_filter(
                self.filter_designs_combo.currentText(),
                self.band_types_combo.currentText(),
                self.first_cutoff,
                self.second_cutoff,
                self.filter_order,
                self.maximum_ripple,
                self.minimum_attenuation))

        self.onDesignChange()

    def set_data(self, dataset):
        """
        Set data from input

        :param dataset: input data, or None when the input is removed
        :return: Void
        """

        if dataset is not None:
            self.info.setText('%d instances in input data set' % len(dataset))
            self.data = dataset
        else:
            # Forget the previous table so a later "Filter" click cannot
            # process data that is no longer connected.
            self.data = None
            self.info.setText(
                'No data on input yet, waiting to get something.')
            self.send("Filtered data", None)

    def allSpinHandle(self, handle):
        """
        Helper function which enables or disables all spin boxes at once

        :param handle: handle parameter (true -> enable, false -> disable)
        :return: Void
        """

        for spin in (self.first_cutoff_spin,
                     self.second_cutoff_spin,
                     self.filter_order_spin,
                     self.maximum_ripple_spin,
                     self.minimum_attenuation_spin):
            spin.setEnabled(handle)

    def onDesignChange(self):
        """
        When the design changes, it changes the options of the parameters

        :return: Void
        """

        # NOTE(review): the ids index into `filter_designs`; which design
        # each id stands for is not visible here -- confirm the ordering.
        self.allSpinHandle(True)
        if self.filter_design_id in (0, 1, 5):
            self.second_cutoff_spin.setEnabled(False)
            self.maximum_ripple_spin.setEnabled(False)
            self.minimum_attenuation_spin.setEnabled(False)
        elif self.filter_design_id == 2:
            self.second_cutoff_spin.setEnabled(False)
            self.minimum_attenuation_spin.setEnabled(False)
        elif self.filter_design_id == 3:
            self.second_cutoff_spin.setEnabled(False)
            self.maximum_ripple_spin.setEnabled(False)
        # The band type has the final say on the second cutoff spin box.
        self.onTypeChange()

    def onTypeChange(self):
        """
        When the type changes, it changes the options of the parameters

        :return: Void
        """

        # Only band types 2 and 3 use the second cutoff frequency.
        self.second_cutoff_spin.setEnabled(self.band_type_id in (2, 3))

    def call_filter(
            self,
            filter_type,
            filter_band,
            first_cutoff,
            second_cutoff,
            order,
            max_ripple,
            min_attenuation):
        """
        Call specified filter function on all audio clips

        Each filtered clip is peak-normalised, written as a 16-bit WAV file
        and referenced from the output table. Any exception raised while
        filtering is shown in the info label instead of being propagated.

        :param filter_type: type of filter
        :param filter_band:  band of filter
        :param first_cutoff: first cutoff frequency
        :param second_cutoff: second cutoff frequency
        :param order: filter order
        :param max_ripple: the maximum ripple
        :param min_attenuation: the minimum attenuation
        :return: Void
        """

        if self.data is None:
            return

        filterBand = filter_band.replace("-", "").lower()
        filterType = self.convertTypeToStr(filter_type)

        # Single-edge bands take one cutoff, the others a pair of cutoffs.
        if filterBand in ("lowpass", "highpass"):
            frequency = first_cutoff
        else:
            frequency = [first_cutoff, second_cutoff]

        # Ripple is passed for cheby1/ellip, attenuation for cheby2/ellip.
        design_options = {}
        if filterType in ("cheby1", "ellip"):
            design_options["rp"] = max_ripple
        if filterType in ("cheby2", "ellip"):
            design_options["rs"] = min_attenuation

        error = None

        self.X = []
        self.metas = []

        try:
            # Use the samples stored in X when there are any; otherwise load
            # each clip from the file named in its meta columns.
            has_samples = numpy.size(self.data.X) > 0
            for i, meta in enumerate(self.data.metas):
                sampling_rate = meta[-1]
                if has_samples:
                    input_data = self.data.X[i]
                else:
                    input_data = read(meta[1])[1]
                    if len(input_data.shape) > 1:
                        # Keep only the first channel of multi-channel clips.
                        input_data = input_data[:, 0]

                filtered = st.filter_signal(input_data,
                                            ftype=filterType,
                                            band=filterBand,
                                            order=order,
                                            frequency=frequency,
                                            sampling_rate=sampling_rate,
                                            **design_options)

                self.new_tmp_dir = os.path.dirname(
                    meta[1]) + os.sep + "filtered-" + self.tmp_dir_id + os.sep

                if not os.path.exists(self.new_tmp_dir):
                    os.makedirs(self.new_tmp_dir)
                    self.new_tmp_dirs.append(self.new_tmp_dir)

                filename = self.new_tmp_dir + meta[0] + ".wav"
                self.metas.append([meta[0],
                                   filename,
                                   meta[2],
                                   meta[3],
                                   meta[4]])

                # Scale by the absolute peak so the negative half-wave cannot
                # overflow int16, and leave an all-zero signal untouched
                # instead of dividing by zero.
                signal = numpy.asarray(filtered["signal"], dtype=float)
                peak = numpy.abs(signal).max() if signal.size else 0.0
                if peak > 0:
                    signal = signal / peak
                samples = (signal * (2 ** 15 - 1)).astype(numpy.int16)
                write(filename, sampling_rate, samples)

        except Exception as ex:
            # Widget boundary: report the failure in the UI rather than crash.
            error = ex

        if error is not None:
            self.info.setStyleSheet(error_red)
            self.info.setText("An error occurred:\n{}".format(error))
            return

        self.info.setStyleSheet(success_green)
        self.info.setText(
            filter_type +
            " " +
            filter_band +
            " " +
            "filter successful!")
        orange_table = Orange.data.Table.from_numpy(
            self.data.domain, numpy.empty((len(self.data.Y), 0), dtype=float),
            self.data.Y, self.metas
        )

        self.send("Filtered data", orange_table)

    def convertTypeToStr(self, filter_type):
        """
        Helper function which converts the displayed filter name to a coded string

        :param filter_type: type of filter
        :return: coded type of filter ("bessel" for any unlisted name)
        """

        codes = {
            "Finite Impulse Response": "FIR",
            "Butterworth": "butter",
            "Chebyshev 1": "cheby1",
            "Chebyshev 2": "cheby2",
            "Elliptic": "ellip",
        }
        return codes.get(filter_type, "bessel")

    def onDeleteWidget(self):
        """
        Delete temporarily written audio clips

        :return: Void
        """

        import shutil
        for tmp_dir in self.new_tmp_dirs:
            # Best-effort cleanup: a directory that is already gone must not
            # stop the widget from being deleted.
            shutil.rmtree(tmp_dir, ignore_errors=True)
        self.new_tmp_dirs = []
        super().onDeleteWidget()
class OWSVDPlusPlus(OWBaseLearner):
    """Learner widget for the SVD++ matrix factorization model.

    Besides the learner handled by the base class, it accepts an optional
    "Feedback information" table and outputs the P, Q and Y tables of the
    fitted model.
    """
    # Widget needs a name, or it is considered an abstract widget
    # and not shown in the menu.
    name = "SVD++"
    description = 'Matrix factorization model which makes use of implicit ' \
                  'feedback information'
    icon = "icons/svdplusplus.svg"
    priority = 80

    LEARNER = SVDPlusPlusLearner

    inputs = [("Feedback information", Table, "set_feedback")]

    outputs = [("P", Table), ("Q", Table), ("Y", Table)]

    # Parameters (general)
    num_factors = settings.Setting(10)
    num_iter = settings.Setting(15)
    learning_rate = settings.Setting(0.01)
    bias_learning_rate = settings.Setting(0.01)
    lmbda = settings.Setting(0.1)
    bias_lmbda = settings.Setting(0.1)
    feedback = None

    # Seed (Random state)
    RND_SEED, FIXED_SEED = range(2)
    seed_type = settings.Setting(RND_SEED)
    random_seed = settings.Setting(42)

    # SGD optimizers
    class _Optimizer:
        SGD, MOMENTUM, NAG, ADAGRAD, RMSPROP, ADADELTA, ADAM, ADAMAX = range(8)
        names = [
            'Vanilla SGD', 'Momentum', "Nesterov momentum", 'AdaGrad',
            'RMSprop', 'AdaDelta', 'Adam', 'Adamax'
        ]

    opt_type = settings.Setting(_Optimizer.SGD)
    momentum = settings.Setting(0.9)
    rho = settings.Setting(0.9)
    beta1 = settings.Setting(0.9)
    beta2 = settings.Setting(0.999)

    def add_main_layout(self):
        """Build the parameter, optimizer and random-state controls."""

        def int_spin(parent, attribute, label, minv, maxv):
            return gui.spin(parent,
                            self,
                            attribute,
                            minv,
                            maxv,
                            label=label,
                            alignment=Qt.AlignRight,
                            callback=self.settings_changed)

        def double_spin(parent, attribute, label, minv, maxv, step, decimals):
            return gui.doubleSpin(parent,
                                  self,
                                  attribute,
                                  minv=minv,
                                  maxv=maxv,
                                  step=step,
                                  label=label,
                                  decimals=decimals,
                                  alignment=Qt.AlignRight,
                                  controlWidth=90,
                                  callback=self.settings_changed)

        # First groupbox (Common parameters)
        box = gui.widgetBox(self.controlArea, "Parameters")
        int_spin(box, "num_factors", "Number of latent factors:", 1, 10000)
        int_spin(box, "num_iter", "Number of iterations:", 1, 10000)
        double_spin(box, "learning_rate", "Learning rate:",
                    1e-5, 1e+5, 1e-5, 5)
        double_spin(box, "bias_learning_rate", "     Bias learning rate:",
                    1e-5, 1e+5, 1e-5, 5)
        double_spin(box, "lmbda", "Regularization:",
                    1e-4, 1e+4, 1e-4, 4)
        double_spin(box, "bias_lmbda", "     Bias regularization:",
                    1e-4, 1e+4, 1e-4, 4)

        # Second groupbox (SGD optimizers)
        box = gui.widgetBox(self.controlArea, "SGD optimizers")

        gui.comboBox(box,
                     self,
                     "opt_type",
                     label="SGD optimizer: ",
                     items=self._Optimizer.names,
                     orientation=Qt.Horizontal,
                     addSpace=4,
                     callback=self._opt_changed)

        _m_comp = double_spin(box, "momentum", "Momentum:",
                              1e-4, 1e+4, 1e-4, 4)
        _r_comp = double_spin(box, "rho", "Rho:",
                              1e-4, 1e+4, 1e-4, 4)
        _b1_comp = double_spin(box, "beta1", "Beta 1:",
                               1e-5, 1e+5, 1e-4, 5)
        _b2_comp = double_spin(box, "beta2", "Beta 2:",
                               1e-5, 1e+5, 1e-4, 5)
        gui.rubber(box)
        # Order must match the columns of the table in _show_right_optimizer.
        self._opt_params = [_m_comp, _r_comp, _b1_comp, _b2_comp]
        self._show_right_optimizer()

        # Third groupbox (Random state)
        box = gui.widgetBox(self.controlArea, "Random state")
        rndstate = gui.radioButtons(box,
                                    self,
                                    "seed_type",
                                    callback=self.settings_changed)
        gui.appendRadioButton(rndstate, "Random seed")
        gui.appendRadioButton(rndstate, "Fixed seed")
        ibox = gui.indentedBox(rndstate)
        # Integer bounds: an integer spin box does not accept float limits
        # (the former -1e5 / 1e5 are rejected on Python >= 3.10).
        self.spin_rnd_seed = int_spin(ibox, "random_seed", "Seed:",
                                      -100000, 100000)
        self.settings_changed()  # Update (extra) settings

    def settings_changed(self):
        """Sync the seed spin box with the seed mode, then notify the base."""
        # Enable/Disable Fixed seed control
        self.spin_rnd_seed.setEnabled(self.seed_type == self.FIXED_SEED)
        super().settings_changed()

    def _show_right_optimizer(self):
        """Show only the hyper-parameter controls used by the optimizer."""
        # Columns: momentum, rho, beta1, beta2 (same order as _opt_params).
        visibility = [
            [False, False, False, False],  # SGD
            [True, False, False, False],  # Momentum
            [True, False, False, False],  # NAG
            [False, False, False, False],  # AdaGrad
            [False, True, False, False],  # RMSprop
            [False, True, False, False],  # AdaDelta
            [False, False, True, True],  # Adam
            [False, False, True, True],  # Adamax
        ]
        for spin, visible in zip(self._opt_params, visibility[self.opt_type]):
            spin.box.setVisible(visible)

    def _opt_changed(self):
        """React to a new optimizer choice in the combo box."""
        self._show_right_optimizer()
        self.settings_changed()

    def select_optimizer(self):
        """Return an optimizer instance for the selected `opt_type`.

        Unknown values fall back to plain SGD.
        """
        kinds = self._Optimizer
        if self.opt_type == kinds.MOMENTUM:
            return opt.Momentum(momentum=self.momentum)
        if self.opt_type == kinds.NAG:
            return opt.NesterovMomentum(momentum=self.momentum)
        if self.opt_type == kinds.ADAGRAD:
            return opt.AdaGrad()
        if self.opt_type == kinds.RMSPROP:
            return opt.RMSProp(rho=self.rho)
        if self.opt_type == kinds.ADADELTA:
            return opt.AdaDelta(rho=self.rho)
        if self.opt_type == kinds.ADAM:
            return opt.Adam(beta1=self.beta1, beta2=self.beta2)
        if self.opt_type == kinds.ADAMAX:
            return opt.Adamax(beta1=self.beta1, beta2=self.beta2)
        return opt.SGD()

    def create_learner(self):
        """Instantiate the learner from the current widget settings."""
        # Set random state
        if self.seed_type == self.FIXED_SEED:
            seed = self.random_seed
        else:
            seed = None

        return self.LEARNER(num_factors=self.num_factors,
                            num_iter=self.num_iter,
                            learning_rate=self.learning_rate,
                            bias_learning_rate=self.bias_learning_rate,
                            lmbda=self.lmbda,
                            bias_lmbda=self.bias_lmbda,
                            feedback=self.feedback,
                            optimizer=self.select_optimizer(),
                            random_state=seed,
                            callback=self.progress_callback)

    def get_learner_parameters(self):
        """Return (label, value) pairs describing the learner for reports."""
        return (
            ("Number of latent factors", self.num_factors),
            ("Number of iterations", self.num_iter),
            ("Learning rate", self.learning_rate),
            ("Bias learning rate", self.bias_learning_rate),
            ("Regularization", self.lmbda),
            ("Bias regularization", self.bias_lmbda),
            ("SGD optimizer", self._Optimizer.names[self.opt_type]),
        )

    def _check_data(self):
        """Validate `self.data` for rating models.

        Stores the result in `self.valid_data` and returns it; missing data
        counts as not valid. A failing check raises the widget's data error.
        """
        self.valid_data = False

        if self.data is not None:
            try:  # Check ratings data
                valid_ratings = format_data.check_data(self.data)
            except Exception as e:
                valid_ratings = False
                print('Error checking rating data: ' + str(e))

            if not valid_ratings:  # Check if it's valid
                self.Error.data_error("Data not valid for rating models.")
            else:
                self.valid_data = True

        return self.valid_data

    def update_learner(self):
        """Refresh the learner unless the connected data is invalid."""
        self._check_data()

        # If our method returns 'False', it could be because there is no data.
        # But when cross-validating, a learner is required, as the data is in
        # the widget Test&Score
        if self.valid_data or self.data is None:
            super().update_learner()

    def update_model(self):
        """Refit the model and send its P, Q and Y tables (or None)."""
        self._check_data()
        super().update_model()

        P = None
        Q = None
        Y = None
        # The base class may leave `self.model` unset (None) even for valid
        # data, e.g. when fitting fails; send None instead of crashing then.
        if self.valid_data and self.model is not None:
            P = self.model.getPTable()
            Q = self.model.getQTable()
            Y = self.model.getYTable()

        self.send("P", P)
        self.send("Q", Q)
        self.send("Y", Y)

    def progress_callback(self, *args, **kwargs):
        """Drive the progress bar; the first positional arg is the iteration.

        The bar is started when the iteration equals 1 and finished when it
        equals `num_iter`.
        """
        iteration = args[0]

        # Start/Finish progress bar
        if iteration == 1:  # Start it
            self.progressBarInit()

        if iteration == self.num_iter:  # Finish
            self.progressBarFinished()
            return

        if self.num_iter > 0:
            self.progressBarSet(int(iteration / self.num_iter * 100))

    def set_feedback(self, feedback):
        """Store the implicit-feedback table and rebuild the learner."""
        self.feedback = feedback
        self.update_learner()
Esempio n. 25
0
class OWSVMRegression(OWBaseSVM):
    """Learner widget for support vector regression (ε-SVR or ν-SVR)."""

    name = "SVM Regression"
    description = (
        "Support Vector Machines map inputs to higher-dimensional "
        "feature spaces that best map instances to a linear function.  ")
    icon = "icons/SVMRegression.svg"
    priority = 50

    LEARNER = SVRLearner

    outputs = [("Support vectors", Table, widget.Explicit)]

    #: Identifiers of the two SVR formulations
    Epsilon_SVR, Nu_SVR = range(2)
    #: Currently selected formulation
    svrtype = settings.Setting(Epsilon_SVR)
    #: Cost (C) used by ε-SVR
    epsilon_C = settings.Setting(1.0)
    #: Loss epsilon (ε) used by ε-SVR
    epsilon = settings.Setting(0.1)
    #: Cost (C) used by ν-SVR
    nu_C = settings.Setting(1.0)
    #: Complexity bound (ν) used by ν-SVR
    nu = settings.Setting(0.5)

    def _add_type_box(self):
        """Create the "SVR Type" box with one radio button and two spin
        boxes per formulation, laid out on a grid."""
        grid = QGridLayout()
        type_box = gui.radioButtonsInBox(
            self.controlArea, self, "svrtype", [],
            box="SVR Type", orientation=grid)
        self.type_box = type_box

        def radio(text):
            return gui.appendRadioButton(type_box, text, addToLayout=False)

        def spin(attribute, low, high, step):
            return gui.doubleSpin(type_box, self, attribute, low, high, step,
                                  decimals=2, addToLayout=False)

        def labelled_row(row, text, control):
            # Label in column 1, its control in column 2.
            grid.addWidget(QLabel(text), row, 1, Qt.AlignRight)
            grid.addWidget(control, row, 2)

        # Rows 0-1: ε-SVR
        self.epsilon_radio = radio("ε-SVR")
        self.epsilon_C_spin = spin("epsilon_C", 0.1, 512.0, 0.1)
        self.epsilon_spin = spin("epsilon", 0.1, 512.0, 0.1)
        grid.addWidget(self.epsilon_radio, 0, 0, Qt.AlignLeft)
        labelled_row(0, "Cost (C):", self.epsilon_C_spin)
        labelled_row(1, "Loss epsilon (ε):", self.epsilon_spin)

        # Rows 2-3: ν-SVR
        self.nu_radio = radio("ν-SVR")
        self.nu_C_spin = spin("nu_C", 0.1, 512.0, 0.1)
        self.nu_spin = spin("nu", 0.05, 1.0, 0.05)
        grid.addWidget(self.nu_radio, 2, 0, Qt.AlignLeft)
        labelled_row(2, "Cost (C):", self.nu_C_spin)
        labelled_row(3, "Complexity bound (ν):", self.nu_spin)

    def create_learner(self):
        """Return an SVR learner configured from the widget settings."""
        kernel_names = ("linear", "poly", "rbf", "sigmoid")
        shared = {
            "kernel": kernel_names[self.kernel_type],
            "degree": self.degree,
            # A falsy gamma means "use the default".
            "gamma": self.gamma or self._default_gamma,
            "coef0": self.coef0,
            "tol": self.tol,
            "preprocessors": self.preprocessors,
        }
        if self.svrtype != OWSVMRegression.Epsilon_SVR:
            return NuSVRLearner(C=self.nu_C, nu=self.nu, **shared)
        return SVRLearner(C=self.epsilon_C, epsilon=self.epsilon, **shared)

    def get_learner_parameters(self):
        """Return an ordered mapping of report labels to parameter text."""
        if self.svrtype == OWSVMRegression.Epsilon_SVR:
            type_text = "ε-SVR, C={}, ε={}".format(
                self.epsilon_C, self.epsilon)
        else:
            type_text = "ν-SVR, C={}, ν={}".format(self.nu_C, self.nu)

        report = OrderedDict()
        report["SVM type"] = type_text
        self._report_kernel_parameters(report)
        report["Numerical tolerance"] = "{:.6}".format(self.tol)
        return report
Esempio n. 26
0
class OWPredictions(OWWidget):
    """Widget that shows model predictions next to the input data."""
    name = "Predictions"
    icon = "icons/Predictions.svg"
    priority = 200
    description = "Display the predictions of models for an input data set."
    # "Data" is handled by `set_data`; "Predictors" accepts multiple
    # connections, each handled by `set_predictor`.
    inputs = [("Data", Orange.data.Table, "set_data"),
              ("Predictors", Model,
               "set_predictor", widget.Multiple)]
    outputs = [("Predictions", Orange.data.Table),
               ("Evaluation Results", Orange.evaluation.Results)]

    class Warning(OWWidget.Warning):
        # Shown by `set_data` when the input table has no rows.
        empty_data = Msg("Empty data set")

    class Error(OWWidget.Error):
        # The placeholder receives the per-predictor failure messages.
        predictor_failed = Msg("One or more predictors failed (see more...)\n{}")

    # Context settings are opened with the class variable in `set_data`.
    settingsHandler = settings.ClassValuesContextHandler()
    #: Display the full input dataset or only the target variable columns (if
    #: available)
    show_attrs = settings.Setting(True)
    #: Show predicted values (for discrete target variable)
    show_predictions = settings.Setting(True)
    #: Show predictions probabilities (for discrete target variable)
    show_probabilities = settings.Setting(True)
    #: List of selected class value indices in the "Show probabilities" list
    selected_classes = settings.ContextSetting([])
    #: Draw colored distribution bars
    draw_dist = settings.Setting(True)

    #: Include the original attributes in the "Predictions" output
    output_attrs = settings.Setting(True)
    #: Include predicted values in the "Predictions" output
    output_predictions = settings.Setting(True)
    #: Include predicted probabilities in the "Predictions" output
    output_probabilities = settings.Setting(True)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        super().__init__()

        #: Input data table
        self.data = None  # type: Optional[Orange.data.Table]
        #: A dict mapping input ids to PredictorSlot
        self.predictors = OrderedDict()  # type: Dict[object, PredictorSlot]
        #: A class variable (prediction target)
        self.class_var = None  # type: Optional[Orange.data.Variable]
        #: List of (discrete) class variable's values
        self.class_values = []  # type: List[str]

        box = gui.vBox(self.controlArea, "Info")
        self.infolabel = gui.widgetLabel(
            box, "No data on input.\nPredictors: 0\nTask: N/A")
        self.infolabel.setMinimumWidth(150)
        gui.button(box, self, "Restore Original Order",
                   callback=self._reset_order,
                   tooltip="Show rows in the original order")

        # Options that only apply to discrete targets; `set_data` toggles
        # the visibility of this box.
        self.classification_options = box = gui.vBox(
            self.controlArea, "Show", spacing=-1, addSpace=False)

        gui.checkBox(box, self, "show_predictions", "Predicted class",
                     callback=self._update_prediction_delegate)
        b = gui.checkBox(box, self, "show_probabilities",
                         "Predicted probabilities for:",
                         callback=self._update_prediction_delegate)
        ibox = gui.indentedBox(box, sep=gui.checkButtonOffsetHint(b),
                               addSpace=False)
        gui.listBox(ibox, self, "selected_classes", "class_values",
                    callback=self._update_prediction_delegate,
                    selectionMode=QListWidget.MultiSelection,
                    addSpace=False)
        gui.checkBox(box, self, "draw_dist", "Draw distribution bars",
                     callback=self._update_prediction_delegate)

        box = gui.vBox(self.controlArea, "Data View")
        gui.checkBox(box, self, "show_attrs", "Show full data set",
                     callback=self._update_column_visibility)

        box = gui.vBox(self.controlArea, "Output", spacing=-1)
        # The "Original data" check box is not kept as an attribute: it used
        # to be bound to `self.checkbox_class` and was immediately
        # overwritten by the "Predictions" check box below.
        gui.checkBox(
            box, self, "output_attrs", "Original data",
            callback=self.commit)
        self.checkbox_class = gui.checkBox(
            box, self, "output_predictions", "Predictions",
            callback=self.commit)
        self.checkbox_prob = gui.checkBox(
            box, self, "output_probabilities", "Probabilities",
            callback=self.commit)

        gui.rubber(self.controlArea)

        self.splitter = QSplitter(
            orientation=Qt.Horizontal,
            childrenCollapsible=False,
            handleWidth=2,
        )
        self.dataview = TableView(
            verticalScrollBarPolicy=Qt.ScrollBarAlwaysOn,
            horizontalScrollBarPolicy=Qt.ScrollBarAlwaysOn,
            horizontalScrollMode=QTableView.ScrollPerPixel,
            selectionMode=QTableView.NoSelection,
            focusPolicy=Qt.StrongFocus
        )
        self.predictionsview = TableView(
            verticalScrollBarPolicy=Qt.ScrollBarAlwaysOff,
            horizontalScrollBarPolicy=Qt.ScrollBarAlwaysOn,
            horizontalScrollMode=QTableView.ScrollPerPixel,
            selectionMode=QTableView.NoSelection,
            focusPolicy=Qt.StrongFocus,
            sortingEnabled=True,
        )

        self.predictionsview.setItemDelegate(PredictionsItemDelegate())
        self.dataview.verticalHeader().hide()

        # Keep the vertical scroll positions of the two views in sync.
        dsbar = self.dataview.verticalScrollBar()
        psbar = self.predictionsview.verticalScrollBar()

        psbar.valueChanged.connect(dsbar.setValue)
        dsbar.valueChanged.connect(psbar.setValue)

        # Same default row height in both views; row resizes in the data
        # view are mirrored to the predictions view.
        self.dataview.verticalHeader().setDefaultSectionSize(22)
        self.predictionsview.verticalHeader().setDefaultSectionSize(22)
        self.dataview.verticalHeader().sectionResized.connect(
            lambda index, _, size:
            self.predictionsview.verticalHeader().resizeSection(index, size)
        )

        self.splitter.addWidget(self.predictionsview)
        self.splitter.addWidget(self.dataview)

        self.mainArea.layout().addWidget(self.splitter)

    @check_sql_input
    def set_data(self, data):
        """Handle a new table on the "Data" input.

        An empty table is treated like no data and raises the
        ``empty_data`` warning.  The data view model, the class-dependent
        options and the context settings are reset, and all cached
        predictor results are invalidated.
        """
        if data is not None and not len(data):
            self.Warning.empty_data()
            data = None
        else:
            self.Warning.empty_data.clear()

        self.data = data
        target = None if data is None else data.domain.class_var
        self.class_var = target
        # Dropping the model first forces a full reset of the view's
        # HeaderView state.
        self.dataview.setModel(None)
        if data is None:
            self.predictionsview.setModel(None)
            self.predictionsview.setItemDelegate(PredictionsItemDelegate())
        else:
            source = TableModel(data, parent=None)
            sortable = TableSortProxyModel()
            sortable.setSourceModel(source)
            self.dataview.setModel(sortable)
            self._update_column_visibility()

        is_classification = target is not None and target.is_discrete
        self.classification_options.setVisible(is_classification)

        self.closeContext()
        if not is_classification:
            self.class_values = []
            self.selected_classes = []
        else:
            self.class_values = list(target.values)
            # Select every class by default; the context opened below may
            # override this selection.
            self.selected_classes = list(range(len(self.class_values)))
            self.openContext(self.class_var)

        self._invalidate_predictions()

    def set_predictor(self, predictor=None, id=None):
        """Add, replace or remove the predictor connected under `id`.

        A ``None`` predictor removes the slot if it exists.  Otherwise the
        slot is created or updated, and its cached results are cleared.
        """
        if predictor is None:
            # Disconnection: drop the slot when present, otherwise no-op.
            self.predictors.pop(id, None)
            return

        if id not in self.predictors:
            self.predictors[id] = \
                PredictorSlot(predictor, predictor.name, None)
        else:
            current = self.predictors[id]
            self.predictors[id] = current._replace(
                predictor=predictor, name=predictor.name, results=None)

    def handleNewSignals(self):
        """Recompute predictions and refresh views, messages and outputs.

        NOTE(review): presumably invoked by the widget framework after the
        pending input signals have been delivered -- confirm against OWWidget.
        """
        if self.data is not None:
            # Only predictors without cached results are evaluated.
            self._call_predictors()
        self._update_predictions_model()
        self._update_prediction_delegate()
        self._set_errors()
        self._update_info()
        self.commit()

    def _call_predictors(self):
        """Evaluate every predictor that has no cached results.

        A slot's ``results`` becomes either the output of `predict` or an
        error string (target mismatch, or a ``ValueError`` raised during
        prediction).
        """
        for key, slot in self.predictors.items():
            if slot.results is not None:
                continue
            model = slot.predictor
            try:
                target = model.domain.class_var
                if target != self.class_var:
                    outcome = "{}: mismatching target ({})".format(
                        model.name, target.name)
                else:
                    outcome = self.predict(model, self.data)
            except ValueError as err:
                outcome = "{}: {}".format(model.name, err)
            self.predictors[key] = slot._replace(results=outcome)

    def _set_errors(self):
        """Show or clear the ``predictor_failed`` error.

        Slots whose ``results`` is a string are failures; their messages
        are joined with newlines into the error text.
        """
        failures = [slot.results for slot in self.predictors.values()
                    if isinstance(slot.results, str)]
        message = "\n".join(failures)
        if not message:
            self.Error.predictor_failed.clear()
            return
        self.Error.predictor_failed(message)

    def _update_info(self):
        """Refresh the info label and the state of the output check boxes.

        The label reports the number of data instances, the number of
        working (and failed) predictors and the task type.  The
        "Predictions" and "Probabilities" check boxes are enabled only for
        a discrete target, and left untouched when there is no target.
        """
        if self.data is None:
            lines = ["Data: N/A"]
        else:
            lines = ["Data: {} instances.".format(len(self.data))]

        total = len(self.predictors)
        working = len(self._valid_predictors())
        failed = total - working
        if failed:
            lines.append(
                "Predictors: {} (+ {} failed)".format(working, failed))
        else:
            lines.append("Predictors: {}".format(total or "N/A"))

        target = self.class_var
        if target is None:
            lines.append("Task: N/A")
        else:
            classification = bool(target.is_discrete)
            lines.append("Task: Classification" if classification
                         else "Task: Regression")
            self.checkbox_class.setEnabled(classification)
            self.checkbox_prob.setEnabled(classification)

        self.infolabel.setText("\n".join(lines))

    def _invalidate_predictions(self):
        """Clear the cached results of every predictor slot."""
        # Iterate over a snapshot of the keys since entries are reassigned.
        for key in list(self.predictors):
            slot = self.predictors[key]
            self.predictors[key] = slot._replace(results=None)

    def _valid_predictors(self):
        """Return the slots that hold actual prediction results.

        Slots with no results yet (``None``) and slots whose results are an
        error message (``str``) are left out.
        """
        usable = []
        for slot in self.predictors.values():
            outcome = slot.results
            if outcome is None or isinstance(outcome, str):
                continue
            usable.append(slot)
        return usable

    def _update_predictions_model(self):
        """Rebuild the model shown in the predictions view."""
        model = None
        if self.data is not None:
            slots = self._valid_predictors()
            target = self.class_var
            per_predictor = []
            for slot in slots:
                values, prob = slot.results
                if self.class_var.is_discrete:
                    # Wrap raw predictions as values of the target variable.
                    values = [Value(target, v) for v in values]
                per_predictor.append((values, prob))
            # Transpose from per-predictor (values, probs) pairs to one
            # entry per data row.
            rows = list(zip(*(zip(*pair) for pair in per_predictor)))
            titles = [slot.name for slot in slots]
            model = PredictionsModel(rows, titles)

        proxy = PredictionsSortProxyModel()
        proxy.setSourceModel(model)
        proxy.setDynamicSortFilter(True)
        self.predictionsview.setItemDelegate(PredictionsItemDelegate())
        self.predictionsview.setModel(proxy)
        header = self.predictionsview.horizontalHeader()
        header.setSortIndicatorShown(False)
        # SortFilterProxyModel is slow due to large abstraction overhead
        # (every comparison triggers multiple `model.index(...)`,
        # model.rowCount(...), `model.parent`, ... calls)
        allow_sorting = proxy.rowCount() < 20000
        header.setSectionsClickable(allow_sorting)

        proxy.layoutChanged.connect(self._update_data_sort_order)
        self._update_data_sort_order()
        self.predictionsview.resizeColumnsToContents()

    def _update_column_visibility(self):
        """Show or hide the attribute columns of the data view.

        Columns from index ``len(class_vars) + len(metas)`` onwards are
        hidden unless `show_attrs` is set.  Column 0 is always shown when
        the domain has a class variable.
        """
        if self.data is None:
            return

        domain = self.data.domain
        offset = len(domain.class_vars) + len(domain.metas)
        hidden = not self.show_attrs
        for column in range(offset, offset + len(domain.attributes)):
            self.dataview.setColumnHidden(column, hidden)
        if domain.class_var:
            self.dataview.setColumnHidden(0, False)

    def _update_data_sort_order(self):
        """Update data row order to match the current predictions view order.

        When the predictions proxy is sorted on some column, the matching
        row permutation is passed to the data view's proxy model; otherwise
        the data proxy gets ``None`` as its sort indices.  The sort
        indicator on the predictions header is shown only while a sort
        order is active.
        """
        datamodel = self.dataview.model()  # data model proxy
        predmodel = self.predictionsview.model()  # predictions model proxy
        sortindicatorshown = False
        if datamodel is not None:
            assert isinstance(datamodel, TableSortProxyModel)
            n = datamodel.rowCount()
            if predmodel is not None and predmodel.sortColumn() >= 0:
                sortind = numpy.argsort(
                    [predmodel.mapToSource(predmodel.index(i, 0)).row()
                     for i in range(n)])
                # `numpy.int` (an alias of the builtin) was removed in
                # NumPy 1.24; the builtin `int` is the equivalent dtype.
                sortind = numpy.array(sortind, dtype=int)
                sortindicatorshown = True
            else:
                sortind = None

            datamodel.setSortIndices(sortind)

        self.predictionsview.horizontalHeader() \
            .setSortIndicatorShown(sortindicatorshown)

    def _reset_order(self):
        """Restore the original input row order in both views."""
        proxies = (self.dataview.model(), self.predictionsview.model())
        for proxy in proxies:
            if proxy is not None:
                # Column -1 removes the sorting.
                proxy.sort(-1)
        header = self.predictionsview.horizontalHeader()
        header.setSortIndicatorShown(False)

    def _update_prediction_delegate(self):
        """Install a fresh item delegate reflecting the display options.

        With no target variable the delegate is left alone; the splitter
        is adjusted in either case.
        """
        target = self.class_var
        if target is not None:
            delegate = PredictionsItemDelegate()
            if not target.is_continuous:
                self._setup_delegate_discrete(delegate)
                proxy = self.predictionsview.model()
                if proxy is not None:
                    # Tell the proxy which class probabilities are shown.
                    shown = numpy.array(self.selected_classes, dtype=int)
                    proxy.setProbInd(shown)
            else:
                self._setup_delegate_continuous(delegate)
            self.predictionsview.setItemDelegate(delegate)
            self.predictionsview.resizeColumnsToContents()
        self._update_spliter()

    def _setup_delegate_discrete(self, delegate):
        """Configure `delegate` for a discrete target and return it.

        The display format lists the probabilities of the selected classes
        and/or the predicted value, depending on the "Show" options.  Class
        colors are set when distribution bars are enabled.
        """
        colors = [QtGui.QColor(*rgb) for rgb in self.class_var.colors]
        parts = []
        if self.show_probabilities:
            fields = ["{{dist[{}]:.2f}}".format(i)
                      for i in sorted(self.selected_classes)]
            parts.append(" : ".join(fields))
        if self.show_predictions:
            parts.append("{value!s}")
        delegate.setFormat(" \N{RIGHTWARDS ARROW} ".join(parts))
        if self.draw_dist and colors is not None:
            delegate.setColors(colors)
        return delegate

    def _setup_delegate_continuous(self, delegate):
        """Configure `delegate` to print values with the target's decimals."""
        decimals = self.class_var.number_of_decimals
        template = "{{value:.{}f}}".format(decimals)
        delegate.setFormat(template)

    def _update_spliter(self):
        """Resize the splitter so the predictions view fits its columns.

        Does nothing without data.  The predictions pane gets the combined
        width of its headers plus 4; the data pane gets the remainder.
        """
        if self.data is None:
            return

        view = self.predictionsview
        columns_width = view.horizontalHeader().length()
        row_header_width = view.verticalHeader().width()
        wanted = columns_width + row_header_width + 4
        first, second = self.splitter.sizes()
        self.splitter.setSizes([wanted, first + second - wanted])

    def commit(self):
        """Send both outputs: the predictions table and evaluation results."""
        self._commit_predictions()
        self._commit_evaluation_results()

    def _commit_evaluation_results(self):
        """Send an ``Orange.evaluation.Results`` built from the predictions.

        Rows with a missing (NaN) class value are left out.  ``None`` is
        sent when no predictor produced results.
        """
        target = self.class_var
        slots = self._valid_predictors()
        if not slots:
            self.send("Evaluation Results", None)
            return

        known = ~numpy.isnan(self.data.get_column_view(target)[0])
        data = self.data[known]
        results = Orange.evaluation.Results(data, store_data=True)
        results.folds = None
        results.row_indices = numpy.arange(len(data))
        results.actual = data.Y.ravel()
        # One row of predictions per predictor.
        predicted_rows = tuple(slot.results[0][known] for slot in slots)
        results.predicted = numpy.vstack(predicted_rows)
        if target and target.is_discrete:
            results.probabilities = numpy.array(
                [slot.results[1][known] for slot in slots])
        results.learner_names = [slot.name for slot in slots]
        self.send("Evaluation Results", results)

    def _commit_predictions(self):
        """Send the input data extended with prediction meta columns.

        ``None`` is sent when no predictor produced results.  The original
        attributes are included only when `output_attrs` is set.
        """
        slots = self._valid_predictors()
        if not slots:
            self.send("Predictions", None)
            return

        target = self.class_var
        if target and target.is_discrete:
            build_columns = self._classification_output_columns
        else:
            build_columns = self._regression_output_columns
        newmetas, newcolumns = build_columns()

        source_domain = self.data.domain
        attrs = list(source_domain.attributes) if self.output_attrs else []
        metas = list(source_domain.metas) + newmetas
        domain = Orange.data.Domain(attrs, target, metas=metas)
        predictions = self.data.from_table(domain, self.data)
        if newcolumns:
            stacked = numpy.hstack(
                [numpy.atleast_2d(column) for column in newcolumns])
            # The new metas were appended last, so fill the trailing columns.
            predictions.metas[:, -stacked.shape[1]:] = stacked
        self.send("Predictions", predictions)

    def _classification_output_columns(self):
        """Return ``(newmetas, newcolumns)`` for a discrete target.

        Depending on the output options, this adds one discrete variable
        per predictor (predicted class) and one continuous variable per
        predictor and class value (probability).
        """
        slots = self._valid_predictors()
        newmetas, newcolumns = [], []

        if self.output_predictions:
            newmetas.extend(
                DiscreteVariable(name=slot.name, values=self.class_values)
                for slot in slots)
            newcolumns.extend(
                slot.results[0].reshape((-1, 1)) for slot in slots)

        if self.output_probabilities:
            newmetas.extend(
                ContinuousVariable(name="%s(%s)" % (slot.name, value))
                for slot in slots for value in self.class_values)
            newcolumns.extend(slot.results[1] for slot in slots)

        return newmetas, newcolumns

    def _regression_output_columns(self):
        """Return ``(newmetas, newcolumns)`` for a continuous target.

        One continuous variable and one column of predicted values is
        produced for every predictor with valid results.
        """
        slots = self._valid_predictors()
        newmetas = [ContinuousVariable(name=slot.name) for slot in slots]
        predicted = [slot.results[0] for slot in slots]
        newcolumns = [values.reshape((-1, 1)) for values in predicted]
        return newmetas, newcolumns

    def send_report(self):
        """Add the info text and the data/predictions table to the report."""

        def merge_data_with_predictions():
            """Yield the header row, then one merged row per data row."""
            data_model = self.dataview.model()
            pred_model = self.predictionsview.model()

            def styled(value):
                # use ItemDelegate to style prediction values
                delegate = self.predictionsview.itemDelegate()
                return delegate.displayText(value, QLocale())

            # iterate only over visible columns of data's QTableView
            visible_cols = [col for col in range(data_model.columnCount())
                            if not self.dataview.isColumnHidden(col)]

            # header row
            header = ['']
            header.extend(
                pred_model.headerData(col, Qt.Horizontal, Qt.DisplayRole)
                for col in range(pred_model.columnCount()))
            header.extend(
                data_model.headerData(col, Qt.Horizontal, Qt.DisplayRole)
                for col in visible_cols)
            yield header

            # data & predictions rows
            for row in range(data_model.rowCount()):
                line = [data_model.headerData(
                    row, Qt.Vertical, Qt.DisplayRole)]
                line.extend(
                    styled(pred_model.data(pred_model.index(row, col)))
                    for col in range(pred_model.columnCount()))
                line.extend(
                    data_model.data(data_model.index(row, col))
                    for col in visible_cols)
                yield line

        if self.data is None:
            return

        text = self.infolabel.text().replace('\n', '<br>')
        if self.show_probabilities and self.selected_classes:
            shown = [self.class_values[i] for i in self.selected_classes]
            text += '<br>Showing probabilities for: '
            text += ', '.join(shown)
        self.report_paragraph('Info', text)
        self.report_table("Data & Predictions", merge_data_with_predictions(),
                          header_rows=1, header_columns=1)

    @classmethod
    def predict(cls, predictor, data):
        """Run `predictor` on `data` according to its target type.

        Returns the result of `predict_discrete` or `predict_continuous`.
        Returns ``None`` when the predictor's domain has no class variable
        or the variable is neither discrete nor continuous.
        """
        class_var = predictor.domain.class_var
        if not class_var:
            return None
        if class_var.is_discrete:
            return cls.predict_discrete(predictor, data)
        if class_var.is_continuous:
            return cls.predict_continuous(predictor, data)
        return None

    @staticmethod
    def predict_discrete(predictor, data):
        """Call `predictor` on `data`, requesting ``Model.ValueProbs``.

        NOTE(review): presumably returns a (values, probabilities) pair,
        since callers unpack the results into two items -- confirm against
        the Model API.
        """
        return predictor(data, Model.ValueProbs)

    @staticmethod
    def predict_continuous(predictor, data):
        """Return predicted values plus a placeholder for probabilities.

        The second item is a list of ``None``, one entry per data row.
        """
        predicted = predictor(data, Model.Value)
        no_probabilities = [None for _ in range(len(data))]
        return predicted, no_probabilities
Esempio n. 27
0
class OWYahooFinance(widget.OWWidget):
    """Widget that downloads Yahoo Finance stock data as a time series."""
    name = 'Yahoo Finance'
    description = "Generate time series from Yahoo Finance stock market data."
    icon = 'icons/YahooFinance.svg'
    priority = 9

    class Outputs:
        time_series = Output("Time series", Timeseries)

    # The same date layout expressed for Qt and for `strftime`/`strptime`.
    QT_DATE_FORMAT = 'yyyy-MM-dd'
    PY_DATE_FORMAT = '%Y-%m-%d'
    MIN_DATE = date(1851, 1, 1)

    # Defaults are computed when the class is created: the last five years.
    date_from = settings.Setting(
        (datetime.now().date() - timedelta(5 * 365)).strftime(PY_DATE_FORMAT))
    date_to = settings.Setting(datetime.now().date().strftime(PY_DATE_FORMAT))
    # Ticker history, most recently downloaded first.
    symbols = settings.Setting(
        ['AMZN', 'AAPL', 'GOOG', 'FB', 'SPY', '^DJI', '^TNX'])

    want_main_area = False
    resizing_enabled = False

    class Error(widget.OWWidget.Error):
        download_error = widget.Msg('Failed to download data (HTTP Error {}). '
                                    'Wrong stock symbol?')

    def __init__(self):
        """Build the ticker selector, the date range editors and the button."""
        # Run the base-class initialisation before building the GUI.
        super().__init__()

        box = gui.widgetBox(self.controlArea,
                            'Yahoo Finance Stock Data',
                            orientation='horizontal')
        lbox = gui.widgetBox(box, orientation='vertical')
        hbox = gui.widgetBox(lbox, orientation='horizontal')
        gui.label(hbox, self, 'Ticker:')
        self.combo = combo = QComboBox(editable=True,
                                       insertPolicy=QComboBox.InsertAtTop)
        combo.addItems(self.symbols)
        hbox.layout().addWidget(combo)
        gui.rubber(combo.parentWidget())
        minDate = QDate.fromString(self.MIN_DATE.strftime(self.PY_DATE_FORMAT),
                                   self.QT_DATE_FORMAT)
        date_from = QDateEdit(QDate.fromString(self.date_from,
                                               self.QT_DATE_FORMAT),
                              displayFormat=self.QT_DATE_FORMAT,
                              minimumDate=minDate,
                              calendarPopup=True)
        date_to = QDateEdit(QDate.fromString(self.date_to,
                                             self.QT_DATE_FORMAT),
                            displayFormat=self.QT_DATE_FORMAT,
                            minimumDate=minDate,
                            calendarPopup=True)
        # Mirror edits into the settings, stored as formatted strings.
        date_from.dateChanged.connect(lambda qdate: setattr(
            self, 'date_from', qdate.toString(self.QT_DATE_FORMAT)))
        date_to.dateChanged.connect(lambda qdate: setattr(
            self, 'date_to', qdate.toString(self.QT_DATE_FORMAT)))
        hbox = gui.hBox(lbox)
        gui.label(hbox, self, "From:")
        hbox.layout().addWidget(date_from)
        hbox = gui.hBox(lbox)
        gui.label(hbox, self, "To:")
        hbox.layout().addWidget(date_to)

        self.button = gui.button(self.controlArea,
                                 self,
                                 'Download',
                                 callback=self.download)

    def download(self):
        """Download data for the current ticker and send it to the output.

        The ticker is upper-cased and moved to the top of the history.  An
        empty ticker is ignored entirely, so it never enters the history.
        Any failure raises the ``download_error`` message with the
        exception's ``status`` attribute (or -1 when it has none).
        """
        symbol = self.combo.currentText().strip().upper()
        if not symbol:
            # Nothing to download; leave the combo box and history alone.
            return

        date_from = datetime.strptime(self.date_from, self.PY_DATE_FORMAT)
        date_to = datetime.strptime(self.date_to, self.PY_DATE_FORMAT)

        # Update symbol in symbols history
        self.combo.removeItem(self.combo.currentIndex())
        self.combo.insertItem(0, symbol)
        self.combo.setCurrentIndex(0)
        try:
            self.symbols.remove(symbol)
        except ValueError:
            pass
        self.symbols.insert(0, symbol)

        self.Error.clear()
        with self.progressBar(3) as progress:
            try:
                progress.advance()
                self.button.setDisabled(True)
                data = finance_data(symbol, date_from, date_to)

                self.Outputs.time_series.send(data)
            except Exception as e:
                self.Error.download_error(getattr(e, 'status', -1))
            finally:
                # Re-enable the button whether or not the download worked.
                self.button.setDisabled(False)
Esempio n. 28
0
class OWPCA(widget.OWWidget):
    """Principal component analysis widget with a scree plot."""
    name = "PCA"
    description = "Principal component analysis with a scree-diagram."
    icon = "icons/PCA.svg"
    priority = 3050
    keywords = ["principal component analysis", "linear transformation"]

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        transformed_data = Output("Transformed data", Table)
        components = Output("Components", Table)
        pca = Output("PCA", PCA, dynamic=False)

    settingsHandler = settings.DomainContextHandler()

    # Settings bound to the spin boxes of the "Components Selection" box.
    ncomponents = settings.Setting(2)
    variance_covered = settings.Setting(100)
    # Settings used by the "Incremental learning" (remote) controls.
    batch_size = settings.Setting(100)
    address = settings.Setting('')
    auto_update = settings.Setting(True)
    auto_commit = settings.Setting(True)
    # Context settings: stored via the DomainContextHandler above.
    normalize = settings.ContextSetting(True)
    decomposition_idx = settings.ContextSetting(0)
    # Bound to the "Show only first" spin box.
    maxp = settings.Setting(20)
    axis_labels = settings.Setting(10)

    graph_name = "plot.plotItem"

    class Warning(widget.OWWidget.Warning):
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg("At least 1 data instance is required")
        sparse_data = widget.Msg("Sparse data is not supported")

    def __init__(self):
        """Initialise the PCA state and build the controls and the plot."""
        super().__init__()
        self.data = None

        # Results of the most recent fit; None until a fit has been done.
        self._pca = None
        self._transformed = None
        self._variance_ratio = None
        self._cumulative = None
        self._line = False
        self._init_projector()

        # Components Selection
        box = gui.vBox(self.controlArea, "Components Selection")
        form = QFormLayout()
        box.layout().addLayout(form)

        self.components_spin = gui.spin(
            box,
            self,
            "ncomponents",
            1,
            MAX_COMPONENTS,
            callback=self._update_selection_component_spin,
            keyboardTracking=False)
        # The spin box shows "All" when it is at its minimum value.
        self.components_spin.setSpecialValueText("All")

        self.variance_spin = gui.spin(
            box,
            self,
            "variance_covered",
            1,
            100,
            callback=self._update_selection_variance_spin,
            keyboardTracking=False)
        self.variance_spin.setSuffix("%")

        form.addRow("Components:", self.components_spin)
        form.addRow("Variance covered:", self.variance_spin)

        # Incremental learning
        self.sampling_box = gui.vBox(self.controlArea, "Incremental learning")
        # NOTE(review): the line edit is created with `box` (the
        # "Components Selection" box) as parent, then added to the
        # sampling box layout below.
        self.addresstext = QLineEdit(box)
        self.addresstext.setPlaceholderText('Remote server')
        if self.address:
            self.addresstext.setText(self.address)
        self.sampling_box.layout().addWidget(self.addresstext)

        form = QFormLayout()
        self.sampling_box.layout().addLayout(form)
        self.batch_spin = gui.spin(self.sampling_box,
                                   self,
                                   "batch_size",
                                   50,
                                   100000,
                                   step=50,
                                   keyboardTracking=False)
        form.addRow("Batch size ~ ", self.batch_spin)

        self.start_button = gui.button(
            self.sampling_box,
            self,
            "Start remote computation",
            callback=self.start,
            autoDefault=False,
            tooltip="Start/abort computation on the server")
        # Disabled until `set_data` receives a suitable table.
        self.start_button.setEnabled(False)

        gui.checkBox(self.sampling_box,
                     self,
                     "auto_update",
                     "Periodically fetch model",
                     callback=self.update_model)
        # Timer that calls `get_model` every 2000 ms while it is running.
        self.__timer = QTimer(self, interval=2000)
        self.__timer.timeout.connect(self.get_model)

        self.sampling_box.setVisible(remotely)

        # Decomposition
        self.decomposition_box = gui.radioButtons(
            self.controlArea,
            self,
            "decomposition_idx", [d.name for d in DECOMPOSITIONS],
            box="Decomposition",
            callback=self._update_decomposition)

        # Options
        self.options_box = gui.vBox(self.controlArea, "Options")
        self.normalize_box = gui.checkBox(self.options_box,
                                          self,
                                          "normalize",
                                          "Normalize data",
                                          callback=self._update_normalize)

        self.maxp_spin = gui.spin(self.options_box,
                                  self,
                                  "maxp",
                                  1,
                                  MAX_COMPONENTS,
                                  label="Show only first",
                                  callback=self._setup_plot,
                                  keyboardTracking=False)

        self.controlArea.layout().addStretch()

        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Apply",
                        checkbox_label="Apply automatically")

        # Scree plot on a white background.
        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Principal Components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Proportion of variance")
        self.plot_horlabels = []
        self.plot_horlines = []

        # The plot is static: no context menu and no mouse pan/zoom.
        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)
        self._update_normalize()

    def update_model(self):
        """Fetch the model and start or stop the periodic polling timer.

        The timer keeps running only while `auto_update` is on and a
        remote computation exists that is not ready yet.
        """
        self.get_model()
        keep_polling = (self.auto_update and self.rpca
                        and not self.rpca.ready())
        if not keep_polling:
            self.__timer.stop()
        else:
            self.__timer.start(2000)

    def update_buttons(self, sparse_data=False):
        """Enable only the decompositions usable for the current data.

        For sparse data, normalization is switched off and every
        decomposition without sparse support is disabled.  If the selected
        decomposition ends up disabled, the first sparse-capable one is
        selected.  The projector is re-initialised at the end.
        """
        if sparse_data:
            self.normalize = False

        buttons = self.decomposition_box.buttons
        for decomposition, button in zip(DECOMPOSITIONS, buttons):
            unsupported = sparse_data and not decomposition.supports_sparse
            button.setDisabled(unsupported)

        if not buttons[self.decomposition_idx].isEnabled():
            # Set decomposition index to first sparse-enabled decomposition
            fallback = next(
                (index for index, decomposition in enumerate(DECOMPOSITIONS)
                 if decomposition.supports_sparse),
                None)
            if fallback is not None:
                self.decomposition_idx = fallback

        self._init_projector()

    def start(self):
        """Start the remote computation, or abort the one in progress.

        The current action is derived from the button caption: a caption
        containing 'Abort' means a computation is running.
        """
        running = 'Abort' in self.start_button.text()
        if running:
            self.rpca.abort()
            self.__timer.stop()
            self.start_button.setText("Start remote computation")
            return

        self.address = self.addresstext.text()
        with remote.server(self.address):
            from Orange.projection.pca import RemotePCA
            maxiter = (1e5 + self.data.approx_len()) / self.batch_size * 3
            self.rpca = RemotePCA(self.data, self.batch_size, int(maxiter))
        self.update_model()
        self.start_button.setText("Abort remote computation")

    @Inputs.data
    def set_data(self, data):
        """Handle a new table on the "Data" input and refit the PCA.

        SQL tables below ``AUTO_DL_LIMIT`` rows are downloaded in full.
        Larger ones are sampled when remote computation is unavailable, or
        otherwise left as they are with the remote controls shown.  Tables
        without features or without instances raise an error and clear
        the outputs.
        """
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.start_button.setEnabled(False)
        self.information()
        self.data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            elif remotely:
                # data was big and remote available
                self.sampling_box.setVisible(True)
                self.start_button.setText("Start remote computation")
                self.start_button.setEnabled(True)
            else:
                self.information("Data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(2000, partial=True)
                data = Table(data_sample)
        if not isinstance(data, SqlTable):
            self.sampling_box.setVisible(False)

        if isinstance(data, Table):
            if not len(data.domain.attributes):
                report_problem = self.Error.no_features
            elif not len(data):
                report_problem = self.Error.no_instances
            else:
                report_problem = None
            if report_problem is not None:
                report_problem()
                self.clear_outputs()
                return

        self.openContext(data)
        sparse_data = data is not None and data.is_sparse()
        self.normalize_box.setDisabled(sparse_data)
        self.update_buttons(sparse_data=sparse_data)

        self.data = data
        self.fit()

    def fit(self):
        """Fit the projector on the current data and refresh plot and outputs.

        Nothing is fitted when there is no data.  For ``SqlTable`` input only
        the projector's preprocessors are updated.  When the cumulative
        explained variance is not finite the ``trivial_components`` warning
        is shown instead of a plot; outputs are committed either way.
        """
        self.clear()
        self.Warning.trivial_components.clear()
        data = self.data
        if data is None:
            return

        normalizers = [Normalize()] if self.normalize else []
        self._pca_projector.preprocessors = \
            self._pca_preprocessors + normalizers
        if isinstance(data, SqlTable):
            return

        pca = self._pca_projector(data)
        variance_ratio = pca.explained_variance_ratio_
        cumulative = numpy.cumsum(variance_ratio)

        if not numpy.isfinite(cumulative[-1]):
            self.Warning.trivial_components()
        else:
            self.components_spin.setRange(0, len(cumulative))
            self._pca = pca
            self._variance_ratio = variance_ratio
            self._cumulative = cumulative
            self._setup_plot()

        self.unconditional_commit()

    def clear(self):
        """Drop the fitted model, all cached results and the plot contents."""
        self._pca = self._transformed = None
        self._variance_ratio = self._cumulative = None
        self._line = None
        self.plot_horlabels, self.plot_horlines = [], []
        self.plot.clear()

    def clear_outputs(self):
        """Send ``None`` on the data outputs and the projector on ``pca``."""
        outputs = self.Outputs
        outputs.transformed_data.send(None)
        outputs.components.send(None)
        outputs.pca.send(self._pca_projector)

    def get_model(self):
        """Fetch the current state of the remote computation, if any.

        When the remote computation reports ready, polling stops and the
        start button is relabelled.  A non-``None`` state replaces the
        current model, the plot is rebuilt, the cached transform is dropped
        and the outputs are committed.
        """
        remote_pca = self.rpca
        if remote_pca is None:
            return
        if remote_pca.ready():
            self.__timer.stop()
            self.start_button.setText("Restart (finished)")

        state = remote_pca.get_state()
        self._pca = state
        if state is None:
            return

        ratios = state.explained_variance_ratio_
        self._variance_ratio = ratios
        self._cumulative = numpy.cumsum(ratios)
        self._setup_plot()
        self._transformed = None
        self.commit()

    def _setup_plot(self):
        """Rebuild the scree plot from the stored variance ratios.

        Draws the per-component and cumulative variance curves (at most
        ``maxp`` points), a movable vertical cut line at the selected
        component, and two dashed horizontal guide curves with text labels.
        Does nothing beyond clearing the plot when no model is fitted.
        """
        self.plot.clear()
        if self._pca is None:
            return

        shown = min(len(self._variance_ratio), self.maxp)

        curves = (
            (self._variance_ratio, Qt.red, "Variance"),
            (self._cumulative, Qt.darkYellow, "Cumulative Variance"),
        )
        for values, color, label in curves:
            self.plot.plot(numpy.arange(shown),
                           values[:shown],
                           pen=pg.mkPen(QColor(color), width=2),
                           antialias=True,
                           name=label)

        # Vertical, draggable marker for the selected number of components.
        cutpos = self._nselected_components() - 1
        self._line = pg.InfiniteLine(angle=90,
                                     pos=cutpos,
                                     movable=True,
                                     bounds=(0, shown - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        # One guide curve and one label per plotted curve.
        self.plot_horlines = tuple(
            pg.PlotCurveItem(pen=pg.mkPen(QColor(Qt.blue), style=Qt.DashLine))
            for _ in range(2))
        self.plot_horlabels = tuple(
            pg.TextItem(color=QColor(Qt.black), anchor=anchor)
            for anchor in ((1, 0), (1, 1)))
        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setRange(xRange=(0.0, shown - 1), yRange=(0.0, 1.0))
        self._update_axis()

    def _set_horline_pos(self):
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(
                self.plot_horlines, self.plot_horlabels,
            (self._variance_ratio, self._cumulative)):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        """React to the cut line being dragged over the scree plot.

        The line is snapped to the nearest integer position, the component
        count and covered variance are updated to match, and the selection
        is invalidated when the effective number of components changed.
        """
        snapped = int(round(line.value()))
        self._line.setValue(snapped)
        previous = self._nselected_components()
        components = snapped + 1

        # Keep the special "All" value (0) when the line sits on the last
        # component; otherwise store the explicit count.
        keeps_all = (self.ncomponents == 0
                     and components == len(self._variance_ratio))
        if not keeps_all:
            self.ncomponents = components

        self._set_horline_pos()

        if self._pca is not None:
            covered = self._cumulative[components - 1]
            if numpy.isfinite(covered):
                self.variance_covered = int(covered * 100)

        if previous != self._nselected_components():
            self._invalidate_selection()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._pca is None:
            self._invalidate_selection()
            return

        if self.ncomponents == 0:
            # Special "All" value
            cut = len(self._variance_ratio)
        else:
            cut = self.ncomponents

        var = self._cumulative[cut - 1]
        if numpy.isfinite(var):
            self.variance_covered = int(var * 100)

        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)

        self._invalidate_selection()

    def _update_selection_variance_spin(self):
        # cut changed by "max variance" spin.
        if self._pca is None:
            return

        cut = numpy.searchsorted(self._cumulative,
                                 self.variance_covered / 100.0) + 1
        cut = min(cut, len(self._cumulative))
        self.ncomponents = cut
        if numpy.floor(self._line.value()) + 1 != cut:
            self._line.setValue(cut - 1)
        self._invalidate_selection()

    def _update_normalize(self):
        self.fit()
        if self.data is None:
            self._invalidate_selection()

    def _init_projector(self):
        """Create a fresh projector for the selected decomposition.

        The projector is built with ``MAX_COMPONENTS`` components and tagged
        with the current ``ncomponents``; the decomposition's own
        preprocessors are remembered so ``fit`` can extend them.
        """
        decomposition = DECOMPOSITIONS[self.decomposition_idx]
        projector = decomposition(n_components=MAX_COMPONENTS)
        projector.component = self.ncomponents
        self._pca_projector = projector
        self._pca_preprocessors = decomposition.preprocessors

    def _update_decomposition(self):
        """Rebuild the projector for the chosen decomposition and refit."""
        self._init_projector()
        # _update_normalize() refits the data with the new projector.
        self._update_normalize()

    def _nselected_components(self):
        """Return the number of selected components.

        Returns 0 when no model is fitted.  Note that this is not a pure
        getter: depending on the branch taken it also rewrites
        ``variance_covered`` or ``ncomponents``.
        """
        if self._pca is None:
            return 0

        if self.ncomponents == 0:
            # Special "All" value
            max_comp = len(self._variance_ratio)
        else:
            max_comp = self.ncomponents

        var_max = self._cumulative[max_comp - 1]
        # NOTE(review): floor(variance_covered / 100.0) can only be 0.0 or
        # 1.0 for percentages in 0..100, so this compares the cumulative
        # variance against 0 or 1 rather than against the requested
        # fraction -- confirm this is the intended condition.
        if var_max != numpy.floor(self.variance_covered / 100.0):
            cut = max_comp
            assert numpy.isfinite(var_max)
            # Keep the variance control in sync with the component count.
            self.variance_covered = int(var_max * 100)
        else:
            # Derive the component count from the requested variance.
            self.ncomponents = cut = numpy.searchsorted(
                self._cumulative, self.variance_covered / 100.0) + 1
        return cut

    def _invalidate_selection(self):
        """Push the outputs again after the component selection changed."""
        self.commit()

    def _update_axis(self):
        p = min(len(self._variance_ratio), self.maxp)
        axis = self.plot.getAxis("bottom")
        d = max((p - 1) // (self.axis_labels - 1), 1)
        axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]])

    def commit(self):
        """Send the transformed data, the components table and the projector.

        With a fitted model, the full transform is computed once and cached;
        the outputs are then restricted to the selected number of
        components.  Without a model both data outputs are ``None``.  The
        projector is always sent, tagged with the current ``ncomponents``.

        ``ncomponents == 0`` is the special "All" value.  It is resolved to
        the real component count before slicing, because ``[:0]`` would
        silently produce outputs with no components at all.
        """
        transformed = components = None
        if self._pca is not None:
            if self._transformed is None:
                # Compute the full transform (MAX_COMPONENTS components) only once.
                self._transformed = self._pca(self.data)
            transformed = self._transformed

            n_selected = self.ncomponents or len(self._variance_ratio)

            domain = Domain(transformed.domain.attributes[:n_selected],
                            self.data.domain.class_vars,
                            self.data.domain.metas)
            transformed = transformed.from_table(domain, transformed)
            # prevent caching new features by defining compute_value
            dom = Domain(
                [ContinuousVariable(a.name, compute_value=lambda _: None)
                 for a in self._pca.orig_domain.attributes],
                metas=[StringVariable(name='component')])
            loadings = self._pca.components_[:n_selected]
            # Label exactly the rows that are present so the metas column
            # can never disagree with the number of component rows.
            metas = numpy.array(
                [['PC{}'.format(i + 1) for i in range(len(loadings))]],
                dtype=object).T
            components = Table(dom, loadings, metas=metas)
            components.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(transformed)
        self.Outputs.components.send(components)
        self.Outputs.pca.send(self._pca_projector)

    def send_report(self):
        """Add the current settings and the scree plot to the report.

        Nothing is reported when the widget has no data.
        """
        if self.data is None:
            return
        decomposition = DECOMPOSITIONS[self.decomposition_idx]
        items = (
            ("Decomposition", decomposition.name),
            ("Normalize data", str(self.normalize)),
            ("Selected components", self.ncomponents),
            ("Explained variance", "{:.3f} %".format(self.variance_covered)),
        )
        self.report_items(items)
        self.report_plot()

    @classmethod
    def migrate_settings(cls, settings, version):
        """Repair persisted settings in place.

        * A real-valued ``variance_covered`` is coerced to ``int``; a
          non-finite value (persisted as NaN due to the error in gh-1896,
          which caused a TypeError in the widget's ``__init__``) is replaced
          by 100.
        * ``ncomponents`` is capped at ``MAX_COMPONENTS``.
        """
        if "variance_covered" in settings:
            vc = settings["variance_covered"]
            if isinstance(vc, numbers.Real):
                settings["variance_covered"] = \
                    int(vc) if numpy.isfinite(vc) else 100
        if settings.get("ncomponents", 0) > MAX_COMPONENTS:
            settings["ncomponents"] = MAX_COMPONENTS
Esempio n. 29
0
class OWPythagoreanForest(OWWidget):
    """Widget showing each tree of a random forest as a Pythagoras tree.

    The trees are listed in a wrapping list view; selecting one sends the
    corresponding tree model on the ``Tree`` output, annotated with the
    current display settings.
    """
    name = 'Pythagorean Forest'
    description = '毕达哥拉斯森林用于可视化随机森林'
    icon = 'icons/PythagoreanForest.svg'
    settings_version = 2
    keywords = ["fractal"]

    priority = 1001

    class Inputs:
        random_forest = Input("Random forest", RandomForestModel)

    class Outputs:
        tree = Output("Tree", TreeModel)

    # Enable the save as feature
    graph_name = '场景'

    # Settings
    depth_limit = settings.ContextSetting(10)
    target_class_index = settings.ContextSetting(0)
    size_calc_idx = settings.Setting(0)
    zoom = settings.Setting(200)

    # (label, transformation) pairs offered in the "Size" combo box.
    SIZE_CALCULATION = [
        ('Normal', lambda x: x),
        ('Square root', lambda x: sqrt(x)),
        ('Logarithmic', lambda x: log(x + 1)),
    ]

    @classmethod
    def migrate_settings(cls, settings, version):
        """Upgrade settings stored by versions older than 2.

        The obsolete ``selected_tree_index`` is dropped and ``zoom`` is
        rescaled linearly from the old 20..150 range to the new 100..400
        range.  Settings that carry no ``zoom`` value are left alone so the
        current default applies instead of raising ``KeyError``.
        """
        if version < 2:
            settings.pop('selected_tree_index', None)
            if 'zoom' in settings:
                v1_min, v1_max = 20, 150
                v2_min, v2_max = 100, 400
                ratio = (v2_max - v2_min) / (v1_max - v1_min)
                settings['zoom'] = int(
                    ratio * (settings['zoom'] - v1_min) + v2_min)

    def __init__(self):
        super().__init__()
        self.rf_model = None
        self.forest = None
        self.instances = None
        self.clf_dataset = None

        self.color_palette = None

        # CONTROL AREA
        # Tree info area
        box_info = gui.widgetBox(self.controlArea, 'Forest')
        self.ui_info = gui.widgetLabel(box_info)

        # Display controls area
        box_display = gui.widgetBox(self.controlArea, 'Display')
        self.ui_depth_slider = gui.hSlider(
            box_display, self, 'depth_limit', label='Depth', ticks=False,
        )  # type: QSlider
        self.ui_target_class_combo = gui.comboBox(
            box_display, self, 'target_class_index', label='Target class',
            orientation=Qt.Horizontal, items=[], contentsLength=8,
        )  # type: gui.OrangeComboBox
        self.ui_size_calc_combo = gui.comboBox(
            box_display, self, 'size_calc_idx', label='Size',
            orientation=Qt.Horizontal,
            items=list(zip(*self.SIZE_CALCULATION))[0], contentsLength=8,
        )  # type: gui.OrangeComboBox
        self.ui_zoom_slider = gui.hSlider(
            box_display, self, 'zoom', label='Zoom', ticks=False, minValue=100,
            maxValue=400, createLabel=False, intOnly=False,
        )  # type: QSlider

        # Stretch to fit the rest of the unused area
        gui.rubber(self.controlArea)

        self.controlArea.setSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)

        # MAIN AREA
        self.forest_model = PythagoreanForestModel(parent=self)
        self.forest_model.update_item_size(self.zoom)
        # Every display control is forwarded straight to the list model.
        self.ui_depth_slider.valueChanged.connect(
            self.forest_model.update_depth)
        self.ui_target_class_combo.currentIndexChanged.connect(
            self.forest_model.update_target_class)
        self.ui_zoom_slider.valueChanged.connect(
            self.forest_model.update_item_size)
        self.ui_size_calc_combo.currentIndexChanged.connect(
            self.forest_model.update_size_calc)

        self.list_delegate = PythagorasTreeDelegate(parent=self)
        self.list_view = ClickToClearSelectionListView(parent=self)
        self.list_view.setWrapping(True)
        self.list_view.setFlow(QListView.LeftToRight)
        self.list_view.setResizeMode(QListView.Adjust)
        self.list_view.setModel(self.forest_model)
        self.list_view.setItemDelegate(self.list_delegate)
        self.list_view.setSpacing(2)
        self.list_view.setSelectionMode(QListView.SingleSelection)
        self.list_view.selectionModel().selectionChanged.connect(self.commit)
        self.list_view.setUniformItemSizes(True)
        self.mainArea.layout().addWidget(self.list_view)

        self.resize(800, 500)

        # Clear to set sensible default values
        self.clear()

    @Inputs.random_forest
    def set_rf(self, model=None):
        """When a different forest is given."""
        self.clear()
        self.rf_model = model

        if model is not None:
            self.forest = self._get_forest_adapter(self.rf_model)
            self.forest_model[:] = self.forest.trees

            self.instances = model.instances
            # This bit is important for the regression classifier
            if self.instances is not None and self.instances.domain != model.domain:
                self.clf_dataset = self.instances.transform(self.rf_model.domain)
            else:
                self.clf_dataset = self.instances

            self._update_info_box()
            self._update_target_class_combo()
            self._update_depth_slider()

    def clear(self):
        """Clear all relevant data from the widget."""
        self.rf_model = None
        self.forest = None
        # Also drop the data tied to the previous model so nothing stale is
        # kept around once the input is removed.
        self.instances = None
        self.clf_dataset = None
        self.forest_model.clear()

        self._clear_info_box()
        self._clear_target_class_combo()
        self._clear_depth_slider()

    def _update_info_box(self):
        """Show the number of trees in the current forest."""
        self.ui_info.setText('Trees: {}'.format(len(self.forest.trees)))

    def _update_depth_slider(self):
        """Enable the depth slider and set it to the deepest tree's depth."""
        self.depth_limit = self._get_max_depth()

        self.ui_depth_slider.parent().setEnabled(True)
        self.ui_depth_slider.setMaximum(self.depth_limit)
        self.ui_depth_slider.setValue(self.depth_limit)

    def _update_target_class_combo(self):
        """Fill the target combo with class values or colouring methods.

        The domain of the stored instances is used when available.  A model
        may come without instances (``set_rf`` explicitly allows that), in
        which case the model's own domain is used instead of failing on
        ``None``.
        """
        self._clear_target_class_combo()
        label = [x for x in self.ui_target_class_combo.parent().children()
                 if isinstance(x, QLabel)][0]

        if self.instances is not None:
            domain = self.instances.domain
        else:
            domain = self.rf_model.domain

        if domain.has_discrete_class:
            label_text = '目标类'
            values = [c.title() for c in domain.class_vars[0].values]
            values.insert(0, 'None')
        else:
            label_text = '节点颜色'
            values = list(ContinuousTreeNode.COLOR_METHODS.keys())
        label.setText(label_text)
        self.ui_target_class_combo.addItems(values)
        self.ui_target_class_combo.setCurrentIndex(self.target_class_index)

    def _clear_info_box(self):
        """Show the "no forest" placeholder text."""
        self.ui_info.setText('没有森林输入')

    def _clear_target_class_combo(self):
        """Empty the target combo and reset the stored index to 0."""
        self.ui_target_class_combo.clear()
        self.target_class_index = 0
        self.ui_target_class_combo.setCurrentIndex(self.target_class_index)

    def _clear_depth_slider(self):
        """Disable the depth slider and collapse its range."""
        self.ui_depth_slider.parent().setEnabled(False)
        self.ui_depth_slider.setMaximum(0)

    def _get_max_depth(self):
        """Return the largest ``max_depth`` among the forest's trees."""
        return max(tree.max_depth for tree in self.forest.trees)

    def _get_forest_adapter(self, model):
        """Wrap the model in the adapter used for drawing."""
        return SklRandomForestAdapter(model)

    def onDeleteWidget(self):
        """When deleting the widget."""
        super().onDeleteWidget()
        self.clear()

    def commit(self, selection):
        # type: (QItemSelection) -> None
        """Commit the selected tree to output."""
        selected_indices = selection.indexes()

        if not selected_indices:
            self.Outputs.tree.send(None)
            return

        # Single-selection mode: exactly one index is expected here.
        selected_index, = selected_indices

        idx = selected_index.row()
        tree = self.rf_model.trees[idx]
        # Attach the data and display settings for whoever receives the tree.
        tree.instances = self.instances
        tree.meta_target_class_index = self.target_class_index
        tree.meta_size_calc_idx = self.size_calc_idx
        tree.meta_depth_limit = self.depth_limit

        self.Outputs.tree.send(tree)

    def send_report(self):
        """Send report."""
        self.report_plot()
class OWImportDocuments(widget.OWWidget):
    """Widget that imports text documents from a local folder or a URL."""
    name = "Import Documents"
    description = "Import text documents from folders."
    icon = "icons/ImportDocuments.svg"
    priority = 110

    class Outputs:
        data = Output("Corpus", Corpus)
        skipped_documents = Output("Skipped documents", Table)

    # Possible values of the `source` setting.
    LOCAL_FILE, URL = range(2)
    source = settings.Setting(LOCAL_FILE)
    #: list of recent paths
    recent_paths: List[RecentPath] = settings.Setting([])
    #: currently selected root folder (None when nothing is selected)
    currentPath: Optional[str] = settings.Setting(None)
    #: history backing the URL combo box
    recent_urls: List[str] = settings.Setting([])

    want_main_area = False
    resizing_enabled = False

    # Modality of the folder selection dialog (see __runOpenDialog).
    Modality = Qt.ApplicationModal
    # Upper bound on the recent paths restored at start-up.
    MaxRecentItems = 20

    class Warning(widget.OWWidget.Warning):
        read_error = widget.Msg("{} couldn't be read.")

    def __init__(self):
        """Build the source selection controls and the info/progress area.

        The actual loading is not started here; an ``Init`` runtime event is
        posted to the widget at the end and handled in ``customEvent``.
        """
        super().__init__()
        #: widget's runtime state
        self.__state = State.NoState
        self.corpus = None
        self.n_text_categories = 0
        self.n_text_data = 0
        self.skipped_documents = []

        self.__invalidated = False
        self.__pendingTask = None

        # --- "Source" box: a grid with one row per source kind ---
        layout = QGridLayout()
        layout.setSpacing(4)
        gui.widgetBox(self.controlArea, orientation=layout, box='Source')
        source_box = gui.radioButtons(None, self, "source", box=True,
                                      callback=self.start, addToLayout=False)
        rb_button = gui.appendRadioButton(source_box, "Folder:",
                                          addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)

        # Combo box listing recently used folders.
        self.recent_cb = QComboBox(
            sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon,
            minimumContentsLength=16,
            acceptDrops=True
        )
        self.recent_cb.installEventFilter(self)
        self.recent_cb.activated[int].connect(self.__onRecentActivated)

        browseaction = QAction(
            "Open/Load Documents", self,
            iconText="\N{HORIZONTAL ELLIPSIS}",
            icon=self.style().standardIcon(QStyle.SP_DirOpenIcon),
            toolTip="Select a folder from which to load the documents"
        )
        browseaction.triggered.connect(self.__runOpenDialog)
        reloadaction = QAction(
            "Reload", self,
            icon=self.style().standardIcon(QStyle.SP_BrowserReload),
            toolTip="Reload current document set"
        )
        reloadaction.triggered.connect(self.reload)
        self.__actions = namespace(
            browse=browseaction,
            reload=reloadaction,
        )

        # Push buttons mirroring the two actions above.
        browsebutton = QPushButton(
            browseaction.iconText(),
            icon=browseaction.icon(),
            toolTip=browseaction.toolTip(),
            clicked=browseaction.trigger,
            default=False,
            autoDefault=False,
        )
        reloadbutton = QPushButton(
            reloadaction.iconText(),
            icon=reloadaction.icon(),
            clicked=reloadaction.trigger,
            default=False,
            autoDefault=False,
        )
        box.layout().addWidget(self.recent_cb)
        layout.addWidget(box, 0, 1)
        layout.addWidget(browsebutton, 0, 2)
        layout.addWidget(reloadbutton, 0, 3)

        rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        # Editable combo box for URLs, backed by the `recent_urls` setting.
        self.url_combo = url_combo = QComboBox()
        url_model = PyListModel()
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 1, 3)
        url_combo.activated.connect(self._url_set)
        # With the completer we make the combo box case sensitive when
        # matching the history
        completer = QCompleter()
        completer.setCaseSensitivity(Qt.CaseSensitive)
        url_combo.setCompleter(completer)

        self.addActions([browseaction, reloadaction])

        # Keep the reload button's enabled state in sync with its action.
        reloadaction.changed.connect(
            lambda: reloadbutton.setEnabled(reloadaction.isEnabled())
        )
        # --- "Info" box: page 0 is a text label, page 1 the progress view ---
        box = gui.vBox(self.controlArea, "Info")
        self.infostack = QStackedWidget()

        self.info_area = QLabel(
            text="No document set selected",
            wordWrap=True
        )
        self.progress_widget = QProgressBar(
            minimum=0, maximum=100
        )
        self.cancel_button = QPushButton(
            "Cancel",
            icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
            default=False,
            autoDefault=False,
        )
        self.cancel_button.clicked.connect(self.cancel)

        w = QWidget()
        vlayout = QVBoxLayout()
        vlayout.setContentsMargins(0, 0, 0, 0)
        hlayout = QHBoxLayout()
        hlayout.setContentsMargins(0, 0, 0, 0)

        hlayout.addWidget(self.progress_widget)
        hlayout.addWidget(self.cancel_button)
        vlayout.addLayout(hlayout)

        self.pathlabel = TextLabel()
        self.pathlabel.setTextElideMode(Qt.ElideMiddle)
        self.pathlabel.setAttribute(Qt.WA_MacSmallSize)

        vlayout.addWidget(self.pathlabel)
        w.setLayout(vlayout)

        self.infostack.addWidget(self.info_area)
        self.infostack.addWidget(w)

        box.layout().addWidget(self.infostack)

        self.__initRecentItemsModel()
        self.__invalidated = True
        self.__executor = ThreadExecutor(self)

        # Defer the first load until the event loop delivers this event.
        QApplication.postEvent(self, QEvent(RuntimeEvent.Init))

    def _url_set(self):
        """Normalize the URL picked in the combo box and start loading it.

        The text is stripped; when it has no scheme, ``http://`` is
        prepended and both the combo item and the stored history entry are
        updated.  The source is then switched to URL.
        """
        entered = self.url_combo.currentText()
        pos = self.recent_urls.index(entered)
        url = entered.strip()
        if not urlparse(url).scheme:
            url = "http://" + url
            self.url_combo.setItemText(pos, url)
            self.recent_urls[pos] = url
        self.source = self.URL
        self.start()

    def __initRecentItemsModel(self):
        """Populate the recent-folders combo box from the stored settings.

        Entries whose folder no longer exists are dropped and the list is
        cut to ``MaxRecentItems``.  The current path is kept only when it is
        an existing folder identical to the first recent entry; otherwise it
        is reset and nothing is selected.
        """
        if self.currentPath is not None and \
                not os.path.isdir(self.currentPath):
            self.currentPath = None

        still_there = [entry for entry in self.recent_paths
                       if os.path.isdir(entry.abspath)]
        recent_paths = still_there[:OWImportDocuments.MaxRecentItems]

        recent_model = self.recent_cb.model()
        for pathitem in recent_paths:
            recent_model.appendRow(RecentPath_asqstandarditem(pathitem))

        self.recent_paths = recent_paths

        current_is_first = (
            self.currentPath is not None
            and os.path.isdir(self.currentPath) and self.recent_paths
            and os.path.samefile(self.currentPath,
                                 self.recent_paths[0].abspath))
        if not current_is_first:
            self.currentPath = None
            self.recent_cb.setCurrentIndex(-1)
        else:
            self.recent_cb.setCurrentIndex(0)
        self.__actions.reload.setEnabled(self.currentPath is not None)

    def customEvent(self, event):
        """Reimplemented.

        On the ``Init`` runtime event a pending (invalidated) load is
        started; the invalidated flag is cleared even if starting fails.
        """
        is_init = event.type() == RuntimeEvent.Init
        if is_init and self.__invalidated:
            try:
                self.start()
            finally:
                self.__invalidated = False

        super().customEvent(event)

    def __runOpenDialog(self):
        """Let the user pick a folder and start loading from it.

        The dialog opens in the parent of the most recent folder, or in the
        home directory when there is no history.  Depending on ``Modality``
        either a window-modal dialog object or the static
        ``getExistingDirectory`` helper is used.
        """
        if self.recent_paths:
            startdir = os.path.dirname(self.recent_paths[0].abspath)
        else:
            startdir = os.path.expanduser("~/")

        caption = "Select Top Level Folder"
        if OWImportDocuments.Modality != Qt.WindowModal:
            dirpath = QFileDialog.getExistingDirectory(
                self, caption, startdir
            )
            if dirpath:
                self.setCurrentPath(dirpath)
                self.start()
            return

        dlg = QFileDialog(
            self, caption, startdir,
            acceptMode=QFileDialog.AcceptOpen,
            modal=True,
        )
        dlg.setFileMode(QFileDialog.Directory)
        dlg.setOption(QFileDialog.ShowDirsOnly)
        dlg.setDirectory(startdir)
        dlg.setAttribute(Qt.WA_DeleteOnClose)

        def on_accepted():
            chosen = dlg.selectedFiles()
            if chosen:
                self.setCurrentPath(chosen[0])
                self.start()

        dlg.accepted.connect(on_accepted)
        dlg.open()

    def __onRecentActivated(self, index):
        """Load the recent folder chosen at `index` in the combo box.

        Rows without attached item data are ignored.
        """
        item = self.recent_cb.itemData(index)
        if item is not None:
            assert isinstance(item, RecentPath)
            self.setCurrentPath(item.abspath)
            self.start()

    def __updateInfo(self):
        """Refresh the info label and pick the matching info-stack page.

        The label text depends on the runtime state; in the ``Done`` state
        it reports the number of documents, categories (when there are at
        least two) and skipped documents.  Page 1 (progress) is shown while
        processing, page 0 (label) otherwise.
        """
        state = self.__state
        if state == State.NoState:
            text = "No document set selected"
        elif state == State.Processing:
            text = "Processing"
        elif state == State.Done:
            nvalid = self.n_text_data
            ncategories = self.n_text_categories
            n_skipped = len(self.skipped_documents)
            if ncategories >= 2:
                text = "{} documents / {} categories".format(nvalid, ncategories)
            else:
                plural = "" if nvalid == 1 else "s"
                text = "{} document{}".format(nvalid, plural)
            if n_skipped > 0:
                text += ", {} skipped".format(n_skipped)
        elif state == State.Cancelled:
            text = "Cancelled"
        elif state == State.Error:
            text = "Error state"
        else:
            assert False

        self.info_area.setText(text)

        page = 1 if self.__state == State.Processing else 0
        self.infostack.setCurrentIndex(page)

    def setCurrentPath(self, path):
        """
        Make `path` the current root folder for the import.

        A path that does not exist or is not a folder is reported as a
        widget error (and a ``UserWarning``) and the current path is reset
        to None.  A running import is cancelled and the source is switched
        to the local folder.

        Parameters
        ----------
        path : str
            New root import path.

        Returns
        -------
        status : bool
            True if the current root import path was successfully
            changed to path (or already pointed to it).
        """
        unchanged = (
            self.currentPath is not None and path is not None
            and os.path.isdir(self.currentPath) and os.path.isdir(path)
            and os.path.samefile(self.currentPath, path)
            and self.source == self.LOCAL_FILE
        )
        if unchanged:
            return True

        problem = None
        if path is not None:
            if not os.path.exists(path):
                problem = "'{}' does not exist".format(path)
            elif not os.path.isdir(path):
                problem = "'{}' is not a folder".format(path)

        success = problem is None
        if problem is None:
            self.error()
        else:
            path = None
            self.error(problem)
            warnings.warn(problem, UserWarning, stacklevel=3)

        if path is None:
            self.currentPath = None
        else:
            newindex = self.addRecentPath(path)
            self.recent_cb.setCurrentIndex(newindex)
            self.currentPath = path if newindex >= 0 else None
        self.__actions.reload.setEnabled(self.currentPath is not None)

        if self.__state == State.Processing:
            self.cancel()
        self.source = self.LOCAL_FILE
        return success

    def addRecentPath(self, path):
        """
        Put `path` at the front of the recent paths list.

        An entry that already refers to the same folder is moved to the
        first place instead of being duplicated; entries whose folder no
        longer exists are skipped during the comparison.

        Parameters
        ----------
        path : str

        Returns
        -------
        index : int
            Position of the entry in the list, which is always 0.
        """
        existing = None
        for pathitem in self.recent_paths:
            try:
                same = os.path.samefile(pathitem.abspath, path)
            except FileNotFoundError:
                # file not found if the `pathitem.abspath` no longer exists
                continue
            if same:
                existing = pathitem
                break

        model = self.recent_cb.model()

        if existing is None:
            item = RecentPath(path, None, None)
            self.recent_paths.insert(0, item)
            model.insertRow(0, RecentPath_asqstandarditem(item))
        else:
            selected_index = self.recent_paths.index(existing)
            assert model.item(selected_index).data(Qt.UserRole) is existing
            self.recent_paths.remove(existing)
            row = model.takeRow(selected_index)
            self.recent_paths.insert(0, existing)
            model.insertRow(0, row)
        return 0

    def __setRuntimeState(self, state):
        """Switch the widget to `state` and update the dependent UI.

        The widget is blocking exactly while processing.  Assertions guard
        the expected transitions: processing may only start from a resting
        state, and ``Done``/``Cancelled`` may only follow ``Processing``.
        """
        assert state in State
        self.setBlocking(state == State.Processing)

        if state == State.Processing:
            assert self.__state in [State.Done,
                                    State.NoState,
                                    State.Error,
                                    State.Cancelled]
            message = "Processing"
        elif state == State.Done:
            assert self.__state == State.Processing
            message = ""
        elif state == State.Cancelled:
            assert self.__state == State.Processing
            message = "Cancelled"
        elif state == State.Error:
            message = "Error during processing"
        elif state == State.NoState:
            message = ""
        else:
            message = ""
            assert False

        self.__state = state

        page = 1 if self.__state == State.Processing else 0
        self.infostack.setCurrentIndex(page)

        self.setStatusMessage(message)
        self.__updateInfo()

    def reload(self):
        """
        Restart the text scan task

        Cancels the task in progress (if there is one), switches the
        source back to `LOCAL_FILE`, drops the current corpus and then
        calls `start`.
        """
        # cancel first so that `start` begins from a non-processing state
        if self.__state == State.Processing:
            self.cancel()
        self.source = self.LOCAL_FILE
        self.corpus = None
        self.start()

    def start(self):
        """
        Start/execute the text indexing operation

        Clears the current error and warnings, resets the progress widget
        and picks the start location: `self.currentPath` when the source is
        `LOCAL_FILE`, otherwise the stripped text of `self.url_combo`.
        Returns without starting anything when that location is empty.

        A task that is still in progress is cancelled first. A new
        `ImportDocuments` task is then submitted to the executor and
        `__onRunFinished` is connected to the watcher's `finished` signal.
        """
        # reset messages and progress left over from a previous run
        self.error()
        self.Warning.clear()
        self.progress_widget.setValue(0)

        self.__invalidated = False
        startdir = self.currentPath if self.source == self.LOCAL_FILE \
            else self.url_combo.currentText().strip()
        if not startdir:
            # nothing to scan
            return

        if self.__state == State.Processing:
            assert self.__pendingTask is not None
            log.info("Starting a new task while one is in progress. "
                     "Cancel the existing task (dir:'{}')"
                     .format(self.__pendingTask.startdir))
            self.cancel()

        self.__setRuntimeState(State.Processing)

        # callable handed to the task for reporting progress; it is bound
        # to the `__onReportProgress` slot, which takes one `object` argument
        report_progress = methodinvoke(
            self, "__onReportProgress", (object,))

        task = ImportDocuments(startdir, self.source == self.URL,
                               report_progress=report_progress)

        # collect the task state in one convenient place
        self.__pendingTask = taskstate = namespace(
            task=task,
            startdir=startdir,
            future=None,
            watcher=None,
            cancelled=False,
            cancel=None,
        )

        def cancel():
            # Cancel the task and disconnect
            if taskstate.future.cancel():
                # the future was cancelled outright; nothing else to stop
                pass
            else:
                # flag the task as cancelled, then poll the future once
                # without waiting (timeout=0)
                taskstate.task.cancelled = True
                taskstate.cancelled = True
                try:
                    taskstate.future.result(timeout=0)
                except UserInterruptError:
                    pass
                except TimeoutError:
                    log.info("The task did not stop in in a timely manner")
            # stop `__onRunFinished` from being called for this task
            taskstate.watcher.finished.disconnect(self.__onRunFinished)

        taskstate.cancel = cancel

        def run_text_scan_task_interupt():
            # runs `task.run()`; returns None when it is interrupted
            try:
                return task.run()
            except UserInterruptError:
                # Suppress interrupt errors, so they are not logged
                return

        # submit the work and watch the resulting future for completion
        taskstate.future = self.__executor.submit(run_text_scan_task_interupt)
        taskstate.watcher = FutureWatcher(taskstate.future)
        taskstate.watcher.finished.connect(self.__onRunFinished)

    @Slot()
    def __onRunFinished(self):
        """
        Collect the result of the finished task and commit it.

        Connected to the `finished` signal of the pending task's watcher.
        Stores the resulting corpus and the list of skipped documents,
        shows an error or warning where appropriate, updates the runtime
        state and calls `commit`.
        """
        assert QThread.currentThread() is self.thread()
        assert self.__state == State.Processing
        assert self.__pendingTask is not None
        assert self.sender() is self.__pendingTask.watcher
        assert self.__pendingTask.future.done()
        finished = self.__pendingTask
        self.__pendingTask = None

        # defaults used when the task did not produce a result
        corpus = None
        errors = []
        try:
            corpus, errors = finished.future.result()
        except NoDocumentsException:
            state = State.Error
            self.error("Folder contains no readable files.")
        except Exception:
            # report the unexpected failure and show its traceback
            sys.excepthook(*sys.exc_info())
            state = State.Error
            self.error(traceback.format_exc())
        else:
            state = State.Done
            self.error()

        if corpus:
            self.n_text_data = len(corpus)
            if corpus.domain.class_var:
                self.n_text_categories = len(corpus.domain.class_var.values)
            else:
                self.n_text_categories = 0

        self.corpus = corpus
        if self.corpus:
            self.corpus.name = "Documents"
        self.skipped_documents = errors

        n_errors = len(errors)
        if n_errors:
            self.Warning.read_error(
                "One file" if n_errors <= 1 else "Some files"
            )

        self.__setRuntimeState(state)
        self.commit()

    def cancel(self):
        """
        Cancel the pending task, if one is being processed.

        Does nothing unless the widget is in the `State.Processing` state.
        Otherwise the pending task's `cancel` callable is invoked, the
        pending task is dropped and the state becomes `State.Cancelled`.
        """
        if self.__state != State.Processing:
            return
        assert self.__pendingTask is not None
        self.__pendingTask.cancel()
        self.__pendingTask = None
        self.__setRuntimeState(State.Cancelled)

    @Slot(object)
    def __onReportProgress(self, arg):
        """
        Show the scan progress reported by the running task.

        `arg` must provide a `lastpath` attribute, which is shown
        (prettified) in the path label, and a `progress` attribute, which
        is multiplied by 100 and truncated to an int for the progress
        widget. Reports are ignored unless the widget is processing.
        """
        # must execute in the widget's own thread
        assert QThread.currentThread() is self.thread()
        if self.__state != State.Processing:
            return
        shown_path = prettifypath(arg.lastpath)
        self.pathlabel.setText(shown_path)
        percent = int(100 * arg.progress)
        self.progress_widget.setValue(percent)

    def commit(self):
        """
        Send the current corpus and the table of skipped documents.

        `self.corpus` is sent on the `data` output. When there are skipped
        documents, a table named "Skipped documents" is built over
        `SKIPPED_DOMAIN` with one row per document, holding the document
        entry and that entry joined onto the current path. `None` is sent
        on the `skipped_documents` output when nothing was skipped.
        """
        self.Outputs.data.send(self.corpus)
        skipped_table = None
        if self.skipped_documents:
            # `currentPath` can be None (no local folder selected) while
            # documents were still skipped; os.path.join(None, ...) would
            # raise TypeError, so fall back to the bare entry in that case.
            base_dir = self.currentPath or ""
            skipped_table = Table.from_list(
                SKIPPED_DOMAIN,
                [[name, os.path.join(base_dir, name)]
                 for name in self.skipped_documents]
            )
            skipped_table.name = "Skipped documents"
        self.Outputs.skipped_documents.send(skipped_table)

    def onDeleteWidget(self):
        """
        Cancel the pending task (if any) and shut down the executor.
        """
        # cancel before shutting down; shutdown is called with wait=True
        self.cancel()
        self.__executor.shutdown(wait=True)
        self.__invalidated = False

    def eventFilter(self, receiver, event):
        """
        Process drag and drop events on the recent directory combo box.

        Drag enter, drag move and drop events received by `self.recent_cb`
        are handled here and always reported as filtered (True). The event
        is accepted with a link drop action when it carries exactly one
        url that is a local directory and a link action is possible;
        otherwise it is ignored. On an accepted drop the directory becomes
        the current path and a scan is started. All other events are
        passed on to the base class implementation.
        """
        handled_types = {QEvent.DragEnter, QEvent.DragMove, QEvent.Drop}
        if receiver is not self.recent_cb \
                or event.type() not in handled_types:
            return super().eventFilter(receiver, event)

        assert isinstance(event, QDropEvent)

        # the dropped directory: a single url pointing at a local directory
        dropped_dir = None
        urls = event.mimeData().urls()
        if len(urls) == 1:
            local_path = urls[0].toLocalFile()
            if os.path.isdir(local_path):
                dropped_dir = local_path

        if dropped_dir is None \
                or not (event.possibleActions() & Qt.LinkAction):
            event.ignore()
            return True

        event.setDropAction(Qt.LinkAction)
        event.accept()
        if event.type() == QEvent.Drop:
            self.setCurrentPath(dropped_dir)
            self.start()
        return True

    def send_report(self):
        """
        Report the current path and the document counts.

        Nothing is reported when no current path is set. The number of
        categories and the number of skipped documents are included only
        when they are non-zero / non-empty.
        """
        if not self.currentPath:
            return
        rows = [
            ('Path', self.currentPath),
            ('Number of documents', self.n_text_data),
        ]
        if self.n_text_categories:
            rows.append(('Categories', self.n_text_categories))
        if self.skipped_documents:
            rows.append(('Number of skipped', len(self.skipped_documents)))
        self.report_items(rows)