Exemple #1
0
class OWTestLearners(widget.OWWidget):
    name = "Test Learners"
    description = ""
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [("Learner", Orange.classification.Learner,
               "set_learner", widget.Multiple),
              ("Data", Orange.data.Table, "set_train_data", widget.Default),
              ("Test Data", Orange.data.Table, "set_test_data")]

    outputs = [("Evaluation Results", testing.Results)]

    #: Resampling/testing types
    KFold, LeaveOneOut, Bootstrap, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    k_folds = settings.Setting(10)
    #: Number of repeats for bootstrap sampling
    n_repeat = settings.Setting(10)
    #: Bootstrap sampling p
    sample_p = settings.Setting(75)

    def __init__(self, parent=None):
        super().__init__(parent)

        self.train_data = None
        self.test_data = None

        #: An Ordered dictionary with current inputs and their testing
        #: results.
        self.learners = OrderedDict()

        sbox = gui.widgetBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(
            sbox, self, "resampling", callback=self._param_changed
        )
        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "k_folds", 2, 50, label="Number of folds:",
                 callback=self.kfold_changed)
        gui.appendRadioButton(rbox, "Leave one out")
        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "n_repeat", 2, 50, label="Repeat train/test",
                 callback=self.bootstrap_changed)
        gui.widgetLabel(ibox, "Relative training set size:")
        gui.hSlider(ibox, self, "sample_p", minValue=1, maxValue=100,
                    ticks=20, vertical=False, labelFormat="%d %%",
                    callback=self.bootstrap_changed)

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        rbox.layout().addSpacing(5)
        gui.button(rbox, self, "Apply", callback=self.apply)

        gui.rubber(self.controlArea)

        self.view = QTreeView(
            rootIsDecorated=False,
            uniformRowHeights=True,
            wordWrap=True,
            editTriggers=QTreeView.NoEditTriggers
        )
        header = self.view.header()
        header.setResizeMode(QHeaderView.ResizeToContents)
        header.setDefaultAlignment(Qt.AlignCenter)
        header.setStretchLastSection(False)

        self.result_model = QStandardItemModel()
        self.view.setModel(self.result_model)
        self.view.setItemDelegate(ItemDelegate())
        self._update_header()
        box = gui.widgetBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def set_learner(self, learner, key):
        if key in self.learners and learner is None:
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, ())
        self._update_stats_model()

    def set_train_data(self, data):
        self.error(0)
        if data is not None:
            if data.domain.class_var is None:
                self.error(0, "Train data input requires a class variable")
                data = None

        self.train_data = data
        self._update_header()
        self._invalidate()

    def set_test_data(self, data):
        self.error(1)
        if data is not None:
            if data.domain.class_var is None:
                self.error(1, "Test data input requires a class variable")
                data = None

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def handleNewSignals(self):
        self.update_results()
        self.commit()

    def kfold_changed(self):
        self.resampling = OWTestLearners.KFold
        self._param_changed()

    def bootstrap_changed(self):
        self.resampling = OWTestLearners.Bootstrap
        self._param_changed()

    def _param_changed(self):
        self._invalidate()

    def update_results(self):
        self.warning([1, 2])
        self.error(2)

        if self.train_data is None:
            return

        if self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                self.warning(2, "Missing separate test data input")
                return

            elif self.test_data.domain.class_var != \
                    self.train_data.domain.class_var:
                self.error(2, ("Inconsistent class variable between test " +
                               "and train data sets"))
                return

        # items in need of an update
        items = [(key, input) for key, input in self.learners.items()
                 if input.results is None]
        learners = [input.learner for _, input in items]

        self.setStatusMessage("Running")
        if self.test_data is not None and \
                self.resampling != OWTestLearners.TestOnTest:
            self.warning(1, "Test data is present but unused. "
                            "Select 'Test on test data' to use it.")

        # TODO: Test each learner individually

        if self.resampling == OWTestLearners.KFold:
            results = testing.CrossValidation(
                self.train_data, learners, k=self.k_folds, store_data=True
            )
        elif self.resampling == OWTestLearners.LeaveOneOut:
            results = testing.LeaveOneOut(
                self.train_data, learners, store_data=True
            )
        elif self.resampling == OWTestLearners.Bootstrap:
            p = self.sample_p / 100.0
            results = testing.Bootstrap(
                self.train_data, learners, n_resamples=self.n_repeat, p=p,
                store_data=True
            )
        elif self.resampling == OWTestLearners.TestOnTrain:
            results = testing.TestOnTrainingData(
                self.train_data, learners, store_data=True
            )
        elif self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                return
            results = testing.TestOnTestData(
                self.train_data, self.test_data, learners, store_data=True
            )
        else:
            assert False

        results = list(split_by_model(results))
        class_var = self.train_data.domain.class_var
        
        if is_discrete(class_var):
            test_stats = classification_stats
        else:
            test_stats = regression_stats
        
        self._update_header()
        
        stats = [test_stats(res) for res in results]
        for (key, input), res, stat in zip(items, results, stats):
            self.learners[key] = input._replace(results=res, stats=stat)

        self.setStatusMessage("")
        
        self._update_stats_model()

    def _update_header(self):
        headers = ["Method"]
        if self.train_data is not None:
            if is_discrete(self.train_data.domain.class_var):
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)
        for i in reversed(range(len(headers),
                                self.result_model.columnCount())):
            self.result_model.takeColumn(i)

        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        model = self.view.model()

        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        for input in self.learners.values():
            name = learner_name(input.learner)
            row = []
            head = QStandardItem()
            head.setData(name, Qt.DisplayRole)
            row.append(head)
            for stat in input.stats:
                item = QStandardItem()
                item.setData(" {:.3f} ".format(stat[0]), Qt.DisplayRole)
                row.append(item)
            model.appendRow(row)

    def _invalidate(self, which=None):
        if which is None:
            which = self.learners.keys()

        all_keys = list(self.learners.keys())
        model = self.view.model()

        for key in which:
            self.learners[key] = \
                self.learners[key]._replace(results=None, stats=None)

            if key in self.learners:
                row = all_keys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)

    def apply(self):
        self.update_results()
        self.commit()

    def commit(self):
        results = [val.results for val in self.learners.values()
                   if val.results is not None]
        if results:
            combined = results_merge(results)
            combined.learner_names = [learner_name(val.learner)
                                     for val in self.learners.values()]
        else:
            combined = None
        self.send("Evaluation Results", combined)
Exemple #2
0
class OWTestLearners(widget.OWWidget):
    name = "Test & Score"
    description = "Cross-validation accuracy estimation."
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [("Learner", Learner,
               "set_learner", widget.Multiple),
              ("Data", Table, "set_train_data", widget.Default),
              ("Test Data", Table, "set_test_data"),
              ("Preprocessor", Preprocess, "set_preprocessor")]

    outputs = [("Predictions", Orange.data.Table),
               ("Evaluation Results", Orange.evaluation.Results)]

    settingsHandler = settings.ClassValuesContextHandler()

    #: Resampling/testing types
    KFold, LeaveOneOut, ShuffleSplit, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    k_folds = settings.Setting(10)
    #: Number of repeats for ShuffleSplit sampling
    n_repeat = settings.Setting(10)
    #: ShuffleSplit sampling p
    sample_p = settings.Setting(75)

    TARGET_AVERAGE = "(Average over classes)"
    class_selection = settings.ContextSetting(TARGET_AVERAGE)

    def __init__(self):
        super().__init__()

        self.data = None
        self.test_data = None
        self.preprocessor = None
        self.train_data_missing_vals = False
        self.test_data_missing_vals = False

        #: An Ordered dictionary with current inputs and their testing
        #: results.
        self.learners = OrderedDict()

        sbox = gui.widgetBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(
            sbox, self, "resampling", callback=self._param_changed
        )
        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "k_folds", 2, 50, label="Number of folds:",
                 callback=self.kfold_changed)
        gui.appendRadioButton(rbox, "Leave one out")
        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "n_repeat", 2, 50, label="Repeat train/test",
                 callback=self.shuffle_split_changed)
        gui.widgetLabel(ibox, "Relative training set size:")
        gui.hSlider(ibox, self, "sample_p", minValue=1, maxValue=99,
                    ticks=20, vertical=False, labelFormat="%d %%",
                    callback=self.shuffle_split_changed)

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        rbox.layout().addSpacing(5)
        self.apply_button = gui.button(
            rbox, self, "Apply", callback=self.apply, default=True)

        self.cbox = gui.widgetBox(self.controlArea, "Target class")
        self.class_selection_combo = gui.comboBox(
            self.cbox, self, "class_selection", items=[],
            sendSelectedValue=True, valueType=str,
            callback=self._on_target_class_changed,
            contentsLength=8)

        gui.rubber(self.controlArea)

        self.view = QTreeView(
            rootIsDecorated=False,
            uniformRowHeights=True,
            wordWrap=True,
            editTriggers=QTreeView.NoEditTriggers
        )
        header = self.view.header()
        header.setResizeMode(QHeaderView.ResizeToContents)
        header.setDefaultAlignment(Qt.AlignCenter)
        header.setStretchLastSection(False)

        self.result_model = QStandardItemModel(self)
        self.result_model.setHorizontalHeaderLabels(["Method"])
        self.view.setModel(self.result_model)
        self.view.setItemDelegate(ItemDelegate())

        box = gui.widgetBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def sizeHint(self):
        return QSize(780, 1)

    def set_learner(self, learner, key):
        """
        Set the input `learner` for `key`.
        """
        if key in self.learners and learner is None:
            # Removed
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, None)
            self._invalidate([key])

    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.error(0)
        self.information(0)
        if data and not data.domain.class_var:
            self.error(0, "Train data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(0, "Train data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.train_data_missing_vals = data is not None and \
                                       np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(4, self._get_missing_data_warning(
                self.train_data_missing_vals, self.test_data_missing_vals
            ))
            if data:
                data = RemoveNaNClasses(data)

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()

    def set_test_data(self, data):
        """
        Set the input separate testing dataset.
        """
        self.error(1)
        self.information(1)
        if data and not data.domain.class_var:
            self.error(1, "Test data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(1, "Test data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.test_data_missing_vals = data is not None and \
                                      np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(4, self._get_missing_data_warning(
                self.train_data_missing_vals, self.test_data_missing_vals
            ))
            if data:
                data = RemoveNaNClasses(data)

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def _get_missing_data_warning(self, train_missing, test_missing):
        return "Instances with unknown target values were removed from{}data"\
            .format(train_missing * test_missing * " "
                    or train_missing * " train " or test_missing * " test ")

    def set_preprocessor(self, preproc):
        """
        Set the input preprocessor to apply on the training data.
        """
        self.preprocessor = preproc
        self._invalidate()

    def handleNewSignals(self):
        """Reimplemented from OWWidget.handleNewSignals."""
        self._update_class_selection()
        self.apply()

    def kfold_changed(self):
        self.resampling = OWTestLearners.KFold
        self._param_changed()

    def shuffle_split_changed(self):
        self.resampling = OWTestLearners.ShuffleSplit
        self._param_changed()

    def _param_changed(self):
        self._invalidate()

    def _update_results(self):
        """
        Run/evaluate the learners.
        """
        self.warning([1, 2])
        self.error(2)

        if self.data is None:
            return

        class_var = self.data.domain.class_var

        if self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                self.warning(2, "Missing separate test data input")
                return
            elif self.test_data.domain.class_var != class_var:
                self.error(2, ("Inconsistent class variable between test " +
                               "and train data sets"))
                return

        # items in need of an update
        items = [(key, slot) for key, slot in self.learners.items()
                 if slot.results is None]
        learners = [slot.learner for _, slot in items]
        if len(items) == 0:
            return

        if self.test_data is not None and \
                self.resampling != OWTestLearners.TestOnTest:
            self.warning(1, "Test data is present but unused. "
                            "Select 'Test on test data' to use it.")

        rstate = 42
        def update_progress(finished):
            self.progressBarSet(100 * finished)
        common_args = dict(
            store_data=True,
            preprocessor=self.preprocessor,
            callback=update_progress)
        self.setStatusMessage("Running")
        self.progressBarInit()

        try:
            if self.resampling == OWTestLearners.KFold:
                warnings = []
                results = Orange.evaluation.CrossValidation(
                    self.data, learners, k=self.k_folds, random_state=rstate,
                    warnings=warnings, **common_args)
                if warnings:
                    self.warning(2, warnings[0])
            elif self.resampling == OWTestLearners.LeaveOneOut:
                results = Orange.evaluation.LeaveOneOut(
                    self.data, learners, **common_args)
            elif self.resampling == OWTestLearners.ShuffleSplit:
                train_size = self.sample_p / 100
                results = Orange.evaluation.ShuffleSplit(
                    self.data, learners, n_resamples=self.n_repeat,
                    train_size=train_size, test_size=None,
                    random_state=rstate, **common_args)
            elif self.resampling == OWTestLearners.TestOnTrain:
                results = Orange.evaluation.TestOnTrainingData(
                    self.data, learners, **common_args)
            elif self.resampling == OWTestLearners.TestOnTest:
                results = Orange.evaluation.TestOnTestData(
                    self.data, self.test_data, learners, **common_args)
            else:
                assert False
        except RuntimeError as e:
            self.error(2, str(e))
            self.setStatusMessage("")
            self.progressBarFinished()
            return

        learner_key = {slot.learner: key for key, slot in self.learners.items()}
        for learner, result in zip(learners, split_by_model(results)):
            stats = None
            if class_var.is_discrete:
                scorers = classification_stats.scores
            elif class_var.is_continuous:
                scorers = regression_stats.scores
            else:
                scorers = None
            if scorers:
                ex = result.failed[0]
                if ex:
                    stats = [Try.Fail(ex)] * len(scorers)
                    result = Try.Fail(ex)
                else:
                    stats = [Try(lambda: score(result)) for score in scorers]
                    result = Try.Success(result)
            key = learner_key[learner]
            self.learners[key] = \
                self.learners[key]._replace(results=result, stats=stats)

        self.setStatusMessage("")
        self.progressBarFinished()

    def _update_header(self):
        # Set the correct horizontal header labels on the results_model.
        headers = ["Method"]
        if self.data is not None:
            if self.data.domain.has_discrete_class:
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)

        # remove possible extra columns from the model.
        for i in reversed(range(len(headers),
                                self.result_model.columnCount())):
            self.result_model.takeColumn(i)

        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        # Update the results_model with up to date scores.
        # Note: The target class specific scores (if requested) are
        # computed as needed in this method.
        model = self.view.model()
        # clear the table model, but preserving the header labels
        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        target_index = None
        if self.data is not None:
            class_var = self.data.domain.class_var
            if class_var.is_discrete and \
                    self.class_selection != self.TARGET_AVERAGE:
                target_index = class_var.values.index(self.class_selection)
        else:
            class_var = None

        errors = []
        has_missing_scores = False

        for key, slot in self.learners.items():
            name = learner_name(slot.learner)
            head = QStandardItem(name)
            head.setData(key, Qt.UserRole)
            if isinstance(slot.results, Try.Fail):
                head.setToolTip(str(slot.results.exception))
                head.setText("{} (error)".format(name))
                head.setForeground(QtGui.QBrush(Qt.red))
                errors.append("{name} failed with error:\n"
                              "{exc.__class__.__name__}: {exc!s}"
                              .format(name=name, exc=slot.results.exception))

            row = [head]

            if class_var is not None and class_var.is_discrete and \
                    target_index is not None:
                if slot.results is not None and slot.results.success:
                    ovr_results = results_one_vs_rest(
                        slot.results.value, target_index)

                    stats = [Try(lambda: score(ovr_results))
                             for score in classification_stats.scores]
                else:
                    stats = None
            else:
                stats = slot.stats

            if stats is not None:
                for stat in stats:
                    item = QStandardItem()
                    if stat.success:
                        item.setText("{:.3f}".format(stat.value[0]))
                    else:
                        item.setToolTip(str(stat.exception))
                        has_missing_scores = True
                    row.append(item)

            model.appendRow(row)

        if errors:
            self.error(3, "\n".join(errors))
        else:
            self.error(3)

        if has_missing_scores:
            self.warning(3, "Some scores could not be computed")
        else:
            self.warning(3)

    def _update_class_selection(self):
        self.class_selection_combo.setCurrentIndex(-1)
        self.class_selection_combo.clear()
        if not self.data:
            return

        if self.data.domain.has_discrete_class:
            self.cbox.setVisible(True)
            class_var = self.data.domain.class_var
            items = [self.TARGET_AVERAGE] + class_var.values
            self.class_selection_combo.addItems(items)

            class_index = 0
            if self.class_selection in class_var.values:
                class_index = class_var.values.index(self.class_selection) + 1

            self.class_selection_combo.setCurrentIndex(class_index)
            self.class_selection = items[class_index]
        else:
            self.cbox.setVisible(False)

    def _on_target_class_changed(self):
        self._update_stats_model()

    def _invalidate(self, which=None):
        # Invalidate learner results for `which` input keys
        # (if None then all learner results are invalidated)
        if which is None:
            which = self.learners.keys()

        model = self.view.model()
        statmodelkeys = [model.item(row, 0).data(Qt.UserRole)
                         for row in range(model.rowCount())]

        for key in which:
            self.learners[key] = \
                self.learners[key]._replace(results=None, stats=None)

            if key in statmodelkeys:
                row = statmodelkeys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)
                        item.setData(None, Qt.ToolTipRole)

        self.apply_button.setEnabled(True)

    def apply(self):
        self.apply_button.setEnabled(False)
        self._update_header()
        # Update the view to display the model names
        self._update_stats_model()
        self._update_results()
        self._update_stats_model()
        self.commit()

    def commit(self):
        valid = [slot for slot in self.learners.values()
                 if slot.results is not None and slot.results.success]
        if valid:
            # Evaluation results
            combined = results_merge([slot.results.value for slot in valid])
            combined.learner_names = [learner_name(slot.learner)
                                      for slot in valid]

            # Predictions & Probabilities
            predictions = combined.get_augmented_data(combined.learner_names)
        else:
            combined = None
            predictions = None
        self.send("Evaluation Results", combined)
        self.send("Predictions", predictions)
Exemple #3
0
class OWTestLearners(widget.OWWidget):
    name = "Test & Score"
    description = "Cross-validation accuracy estimation."
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [("Learner", Learner, "set_learner", widget.Multiple),
              ("Data", Table, "set_train_data", widget.Default),
              ("Test Data", Table, "set_test_data"),
              ("Preprocessor", Preprocess, "set_preprocessor")]

    outputs = [("Predictions", Orange.data.Table),
               ("Evaluation Results", Orange.evaluation.Results)]

    settingsHandler = settings.ClassValuesContextHandler()

    #: Resampling/testing types
    KFold, ShuffleSplit, LeaveOneOut, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    k_folds = settings.Setting(10)
    #: Stratified sampling for K-fold
    cv_stratified = settings.Setting(True)
    #: Number of repeats for ShuffleSplit sampling
    n_repeat = settings.Setting(10)
    #: ShuffleSplit sampling p
    sample_p = settings.Setting(75)
    #: Stratified sampling for Random Sampling
    shuffle_stratified = settings.Setting(True)

    TARGET_AVERAGE = "(Average over classes)"
    class_selection = settings.ContextSetting(TARGET_AVERAGE)

    def __init__(self):
        super().__init__()

        self.data = None
        self.test_data = None
        self.preprocessor = None
        self.train_data_missing_vals = False
        self.test_data_missing_vals = False

        #: An Ordered dictionary with current inputs and their testing
        #: results.
        self.learners = OrderedDict()

        sbox = gui.widgetBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(sbox,
                                self,
                                "resampling",
                                callback=self._param_changed)

        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox,
                 self,
                 "k_folds",
                 2,
                 50,
                 label="Number of folds:",
                 callback=self.kfold_changed)
        gui.checkBox(ibox,
                     self,
                     "cv_stratified",
                     "Stratified",
                     callback=self.kfold_changed)

        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox,
                 self,
                 "n_repeat",
                 2,
                 50,
                 label="Repeat train/test",
                 callback=self.shuffle_split_changed)
        gui.widgetLabel(ibox, "Relative training set size:")
        gui.hSlider(ibox,
                    self,
                    "sample_p",
                    minValue=1,
                    maxValue=99,
                    ticks=20,
                    vertical=False,
                    labelFormat="%d %%",
                    callback=self.shuffle_split_changed)
        gui.checkBox(ibox,
                     self,
                     "shuffle_stratified",
                     "Stratified",
                     callback=self.shuffle_split_changed)

        gui.appendRadioButton(rbox, "Leave one out")

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        rbox.layout().addSpacing(5)
        self.apply_button = gui.button(rbox,
                                       self,
                                       "Apply",
                                       callback=self.apply,
                                       default=True)

        self.cbox = gui.widgetBox(self.controlArea, "Target class")
        self.class_selection_combo = gui.comboBox(
            self.cbox,
            self,
            "class_selection",
            items=[],
            sendSelectedValue=True,
            valueType=str,
            callback=self._on_target_class_changed,
            contentsLength=8)

        gui.rubber(self.controlArea)

        self.view = QTreeView(rootIsDecorated=False,
                              uniformRowHeights=True,
                              wordWrap=True,
                              editTriggers=QTreeView.NoEditTriggers)
        header = self.view.header()
        header.setResizeMode(QHeaderView.ResizeToContents)
        header.setDefaultAlignment(Qt.AlignCenter)
        header.setStretchLastSection(False)

        self.result_model = QStandardItemModel(self)
        self.result_model.setHorizontalHeaderLabels(["Method"])
        self.view.setModel(self.result_model)
        self.view.setItemDelegate(ItemDelegate())

        box = gui.widgetBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def sizeHint(self):
        return QSize(780, 1)

    def set_learner(self, learner, key):
        """
        Set the input `learner` for `key`.
        """
        if key in self.learners and learner is None:
            # Removed
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, None)
            self._invalidate([key])

    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.error(0)
        self.information(0)
        if data and not data.domain.class_var:
            self.error(0, "Train data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(0, "Train data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.train_data_missing_vals = data is not None and \
                                       np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(
                4,
                self._get_missing_data_warning(self.train_data_missing_vals,
                                               self.test_data_missing_vals))
            if data:
                data = RemoveNaNClasses(data)

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()

    def set_test_data(self, data):
        """
        Set the input separate testing dataset.
        """
        self.error(1)
        self.information(1)
        if data and not data.domain.class_var:
            self.error(1, "Test data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(1, "Test data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.test_data_missing_vals = data is not None and \
                                      np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(
                4,
                self._get_missing_data_warning(self.train_data_missing_vals,
                                               self.test_data_missing_vals))
            if data:
                data = RemoveNaNClasses(data)

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def _get_missing_data_warning(self, train_missing, test_missing):
        return "Instances with unknown target values were removed from{}data"\
            .format(train_missing * test_missing * " "
                    or train_missing * " train " or test_missing * " test ")

    def set_preprocessor(self, preproc):
        """
        Set the input preprocessor to apply on the training data.
        """
        self.preprocessor = preproc
        self._invalidate()

    def handleNewSignals(self):
        """Reimplemented from OWWidget.handleNewSignals."""
        self._update_class_selection()
        self.apply()

    def kfold_changed(self):
        self.resampling = OWTestLearners.KFold
        self._param_changed()

    def shuffle_split_changed(self):
        self.resampling = OWTestLearners.ShuffleSplit
        self._param_changed()

    def _param_changed(self):
        self._invalidate()

    def _update_results(self):
        """
        Run/evaluate the learners.
        """
        self.warning([1, 2])
        self.error([2, 4])
        if self.data is None:
            return

        class_var = self.data.domain.class_var

        if self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                self.warning(2, "Missing separate test data input")
                return
            elif self.test_data.domain.class_var != class_var:
                self.error(2, ("Inconsistent class variable between test " +
                               "and train data sets"))
                return

        # items in need of an update
        items = [(key, slot) for key, slot in self.learners.items()
                 if slot.results is None]
        learners = [slot.learner for _, slot in items]
        if len(items) == 0:
            return

        if self.test_data is not None and \
                self.resampling != OWTestLearners.TestOnTest:
            self.warning(
                1, "Test data is present but unused. "
                "Select 'Test on test data' to use it.")

        rstate = 42

        def update_progress(finished):
            self.progressBarSet(100 * finished)

        common_args = dict(store_data=True,
                           preprocessor=self.preprocessor,
                           callback=update_progress)
        self.setStatusMessage("Running")

        with self.progressBar():
            try:
                if self.resampling == OWTestLearners.KFold:
                    if len(self.data) < self.k_folds:
                        self.error(4, "Number of folds exceeds the data size")
                        return

                    warnings = []
                    results = Orange.evaluation.CrossValidation(
                        self.data,
                        learners,
                        k=self.k_folds,
                        random_state=rstate,
                        warnings=warnings,
                        **common_args)
                    if warnings:
                        self.warning(2, warnings[0])
                elif self.resampling == OWTestLearners.LeaveOneOut:
                    results = Orange.evaluation.LeaveOneOut(
                        self.data, learners, **common_args)
                elif self.resampling == OWTestLearners.ShuffleSplit:
                    train_size = self.sample_p / 100
                    results = Orange.evaluation.ShuffleSplit(
                        self.data,
                        learners,
                        n_resamples=self.n_repeat,
                        train_size=train_size,
                        test_size=None,
                        stratified=self.shuffle_stratified,
                        random_state=rstate,
                        **common_args)
                elif self.resampling == OWTestLearners.TestOnTrain:
                    results = Orange.evaluation.TestOnTrainingData(
                        self.data, learners, **common_args)
                elif self.resampling == OWTestLearners.TestOnTest:
                    results = Orange.evaluation.TestOnTestData(
                        self.data, self.test_data, learners, **common_args)
                else:
                    assert False
            except RuntimeError as e:
                self.error(2, str(e))
                self.setStatusMessage("")
                return

        learner_key = {
            slot.learner: key
            for key, slot in self.learners.items()
        }
        for learner, result in zip(learners, split_by_model(results)):
            stats = None
            if class_var.is_discrete:
                scorers = classification_stats.scores
            elif class_var.is_continuous:
                scorers = regression_stats.scores
            else:
                scorers = None
            if scorers:
                ex = result.failed[0]
                if ex:
                    stats = [Try.Fail(ex)] * len(scorers)
                    result = Try.Fail(ex)
                else:
                    stats = [Try(lambda: score(result)) for score in scorers]
                    result = Try.Success(result)
            key = learner_key[learner]
            self.learners[key] = \
                self.learners[key]._replace(results=result, stats=stats)

        self.setStatusMessage("")

    def _update_header(self):
        # Set the correct horizontal header labels on the results_model.
        headers = ["Method"]
        if self.data is not None:
            if self.data.domain.has_discrete_class:
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)

        # remove possible extra columns from the model.
        for i in reversed(range(len(headers),
                                self.result_model.columnCount())):
            self.result_model.takeColumn(i)

        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        # Update the results_model with up to date scores.
        # Note: The target class specific scores (if requested) are
        # computed as needed in this method.
        model = self.view.model()
        # clear the table model, but preserving the header labels
        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        target_index = None
        if self.data is not None:
            class_var = self.data.domain.class_var
            if self.data.domain.has_discrete_class and \
                            self.class_selection != self.TARGET_AVERAGE:
                target_index = class_var.values.index(self.class_selection)
        else:
            class_var = None

        errors = []
        has_missing_scores = False

        for key, slot in self.learners.items():
            name = learner_name(slot.learner)
            head = QStandardItem(name)
            head.setData(key, Qt.UserRole)
            if isinstance(slot.results, Try.Fail):
                head.setToolTip(str(slot.results.exception))
                head.setText("{} (error)".format(name))
                head.setForeground(QtGui.QBrush(Qt.red))
                errors.append("{name} failed with error:\n"
                              "{exc.__class__.__name__}: {exc!s}".format(
                                  name=name, exc=slot.results.exception))

            row = [head]

            if class_var is not None and class_var.is_discrete and \
                    target_index is not None:
                if slot.results is not None and slot.results.success:
                    ovr_results = results_one_vs_rest(slot.results.value,
                                                      target_index)

                    stats = [
                        Try(lambda: score(ovr_results))
                        for score in classification_stats.scores
                    ]
                else:
                    stats = None
            else:
                stats = slot.stats

            if stats is not None:
                for stat in stats:
                    item = QStandardItem()
                    if stat.success:
                        item.setText("{:.3f}".format(stat.value[0]))
                    else:
                        item.setToolTip(str(stat.exception))
                        has_missing_scores = True
                    row.append(item)

            model.appendRow(row)

        if errors:
            self.error(3, "\n".join(errors))
        else:
            self.error(3)

        if has_missing_scores:
            self.warning(3, "Some scores could not be computed")
        else:
            self.warning(3)

    def _update_class_selection(self):
        self.class_selection_combo.setCurrentIndex(-1)
        self.class_selection_combo.clear()
        if not self.data:
            return

        if self.data.domain.has_discrete_class:
            self.cbox.setVisible(True)
            class_var = self.data.domain.class_var
            items = [self.TARGET_AVERAGE] + class_var.values
            self.class_selection_combo.addItems(items)

            class_index = 0
            if self.class_selection in class_var.values:
                class_index = class_var.values.index(self.class_selection) + 1

            self.class_selection_combo.setCurrentIndex(class_index)
            self.class_selection = items[class_index]
        else:
            self.cbox.setVisible(False)

    def _on_target_class_changed(self):
        self._update_stats_model()

    def _invalidate(self, which=None):
        # Invalidate learner results for `which` input keys
        # (if None then all learner results are invalidated)
        if which is None:
            which = self.learners.keys()

        model = self.view.model()
        statmodelkeys = [
            model.item(row, 0).data(Qt.UserRole)
            for row in range(model.rowCount())
        ]

        for key in which:
            self.learners[key] = \
                self.learners[key]._replace(results=None, stats=None)

            if key in statmodelkeys:
                row = statmodelkeys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)
                        item.setData(None, Qt.ToolTipRole)

        self.apply_button.setEnabled(True)

    def apply(self):
        self.apply_button.setEnabled(False)
        self._update_header()
        # Update the view to display the model names
        self._update_stats_model()
        self._update_results()
        self._update_stats_model()
        self.commit()

    def commit(self):
        valid = [
            slot for slot in self.learners.values()
            if slot.results is not None and slot.results.success
        ]
        if valid:
            # Evaluation results
            combined = results_merge([slot.results.value for slot in valid])
            combined.learner_names = [
                learner_name(slot.learner) for slot in valid
            ]

            # Predictions & Probabilities
            predictions = combined.get_augmented_data(combined.learner_names)
        else:
            combined = None
            predictions = None
        self.send("Evaluation Results", combined)
        self.send("Predictions", predictions)

    def send_report(self):
        if not self.data or not self.learners:
            return
        if self.resampling == self.KFold:
            stratified = 'Stratified ' if self.cv_stratified else ''
            items = [
                ("Sampling type",
                 "{}{}-fold Cross validation".format(stratified, self.k_folds))
            ]
        elif self.resampling == self.LeaveOneOut:
            items = [("Sampling type", "Leave one out")]
        elif self.resampling == self.ShuffleSplit:
            stratified = 'Stratified ' if self.shuffle_stratified else ''
            items = [
                ("Sampling type",
                 "{}Shuffle split, {} random samples with {}% data ".format(
                     stratified, self.n_repeat, self.sample_p))
            ]
        elif self.resampling == self.TestOnTrain:
            items = [("Sampling type", "No sampling, test on training data")]
        elif self.resampling == self.TestOnTest:
            items = [("Sampling type", "No sampling, test on testing data")]
        else:
            items = []
        if self.data.domain.has_discrete_class:
            items += [("Target class", self.class_selection.strip("()"))]
        if items:
            self.report_items("Settings", items)
        self.report_table("Scores", self.view)
Exemple #4
0
class OWTestLearners(widget.OWWidget):
    name = "Test Learners"
    description = "Cross-validation accuracy estimation."
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [("Learner", Orange.classification.Learner,
               "set_learner", widget.Multiple),
              ("Data", Orange.data.Table, "set_train_data", widget.Default),
              ("Test Data", Orange.data.Table, "set_test_data")]

    outputs = [("Evaluation Results", Orange.evaluation.Results)]

    settingsHandler = settings.ClassValuesContextHandler()

    #: Resampling/testing types
    KFold, LeaveOneOut, Bootstrap, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    k_folds = settings.Setting(10)
    #: Number of repeats for bootstrap sampling
    n_repeat = settings.Setting(10)
    #: Bootstrap sampling p
    sample_p = settings.Setting(75)

    class_selection = settings.ContextSetting("(None)")

    def __init__(self, parent=None):
        super().__init__(parent)

        self.train_data = None
        self.test_data = None

        #: An Ordered dictionary with current inputs and their testing
        #: results.
        self.learners = OrderedDict()

        sbox = gui.widgetBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(
            sbox, self, "resampling", callback=self._param_changed
        )
        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "k_folds", 2, 50, label="Number of folds:",
                 callback=self.kfold_changed)
        gui.appendRadioButton(rbox, "Leave one out")
        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "n_repeat", 2, 50, label="Repeat train/test",
                 callback=self.bootstrap_changed)
        gui.widgetLabel(ibox, "Relative training set size:")
        gui.hSlider(ibox, self, "sample_p", minValue=1, maxValue=100,
                    ticks=20, vertical=False, labelFormat="%d %%",
                    callback=self.bootstrap_changed)

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        rbox.layout().addSpacing(5)
        gui.button(rbox, self, "Apply", callback=self.apply)

        self.cbox = gui.widgetBox(self.controlArea, "Target class")
        self.class_selection_combo = gui.comboBox(self.cbox, self, "class_selection",
             items=[],
             callback=self._select_class,
             sendSelectedValue=True, valueType=str)

        gui.rubber(self.controlArea)


        self.view = QTreeView(
            rootIsDecorated=False,
            uniformRowHeights=True,
            wordWrap=True,
            editTriggers=QTreeView.NoEditTriggers
        )
        header = self.view.header()
        header.setResizeMode(QHeaderView.ResizeToContents)
        header.setDefaultAlignment(Qt.AlignCenter)
        header.setStretchLastSection(False)

        self.result_model = QStandardItemModel()
        self.view.setModel(self.result_model)
        self.view.setItemDelegate(ItemDelegate())
        self._update_header()
        box = gui.widgetBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def set_learner(self, learner, key):
        if key in self.learners and learner is None:
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, ())
        self._update_stats_model()

    def set_train_data(self, data):
        self.error(0)
        if data and not data.domain.class_var:
            self.error(0, "Train data input requires a class variable")
            data = None

        self.train_data = data
        self.closeContext()
        if data is not None:
            self.openContext(data.domain.class_var)
            self._update_header()
        self._update_class_selection()
        self._invalidate()

    def set_test_data(self, data):
        self.error(1)
        if data and not data.domain.class_var:
            self.error(1, "Test data input requires a class variable")
            data = None

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def handleNewSignals(self):
        self.update_results()
        self.commit()

    def kfold_changed(self):
        self.resampling = OWTestLearners.KFold
        self._param_changed()

    def bootstrap_changed(self):
        self.resampling = OWTestLearners.Bootstrap
        self._param_changed()

    def _param_changed(self):
        self._invalidate()

    def update_results(self):
        self.warning([1, 2])
        self.error(2)

        if self.train_data is None:
            return

        if self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                self.warning(2, "Missing separate test data input")
                return

            elif self.test_data.domain.class_var != \
                    self.train_data.domain.class_var:
                self.error(2, ("Inconsistent class variable between test " +
                               "and train data sets"))
                return

        # items in need of an update
        items = [(key, input) for key, input in self.learners.items()
                 if input.results is None]
        learners = [input.learner for _, input in items]

        self.setStatusMessage("Running")
        if self.test_data is not None and \
                self.resampling != OWTestLearners.TestOnTest:
            self.warning(1, "Test data is present but unused. "
                            "Select 'Test on test data' to use it.")

        # TODO: Test each learner individually
        try:
            if self.resampling == OWTestLearners.KFold:
                results = Orange.evaluation.CrossValidation(
                    self.train_data, learners, k=self.k_folds, store_data=True
                )
            elif self.resampling == OWTestLearners.LeaveOneOut:
                results = Orange.evaluation.LeaveOneOut(
                    self.train_data, learners, store_data=True
                )
            elif self.resampling == OWTestLearners.Bootstrap:
                p = self.sample_p / 100.0
                results = Orange.evaluation.Bootstrap(
                    self.train_data, learners, n_resamples=self.n_repeat, p=p,
                    store_data=True
                )
            elif self.resampling == OWTestLearners.TestOnTrain:
                results = Orange.evaluation.TestOnTrainingData(
                    self.train_data, learners, store_data=True
                )
            elif self.resampling == OWTestLearners.TestOnTest:
                if self.test_data is None:
                    return
                results = Orange.evaluation.TestOnTestData(
                    self.train_data, self.test_data, learners, store_data=True
                )
            else:
                assert False
        except Exception as e:
            self.error(2, str(e))
            return

        self.results = results
        results = list(split_by_model(results))

        class_var = self.train_data.domain.class_var

        if class_var.is_discrete:
            stats = [classification_stats(self.one_vs_rest(res)) for res in results]
        else:
            stats = [regression_stats(res) for res in results]

        self._update_header()

        for (key, input), res, stat in zip(items, results, stats):
            self.learners[key] = input._replace(results=res, stats=stat)

        self.setStatusMessage("")

        self._update_stats_model()


    def _update_header(self):
        headers = ["Method"]
        if self.train_data is not None:
            if self.train_data.domain.class_var.is_discrete:
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)
        for i in reversed(range(len(headers),
                                self.result_model.columnCount())):
            self.result_model.takeColumn(i)

        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        model = self.view.model()

        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        for input in self.learners.values():
            name = learner_name(input.learner)
            row = []
            head = QStandardItem()
            head.setData(name, Qt.DisplayRole)
            row.append(head)
            for stat in input.stats:
                item = QStandardItem()
                item.setData(" {:.3f} ".format(stat[0]), Qt.DisplayRole)
                row.append(item)
            model.appendRow(row)

    def _update_class_selection(self):
        self.class_selection_combo.clear()
        if not self.train_data:
            return
        if self.train_data.domain.class_var.is_discrete:
            self.cbox.setVisible(True)
            values = self.train_data.domain.class_var.values
            self.class_selection_combo.addItem("(None)")
            self.class_selection_combo.addItems(values)

            class_index = 0
            if self.class_selection in self.train_data.domain.class_var.values:
                    class_index = self.train_data.domain.class_var.values.index(self.class_selection)+1
            else:
                self.class_selection = '(None)'

            self.class_selection_combo.setCurrentIndex(class_index)
            self.previous_class_selection = "(None)"
        else:
            self.cbox.setVisible(False)

    def one_vs_rest(self, res):
        if self.class_selection != '(None)' and self.class_selection != 0:
            class_ = self.train_data.domain.class_var.values.index(self.class_selection)
            actual = res.actual == class_
            predicted = res.predicted == class_
            return Results(
                nmethods=1, domain=self.train_data.domain,
                actual=actual, predicted=predicted)
        else:
            return res

    def _select_class(self):
        if self.previous_class_selection == self.class_selection:
            return

        results = list(split_by_model(self.results))

        items = [(key, input) for key, input in self.learners.items()]
        learners = [input.learner for _, input in items]

        class_var = self.train_data.domain.class_var
        if class_var.is_discrete:
            stats = [classification_stats(self.one_vs_rest(res)) for res in results]
        else:
            stats = [regression_stats(res) for res in results]

        for (key, input), res, stat in zip(items, results, stats):
            self.learners[key] = input._replace(results=res, stats=stat)

        self.setStatusMessage("")

        self._update_stats_model()
        self.previous_class_selection = self.class_selection

    def _invalidate(self, which=None):
        if which is None:
            which = self.learners.keys()

        all_keys = list(self.learners.keys())
        model = self.view.model()

        for key in which:
            self.learners[key] = \
                self.learners[key]._replace(results=None, stats=None)

            if key in self.learners:
                row = all_keys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)

    def apply(self):
        self.update_results()
        self.commit()

    def commit(self):
        results = [val.results for val in self.learners.values()
                   if val.results is not None]
        if results:
            combined = results_merge(results)
            combined.learner_names = [learner_name(val.learner)
                                     for val in self.learners.values()]
        else:
            combined = None
        self.send("Evaluation Results", combined)
Exemple #5
0
class OWTestLearners(OWWidget):
    name = "Test & Score"
    description = "Cross-validation accuracy estimation."
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [
        ("Learner", Learner, "set_learner", widget.Multiple),
        ("Data", Table, "set_train_data", widget.Default),
        ("Test Data", Table, "set_test_data"),
        ("Preprocessor", Preprocess, "set_preprocessor"),
    ]

    outputs = [("Predictions", Table), ("Evaluation Results", Results)]

    settingsHandler = settings.ClassValuesContextHandler()

    #: Resampling/testing types
    KFold, ShuffleSplit, LeaveOneOut, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4
    #: Numbers of folds
    NFolds = [2, 3, 5, 10, 20]
    #: Number of repetitions
    NRepeats = [2, 3, 5, 10, 20, 50, 100]
    #: Sample sizes
    SampleSizes = [5, 10, 20, 25, 30, 33, 40, 50, 60, 66, 70, 75, 80, 90, 95]

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    n_folds = settings.Setting(3)
    #: Stratified sampling for K-fold
    cv_stratified = settings.Setting(True)
    #: Number of repeats for ShuffleSplit sampling
    n_repeats = settings.Setting(3)
    #: ShuffleSplit sample size
    sample_size = settings.Setting(9)
    #: Stratified sampling for Random Sampling
    shuffle_stratified = settings.Setting(True)

    TARGET_AVERAGE = "(Average over classes)"
    class_selection = settings.ContextSetting(TARGET_AVERAGE)

    class Error(OWWidget.Error):
        class_required = Msg("Train data input requires a class variable")
        class_required_test = Msg("Test data input requires a class variable")
        too_many_folds = Msg("Number of folds exceeds the data size")
        class_inconsistent = Msg("Test and train data sets " "have different classes")

    class Warning(OWWidget.Warning):
        missing_data = Msg("Instances with unknown target values were removed from{}data")
        test_data_missing = Msg("Missing separate test data input")
        scores_not_computed = Msg("Some scores could not be computed")
        test_data_unused = Msg("Test data is present but unused. " "Select 'Test on test data' to use it.")

    class Information(OWWidget.Information):
        data_sampled = Msg("Train data has been sampled")
        test_data_sampled = Msg("Test data has been sampled")

    def __init__(self):
        super().__init__()

        self.data = None
        self.test_data = None
        self.preprocessor = None
        self.train_data_missing_vals = False
        self.test_data_missing_vals = False

        #: An Ordered dictionary with current inputs and their testing results.
        self.learners = OrderedDict()

        sbox = gui.vBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(sbox, self, "resampling", callback=self._param_changed)

        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.comboBox(
            ibox,
            self,
            "n_folds",
            label="Number of folds: ",
            items=[str(x) for x in self.NFolds],
            maximumContentsLength=3,
            orientation=Qt.Horizontal,
            callback=self.kfold_changed,
        )
        gui.checkBox(ibox, self, "cv_stratified", "Stratified", callback=self.kfold_changed)

        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.comboBox(
            ibox,
            self,
            "n_repeats",
            label="Repeat train/test: ",
            items=[str(x) for x in self.NRepeats],
            maximumContentsLength=3,
            orientation=Qt.Horizontal,
            callback=self.shuffle_split_changed,
        )
        gui.comboBox(
            ibox,
            self,
            "sample_size",
            label="Training set size: ",
            items=["{} %".format(x) for x in self.SampleSizes],
            maximumContentsLength=5,
            orientation=Qt.Horizontal,
            callback=self.shuffle_split_changed,
        )
        gui.checkBox(ibox, self, "shuffle_stratified", "Stratified", callback=self.shuffle_split_changed)

        gui.appendRadioButton(rbox, "Leave one out")

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        self.cbox = gui.vBox(self.controlArea, "Target Class")
        self.class_selection_combo = gui.comboBox(
            self.cbox,
            self,
            "class_selection",
            items=[],
            sendSelectedValue=True,
            valueType=str,
            callback=self._on_target_class_changed,
            contentsLength=8,
        )

        gui.rubber(self.controlArea)

        self.view = gui.TableView(wordWrap=True)
        header = self.view.horizontalHeader()
        header.setResizeMode(QHeaderView.ResizeToContents)
        header.setDefaultAlignment(Qt.AlignCenter)
        header.setStretchLastSection(False)

        self.result_model = QStandardItemModel(self)
        self.result_model.setHorizontalHeaderLabels(["Method"])
        self.view.setModel(self.result_model)
        self.view.setItemDelegate(ItemDelegate())

        box = gui.vBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def sizeHint(self):
        return QSize(780, 1)

    def set_learner(self, learner, key):
        """
        Set the input `learner` for `key`.
        """
        if key in self.learners and learner is None:
            # Removed
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, None)
            self._invalidate([key])

    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.Information.data_sampled.clear()
        if data and not data.domain.class_var:
            self.Error.class_required()
            data = None
        else:
            self.Error.class_required.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()

    def set_test_data(self, data):
        """
        Set the input separate testing dataset.
        """
        self.Information.test_data_sampled.clear()
        if data and not data.domain.class_var:
            self.Error.class_required()
            data = None
        else:
            self.Error.class_required_test.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.test_data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.test_data_missing_vals = data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def _which_missing_data(self):
        return {(True, True): " ", (True, False): " train ", (False, True): " test "}[  # both, don't specify
            (self.train_data_missing_vals, self.test_data_missing_vals)
        ]

    def set_preprocessor(self, preproc):
        """
        Set the input preprocessor to apply on the training data.
        """
        self.preprocessor = preproc
        self._invalidate()

    def handleNewSignals(self):
        """Reimplemented from OWWidget.handleNewSignals."""
        self._update_class_selection()
        self.commit()

    def kfold_changed(self):
        self.resampling = OWTestLearners.KFold
        self._param_changed()

    def shuffle_split_changed(self):
        self.resampling = OWTestLearners.ShuffleSplit
        self._param_changed()

    def _param_changed(self):
        self._invalidate()

    def _update_results(self):
        """
        Run/evaluate the learners.
        """
        self.Warning.test_data_unused.clear()
        self.Warning.test_data_missing.clear()
        self.warning()
        self.Error.class_inconsistent.clear()
        self.Error.too_many_folds.clear()
        self.error()
        if self.data is None:
            return

        class_var = self.data.domain.class_var

        if self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                self.Warning.test_data_missing()
                return
            elif self.test_data.domain.class_var != class_var:
                self.Error.class_inconsistent()
                return

        # items in need of an update
        items = [(key, slot) for key, slot in self.learners.items() if slot.results is None]
        learners = [slot.learner for _, slot in items]
        if len(items) == 0:
            return

        if self.test_data is not None and self.resampling != OWTestLearners.TestOnTest:
            self.Warning.test_data_unused()

        rstate = 42

        def update_progress(finished):
            self.progressBarSet(100 * finished)

        common_args = dict(store_data=True, preprocessor=self.preprocessor, callback=update_progress, n_jobs=-1)
        self.setStatusMessage("Running")

        with self.progressBar():
            try:
                folds = self.NFolds[self.n_folds]
                if self.resampling == OWTestLearners.KFold:
                    if len(self.data) < folds:
                        self.Error.too_many_folds()
                        return
                    warnings = []
                    results = Orange.evaluation.CrossValidation(
                        self.data, learners, k=folds, random_state=rstate, warnings=warnings, **common_args
                    )
                    if warnings:
                        self.warning(warnings[0])
                elif self.resampling == OWTestLearners.LeaveOneOut:
                    results = Orange.evaluation.LeaveOneOut(self.data, learners, **common_args)
                elif self.resampling == OWTestLearners.ShuffleSplit:
                    train_size = self.SampleSizes[self.sample_size] / 100
                    results = Orange.evaluation.ShuffleSplit(
                        self.data,
                        learners,
                        n_resamples=self.NRepeats[self.n_repeats],
                        train_size=train_size,
                        test_size=None,
                        stratified=self.shuffle_stratified,
                        random_state=rstate,
                        **common_args
                    )
                elif self.resampling == OWTestLearners.TestOnTrain:
                    results = Orange.evaluation.TestOnTrainingData(self.data, learners, **common_args)
                elif self.resampling == OWTestLearners.TestOnTest:
                    results = Orange.evaluation.TestOnTestData(self.data, self.test_data, learners, **common_args)
                else:
                    assert False
            except (RuntimeError, ValueError) as e:
                self.error(str(e))
                self.setStatusMessage("")
                return
            else:
                self.error()

        learner_key = {slot.learner: key for key, slot in self.learners.items()}
        for learner, result in zip(learners, results.split_by_model()):
            stats = None
            if class_var.is_discrete:
                scorers = classification_stats.scores
            elif class_var.is_continuous:
                scorers = regression_stats.scores
            else:
                scorers = None
            if scorers:
                ex = result.failed[0]
                if ex:
                    stats = [Try.Fail(ex)] * len(scorers)
                    result = Try.Fail(ex)
                else:
                    stats = [Try(lambda: score(result)) for score in scorers]
                    result = Try.Success(result)
            key = learner_key[learner]
            self.learners[key] = self.learners[key]._replace(results=result, stats=stats)

        self.setStatusMessage("")

    def _update_header(self):
        # Set the correct horizontal header labels on the results_model.
        headers = ["Method"]
        if self.data is not None:
            if self.data.domain.has_discrete_class:
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)

        # remove possible extra columns from the model.
        for i in reversed(range(len(headers), self.result_model.columnCount())):
            self.result_model.takeColumn(i)

        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        # Update the results_model with up to date scores.
        # Note: The target class specific scores (if requested) are
        # computed as needed in this method.
        model = self.view.model()
        # clear the table model, but preserving the header labels
        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        target_index = None
        if self.data is not None:
            class_var = self.data.domain.class_var
            if self.data.domain.has_discrete_class and self.class_selection != self.TARGET_AVERAGE:
                target_index = class_var.values.index(self.class_selection)
        else:
            class_var = None

        errors = []
        has_missing_scores = False

        for key, slot in self.learners.items():
            name = learner_name(slot.learner)
            head = QStandardItem(name)
            head.setData(key, Qt.UserRole)
            if isinstance(slot.results, Try.Fail):
                head.setToolTip(str(slot.results.exception))
                head.setText("{} (error)".format(name))
                head.setForeground(QtGui.QBrush(Qt.red))
                errors.append(
                    "{name} failed with error:\n"
                    "{exc.__class__.__name__}: {exc!s}".format(name=name, exc=slot.results.exception)
                )

            row = [head]

            if class_var is not None and class_var.is_discrete and target_index is not None:
                if slot.results is not None and slot.results.success:
                    ovr_results = results_one_vs_rest(slot.results.value, target_index)

                    stats = [Try(lambda: score(ovr_results)) for score in classification_stats.scores]
                else:
                    stats = None
            else:
                stats = slot.stats

            if stats is not None:
                for stat in stats:
                    item = QStandardItem()
                    if stat.success:
                        item.setText("{:.3f}".format(stat.value[0]))
                    else:
                        item.setToolTip(str(stat.exception))
                        has_missing_scores = True
                    row.append(item)

            model.appendRow(row)

        self.error("\n".join(errors), shown=bool(errors))
        self.Warning.scores_not_computed(shown=has_missing_scores)

    def _update_class_selection(self):
        self.class_selection_combo.setCurrentIndex(-1)
        self.class_selection_combo.clear()
        if not self.data:
            return

        if self.data.domain.has_discrete_class:
            self.cbox.setVisible(True)
            class_var = self.data.domain.class_var
            items = [self.TARGET_AVERAGE] + class_var.values
            self.class_selection_combo.addItems(items)

            class_index = 0
            if self.class_selection in class_var.values:
                class_index = class_var.values.index(self.class_selection) + 1

            self.class_selection_combo.setCurrentIndex(class_index)
            self.class_selection = items[class_index]
        else:
            self.cbox.setVisible(False)

    def _on_target_class_changed(self):
        self._update_stats_model()

    def _invalidate(self, which=None):
        # Invalidate learner results for `which` input keys
        # (if None then all learner results are invalidated)
        if which is None:
            which = self.learners.keys()

        model = self.view.model()
        statmodelkeys = [model.item(row, 0).data(Qt.UserRole) for row in range(model.rowCount())]

        for key in which:
            self.learners[key] = self.learners[key]._replace(results=None, stats=None)

            if key in statmodelkeys:
                row = statmodelkeys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)
                        item.setData(None, Qt.ToolTipRole)

        self.commit()

    def commit(self):
        """Recompute and output the results"""
        self._update_header()
        # Update the view to display the model names
        self._update_stats_model()
        self._update_results()
        self._update_stats_model()
        valid = [slot for slot in self.learners.values() if slot.results is not None and slot.results.success]
        if valid:
            # Evaluation results
            combined = results_merge([slot.results.value for slot in valid])
            combined.learner_names = [learner_name(slot.learner) for slot in valid]

            # Predictions & Probabilities
            predictions = combined.get_augmented_data(combined.learner_names)
        else:
            combined = None
            predictions = None
        self.send("Evaluation Results", combined)
        self.send("Predictions", predictions)

    def send_report(self):
        """Report on the testing schema and results"""
        if not self.data or not self.learners:
            return
        if self.resampling == self.KFold:
            stratified = "Stratified " if self.cv_stratified else ""
            items = [("Sampling type", "{}{}-fold Cross validation".format(stratified, self.NFolds[self.n_folds]))]
        elif self.resampling == self.LeaveOneOut:
            items = [("Sampling type", "Leave one out")]
        elif self.resampling == self.ShuffleSplit:
            stratified = "Stratified " if self.shuffle_stratified else ""
            items = [
                (
                    "Sampling type",
                    "{}Shuffle split, {} random samples with {}% data ".format(
                        stratified, self.NRepeats[self.n_repeats], self.SampleSizes[self.sample_size]
                    ),
                )
            ]
        elif self.resampling == self.TestOnTrain:
            items = [("Sampling type", "No sampling, test on training data")]
        elif self.resampling == self.TestOnTest:
            items = [("Sampling type", "No sampling, test on testing data")]
        else:
            items = []
        if self.data.domain.has_discrete_class:
            items += [("Target class", self.class_selection.strip("()"))]
        if items:
            self.report_items("Settings", items)
        self.report_table("Scores", self.view)
Exemple #6
0
class OWTestLearners(widget.OWWidget):
    name = "Test Learners"
    description = ""
    icon = "icons/TestLearners1.svg"
    priority = 100

    inputs = [("Learner", Orange.classification.Fitter,
               "set_learner", widget.Multiple),
              ("Data", Orange.data.Table, "set_train_data", widget.Default),
              ("Test Data", Orange.data.Table, "set_test_data")]

    outputs = [("Evaluation Results", testing.Results)]

    #: Resampling/testing types
    KFold, LeaveOneOut, Bootstrap, TestOnTrain, TestOnTest = 0, 1, 2, 3, 4

    #: Selected resampling type
    resampling = settings.Setting(0)
    #: Number of folds for K-fold cross validation
    k_folds = settings.Setting(10)
    #: Number of repeats for bootstrap sampling
    n_repeat = settings.Setting(10)
    #: Bootstrap sampling p
    sample_p = settings.Setting(75)

    def __init__(self, parent=None):
        super().__init__(parent)

        self.train_data = None
        self.test_data = None

        #: An Ordered dictionary with current inputs and their testing
        #: results.
        self.learners = OrderedDict()

        sbox = gui.widgetBox(self.controlArea, "Sampling")
        rbox = gui.radioButtons(
            sbox, self, "resampling", callback=self._param_changed
        )
        gui.appendRadioButton(rbox, "Cross validation")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "k_folds", 2, 50, label="Number of folds:",
                 callback=self._param_changed)
        gui.appendRadioButton(rbox, "Leave one out")
        gui.appendRadioButton(rbox, "Random sampling")
        ibox = gui.indentedBox(rbox)
        gui.spin(ibox, self, "n_repeat", 2, 50, label="Repeat train/test",
                 callback=self._param_changed)
        gui.widgetLabel(ibox, "Relative training set size:")
        gui.hSlider(ibox, self, "sample_p", minValue=1, maxValue=100,
                    ticks=20, vertical=False,
                    callback=self._param_changed)

        gui.appendRadioButton(rbox, "Test on train data")
        gui.appendRadioButton(rbox, "Test on test data")

        rbox.layout().addSpacing(5)
        gui.button(rbox, self, "Apply", callback=self.apply)

        gui.rubber(self.controlArea)

        self.view = QTreeView(
            rootIsDecorated=False,
            uniformRowHeights=True,
            wordWrap=True,
            editTriggers=QTreeView.NoEditTriggers
        )

        self.result_model = QStandardItemModel()
        self.view.setModel(self.result_model)
        self._update_header()
        box = gui.widgetBox(self.mainArea, "Evaluation Results")
        box.layout().addWidget(self.view)

    def set_learner(self, learner, key):
        if key in self.learners and learner is None:
            del self.learners[key]
        else:
            self.learners[key] = Input(learner, None, ())
        self._update_stats_model()

    def set_train_data(self, data):
        self.train_data = data
        self._update_header()
        self._invalidate()

    def set_test_data(self, data):
        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()

    def handleNewSignals(self):
        self.update_results()
        self.commit()

    def _param_changed(self):
        self._invalidate()

    def update_results(self):
        self.warning(1, "")
        if self.train_data is None:
            return

        # items in need of an update
        items = [(key, input) for key, input in self.learners.items()
                 if input.results is None]
        learners = [input.learner for _, input in items]

        self.setStatusMessage("Running")
        if self.test_data is not None and \
                self.resampling != OWTestLearners.TestOnTest:
            self.warning(1, "Select 'Test on test data' to use the test data")

        # TODO: Test each learner individually

        if self.resampling == OWTestLearners.KFold:
            results = testing.CrossValidation(
                self.train_data, learners, k=self.k_folds, store_data=True
            )
        elif self.resampling == OWTestLearners.LeaveOneOut:
            results = testing.LeaveOneOut(
                self.train_data, learners, store_data=True
            )
        elif self.resampling == OWTestLearners.Bootstrap:
            p = self.sample_p / 100.0
            results = testing.Bootstrap(
                self.train_data, learners, n_resamples=self.n_repeat, p=p,
                store_data=True
            )
        elif self.resampling == OWTestLearners.TestOnTrain:
            results = testing.TestOnTrainingData(
                self.train_data, learners, store_data=True
            )
        elif self.resampling == OWTestLearners.TestOnTest:
            if self.test_data is None:
                return
            results = testing.TestOnTestData(
                self.train_data, self.test_data, learners, store_data=True
            )
        else:
            assert False

        results = list(split_by_model(results))
        class_var = self.train_data.domain.class_var
        
        if is_discrete(class_var):
            test_stats = classification_stats
        else:
            test_stats = regression_stats
        
        self._update_header()
        
        stats = [test_stats(res) for res in results]
        for (key, input), res, stat in zip(items, results, stats):
            self.learners[key] = input._replace(results=res, stats=stat)

        self.setStatusMessage("")
        
        self._update_stats_model()

    def _update_header(self):
        headers = ["Method"]
        if self.train_data is not None:
            if is_discrete(self.train_data.domain.class_var):
                if len(self.train_data.domain.class_var.values) == 2:
                    headers.extend(classification_stats.headers_binary)
                headers.extend(classification_stats.headers)
            else:
                headers.extend(regression_stats.headers)
        for i in reversed(range(len(headers),
                                self.result_model.columnCount())):
            self.result_model.takeColumn(i)
        
        self.result_model.setHorizontalHeaderLabels(headers)

    def _update_stats_model(self):
        model = self.view.model()

        for r in reversed(range(model.rowCount())):
            model.takeRow(r)

        for input in self.learners.values():
            name = learner_name(input.learner)
            row = []
            head = QStandardItem()
            head.setData(name, Qt.DisplayRole)
            row.append(head)
            for stat in input.stats:
                item = QStandardItem()
                item.setData(float(stat[0]), Qt.DisplayRole)
                row.append(item)
            model.appendRow(row)

        self.view.resizeColumnToContents(0)

    def _invalidate(self, which=None):
        if which is None:
            which = self.learners.keys()

        all_keys = list(self.learners.keys())
        model = self.view.model()

        for key in which:
            self.learners[key] = \
                self.learners[key]._replace(results=None, stats=None)

            if key in self.learners:
                row = all_keys.index(key)
                for c in range(1, model.columnCount()):
                    item = model.item(row, c)
                    if item is not None:
                        item.setData(None, Qt.DisplayRole)

    def apply(self):
        self.update_results()
        self.commit()

    def commit(self):
        results = [val.results for val in self.learners.values()
                   if val.results is not None]
        if results:
            combined = results_merge(results)
            combined.fitter_names = [learner_name(val.learner)
                                     for val in self.learners.values()]
        else:
            combined = None
        self.send("Evaluation Results", combined)