Example #1
0
 def setUp(self):
     """Create the test domain, the expected argument tuples and a handler.

     ``self.args`` pairs each variable name with ``Continuous`` or
     ``Discrete``; ``self.args_match_all`` pairs discrete variables with
     the list of their values instead.
     """
     attributes = [
         ContinuousVariable("c1"),
         DiscreteVariable("d1", values="abc"),
         DiscreteVariable("d2", values="def"),
     ]
     class_vars = [DiscreteVariable("d3", values="ghi")]
     metas = [
         ContinuousVariable("c2"),
         DiscreteVariable("d4", values="jkl"),
     ]
     self.domain = Domain(
         attributes=attributes, class_vars=class_vars, metas=metas)

     # Discrete variables described by their kind only.
     kind_attributes = (
         ("c1", Continuous), ("d1", Discrete), ("d2", Discrete))
     kind_class_vars = (("d3", Discrete), )
     kind_metas = (("c2", Continuous), ("d4", Discrete))
     self.args = (
         self.domain, kind_attributes, kind_class_vars, kind_metas)

     # Discrete variables described by the list of their values.
     value_attributes = (
         ("c1", Continuous), ("d1", list("abc")), ("d2", list("def")))
     value_class_vars = (("d3", list("ghi")), )
     value_metas = (("c2", Continuous), ("d4", list("jkl")))
     self.args_match_all = (
         self.domain, value_attributes, value_class_vars, value_metas)

     handler = self.handler = PerfectDomainContextHandler()
     # Replace the defaults reader with a no-op.
     handler.read_defaults = lambda: None
     handler.bind(SimpleWidget)
     self.widget = SimpleWidget()
     handler.initialize(self.widget)
Example #2
0
 def setUp(self):
     """Set up a domain, two expected argument tuples, a handler and a widget."""
     domain_attributes = [
         ContinuousVariable('c1'),
         DiscreteVariable('d1', values='abc'),
         DiscreteVariable('d2', values='def'),
     ]
     domain_class_vars = [DiscreteVariable('d3', values='ghi')]
     domain_metas = [
         ContinuousVariable('c2'),
         DiscreteVariable('d4', values='jkl'),
     ]
     self.domain = Domain(attributes=domain_attributes,
                          class_vars=domain_class_vars,
                          metas=domain_metas)

     # (name, kind) pairs for attributes, class variables and metas.
     self.args = (
         self.domain,
         (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
         (('d3', Discrete), ),
         (('c2', Continuous), ('d4', Discrete)),
     )
     # Same layout, but discrete variables carry their value lists.
     self.args_match_all = (
         self.domain,
         (('c1', Continuous), ('d1', list('abc')), ('d2', list('def'))),
         (('d3', list('ghi')), ),
         (('c2', Continuous), ('d4', list('jkl'))),
     )

     context_handler = PerfectDomainContextHandler()
     self.handler = context_handler
     # Reading defaults is stubbed out with a function returning None.
     context_handler.read_defaults = lambda: None
     context_handler.bind(SimpleWidget)
     self.widget = SimpleWidget()
     context_handler.initialize(self.widget)
 def setUp(self):
     """Build the fixture: domain, expected tuples, bound handler, widget."""
     self.domain = Domain(
         attributes=[
             ContinuousVariable('c1'),
             DiscreteVariable('d1', values='abc'),
             DiscreteVariable('d2', values='def'),
         ],
         class_vars=[DiscreteVariable('d3', values='ghi')],
         metas=[
             ContinuousVariable('c2'),
             DiscreteVariable('d4', values='jkl'),
         ],
     )

     # Name/kind pairs grouped as attributes, class variables, metas.
     by_kind = (
         (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
         (('d3', Discrete),),
         (('c2', Continuous), ('d4', Discrete)),
     )
     self.args = (self.domain,) + by_kind

     # The same groups with discrete variables given as lists of values.
     by_values = (
         (('c1', Continuous), ('d1', list('abc')), ('d2', list('def'))),
         (('d3', list('ghi')),),
         (('c2', Continuous), ('d4', list('jkl'))),
     )
     self.args_match_all = (self.domain,) + by_values

     self.handler = PerfectDomainContextHandler()
     # Defaults are not read in this fixture: the reader is a no-op.
     setattr(self.handler, 'read_defaults', lambda: None)
     self.handler.bind(SimpleWidget)
     widget_under_test = SimpleWidget()
     self.widget = widget_under_test
     self.handler.initialize(widget_under_test)
Example #4
0
class OWLoadModel(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a PMML or PFA model file and sends it to the output.

    A file is picked from the recent-paths combo box or through a file
    dialog, read by ``load_data`` and emitted on the "Scoring Model" output
    when ``send_data`` is triggered by the "Send" button.
    """
    name = "Load PMML/PFA Model"
    id = "orange.widgets.scoring.model"
    description = "Load model from an input PMML file ( *.pmml, *.xml) " \
                  "or from an input PFA file ( *.pfa, *.json, *.yml, *.yaml) " \
                  "and send the model to the output."
    icon = "icons/model.svg"
    priority = 1
    category = "Scoring"
    keywords = ["pmml", "pfa", "load", "read", "open"]

    class Outputs:
        data = Output("Scoring Model",
                      ScoringModel,
                      doc="PMML/PFA Model read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("location", os.getcwd())]
    # Files larger than this (in bytes) are not loaded automatically on start.
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # pylint seems to want declarations separated from definitions
    recent_paths: List[RecentPath]

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([])
    source = Setting(LOCAL_FILE)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        unknown = widget.Msg("Read error:\n{}")

    class NoFileSelected:
        """Sentinel returned by ``_get_reader`` when there is nothing to read."""

    def __init__(self):
        """Build the file selection row, the info box and the Send button."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None,
                                self,
                                "source",
                                box=True,
                                addSpace=True,
                                callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        box = gui.vBox(self.controlArea, "Info")
        self.infolabel = gui.widgetLabel(box, 'No model loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.hBox(self.controlArea)
        gui.rubber(box)

        self.apply_button = gui.button(box,
                                       self,
                                       "Send",
                                       callback=self.send_data)
        # Enabled only after a model has been read successfully.
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            # Skip the automatic load for files above SIZE_LIMIT; the user
            # is told to press Reload instead.
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        # Schedule the initial load with a zero-delay timer rather than
        # calling it directly from the constructor.
        QTimer.singleShot(0, self.load_data)

    @staticmethod
    def sizeHint():
        """Return the preferred widget size."""
        return QSize(600, 30)

    def select_file(self, n):
        """Select the n-th recent path, then reload and refresh the combo."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def browse_file(self):
        """Ask the user for a PMML/PFA file and load it.

        Does nothing when the dialog returns no file name.
        """
        start_file = self.last_path() or os.path.expanduser("~/")

        readers = [
            PMMLFormat,
            PFAFormat,
        ]
        # The third returned value (the chosen filter) is not used here.
        filename, file_format, _ = open_filename_dialog(
            start_file, None, readers)
        if not filename:
            return
        self.add_path(filename)
        if file_format is not None:
            # Remember the explicitly chosen format with the newest path.
            self.recent_paths[0].file_format = file_format.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Reload the model from the current path and update the widget.

        On failure the returned error is raised on the widget, the stored
        model is dropped and ``None`` is sent to the output.
        """
        self.closeContext()
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if error:
            error()
            self.data = None
            self.Outputs.data.send(None)
            self.infolabel.setText("No model.")

    def _try_load(self):
        """Try to read the model.

        Returns:
            ``None`` on success (or when no file is selected), otherwise a
            callable that shows the corresponding error message.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found
        try:
            self.reader = self._get_reader()
        except Exception:
            return self.Error.missing_reader
        # Explicit check instead of an assert: asserts are removed when
        # Python runs with -O, which would turn this into a read error.
        if self.reader is None:
            return self.Error.missing_reader

        if self.reader is self.NoFileSelected:
            self.Outputs.data.send(None)
            return None

        with catch_warnings(record=True) as caught:
            # We need to catch any exception type since anything can happen
            # in file readers
            try:
                model = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                return lambda x=ex: self.Error.unknown(str(x))
            if caught:
                # Only the most recent warning is reported.
                self.Warning.load_warning(caught[-1].message.args[0])

        self.infolabel.setText(self._describe(model))

        self.loaded_file = self.last_path()
        self.data = model
        self.apply_button.setEnabled(True)
        return None

    def _get_reader(self):
        """Return a reader for the current path, or ``NoFileSelected``.

        ``NoFileSelected`` is returned when the source is not a local file,
        when there is no last path, or when no format is stored for the path
        and its extension is in neither ``PMMLFormat.EXTENSIONS`` nor
        ``PFAFormat.EXTENSIONS``.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if path is None:
                return self.NoFileSelected
            if self.recent_paths and self.recent_paths[0].file_format:
                # A format stored with the path takes precedence.
                qname = self.recent_paths[0].file_format
                reader_class = class_from_qualified_name(qname)
                reader = reader_class.get_reader(path)
            else:
                _, ext = os.path.splitext(path)
                reader = self.NoFileSelected
                if ext in PMMLFormat.EXTENSIONS:
                    reader = PMMLFormat.get_reader(path)
                # Checked independently: PFA wins if both formats match.
                if ext in PFAFormat.EXTENSIONS:
                    reader = PFAFormat.get_reader(path)
            return reader
        return self.NoFileSelected

    @staticmethod
    def _format_fields(fields):
        """Format (name, data type) pairs as an indented HTML list suffix."""
        prefix = ":<br/>&nbsp;&nbsp;&nbsp;&nbsp;"
        if len(fields) > 0:
            return prefix + ", ".join(
                [name + " (" + dataType + ")" for name, dataType in fields])
        return prefix + "None"

    @staticmethod
    def _describe(modelFormat):
        """Return an HTML summary of the model's method and fields.

        The method is listed for models of type "PFA" and the target fields
        for models of type "PMML"; input and output fields are always listed.
        """
        text = ""
        if modelFormat.type == "PFA":
            text += "Method:<br/>&nbsp;&nbsp;&nbsp;&nbsp;" + modelFormat.method + "<br/>"

        text += "Input fields(s)" + \
            OWLoadModel._format_fields(modelFormat.inputFields)
        text += "<br/>Output fields(s)" + \
            OWLoadModel._format_fields(modelFormat.outputFields)

        if modelFormat.type == "PMML":
            text += "<br/>Target fields(s)" + \
                OWLoadModel._format_fields(modelFormat.targetFields)
        return text

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_data(self):
        """Send the loaded model to the output and disable the Send button."""
        self.Outputs.data.send(self.data)
        self.apply_button.setEnabled(False)
Example #5
0
class OWTableToRelation(OWWidget):
    """Widget that converts an input data table into a relation matrix.

    The table's ``X`` becomes the relation matrix (optionally transposed);
    row and column object types and names are taken from the controls.
    """
    name = "Table to Relation"
    description = "Convert data table to relation matrix. Label matrix axis."
    priority = 50000
    icon = "icons/TableToRelation.svg"

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        relation = Output("Relation", Relation)

    settingsHandler = PerfectDomainContextHandler()

    data = None

    relation_name = ContextSetting("")
    transpose = ContextSetting(False)

    row_type = ContextSetting("")
    # Index into the "Object Names" combo; 0 is the '(None)' entry.
    selected_meta = ContextSetting(0)
    row_names = None

    col_type = ContextSetting("")
    col_names = None

    auto_commit = Setting(True)

    def __init__(self):
        """Create the controls and the preview area."""
        super().__init__()

        self.model = None
        self.view = None
        self.row_names_combo = None
        self.icons = gui.attributeIconDict
        self.populate_control_area()
        self.populate_main_area()

    def populate_control_area(self):
        """Add the Relation, Column and Row boxes and the commit controls."""
        rel = gui.widgetBox(self.controlArea, "Relation")
        gui.lineEdit(rel,
                     self,
                     "relation_name",
                     "Name",
                     callbackOnType=True,
                     callback=self.apply)
        gui.checkBox(rel, self, "transpose", "Transpose", callback=self.apply)

        col = gui.widgetBox(self.controlArea, "Column")
        gui.lineEdit(col,
                     self,
                     "col_type",
                     "Object Type",
                     callbackOnType=True,
                     callback=self.apply)

        row = gui.widgetBox(self.controlArea, "Row")
        gui.lineEdit(row,
                     self,
                     "row_type",
                     "Object Type",
                     callbackOnType=True,
                     callback=self.apply)
        self.row_names_combo = gui.comboBox(row,
                                            self,
                                            "selected_meta",
                                            label="Object Names",
                                            callback=self.update_row_names)

        gui.rubber(self.controlArea)
        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Send",
                        checkbox_label='Auto-send',
                        orientation='vertical')

    def populate_main_area(self):
        """Lay out the column-type label, row-type label and the table view."""
        grid = QWidget()
        grid.setLayout(QGridLayout(grid))
        self.mainArea.layout().addWidget(grid)

        col_type = gui.label(None, self, '%(col_type)s')

        grid.layout().addWidget(col_type, 0, 1)
        grid.layout().setAlignment(col_type, Qt.AlignHCenter)

        row_type = gui.label(None, self, '%(row_type)s')
        grid.layout().addWidget(row_type, 1, 0)
        grid.layout().setAlignment(row_type, Qt.AlignVCenter)

        self.view = QTableView()
        self.model = None
        grid.layout().addWidget(self.view, 1, 1)

    def sizeHint(self):
        """Return the preferred widget size."""
        return QSize(800, 500)

    @Inputs.data
    def set_data(self, data):
        """Handle new input data: reset state, refresh preview and commit."""
        self.closeContext()
        self.data = data
        if data is not None:
            # Defaults are set first; opening the context follows.
            self.init_attr_values(data.domain.metas)
            self.openContext(self.data)
            self.col_names = [str(a.name) for a in data.domain.attributes]
            if hasattr(data, 'col_type'):
                self.col_type = data.col_type
        else:
            self.init_attr_values(())
        self.update_preview()
        self.update_row_names()
        self.unconditional_commit()

    def init_attr_values(self, candidates):
        """Reset column/row settings and refill the row-names combo.

        Args:
            candidates: variables offered as sources of row names. When
                non-empty, the first one is preselected and its name is
                used as the row type.
        """
        self.col_type = ""
        self.col_names = None

        if candidates:
            self.row_type = candidates[0].name
            self.selected_meta = 1
        else:
            self.row_type = ""
            self.selected_meta = 0
            self.row_names = None

        self.row_names_combo.clear()
        self.row_names_combo.addItem('(None)')
        for var in candidates:
            self.row_names_combo.addItem(self.icons[var], var.name)
        self.row_names_combo.setCurrentIndex(self.selected_meta)

    def update_row_names(self):
        """Recompute row names from the selected combo entry and commit."""
        if self.selected_meta:
            # The 1-based combo index is negated to address the column
            # (presumably negative indices select meta columns - TODO
            # confirm against the Table indexing rules).
            self.row_names = list(
                self.data[:, -self.selected_meta].metas.flatten())
        else:
            self.row_names = None

        if self.model:
            self.model.headerDataChanged.emit(Qt.Vertical, 0,
                                              self.model.rowCount() - 1)
        self.commit()

    def update_preview(self):
        """Show the attributes of the current data in the table view."""
        this = self

        class MyTableModel(TableModel):
            """Table model whose vertical header shows the widget's row names."""

            def headerData(self, section, orientation, role):
                if orientation == Qt.Vertical and role == Qt.DisplayRole:
                    if this.row_names:
                        return this.row_names[section]
                    # No row names: the vertical header has no text.
                    return None
                return super().headerData(section, orientation, role)

        if self.data:
            # The preview contains the attribute columns only.
            domain = Domain(self.data.domain.attributes)
            preview_data = Table(domain, self.data)
            self.model = MyTableModel(preview_data)
        else:
            self.model = None
        self.view.setModel(self.model)

    def apply(self):
        """Callback for the controls; commits the current settings."""
        self.commit()

    def commit(self):
        """Build the relation from the current data and send it.

        Sends ``None`` when there is no data, so that a previously sent
        relation does not stay on the output.
        """
        if not self.data:
            self.Outputs.relation.send(None)
            return

        domain = self.data.domain
        metadata_cols = list(domain.class_vars) + list(domain.metas)
        # One dict per data row, mapping each class/meta variable to its
        # value in that row.
        metadata = [{
            var: var.to_val(value)
            for var, value in zip(metadata_cols, values.list)
        } for values in self.data[:, metadata_cols]]

        if self.transpose:
            # Rows and columns swap roles; the per-row metadata becomes
            # column metadata.
            relation = fusion.Relation(
                self.data.X.T,
                name=self.relation_name,
                row_type=fusion.ObjectType(self.col_type or 'Unknown'),
                row_names=self.col_names,
                col_type=fusion.ObjectType(self.row_type or 'Unknown'),
                col_names=self.row_names,
                col_metadata=metadata)
        else:
            relation = fusion.Relation(
                self.data.X,
                name=self.relation_name,
                row_type=fusion.ObjectType(self.row_type or 'Unknown'),
                row_names=self.row_names,
                row_metadata=metadata,
                col_type=fusion.ObjectType(self.col_type or 'Unknown'),
                col_names=self.col_names,
            )
        self.Outputs.relation.send(Relation(relation))
class OWBatchNorm(OWWidget):
    """Widget that applies batch effect normalization to the input table.

    The user picks a link method and the batch variables (class variables
    and metas of the input); the normalized table is sent to the output.
    """
    name = "Batch Effect Removal"
    description = "Batch effect normalization on Single Cell data set."
    icon = "icons/BatchEffectRemoval.svg"
    priority = 230

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        data = Output("Data", Table)

    class Error(OWWidget.Error):
        # The placeholder is filled with the exception text in commit().
        general_error = Msg("{}")
        discrete_attributes = Msg("Data with discrete attributes "
                                  "can not be processed.")

    class Warning(OWWidget.Warning):
        missing_values = Msg("Missing values have been replaced with 0.")
        negative_values = Msg("Unable to use current settings due "
                              "to negative values in data.")

    resizing_enabled = False
    want_main_area = False

    settingsHandler = PerfectDomainContextHandler()
    batch_vars = ContextSetting([])
    link_method = Setting(LinkMethod.IDENTITY_LINK)
    skip_zeros = Setting(False)
    auto_commit = Setting(True)

    def __init__(self, parent=None):
        """Build the info label, method controls and the variable view."""
        super().__init__(parent)
        self.data = None

        # Info
        infobox = gui.widgetBox(self.controlArea, "Info")
        self.info_label = gui.widgetLabel(infobox, "No data on input.")

        # Link method
        method_box = gui.widgetBox(self.controlArea, "Method")
        gui.comboBox(method_box,
                     self,
                     "link_method",
                     items=LinkMethod.items(),
                     callback=self.__link_method_changed)
        gui.separator(method_box)
        # NOTE(review): the lambda looks up self.commit at call time;
        # presumably this is needed because gui.auto_commit below rebinds
        # commit - confirm before simplifying to callback=self.commit.
        self.skip_zeros_check = gui.checkBox(
            method_box,
            self,
            "skip_zeros",
            "Skip zero expressions",
            enabled=self.link_method != LinkMethod.LOG_LINK,
            callback=lambda: self.commit())

        # Batch Variable Selection
        header_schema = (("selected", ""), ("variable", "Variable"),
                         ("count", "#"), ("score", "Score"))
        header_labels = [label for _, label in header_schema]
        header = namedtuple("header", [tag for tag, _ in header_schema])
        # Maps each column tag to its column index (0, 1, 2, 3).
        self.Header = header(*range(len(header_labels)))

        batch_box = gui.widgetBox(self.controlArea, "Batch Variable Selection")
        self.view = QTreeView()
        self.model = QStandardItemModel()
        self.model.itemChanged.connect(self.__selected_batch_vars_changed)
        self.model.setHorizontalHeaderLabels(header_labels)
        batch_box.layout().addWidget(self.view)
        self._setup_view()

        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
                        "Apply Automatically")

    def __link_method_changed(self):
        """Update the skip-zeros check box for the new method and commit."""
        enable = self.link_method != LinkMethod.LOG_LINK
        self.skip_zeros_check.setEnabled(enable)
        if not enable:
            # With the log link the option is disabled and forced on.
            self.skip_zeros_check.setChecked(True)
        self.commit()

    def __selected_batch_vars_changed(self, item):
        """Add or remove the item's variable from ``batch_vars`` and commit.

        Membership is checked first so that a repeated notification can
        neither add a duplicate nor raise ``ValueError`` on removal.
        """
        var = item.data(VariableRole)
        if item.checkState():
            if var not in self.batch_vars:
                self.batch_vars.append(var)
        elif var in self.batch_vars:
            self.batch_vars.remove(var)
        self.commit()

    def _setup_view(self):
        """Attach the model to the tree view and configure its columns."""
        self.view.setModel(self.model)
        self.view.setSelectionMode(QTreeView.NoSelection)
        self.view.setSortingEnabled(True)
        self.view.setRootIsDecorated(False)
        self.view.setItemDelegateForColumn(self.Header.count,
                                           IntegralDelegate(self))
        self.view.setItemDelegateForColumn(self.Header.score,
                                           RealDelegate(self))
        self.view.header().setSectionResizeMode(QHeaderView.ResizeToContents)
        self.view.header().setStretchLastSection(False)
        # Only the variable-name column stretches.
        self.view.header().setSectionResizeMode(self.Header.variable,
                                                QHeaderView.Stretch)
        self.view.setFocus()

    @Inputs.data
    def set_data(self, data):
        """Handle new input data: validate it, rebuild the model, commit."""
        self.closeContext()
        self.clear()
        self.data = data
        self._setup_info_label()
        self._check_data()
        self.openContext(data)
        if self.data is not None:
            # Re-resolve the stored variables by name in the new domain.
            self.batch_vars = [data.domain[v.name] for v in self.batch_vars]
            self._setup_model()
        self.commit()

    def clear(self):
        """Forget the selected batch variables and empty the model."""
        self.batch_vars = []
        if self.model:
            n_rows = self.model.rowCount()
            self.model.removeRows(0, n_rows)

    def _setup_info_label(self):
        """Show the number of rows, attributes and metas of the input."""
        text = "No data on input."
        if self.data is not None:
            domain, attrs = self.data.domain, self.data.domain.attributes
            text = "{} cells, {} genes\n".format(len(self.data), len(attrs))
            text += "{} meta features".format(len(domain.metas)) \
                if len(domain.metas) else "(no meta features)"
        self.info_label.setText(text)

    def _check_data(self):
        """Reject data with discrete attributes and replace NaNs.

        Data with discrete attributes is dropped with an error; missing
        values in ``X`` are replaced through ``np.nan_to_num`` with a warning.
        """
        self.clear_messages()
        if self.data and self.data.domain.has_discrete_attributes():
            self.data = None
            self.Error.discrete_attributes()
        if self.data and np.isnan(self.data.X).any():
            # NOTE(review): this assigns to X of the table received on the
            # input; confirm whether the table should be copied first.
            self.data.X = np.nan_to_num(self.data.X)
            self.Warning.missing_values()

    def _setup_model(self):
        """Add one model row per primitive class variable or meta."""
        estimator = ScBatchScorer()
        for var in self.data.domain.class_vars + self.data.domain.metas:
            if not var.is_primitive():
                continue
            # Scoring is best-effort: any failure is shown as NaN.
            try:
                score = float(estimator.score_data(self.data, var))
            except Exception:
                score = np.nan
            self.model.appendRow([
                self.__selected_item(var),
                self.__variable_item(var),
                self.__count_item(var),
                self.__score_item(score)
            ])

    def __selected_item(self, var):
        """Return a checkable item, checked when ``var`` is a batch variable."""
        item = QStandardItem()
        item.setData(var, VariableRole)
        item.setCheckable(True)
        select = var in self.batch_vars
        item.setCheckState(Qt.Checked if select else Qt.Unchecked)
        item.setEditable(False)
        return item

    def __variable_item(self, var):
        """Return an item showing the variable's name and icon."""
        item = QStandardItem()
        item.setData(var.name, Qt.DisplayRole)
        item.setData(gui.attributeIconDict[var], Qt.DecorationRole)
        item.setEditable(False)
        return item

    def __count_item(self, var):
        """Return an item with the number of values of a discrete variable.

        The item is left without display data for other variables.
        """
        item = QStandardItem()
        if var.is_discrete:
            item.setData(len(var.values), Qt.DisplayRole)
        item.setEditable(False)
        return item

    def __score_item(self, score):
        """Return an item displaying ``score``."""
        item = QStandardItem()
        item.setData(score, Qt.DisplayRole)
        item.setEditable(False)
        return item

    def commit(self):
        """Normalize the data with the current settings and send the result.

        When the data has negative values and skip-zeros is enabled, a
        warning is shown and the data is sent unchanged. If normalization
        raises, the error is shown and ``None`` is sent.
        """
        data = None
        self.Error.general_error.clear()
        self.Warning.negative_values.clear()
        if self.data is not None:
            if (self.data.X < 0).any() and self.skip_zeros:
                self.Warning.negative_values()
                data = self.data
            else:
                try:
                    data = SCBatchNormalizer(
                        LinkMethod.items()[self.link_method], self.skip_zeros,
                        self.batch_vars)(self.data)
                except Exception as e:
                    self.Error.general_error(str(e))
                    data = None
        self.Outputs.data.send(data)

    def send_report(self):
        """Report the chosen method and the selected batch variables."""
        method = LinkMethod.items()[self.link_method]
        if self.skip_zeros:
            method += " (Skip zero expressions)"
        variables = ", ".join([v.name for v in self.batch_vars]) \
            if self.batch_vars else "None"
        self.report_items("", [("Method", method),
                               ("Batch variable selection", variables)])
Example #7
0
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Load a data file, let the user edit its domain and output the table.

    The control area holds a file chooser (recent-files combo, browse and
    reload buttons), an info label and a ``DomainEditor``. The edited table
    is sent on ``Outputs.data`` whenever data is (re)loaded or the apply
    button is pressed.
    """

    name = "领域编辑器2"
    icon = "icons/gear.svg"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."

    priority = 10
    category = "Data"
    keywords = ["file", "load", "read", "open"]

    class Outputs:
        data = Output("领域背景", Table, doc="专业领域背景的介绍")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    domain_editor = SettingProvider(DomainEditor)

    # Warning messages shown by the widget.
    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    # Error messages shown by the widget.
    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        sheet_error = widget.Msg("Error listing available sheets.")
        unknown = widget.Msg("Read error:\n{}")

    def __init__(self):
        """Build the GUI and schedule the initial load of the last file."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        # The file-selection row is laid out on a grid.
        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=20, orientation=layout)
        vbox = gui.radioButtons(None,
                                self,
                                "source",
                                box=True,
                                addSpace=True,
                                callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)  # row 0, column 0

        box = gui.hBox(None, addToLayout=False, margin=0)  # horizontal box
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        # file_combo is the recent-files drop-down.
        # NOTE(review): presumably created by RecentPathsWComboMixin; confirm.
        self.file_combo.setSizePolicy(Policy.MinimumExpanding,
                                      Policy.Fixed)
        # Choosing an entry in the combo triggers select_file.
        self.file_combo.activated[int].connect(
            self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)  # row 0, column 1

        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Info box with the description of the loaded data.
        box = gui.widgetBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, '请设置领域特征')

        # Box holding the domain table; the table's behavior is defined
        # inside DomainEditor.
        box = gui.widgetBox(self.controlArea, "双击进行编辑")
        self.domain_editor = DomainEditor(self)
        # The editor's model is used to toggle the apply button below.
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        # Apply button.
        box = gui.hBox(self.controlArea)
        self.apply_button = gui.button(box,
                                       self,
                                       "应用",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)

        # Enable the apply button as soon as the domain model is edited.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()  # populate the recent-files combo
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)  # accept drag-and-drop of files

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                # Large files are not loaded automatically; the user has to
                # press Reload.
                self.Warning.file_too_big()
                return

        # Defer the initial load with a zero-delay single-shot timer instead
        # of calling load_data from within __init__.
        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(600, 550)

    def select_file(self, n):
        """Make the ``n``-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        When ``in_demos`` is true the dialog starts in the sample datasets
        directory; otherwise it starts at the last used path or the home
        directory.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation datasets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        # Offer only formats that can actually be read and have extensions.
        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]
        filename, reader, _ = open_filename_dialog(start_file, None, readers)
        if not filename:
            return
        self.add_path(filename)
        if reader is not None:
            # Remember the explicitly chosen reader for this path.
            self.recent_paths[0].file_format = reader.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    def load_data(self):
        """Open the current file, create data from it and send it out.

        Loading is delegated to ``_try_load``. When that returns an error
        callable, the error is shown, the data is cleared and ``None`` is
        sent on the output.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)  # nothing to apply yet
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if error:
            error()
            self.data = None
            self.Outputs.data.send(None)
            self.info.setText("无数据.")

    def _try_load(self):
        """Read the current source into ``self.data`` and send it.

        Returns
        -------
        callable or None
            A callable that shows the matching error message when loading
            failed, ``None`` on success.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found

        try:
            self.reader = self._get_reader()
        except Exception:
            return self.Error.missing_reader
        # Explicit check instead of an assert, which is stripped under -O.
        if self.reader is None:
            return self.Error.missing_reader

        try:
            self._update_sheet_combo()
        except Exception:
            return self.Error.sheet_error

        with catch_warnings(record=True) as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                # Only the most recent warning is shown.
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.info.setText(self._describe(data))

        self.loaded_file = self.last_path()

        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)

        self.apply_domain_edit()  # sends data
        return None

    def _get_reader(self):
        """Return a reader for the currently selected source.

        Returns
        -------
        FileFormat or None
            ``None`` when the source is a URL and no URL is entered.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if self.recent_paths and self.recent_paths[0].file_format:
                # A reader class was stored for this path; use it.
                qname = self.recent_paths[0].file_format
                reader_class = class_from_qualified_name(qname)
                reader = reader_class(path)
            else:
                # Return reader instance that can be used to read the file
                reader = FileFormat.get_reader(path)
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)

            return reader
        elif self.source == self.URL:
            # NOTE(review): url_combo is not created in this class's
            # __init__; confirm it exists before relying on the URL source.
            url = self.url_combo.currentText().strip()
            if url:
                return UrlReader(url)
        return None

    def _update_sheet_combo(self):
        """Synchronise the sheet selection with the sheets of the reader.

        With fewer than two sheets the reader's sheet selection is reset.
        The sheet combo is only touched when the widget has one.
        """
        if len(self.reader.sheets) < 2:
            self.reader.select_sheet(None)
            return

        # This class's __init__ does not create sheet_combo, so only update
        # it when some other code has provided one.
        sheet_combo = getattr(self, "sheet_combo", None)
        if sheet_combo is not None:
            sheet_combo.clear()
            sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()

    def _select_active_sheet(self):
        """Show the reader's sheet in the combo, or reset an unknown sheet."""
        sheet_combo = getattr(self, "sheet_combo", None)
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
            else:
                if sheet_combo is not None:
                    sheet_combo.setCurrentIndex(idx)
        elif sheet_combo is not None:
            sheet_combo.setCurrentIndex(0)

    def _describe(self, table):
        """Return the HTML text describing ``table`` for the info label."""
        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        if len(descs) == 2:
            # With both present, the name is set in bold above the description.
            descs[0] = "<b>{}</b>".format(descs[0])
        if descs:
            text += "<p>{}</p>".format("<br/>".join(descs))

        text += "<p>{} 个实例数据(s), {} 个输入特征(s), {} 个元特征(s)".\
            format(len(table), len(domain.attributes), len(domain.metas))
        if domain.has_continuous_class:
            text += "<br/>回归模型 ."
        elif domain.has_discrete_class:
            text += "<br/>分类模型; 共分为 {} 类.".\
                format(len(domain.class_var.values))
        elif table.domain.class_vars:
            text += "<br/>多目标模型; {} 个目标".format(len(table.domain.class_vars))
        else:
            text += "<br/>无目标值."
        text += "</p>"

        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            text += '<p>First entry: {}<br/>Last entry: {}</p>'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp'])
        return text

    def storeSpecificSettings(self):
        """Store a copy of the edited variables in the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if any."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Build a table from the edited domain and send it on the output.

        Sends ``None`` when there is no data or when the edited domain has
        neither variables nor metas.
        """
        if self.data is None:
            table = None
        else:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if not (domain.variables or domain.metas):
                table = None
            else:
                # The columns are unpacked in the order Table.from_numpy
                # takes them after the domain: X, Y, metas.
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                # Carry over name, row ids and attributes of the source data.
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})
        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the data source, its format and the data to the report."""
        def get_ext_name(filename):
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~" + os.path.sep + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            # This class's __init__ does not create sheet_combo, so guard it.
            sheet_combo = getattr(self, "sheet_combo", None)
            if sheet_combo is not None and sheet_combo.isVisible():
                name += " ({})".format(sheet_combo.currentText())
            # Look the format up from the real path: the display name may
            # carry a " (sheet)" suffix that would break the extension.
            self.report_items("File", [("File name", name),
                                       ("Format",
                                        get_ext_name(self.loaded_file))])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                # No reader for this file: leave the event unaccepted.
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It make sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)
Example #8
0
class OWConcordance(OWWidget):
    """Show the contexts in which a query word occurs in a corpus.

    The query word comes either from the line edit or from the
    "Query Word" input. Selected rows determine the documents sent on
    ``selected_documents``; all concordances go to ``concordances``.
    """

    name = "Concordance"
    description = "Display the context of the word."
    icon = "icons/Concordance.svg"
    priority = 520

    class Inputs:
        corpus = Input("Corpus", Corpus)
        query_word = Input("Query Word", Topic)

    class Outputs:
        selected_documents = Output("Selected Documents", Corpus)
        concordances = Output("Concordances", Corpus)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)
    autocommit = Setting(True)
    context_width = Setting(5)
    word = ContextSetting("", exclude_metas=False)
    selected_rows = Setting([], schema_only=True)

    class Warning(OWWidget.Warning):
        multiple_words_on_input = Msg("Multiple query words on input. "
                                      "Only the first one is considered!")

    def __init__(self):
        """Build the info box, the width spin box, the query line edit and
        the concordance table."""
        super().__init__()

        self.corpus = None  # Corpus
        self.n_matching = ''  # Info on docs matching the word
        self.n_tokens = ''  # Info on tokens
        self.n_types = ''  # Info on types (unique tokens)
        self.is_word_on_input = False

        # Info attributes
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Tokens: %(n_tokens)s')
        gui.label(info_box, self, 'Types: %(n_types)s')
        gui.label(info_box, self, 'Matching: %(n_matching)s')

        # Width parameter
        gui.spin(self.controlArea,
                 self,
                 'context_width',
                 3,
                 10,
                 box=True,
                 label="Number of words:",
                 callback=self.set_width)

        gui.rubber(self.controlArea)

        # Search
        c_box = gui.widgetBox(self.mainArea, orientation="vertical")
        self.input = gui.lineEdit(c_box,
                                  self,
                                  'word',
                                  orientation=Qt.Horizontal,
                                  sizePolicy=QSizePolicy(
                                      QSizePolicy.MinimumExpanding,
                                      QSizePolicy.Fixed),
                                  label='Query:',
                                  callback=self.set_word,
                                  callbackOnType=True)
        self.input.setFocus()

        # Concordances view
        self.conc_view = QTableView()
        self.model = ConcordanceModel()
        self.conc_view.setModel(self.model)
        self.conc_view.setWordWrap(False)
        self.conc_view.setSelectionBehavior(QTableView.SelectRows)
        self.conc_view.setSelectionModel(DocumentSelectionModel(self.model))
        self.conc_view.setItemDelegate(HorizontalGridDelegate())
        self.conc_view.selectionModel().selectionChanged.connect(
            self.selection_changed)
        self.conc_view.horizontalHeader().hide()
        self.conc_view.setShowGrid(False)
        self.mainArea.layout().addWidget(self.conc_view)
        self.set_width()

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit', 'Commit',
                        'Auto commit is on')

    def sizeHint(self):  # pragma: no cover
        """Return the preferred initial size of the widget."""
        return QSize(600, 400)

    def set_width(self):
        """Apply ``context_width`` to the model, keeping the selection."""
        # Capture the selection before the model is updated and re-apply it
        # afterwards.
        sel = self.conc_view.selectionModel().selection()
        self.model.set_width(self.context_width)
        if sel:
            self.conc_view.selectionModel().select(
                sel,
                QItemSelectionModel.SelectCurrent | QItemSelectionModel.Rows)

    def selection_changed(self):
        """Store the selected row numbers (sorted, unique) and commit."""
        selection = self.conc_view.selectionModel().selection()
        self.selected_rows = sorted(
            {cell.row() for cell in selection.indexes()})
        self.commit()

    def set_selection(self, selection):
        """Select the given row numbers in the concordance view."""
        if selection:
            sel = QItemSelection()
            for row in selection:
                index = self.conc_view.model().index(row, 0)
                sel.select(index, index)
            self.conc_view.selectionModel().select(
                sel,
                QItemSelectionModel.SelectCurrent | QItemSelectionModel.Rows)

    @Inputs.corpus
    def set_corpus(self, data=None):
        """Handle a new corpus on the input."""
        self.closeContext()
        self.corpus = data
        if data is None:  # data removed, clear selection
            self.selected_rows = []

        if not self.is_word_on_input:
            # The word is a context setting: reset it, then let the context
            # for the new corpus be opened.
            self.word = ""
            self.openContext(self.corpus)

        self.model.set_corpus(self.corpus)
        self.set_word()

    @Inputs.query_word
    def set_word_from_input(self, topic):
        """Handle a new query word on the input.

        While a word is supplied on the input the line edit is disabled.
        If ``topic`` holds several rows, a warning is shown and only the
        first one is used.
        """
        self.Warning.multiple_words_on_input.clear()
        if self.is_word_on_input:  # word changed, clear selection
            self.selected_rows = []
        self.is_word_on_input = topic is not None and len(topic) > 0
        self.input.setEnabled(not self.is_word_on_input)
        if self.is_word_on_input:
            if len(topic) > 1:
                self.Warning.multiple_words_on_input()
            self.word = topic.metas[0, 0]
            self.set_word()

    def set_word(self):
        """Pass the current word to the model, clear the selection, refresh
        the view and commit."""
        self.selected_rows = []
        self.model.set_word(self.word)
        self.update_widget()
        self.commit()

    def handleNewSignals(self):
        """Re-apply the stored row selection to the view."""
        self.set_selection(self.selected_rows)

    def resize_columns(self):
        """Split the width left beside column 1 evenly between columns 0
        and 2."""
        # Floor division keeps the width an int: setColumnWidth takes an int
        # and PyQt on Python >= 3.10 rejects a float argument.
        col_width = (self.conc_view.width() -
                     self.conc_view.columnWidth(1)) // 2 - 12
        self.conc_view.setColumnWidth(0, col_width)
        self.conc_view.setColumnWidth(2, col_width)

    def resizeEvent(self, event):  # pragma: no cover
        """Keep the column layout in sync with the widget size."""
        super().resizeEvent(event)
        self.resize_columns()

    def update_widget(self):
        """Refresh column/row sizes and the info-box attributes."""
        self.conc_view.resizeColumnToContents(1)
        self.resize_columns()
        self.conc_view.resizeRowsToContents()

        if self.corpus is not None:
            self.n_matching = '{}/{}'.format(
                self.model.matching_docs() if self.word else 0,
                len(self.corpus))
            self.n_tokens = self.model.n_tokens
            self.n_types = self.model.n_types
        else:
            self.n_matching = ''
            self.n_tokens = ''
            self.n_types = ''

    def commit(self):
        """Send the selected documents (or ``None``) and the concordances."""
        # Each selected row is mapped to its document through the first
        # element of the model's word_index entry; duplicates are dropped.
        selected_docs = sorted(
            set(self.model.word_index[row][0] for row in self.selected_rows))
        concordance = self.model.get_data()
        if selected_docs:
            selected = self.corpus[selected_docs]
            self.Outputs.selected_documents.send(selected)
        else:
            self.Outputs.selected_documents.send(None)
        self.Outputs.concordances.send(concordance)

    def send_report(self):
        """Report the query, token/type/match counts and the table."""
        view = self.conc_view
        model = self.conc_view.model()
        self.report_items("Concordances", (
            ("Query", model.word),
            ("Tokens", model.n_tokens),
            ("Types", model.n_types),
            ("Matching", self.n_matching),
        ))
        self.report_table(view)
class TestPerfectDomainContextHandler(TestCase):
    """Tests for PerfectDomainContextHandler: context creation, domain
    encoding, matching and setting encoding."""

    def setUp(self):
        """Create a domain, its two expected encodings and a bound handler."""
        self.domain = Domain(
            attributes=[ContinuousVariable('c1'),
                        DiscreteVariable('d1', values='abc'),
                        DiscreteVariable('d2', values='def')],
            class_vars=[DiscreteVariable('d3', values='ghi')],
            metas=[ContinuousVariable('c2'),
                   DiscreteVariable('d4', values='jkl')]
        )
        # Expected encoding by variable type only.
        self.args = (self.domain,
                     (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
                     (('d3', Discrete),),
                     (('c2', Continuous), ('d4', Discrete)))
        # Expected encoding when discrete variables are encoded with
        # their value lists (MATCH_VALUES_ALL).
        self.args_match_all = (self.domain,
                               (('c1', Continuous), ('d1', list('abc')), ('d2', list('def'))),
                               (('d3', list('ghi')),),
                               (('c2', Continuous), ('d4', list('jkl'))))
        self.handler = PerfectDomainContextHandler()
        # Stub out reading of stored defaults for these tests.
        self.handler.read_defaults = lambda: None
        self.handler.bind(SimpleWidget)
        self.widget = SimpleWidget()
        self.handler.initialize(self.widget)

    def test_new_context(self):
        """A new context stores the encoded attributes, class vars and metas."""
        context = self.handler.new_context(*self.args)
        _, attrs, class_vars, metas = self.args

        self.assertEqual(context.attributes, attrs)
        self.assertEqual(context.class_vars, class_vars)
        self.assertEqual(context.metas, metas)

    def test_open_context(self):
        """open_context calls new_context with the encoded domain."""
        context = Context()
        context.attributes = ()
        context.class_vars = ()
        self.handler.new_context = Mock(return_value=context)
        self.handler.open_context(self.widget, self.domain)
        self.handler.new_context.assert_called_with(*self.args)

    def test_encode_domain_simple(self):
        """By default variables are encoded as (name, type) pairs."""
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)))
        self.assertEqual(class_vars, (('d3', Discrete),))
        self.assertEqual(metas, (('c2', Continuous), ('d4', Discrete)))

    def test_encode_domain_match_values(self):
        """With MATCH_VALUES_ALL discrete variables carry their values."""
        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', list('abc')), ('d2', list('def'))))
        self.assertEqual(class_vars, (('d3', list('ghi')),))
        self.assertEqual(metas, (('c2', Continuous), ('d4', list('jkl'))))

    def test_match_simple(self):
        """Only an identical encoding is a perfect match."""
        domain, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        self.assertEqual(self.handler.match(context, *self.args),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(self.handler.match(context,
                                            domain, attrs2, class_vars, metas),
                         self.handler.NO_MATCH)

        # One extra attribute.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(self.handler.match(context,
                                            domain, attrs3, class_vars, metas),
                         self.handler.NO_MATCH)

        # One extra meta.
        metas2 = list(metas)
        metas2.append(attrs[0])
        self.assertEqual(self.handler.match(context,
                                            domain, attrs, class_vars, metas2),
                         self.handler.NO_MATCH)

    def test_match_values(self):
        """Matching with MATCH_VALUES_ALL also needs an identical encoding."""
        domain, attrs, class_vars, metas = self.args_match_all
        context = self._create_context(attrs, class_vars, metas)

        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        self.assertEqual(self.handler.match(context, *self.args_match_all),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(self.handler.match(context,
                                            domain, attrs2, class_vars, metas),
                         self.handler.NO_MATCH)

        # One extra attribute; pass attrs3 so this case is really tested
        # (the swapped list attrs2 was previously passed here again).
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(self.handler.match(context,
                                            domain, attrs3, class_vars, metas),
                         self.handler.NO_MATCH)

    def test_encode_setting(self):
        """encode_setting pairs the value with a code; the expected codes
        are pinned for three different settings of SimpleWidget."""
        _, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)
        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.setting, "d1")
        self.assertEqual(encoded_setting, ("d1", -2))

        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.text, "d1")
        self.assertEqual(encoded_setting, ("d1", -1))

        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.with_metas, "d4")
        self.assertEqual(encoded_setting, ("d4", 1))

    def _create_context(self, attrs, class_vars, metas):
        """Return a Context populated with the given encodings."""
        context = Context()
        context.attributes = attrs
        context.class_vars = class_vars
        context.metas = metas
        return context
Example #10
0
 def test_migrate_removes_invalid_contexts(self):
     """migrate_settings(settings, 2) must leave only the context created
     by PerfectDomainContextHandler in ``context_settings``."""
     invalid = ClassValuesContextHandler().new_context([0, 1, 2])
     # The same empty list is passed for all four arguments.
     shared = []
     valid = PerfectDomainContextHandler().new_context(
         shared, shared, shared, shared)
     settings = dict(context_settings=[invalid, valid])
     self.widget.migrate_settings(settings, 2)
     remaining = settings['context_settings']
     self.assertEqual(remaining, [valid])
Example #11
0
class OWCorpusViewer(OWWidget):
    """Widget that lists the documents of a corpus and displays selected ones.

    The document list can be narrowed with a regular expression; listed and
    unlisted documents are sent to the "Matching Docs" and "Other Docs"
    outputs respectively.
    """
    # Widget metadata
    name = "Corpus Viewer"
    description = "Display corpus contents."
    icon = "icons/CorpusViewer.svg"
    priority = 500

    class Inputs:
        corpus = Input("Corpus", Corpus, replaces=["Data"])

    class Outputs:
        matching_docs = Output("Matching Docs", Corpus, default=True)
        other_docs = Output("Other Docs", Corpus)

    # Context settings are handled by a perfect-domain handler that is
    # configured to match on all values.
    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    search_indices = ContextSetting(
        [], exclude_metas=False)  # features included in search
    display_indices = ContextSetting(
        [], exclude_metas=False)  # features for display
    display_features = ContextSetting([], exclude_metas=False)
    regexp_filter = ContextSetting("")

    # Rows selected in the document list; a plain class attribute, not a
    # stored setting.
    selection = [0]  # TODO: DataHashContextHandler

    show_tokens = Setting(False)
    autocommit = Setting(True)

    class Warning(OWWidget.Warning):
        no_feats_search = Msg('No features included in search.')
        no_feats_display = Msg('No features selected for display.')

    def __init__(self):
        """Initialise widget state and build the GUI.

        The control area gets the info labels, the search/display feature
        list boxes and the auto-commit box; the main area gets the regexp
        filter line edit and a splitter holding the document list and the
        web view that renders document contents.
        """
        super().__init__()

        self.corpus = None  # Corpus
        self.corpus_docs = None  # Documents generated from Corpus
        self.output_mask = []  # Output corpus indices
        self.doc_webview = None  # WebView for showing content
        self.search_features = [
        ]  # two copies are needed since Display allows drag & drop
        self.display_list_indices = [0]

        # Info attributes
        # update_info() sets the attributes the label templates below refer to
        self.update_info()
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Documents: %(n_documents)s')
        gui.label(info_box, self, 'Preprocessed: %(is_preprocessed)s')
        gui.label(info_box, self, '  ◦ Tokens: %(n_tokens)s')
        gui.label(info_box, self, '  ◦ Types: %(n_types)s')
        gui.label(info_box, self, 'POS tagged: %(is_pos_tagged)s')
        gui.label(info_box, self, 'N-grams range: %(ngram_range)s')
        gui.label(info_box, self, 'Matching: %(n_matching)s')

        # Search features
        self.search_listbox = gui.listBox(
            self.controlArea,
            self,
            'search_indices',
            'search_features',
            selectionMode=QListView.ExtendedSelection,
            box='Search features',
            callback=self.search_features_changed)

        # Display features
        display_box = gui.widgetBox(self.controlArea, 'Display features')
        self.display_listbox = gui.listBox(
            display_box,
            self,
            'display_list_indices',
            'display_features',
            selectionMode=QListView.ExtendedSelection,
            callback=self.show_docs,
            enableDragDrop=True)
        self.show_tokens_checkbox = gui.checkBox(display_box,
                                                 self,
                                                 'show_tokens',
                                                 'Show Tokens && Tags',
                                                 callback=self.show_docs)

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data',
                        'Auto send is on')

        # Search
        self.filter_input = gui.lineEdit(self.mainArea,
                                         self,
                                         'regexp_filter',
                                         orientation=Qt.Horizontal,
                                         sizePolicy=QSizePolicy(
                                             QSizePolicy.MinimumExpanding,
                                             QSizePolicy.Fixed),
                                         label='RegExp Filter:')
        # Every change of the filter text re-runs the search
        self.filter_input.textChanged.connect(self.refresh_search)

        # Main area
        self.splitter = QSplitter(
            orientation=Qt.Horizontal,
            childrenCollapsible=False,
        )

        # Document list
        self.doc_list = QTableView()
        self.doc_list.setSelectionBehavior(QTableView.SelectRows)
        self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
        self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.doc_list.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)
        self.doc_list.horizontalHeader().setVisible(False)
        self.splitter.addWidget(self.doc_list)

        self.doc_list_model = QStandardItemModel(self)
        self.doc_list.setModel(self.doc_list_model)
        # Changing the selected rows re-renders the document view
        self.doc_list.selectionModel().selectionChanged.connect(self.show_docs)

        # Document contents
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
        # Highlighting runs once the page has finished loading
        self.doc_webview.loadFinished.connect(self.highlight_docs)

        self.mainArea.layout().addWidget(self.splitter)

    def copy_to_clipboard(self):
        """Copy the text selected in the document web view to the clipboard."""
        QApplication.clipboard().setText(self.doc_webview.selectedText())

    @Inputs.corpus
    def set_data(self, corpus=None):
        """Accept a new input corpus (or ``None``) and refresh the widget.

        The open context is closed and the widget reset first. With a
        corpus, both feature lists are rebuilt from the visible variables
        and metas with everything selected, the context is opened, and the
        documents, info and selection are refreshed. Outputs are committed
        in either case.
        """
        self.closeContext()
        self.reset_widget()
        self.corpus = corpus
        self.search_features = []
        if corpus is None:
            self.commit()
            return

        corpus_domain = self.corpus.domain

        def visible_features():
            return list(filter_visible(
                chain(corpus_domain.variables, corpus_domain.metas)))

        # Enable/disable tokens checkbox
        if not self.corpus.has_tokens():
            self.show_tokens_checkbox.setCheckState(False)
        self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())

        # Search and display each get their own list object
        self.search_features = visible_features()
        self.display_features = visible_features()
        self.search_indices = list(range(len(self.search_features)))
        self.display_indices = list(range(len(self.display_features)))
        self.selection = [0]
        self.openContext(self.corpus)
        self.display_list_indices = self.display_indices
        for refresh_step in (self.regenerate_docs, self.list_docs,
                             self.update_info, self.set_selection,
                             self.show_docs):
            refresh_step()
        self.commit()

    def reset_widget(self):
        """Put the widget back into its empty, corpus-less state."""
        # Corpus
        self.corpus = None
        self.corpus_docs = None
        self.output_mask = []
        self.display_features = []
        # Widgets
        for control in ('search_listbox', 'display_listbox', 'filter_input'):
            getattr(self, control).clear()
        self.update_info()
        # Models/vars
        for container in ('search_features', 'search_indices',
                          'display_indices', 'doc_list_model'):
            getattr(self, container).clear()
        # Warnings
        self.Warning.clear()
        # WebView
        self.doc_webview.setHtml('')

    def list_docs(self):
        """ List documents into the left scrolling area

        A document is listed when its entry in ``corpus_docs`` matches the
        regexp filter (case-insensitive, leading/trailing ``|`` stripped);
        an empty filter lists everything. The corpus positions of the listed
        documents are collected in ``output_mask``. If the filter is not a
        valid regular expression, the current list is left untouched.
        """
        if self.corpus_docs is None:
            return
        search_keyword = self.regexp_filter.strip('|')
        try:
            reg = re.compile(search_keyword, re.IGNORECASE)
        except re.error:
            # re.error is the public name of the exception raised for an
            # invalid pattern; no need to reach into sre_constants.
            return

        def is_match(x):
            return not bool(search_keyword) or reg.search(x)

        self.output_mask.clear()
        self.doc_list_model.clear()

        for i, (doc, title, content) in enumerate(
                zip(self.corpus, self.corpus.titles, self.corpus_docs)):
            if is_match(content):
                item = QStandardItem()
                item.setData(title, Qt.DisplayRole)
                # The document itself rides along for later rendering
                item.setData(doc, Qt.UserRole)
                self.doc_list_model.appendRow(item)
                self.output_mask.append(i)

    def reset_selection(self):
        """Select the first listed document, or blank the view if none."""
        if self.doc_list_model.rowCount() <= 0:
            self.doc_webview.setHtml('')
            return
        self.doc_list.selectRow(0)  # Select the first document

    def set_selection(self):
        """Select in the document list the rows held in ``self.selection``."""
        if not len(self.selection):
            return
        view = self.doc_list
        model = view.model()
        selection = QItemSelection()
        for row in self.selection:
            # A one-cell range in column 0 for each stored row
            cell = model.index(row, 0)
            selection.append(QItemSelectionRange(cell, cell))
        view.selectionModel().select(selection,
                                     QItemSelectionModel.ClearAndSelect)

    def show_docs(self):
        """ Render the selected documents as an HTML table in the right area """
        HTML = '''
        <!doctype html>
        <html>
        <head>
        <script type="text/javascript" src="resources/jquery-3.1.1.min.js">
        </script>
        <script type="text/javascript" src="resources/jquery.mark.min.js">
        </script>
        <script type="text/javascript" src="resources/highlighter.js">
        </script>
        <meta charset='utf-8'>
        <style>

        table {{ border-collapse: collapse; }}
        mark {{ background: #FFCD28; }}

        tr > td {{
            padding-bottom: 3px;
            padding-top: 3px;
        }}

        body {{
            font-family: Helvetica;
            font-size: 10pt;
        }}

        .line {{ border-bottom: 1px solid #000; }}
        .separator {{ height: 5px; }}

        .variables {{
            vertical-align: top;
            padding-right: 10px;
        }}

        .token {{
            padding: 3px;
            border: 1px #B0B0B0 solid;
            margin-right: 5px;
            margin-bottom: 5px;
            display: inline-block;
        }}

        img {{
            max-width: 100%;
        }}

        </style>
        </head>
        <body>
        {}
        </body>
        </html>
        '''
        self.display_indices = self.display_list_indices
        if self.corpus is None:
            return

        self.Warning.no_feats_display.clear()
        if len(self.display_indices) == 0:
            self.Warning.no_feats_display()

        if self.show_tokens:
            tokens = list(self.corpus.ngrams_iterator(include_postags=True))

        # Features ticked in the search list get the "mark-area" class
        marked_search_features = [
            feat for pos, feat in enumerate(self.search_features)
            if pos in self.search_indices
        ]

        selected_rows = self.doc_list.selectionModel().selectedRows()
        selection = [model_index.row() for model_index in selected_rows]
        if selection:
            # Remember the last non-empty selection
            self.selection = selection

        fragments = ['<table>']
        for doc_count, index in enumerate(selected_rows):
            if doc_count > 0:  # add split
                fragments.append('<tr class="line separator"><td/><td/></tr>'
                                 '<tr class="separator"><td/><td/></tr>')

            document = index.data(Qt.UserRole)
            row_ind = document.row_index
            for ind in self.display_indices:
                feature = self.display_features[ind]
                if feature in marked_search_features:
                    mark = 'class="mark-area"'
                else:
                    mark = ''
                value = str(document[feature.name])
                is_image = feature.attributes.get('type', '') == 'image'
                if is_image and value != '?':
                    value = '<img src="{}"></img>'.format(value)
                fragments.append(
                    '<tr><td class="variables"><strong>{}:</strong></td>'
                    '<td {}>{}</td></tr>'.format(feature.name, mark, value))

            if self.show_tokens:
                token_spans = ''.join(
                    '<span class="token">{}</span>'.format(token)
                    for token in tokens[row_ind])
                fragments.append(
                    '<tr><td class="variables"><strong>Tokens & Tags:</strong></td>'
                    '<td>{}</td></tr>'.format(token_spans))

        fragments.append('</table>')
        base = QUrl.fromLocalFile(__file__)
        self.doc_webview.setHtml(HTML.format(''.join(fragments)), base)

    def search_features_changed(self):
        """Callback of the search-features list box.

        Rebuilds the searchable documents from the current feature choice,
        then re-runs the search.
        """
        self.regenerate_docs()
        self.refresh_search()

    def regenerate_docs(self):
        """Rebuild ``corpus_docs`` from the features selected for search.

        Raises the "no features in search" warning when none are selected;
        without a corpus, ``corpus_docs`` is just reset to ``None``.
        """
        self.corpus_docs = None
        self.Warning.no_feats_search.clear()
        if self.corpus is None:
            return
        chosen = [self.search_features[pos] for pos in self.search_indices]
        if not chosen:
            self.Warning.no_feats_search()
        self.corpus_docs = self.corpus.documents_from_features(chosen)

    def refresh_search(self):
        """Re-list documents, reset selection, update info and commit.

        Does nothing when no corpus is set.
        """
        if self.corpus is None:
            return
        self.list_docs()
        self.reset_selection()
        self.update_info()
        self.commit()

    @Slot()
    def highlight_docs(self):
        """Ask the page's ``mark`` function to highlight the filter text.

        Connected to the web view's ``loadFinished`` signal. Nothing is run
        when the filter (with leading/trailing ``|`` stripped) is empty.
        """
        import json  # local import: only used to build a JS string literal

        search_keyword = self.regexp_filter.strip('|')

        if search_keyword:
            # mark is undefined when clearing the view (`setHtml('')`). Maybe
            # set and template html with all the scripts, ... but no contents?
            #
            # json.dumps produces a quoted, fully escaped string literal that
            # is also valid JavaScript, so backslashes, double quotes and
            # newlines in the filter can neither break the script nor inject
            # code into it (previously only backslashes were escaped).
            self.doc_webview.runJavaScript('''
                    if (typeof mark !== "undefined") {{
                        mark({});
                    }}
                '''.format(json.dumps(search_keyword)))

    def update_info(self):
        """Refresh the summary attributes (counts, flags, n-gram range).

        All of them are set to empty strings when there is no corpus.
        """
        corpus = self.corpus
        if corpus is None:
            for field in ('n_documents', 'n_matching', 'n_tokens', 'n_types',
                          'is_preprocessed', 'is_pos_tagged', 'ngram_range'):
                setattr(self, field, '')
            return

        self.n_documents = len(corpus)
        self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(),
                                         self.n_documents)
        # Token statistics are only available on a tokenized corpus
        if corpus.has_tokens():
            self.n_tokens = sum(len(doc_tokens)
                                for doc_tokens in corpus.tokens)
        else:
            self.n_tokens = 'n/a'
        if corpus.has_tokens():
            self.n_types = len(corpus.dictionary)
        else:
            self.n_types = 'n/a'
        self.is_preprocessed = corpus.has_tokens()
        self.is_pos_tagged = corpus.pos_tags is not None
        self.ngram_range = '{}-{}'.format(*corpus.ngram_range)

    def commit(self):
        """Send listed documents and the remaining ones to the two outputs.

        Both outputs receive ``None`` when there is no corpus.
        """
        if self.corpus is None:
            self.Outputs.matching_docs.send(None)
            self.Outputs.other_docs.send(None)
            return

        listed = set(self.output_mask)
        matched = self.corpus[self.output_mask]
        # Every corpus position that is not in the output mask
        remaining = [pos for pos in range(len(self.corpus))
                     if pos not in listed]
        unmatched = self.corpus[remaining]
        self.Outputs.matching_docs.send(matched)
        self.Outputs.other_docs.send(unmatched)
Example #12
0
class TestPerfectDomainContextHandler(TestCase):
    """Tests for ``PerfectDomainContextHandler`` bound to ``SimpleWidget``."""

    def setUp(self):
        """Create a mixed domain, its two encoded forms and a bound handler."""
        self.domain = Domain(
            attributes=[
                ContinuousVariable('c1'),
                DiscreteVariable('d1', values='abc'),
                DiscreteVariable('d2', values='def'),
            ],
            class_vars=[DiscreteVariable('d3', values='ghi')],
            metas=[
                ContinuousVariable('c2'),
                DiscreteVariable('d4', values='jkl'),
            ])
        # Encoding that records only the variable kind
        self.args = (self.domain, (('c1', Continuous), ('d1', Discrete),
                                   ('d2', Discrete)), (('d3', Discrete), ),
                     (('c2', Continuous), ('d4', Discrete)))
        # Encoding that records the values of discrete variables
        self.args_match_all = (self.domain,
                               (('c1', Continuous), ('d1', list('abc')),
                                ('d2', list('def'))), (('d3', list('ghi')), ),
                               (('c2', Continuous), ('d4', list('jkl'))))
        self.handler = PerfectDomainContextHandler()
        # Keep the handler from reading stored defaults
        self.handler.read_defaults = lambda: None
        self.handler.bind(SimpleWidget)
        self.widget = SimpleWidget()
        self.handler.initialize(self.widget)

    def test_new_context(self):
        """A new context stores the encoded attributes, class vars and metas."""
        context = self.handler.new_context(*self.args)
        _, attrs, class_vars, metas = self.args

        self.assertEqual(context.attributes, attrs)
        self.assertEqual(context.class_vars, class_vars)
        self.assertEqual(context.metas, metas)

    def test_open_context(self):
        """Opening a context calls ``new_context`` with the encoded domain."""
        context = Context()
        context.attributes = ()
        context.class_vars = ()
        self.handler.new_context = Mock(return_value=context)
        self.handler.open_context(self.widget, self.domain)
        self.handler.new_context.assert_called_with(*self.args)

    def test_encode_domain_simple(self):
        """By default discrete variables are encoded by kind only."""
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', Discrete),
                                 ('d2', Discrete)))
        self.assertEqual(class_vars, (('d3', Discrete), ))
        self.assertEqual(metas, (('c2', Continuous), ('d4', Discrete)))

    def test_encode_domain_match_values(self):
        """With MATCH_VALUES_ALL discrete variables carry their values."""
        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', list('abc')),
                                 ('d2', list('def'))))
        self.assertEqual(class_vars, (('d3', list('ghi')), ))
        self.assertEqual(metas, (('c2', Continuous), ('d4', list('jkl'))))

    def test_match_simple(self):
        """Only an identical encoded domain is a perfect match."""
        domain, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        self.assertEqual(self.handler.match(context, *self.args),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra meta
        metas2 = list(metas)
        metas2.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs, class_vars, metas2),
            self.handler.NO_MATCH)

    def test_match_values(self):
        """Matching with MATCH_VALUES_ALL also requires an identical domain."""
        domain, attrs, class_vars, metas = self.args_match_all
        context = self._create_context(attrs, class_vars, metas)

        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        self.assertEqual(self.handler.match(context, *self.args_match_all),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute. This case must be checked with attrs3; the
        # test used to pass attrs2 again, so it was never exercised.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

    def test_encode_setting(self):
        """Check the pairs ``encode_setting`` returns for three settings."""
        _, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)
        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.setting,
                                                      "d1")
        self.assertEqual(encoded_setting, ("d1", -2))

        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.text, "d1")
        self.assertEqual(encoded_setting, ("d1", -1))

        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.with_metas,
                                                      "d4")
        self.assertEqual(encoded_setting, ("d4", 1))

    def _create_context(self, attrs, class_vars, metas):
        """Return a fresh ``Context`` carrying the given encoded parts."""
        context = Context()
        context.attributes = attrs
        context.class_vars = class_vars
        context.metas = metas
        return context
Example #13
0
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL and sends
    it to the "Data" output.
    """
    # Widget metadata
    name = "File"
    id = "orange.widgets.data.file"
    description = "Read a data from an input file or network " \
                  "and send the data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["data", "file", "load", "read"]
    outputs = [widget.OutputSignal(
        "Data", Table,
        doc="Attribute-valued data set read from the input file.")]

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]

    # Possible values of the ``source`` setting
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler()

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    # File-dialog filter string: an "All readable files" entry followed by
    # one entry per distinct reader, ordered by first appearance in
    # FileFormat.readers.
    dlg_formats = (
        "All readable files ({});;".format(
            '*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join("{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
                  for f in sorted(set(FileFormat.readers.values()),
                                  key=list(FileFormat.readers.values()).index)))

    def __init__(self):
        """Initialise widget state and build the GUI.

        A grid layout holds the source selection: row 0 has the "File:"
        radio button, the recent-files combo and the browse/reload buttons,
        row 2 the (initially hidden) sheet selector and row 3 the "URL:"
        radio button with its editable combo. Below are the info box, the
        domain editor and a button row. The first load is scheduled through
        a zero-delay timer instead of being run directly.
        """
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QtGui.QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        # Radio group bound to ``source``; changing it reloads the data
        vbox = gui.radioButtons(None, self, "source", box=True, addSpace=True,
                                callback=self.load_data, addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, QtCore.Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(
            None, self, '...', callback=self.browse_file, autoDefault=False)
        file_button.setIcon(self.style().standardIcon(
            QtGui.QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(
            None, self, "Reload", callback=self.load_data, autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QtGui.QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Sheet selector; hidden until _update_sheet_combo shows it
        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(None, self, "xls_sheet",
                                        callback=self.select_sheet,
                                        sendSelectedValue=True)
        self.sheet_combo.setSizePolicy(
            Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QtGui.QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(
            Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(
            self.sheet_label, QtCore.Qt.AlignLeft)
        self.sheet_box.layout().addWidget(
            self.sheet_combo, QtCore.Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, QtCore.Qt.AlignVCenter)

        # Editable combo of recent URLs; activating an entry loads it
        self.url_combo = url_combo = QtGui.QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        # Widen the left text margin of the URL line edit by 5
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        domain_editor = DomainEditor(self.variables)
        self.editor_model = domain_editor.model()
        box.layout().addWidget(domain_editor)

        box = gui.hBox(self.controlArea)
        gui.button(
            box, self, "Browse documentation data sets",
            callback=lambda: self.browse_file(True), autoDefault=False)
        gui.rubber(box)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)

        # "Apply" stays hidden until the editor model reports a change
        self.apply_button = gui.button(
            box, self, "Apply", callback=self.apply_domain_edit)
        self.apply_button.hide()
        self.apply_button.setFixedWidth(170)
        self.editor_model.dataChanged.connect(self.apply_button.show)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)
        QtCore.QTimer.singleShot(0, self.load_data)

        self.setAcceptDrops(True)

    def sizeHint(self):
        """Return the preferred widget size, 600 by 550."""
        return QtCore.QSize(600, 550)

    def select_file(self, n):
        """Handle activation of entry ``n`` in the recent-files combo.

        Delegates to the parent ``select_file``; then, if there are any
        recent paths, switches the source to local file, reloads the data
        and refreshes the file list.
        """
        assert n < len(self.recent_paths)
        super().select_file(n)
        if not self.recent_paths:
            return
        self.source = self.LOCAL_FILE
        self.load_data()
        self.set_file_list()

    def select_sheet(self):
        """Store the sheet picked in the combo on the first recent path and
        reload the data."""
        chosen_sheet = self.sheet_combo.currentText()
        self.recent_paths[0].sheet = chosen_sheet
        self.load_data()

    def _url_set(self):
        """Slot for the URL combo's ``activated`` signal: switch the source
        to URL and load the data."""
        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        With ``in_demos`` the dialog starts in the sample data sets
        directory (an information box is shown and nothing else happens if
        that directory is missing); otherwise it starts at the last used
        path or the home directory. Cancelling the dialog changes nothing.
        """
        if not in_demos:
            start_dir = self.last_path() or os.path.expanduser("~/")
        else:
            start_dir = get_sample_datasets_dir()
            if not os.path.exists(start_dir):
                QtGui.QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation data sets")
                return

        chosen = QtGui.QFileDialog.getOpenFileName(
            self, 'Open Orange Data File', start_dir, self.dlg_formats)
        if not chosen:
            return
        self.loaded_file = chosen
        self.add_path(chosen)
        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read data with the current reader and send it to "Data".

        If reading raises, or yields ``None``, ``None`` is sent and the
        collected error text is put in the info label. The message of the
        last warning recorded while reading (or an empty string) is passed
        to ``self.warning``.
        """
        self.reader = self._get_reader()
        self._update_sheet_combo()

        errors = []
        with catch_warnings(record=True) as recorded:
            try:
                data = self.reader.read()
            except Exception as ex:
                errors.extend(["An error occurred:", str(ex)])
                data = None
                self.editor_model.reset()
            if recorded:
                warning_text = recorded[-1].message.args[0]
            else:
                warning_text = ''
            self.warning(warning_text)

        if data is None:
            self.send("Data", None)
            self.info.setText("\n".join(errors))
            return

        self.info.setText(self._describe(data))

        origin = self.loaded_file or self.last_path()
        add_origin(data, origin)
        self.send("Data", data)
        self.editor_model.set_domain(data.domain)
        self.data = data

    def _get_reader(self):
        """
        Create a reader for the currently selected source.

        For a local file the reader comes from the last used path, with the
        sheet stored on the first recent path selected when there is one.
        For a URL it wraps the text of the URL combo.

        Returns
        -------
        FileFormat
            ``None`` if ``source`` is neither ``LOCAL_FILE`` nor ``URL``.
        """
        if self.source == self.LOCAL_FILE:
            reader = FileFormat.get_reader(self.last_path())
            recent = self.recent_paths
            if recent and recent[0].sheet:
                reader.select_sheet(recent[0].sheet)
            return reader
        if self.source == self.URL:
            return UrlReader(self.url_combo.currentText())
        return None

    def _update_sheet_combo(self):
        """Show the sheet selector, filled with the reader's sheets, when
        the reader has at least two; otherwise hide it and deselect any
        sheet on the reader."""
        if len(self.reader.sheets) >= 2:
            self.sheet_combo.clear()
            self.sheet_combo.addItems(self.reader.sheets)
            self._select_active_sheet()
            self.sheet_box.show()
        else:
            self.sheet_box.hide()
            self.reader.select_sheet(None)

    def _select_active_sheet(self):
        """Point the sheet combo at the reader's sheet, or at the first
        entry when the reader has none selected."""
        if not self.reader.sheet:
            self.sheet_combo.setCurrentIndex(0)
            return
        try:
            position = self.reader.sheets.index(self.reader.sheet)
            self.sheet_combo.setCurrentIndex(position)
        except ValueError:
            # Requested sheet does not exist in this file
            self.reader.select_sheet(None)

    def _describe(self, table):
        """Return a multi-line text summary of ``table``.

        The summary gives the instance, feature and meta counts, the kind
        of target, and, if the domain has a 'Timestamp' column, its first
        and last entries.
        """
        domain = table.domain
        pieces = [
            "{} instance(s), {} feature(s), {} meta attribute(s)".format(
                len(table), len(domain.attributes), len(domain.metas))
        ]
        if domain.has_continuous_class:
            pieces.append("\nRegression; numerical class.")
        elif domain.has_discrete_class:
            pieces.append(
                "\nClassification; discrete class with {} values.".format(
                    len(domain.class_var.values)))
        elif table.domain.class_vars:
            pieces.append("\nMulti-target; {} target variables.".format(
                len(table.domain.class_vars)))
        else:
            pieces.append("\nData has no target variable.")
        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            pieces.append('\n\nFirst entry: {}\nLast entry: {}'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp']))
        return "".join(pieces)

    def storeSpecificSettings(self):
        """Save a copy of ``self.variables`` on the current context as
        ``modified_variables``."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Load ``modified_variables`` from the current context, if it has
        any, into ``self.variables`` in place."""
        context = self.current_context
        if not hasattr(context, "modified_variables"):
            return
        # Slice assignment keeps the same list object
        self.variables[:] = context.modified_variables

    def apply_domain_edit(self):
        """Build a table from the loaded data according to the editor model
        and send it to the "Data" output.

        Each editor row (name, type, place, ...) is paired with the original
        variable at the same position (attributes, then class vars, then
        metas). Columns are collected per target place, a new ``Domain`` and
        ``Table`` are built from them, and the "Apply" button is hidden.
        """
        attributes = []
        class_vars = []
        metas = []
        # Both lists are indexed by place: 0 attributes, 1 class vars, 2 metas
        places = [attributes, class_vars, metas]
        X, y, m = [], [], []
        cols = [X, y, m]  # Xcols, Ycols, Mcols

        def is_missing(x):
            # A value counts as missing if it prints as "nan" or as ""
            return str(x) in ("nan", "")

        for column, (name, tpe, place, vals, is_con), (orig_var, orig_plc) in \
            zip(count(), self.editor_model.variables,
                chain([(at, 0) for at in self.data.domain.attributes],
                      [(cl, 1) for cl in self.data.domain.class_vars],
                      [(mt, 2) for mt in self.data.domain.metas])):
            # Place 3 has no target list, so such columns are left out
            # (presumably the editor's "skip" choice; confirm against
            # DomainEditor).
            if place == 3:
                continue
            # Original metas are read from the .metas part of the column slice
            if orig_plc == 2:
                col_data = list(chain(*self.data[:, orig_var].metas))
            else:
                col_data = list(chain(*self.data[:, orig_var]))
            if name == orig_var.name and tpe == type(orig_var):
                # Name and exact type unchanged: reuse the original variable
                var = orig_var
            elif tpe == DiscreteVariable:
                # New discrete variable: its values are the distinct
                # non-missing entries as strings, and the column is
                # re-coded to indices into that list (NaN for missing).
                values = list(str(i) for i in set(col_data) if not is_missing(i))
                var = tpe(name, values)
                col_data = [np.nan if is_missing(x) else values.index(str(x))
                            for x in col_data]
            elif tpe == StringVariable and type(orig_var) == DiscreteVariable:
                # Discrete to string: replace each code by its printed value
                var = tpe(name)
                col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
                            for x in col_data]
            else:
                # Any other change: new variable, column data kept as is
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)
        domain = Domain(attributes, class_vars, metas)
        # Columns were gathered one list per variable; transpose to rows
        X = np.array(X).T if len(X) else np.empty((len(self.data), 0))
        y = np.array(y).T if len(y) else None
        # Metas need an object array as soon as one of them is a string
        dtpe = object if any(isinstance(m, StringVariable)
                             for m in domain.metas) else float
        m = np.array(m, dtype=dtpe).T if len(m) else None
        table = Table.from_numpy(domain, X, y, m, self.data.W)
        self.send("Data", table)
        self.apply_button.hide()

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        file_name = os.path.basename(self.loaded_file)
        stem, _ = os.path.splitext(file_name)
        return stem

    def send_report(self):
        """Add the data source (file or URL) and the loaded data to the
        report; only a "No file." paragraph is added when no data is
        loaded."""
        def get_ext_name(filename):
            extension = os.path.splitext(filename)[1]
            try:
                return FileFormat.names[extension]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source != self.LOCAL_FILE:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])
        else:
            home = os.path.expanduser("~")
            name = self.loaded_file
            if name.startswith(home):
                # os.path.join does not like ~
                tail = name[len(home):].lstrip("/").lstrip("\\")
                name = "~/" + tail
            if self.sheet_combo.isVisible():
                name += " ({})".format(self.sheet_combo.currentText())
            self.report_items("File", [("File name", name),
                                       ("Format", get_ext_name(name))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if not urls:
            return
        first_url = urls[0]
        try:
            path = OSX_NSURL_toLocalFile(first_url) or first_url.toLocalFile()
            # Only the first url is probed for a reader
            FileFormat.get_reader(path)
            event.acceptProposedAction()
        except IOError:
            # Leave the event unaccepted
            pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if not urls:
            return
        first_url = urls[0]
        dropped_path = OSX_NSURL_toLocalFile(first_url) or \
            first_url.toLocalFile()
        self.add_path(dropped_path)  # add first file
        self.source = self.LOCAL_FILE
        self.load_data()
Example #14
0
class OWSelectRows(widget.OWWidget):
    # Widget metadata.
    name = "Select Rows"
    # NOTE(review): this id names the File widget's module, which looks like
    # a copy-paste leftover -- confirm whether anything relies on it before
    # changing it.
    id = "Orange.widgets.data.file"
    description = "Select rows from the data based on values of variables."
    icon = "icons/SelectRows.svg"
    priority = 100
    category = "Data"
    author = "Peter Juvan, Janez Demšar"
    author_email = "janez.demsar(@at@)fri.uni-lj.si"
    inputs = [("Data", Table, "set_data")]
    outputs = [("Matching Data", Table, widget.Default),
               ("Unmatched Data", Table)]

    want_main_area = False

    # Settings; `conditions` is stored per context, the others are plain
    # widget settings.
    settingsHandler = PerfectDomainContextHandler()
    conditions = ContextSetting([])
    update_on_change = Setting(True)
    purge_attributes = Setting(True)
    purge_classes = Setting(True)
    auto_commit = Setting(True)

    # Operator labels shown for each variable type. The position of a label
    # in its list is the operator index kept in `conditions`; the last label
    # of every list is "is defined", which takes no value.
    operator_names = {
        ContinuousVariable: [
            "equals", "is not", "is below", "is at most", "is greater than",
            "is at least", "is between", "is outside", "is defined"
        ],
        DiscreteVariable: ["is", "is not", "is one of", "is defined"],
        StringVariable: [
            "equals", "is not", "is before", "is equal or before", "is after",
            "is equal or after", "is between", "is outside", "contains",
            "begins with", "ends with", "is defined"
        ]
    }

    def __init__(self):
        """Build the GUI: the condition table with its buttons, the data
        in/out info labels, the purging check boxes and the commit controls.
        """
        super().__init__()

        self.old_purge_classes = True

        self.conditions = []
        self.last_output_conditions = None
        self.data = None
        # Brief descriptions of the input and of both outputs, used by
        # send_report.
        self.data_desc = self.match_desc = self.nonmatch_desc = None

        # Table with one row per condition and three columns:
        # variable combo, operator combo and value widget(s).
        box = gui.widgetBox(self.controlArea, 'Conditions', stretch=100)
        self.cond_list = QtGui.QTableWidget(box)
        box.layout().addWidget(self.cond_list)
        self.cond_list.setShowGrid(False)
        self.cond_list.setSelectionMode(QtGui.QTableWidget.NoSelection)
        self.cond_list.setColumnCount(3)
        self.cond_list.setRowCount(0)
        self.cond_list.verticalHeader().hide()
        self.cond_list.horizontalHeader().hide()
        self.cond_list.resizeColumnToContents(0)
        self.cond_list.horizontalHeader().setResizeMode(
            QtGui.QHeaderView.Stretch)
        self.cond_list.viewport().setBackgroundRole(QtGui.QPalette.Window)

        # Buttons for adding and removing conditions.
        box2 = gui.widgetBox(box, orientation="horizontal")
        self.add_button = gui.button(box2,
                                     self,
                                     "Add condition",
                                     callback=self.add_row)
        self.add_all_button = gui.button(box2,
                                         self,
                                         "Add all variables",
                                         callback=self.add_all)
        self.remove_all_button = gui.button(box2,
                                            self,
                                            "Remove all",
                                            callback=self.remove_all)
        gui.rubber(box2)

        # Labels summarizing the input and the matching output; they are
        # filled in by update_info.
        info = gui.widgetBox(self.controlArea, '', orientation="horizontal")
        box_data_in = gui.widgetBox(info, 'Data In')
        self.data_in_variables = gui.widgetLabel(box_data_in, " ")
        gui.rubber(box_data_in)

        box_data_out = gui.widgetBox(info, 'Data Out')
        self.data_out_rows = gui.widgetLabel(box_data_out, " ")
        gui.rubber(box_data_out)

        # Purging options and commit controls.
        box = gui.widgetBox(self.controlArea, orientation="horizontal")
        box_setting = gui.widgetBox(box, 'Purging')
        self.cb_pa = gui.checkBox(box_setting,
                                  self,
                                  "purge_attributes",
                                  "Remove unused features",
                                  callback=self.conditions_changed)
        gui.separator(box_setting, height=1)
        self.cb_pc = gui.checkBox(box_setting,
                                  self,
                                  "purge_classes",
                                  "Remove unused classes",
                                  callback=self.conditions_changed)
        gui.auto_commit(box,
                        self,
                        "auto_commit",
                        label="Commit",
                        checkbox_label="Commit on change")
        # Put the controls into the "no data" state.
        self.set_data(None)
        self.resize(600, 400)

    def add_row(self, attr=None, condition_type=None, condition_value=None):
        """Append a condition row to the table.

        :param attr: index of the variable to preselect among the domain's
            variables followed by its metas; the first one when None
        :param condition_type: index of the operator to preselect
        :param condition_value: values to put into the value widget(s)
        """
        table_model = self.cond_list.model()
        new_row = table_model.rowCount()
        table_model.insertRow(new_row)

        variable_combo = QtGui.QComboBox(
            minimumContentsLength=12,
            sizeAdjustPolicy=(
                QtGui.QComboBox.AdjustToMinimumContentsLengthWithIcon))
        variable_combo.row = new_row
        domain = self.data.domain
        for variable in chain(domain.variables, domain.metas):
            variable_combo.addItem(*gui.attributeItem(variable))
        variable_combo.setCurrentIndex(attr or 0)
        self.cond_list.setCellWidget(new_row, 0, variable_combo)

        self.remove_all_button.setDisabled(False)
        # A given `attr` is passed on as the `adding_all` flag.
        preselected = attr is not None
        self.set_new_operators(variable_combo, preselected, condition_type,
                               condition_value)

        def on_variable_changed(_):
            # Rebuild the operator combo for the newly chosen variable.
            self.set_new_operators(variable_combo, False)

        variable_combo.currentIndexChanged.connect(on_variable_changed)

        self.cond_list.resizeRowToContents(new_row)

    def add_all(self):
        """Replace the conditions with one row for every variable and meta.

        When rows already exist the user is asked to confirm first; nothing
        changes unless the answer is Ok.
        """
        if self.cond_list.rowCount():
            message_box = QtGui.QMessageBox
            answer = message_box.question(
                self, "Remove existing filters",
                "This will replace the existing filters with "
                "filters for all variables.",
                message_box.Ok | message_box.Cancel)
            if answer != message_box.Ok:
                return
            self.remove_all()
        domain = self.data.domain
        n_columns = len(domain.variables) + len(domain.metas)
        for column in range(n_columns):
            self.add_row(column)

    def remove_all(self):
        """Remove all condition rows and process the changed conditions."""
        self.remove_all_rows()
        self.conditions_changed()

    def remove_all_rows(self):
        """Empty the condition table and disable the "Remove all" button."""
        table = self.cond_list
        table.clear()
        table.setRowCount(0)
        self.remove_all_button.setEnabled(False)

    def set_new_operators(self,
                          attr_combo,
                          adding_all,
                          selected_index=None,
                          selected_values=None):
        """Create the operator combo for the row of `attr_combo`.

        The combo lists the operators of the selected variable's type and
        is placed into column 1 after the value widgets have been built.

        :param attr_combo: variable combo of the row
        :param adding_all: passed on to `set_new_values`
        :param selected_index: operator index to preselect (0 when None)
        :param selected_values: values passed on to `set_new_values`
        """
        operators = QtGui.QComboBox()
        operators.row = attr_combo.row
        operators.attr_combo = attr_combo
        variable = self.data.domain[attr_combo.currentText()]
        operators.addItems(self.operator_names[type(variable)])
        operators.setCurrentIndex(selected_index or 0)
        self.set_new_values(operators, adding_all, selected_values)
        self.cond_list.setCellWidget(operators.row, 1, operators)

        def on_operator_changed(_):
            # A different operator may need different value widgets.
            self.set_new_values(operators, False)

        operators.currentIndexChanged.connect(on_operator_changed)

    @staticmethod
    def _get_lineedit_contents(box):
        """Return the texts of the line edits in `box`.

        `box.controls` is inspected when it exists, otherwise `box` itself.
        """
        texts = []
        for control in getattr(box, "controls", [box]):
            if isinstance(control, QtGui.QLineEdit):
                texts.append(control.text())
        return texts

    @staticmethod
    def _get_value_contents(box):
        """Return the values entered in `box` as a tuple.

        `box.controls` is inspected when it exists, otherwise `box` itself.
        Line edits contribute their text, combo boxes their current index
        and tool buttons with a popup the 1-based rows of the checked
        items; such a button's description text is refreshed as well.
        None is ignored.

        :raises TypeError: for any other kind of control
        """
        values = []
        checked_names = []
        for control in getattr(box, "controls", [box]):
            if control is None:
                continue
            if isinstance(control, QtGui.QLineEdit):
                values.append(control.text())
            elif isinstance(control, QtGui.QComboBox):
                values.append(control.currentIndex())
            elif isinstance(control, QtGui.QToolButton):
                if control.popup is None:
                    continue
                item_model = control.popup.list_view.model()
                for row in range(item_model.rowCount()):
                    item = item_model.item(row)
                    if item.checkState():
                        values.append(row + 1)
                        checked_names.append(item.text())
                control.desc_text = ', '.join(checked_names)
                control.set_text()
            else:
                raise TypeError('Type %s not supported.' % type(control))
        return tuple(values)

    class QDoubleValidatorEmpty(QtGui.QDoubleValidator):
        """Double validator that also accepts empty input."""

        def validate(self, input_, pos):
            """Accept empty input; defer to QDoubleValidator otherwise."""
            if input_:
                return super().validate(input_, pos)
            return (QtGui.QDoubleValidator.Acceptable, input_, pos)

    def set_new_values(self, oper_combo, adding_all, selected_values=None):
        """Rebuild the value widget(s) in column 2 of the row of `oper_combo`.

        :param oper_combo: operator combo of the row; its `attr_combo`
            attribute gives the row's variable combo
        :param adding_all: when true, `conditions_changed` is not called
        :param selected_values: values to show in the new widget(s)
        """
        def add_textual(contents):
            # Right-aligned line edit in `box`; finishing the edit
            # triggers conditions_changed.
            le = gui.lineEdit(box, self, None)
            if contents:
                le.setText(contents)
            le.setAlignment(QtCore.Qt.AlignRight)
            le.editingFinished.connect(self.conditions_changed)
            return le

        def add_numeric(contents):
            # Same as add_textual, restricted to numbers or empty text.
            le = add_textual(contents)
            le.setValidator(OWSelectRows.QDoubleValidatorEmpty())
            return le

        var = self.data.domain[oper_combo.attr_combo.currentText()]
        box = self.cond_list.cellWidget(oper_combo.row, 2)
        # `lc` holds the default contents of the value widgets: the first
        # two given values as strings, padded with empty strings.
        if selected_values is not None:
            lc = list(selected_values) + ["", ""]
            lc = [str(x) for x in lc[:2]]
        else:
            lc = ["", ""]
        # Texts already typed into an existing widget for the same variable
        # type take precedence over those defaults.
        if box and vartype(var) == box.var_type:
            lc = self._get_lineedit_contents(box) + lc
        oper = oper_combo.currentIndex()

        if oper == oper_combo.count() - 1:
            # The last operator ("is defined" in operator_names) has no value.
            self.cond_list.removeCellWidget(oper_combo.row, 2)
        elif var.is_discrete:
            if oper_combo.currentText() == "is one of":
                if selected_values:
                    lc = [x for x in list(selected_values)]
                button = DropDownToolButton(self, var, lc)
                button.var_type = vartype(var)
                self.cond_list.setCellWidget(oper_combo.row, 2, button)
            else:
                # The empty first item means "no value", so the combo's
                # index is the 1-based index of the variable's value.
                combo = QtGui.QComboBox()
                combo.addItems([""] + var.values)
                if lc[0]:
                    combo.setCurrentIndex(int(lc[0]))
                else:
                    combo.setCurrentIndex(0)
                combo.var_type = vartype(var)
                self.cond_list.setCellWidget(oper_combo.row, 2, combo)
                combo.currentIndexChanged.connect(self.conditions_changed)
        else:
            box = gui.widgetBox(self,
                                orientation="horizontal",
                                addToLayout=False)
            box.var_type = vartype(var)
            self.cond_list.setCellWidget(oper_combo.row, 2, box)
            if var.is_continuous:
                box.controls = [add_numeric(lc[0])]
                # Operators 6 and 7 ("is between", "is outside") take two
                # values; the last operator was handled above.
                if oper > 5:
                    gui.widgetLabel(box, " and ")
                    box.controls.append(add_numeric(lc[1]))
                gui.rubber(box)
            elif var.is_string:
                box.controls = [add_textual(lc[0])]
                # "is between" and "is outside" take two values.
                if oper in [6, 7]:
                    gui.widgetLabel(box, " and ")
                    box.controls.append(add_textual(lc[1]))
            else:
                box.controls = []
        if not adding_all:
            self.conditions_changed()

    def set_data(self, data):
        """Handle new input data.

        The current context is closed, the condition rows are removed and
        the controls are enabled according to `data`. For falsy `data`
        (such as None) the outputs are committed right away. Otherwise the
        context is opened, the stored conditions are restored as rows (one
        default row is added when there are none) and the result is
        committed unconditionally.

        :param data: the input table or None
        """
        self.closeContext()
        self.data = data
        self.cb_pa.setEnabled(not isinstance(data, SqlTable))
        self.cb_pc.setEnabled(not isinstance(data, SqlTable))
        self.remove_all_rows()
        self.add_button.setDisabled(data is None)
        self.add_all_button.setDisabled(
            data is None
            or len(data.domain.variables) + len(data.domain.metas) > 100)
        if not data:
            self.data_desc = None
            self.commit()
            return
        self.data_desc = report.describe_data_brief(data)
        self.conditions = []
        try:
            self.openContext(data)
        except Exception:
            # Restoring stored conditions stays best-effort, but a failure
            # is logged instead of disappearing silently.
            import logging
            logging.getLogger(__name__).warning(
                "Could not restore stored conditions", exc_info=True)

        if not self.conditions and len(data.domain.variables):
            self.add_row()
        self.update_info(data, self.data_in_variables)
        # The names do not change while rows are added, so the list is
        # built once instead of once per stored condition.
        attrs = [a.name for a in data.domain.variables + data.domain.metas]
        for attr, cond_type, cond_value in self.conditions:
            # Conditions on variables missing from this domain are skipped.
            if attr in attrs:
                self.add_row(attrs.index(attr), cond_type, cond_value)
        self.unconditional_commit()

    def conditions_changed(self):
        """Collect the conditions from the table and commit if needed.

        Each condition is a tuple (variable name, operator index, values).
        `commit` is called when `update_on_change` is set and the
        conditions differ from `last_output_conditions`.
        """
        # Reset first: if collecting fails below, no stale conditions remain.
        self.conditions = []
        try:
            self.conditions = [
                (self.cond_list.cellWidget(row, 0).currentText(),
                 self.cond_list.cellWidget(row, 1).currentIndex(),
                 self._get_value_contents(self.cond_list.cellWidget(row, 2)))
                for row in range(self.cond_list.rowCount())
            ]
        except AttributeError:
            # Attribute error appears if the signal is triggered when the
            # controls are being constructed
            return
        # The commit is kept outside the try block so that an
        # AttributeError raised while filtering is not silently swallowed.
        if self.update_on_change and (
                self.last_output_conditions is None
                or self.last_output_conditions != self.conditions):
            self.commit()

    def commit(self):
        """Filter the data by the conditions and send both outputs.

        Conditions without usable values are ignored. Without any usable
        condition the whole data is sent as matching and None as unmatched.
        Unless the data is an SqlTable, constant and unused values are
        removed from the outputs according to the purging settings. The
        output descriptions and the "Data Out" label are updated as well.
        """
        matching_output = self.data
        non_matching_output = None
        if self.data:
            domain = self.data.domain
            conditions = []
            for attr_name, oper, values in self.conditions:
                attr_index = domain.index(attr_name)
                attr = domain[attr_index]
                if attr.is_continuous:
                    if any(not v for v in values):
                        continue
                    try:
                        numbers = [float(v) for v in values]
                    except ValueError:
                        # Text that is not a number yet (e.g. a lone "-")
                        # is treated like a missing value instead of
                        # aborting the whole commit.
                        continue
                    condition = data_filter.FilterContinuous(
                        attr_index, oper, *numbers)
                elif attr.is_string:
                    condition = data_filter.FilterString(
                        attr_index, oper, *[str(v) for v in values])
                else:
                    if oper == 3:
                        # "is defined" takes no values
                        f_values = None
                    else:
                        if not values or not values[0]:
                            continue
                        # Stored indices are 1-based; 0 is the empty item.
                        values = [attr.values[i - 1] for i in values]
                        if oper == 0:
                            f_values = {values[0]}
                        elif oper == 1:
                            f_values = set(attr.values)
                            f_values.remove(values[0])
                        elif oper == 2:
                            f_values = set(values)
                        else:
                            raise ValueError("invalid operand")
                    condition = data_filter.FilterDiscrete(
                        attr_index, f_values)
                conditions.append(condition)

            if conditions:
                filters = data_filter.Values(conditions)
                matching_output = filters(self.data)
                # The negated filter selects the remaining rows.
                filters.negate = True
                non_matching_output = filters(self.data)

            purge_attrs = self.purge_attributes
            purge_classes = self.purge_classes
            if (purge_attrs or purge_classes) and \
                    not isinstance(self.data, SqlTable):
                attr_flags = sum([
                    Remove.RemoveConstant * purge_attrs,
                    Remove.RemoveUnusedValues * purge_attrs
                ])
                class_flags = sum([
                    Remove.RemoveConstant * purge_classes,
                    Remove.RemoveUnusedValues * purge_classes
                ])
                # same settings used for attributes and meta features
                remover = Remove(attr_flags, class_flags, attr_flags)

                matching_output = remover(matching_output)
                non_matching_output = remover(non_matching_output)

        self.send("Matching Data", matching_output)
        self.send("Unmatched Data", non_matching_output)

        self.match_desc = report.describe_data_brief(matching_output)
        self.nonmatch_desc = report.describe_data_brief(non_matching_output)

        self.update_info(matching_output, self.data_out_rows)

    def update_info(self, data, lab1):
        """Show the approximate row count and the column count of `data`.

        :param data: table to describe; the label is emptied for None
        :param lab1: label that receives the text
        """
        def count_and_plural(count):
            # A zero count is shown as "No"; every count but 1 gets an "s".
            shown = count if count else "No"
            suffix = "" if count == 1 else "s"
            return shown, suffix

        if data is None:
            lab1.setText("")
            return

        rows = count_and_plural(data.approx_len())
        columns = count_and_plural(
            len(data.domain.variables) + len(data.domain.metas))
        lab1.setText("~%s row%s, %s variable%s" % (rows + columns))

    def send_report(self):
        """Report the input data, the conditions and the output sizes.

        The domains are described in full only when the descriptions of
        the input and of the non-empty outputs differ in something other
        than the number of instances; otherwise only instance counts are
        reported.
        """
        if not self.data:
            self.report_paragraph("No data.")
            return

        # Compare the descriptions with the instance counts left out.
        pdesc = None
        describe_domain = False
        for d in (self.data_desc, self.match_desc, self.nonmatch_desc):
            if not d or not d["Data instances"]:
                continue
            ndesc = d.copy()
            del ndesc["Data instances"]
            if pdesc is not None and pdesc != ndesc:
                describe_domain = True
            pdesc = ndesc

        conditions = []
        domain = self.data.domain
        for attr_name, oper, values in self.conditions:
            attr_index = domain.index(attr_name)
            attr = domain[attr_index]
            names = self.operator_names[type(attr)]
            name = names[oper]
            if oper == len(names) - 1:
                # the last operator takes no value
                conditions.append("{} {}".format(attr, name))
            elif attr.is_discrete:
                if name == "is one of":
                    if len(values) == 1:
                        conditions.append("{} is {}".format(
                            attr, attr.values[values[0] - 1]))
                    elif len(values) > 1:
                        conditions.append("{} is {} or {}".format(
                            attr,
                            ", ".join(attr.values[v - 1] for v in values[:-1]),
                            attr.values[values[-1] - 1]))
                else:
                    if not (values and values[0]):
                        continue
                    value = values[0] - 1
                    conditions.append("{} {} {}".format(
                        attr, name, attr.values[value]))
            else:
                if attr.is_continuous and any(not v for v in values):
                    # commit ignores numeric conditions with a missing
                    # value, so they must not be reported as applied.
                    continue
                if len(values) == 1:
                    conditions.append("{} {} {}".format(attr, name, *values))
                else:
                    conditions.append("{} {} {} and {}".format(
                        attr, name, *values))
        items = OrderedDict()
        if describe_domain:
            items.update(self.data_desc)
        else:
            items["Instances"] = self.data_desc["Data instances"]
        items["Condition"] = " AND ".join(conditions) or "no conditions"
        self.report_items("Data", items)
        if describe_domain:
            self.report_items("Matching data", self.match_desc)
            self.report_items("Non-matching data", self.nonmatch_desc)
        else:
            match_inst = \
                bool(self.match_desc) and \
                self.match_desc["Data instances"]
            nonmatch_inst = \
                bool(self.nonmatch_desc) and \
                self.nonmatch_desc["Data instances"]
            self.report_items(
                "Output",
                (("Matching data",
                  "{} instances".format(match_inst) if match_inst else "None"),
                 ("Non-matching data", nonmatch_inst > 0
                  and "{} instances".format(nonmatch_inst))))
Example #15
0
class OWAlignDatasets(widget.OWWidget):
    # Widget metadata.
    name = "Align Datasets"
    description = "Alignment of multiple datasets with a diagram of correlation visualization."
    icon = "icons/AlignDatasets.svg"
    priority = 240

    # Input signals.
    class Inputs:
        data = Input("Data", Table)

    # Output signals.
    class Outputs:
        transformed_data = Output("Transformed Data", Table)
        genes_components = Output("Genes per n. Components", Table)

    # Settings stored per context.
    settingsHandler = PerfectDomainContextHandler()
    axis_labels = ContextSetting(10)
    source_id = ContextSetting(None)
    ncomponents = ContextSetting(20)
    ngenes = ContextSetting(30)
    scoring = ContextSetting(list(SCORINGS.keys())[0])
    quantile_normalization = ContextSetting(False)
    quantile_normalization_perc = ContextSetting(2.5)
    dynamic_time_warping = ContextSetting(False)

    # Plain widget settings.
    auto_update = Setting(True)
    auto_commit = Setting(True)

    graph_name = "plot.plotItem"

    # Error messages the widget can show.
    class Error(widget.OWWidget.Error):
        no_features = widget.Msg("At least 1 feature is required")
        no_instances = widget.Msg(
            "At least 2 data instances are required for each class")
        no_class = widget.Msg("At least 1 Discrete class variable is required")
        nan_class = widget.Msg(
            "Data contains undefined instances for the selected Data source indicator"
        )
        nan_input = widget.Msg("Input data contains non numeric values")
        sparse_data = widget.Msg("Sparse data is not supported")
        only_one_dataset = widget.Msg(
            "Data source indicator attribute column must indicate at least two datasets."
        )

    def __init__(self):
        """Initialize the state and build the control area and the plot."""
        super().__init__()
        self.data = None
        self.source_id = None
        self._mas = None
        self._Ws = None
        self._transformed = None
        self._components = None
        self._use_genes = None
        self._shared_correlations = None
        self._transformed_table = None
        # False until _setup_plot creates the movable cut line.
        self._line = False
        self._feature_model = DomainModel(valid_types=DiscreteVariable,
                                          separators=False)
        self._feature_model.set_domain(None)
        self._init_mas()
        self._legend = None
        form = QFormLayout(labelAlignment=Qt.AlignLeft,
                           formAlignment=Qt.AlignLeft,
                           fieldGrowthPolicy=QFormLayout.AllNonFixedFieldsGrow,
                           verticalSpacing=10)
        # Data source indicator
        box = gui.vBox(self.controlArea, "Data source indicator")

        gui.comboBox(
            box,
            self,
            "source_id",
            sendSelectedValue=True,
            callback=self._update_combo_source_id,
            model=self._feature_model,
        )

        # Canonical correlation analysis
        box = gui.vBox(self.controlArea, "Canonical correlation analysis")
        gui.spin(box,
                 self,
                 "ncomponents",
                 1,
                 MAX_COMPONENTS,
                 callback=self._update_selection_component_spin,
                 keyboardTracking=False,
                 label="Num. of components")

        # Shared genes
        box = gui.vBox(self.controlArea, "Shared genes")
        gui.spin(
            box,
            self,
            "ngenes",
            1,
            MAX_GENES,
            callback=self._update_ngenes_spin,
            keyboardTracking=False,
        )
        form.addRow("Num. of genes", self.controls.ngenes)

        gui.comboBox(
            box,
            self,
            "scoring",
            callback=self._update_scoring_combo,
            items=list(SCORINGS.keys()),
            sendSelectedValue=True,
            editable=False,
        )
        form.addRow("Scoring:", self.controls.scoring)

        box.layout().addLayout(form)

        # Post-processing
        box = gui.vBox(self.controlArea, "Post-processing")
        gui.doubleSpin(
            box,
            self,
            "quantile_normalization_perc",
            minv=0,
            maxv=49,
            step=5e-1,
            callback=self._update_quantile_normalization,
            checkCallback=self._update_quantile_normalization,
            controlWidth=80,
            alignment=Qt.AlignRight,
            label="Quantile normalization",
            checked="quantile_normalization",
        )
        self.controls.quantile_normalization_perc.setSuffix("%")
        b = gui.vBox(box)
        gui.checkBox(b,
                     self,
                     "dynamic_time_warping",
                     callback=self._update_dynamic_time_warping,
                     label="Dynamic time warping")

        self.controlArea.layout().addStretch()

        # Pass the method itself as the callback; calling it here would
        # run it once during construction and register its return value.
        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Apply",
                        callback=self._invalidate_selection,
                        checkbox_label="Apply automatically")

        self.plot = pg.PlotWidget(background="w")

        axis = self.plot.getAxis("bottom")
        axis.setLabel("Correlation components")
        axis = self.plot.getAxis("left")
        axis.setLabel("Correlation strength")
        self.plot_horlabels = []
        self.plot_horlines = []

        self.plot.getViewBox().setMenuEnabled(False)
        self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.showGrid(True, True, alpha=0.5)
        self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0))

        self.mainArea.layout().addWidget(self.plot)

    @Inputs.data
    @check_sql_input
    def set_data(self, data):
        """Handle new input data: reset the widget, validate and fit.

        The data source indicator is the "Source ID" variable when the
        domain has one (subject to what opening the context restores);
        when none is set, the first discrete variable with only finite
        values and more than one instance per value is used. An error is
        shown and nothing is fitted when the data fails a check.
        """
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.information()
        self.clear_outputs()
        self._feature_model.set_domain(None)
        self.data = data

        if not self.data:
            return

        self._feature_model.set_domain(self.data.domain)
        if not self._feature_model:
            self.Error.no_class()
            self.clear()
            return

        # if source id is available we assume that it is the feature that
        # describes a dataset source
        domain = self.data.domain
        if "Source ID" in domain:
            self.source_id = domain["Source ID"]
        self.openContext(domain)
        if self.source_id is None or self.source_id == '':
            for candidate in self._feature_model:
                column = np.array(self.data.get_column_view(candidate)[0],
                                  dtype=np.float64)
                _, counts = np.unique(column, return_counts=True)
                if np.isfinite(column).all() and min(counts) > 1:
                    self.source_id = candidate
                    self._reset_max_components()
                    break

        if not self.source_id:
            self.Error.nan_class()
            return
        if len(self.data.domain.attributes) == 0:
            self.Error.no_features()
            return
        if len(self.data) == 0:
            self.Error.no_instances()
            return
        if np.isnan(self.data.X).any():
            self.Error.nan_input()
            return

        source_column = np.array(
            self.data.get_column_view(self.source_id)[0], dtype=np.float64)
        _, counts = np.unique(source_column, return_counts=True)
        if min(counts) < 2:
            self.Error.no_instances()
            return

        self._reset_max_components()
        self.fit()

    def fit(self):
        """Fit the alignment model and redraw the plot.

        Nothing happens without data. `ncomponents` is capped at
        MAX_COMPONENTS. An error is shown when the source column has
        fewer than two distinct values. The result is committed when
        `auto_commit` is set.
        """
        if self.data is None:
            return
        if self.ncomponents > MAX_COMPONENTS:
            self.ncomponents = MAX_COMPONENTS

        features = self.data.X
        sources = self.data.get_column_view(self.source_id)[0]

        if len(set(sources)) < 2:
            self.Error.only_one_dataset()
            return
        self._init_mas()

        self._Ws = self._mas.fit(features, sources)
        self._shared_correlations = self._mas.shared_correlations
        has_nans = np.isnan(np.sum(self._shared_correlations))
        if has_nans:
            # Each curve is passed through interpolate_nans.
            self._shared_correlations = np.array(
                [interpolate_nans(curve)
                 for curve in self._shared_correlations])
        self._use_genes = self._mas.use_genes

        self._setup_plot()
        if self.auto_commit:
            self.commit()

    def clear(self):
        """Forget the data and all fitted results and reset the plot."""
        self.data = None
        self.source_id = None
        fitted_state = ("_mas", "_Ws", "_transformed", "_transformed_table",
                        "_components", "_use_genes", "_shared_correlations")
        for attribute in fitted_state:
            setattr(self, attribute, None)
        self._feature_model.set_domain(None)
        self.clear_plot()

    def clear_legend(self):
        """Remove the legend from its scene, if it has both."""
        legend = self._legend
        scene = legend.scene() if legend is not None else None
        if scene is not None:
            scene.removeItem(legend)
            self._legend = None

    def clear_plot(self):
        """Drop the legend, the lines and the model and redraw the plot."""
        self.clear_legend()
        self._line = False
        self.plot_horlabels, self.plot_horlines = [], []
        # _setup_plot only clears the plot when there is no model.
        self._mas = None
        self._setup_plot()

    def clear_outputs(self):
        """Send None to both outputs."""
        outputs = (self.Outputs.transformed_data,
                   self.Outputs.genes_components)
        for output in outputs:
            output.send(None)

    def _reset_max_components(self):
        """Update the module-level MAX_COMPONENTS for the current data.

        When the smallest group of the source column or the number of
        features is below MAX_COMPONENTS_DEFAULT, the limit becomes one
        less than the smaller of the two and a too large `ncomponents` is
        set to half of the limit. Otherwise the default limit is restored
        and `ncomponents` is reset to 20. The spin box maximum follows
        the limit.
        """
        global MAX_COMPONENTS
        column = np.array(self.data.get_column_view(self.source_id)[0],
                          dtype=np.float64)
        _, counts = np.unique(column, return_counts=True)
        smallest_group = min(counts)
        n_features = len(self.data.domain.attributes)
        if (smallest_group < MAX_COMPONENTS_DEFAULT
                or n_features < MAX_COMPONENTS_DEFAULT):
            MAX_COMPONENTS = min(smallest_group, n_features) - 1
            if self.ncomponents > MAX_COMPONENTS:
                self.ncomponents = MAX_COMPONENTS // 2
        else:
            MAX_COMPONENTS = MAX_COMPONENTS_DEFAULT
            self.ncomponents = 20
        self.controls.ncomponents.setMaximum(MAX_COMPONENTS)

    def _init_mas(self):
        """Create a new alignment model from the current settings."""
        model_settings = dict(
            n_components=MAX_COMPONENTS,
            n_metagenes=self.ngenes,
            gene_scoring=SCORINGS[self.scoring],
        )
        self._mas = SeuratAlignmentModel(**model_settings)

    def get_model(self):
        """Fit the model, redraw the plot and commit, if there is data."""
        if self.data is not None:
            self.fit()
            self._setup_plot()
            self.commit()

    def _setup_plot(self):
        """Redraw the plot of shared correlations.

        The plot is cleared and left empty when there is no model or no
        correlations. Otherwise one curve per row of the shared
        correlations is drawn (smoothed when there are enough
        components), with a legend, a movable vertical cut line at the
        selected number of components and dashed horizontal lines with
        labels at the cut.
        """
        self.plot.clear()
        shared_correlations = self._shared_correlations
        if self._mas is None or shared_correlations is None:
            return

        p = MAX_COMPONENTS

        # Colors chosen based on: http://colorbrewer2.org/?type=qualitative&scheme=Set1&n=9
        colors = [
            '#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33',
            '#a65628', '#f781bf', '#999999'
        ]
        # The palette is cycled so that more curves than colors do not
        # raise IndexError.
        n_colors = len(colors)

        self.clear_legend()
        self._legend = self.plot.addLegend(offset=(-1, 1))
        # correlation lines
        offset = 2
        if MAX_COMPONENTS > 2 * offset + 1:
            smoothed_correlations = smooth_correlations(shared_correlations,
                                                        offset=offset)
        else:
            smoothed_correlations = shared_correlations
        curves = [
            self.plot.plot(
                np.arange(p),
                corr,
                pen=pg.mkPen(QColor(colors[i % n_colors]), width=2),
                antialias=True)
            for i, corr in enumerate(smoothed_correlations)
        ]

        for i, _ in enumerate(curves):
            self._legend.addItem(
                MyLegendItem(pg.ScatterPlotItem(pen=colors[i % n_colors])),
                self.source_id.values[i])

        # vertical movable line
        cutpos = self.ncomponents - 1
        self._line = pg.InfiniteLine(angle=90,
                                     pos=cutpos,
                                     movable=True,
                                     bounds=(0, p - 1))
        self._line.setCursor(Qt.SizeHorCursor)
        self._line.setPen(pg.mkPen(QColor(Qt.black), width=2))
        self._line.sigPositionChanged.connect(self._on_cut_changed)
        self.plot.addItem(self._line)

        # horizontal lines
        self.plot_horlines = tuple(
            pg.PlotCurveItem(
                pen=pg.mkPen(QColor(colors[i % n_colors]),
                             style=Qt.DashLine))
            for i in range(len(shared_correlations)))
        self.plot_horlabels = tuple(
            pg.TextItem(color=QColor('k'), anchor=(0, 1))
            for _ in range(len(shared_correlations)))

        for item in self.plot_horlabels + self.plot_horlines:
            self.plot.addItem(item)
        self._set_horline_pos()

        self.plot.setXRange(0.0, p - 1, padding=0)
        self.plot.setYRange(0.0, 1.0, padding=0)
        self._update_axis()

    def _set_horline_pos(self):
        cutidx = self.ncomponents - 1
        for line, label, curve in zip(self.plot_horlines, self.plot_horlabels,
                                      self._shared_correlations):
            y = curve[cutidx]
            line.setData([-1, cutidx], 2 * [y])
            label.setPos(cutidx, y)
            label.setPlainText("{:.3f}".format(y))

    def _on_cut_changed(self, line):
        # cut changed by means of a cut line over the scree plot.
        value = int(round(line.value()))
        components = value + 1

        if not (self.ncomponents == 0 and components == len(self._components)):
            self.ncomponents = components

        self._line.setValue(value)
        self._set_horline_pos()
        self.commit()

    def _update_selection_component_spin(self):
        # cut changed by "ncomponents" spin.
        if self._mas is None:
            self._invalidate_selection()
            return

        if np.floor(self._line.value()) + 1 != self.ncomponents:
            self._line.setValue(self.ncomponents - 1)

        self.commit()

    def _invalidate_selection(self):
        if self.data is not None:
            self._transformed = None
            self.commit()

    def _update_scoring_combo(self):
        """Refit the model, then invalidate the current selection.

        Presumably the callback of the gene-scoring selector (judging by the
        name) -- confirm against the GUI setup.
        """
        self.fit()
        self._invalidate_selection()

    def _update_dynamic_time_warping(self):
        """Invalidate the current selection; no refit is triggered here."""
        self._invalidate_selection()

    def _update_quantile_normalization(self):
        """Invalidate the current selection; no refit is triggered here."""
        self._invalidate_selection()

    def _update_ngenes_spin(self):
        self.clear_plot()
        if self.data is None:
            return
        if self._has_nan_classes():
            self.Error.nan_class()
            return
        self.clear_messages()
        self.fit()
        self._invalidate_selection()

    def _update_combo_source_id(self):
        self.clear_plot()
        if self.data is None:
            return
        y = np.array(self.data.get_column_view(self.source_id)[0],
                     dtype=np.float64)
        _, counts = np.unique(y, return_counts=True)
        if min(counts) < 2:
            self.Error.no_instances()
            return
        self._reset_max_components()
        if self._has_nan_classes():
            self.Error.nan_class()
            return
        self.clear_messages()
        self.fit()
        self._invalidate_selection()

    def _update_axis(self):
        """Label the bottom axis with 1-based component numbers.

        Ticks are placed every ``step`` positions, where the step is derived
        from ``MAX_COMPONENTS`` and ``self.axis_labels`` and is at least 1.
        """
        n_positions = MAX_COMPONENTS
        bottom_axis = self.plot.getAxis("bottom")
        step = max((n_positions - 1) // (self.axis_labels - 1), 1)
        ticks = [(pos, str(pos + 1)) for pos in range(0, n_positions, step)]
        bottom_axis.setTicks([ticks])

    def _has_nan_classes(self):
        y = np.array(self.data.get_column_view(self.source_id)[0],
                     dtype=np.float64)
        return not np.isfinite(y).all()

    def commit(self):
        """Send the transformed data and the per-component gene ranks.

        When no model is fitted, ``None`` is sent on both outputs.
        Otherwise the full transform (``MAX_COMPONENTS`` components) and the
        gene-rank table are computed once and cached on ``self``; every call
        then projects both cached tables onto the first ``self.ncomponents``
        "CCA<n>" columns and sends the results.
        """
        transformed_table = meta_genes = None
        if self._mas is not None:
            # Compute the full transform (MAX_COMPONENTS components) only once.
            if self._transformed is None:
                X = self.data.X
                y = self.data.get_column_view(self.source_id)[0]
                self._transformed = self._mas.transform(
                    X,
                    y,
                    normalize=self.quantile_normalization,
                    quantile=self.quantile_normalization_perc,
                    dtw=self.dynamic_time_warping)

                attributes = tuple(
                    ContinuousVariable.make("CCA{}".format(x + 1))
                    for x in range(MAX_COMPONENTS))
                dom = Domain(attributes, self.data.domain.class_vars,
                             self.data.domain.metas)

                # Meta-genes: cell [gene - 1, component] holds the 1-based
                # position of the gene in that component's gene list; cells
                # that stay 0 are turned into NaN below.
                genes_components = np.zeros(
                    (self.data.X.shape[1], MAX_COMPONENTS))
                for key, genes in self._mas.use_genes.items():
                    seen = set()
                    for rank, gene in enumerate(genes, start=1):
                        # Only the first occurrence of a gene defines its
                        # rank, as a list ``index`` lookup would.
                        if gene not in seen:
                            seen.add(gene)
                            genes_components[gene - 1, key] = rank
                # ``np.nan`` is the spelling that also exists in NumPy >= 2.0.
                genes_components[genes_components == 0] = np.nan
                self.meta_genes = Table.from_numpy(Domain(attributes),
                                                   genes_components)

                # Transformed data: append the CCA columns to the original
                # domain, fill them in, then keep only the CCA columns
                # (class variables and metas are retained through ``dom``).
                transformed = self._transformed
                new_domain = add_columns(self.data.domain,
                                         attributes=attributes)
                transformed_table_temp = self.data.transform(new_domain)
                transformed_table_temp.X[:, -MAX_COMPONENTS:] = transformed
                self.transformed_table = Table.from_table(
                    dom, transformed_table_temp)

            ncomponents_attributes = tuple(
                ContinuousVariable.make("CCA{}".format(x + 1))
                for x in range(self.ncomponents))
            ncomponents_domain = Domain(ncomponents_attributes,
                                        self.data.domain.class_vars,
                                        self.data.domain.metas)

            meta_genes = self.meta_genes.transform(
                Domain(ncomponents_attributes))
            transformed_table = self.transformed_table.transform(
                ncomponents_domain)

        self.Outputs.transformed_data.send(transformed_table)
        self.Outputs.genes_components.send(meta_genes)

    def send_report(self):
        """Add the widget settings and the plot to the report.

        Nothing is reported while no data is set.
        """
        if self.data is None:
            return

        quantile_on = self.quantile_normalization
        # NOTE(review): disabled flags are reported as the string "False"
        # while the percentage falls back to the boolean False; presumably
        # the report renderer treats the two differently -- kept as is.
        items = (
            ("Source ID", self.source_id),
            ("Selected num. of components", self.ncomponents),
            ("Selected num. of genes", self.ngenes),
            ("Scoring", self.scoring),
            ("Quantile normalization", True if quantile_on else "False"),
            ("Quantile normalization percentage",
             self.quantile_normalization_perc if quantile_on else False),
            ("Dynamic time warping",
             True if self.dynamic_time_warping else "False"),
        )
        self.report_items(items)
        self.report_plot()

    """
Example #16
0
class OWLoadCorpus(OWWidget):
    """Widget that loads a corpus from a file and sends it on its output.

    The control area holds a file chooser, an info label with the number of
    documents and two lists that split the corpus' string meta variables into
    "used" and "ignored" text features.
    """

    name = "Corpus"
    description = "Load a corpus of text documents, (optionally) tagged with categories."
    icon = "icons/TextFile.svg"
    priority = 10

    outputs = [(Output.CORPUS, Corpus)]
    want_main_area = False
    resizing_enabled = False

    # File-dialog filter: one entry matching every readable extension,
    # followed by one entry per distinct reader (in registration order).
    dlgFormats = ("All readable files ({});;".format(
        '*' + ' *'.join(FileFormat.readers.keys())) + ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_files = Setting([])
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("Can't read file {} ({})")

    def __init__(self):
        """Build the GUI and load the first file of the file widget."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "Corpus file", orientation=0)
        widget = widgets.FileWidget(recent_files=self.recent_files,
                                    icon_size=(16, 16),
                                    on_open=self.open_file,
                                    directory_aliases={
                                        "Browse documentation corpora ...":
                                        get_sample_corpora_dir()
                                    },
                                    dialog_format=self.dlgFormats,
                                    dialog_title='Open Orange Document Corpus',
                                    allow_empty=False,
                                    reload_label='Reload',
                                    browse_label='Browse')
        fbox.layout().addWidget(widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "Corpus info", addSpace=True)
        corp_info = "Corpus of 0 documents."
        self.info_label = gui.label(ibox, self, corp_info)

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "Used text features", addSpace=True)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # Any change of the "used" list re-sends the corpus.
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=True)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # load first file
        widget.select(0)

    def open_file(self, path):
        """Load the corpus stored at *path* and refresh the feature lists.

        Both feature lists are emptied first. A failure while reading or
        processing the file is reported through ``Error.read_file`` instead
        of being raised.
        """
        self.closeContext()
        self.Error.read_file.clear()
        self.used_attrs_model[:] = []
        self.unused_attrs_model[:] = []
        if path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
                self.info_label.setText("Corpus of {} documents.".format(
                    len(self.corpus)))
                self.used_attrs = list(self.corpus.text_features)
                self.openContext(self.corpus)
                self.used_attrs_model.extend(self.used_attrs)
                self.unused_attrs_model.extend([
                    f for f in self.corpus.domain.metas
                    if f.is_string and f not in self.used_attrs_model
                ])
            # Deliberately broad: any reader failure becomes a widget error.
            # ``Exception`` (not ``BaseException``) so that KeyboardInterrupt
            # and SystemExit still propagate.
            except Exception as err:  # pylint: disable=broad-except
                self.Error.read_file(path, str(err))

    def update_feature_selection(self):
        """Apply the "used" list to the corpus and send the corpus."""
        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        def remove_duplicates(l):
            unique = []
            for i in l:
                if i not in unique:
                    unique.append(i)
            return unique

        if self.corpus is not None:
            self.corpus.set_text_features(
                remove_duplicates(self.used_attrs_model))
            self.send(Output.CORPUS, self.corpus)
            self.used_attrs = list(self.used_attrs_model)
Example #17
0
class OWMultifile(widget.OWWidget, RelocatablePathsWidgetMixin):
    """Widget that reads several files and outputs them as one table.

    The files are kept in ``recent_paths`` and mirrored, in the same order,
    in the list box ``self.lb``. Successfully read files are concatenated;
    the resulting domain can be edited before the table is sent.
    """

    name = "Multifile"
    id = "orangecontrib.spectroscopy.widgets.files"
    icon = "icons/multifile.svg"
    description = "Read data from input files " \
                  "and send a data table to the output."
    priority = 10000
    replaces = [
        "orangecontrib.infrared.widgets.owfiles.OWFiles",
        "orangecontrib.infrared.widgets.owmultifile.OWMultifile"
    ]

    class Outputs:
        data = Output("Data", Table, doc="Concatenated input files.")

    want_main_area = False

    file_idx = []

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_paths: List[RecentPath]
    variables: list

    sheet = Setting(None, schema_only=True)
    label = Setting("", schema_only=True)
    recent_paths = Setting([], schema_only=True)
    variables = ContextSetting([], schema_only=True)

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File(s) not found.")
        missing_reader = widget.Msg("Missing reader(s).")
        read_error = widget.Msg("Read error(s).")

    domain_editor = SettingProvider(DomainEditor)

    def __init__(self):
        """Build the GUI, list the stored paths and load their data."""
        widget.OWWidget.__init__(self)
        RelocatablePathsWidgetMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.sheets = []

        self.lb = gui.listBox(self.controlArea,
                              self,
                              "file_idx",
                              selectionMode=QListWidget.MultiSelection)
        # Foreground brush of a regular list item; captured lazily in
        # load_data so error highlighting can be reverted.
        self.default_foreground = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)

        file_button = gui.button(None,
                                 self,
                                 '  ...',
                                 callback=self.browse_files,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 0)

        remove_button = gui.button(None,
                                   self,
                                   'Remove',
                                   callback=self.remove_item)

        clear_button = gui.button(None, self, 'Clear', callback=self.clear)

        layout.addWidget(remove_button, 0, 1)
        layout.addWidget(clear_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 7)

        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_index = 0
        self.sheet_combo = gui.comboBox(None,
                                        self,
                                        "sheet_index",
                                        callback=self.select_sheet)
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        # NOTE(review): the sheet box is added to the grid twice, here at
        # (2, 1) and below at (0, 5); presumably only the second placement
        # is intended -- confirm before removing either call.
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        layout.addWidget(self.sheet_box, 0, 5)

        label_box = gui.hBox(None, addToLayout=False, margin=0)
        gui.lineEdit(label_box,
                     self,
                     "label",
                     callback=self.set_label,
                     label="Label",
                     orientation=Qt.Horizontal)
        layout.addWidget(label_box, 0, 6)

        layout.setColumnStretch(3, 2)

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        for rp in self.recent_paths:
            self.lb.addItem(rp.abspath)

        box = gui.hBox(self.controlArea)
        gui.rubber(box)

        if hasattr(DomainEditor, "reset_domain"):  # Orange>=3.21
            gui.button(box, self, "Reset", callback=self.reset_domain_edit)
        self.apply_button = gui.button(box,
                                       self,
                                       "Apply",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Any edit in the domain editor enables the "Apply" button.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self._update_sheet_combo()
        self.load_data()

    def set_label(self):
        """Reload the data after the label was edited."""
        self.load_data()

    def _select_active_sheet(self):
        """Select the stored sheet in the combo, or the first entry."""
        if self.sheet:
            try:
                sheet_list = [s[0] for s in self.sheets]
                idx = sheet_list.index(self.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.sheet = None
        else:
            self.sheet_combo.setCurrentIndex(0)

    def _update_sheet_combo(self):
        """Rebuild the sheet selector from the sheets of all listed files.

        Each sheet name is shown with the number of files providing it. The
        selector is hidden (and the stored sheet reset) when fewer than two
        distinct sheet names are found.
        """
        sheets = Counter()

        for rp in self.recent_paths:
            try:
                reader = _get_reader(rp)
                sheets.update(reader.sheets)
            # Best effort: a file without a usable reader contributes no
            # sheets; load_data reports such files to the user.
            except Exception:  # pylint: disable=broad-except
                pass

        sheets = sorted(sheets.items(), key=lambda x: x[0])

        self.sheets = [(s, s + " (" + str(n) + ")") for s, n in sheets]

        if len(sheets) < 2:
            self.sheet_box.hide()
            self.sheet = None
        else:
            self.sheets.insert(0, (None, "(None)"))
            self.sheet_combo.clear()
            self.sheet_combo.addItems([s[1] for s in self.sheets])
            self._select_active_sheet()
            self.sheet_box.show()

    def select_sheet(self):
        """Store the sheet chosen in the combo and reload the data."""
        self.sheet = self.sheets[self.sheet_combo.currentIndex()][0]
        self.load_data()

    def remove_item(self):
        """Remove the selected files from the list and reload the data."""
        ri = [i.row() for i in self.lb.selectedIndexes()]
        # Highest row first, so earlier removals do not shift later rows.
        for i in sorted(ri, reverse=True):
            self.recent_paths.pop(i)
            self.lb.takeItem(i)
        self._update_sheet_combo()
        self.load_data()

    def clear(self):
        """Remove all files from the list and reload (i.e. send no data)."""
        self.lb.clear()
        # Empty the list in place rather than rebinding the attribute.
        del self.recent_paths[:]
        self._update_sheet_combo()
        self.load_data()

    def browse_files(self, in_demos=False):
        """Ask the user for files to add and load them.

        ``in_demos`` is accepted but not used by this method.
        """
        start_file = self.last_path() or os.path.expanduser("~/")

        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]
        filenames, reader, _ = open_filename_dialog(
            start_file, None, readers, dialog=QFileDialog.getOpenFileNames)

        self.load_files(filenames, reader)

    def load_files(self, filenames, reader):
        """Append *filenames* (read with *reader*) to the list and reload."""
        if not filenames:
            return

        for f in filenames:
            self.add_path(f, reader)
            self.lb.addItem(f)

        self._update_sheet_combo()
        self.load_data()

    def load_data(self):
        """Read every listed file, concatenate the results and send them.

        A file that is missing, has no reader or fails to read is marked
        red in the list (with the reason as tooltip) and raises the matching
        widget error. If any error is shown, or nothing was read, no data is
        kept; otherwise the concatenated table is stored and its context
        opened. The output is (re)sent in either case.
        """
        self.closeContext()

        self.Error.file_not_found.clear()
        self.Error.missing_reader.clear()
        self.Error.read_error.clear()

        data_list = []
        fnok_list = []

        def show_error(li, msg):
            li.setForeground(Qt.red)
            li.setToolTip(msg)

        empty_domain = Domain(attributes=[])
        for i, rp in enumerate(self.recent_paths):
            fn = rp.abspath

            # Reset the item's look before (re)checking the file.
            li = self.lb.item(i)
            li.setToolTip("")
            if self.default_foreground is None:
                self.default_foreground = li.foreground()
            li.setForeground(self.default_foreground)

            if not os.path.exists(fn):
                show_error(li, "File not found.")
                self.Error.file_not_found()
                continue

            try:
                reader = _get_reader(rp)
                assert reader is not None
            except Exception:  # pylint: disable=broad-except
                show_error(li, "Reader not found.")
                self.Error.missing_reader()
                continue

            try:
                if self.sheet in reader.sheets:
                    reader.select_sheet(self.sheet)
                if isinstance(reader, SpectralFileFormat):
                    xs, vals, additional = reader.read_spectra()
                    if additional is None:
                        additional = Table.from_domain(empty_domain,
                                                       n_rows=len(vals))
                    data_list.append((xs, vals, additional))
                else:
                    data_list.append(reader.read())
                fnok_list.append(fn)
            except Exception as ex:  # pylint: disable=broad-except
                show_error(li, "Read error:\n" + str(ex))
                self.Error.read_error()

        if not data_list \
                or self.Error.file_not_found.is_shown() \
                or self.Error.missing_reader.is_shown() \
                or self.Error.read_error.is_shown():
            self.data = None
            self.domain_editor.set_domain(None)
        else:
            data = concatenate_data(data_list, fnok_list, self.label)
            self.data = data
            self.openContext(data.domain)

        self.apply_domain_edit()  # sends data

    def storeSpecificSettings(self):
        """Copy the edited variables into the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if any."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Apply the domain editor's changes and send the resulting table.

        ``None`` is sent when there is no data or the edited domain has
        neither variables nor metas.
        """
        if self.data is None:
            table = None
        else:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if not (domain.variables or domain.metas):
                table = None
            else:
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})

        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def reset_domain_edit(self):
        """Reset the domain editor and send the data again."""
        self.domain_editor.reset_domain()
        self.apply_domain_edit()

    def send_report(self):
        """Report the listed files with their formats, and the data."""
        def get_format_name(reader):
            try:
                return reader.DESCRIPTION
            except AttributeError:
                return reader.__class__.__name__

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        files = []

        for rp in self.recent_paths:
            reader = _get_reader(rp)
            files.append([rp.abspath, get_format_name(reader)])

        self.report_table("Files", table=files)

        self.report_data("Data", self.data)

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It make sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)

    def update_file_list(self, key, value, oldvalue):
        """Relocate the stored paths when the workflow's base dir changes."""
        if key == "basedir":
            self._relocate_recent_files()
class OWExplainPredictions(OWWidget, ConcurrentWidgetMixin):
    name = "Explain Predictions"
    description = "Predictions explanation widget."
    keywords = ["explain", "explain prediction", "explain model"]
    icon = "icons/ExplainPredictions.svg"
    priority = 120

    class Inputs:
        # Input signals of the widget; the handlers are the methods decorated
        # with ``@Inputs.<name>``.
        model = Input("Model", Model)
        background_data = Input("Background Data", Table)
        data = Input("Data", Table)

    class Outputs:
        # Output signals of the widget; "Selected Data" is the default one.
        selected_data = Output("Selected Data", Table, default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
        scores = Output("Scores", Table)

    class Error(OWWidget.Error):
        # The first two messages are bare "{}" templates: their whole text is
        # supplied by the caller.
        domain_transform_err = Msg("{}")
        unknown_err = Msg("{}")
        not_enough_data = Msg("At least two instances are needed.")

    class Information(OWWidget.Information):
        # Shown by on_done when not every entry of the results mask is set.
        data_sampled = Msg("Data has been sampled.")

    buttons_area_orientation = Qt.Vertical

    settingsHandler = PerfectDomainContextHandler()
    target_index = ContextSetting(0)
    order_index = ContextSetting(0)
    annot_index = ContextSetting(0)
    show_tooltip = Setting(True)
    highlight_feature = Setting(True)
    selection_ranges = Setting([], schema_only=True)
    auto_send = Setting(True)
    visual_settings = Setting({}, schema_only=True)

    graph_name = "graph.plotItem"

    ANNOTATIONS = ["None", "Enumeration"]

    def __init__(self):
        """Initialise widget state, build the GUI and the settings dialog."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        # Results of the last finished background run; None until available.
        self.__results: Optional[RunnerResults] = None
        self.model: Optional[Model] = None
        self.background_data: Optional[Table] = None
        self.data: Optional[Table] = None
        # cached instance indices after instance ordering
        self.__data_idxs: Optional[np.ndarray] = None
        # Selection stored in the settings, kept aside at construction time.
        self.__pending_selection: List[Tuple[float, float]] = \
            self.selection_ranges

        # GUI members; assigned for real in setup_gui().
        self.graph: ForcePlot = None
        self._target_combo: QComboBox = None
        self._order_combo: QComboBox = None
        self._annot_combo: QComboBox = None

        self.setup_gui()

        initial_settings = self.graph.parameter_setter.initial_settings
        VisualSettingsDialog(self, initial_settings)

    def setup_gui(self):
        """Create the plot, then the controls, then the buttons."""
        self._add_plot()
        self._add_controls()
        self._add_buttons()

    def _add_plot(self):
        """Create the force plot, apply the stored options and show it."""
        plot = ForcePlot(self)
        plot.set_show_tooltip(self.show_tooltip)
        plot.set_highlight_feature(self.highlight_feature)
        plot.selectionChanged.connect(self.__on_selection_changed)
        self.graph = plot

        container = gui.vBox(self.mainArea)
        container.layout().addWidget(plot)

    def __on_selection_changed(self, selection: List[Tuple[float, float]]):
        """Store the new selection ranges and commit."""
        self.selection_ranges = selection
        self.commit()

    def _add_controls(self):
        """Populate the control area.

        Adds the target-class, instance-order and annotation combo boxes and
        the two display check boxes, followed by a stretchable spacer.
        """
        target_box = gui.vBox(self.controlArea, "Target class")
        self._target_combo = gui.comboBox(
            target_box, self, "target_index",
            callback=self.__on_target_changed, contentsLength=12)

        order_box = gui.vBox(self.controlArea, "Instance order")
        self._order_combo = gui.comboBox(
            order_box, self, "order_index",
            callback=self.__on_order_changed,
            searchable=True, contentsLength=12)
        order_model = VariableListModel()
        order_model[:] = INSTANCE_ORDERINGS
        self._order_combo.setModel(order_model)

        annot_box = gui.vBox(self.controlArea, "Annotation")
        self._annot_combo = gui.comboBox(
            annot_box, self, "annot_index",
            callback=self.__on_annot_changed,
            searchable=True, contentsLength=12)
        annot_model = VariableListModel()
        annot_model[:] = self.ANNOTATIONS
        self._annot_combo.setModel(annot_model)

        options_box = gui.vBox(self.controlArea, "", margin=True,
                               contentsMargins=(8, 4, 8, 4))
        gui.checkBox(options_box, self, "show_tooltip", "Show tooltips",
                     callback=self.__on_show_tooltip_changed)
        gui.checkBox(options_box, self, "highlight_feature",
                     "Highlight feature on hover",
                     callback=self.__on_highlight_feature_changed)

        gui.rubber(self.controlArea)

    def __on_target_changed(self):
        """Target combo callback: reset selection, redraw and commit."""
        self.selection_ranges = []
        self.setup_plot()
        self.commit()

    def __on_order_changed(self):
        """Order combo callback: reset selection, redraw and commit."""
        self.selection_ranges = []
        self.setup_plot()
        self.commit()

    def __on_annot_changed(self):
        """Annotation combo callback: refresh the axis annotations.

        Does nothing unless both results and data are available.
        """
        if self.__results and self.data:
            self._set_plot_annotations()

    def __on_show_tooltip_changed(self):
        """Forward the "Show tooltips" setting to the plot."""
        self.graph.set_show_tooltip(self.show_tooltip)

    def __on_highlight_feature_changed(self):
        """Forward the "Highlight feature on hover" setting to the plot."""
        self.graph.set_highlight_feature(self.highlight_feature)

    def _add_buttons(self):
        """Add the zoom/select tool box and the auto-send control."""
        buttons_area = self.buttonsArea
        OWPlotGUI(self).box_zoom_select(buttons_area)
        gui.auto_send(buttons_area, self, "auto_send")

    @Inputs.data
    @check_sql_input
    def set_data(self, data: Optional[Table]):
        """Handle the "Data" input.

        The previous context is closed, the data is stored and validated,
        the controls are rebuilt and a context is opened for the data's
        domain (or for ``None`` when no usable data remains).
        """
        self.closeContext()
        self.data = data
        self._check_data()
        self._setup_controls()
        domain = self.data.domain if self.data else None
        self.openContext(domain)

    @Inputs.background_data
    @check_sql_input
    def set_background_data(self, data: Optional[Table]):
        """Store the "Background Data" input."""
        self.background_data = data

    @Inputs.model
    def set_model(self, model: Optional[Model]):
        """Store the "Model" input."""
        self.model = model

    def _check_data(self):
        """Reject data with fewer than two instances.

        Such data is replaced by ``None`` and the ``not_enough_data`` error
        is shown; the error is cleared on every call first.
        """
        self.Error.not_enough_data.clear()
        if not self.data or len(self.data) >= 2:
            return
        self.data = None
        self.Error.not_enough_data()

    def _setup_controls(self):
        """Reset the three combo boxes for the current data.

        The target combo lists the class values of a discrete class and is
        disabled for a continuous class. The order and annotation combos
        get their built-in entries followed by the domain's metas, class
        variables and attributes, each non-empty group preceded by a
        separator.
        """
        self._target_combo.clear()
        self._target_combo.setEnabled(True)

        self.order_index = 0
        self.annot_index = 0
        self._order_combo.clear()
        self._annot_combo.clear()
        orderings = INSTANCE_ORDERINGS
        annotations = self.ANNOTATIONS

        if self.data:
            domain = self.data.domain
            if domain.has_discrete_class:
                self._target_combo.addItems(domain.class_var.values)
                self.target_index = 0
            elif domain.has_continuous_class:
                self.target_index = -1
                self._target_combo.setEnabled(False)

            # Variable entries shared by both combos.
            variable_entries = []
            for group in (domain.metas, domain.class_vars, domain.attributes):
                if group:
                    variable_entries.append(VariableListModel.Separator)
                variable_entries.extend(group)

            orderings = chain(INSTANCE_ORDERINGS, variable_entries)
            annotations = chain(self.ANNOTATIONS, variable_entries)

        self._order_combo.model()[:] = orderings
        self._annot_combo.model()[:] = annotations

    def handleNewSignals(self):
        """Reset the widget, start a new background run and commit."""
        self.clear()
        self.start(run, self.data, self.background_data, self.model)
        self.commit()

    def clear(self):
        """Drop results, cancel the running task and reset the display."""
        self.__results = None
        self.cancel()
        # Messages raised for the previous inputs no longer apply.
        self.Error.domain_transform_err.clear()
        self.Error.unknown_err.clear()
        self.Information.data_sampled.clear()
        self.selection_ranges = []
        self.graph.clear_all()
        self.graph.set_axis(None)
        self.__data_idxs = None

    def setup_plot(self):
        """Redraw the force plot for the current target and ordering.

        The plot and the cached instance indices are always reset; nothing
        more is drawn unless both results and data are available. Otherwise
        the instances covered by the results mask are ordered, the plot data
        is prepared for the selected target and handed to the graph, and the
        axis annotations are refreshed.
        """
        self.graph.clear_all()
        self.__data_idxs = None
        if not self.__results or not self.data:
            return

        order = self._order_combo.model()[self.order_index]
        values_idxs = get_instance_ordering(
            self.__results.values[self.target_index],
            self.__results.predictions[self.__results.mask, self.target_index],
            self.data[self.__results.mask],
            order
        )

        # Indices into self.data of the plotted instances, in plot order.
        data_idxs = np.arange(len(self.data))
        self.__data_idxs = data_idxs[self.__results.mask][values_idxs]

        x_data, pos_y_data, neg_y_data, pos_labels, neg_labels = \
            prepare_force_plot_data_multi_inst(
                self.__results.values[self.target_index][values_idxs],
                self.__results.base_value[self.target_index],
                self.model.domain
            )

        # The first three combo entries get a descriptive axis label
        # (presumably they correspond to INSTANCE_ORDERINGS -- confirm);
        # any other entry is labelled with the combo item itself.
        builtin_order_labels = (
            "hierarchical clustering",
            "output value",
            "original ordering",
        )
        if 0 <= self.order_index < len(builtin_order_labels):
            order = builtin_order_labels[self.order_index]
        x_label = f"Instances ordered by {order}"

        target = self.model.domain.class_var
        if self.model.domain.has_discrete_class:
            target = f"{target} = {target.values[self.target_index]}"
        y_label = f"Output value ({target})"

        self.graph.set_data(x_data, pos_y_data, neg_y_data,
                            pos_labels, neg_labels, x_label, y_label,
                            self.__results.transformed_data[self.__data_idxs])
        self._set_plot_annotations()

    def _set_plot_annotations(self):
        """Set the plot's axis ticks according to the chosen annotation.

        A variable annotates each plotted instance with its value of that
        variable, "Enumeration" with its one-based row number, and "None"
        removes the ticks.

        Raises:
            NotImplementedError: for any other annotation entry.
        """
        annotator = self._annot_combo.model()[self.annot_index]

        if isinstance(annotator, Variable):
            plotted_rows = self.data[self.__data_idxs]
            labels = [(pos, str(row[annotator].value))
                      for pos, row in enumerate(plotted_rows)]
        elif annotator == "None":
            self.graph.set_axis([])
            return
        elif annotator == "Enumeration":
            labels = [(pos, str(row_idx + 1))
                      for pos, row_idx in enumerate(self.__data_idxs)]
        else:
            raise NotImplementedError(annotator)

        self.graph.set_axis([labels])

    def on_partial_result(self, _):
        """Ignore partial results; this widget only uses the final one."""

    def on_done(self, results: Optional[RunnerResults]):
        """Store the finished results and refresh plot, selection and scores.

        The "data sampled" information is shown when the results' mask does
        not cover every instance.
        """
        self.__results = results

        was_sampled = results is not None and not all(results.mask)
        if was_sampled:
            self.Information.data_sampled()

        self.setup_plot()
        self.apply_selection()
        self.output_scores()

    def on_exception(self, ex: Exception):
        """Show *ex* as a domain-transformation error or an unknown error."""
        if isinstance(ex, DomainTransformationError):
            report = self.Error.domain_transform_err
        else:
            report = self.Error.unknown_err
        report(ex)

    def onDeleteWidget(self):
        """Call ``shutdown()`` and then run the base-class deletion hook."""
        self.shutdown()
        super().onDeleteWidget()

    def apply_selection(self):
        """Apply the current selection, or the pending one, to the plot.

        The current ``selection_ranges`` take precedence over the pending
        selection. Nothing happens when both are empty; otherwise the
        pending selection is emptied once the ranges have been applied.
        """
        ranges = self.selection_ranges or self.__pending_selection
        if not ranges:
            return

        self.graph.apply_selection(ranges)
        self.__on_selection_changed(ranges)
        self.__pending_selection = []

    def commit(self):
        """Send the selected instances and the annotated data table.

        Every selected range is expanded to the whole plot positions it
        covers; those positions are mapped to row indices of the input data
        through the stored plot ordering.
        """
        selected_indices = []
        if self.__results:
            positions = set()
            for start, stop in self.selection_ranges:
                first = int(np.ceil(start))
                past_last = int(np.floor(stop) + 1)
                positions.update(range(first, past_last))
            selected_indices = sorted(self.__data_idxs[list(positions)])

        has_selection = self.data and selected_indices
        selected = self.data[selected_indices] if has_selection else None
        annotated = create_annotated_table(self.data, selected_indices)

        outputs = self.Outputs
        outputs.selected_data.send(selected)
        outputs.annotated_data.send(annotated)

    def output_scores(self):
        """Send a table with the scores of the current target.

        The table has one continuous ``S(<name>)`` attribute per attribute
        of the transformed data, and reuses that data's class variables,
        metas, Y and meta values. ``None`` is sent when there are no
        results.
        """
        results = self.__results
        if results is None:
            self.Outputs.scores.send(None)
            return

        kept = results.transformed_data[results.mask]
        source_domain = kept.domain
        score_attrs = [ContinuousVariable(f"S({attr.name})")
                       for attr in source_domain.attributes]
        score_domain = Domain(score_attrs, source_domain.class_vars,
                              source_domain.metas)

        table = Table(score_domain, results.values[self.target_index],
                      kept.Y, kept.metas)
        table.name = "Feature Scores"
        self.Outputs.scores.send(table)

    def send_report(self):
        """Report the target class and the plot.

        Nothing is reported unless data, background data and a model are
        all present. The target class is "None" unless the model's domain
        has a discrete class.
        """
        have_inputs = self.data and self.background_data and self.model
        if not have_inputs:
            return

        domain = self.model.domain
        if domain.has_discrete_class:
            target_name = domain.class_var.values[self.target_index]
        else:
            target_name = "None"

        self.report_items({"Target class": target_name})
        self.report_plot()

    def set_visual_settings(self, key: Tuple[str, str, str], value: Any):
        """Store a visual setting and pass it on to the plot.

        Args:
            key: three-string key identifying the setting.
            value: new value for that setting.
        """
        # Remember the value first, then hand it to the plot's setter.
        self.visual_settings[key] = value
        self.graph.parameter_setter.set_parameter(key, value)
class OWKaplanMeier(OWWidget):
    """Widget that draws a Kaplan-Meier plot for an input table.

    The user picks a time variable, an event variable and an optional
    discrete grouping variable. One curve is built for the whole table, or
    one per group value that occurs in the data. Rows that fall inside the
    intervals selected on the plot are sent to the output.
    """
    name = 'Kaplan-Meier Plot'
    # TODO
    description = ''
    # TODO
    icon = ''
    priority = 0

    show_confidence_interval: bool
    show_confidence_interval = Setting(False)

    show_median_line: bool
    show_median_line = Setting(False)

    show_censored_data: bool
    show_censored_data = Setting(False)

    settingsHandler = PerfectDomainContextHandler()
    time_var = ContextSetting(None)
    event_var = ContextSetting(None)
    group_var: Optional[DiscreteVariable] = ContextSetting(None)

    graph = SettingProvider(KaplanMeierPlot)

    auto_commit: bool = Setting(False, schema_only=True)

    class Inputs:
        data = Input('Data', Table)

    class Outputs:
        selected_data = Output('Data', Table)

    def __init__(self, *args, **kwargs):
        """Build the control area (variable combos, display options) and plot."""
        super().__init__(*args, **kwargs)

        self.data: Optional[Table] = None
        self.plot_curves = None

        time_var_model = DomainModel(valid_types=(ContinuousVariable, ))
        event_var_model = DomainModel(valid_types=DomainModel.PRIMITIVE)
        group_var_model = DomainModel(placeholder='(None)',
                                      valid_types=(DiscreteVariable, ))

        box = gui.vBox(self.controlArea, 'Time', margin=0)
        gui.comboBox(box,
                     self,
                     'time_var',
                     model=time_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Event', margin=0)
        gui.comboBox(box,
                     self,
                     'event_var',
                     model=event_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Group', margin=0)
        gui.comboBox(box,
                     self,
                     'group_var',
                     model=group_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Display options')
        gui.checkBox(
            box,
            self,
            'show_confidence_interval',
            label='Confidence intervals',
            callback=self.on_display_option_changed,
        )

        gui.checkBox(
            box,
            self,
            'show_median_line',
            label='Median',
            callback=self.on_display_option_changed,
        )

        gui.checkBox(
            box,
            self,
            'show_censored_data',
            label='Censored data',
            callback=self.on_display_option_changed,
        )

        self.graph: KaplanMeierPlot = KaplanMeierPlot(parent=self)
        self.graph.selection_changed.connect(self.commit)
        self.mainArea.layout().addWidget(self.graph)

        plot_gui = OWPlotGUI(self)
        plot_gui.box_zoom_select(self.controlArea)

        gui.rubber(self.controlArea)

        self.commit_button = gui.auto_commit(self.controlArea,
                                             self,
                                             'auto_commit',
                                             '&Commit',
                                             box=False)

    @Inputs.data
    def set_data(self, data: Optional[Table]):
        """Accept a new input table, or reset the widget when it is removed.

        A missing or empty table clears the stored data, the variable
        combos, the curves, the selection and the output, so that nothing
        from the previous input stays visible.
        """
        self.closeContext()

        self.data = data if data else None
        domain = self.data.domain if self.data is not None else None

        self.controls.time_var.model().set_domain(domain)
        self.controls.event_var.model().set_domain(domain)
        self.controls.group_var.model().set_domain(domain)
        self.time_var = None
        self.event_var = None
        self.group_var = None
        self.graph.selection = {}
        if domain is not None:
            self.openContext(domain)

        self._update_curves()
        self.graph.update_plot(**self._get_plot_options())
        self.commit()

    def _get_plot_options(self):
        """Return the display check-box states as ``update_plot`` keywords."""
        return {
            'confidence_interval': self.show_confidence_interval,
            'median': self.show_median_line,
            'censored': self.show_censored_data,
        }

    def _update_curves(self) -> None:
        """Give the plot freshly generated curves, keyed by their position."""
        self.graph.curves = dict(enumerate(self.generate_plot_curves()))

    def on_display_option_changed(self) -> None:
        """Redraw the plot with the current display options."""
        self.graph.update_plot(**self._get_plot_options())

    def on_controls_changed(self):
        """Rebuild the curves after a variable combo changed."""
        if not self.data:
            return

        self._update_curves()
        self.graph.clear_selection()
        self.graph.update_plot(**self._get_plot_options())
        self.commit()

    def _get_discrete_var_color(self, index: Optional[int]):
        """Return the group variable's color for *index* as a list.

        Returns ``None`` when there is no group variable or no index.
        """
        if self.group_var is not None and index is not None:
            return list(self.group_var.colors[index])
        return None

    def _non_empty_group_indexes(self, groups) -> List[int]:
        """Return indexes of the group values that occur in *groups*.

        Curves are only generated for these groups, in this order, so the
        position in the returned list is the id of the matching curve.
        """
        return [index for index, _ in enumerate(self.group_var.values)
                if (groups == index).any()]

    def generate_plot_curves(self) -> List[EstimatedFunctionCurve]:
        """Build the curves for the current variable choice.

        Returns an empty list when there is no data or when the time or the
        event variable is not set. Without a group variable a single curve
        over all rows is returned; with one, a colored and labeled curve is
        returned for every group value that occurs in the data.
        """
        if self.data is None or self.time_var is None \
                or self.event_var is None:
            return []

        time, _ = self.data.get_column_view(self.time_var)
        events, _ = self.data.get_column_view(self.event_var)

        if not self.group_var:
            return [EstimatedFunctionCurve(time, events)]

        groups, _ = self.data.get_column_view(self.group_var)
        curves = []
        for index in self._non_empty_group_indexes(groups):
            mask = groups == index
            curves.append(
                EstimatedFunctionCurve(
                    time[mask],
                    events[mask],
                    color=self._get_discrete_var_color(index),
                    label=self.group_var.values[index]))
        return curves

    def commit(self):
        """Send the rows whose time lies inside the selected intervals.

        ``None`` is sent when there is no data or no selection. With a group
        variable, each selected interval only picks rows of the group that
        its curve was drawn for.
        """
        if self.data is None or not self.graph.selection:
            self.Outputs.selected_data.send(None)
            return

        time, _ = self.data.get_column_view(self.time_var)
        if self.group_var is None:
            time_interval = self.graph.selection[0].x
            start, end = time_interval[0], time_interval[-1]
            selection = np.flatnonzero((time >= start) & (time <= end))
        else:
            group, _ = self.data.get_column_view(self.group_var)
            # Curve ids count only non-empty groups, so they have to be
            # translated back to group indexes before comparing with the
            # group column.
            # NOTE(review): assumes graph.selection is keyed by curve id,
            # as graph.curves is - confirm against KaplanMeierPlot.
            curve_groups = self._non_empty_group_indexes(group)
            selection = []
            for curve_id, time_interval in self.graph.selection.items():
                start, end = time_interval.x[0], time_interval.x[-1]
                in_curve = ((time >= start) & (time <= end)
                            & (group == curve_groups[curve_id]))
                selection.extend(np.flatnonzero(in_curve).tolist())
            selection = sorted(selection)

        self.Outputs.selected_data.send(self.data[selection, :])

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(1280, 620)
Example #20
0
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL.

    The source is chosen with radio buttons (recent files combo or URL
    combo), an optional reader can be forced through the reader combo, and
    the resulting domain can be edited before the table is sent to the
    output.
    """
    name = "文件(File)"
    id = "orange.widgets.data.file"
    description = "从输入文件或网络读取数据并将数据表发送到输出。"

    icon = "icons/File.svg"
    priority = 10
    category = "数据(Data)"
    keywords = ["file", "load", "read", "open", "wenjian"]

    class Outputs:
        data = Output("数据(Data)",
                      Table,
                      doc="Attribute-valued dataset read from the input file.",
                      replaces=['Data'])

    want_main_area = False
    buttons_area_orientation = None

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Files larger than this (in bytes, as compared with os.path.getsize)
    # are not loaded automatically when the widget is created.
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # pylint seems to want declarations separated from definitions
    recent_paths: List[RecentPath]
    recent_urls: List[str]
    variables: list

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
        RecentPath("", "sample-datasets", "brown-selected.tab"),
        RecentPath("", "sample-datasets", "zoo.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    domain_editor = SettingProvider(DomainEditor)

    class Information(widget.OWWidget.Information):
        no_file_selected = Msg("No file selected.")

    class Warning(widget.OWWidget.Warning):
        file_too_big = Msg("The file is too large to load automatically."
                           " Press Reload to load.")
        load_warning = Msg("Read warning:\n{}")
        performance_warning = Msg(
            "Categorical variables with >100 values may decrease performance.")
        renamed_vars = Msg("Some variables have been renamed "
                           "to avoid duplicates.\n{}")
        multiple_targets = Msg("Most widgets do not support multiple targets")

    class Error(widget.OWWidget.Error):
        file_not_found = Msg("File not found.")
        missing_reader = Msg("Missing reader.")
        sheet_error = Msg("Error listing available sheets.")
        unknown = Msg("Read error:\n{}")

    UserAdviceMessages = [
        widget.Message(
            "Use CSV File Import widget for advanced options "
            "for comma-separated files", "use-csv-file-import"),
        widget.Message(
            "This widget loads only tabular data. Use other widgets to load "
            "other data types like models, distance matrices and networks.",
            "other-data-types")
    ]

    def __init__(self):
        """Build the GUI and schedule the initial load of the last source."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]

        def group_readers_per_addon_key(w):
            # readers from Orange.data.io should go first
            def package(w):
                package = w.qualified_name().split(".")[:-1]
                package = package[:2]
                if ".".join(package) == "Orange.data":
                    return ["0"]  # force "Orange" to come first
                return package

            return package(w), w.DESCRIPTION

        self.available_readers = sorted(set(readers),
                                        key=group_readers_per_addon_key)

        layout = QGridLayout()
        layout.setSpacing(4)
        gui.widgetBox(self.controlArea, orientation=layout, box='数据源')
        vbox = gui.radioButtons(None,
                                self,
                                "source",
                                box=True,
                                callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "文件:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.Expanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.Expanding, Policy.Fixed)
        self.file_combo.setMinimumSize(QSize(100, 1))
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "重新加载",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = QComboBox()
        self.sheet_combo.activated[str].connect(self.select_sheet)
        self.sheet_combo.setSizePolicy(Policy.Expanding, Policy.Fixed)
        self.sheet_combo.setMinimumSize(QSize(50, 1))
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        self.url_combo = url_combo = QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.Ignored, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 1, 3)
        url_combo.activated.connect(self._url_set)
        # with the completer we make the combo box case sensitive when
        # matching the history
        completer = QCompleter()
        completer.setCaseSensitivity(Qt.CaseSensitive)
        url_combo.setCompleter(completer)

        layout = QGridLayout()
        layout.setSpacing(4)
        gui.widgetBox(self.controlArea, orientation=layout, box='文件类型')

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.Expanding, Policy.Fixed)
        self.reader_combo = QComboBox(self)
        self.reader_combo.setSizePolicy(Policy.Expanding, Policy.Fixed)
        self.reader_combo.setMinimumSize(QSize(100, 1))
        self.reader_combo.activated[int].connect(self.select_reader)

        box.layout().addWidget(self.reader_combo)
        layout.addWidget(box, 0, 1)

        box = gui.vBox(self.controlArea, "信息")
        self.infolabel = gui.widgetLabel(box, '未加载数据.')

        box = gui.widgetBox(self.controlArea, "列(双击编辑)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        box = gui.hBox(box)
        gui.button(box,
                   self,
                   "重置",
                   callback=self.reset_domain_edit,
                   autoDefault=False)
        gui.rubber(box)
        self.apply_button = gui.button(box,
                                       self,
                                       "应用",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        hBox = gui.hBox(self.controlArea)
        gui.rubber(hBox)
        gui.button(hBox,
                   self,
                   "浏览文档数据集",
                   callback=lambda: self.browse_file(True),
                   autoDefault=False)
        gui.rubber(hBox)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        QTimer.singleShot(0, self.load_data)

    @staticmethod
    def sizeHint():
        """Return the preferred initial size of the widget."""
        return QSize(600, 550)

    def select_file(self, n):
        """Make the *n*-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def select_sheet(self):
        """Store the sheet chosen in the combo on the current path and reload."""
        self.recent_paths[0].sheet = self.sheet_combo.currentText()
        self.load_data()

    def select_reader(self, n):
        """Set the reader for the current local file from combo index *n*.

        Index 0 means the default reader, indexes up to the number of
        available readers refer to ``available_readers``, and any further
        item holds a qualified reader name as its text.
        """
        if self.source != self.LOCAL_FILE:
            return  # ignore for URL's

        if self.recent_paths:
            path = self.recent_paths[0]
            if n == 0:  # default
                path.file_format = None
                self.load_data()
            elif n <= len(self.available_readers):
                reader = self.available_readers[n - 1]
                path.file_format = reader.qualified_name()
                self.load_data()
            else:  # the rest include just qualified names
                path.file_format = self.reader_combo.itemText(n)
                self.load_data()

    def _url_set(self):
        """Normalize the URL typed in the combo and load from it.

        A URL without a scheme gets ``http://`` prepended, both in the combo
        and in the list of recent URLs.
        """
        url = self.url_combo.currentText()
        pos = self.recent_urls.index(url)
        url = url.strip()

        if not urlparse(url).scheme:
            url = 'http://' + url
            self.url_combo.setItemText(pos, url)
            self.recent_urls[pos] = url

        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        Args:
            in_demos: start in the sample datasets directory instead of the
                last used path (or the home directory).
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(None, "文件", "无法找到文件")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        filename, reader, _ = open_filename_dialog(start_file, None,
                                                   self.available_readers)
        if not filename:
            return
        self.add_path(filename)
        if reader is not None:
            self.recent_paths[0].file_format = reader.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Load the current source and send the data, or report the failure.

        On failure the returned message is shown, the stored data is
        dropped and ``None`` is sent to the output.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if error:
            error()
            self.data = None
            self.sheet_box.hide()
            self.Outputs.data.send(None)
            self.infolabel.setText("无数据")

    def _try_load(self):
        """Try to read the current source.

        Returns:
            ``None`` on success (the data has then been stored and sent), or
            a callable that shows the appropriate message when called.
        """
        self._initialize_reader_combo()

        # pylint: disable=broad-except
        if self.source == self.LOCAL_FILE:
            if self.last_path() is None:
                return self.Information.no_file_selected
            elif not os.path.exists(self.last_path()):
                return self.Error.file_not_found
        else:
            url = self.url_combo.currentText().strip()
            if not url:
                return self.Information.no_file_selected

        def mark_problematic_reader():
            # Paint the currently selected reader red in the combo.
            self.reader_combo.setItemData(self.reader_combo.currentIndex(),
                                          QBrush(Qt.red), Qt.ForegroundRole)

        try:
            self.reader = self._get_reader()  # also sets current reader index
            assert self.reader is not None
        except MissingReaderException:
            mark_problematic_reader()
            return self.Error.missing_reader
        except Exception as ex:
            mark_problematic_reader()
            log.exception(ex)
            return lambda x=ex: self.Error.unknown(str(x))

        try:
            self._update_sheet_combo()
        except Exception:
            return self.Error.sheet_error

        with log_warnings() as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                mark_problematic_reader()
                log.exception(ex)
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                # Only the most recent warning is shown.
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.infolabel.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data
        return None

    def _get_reader(self) -> FileFormat:
        """Return a reader for the current source and sync the reader combo.

        For a local file, the reader stored with the recent path is used if
        there is one, otherwise the reader is chosen from the path. For a
        URL a ``UrlReader`` is returned.

        Raises:
            MissingReaderException: the stored reader class cannot be
                resolved from its qualified name.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            self.reader_combo.setEnabled(True)
            if self.recent_paths and self.recent_paths[0].file_format:
                qname = self.recent_paths[0].file_format
                qname_index = {
                    r.qualified_name(): i
                    for i, r in enumerate(self.available_readers)
                }
                if qname in qname_index:
                    # +1 because combo item 0 is the default reader.
                    self.reader_combo.setCurrentIndex(qname_index[qname] + 1)
                else:
                    # reader may be accessible, but not in self.available_readers
                    # (perhaps its code was moved)
                    self.reader_combo.addItem(qname)
                    self.reader_combo.setCurrentIndex(
                        len(self.reader_combo) - 1)
                try:
                    reader_class = class_from_qualified_name(qname)
                except Exception as ex:
                    raise MissingReaderException(
                        f'Can not find reader "{qname}"') from ex
                reader = reader_class(path)
            else:
                self.reader_combo.setCurrentIndex(0)
                reader = FileFormat.get_reader(path)
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        else:
            url = self.url_combo.currentText().strip()
            return UrlReader(url)

    def _update_sheet_combo(self):
        """Show the sheet combo only when the reader offers several sheets."""
        if len(self.reader.sheets) < 2:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Select the reader's sheet in the combo, or fall back to the first."""
        try:
            idx = self.reader.sheets.index(self.reader.sheet)
            self.sheet_combo.setCurrentIndex(idx)
        except ValueError:
            # Requested sheet does not exist in this file
            self.reader.select_sheet(None)
            self.sheet_combo.setCurrentIndex(0)

    def _initialize_reader_combo(self):
        """Refill the reader combo with the default entry and known readers."""
        self.reader_combo.clear()
        filters = [format_filter(f) for f in self.available_readers]
        self.reader_combo.addItems([DEFAULT_READER_TEXT] + filters)
        self.reader_combo.setCurrentIndex(0)
        self.reader_combo.setDisabled(True)
        # additional readers may be added in self._get_reader()

    @staticmethod
    def _describe(table):
        """Return an HTML summary of *table* for the info label.

        The summary lists the table's name and description (when present in
        its attributes), the number of rows, features and metas, the kind of
        target, and the proportions of missing values. When the domain has a
        ``Timestamp`` column and the table has rows, the first and the last
        timestamp are added.
        """
        def missing_prop(prop):
            if prop:
                return f"({prop * 100:.1f}% 个缺失值)"
            else:
                return "(无缺失值)"

        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        if len(descs) == 2:
            descs[0] = f"<b>{descs[0]}</b>"
        if descs:
            text += f"<p>{'<br/>'.join(descs)}</p>"

        text += f"<p>{len(table)} 条数据"

        missing_in_attr = missing_prop(table.has_missing_attribute()
                                       and table.get_nan_frequency_attribute())
        missing_in_class = missing_prop(table.has_missing_class()
                                        and table.get_nan_frequency_class())
        text += f"<br/>特征数目: {len(domain.attributes)} {missing_in_attr}"
        if domain.has_continuous_class:
            text += f"<br/>回归; 数值类 {missing_in_class}"
        elif domain.has_discrete_class:
            text += "<br/>分类: 分类种类共 " \
                f"{len(domain.class_var.values)} 个 {missing_in_class}"
        elif table.domain.class_vars:
            text += "<br/>Multi-target; " \
                f"{len(table.domain.class_vars)} target variables " \
                f"{missing_in_class}"
        else:
            text += "<br/>Data has no target variable."
        text += f"<br/>元属性: { len(domain.metas)}"
        text += "</p>"

        # A table without rows has no first or last entry to show.
        if 'Timestamp' in table.domain and len(table) > 0:
            # Google Forms uses this header to timestamp responses
            text += f"<p>First entry: {table[0, 'Timestamp']}<br/>" \
                f"Last entry: {table[-1, 'Timestamp']}</p>"
        return text

    def storeSpecificSettings(self):
        """Copy the edited variables into the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if stored."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def reset_domain_edit(self):
        """Undo the edits made in the domain editor and resend the data."""
        self.domain_editor.reset_domain()
        self.apply_domain_edit()

    def _inspect_discrete_variables(self, domain):
        """Warn when any discrete variable in *domain* has over 100 values."""
        all_vars = chain(domain.variables, domain.metas)
        # One match is enough; the warning carries no per-variable detail.
        if any(var.is_discrete and len(var.values) > 100
               for var in all_vars):
            self.Warning.performance_warning()

    def apply_domain_edit(self):
        """Apply the domain editor's changes and send the resulting table.

        ``None`` is sent when there is no data or when the edited domain has
        neither variables nor metas. The original table is sent unchanged
        when the editor returns the original domain.
        """
        self.Warning.performance_warning.clear()
        self.Warning.renamed_vars.clear()
        if self.data is None:
            table = None
        else:
            domain, cols, renamed = \
                self.domain_editor.get_domain(self.data.domain, self.data,
                                              deduplicate=True)
            if not (domain.variables or domain.metas):
                table = None
            elif domain is self.data.domain:
                table = self.data
            else:
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})
                self._inspect_discrete_variables(domain)
            if renamed:
                self.Warning.renamed_vars(f"Renamed: {', '.join(renamed)}")

        self.Warning.multiple_targets(
            shown=table is not None and len(table.domain.class_vars) > 1)
        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Report the data source, its format and the loaded data."""
        def get_ext_name(filename):
            # Format name registered for the file's extension, if any.
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~" + os.path.sep + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            # Look the format up from the real path: once the sheet name is
            # appended to the display name, its extension can no longer be
            # split off and the format would always be "unknown".
            file_format = get_ext_name(self.loaded_file)
            if self.sheet_combo.isVisible():
                name += f" ({self.sheet_combo.currentText()})"
            self.report_items("File", [("File name", name),
                                       ("Format", file_format)])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    @staticmethod
    def dragEnterEvent(event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(urls[0].toLocalFile())
                event.acceptProposedAction()
            except MissingReaderException:
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It make sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)
Example #21
0
class OWCorpus(OWWidget):
    """Widget that loads a corpus from a file or from an input table.

    String meta attributes of the loaded corpus are split between a "used"
    and an "unused" list view; whenever the used list changes, the corpus
    with those text features is sent on the ``Corpus`` output.
    """
    name = "语料库"
    description = "加载文档语料库."
    icon = "icons/TextFile.svg"
    priority = 100
    replaces = ["orangecontrib.text.widgets.owloadcorpus.OWLoadCorpus"]

    class Inputs:
        data = Input('Data', Table)

    class Outputs:
        corpus = Output('Corpus', Corpus)

    want_main_area = False
    resizing_enabled = True

    # File-dialog filter: one "all readable" entry followed by one entry per
    # distinct reader, kept in the order the readers were registered.
    dlgFormats = (
        "所有可读文档 ({});;".format('*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_files = Setting([
        "book-excerpts.tab",
        "grimm-tales-selected.tab",
        "election-tweets-2016.tab",
        "friends-transcripts.tab",
        "andersen.tab",
        "chinese-example.tab",
    ])
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("无法读取文件 {} ({})")
        no_text_features_used = Msg("至少使用一个文本特征")
        corpus_without_text_features = Msg("语料库没有文本特征")

    def __init__(self):
        """Build the controls and load the first recent file."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "语料库文件", orientation=0)
        self.file_widget = widgets.FileWidget(
            recent_files=self.recent_files,
            icon_size=(16, 16),
            on_open=self.open_file,
            dialog_format=self.dlgFormats,
            dialog_title='打开语料库文档',
            reload_label='重新加载',
            browse_label='浏览',
            allow_empty=False,
            minimal_width=250,
        )
        fbox.layout().addWidget(self.file_widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "语料库信息", addSpace=True)
        self.info_label = gui.label(ibox, self, "")
        self.update_info()

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "已使用的文本特征", addSpace=False)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # Any change of the "used" list re-applies the feature selection,
        # which is also what sends the corpus to the output.
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "未使用的文本特征", addSpace=False)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # Documentation Data Sets & Report
        box = gui.hBox(self.controlArea)
        self.browse_documentation = gui.button(
            box,
            self,
            "浏览语料库文档",
            callback=lambda: self.file_widget.browse(get_sample_corpora_dir()),
            autoDefault=False,
        )

        # load first file
        self.file_widget.select(0)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(400, 300)

    @Inputs.data
    def set_data(self, data):
        """Handle the ``Data`` input.

        While a table is connected the file controls are disabled and the
        corpus is built from the table; when the input is removed the
        controls are re-enabled and the selected file is reloaded.
        """
        have_data = data is not None

        # Enable/Disable command when data from input
        self.file_widget.setEnabled(not have_data)
        self.browse_documentation.setEnabled(not have_data)

        if have_data:
            self.open_file(data=data)
        else:
            self.file_widget.reload()

    def open_file(self, path=None, data=None):
        """Load a corpus from ``data`` (preferred) or from the file ``path``.

        Both feature lists are cleared first. If neither argument is given
        (or ``data`` is falsy and ``path`` is empty) nothing else happens.
        If reading ``path`` fails, the ``read_file`` error is shown, the
        current corpus is dropped and ``None`` is sent to the output.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                corpus = Corpus.from_file(path)
                corpus.name = os.path.splitext(os.path.basename(path))[0]
            except Exception as err:  # pylint: disable=broad-except
                # Readers can fail in arbitrary ways, hence the broad catch;
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                self.Error.read_file(path, str(err))
                # Do not keep working with a missing or stale corpus.
                self.corpus = None
                self.update_info()
                self.Outputs.corpus.send(None)
                return
            self.corpus = corpus
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        # Extending the model emits rowsInserted, which triggers
        # update_feature_selection and thereby sends the corpus.
        self.used_attrs_model.extend(self.used_attrs)
        self.unused_attrs_model.extend([
            f for f in self.corpus.domain.metas
            if f.is_string and f not in self.used_attrs_model
        ])

    def update_info(self):
        """Show a summary of the current corpus in the info label."""
        def describe(corpus):
            # Build the rich-text summary: sizes first, then the target kind.
            dom = corpus.domain
            text_feats = sum(m.is_string for m in dom.metas)
            other_feats = len(dom.attributes) + len(dom.metas) - text_feats
            text = \
                "{} 个文档, {} 个文本特征, {} 个其他特征.". \
                format(len(corpus), text_feats, other_feats)
            if dom.has_continuous_class:
                text += "<br/>回归; 数值类."
            elif dom.has_discrete_class:
                text += "<br/>分类; 离散值含有 {} 种值.". \
                    format(len(dom.class_var.values))
            elif corpus.domain.class_vars:
                text += "<br/>多目标; {} 个目标变量.".format(
                    len(corpus.domain.class_vars))
            else:
                text += "<br/>数据没有目标变量"
            text += "</p>"
            return text

        if self.corpus is None:
            self.info_label.setText("没有加载语料库")
        else:
            self.info_label.setText(describe(self.corpus))

    def update_feature_selection(self):
        """Apply the "used" list to the corpus and send the result.

        ``None`` is sent instead of the corpus when it has no variables,
        no documents or no text features.
        """
        self.Error.no_text_features_used.clear()

        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        def remove_duplicates(l):
            unique = []
            for i in l:
                if i not in unique:
                    unique.append(i)
            return unique

        if self.corpus is not None:
            self.corpus.set_text_features(
                remove_duplicates(self.used_attrs_model))
            self.used_attrs = list(self.used_attrs_model)

            if len(self.unused_attrs_model
                   ) > 0 and not self.corpus.text_features:
                self.Error.no_text_features_used()

            # prevent sending "empty" corpora
            dom = self.corpus.domain
            empty = not (dom.variables or dom.metas) \
                or len(self.corpus) == 0 \
                or not self.corpus.text_features
            self.Outputs.corpus.send(self.corpus if not empty else None)

    def send_report(self):
        """Add the file name, corpus size and feature lists to the report."""
        def describe(features):
            # Comma-separated feature names, or a placeholder when empty.
            if len(features):
                return ', '.join([f.name for f in features])
            else:
                return '(无)'

        if self.corpus is not None:
            domain = self.corpus.domain
            self.report_items('Corpus', (
                ("File", self.file_widget.get_selected_filename()),
                ("Documents", len(self.corpus)),
                ("Used text features", describe(self.used_attrs_model)),
                ("Ignored text features", describe(self.unused_attrs_model)),
                ('Other features', describe(domain.attributes)),
                ('Target', describe(domain.class_vars)),
            ))
Example #22
0
class OWVcfFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a VCF file and outputs it as a data table.

    The file is parsed into ``VariantData``; the optional quality and
    frequency thresholds are applied when the table is built in ``apply``.
    """
    name = "VCF File"
    id = "orangecontrib.variants.widgets.vcf"
    description = "Read data from a VCF file."
    icon = "icons/VCFFile.svg"
    priority = 10
    category = "Variants"
    keywords = ["data", "vcf", "file", "load", "read"]

    class Outputs:
        data = Output(
            "Data",
            Table,
            doc="Attribute-valued data set read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Files larger than this (in bytes) are not loaded automatically on start.
    SIZE_LIMIT = 1e7

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "small.vcf"),
    ])
    quality = Setting(1)
    cb_qual = Setting(True)
    frequency = Setting(1)
    cb_freq = Setting(True)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")

    def __init__(self):
        """Build the controls and schedule loading of the last used file."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.variants = None
        self.table = None
        self.loaded_file = ""

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        label = gui.widgetLabel(self, " File:  ")
        layout.addWidget(label, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        def enable_apply():
            # Changing any filter setting makes "Apply" available again.
            self.apply_button.setEnabled(True)

        box = gui.vBox(self.controlArea, "Filtering")
        _, qspin = gui.spin(box,
                            self,
                            'quality',
                            0,
                            999,
                            step=1,
                            label='Quality threshold (QT)',
                            callback=enable_apply,
                            checked='cb_qual',
                            checkCallback=enable_apply)
        qspin.setToolTip("Minimum quality to use reads.")
        _, fspin = gui.spin(box,
                            self,
                            'frequency',
                            0,
                            999,
                            step=1,
                            label='Frequency threshold (FT)',
                            callback=enable_apply,
                            checked='cb_freq',
                            checkCallback=enable_apply)
        fspin.setToolTip("Keep only variants with at least this many "
                         "occurrences of alternative alleles.")

        gui.rubber(self.controlArea)

        box = gui.hBox(self.controlArea)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)
        gui.rubber(box)

        self.apply_button = gui.button(box, self, "Apply", callback=self.apply)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        last_path = self.last_path()
        if last_path and os.path.exists(last_path) and \
                os.path.getsize(last_path) > self.SIZE_LIMIT:
            self.Warning.file_too_big()
            return

        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(500, 200)

    def select_file(self, n):
        """Select the ``n``-th recent path and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.load_data()
            self.set_file_list()

    def browse_file(self):
        """Let the user pick a VCF file in a dialog and load it."""
        start_file = self.last_path() or os.path.expanduser("~/")
        dialog_formats = "VCF files (*.vcf);;All files (*)"

        filename, _ = QFileDialog.getOpenFileName(self,
                                                  'Open Orange Data File',
                                                  start_file, dialog_formats)
        if not filename:
            return
        self.add_path(filename)
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read the most recent path and send the filtered table.

        When there is no path, the file is missing, or reading fails, the
        previously loaded data is dropped and ``None`` is sent.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        # pylint: disable=broad-except
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()
        if not self.last_path() or not os.path.exists(self.last_path()):
            if self.last_path():
                self.Error.file_not_found()
            # Forget earlier data so a later "Apply" cannot resend it.
            self.variants = self.table = None
            self.Outputs.data.send(None)
            self.info.setText("No data.")
            return

        error = None
        with catch_warnings(record=True) as warnings:
            try:
                variants = VariantData(self.last_path())
            except Exception as ex:
                log.exception(ex)
                error = ex
            # Show the last warning recorded while reading, or clear it.
            self.warning(warnings[-1].message.args[0] if warnings else '')

        if error is not None:
            self.variants = self.table = None
            self.Outputs.data.send(None)
            self.info.setText("An error occurred:\n{}".format(error))
            return

        self.loaded_file = self.last_path()
        self.variants = variants
        self.apply()  # sends data

    def update_info(self):
        """Describe the data before and after filtering in the info label."""
        def pl(x):
            # Plural suffix for a count.
            return '' if x == 1 else 's'

        text = ""
        if self.variants is not None:
            nsamples, nvariants = self.variants.gt.T.shape
            text += ("<p>Before filtering:<br/>" +
                     "&nbsp; {} sample{}, {} variant{}</p>").\
                format(nsamples, pl(nsamples), nvariants, pl(nvariants), )
        if self.table is not None:
            nsamples, nvariants = self.table.X.shape
            size = self.table.X.size
            # An empty table has no reads at all; avoid dividing by zero.
            below = np.isnan(self.table.X).sum() / size * 100 if size else 0.0
            text += ("<p>After filtering:<br/>" +
                     "&nbsp; {} sample{}, {} variant{}<br/>" +
                     "&nbsp; {:.2f}% reads below QT</p>").\
                format(nsamples, pl(nsamples), nvariants, pl(nvariants), below)
        self.info.setText(text)

    def apply(self):
        """Build the table with the enabled thresholds and send it."""
        if self.variants is None:
            self.table = None
        else:
            # A disabled threshold is passed as None.
            q = self.quality if self.cb_qual else None
            f = self.frequency if self.cb_freq else None
            self.table = self.variants.get_data(q, f)

        self.update_info()
        self.Outputs.data.send(self.table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the file name, enabled filters and the data to the report."""
        if self.table is None:
            self.report_paragraph("VCF File", "No file.")
            return
        home = os.path.expanduser("~")
        tail = self.loaded_file[len(home):]
        # Abbreviate only when the prefix ends on a path separator, so that
        # e.g. "/home/bobby/x" is not mangled for home "/home/bob".
        if self.loaded_file.startswith(home) and (
                tail[:1] in ("/", "\\") or home.endswith(("/", "\\"))):
            # os.path.join does not like ~
            name = "~" + os.path.sep + tail.lstrip("/").lstrip("\\")
        else:
            name = self.loaded_file
        self.report_items("VCF File", [
            ("File name", name),
        ])
        parameters = [("Quality", self.quality, self.cb_qual),
                      ("Frequency", self.frequency, self.cb_freq)]
        self.report_items("Filtering parameters",
                          [(label, value)
                           for label, value, enabled in parameters if enabled])
        self.report_data("Data", self.table)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.load_data()
Example #23
0
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL.

    After loading, the columns can be edited in a ``DomainEditor``; the
    (possibly edited) table is sent on the ``Data`` output.
    """
    name = "File"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["data", "file", "load", "read"]
    outputs = [
        widget.OutputSignal(
            "Data",
            Table,
            doc="Attribute-valued data set read from the input file.")
    ]

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Files larger than this (in bytes) are not loaded automatically on start.
    SIZE_LIMIT = 1e7
    # Possible values of the ``source`` setting.
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler()

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    # File-dialog filter: one "all readable" entry followed by one entry per
    # distinct reader, kept in the order the readers were registered.
    dlg_formats = ("All readable files ({});;".format(
        '*' + ' *'.join(FileFormat.readers.keys())) + ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    domain_editor = SettingProvider(DomainEditor)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")

    def __init__(self):
        """Build the controls and schedule loading of the last used source."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None,
                                self,
                                "source",
                                box=True,
                                addSpace=True,
                                callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Sheet selector; hidden until a reader reports more than one sheet.
        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(
            None,
            self,
            "xls_sheet",
            callback=self.select_sheet,
            sendSelectedValue=True,
        )
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        self.url_combo = url_combo = QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        box = gui.hBox(self.controlArea)
        gui.button(box,
                   self,
                   "Browse documentation data sets",
                   callback=lambda: self.browse_file(True),
                   autoDefault=False)
        gui.rubber(box)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)

        self.apply_button = gui.button(box,
                                       self,
                                       "Apply",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Editing a column makes "Apply" available.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QSize(600, 550)

    def select_file(self, n):
        """Select the ``n``-th recent path and load it as a local file."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def select_sheet(self):
        """Store the chosen sheet on the current path and reload."""
        self.recent_paths[0].sheet = self.sheet_combo.currentText()
        self.load_data()

    def _url_set(self):
        """Switch the source to URL and load it."""
        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        With ``in_demos`` set, the dialog starts in the documentation data
        sets directory; if that directory is missing, a message box is
        shown and nothing is loaded.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation data sets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        filename, _ = QFileDialog.getOpenFileName(self,
                                                  'Open Orange Data File',
                                                  start_file, self.dlg_formats)
        if not filename:
            return
        self.add_path(filename)
        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read the selected source and send the resulting table.

        ``None`` is sent when a local file is missing, when no reader is
        available, or when creating the reader or reading the data fails.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        # pylint: disable=broad-except
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()
        # A missing recent file must not block loading from a URL.
        if self.source == self.LOCAL_FILE and self.last_path() \
                and not os.path.exists(self.last_path()):
            self.Error.file_not_found()
            # Forget earlier data so it is neither reported nor resent.
            self.data = None
            self.send("Data", None)
            self.info.setText("No data.")
            return

        error = None
        try:
            self.reader = self._get_reader()
            if self.reader is None:
                self.data = None
                self.send("Data", None)
                self.info.setText("No data.")
                self.sheet_box.hide()
                return
        except Exception as ex:
            error = ex

        if error is None:
            self._update_sheet_combo()
            with catch_warnings(record=True) as warnings:
                try:
                    data = self.reader.read()
                except Exception as ex:
                    log.exception(ex)
                    error = ex
                # Show the last warning recorded while reading, or clear it.
                self.warning(warnings[-1].message.args[0] if warnings else '')

        if error is not None:
            self.data = None
            self.send("Data", None)
            self.info.setText("An error occurred:\n{}".format(error))
            self.sheet_box.hide()
            return

        self.info.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data

    def _get_reader(self):
        """
        Create a reader for the selected source.

        Returns
        -------
        FileFormat
            A reader for the most recent local path (with its stored sheet
            selected, if any) or a ``UrlReader`` for the text in the URL
            combo. ``None`` is returned when the URL is empty.
        """
        if self.source == self.LOCAL_FILE:
            reader = FileFormat.get_reader(self.last_path())
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        elif self.source == self.URL:
            url = self.url_combo.currentText().strip()
            if url:
                # Remember the URL so that send_report can describe it.
                self.url = url
                return UrlReader(url)

    def _update_sheet_combo(self):
        """Show the sheet selector only if the reader has several sheets."""
        if len(self.reader.sheets) < 2:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Make the sheet combo reflect the reader's selected sheet."""
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
        else:
            self.sheet_combo.setCurrentIndex(0)

    def _describe(self, table):
        """Return a rich-text summary of ``table`` for the info label."""
        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        # With both present, the name is set in bold above the description.
        if len(descs) == 2:
            descs[0] = "<b>{}</b>".format(descs[0])
        if descs:
            text += "<p>{}</p>".format("<br/>".join(descs))

        text += "<p>{} instance(s), {} feature(s), {} meta attribute(s)".\
            format(len(table), len(domain.attributes), len(domain.metas))
        if domain.has_continuous_class:
            text += "<br/>Regression; numerical class."
        elif domain.has_discrete_class:
            text += "<br/>Classification; discrete class with {} values.".\
                format(len(domain.class_var.values))
        elif table.domain.class_vars:
            text += "<br/>Multi-target; {} target variables.".format(
                len(table.domain.class_vars))
        else:
            text += "<br/>Data has no target variable."
        text += "</p>"

        # An empty table has no first/last row to index.
        if 'Timestamp' in table.domain and len(table):
            # Google Forms uses this header to timestamp responses
            text += '<p>First entry: {}<br/>Last entry: {}</p>'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp'])
        return text

    def storeSpecificSettings(self):
        """Copy the current ``variables`` into the context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore ``variables`` from the context, if it stored them."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Rebuild the table from the domain editor and send it.

        If no data is loaded, ``None`` is sent.
        """
        if self.data is not None:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            X, y, m = cols
            X = np.array(X).T if len(X) else np.empty((len(self.data), 0))
            y = np.array(y).T if len(y) else None
            # Metas need an object array as soon as one of them is a string.
            dtpe = object if any(
                isinstance(m, StringVariable) for m in domain.metas) else float
            m = np.array(m, dtype=dtpe).T if len(m) else None
            table = Table.from_numpy(domain, X, y, m, self.data.W)
            table.name = self.data.name
            table.ids = np.array(self.data.ids)
            table.attributes = getattr(self.data, 'attributes', {})
        else:
            table = self.data

        self.send("Data", table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the source, its format and the data to the report."""
        def get_ext_name(filename):
            # Reader name registered for the file's extension, if any.
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            tail = self.loaded_file[len(home):]
            # Abbreviate only when the prefix ends on a path separator, so
            # that e.g. "/home/bobby/x" is not mangled for home "/home/bob".
            if self.loaded_file.startswith(home) and (
                    tail[:1] in ("/", "\\") or home.endswith(("/", "\\"))):
                # os.path.join does not like ~
                name = "~" + os.path.sep + tail.lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            # Look the format up on the bare path: once " (sheet)" is
            # appended, the extension no longer matches any reader.
            file_format = get_ext_name(self.loaded_file)
            if self.sheet_combo.isVisible():
                name += " ({})".format(self.sheet_combo.currentText())
            self.report_items("File", [("File name", name),
                                       ("Format", file_format)])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()
Example #24
0
class OWCorpusViewer(OWWidget):
    """Widget that lists the documents of a corpus, filters them with a
    regular expression and displays the selected documents."""
    # Widget metadata
    name = "Corpus Viewer"
    description = "Display corpus contents."
    icon = "icons/CorpusViewer.svg"
    priority = 500

    class Inputs:
        corpus = Input("Corpus", Corpus, replaces=["Data"])

    class Outputs:
        # documents selected in the list, the remaining documents, and the
        # whole corpus annotated with the selection (see ``commit``)
        matching_docs = Output("Matching Docs", Corpus, default=True)
        other_docs = Output("Other Docs", Corpus)
        corpus = Output("Corpus", Corpus)

    settingsHandler = PerfectDomainContextHandler(
        match_values = PerfectDomainContextHandler.MATCH_VALUES_ALL
    )

    # Settings stored per context
    search_indices = ContextSetting([], exclude_metas=False)   # features included in search
    display_indices = ContextSetting([], exclude_metas=False)  # features for display
    display_features = ContextSetting([], exclude_metas=False)
    selected_documents = ContextSetting([])  # titles of the selected documents
    regexp_filter = ContextSetting("")       # text of the RegExp filter line edit

    # Settings stored globally
    show_tokens = Setting(False)
    autocommit = Setting(True)

    class Warning(OWWidget.Warning):
        no_feats_search = Msg('No features included in search.')
        no_feats_display = Msg('No features selected for display.')

    def __init__(self):
        """Initialise the widget state and build the control and main areas."""
        super().__init__()

        self.corpus = None              # Corpus
        self.corpus_docs = None         # Documents generated from Corpus
        self.doc_webview = None         # WebView for showing content
        self.search_features = []       # two copies are needed since Display allows drag & drop
        self.display_list_indices = [0]
        self.matches = 0                # Matches of the query

        # Info attributes
        # update_info() sets n_tokens, n_types, n_matching and n_matches,
        # which the labels below reference in their format strings
        self.update_info()
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Tokens: %(n_tokens)s')
        gui.label(info_box, self, 'Types: %(n_types)s')
        gui.label(info_box, self, 'Matching documents: %(n_matching)s')
        gui.label(info_box, self, 'Matches: %(n_matches)s')

        # Search features
        self.search_listbox = gui.listBox(
            self.controlArea, self, 'search_indices', 'search_features',
            selectionMode=QListView.ExtendedSelection,
            box='Search features', callback=self.search_features_changed)

        # Display features
        display_box = gui.widgetBox(self.controlArea, 'Display features')
        self.display_listbox = gui.listBox(
            display_box, self, 'display_list_indices', 'display_features',
            selectionMode=QListView.ExtendedSelection,
            callback=self.show_docs, enableDragDrop=True)
        self.show_tokens_checkbox = gui.checkBox(display_box, self, 'show_tokens',
                                                 'Show Tokens && Tags', callback=self.show_docs)

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data', 'Auto send is on')

        # Search
        self.filter_input = gui.lineEdit(self.mainArea, self, 'regexp_filter',
                                         orientation=Qt.Horizontal,
                                         sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
                                                                QSizePolicy.Fixed),
                                         label='RegExp Filter:',
                                         callback=self.refresh_search)

        # Main area
        # The splitter holds the document list and the document contents
        self.splitter = QSplitter(
            orientation=Qt.Horizontal,
            childrenCollapsible=False,
        )
        # Document list
        self.doc_list = QTableView()
        self.doc_list.setSelectionBehavior(QTableView.SelectRows)
        self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
        self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.doc_list.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        self.doc_list.horizontalHeader().setVisible(False)
        self.splitter.addWidget(self.doc_list)

        self.doc_list_model = QStandardItemModel(self)
        self.doc_list.setModel(self.doc_list_model)
        # The selection model is available only after the model has been set
        self.doc_list.selectionModel().selectionChanged.connect(
            self.selection_changed
        )
        # Document contents
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)

        self.mainArea.layout().addWidget(self.splitter)

    def copy_to_clipboard(self):
        """Copy the text selected in the document view to the clipboard."""
        selected = self.doc_webview.selectedText()
        clipboard = QApplication.clipboard()
        clipboard.setText(selected)

    @Inputs.corpus
    def set_data(self, corpus=None):
        """Handle a new input corpus (or ``None``) and refresh the widget.

        The previous context is closed and the widget reset. For a non-empty
        input the feature lists are rebuilt, the first document is selected
        by default, the context is opened and the views are refreshed. The
        outputs are committed in either case.
        """
        self.closeContext()
        self.reset_widget()
        self.corpus = corpus
        self.search_features = []
        if corpus is None:
            self.commit()
            return

        domain = self.corpus.domain
        # Enable/disable tokens checkbox
        has_tokens = self.corpus.has_tokens()
        if not has_tokens:
            self.show_tokens_checkbox.setCheckState(False)
        self.show_tokens_checkbox.setEnabled(has_tokens)

        # search and display keep separate list objects with equal content
        visible = list(filter_visible(chain(domain.variables, domain.metas)))
        self.search_features = visible
        self.display_features = list(visible)
        self.search_indices = list(range(len(self.search_features)))
        self.display_indices = list(range(len(self.display_features)))

        titles = corpus.titles
        if titles is not None and len(titles):
            self.selected_documents = [titles[0]]
        else:
            self.selected_documents = []

        self.openContext(self.corpus)
        self.display_list_indices = self.display_indices
        self.regenerate_docs()
        self.list_docs()
        self.update_info()
        self.set_selection()
        self.show_docs()
        self.commit()

    def reset_widget(self):
        """Clear the corpus, the GUI lists, the warnings and the web view."""
        # Corpus
        self.corpus = None
        self.corpus_docs = None
        self.display_features = []
        # Widgets
        for control in (self.search_listbox, self.display_listbox,
                        self.filter_input):
            control.clear()
        self.update_info()
        # Models/vars; the lists are emptied in place
        for values in (self.search_features, self.search_indices,
                       self.display_indices):
            values.clear()
        self.doc_list_model.clear()
        # Warnings
        self.Warning.clear()
        # WebView
        self.doc_webview.setHtml('')

    def list_docs(self):
        """ List documents into the left scrolling area

        Documents are taken from ``self.corpus`` and matched against the
        text in ``self.corpus_docs`` with the (case-insensitive) regular
        expression from ``self.regexp_filter``. Without a filter every
        document is listed. The total number of matches is stored in
        ``self.matches``. Nothing changes when the documents have not been
        generated yet or when the pattern is not a valid regular expression.
        """
        if self.corpus_docs is None:
            return
        # TODO: remove search_keyword??
        search_keyword = self.regexp_filter.strip('|')
        try:
            reg = re.compile(search_keyword, re.IGNORECASE)
        except re.error:
            # re.error is the public name of the exception; an invalid
            # pattern leaves the current list untouched
            return

        self.doc_list_model.clear()

        matches = 0
        for doc, title, content in zip(self.corpus, self.corpus.titles,
                                       self.corpus_docs):
            # count matches without materialising the match objects
            res = sum(1 for _ in reg.finditer(content)) \
                if self.regexp_filter else 0
            if not self.regexp_filter or res:
                matches += res
                item = QStandardItem()
                item.setData(str(title), Qt.DisplayRole)
                item.setData(doc, Qt.UserRole)
                self.doc_list_model.appendRow(item)
        self.matches = matches

    def get_selected_documents_from_view(self) -> Set[str]:
        """
        Collect the display-role data of the rows selected in the QTableView.

        Returns
        -------
        Set with names of selected documents in the QTableView
        """
        selected_rows = self.doc_list.selectionModel().selectedRows()
        names = set()
        for index in selected_rows:
            names.add(index.data(Qt.DisplayRole))
        return names

    def set_selection(self) -> None:
        """
        Select in the view the rows whose document name is listed in the
        selected_documents attribute
        """
        view = self.doc_list
        model = view.model()

        remembered = self.selected_documents.copy()
        selection = QItemSelection()
        for row in range(model.rowCount()):
            index = model.index(row, 0)
            if model.data(index, Qt.DisplayRole) in self.selected_documents:
                selection.append(QItemSelectionRange(index, index))
        view.selectionModel().select(
            selection, QItemSelectionModel.ClearAndSelect
        )
        if len(selection) == 0:
            # Qt does not call selection_changed for an empty selection, so
            # show_docs has to be triggered manually
            self.show_docs()
        # select() emits the selection-changed signal, which calls
        # selection_changed; while filtering this removes the documents that
        # are currently filtered out from self.selected_documents. They are
        # restored here so that they are selected again once the user
        # removes the filter.
        self.selected_documents = remembered

    def selection_changed(self) -> None:
        """
        Slot connected to the view's selectionChanged signal: remember the
        selected documents, display them and send the outputs
        """
        chosen = self.get_selected_documents_from_view()
        self.selected_documents = chosen
        self.show_docs()
        self.commit()

    def show_docs(self):
        """ Render the documents selected in the list into the right area """
        page_template = '''
        <!doctype html>
        <html>
        <head>
        <script type="text/javascript" src="resources/jquery-3.1.1.min.js">
        </script>
        <script type="text/javascript" src="resources/jquery.mark.min.js">
        </script>
        <script type="text/javascript" src="resources/highlighter.js">
        </script>
        <meta charset='utf-8'>
        <style>

        table {{ border-collapse: collapse; }}
        mark {{ background: #FFCD28; }}

        tr > td {{
            padding-bottom: 3px;
            padding-top: 3px;
        }}

        body {{
            font-family: Helvetica;
            font-size: 10pt;
        }}

        .line {{ border-bottom: 1px solid #000; }}
        .separator {{ height: 5px; }}

        .variables {{
            vertical-align: top;
            padding-right: 10px;
        }}
        
        .content {{
            /* Adopted from https://css-tricks.com/snippets/css/prevent-long-urls-from-breaking-out-of-container/ */
        
            /* These are technically the same, but use both */
            overflow-wrap: break-word;
            word-wrap: break-word;
        
            -ms-word-break: break-all;
            /* This is the dangerous one in WebKit, as it breaks things wherever */
            word-break: break-all;
            /* Instead use this non-standard one: */
            word-break: break-word;
        
            /* Adds a hyphen where the word breaks, if supported (No Blink) */
            -ms-hyphens: auto;
            -moz-hyphens: auto;
            -webkit-hyphens: auto;
            hyphens: auto;
        }}

        .token {{
            padding: 3px;
            border: 1px #B0B0B0 solid;
            margin-right: 5px;
            margin-bottom: 5px;
            display: inline-block;
        }}

        img {{
            max-width: 100%;
        }}

        </style>
        </head>
        <body>
        {}
        </body>
        </html>
        '''
        # the display selection is mirrored even when there is no corpus
        self.display_indices = self.display_list_indices
        if self.corpus is None:
            return

        self.Warning.no_feats_display.clear()
        if len(self.display_indices) == 0:
            self.Warning.no_feats_display()

        if self.show_tokens:
            tokens = list(self.corpus.ngrams_iterator(include_postags=True))

        # features whose values get the search matches highlighted
        marked_search_features = [
            feat for pos, feat in enumerate(self.search_features)
            if pos in self.search_indices
        ]

        separator = '<tr class="line separator"><td/><td/></tr>' \
                    '<tr class="separator"><td/><td/></tr>'
        feature_row = '<tr><td class="variables"><strong>{}:</strong></td>' \
                      '<td class="content">{}</td></tr>'
        tokens_row = '<tr><td class="variables"><strong>Tokens & Tags:</strong></td>' \
                     '<td>{}</td></tr>'

        chunks = ['<table>']
        selected_rows = self.doc_list.selectionModel().selectedRows()
        for doc_count, index in enumerate(selected_rows):
            if doc_count > 0:   # add split
                chunks.append(separator)

            row_ind = index.data(Qt.UserRole).row_index
            for ind in self.display_indices:
                feature = self.display_features[ind]
                value = str(index.data(Qt.UserRole)[feature.name])
                if feature in marked_search_features:
                    value = self.__mark_text(value)
                value = value.replace('\n', '<br/>')
                is_image = feature.attributes.get('type', '') == 'image'
                if is_image and value != '?':
                    value = '<img src="{}"></img>'.format(value)
                chunks.append(feature_row.format(feature.name, value))

            if self.show_tokens:
                spans = ''.join('<span class="token">{}</span>'.format(token)
                                for token in tokens[row_ind])
                chunks.append(tokens_row.format(spans))

        chunks.append('</table>')
        base = QUrl.fromLocalFile(__file__)
        self.doc_webview.setHtml(page_template.format(''.join(chunks)), base)

    def __mark_text(self, text):
        search_keyword = self.regexp_filter.strip('|')
        if not search_keyword:
            return text

        try:
            reg = re.compile(search_keyword, re.IGNORECASE | re.MULTILINE)
        except sre_constants.error:
            return text

        matches = list(reg.finditer(text))
        if not matches:
            return text

        text = list(text)
        for m in matches[::-1]:
            text[m.start():m.end()] = list('<mark data-markjs="true">{}</mark>'\
                .format("".join(text[m.start():m.end()])))

        return "".join(text)

    def search_features_changed(self):
        """Rebuild the searchable documents, then rerun the search."""
        for step in (self.regenerate_docs, self.refresh_search):
            step()

    def regenerate_docs(self):
        """Regenerate ``corpus_docs`` from the selected search features.

        The documents are reset to ``None`` when there is no corpus; a
        warning is shown when no search feature is selected.
        """
        self.corpus_docs = None
        self.Warning.no_feats_search.clear()
        if self.corpus is None:
            return

        feats = []
        for position in self.search_indices:
            feats.append(self.search_features[position])
        if not feats:
            self.Warning.no_feats_search()
        self.corpus_docs = self.corpus.documents_from_features(feats)

    def refresh_search(self):
        """Re-list the documents, restore the selection, update the info
        labels and send the outputs; does nothing without a corpus."""
        if self.corpus is None:
            return
        self.list_docs()
        self.set_selection()
        self.update_info()
        self.commit()

    def update_info(self):
        """Refresh the attributes shown by the labels of the Info box.

        Without a corpus all four values are empty strings; token-based
        values are 'n/a' when the corpus has no tokens.
        """
        if self.corpus is None:
            self.n_matching = self.n_matches = ''
            self.n_tokens = self.n_types = ''
            return

        listed = self.doc_list_model.rowCount()
        self.n_matching = '{}/{}'.format(listed, len(self.corpus))
        self.n_matches = self.matches or 'n/a'
        if self.corpus.has_tokens():
            self.n_tokens = sum(len(doc_tokens)
                                for doc_tokens in self.corpus.tokens)
            self.n_types = len(self.corpus.dictionary)
        else:
            self.n_tokens = 'n/a'
            self.n_types = 'n/a'

    def commit(self):
        """Send the selected documents, the remaining documents and the
        annotated corpus; all three outputs are ``None`` without a corpus."""
        matched = unmatched = annotated_corpus = None
        corpus = self.corpus
        if corpus is not None:
            # only documents that are selected in the view are returned
            selected_docs = self.get_selected_documents_from_view()
            matched_mask, unmatched_mask = [], []
            for position, title in enumerate(corpus.titles):
                if title in selected_docs:
                    matched_mask.append(position)
                else:
                    unmatched_mask.append(position)

            if len(matched_mask):
                matched = corpus[matched_mask]
            if len(unmatched_mask):
                unmatched = corpus[unmatched_mask]
            annotated_corpus = create_annotated_table(corpus, matched_mask)
        self.Outputs.matching_docs.send(matched)
        self.Outputs.other_docs.send(unmatched)
        self.Outputs.corpus.send(annotated_corpus)

    def send_report(self):
        """Report the query and the current match statistics."""
        query_item = ("Query", self.regexp_filter)
        matching_item = ("Matching documents", self.n_matching)
        matches_item = ("Matches", self.n_matches)
        self.report_items((query_item, matching_item, matches_item))

    def showEvent(self, event):
        """Run the base class handler, then adjust the splitter sizes."""
        super().showEvent(event)
        # the splitter proportions are only adjusted when the widget is shown
        self.update_splitter()

    def update_splitter(self):
        """
        Update splitter so that the document list on the left never takes
        more than 1/3 of the space. It is only set on showEvent. If the user
        later changes the sizes they stay as they are.
        """
        w1, w2 = self.splitter.sizes()
        ws = w1 + w2
        if w2 < 2/3 * ws:
            # setSizes expects integers; floats are rejected by PyQt on
            # Python 3.10+. The remainder goes to the right pane so that the
            # two sizes still add up to the total width.
            left = ws // 3
            self.splitter.setSizes([left, ws - left])
Example #25
0
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL and sends
    it to the output."""
    # Widget metadata
    name = "File"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["file", "load", "read", "open"]

    class Outputs:
        data = Output("Data",
                      Table,
                      doc="Attribute-valued dataset read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Local files larger than this (os.path.getsize) are not loaded
    # automatically in __init__
    SIZE_LIMIT = 1e7
    # Possible values of the ``source`` setting
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # pylint seems to want declarations separated from definitions
    recent_paths: List[RecentPath]
    recent_urls: List[str]
    variables: list

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
        RecentPath("", "sample-datasets", "brown-selected.tab"),
        RecentPath("", "sample-datasets", "zoo.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)  # LOCAL_FILE or URL
    xls_sheet = ContextSetting("")  # bound to the sheet combo box
    sheet_names = Setting({})
    url = Setting("")

    # stored in / restored from the context by storeSpecificSettings and
    # retrieveSpecificSettings
    variables = ContextSetting([])

    domain_editor = SettingProvider(DomainEditor)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        sheet_error = widget.Msg("Error listing available sheets.")
        unknown = widget.Msg("Read error:\n{}")

    # Sentinel returned by _get_reader when there is no file or URL to read
    class NoFileSelected:
        pass

    UserAdviceMessages = [
        widget.Message(
            "Use CSV File Import widget for advanced options "
            "for comma-separated files", "use-csv-file-import"),
        widget.Message(
            "This widget loads only tabular data. Use other widgets to load "
            "other data types like models, distance matrices and networks.",
            "other-data-types")
    ]

    def __init__(self):
        """Build the GUI and schedule loading of the most recent source."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        # Source selection: a grid with the File row, the Sheet row and the
        # URL row
        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None,
                                self,
                                "source",
                                box=True,
                                addSpace=True,
                                callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        # Combo box with recent files
        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        # Browse button
        file_button = gui.button(None,
                                 self,
                                 '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        # Reload button
        reload_button = gui.button(None,
                                   self,
                                   "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Sheet selection; hidden until _update_sheet_combo shows it
        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(
            None,
            self,
            "xls_sheet",
            callback=self.select_sheet,
            sendSelectedValue=True,
        )
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        # Editable combo box with recent URLs
        self.url_combo = url_combo = QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.Ignored, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)
        # with the completer we make the combo box case sensitive when
        # matching the history
        completer = QCompleter()
        completer.setCaseSensitivity(Qt.CaseSensitive)
        url_combo.setCompleter(completer)

        # Info box with the data description and warnings
        box = gui.vBox(self.controlArea, "Info")
        self.infolabel = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        # Domain editor
        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        # Bottom row of buttons
        box = gui.hBox(self.controlArea)
        gui.button(box,
                   self,
                   "Browse documentation datasets",
                   callback=lambda: self.browse_file(True),
                   autoDefault=False)
        gui.rubber(box)

        gui.button(box, self, "Reset", callback=self.reset_domain_edit)
        self.apply_button = gui.button(box,
                                       self,
                                       "Apply",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Apply becomes available once the domain editor's data changes
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        # Existing local files above SIZE_LIMIT are not loaded automatically;
        # a warning is shown instead
        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        # Loading is deferred to the event loop (see the note above)
        QTimer.singleShot(0, self.load_data)

    @staticmethod
    def sizeHint():
        """Return the size hint of the widget: 600 x 550."""
        width, height = 600, 550
        return QSize(width, height)

    def select_file(self, n):
        """Select the n-th recent path and load it as a local file."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if not self.recent_paths:
            return
        self.source = self.LOCAL_FILE
        self.load_data()
        self.set_file_list()

    def select_sheet(self):
        """Store the chosen sheet on the most recent path and reload."""
        chosen_sheet = self.sheet_combo.currentText()
        self.recent_paths[0].sheet = chosen_sheet
        self.load_data()

    def _url_set(self):
        """Normalise the URL entered in the combo box and load from it.

        A URL without a scheme gets the ``http://`` prefix, both in the
        combo box and in the list of recent URLs.
        """
        entered = self.url_combo.currentText()
        # the position is looked up with the text exactly as entered
        pos = self.recent_urls.index(entered)
        url = entered.strip()

        has_scheme = bool(urlparse(url).scheme)
        if not has_scheme:
            url = 'http://' + url
            self.url_combo.setItemText(pos, url)
            self.recent_urls[pos] = url

        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        With ``in_demos`` the dialog starts in the sample datasets
        directory; otherwise in the last used path or the home directory.
        Nothing happens when the dialog returns no file name.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation datasets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        # only formats that have both a read method and extensions
        readers = []
        for file_format in FileFormat.formats:
            if getattr(file_format, 'read', None) \
                    and getattr(file_format, "EXTENSIONS", None):
                readers.append(file_format)

        filename, reader, _ = open_filename_dialog(start_file, None, readers)
        if not filename:
            return

        self.add_path(filename)
        if reader is not None:
            # remember the reader chosen in the dialog for this path
            self.recent_paths[0].file_format = reader.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Reset the widget state and try to load the selected source.

        When loading fails, the error is shown, the data is dropped and
        ``None`` is sent to the output.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if not error:
            return

        # _try_load returns a callable that shows the error message
        error()
        self.data = None
        self.sheet_box.hide()
        self.Outputs.data.send(None)
        self.infolabel.setText("No data.")

    def _try_load(self):
        """Read the data from the selected source and send it on.

        Returns ``None`` on success (and when no file is selected), or a
        callable that shows the appropriate error message when called.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found

        try:
            self.reader = self._get_reader()
            assert self.reader is not None
        except Exception:
            return self.Error.missing_reader

        # nothing to read: clear the output, but this is not an error
        if self.reader is self.NoFileSelected:
            self.Outputs.data.send(None)
            return None

        try:
            self._update_sheet_combo()
        except Exception:
            return self.Error.sheet_error

        # warnings issued while reading are recorded; the last one is shown
        with catch_warnings(record=True) as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                # ex is bound as a default argument because the name is
                # unbound once the except block ends
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.infolabel.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data
        return None

    def _get_reader(self) -> FileFormat:
        """Return a reader for the current source.

        ``self.NoFileSelected`` is returned when there is no local path or
        the URL is empty.
        """
        if self.source != self.LOCAL_FILE:
            url = self.url_combo.currentText().strip()
            return UrlReader(url) if url else self.NoFileSelected

        path = self.last_path()
        if path is None:
            return self.NoFileSelected

        if self.recent_paths and self.recent_paths[0].file_format:
            # a reader class was stored for this path; use it
            reader_class = class_from_qualified_name(
                self.recent_paths[0].file_format)
            reader = reader_class(path)
        else:
            reader = FileFormat.get_reader(path)

        if self.recent_paths and self.recent_paths[0].sheet:
            reader.select_sheet(self.recent_paths[0].sheet)
        return reader

    def _update_sheet_combo(self):
        """Show the sheet combo filled with the reader's sheets, or hide it
        and deselect the sheet when the reader has fewer than two sheets."""
        has_choice = len(self.reader.sheets) >= 2
        if not has_choice:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        combo = self.sheet_combo
        combo.clear()
        combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Make the sheet combo show the reader's active sheet; the first
        item is shown when the reader has no sheet selected."""
        if not self.reader.sheet:
            self.sheet_combo.setCurrentIndex(0)
            return

        try:
            position = self.reader.sheets.index(self.reader.sheet)
            self.sheet_combo.setCurrentIndex(position)
        except ValueError:
            # Requested sheet does not exist in this file
            self.reader.select_sheet(None)

    @staticmethod
    def _describe(table):
        def missing_prop(prop):
            if prop:
                return f"({prop * 100:.1f}% missing values)"
            else:
                return "(no missing values)"

        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        if len(descs) == 2:
            descs[0] = f"<b>{descs[0]}</b>"
        if descs:
            text += f"<p>{'<br/>'.join(descs)}</p>"

        text += f"<p>{len(table)} instance(s)"

        missing_in_attr = missing_prop(table.has_missing_attribute()
                                       and table.get_nan_frequency_attribute())
        missing_in_class = missing_prop(table.has_missing_class()
                                        and table.get_nan_frequency_class())
        text += f"<br/>{len(domain.attributes)} feature(s) {missing_in_attr}"
        if domain.has_continuous_class:
            text += f"<br/>Regression; numerical class {missing_in_class}"
        elif domain.has_discrete_class:
            text += "<br/>Classification; categorical class " \
                f"with {len(domain.class_var.values)} values {missing_in_class}"
        elif table.domain.class_vars:
            text += "<br/>Multi-target; " \
                f"{len(table.domain.class_vars)} target variables " \
                f"{missing_in_class}"
        else:
            text += "<br/>Data has no target variable."
        text += f"<br/>{len(domain.metas)} meta attribute(s)"
        text += "</p>"

        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            text += f"<p>First entry: {table[0, 'Timestamp']}<br/>" \
                f"Last entry: {table[-1, 'Timestamp']}</p>"
        return text

    def storeSpecificSettings(self):
        """Save a copy of ``variables`` on the current context."""
        snapshot = list(self.variables)
        self.current_context.modified_variables = snapshot

    def retrieveSpecificSettings(self):
        """Restore ``variables`` in place from the current context, if the
        context has them stored."""
        context = self.current_context
        if not hasattr(context, "modified_variables"):
            return
        # slice assignment keeps the identity of the list
        self.variables[:] = context.modified_variables

    def reset_domain_edit(self):
        """Reset the domain editor and apply the resulting domain."""
        editor = self.domain_editor
        editor.reset_domain()
        self.apply_domain_edit()

    def apply_domain_edit(self):
        """Build a table with the edited domain and send it to the output.

        ``None`` is sent when there is no data or when the edited domain has
        neither variables nor metas. The Apply button is disabled afterwards.
        """
        table = None
        if self.data is not None:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if domain.variables or domain.metas:
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                # carry over the name, ids and attributes of the loaded data
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})

        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name with its extension removed."""
        base_name = os.path.basename(self.loaded_file)
        stem, _ = os.path.splitext(base_name)
        return stem

    def send_report(self):
        """Add the data source, its format and the loaded data to the report.

        For a local file the path is abbreviated with ``~`` when it lies
        inside the user's home directory, and the selected sheet is appended
        when the sheet combo is visible. When no data is loaded only a
        "No file." paragraph is reported.
        """
        def get_ext_name(filename):
            # Map the file extension to the format name known to FileFormat
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            separators = ("/", "\\")
            tail = self.loaded_file[len(home):]
            # Abbreviate only when ``home`` is a whole leading path component,
            # so that e.g. /home/ann2/x.tab is not reported as ~/2/x.tab
            in_home = self.loaded_file.startswith(home) and (
                not tail
                or tail.startswith(separators)
                or home.endswith(separators))
            if in_home:
                # os.path.join does not like ~
                name = "~" + os.path.sep + tail.lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            if self.sheet_combo.isVisible():
                name += f" ({self.sheet_combo.currentText()})"
            # The format is derived from the real path: the displayed name may
            # end with " (sheet)", which would hide the file extension
            self.report_items("File", [("File name", name),
                                       ("Format",
                                        get_ext_name(self.loaded_file))])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    @staticmethod
    def dragEnterEvent(event):
        """Accept the drag if a reader is found for the first dragged URL."""
        dragged = event.mimeData().urls()
        if not dragged:
            return
        try:
            FileFormat.get_reader(dragged[0].toLocalFile())
            event.acceptProposedAction()
        except IOError:
            # best effort: the event is simply left unaccepted
            return

    def dropEvent(self, event):
        """Register the first dropped file as a local path and load it."""
        dropped = event.mimeData().urls()
        if not dropped:
            return
        # only the first dropped file is used
        local_path = dropped[0].toLocalFile()
        self.add_path(local_path)
        self.source = self.LOCAL_FILE
        self.load_data()

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Called when the workflow environment changes (e.g. while saving the
        scheme). The change is forwarded to update_file_list so that all
        values connected to the environment are updated (e.g. relative file
        paths are changed).
        """
        change = (key, value, oldvalue)
        self.update_file_list(*change)
Example #26
0
class OWCorpus(OWWidget):
    """Widget that loads a corpus of text documents from a file."""
    name = "Corpus"
    description = "Load a corpus of text documents."
    icon = "icons/TextFile.svg"
    priority = 10
    replaces = ["orangecontrib.text.widgets.owloadcorpus.OWLoadCorpus"]

    class Outputs:
        corpus = Output("Corpus", Corpus)

    want_main_area = False
    resizing_enabled = True

    # File-dialog filter string: an "All readable files" entry listing every
    # key of FileFormat.readers, followed by one "<description> (*<ext> ...)"
    # entry per distinct reader, ordered by the reader's first occurrence in
    # FileFormat.readers.
    dlgFormats = ("All readable files ({});;".format(
        '*' + ' *'.join(FileFormat.readers.keys())) + ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # Default content of the recent-files list handed to the file widget.
    recent_files = Setting([
        "book-excerpts.tab",
        "grimm-tales-selected.tab",
        "election-tweets-2016.tab",
        "friends-transcripts.tab",
        "andersen.tab",
    ])
    # Text features currently in use; stored as a context setting.
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("Can't read file {} ({})")

    def __init__(self):
        """Build the widget's controls and load the first recent file."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "Corpus file", orientation=0)
        self.file_widget = widgets.FileWidget(
            recent_files=self.recent_files,
            icon_size=(16, 16),
            on_open=self.open_file,
            dialog_format=self.dlgFormats,
            dialog_title='Open Orange Document Corpus',
            reload_label='Reload',
            browse_label='Browse',
            allow_empty=False,
            minimal_width=250,
        )
        fbox.layout().addWidget(self.file_widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "Corpus info", addSpace=True)
        self.info_label = gui.label(ibox, self, "")
        # no corpus is loaded yet, so this shows the "No corpus loaded." text
        self.update_info()

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # any change of the used-features list re-applies the selection
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # Documentation Data Sets & Report
        box = gui.hBox(self.controlArea)
        gui.button(
            box,
            self,
            "Browse documentation corpora",
            callback=lambda: self.file_widget.browse(get_sample_corpora_dir()),
            autoDefault=False,
        )
        box.layout().addWidget(self.report_button)

        # load first file
        self.file_widget.select(0)

    def open_file(self, path):
        """Load the corpus stored at ``path`` and refresh the feature lists.

        The current context is closed, the read error is cleared and both
        feature lists are emptied first.  When ``path`` is empty nothing
        else happens.  A failure while loading is reported through
        ``Error.read_file`` rather than raised.
        """
        self.closeContext()
        self.Error.read_file.clear()
        self.used_attrs_model[:] = []
        self.unused_attrs_model[:] = []
        if not path:
            return
        try:
            self.corpus = Corpus.from_file(path)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
            self.update_info()
            self.used_attrs = list(self.corpus.text_features)
            # may replace used_attrs with the value stored in the context
            self.openContext(self.corpus)
            self.used_attrs_model.extend(self.used_attrs)
            self.unused_attrs_model.extend([
                f for f in self.corpus.domain.metas
                if f.is_string and f not in self.used_attrs_model
            ])
        except Exception as err:
            # Exception, not BaseException: KeyboardInterrupt and SystemExit
            # must not be turned into a "can't read file" message
            self.Error.read_file(path, str(err))

    def update_info(self):
        """Show a short description of the loaded corpus in the info label."""
        def describe(corpus):
            # Build the rich-text summary: document/feature counts followed
            # by a line describing the target variable(s).
            dom = corpus.domain
            text_feats = sum(m.is_string for m in dom.metas)
            other_feats = len(dom.attributes) + len(dom.metas) - text_feats
            text = \
                "{} document(s), {} text feature(s), {} other feature(s).". \
                format(len(corpus), text_feats, other_feats)
            if dom.has_continuous_class:
                text += "<br/>Regression; numerical class."
            elif dom.has_discrete_class:
                text += "<br/>Classification; discrete class with {} values.". \
                    format(len(dom.class_var.values))
            elif dom.class_vars:
                text += "<br/>Multi-target; {} target variables.".format(
                    len(dom.class_vars))
            else:
                text += "<br/>Data has no target variable."
            # no closing </p>: the text never opens a paragraph
            return text

        if self.corpus is None:
            self.info_label.setText("No corpus loaded.")
        else:
            self.info_label.setText(describe(self.corpus))

    def update_feature_selection(self):
        """Apply the used-features list to the corpus and send the corpus."""
        if self.corpus is None:
            return

        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        distinct = []
        for var in self.used_attrs_model:
            if var not in distinct:
                distinct.append(var)
        self.corpus.set_text_features(distinct)
        self.used_attrs = list(self.used_attrs_model)

        # prevent sending "empty" corpora
        dom = self.corpus.domain
        has_columns = bool(dom.variables or dom.metas)
        if has_columns and len(self.corpus) != 0:
            self.Outputs.corpus.send(self.corpus)
        else:
            self.Outputs.corpus.send(None)

    def send_report(self):
        """Add file name, document count and feature lists to the report."""
        def names_of(features):
            # comma-separated variable names, or a placeholder when empty
            if not len(features):
                return '(none)'
            return ', '.join([f.name for f in features])

        if self.corpus is None:
            return

        domain = self.corpus.domain
        rows = (
            ("File", self.file_widget.get_selected_filename()),
            ("Documents", len(self.corpus)),
            ("Used text features", names_of(self.used_attrs_model)),
            ("Ignored text features", names_of(self.unused_attrs_model)),
            ('Other features', names_of(domain.attributes)),
            ('Target', names_of(domain.class_vars)),
        )
        self.report_items('Corpus', rows)
Example #27
0
class OWScoreDocuments(OWWidget, ConcurrentWidgetMixin):
    """Widget that scores the documents of a corpus against input words,
    lists the scores in a table and outputs the selected documents."""
    name = "Score Documents"
    description = ""
    icon = "icons/ScoreDocuments.svg"
    priority = 500

    buttons_area_orientation = Qt.Vertical

    # default order - table sorted in input order
    DEFAULT_SORTING = (-1, Qt.AscendingOrder)

    settingsHandler = PerfectDomainContextHandler()
    auto_commit: bool = Setting(True)
    # index of the selected entry of AGGREGATIONS
    aggregation: int = Setting(0)

    # one boolean setting per scoring method (the keys of SCORING_METHODS)
    word_frequency: bool = Setting(True)
    word_appearance: bool = Setting(False)
    embedding_similarity: bool = Setting(False)
    embedding_language: int = Setting(0)

    # (column index, sort order) applied to the table header
    sort_column_order: Tuple[int, int] = Setting(DEFAULT_SORTING)
    # source-model rows of the selected documents
    selected_rows: List[int] = ContextSetting([], schema_only=True)
    sel_method: int = ContextSetting(SelectionMethods.N_BEST)
    n_selected: int = ContextSetting(3)

    class Inputs:
        corpus = Input("Corpus", Corpus)
        words = Input("Words", Table)

    class Outputs:
        selected_documents = Output("Selected documents", Corpus, default=True)
        corpus = Output("Corpus", Corpus)

    class Warning(OWWidget.Warning):
        corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")

    class Error(OWWidget.Error):
        custom_err = Msg("{}")

    def __init__(self):
        """Initialise both base classes, build the GUI and reset the state."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self._setup_control_area()
        self._setup_main_area()
        self.corpus = None
        self.words = None
        # cache of computed scores, keyed by (scoring method, aggregation),
        # so that the same score is not computed more than once
        self.scores = {}

    def _setup_control_area(self) -> None:
        """Build the scoring, aggregation, selection and auto-commit controls."""
        methods_box = gui.widgetBox(self.controlArea, "Word Scoring Methods")
        for setting, (title, _, tip) in SCORING_METHODS.items():
            row = gui.hBox(methods_box, margin=0)
            gui.checkBox(
                row, self, setting, label=title,
                callback=self.__setting_changed, tooltip=tip,
            )
            if setting in ADDITIONAL_OPTIONS:
                # the method has an extra option shown next to its checkbox
                option_setting, option_items = ADDITIONAL_OPTIONS[setting]
                gui.comboBox(
                    row, self, option_setting, items=option_items,
                    callback=self.__setting_changed,
                )

        aggregation_box = gui.widgetBox(self.controlArea, "Aggregation")
        gui.comboBox(
            aggregation_box, self, "aggregation",
            items=list(AGGREGATIONS),
            callback=self.__setting_changed,
        )

        gui.rubber(self.controlArea)

        # select words box
        selection_box = gui.vBox(self.buttonsArea, "Select Documents")
        layout = QGridLayout()
        layout.setContentsMargins(0, 0, 0, 0)

        self._sel_method_buttons = QButtonGroup()
        for method_id, caption in enumerate(SelectionMethods.ITEMS):
            radio = QRadioButton(caption)
            radio.setChecked(method_id == self.sel_method)
            layout.addWidget(radio, method_id, 0)
            self._sel_method_buttons.addButton(radio, method_id)
        self._sel_method_buttons.buttonClicked[int].connect(
            self.__set_selection_method)

        def choose_n_best():
            # changing the count switches the selection method to N_BEST
            self.__set_selection_method(SelectionMethods.N_BEST)

        spin = gui.spin(
            selection_box, self, "n_selected", 1, 999,
            addToLayout=False, callback=choose_n_best,
        )
        layout.addWidget(spin, 3, 1)
        selection_box.layout().addLayout(layout)

        # autocommit
        gui.auto_send(self.buttonsArea, self, "auto_commit")

    def _setup_main_area(self) -> None:
        """Build the filter line edit and the score table in the main area.

        The table view shows ``self.model`` through a filtering proxy model
        that matches the first column case-insensitively.
        """
        self._filter_line_edit = QLineEdit(
            textChanged=self.__on_filter_changed, placeholderText="Filter...")
        self.mainArea.layout().addWidget(self._filter_line_edit)

        self.model = model = ScoreDocumentsTableModel(parent=self)
        model.setHorizontalHeaderLabels(["Document"])

        def select_manual():
            # pressing anywhere in the view switches to manual selection
            self.__set_selection_method(SelectionMethods.MANUAL)

        self.view = view = ScoreDocumentsTableView()
        view.pressedAny.connect(select_manual)
        self.mainArea.layout().addWidget(view)
        # by default data are sorted in the Table order
        header = self.view.horizontalHeader()
        header.sectionClicked.connect(self.__on_horizontal_header_clicked)

        proxy_model = ScoreDocumentsProxyModel()
        proxy_model.setFilterKeyColumn(0)
        # explicit enum (same value as the former ``False``): the setter
        # expects a Qt.CaseSensitivity, not a bool
        proxy_model.setFilterCaseSensitivity(Qt.CaseInsensitive)
        view.setModel(proxy_model)
        view.model().setSourceModel(self.model)
        self.view.selectionModel().selectionChanged.connect(
            self.__on_selection_change)

    def __on_filter_changed(self) -> None:
        """Filter the table by the stripped text of the filter line edit."""
        needle = self._filter_line_edit.text().strip()
        self.view.model().setFilterFixedString(needle)

    def __on_horizontal_header_clicked(self, index: int):
        """Store the new sorting and refresh selection and output."""
        sort_order = self.view.horizontalHeader().sortIndicatorOrder()
        self.sort_column_order = (index, sort_order)
        self._select_rows()
        # when sorting change output table must consider the new order
        # call explicitly since selection in table is not changed
        manual_with_rows = (self.sel_method == SelectionMethods.MANUAL
                            and self.selected_rows)
        if manual_with_rows or self.sel_method == SelectionMethods.ALL:
            # retrieve selection in new order
            self.selected_rows = self.get_selected_indices()
            self._send_output()

    def __on_selection_change(self):
        """Store the rows selected in the view and send the outputs."""
        self.selected_rows = self.get_selected_indices()
        self._send_output()

    def __set_selection_method(self, method: int):
        """Switch to selection ``method``, check its radio button and
        re-apply the selection to the table."""
        self.sel_method = method
        self._sel_method_buttons.button(method).setChecked(True)
        self._select_rows()

    @Inputs.corpus
    def set_data(self, corpus: Corpus) -> None:
        """Handle a new corpus on the input (``None`` removes it)."""
        self.closeContext()
        self.Warning.corpus_not_normalized.clear()
        if corpus is not None:
            if not self._is_corpus_normalized(corpus):
                self.Warning.corpus_not_normalized()
            self.corpus = corpus
            self.selected_rows = []
            self.openContext(corpus)
            # the context may have changed sel_method; sync the radio buttons
            self._sel_method_buttons.button(self.sel_method).setChecked(True)
        else:
            self.corpus = None
        self._clear_and_run()

    @staticmethod
    def _get_word_attribute(words: Table):
        """Return the values of the string column that holds the words.

        The column is the first string variable (metas are searched before
        the other variables) whose ``type`` attribute equals ``"words"``.
        Without such a variable, the string variable with the lowest average
        number of words per value is used.

        Returns
        -------
        The column values as a list, or None when ``words`` has no string
        variable.  (The former ``-> None`` annotation was incorrect and has
        been dropped.)
        """
        attrs = [
            a for a in words.domain.metas + words.domain.variables
            if isinstance(a, StringVariable)
        ]
        if not attrs:
            return None
        words_attr = next(
            (a for a in attrs if a.attributes.get("type", "") == "words"),
            None)
        if words_attr is None:
            # find the most suitable attribute - one with lowest average text
            # length - counted as a number of words
            def avg_len(attr):
                array_ = words.get_column_view(attr)[0]
                array_ = array_[~isnull(array_)]
                if len(array_) == 0:
                    # no values to average (e.g. a table without rows):
                    # rank the column last instead of dividing by zero
                    return float("inf")
                return sum(len(a.split()) for a in array_) / len(array_)

            words_attr = min(attrs, key=avg_len)
        return words.get_column_view(words_attr)[0].tolist()

    @Inputs.words
    def set_words(self, words: Table) -> None:
        """Handle a new words table; ``None`` or a table without columns
        clears the stored words."""
        has_columns = (
            words is not None
            and len(words.domain.variables + words.domain.metas) != 0
        )
        if has_columns:
            self.words = self._get_word_attribute(words)
        else:
            self.words = None
        self._clear_and_run()

    def _gather_scores(self) -> Tuple[np.ndarray, List[str]]:
        """
        Collect the cached scores of the active scorers for the active
        aggregation.

        Returns
        -------
        scores
            Array with one column per available score; it has no columns
            when nothing is cached and shape (0, 0) when there is no corpus
        labels
            Display names of the scores, in column order
        """
        if self.corpus is None:
            return np.empty((0, 0)), []

        aggregation = self._get_active_aggregation()
        available = [
            method for method in self._get_active_scorers()
            if (method, aggregation) in self.scores
        ]
        if available:
            scores = np.column_stack(
                [self.scores[(method, aggregation)] for method in available])
        else:
            scores = np.empty((len(self.corpus), 0))
        labels = [SCORING_METHODS[method][0] for method in available]
        return scores, labels

    def _send_output(self) -> None:
        """
        Create corpus with scores and output it

        The available scores are appended to the corpus as continuous meta
        columns.  The "Corpus" output carries all documents annotated with
        the selection; "Selected documents" carries only the selected rows,
        or None when nothing is selected.  Both outputs are None without an
        input corpus.
        """
        if self.corpus is None:
            self.Outputs.corpus.send(None)
            self.Outputs.selected_documents.send(None)
            return

        scores, labels = self._gather_scores()
        if labels:
            d = self.corpus.domain
            score_vars = tuple(
                ContinuousVariable(get_unique_names(d, l)) for l in labels)
            # class_vars (not class_var) so that the new domain keeps every
            # target of a multi-target corpus and stays in step with corpus.Y
            domain = Domain(
                d.attributes,
                d.class_vars,
                metas=d.metas + score_vars,
            )
            out_corpus = Corpus(
                domain,
                self.corpus.X,
                self.corpus.Y,
                np.hstack([self.corpus.metas, scores]),
            )
            Corpus.retain_preprocessing(self.corpus, out_corpus)
        else:
            out_corpus = self.corpus

        self.Outputs.corpus.send(
            create_annotated_table(out_corpus, self.selected_rows))
        self.Outputs.selected_documents.send(
            out_corpus[self.selected_rows] if self.selected_rows else None)

    def _fill_table(self) -> None:
        """
        Fill the table in the widget with scores and document names.

        Clears the model when there is no corpus.  Otherwise the sort
        indicator and the selection are reset before the model is refilled,
        then the stored sorting (when applicable) and the selection are
        applied again.
        """
        if self.corpus is None:
            self.model.clear()
            return
        scores, labels = self._gather_scores()
        labels = ["Document"] + labels
        titles = self.corpus.titles.tolist()

        # clearing selection and sorting to prevent SEGFAULT on model.wrap
        self.view.horizontalHeader().setSortIndicator(-1, Qt.AscendingOrder)
        # the selection handler is disconnected while clearing so that the
        # stored selected_rows are not overwritten
        with disconnected(self.view.selectionModel().selectionChanged,
                          self.__on_selection_change):
            self.view.clearSelection()

        self.model.fill_table(titles, scores)
        self.model.setHorizontalHeaderLabels(labels)
        self.view.update_column_widths()
        if self.model.columnCount() > self.sort_column_order[0]:
            # if there are not enough columns, do not apply the sorting from
            # settings: the sorting may be saved for a score column while the
            # scores are still being computed (the table is filled with
            # document names before the scores are available)
            self.view.horizontalHeader().setSortIndicator(
                *self.sort_column_order)

        self._select_rows()

    def _fill_and_output(self) -> None:
        """Refill the table in the widget, then send the outputs."""
        self._fill_table()
        self._send_output()

    def _clear_and_run(self) -> None:
        """Drop the cached scores, cancel the running task, refresh the
        table and outputs, and commit."""
        self.scores = {}
        self.cancel()
        self._fill_and_output()
        self.commit()

    def __setting_changed(self) -> None:
        """Callback of the scoring/aggregation controls: commit."""
        self.commit()

    def commit(self) -> None:
        """Start computing the active scores that are not cached yet.

        Does nothing beyond clearing the error and cancelling the running
        task when the corpus or the words are missing.  When every active
        score is already cached, the table and outputs are refreshed
        directly.
        """
        self.Error.custom_err.clear()
        self.cancel()
        if self.corpus is None or self.words is None:
            return

        scorers = self._get_active_scorers()
        aggregation = self._get_active_aggregation()
        missing = [s for s in scorers if (s, aggregation) not in self.scores]
        if not missing:
            self._fill_and_output()
            return

        # current value of every additional option, keyed by setting name
        extra_options = {
            name: choices[getattr(self, name)]
            for name, choices in ADDITIONAL_OPTIONS.values()
        }
        self.start(
            _run, self.corpus, self.words, missing, aggregation,
            extra_options,
        )

    def on_done(self, _: None) -> None:
        """Task-finished callback: send the outputs (the result is unused)."""
        self._send_output()

    def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None:
        """Cache one computed score and refill the table.

        ``result`` is a (scoring method, aggregation, scores) triple.
        """
        sc_method, aggregation, scores = result
        self.scores[(sc_method, aggregation)] = scores
        self._fill_table()

    def on_exception(self, ex: Exception) -> None:
        """Show ``ex`` as a widget error and refresh table and outputs."""
        self.Error.custom_err(ex)
        self._fill_and_output()

    def _get_active_scorers(self) -> List[str]:
        """
        Gather currently active/selected scores

        A scoring method is active when the widget attribute named after its
        SCORING_METHODS key is truthy.

        Returns
        -------
        List with selected scores names
        """
        return [attr for attr in SCORING_METHODS if getattr(self, attr)]

    def _get_active_aggregation(self) -> str:
        """
        Look up the aggregation chosen in the widget.

        Returns
        -------
        The AGGREGATIONS key at position ``self.aggregation``
        """
        aggregation_names = list(AGGREGATIONS)
        return aggregation_names[self.aggregation]

    @staticmethod
    def _is_corpus_normalized(corpus: Corpus) -> bool:
        """
        Tell whether any preprocessor used on the corpus is a normalizer.
        """
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                return True
        return False

    def get_selected_indices(self) -> List[int]:
        """Return the source-model rows of the selected view rows.

        Rows are listed in the table's (view) order, so that the selected
        output table has the same order.
        """
        def view_row(index):
            return index.row()

        proxy = self.view.model()
        in_view_order = sorted(
            self.view.selectionModel().selectedRows(), key=view_row)
        return [proxy.mapToSource(index).row() for index in in_view_order]

    def _select_rows(self):
        """Select the table rows required by the current selection method.

        NONE clears the selection, ALL selects every visible row, N_BEST
        selects the first ``n_selected`` visible rows and MANUAL restores the
        rows stored in ``selected_rows``.

        Raises
        ------
        NotImplementedError
            For an unknown selection method.
        """
        proxy_model = self.view.model()
        n_rows, n_columns = proxy_model.rowCount(), proxy_model.columnCount()
        if self.sel_method == SelectionMethods.NONE:
            selection = QItemSelection()
        elif self.sel_method == SelectionMethods.ALL:
            selection = QItemSelection(
                proxy_model.index(0, 0),
                proxy_model.index(n_rows - 1, n_columns - 1))
        elif self.sel_method == SelectionMethods.MANUAL:
            selection = QItemSelection()
            # selected_rows are rows of the source model (see
            # get_selected_indices), so they are validated against the
            # source row count; the proxy count shrinks with the filter and
            # would discard valid selections while a filter is active
            n_source_rows = self.model.rowCount()
            new_sel = []
            for row in self.selected_rows:
                if row < n_source_rows:
                    new_sel.append(row)
                    _selection = QItemSelection(
                        self.model.index(row, 0),
                        self.model.index(row, n_columns - 1))
                    selection.merge(
                        proxy_model.mapSelectionFromSource(_selection),
                        QItemSelectionModel.Select,
                    )
            # selected rows must be updated when the same dataset with less rows
            # appear at the input - it is not handled by selectionChanged
            # in cases when all selected rows missing in new table
            self.selected_rows = new_sel
        elif self.sel_method == SelectionMethods.N_BEST:
            n_sel = min(self.n_selected, n_rows)
            selection = QItemSelection(
                proxy_model.index(0, 0),
                proxy_model.index(n_sel - 1, n_columns - 1))
        else:
            raise NotImplementedError

        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)
class OWPermutationImportance(OWExplainFeatureBase):
    """Widget that inspects a model with permutation feature importance."""
    name = "Feature Importance"
    description = "Inspect model using Permutation Feature " \
                  "Importance technique."
    keywords = ["explain", "model", "permutation", "feature", "importance"]
    icon = "icons/PermutationImportance.svg"
    priority = 50

    settingsHandler = PerfectDomainContextHandler()
    # index of the selected scorer in the score combo box
    score_index = ContextSetting(0)
    # value of the "Permutations" spin box
    n_repeats = Setting(5)

    PLOT_CLASS = FeatureImportancePlot

    class Warning(OWExplainFeatureBase.Warning):
        missing_target = Msg("Instances with unknown target values "
                             "were removed from data.")

    # GUI setup
    def _add_controls(self):
        """Add the score combo box and the permutations spin box, then the
        controls of the base class."""
        box = gui.vBox(self.controlArea, "Parameters")
        # initially filled with the scorers listed for discrete targets;
        # setup_controls refills it
        self._score_combo: QComboBox = gui.comboBox(
            box, self, "score_index", label="Score:",
            items=BUILTIN_SCORERS_ORDER[DiscreteVariable],
            orientation=Qt.Horizontal, contentsLength=12,
            callback=self.__parameter_changed
        )
        gui.spin(
            box, self, "n_repeats", 1, 1000, label="Permutations:",
            controlWidth=50, callback=self.__parameter_changed
        )

        super()._add_controls()

    def __parameter_changed(self):
        """Callback of the parameter controls: clear the widget and start
        ``run`` again with the current parameters."""
        self.clear()
        self.start(self.run, *self.get_runner_parameters())

    def _check_data(self):
        """Warn about unknown target values and filter the data through
        ``HasClass``."""
        self.Warning.missing_target.clear()
        if not self.data or not np.isnan(self.data.Y).any():
            return
        self.Warning.missing_target()
        self.data = HasClass()(self.data)

    def openContext(self, model: Optional[Model]):
        """Open the context for the model's domain (``None`` without a
        model)."""
        super().openContext(model.domain if model else None)

    def setup_controls(self):
        """Refill the score combo for the model's target type and reset the
        selected score ("R2" when listed, otherwise the first one)."""
        has_numeric_target = (
            self.model and self.model.domain.has_continuous_class)
        class_type = (
            ContinuousVariable if has_numeric_target else DiscreteVariable)
        self._score_combo.clear()
        scorer_names = BUILTIN_SCORERS_ORDER[class_type]
        self._score_combo.addItems(scorer_names)
        if "R2" in scorer_names:
            self.score_index = scorer_names.index("R2")
        else:
            self.score_index = 0

    def get_runner_parameters(self) -> Tuple[Optional[Table], Optional[Model],
                                             Optional[Type[Score]], int]:
        """Return the arguments for ``run``: data, model, score, repeats.

        The score is the entry of ``usable_scorers`` at ``score_index``, or
        None when there is no model.
        """
        import re  # local import; used only for the version check below

        def release(ver: str) -> Tuple[int, ...]:
            # Leading numeric components as integers, e.g.
            # "3.32.0.dev0+abc" -> (3, 32, 0).  Comparing the raw strings is
            # lexicographic and would rank "3.100.0" below "3.31.1".
            match = re.match(r"\d+(?:\.\d+)*", ver)
            if match is None:
                return ()
            return tuple(int(part) for part in match.group().split("."))

        score = None
        if self.model:
            if release(version) > (3, 31, 1):
                # Eventually, keep only this branch (remove the else branch
                # and the version check) and upgrade minimal Orange version
                # to 3.32.0.
                # Also remove the Orange.version import
                score = usable_scorers(self.model.domain)[self.score_index]
            else:
                var = self.model.domain.class_var
                score = usable_scorers(var)[self.score_index]
        return self.data, self.model, score, self.n_repeats

    # Plot setup
    def update_scene(self):
        """Plot mean and standard deviation of the importance of each
        feature, ordered by decreasing mean."""
        super().update_scene()
        if self.results is None:
            return

        importance = self.results.x
        means = np.mean(importance, axis=1)
        deviations = np.std(importance, axis=1)
        order = np.argsort(means)[::-1]
        ordered_names = [self.results.names[i] for i in order]

        score = self._score_combo.itemText(self.score_index)
        if score in ("MSE", "RMSE", "MAE"):
            txt = "Increase"
        else:
            txt = "Decrease"
        self.setup_plot(
            means[order], ordered_names, deviations[order],
            f"{txt} in {score}")

    # Selection
    def update_selection(self, attr_names: Set[str]):
        """Store the selected feature names and commit when they changed."""
        if set(self.selection) != attr_names:
            assert self.results is not None
            self.selection = tuple(attr_names)
            self.commit()

    def select_pending(self, pending_selection: Tuple):
        """Apply a stored selection to the plot once results exist."""
        if pending_selection and self.results is not None:
            self.plot.select_from_settings(pending_selection)
            super().select_pending(())

    # Outputs
    def get_selected_data(self) -> Optional[Table]:
        """Return the data restricted to the selected features.

        The result keeps the attributes whose names are in the selection,
        plus all class variables and metas.  Returns None when there is no
        selection or no data.  (The annotation used to say ``Domain``,
        although a subset of ``self.data`` is returned.)
        """
        if not self.selection or not self.data:
            return None
        domain = self.data.domain
        selected_names = set(self.selection)
        attrs = [a for a in domain.attributes if a.name in selected_names]
        return self.data[:, attrs + list(domain.class_vars + domain.metas)]

    def get_scores_table(self) -> Table:
        """Build the "Feature Scores" table: mean and standard deviation of
        the importance per feature, with the feature name as a meta."""
        importance = self.results.x
        means = np.mean(importance, axis=1)
        deviations = np.std(importance, axis=1)
        feature_names = np.array(self.results.names)[:, None]

        domain = Domain(
            [ContinuousVariable("Mean"), ContinuousVariable("Std")],
            metas=[StringVariable("Feature")],
        )
        scores_table = Table(
            domain, np.vstack((means, deviations)).T, metas=feature_names)
        scores_table.name = "Feature Scores"
        return scores_table

    # Misc
    def send_report(self):
        """Report the selected score and the number of permutations, then
        the report of the base class.

        Nothing is reported without data, a model or a class variable.
        """
        if not self.data or not self.model or not self.data.domain.class_var:
            return
        items = {
            # Read the name from the combo, as update_scene does: the combo
            # holds the scorers chosen for the model in setup_controls, and
            # this avoids a lookup keyed by the exact type of the data's
            # class variable.
            "Score": self._score_combo.itemText(self.score_index),
            "Permutations": self.n_repeats,
        }
        self.report_items(items)
        super().send_report()

    @staticmethod
    def run(data: Table, model: Model, score_class: Type[Score],
            n_repeats: int, state: TaskState) -> Optional[Results]:
        """Compute the permutation feature importance.

        Returns None when data, model or score class is missing; otherwise
        the importance values and feature names with an all-True mask.
        Progress and status are reported through ``state``; a requested
        interruption aborts the computation by raising an exception.
        """
        if not (data and model and score_class):
            return None

        def callback(i: float, status=""):
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        importance, names = permutation_feature_importance(
            model, data, score_class(), n_repeats, callback)
        keep_all = np.ones(importance.shape[0], dtype=bool)
        return Results(x=importance, names=names, mask=keep_all)