def setUp(self):
    """Prepare the fixture domain, two descriptions of it and a bound handler.

    ``self.args`` pairs every variable name with the ``Continuous`` or
    ``Discrete`` marker, while ``self.args_match_all`` replaces the marker
    of each discrete variable with the list of its values.
    """
    features = [
        ContinuousVariable("c1"),
        DiscreteVariable("d1", values="abc"),
        DiscreteVariable("d2", values="def"),
    ]
    targets = [DiscreteVariable("d3", values="ghi")]
    meta_vars = [
        ContinuousVariable("c2"),
        DiscreteVariable("d4", values="jkl"),
    ]
    self.domain = Domain(
        attributes=features, class_vars=targets, metas=meta_vars)

    # Description by variable kind only.
    attrs_by_kind = (("c1", Continuous), ("d1", Discrete), ("d2", Discrete))
    classes_by_kind = (("d3", Discrete), )
    metas_by_kind = (("c2", Continuous), ("d4", Discrete))
    self.args = (
        self.domain, attrs_by_kind, classes_by_kind, metas_by_kind)

    # Description that also spells out the values of discrete variables.
    attrs_by_value = (
        ("c1", Continuous), ("d1", list("abc")), ("d2", list("def")))
    classes_by_value = (("d3", list("ghi")), )
    metas_by_value = (("c2", Continuous), ("d4", list("jkl")))
    self.args_match_all = (
        self.domain, attrs_by_value, classes_by_value, metas_by_value)

    handler = PerfectDomainContextHandler()
    # Replace the defaults reader with a no-op for the duration of the test.
    handler.read_defaults = lambda: None
    handler.bind(SimpleWidget)
    self.handler = handler

    self.widget = SimpleWidget()
    self.handler.initialize(self.widget)
def setUp(self):
    """Build the test domain, its two tuple descriptions and the handler.

    The first description (``self.args``) tags variables with
    ``Continuous``/``Discrete``; the second (``self.args_match_all``) lists
    the values of every discrete variable instead of the ``Discrete`` tag.
    """
    domain = Domain(
        attributes=[
            ContinuousVariable('c1'),
            DiscreteVariable('d1', values='abc'),
            DiscreteVariable('d2', values='def'),
        ],
        class_vars=[DiscreteVariable('d3', values='ghi')],
        metas=[
            ContinuousVariable('c2'),
            DiscreteVariable('d4', values='jkl'),
        ],
    )
    self.domain = domain

    # (attributes, class variables, metas) described by kind.
    kinds = (
        (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
        (('d3', Discrete), ),
        (('c2', Continuous), ('d4', Discrete)),
    )
    self.args = (domain,) + kinds

    # Same three groups, with discrete variables described by their values.
    values = (
        (('c1', Continuous), ('d1', list('abc')), ('d2', list('def'))),
        (('d3', list('ghi')), ),
        (('c2', Continuous), ('d4', list('jkl'))),
    )
    self.args_match_all = (domain,) + values

    self.handler = PerfectDomainContextHandler()
    # Reading stored defaults is stubbed out with a no-op.
    self.handler.read_defaults = lambda: None
    self.handler.bind(SimpleWidget)

    self.widget = SimpleWidget()
    self.handler.initialize(self.widget)
def setUp(self):
    """Set up a domain, its kind/value descriptions and a bound handler."""
    attribute_vars = [ContinuousVariable('c1'),
                      DiscreteVariable('d1', values='abc'),
                      DiscreteVariable('d2', values='def')]
    class_vars = [DiscreteVariable('d3', values='ghi')]
    meta_vars = [ContinuousVariable('c2'),
                 DiscreteVariable('d4', values='jkl')]
    self.domain = Domain(attributes=attribute_vars,
                         class_vars=class_vars,
                         metas=meta_vars)

    # Variables paired with the Continuous/Discrete markers.
    self.args = (
        self.domain,
        (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
        (('d3', Discrete),),
        (('c2', Continuous), ('d4', Discrete)),
    )

    # Discrete variables paired with their value lists instead.
    d1_values, d2_values = list('abc'), list('def')
    d3_values = list('ghi')
    d4_values = list('jkl')
    self.args_match_all = (
        self.domain,
        (('c1', Continuous), ('d1', d1_values), ('d2', d2_values)),
        (('d3', d3_values),),
        (('c2', Continuous), ('d4', d4_values)),
    )

    bound_handler = PerfectDomainContextHandler()
    # No stored defaults are read in the test: the reader is a no-op.
    bound_handler.read_defaults = lambda: None
    bound_handler.bind(SimpleWidget)
    self.handler = bound_handler

    self.widget = SimpleWidget()
    self.handler.initialize(self.widget)
class OWLoadModel(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a PMML or PFA model from a local file.

    The model is read by ``PMMLFormat`` / ``PFAFormat`` readers, described in
    the "Info" box and sent on the "Scoring Model" output when the user
    presses "Send".
    """

    name = "Load PMML/PFA Model"
    id = "orange.widgets.scoring.model"
    description = "Load model from an input PMML file ( *.pmml, *.xml) " \
                  "or from an input PFA file ( *.pfa, *.json, *.yml, *.yaml) " \
                  "and send the model to the output."
    icon = "icons/model.svg"
    priority = 1
    category = "Scoring"
    keywords = ["pmml", "pfa", "load", "read", "open"]

    class Outputs:
        data = Output("Scoring Model", ScoringModel,
                      doc="PMML/PFA Model read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("location", os.getcwd())]
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # pylint seems to want declarations separated from definitions
    recent_paths: List[RecentPath]

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([])
    source = Setting(LOCAL_FILE)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        unknown = widget.Msg("Read error:\n{}")

    class NoFileSelected:
        """Sentinel returned by ``_get_reader`` when there is nothing to read."""

    def __init__(self):
        """Build the file selector, the info box and the "Send" button."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None, self, "source", box=True, addSpace=True,
                                callback=self.load_data, addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None, self, '...',
                                 callback=self.browse_file, autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None, self, "Reload",
                                   callback=self.load_data, autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        box = gui.vBox(self.controlArea, "Info")
        self.infolabel = gui.widgetLabel(box, 'No model loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.hBox(self.controlArea)
        gui.rubber(box)

        self.apply_button = gui.button(box, self, "Send",
                                       callback=self.send_data)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            # Files above SIZE_LIMIT are not loaded automatically; the user
            # has to press Reload.
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        QTimer.singleShot(0, self.load_data)

    @staticmethod
    def sizeHint():
        """Return the preferred widget size."""
        return QSize(600, 30)

    def select_file(self, n):
        """Make the ``n``-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def browse_file(self):
        """Let the user pick a PMML/PFA file and load it."""
        start_file = self.last_path() or os.path.expanduser("~/")

        readers = [
            PMMLFormat,
            PFAFormat,
        ]
        # The third item returned by the dialog is not needed here.
        filename, file_format, _ = open_filename_dialog(
            start_file, None, readers)
        if not filename:
            return
        self.add_path(filename)
        if file_format is not None:
            self.recent_paths[0].file_format = file_format.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Reload the current file; on failure show the error and send None."""
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if error:
            error()
            self.data = None
            self.Outputs.data.send(None)
            self.infolabel.setText("No model.")

    def _try_load(self):
        """Read the model with the current reader.

        Returns
        -------
        callable or None
            A callable that shows the appropriate error message when loading
            failed, ``None`` when the model was read or no file is selected.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found

        try:
            self.reader = self._get_reader()
        except Exception:
            return self.Error.missing_reader
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if self.reader is None:
            return self.Error.missing_reader

        if self.reader is self.NoFileSelected:
            self.Outputs.data.send(None)
            return None

        with catch_warnings(record=True) as warnings:
            try:
                model = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.infolabel.setText(self._describe(model))

        self.loaded_file = self.last_path()
        self.data = model
        self.apply_button.setEnabled(True)
        return None

    def _get_reader(self):
        """Return a reader for the current path, or ``NoFileSelected``.

        The reader class stored with the most recent path is preferred;
        otherwise the reader is chosen by the file extension.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if path is None:
                return self.NoFileSelected
            if self.recent_paths and self.recent_paths[0].file_format:
                qname = self.recent_paths[0].file_format
                reader_class = class_from_qualified_name(qname)
                reader = reader_class.get_reader(path)
            else:
                _, ext = os.path.splitext(path)
                reader = self.NoFileSelected
                if ext in PMMLFormat.EXTENSIONS:
                    reader = PMMLFormat.get_reader(path)
                # Checked second, so PFA wins if both formats list ``ext``.
                if ext in PFAFormat.EXTENSIONS:
                    reader = PFAFormat.get_reader(path)
            return reader
        return self.NoFileSelected

    @staticmethod
    def _describe(modelFormat):
        """Return an HTML summary of the model's fields.

        Lists the input and output fields, preceded by the method for
        models of type "PFA" and followed by the target fields for models
        of type "PMML".
        """
        def field_list(fields):
            # ``fields`` is iterated as (name, data type) pairs.
            if len(fields) > 0:
                return ":<br/> " + ", ".join(
                    name + " (" + dataType + ")" for name, dataType in fields)
            return ":<br/> None"

        parts = []
        if modelFormat.type == "PFA":
            parts.append("Method:<br/> " + modelFormat.method + "<br/>")
        parts.append("Input fields(s)" + field_list(modelFormat.inputFields))
        parts.append(
            "<br/>Output fields(s)" + field_list(modelFormat.outputFields))
        if modelFormat.type == "PMML":
            parts.append(
                "<br/>Target fields(s)" + field_list(modelFormat.targetFields))
        return "".join(parts)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_data(self):
        """Send the loaded model and disable the "Send" button."""
        self.Outputs.data.send(self.data)
        self.apply_button.setEnabled(False)
class OWTableToRelation(OWWidget):
    """Widget that wraps an input data table into a ``Relation``.

    The user names the relation and the row/column object types, picks a
    meta column to use for row names and may transpose the matrix.
    """

    name = "Table to Relation"
    description = "Convert data table to relation matrix. Label matrix axis."
    priority = 50000
    icon = "icons/TableToRelation.svg"

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        relation = Output("Relation", Relation)

    settingsHandler = PerfectDomainContextHandler()

    data = None
    relation_name = ContextSetting("")
    transpose = ContextSetting(False)

    row_type = ContextSetting("")
    selected_meta = ContextSetting(0)
    row_names = None

    col_type = ContextSetting("")
    col_names = None

    auto_commit = Setting(True)

    def __init__(self):
        """Create the control area and the preview table."""
        super().__init__()

        self.model = None
        self.view = None
        self.row_names_combo = None
        self.icons = gui.attributeIconDict
        self.populate_control_area()
        self.populate_main_area()

    def populate_control_area(self):
        """Add the relation, column and row controls and the send button."""
        rel = gui.widgetBox(self.controlArea, "Relation")
        gui.lineEdit(rel, self, "relation_name", "Name",
                     callbackOnType=True, callback=self.apply)
        gui.checkBox(rel, self, "transpose", "Transpose",
                     callback=self.apply)

        col = gui.widgetBox(self.controlArea, "Column")
        gui.lineEdit(col, self, "col_type", "Object Type",
                     callbackOnType=True, callback=self.apply)

        row = gui.widgetBox(self.controlArea, "Row")
        gui.lineEdit(row, self, "row_type", "Object Type",
                     callbackOnType=True, callback=self.apply)
        self.row_names_combo = gui.comboBox(
            row, self, "selected_meta", label="Object Names",
            callback=self.update_row_names)

        gui.rubber(self.controlArea)
        gui.auto_commit(self.controlArea, self, "auto_commit", "Send",
                        checkbox_label='Auto-send', orientation='vertical')

    def populate_main_area(self):
        """Lay out the axis labels and the preview table view in a grid."""
        grid = QWidget()
        grid.setLayout(QGridLayout(grid))
        self.mainArea.layout().addWidget(grid)

        col_type = gui.label(None, self, '%(col_type)s')
        grid.layout().addWidget(col_type, 0, 1)
        grid.layout().setAlignment(col_type, Qt.AlignHCenter)

        row_type = gui.label(None, self, '%(row_type)s')
        grid.layout().addWidget(row_type, 1, 0)
        grid.layout().setAlignment(row_type, Qt.AlignVCenter)

        self.view = QTableView()
        grid.layout().addWidget(self.view, 1, 1)

    def sizeHint(self):
        """Return the preferred widget size."""
        return QSize(800, 500)

    @Inputs.data
    def set_data(self, data):
        """Accept a new input table (or ``None``) and refresh everything."""
        self.closeContext()
        self.data = data
        if data is not None:
            # Defaults are set first; openContext may then overwrite the
            # context settings with stored values.
            self.init_attr_values(data.domain.metas)
            self.openContext(self.data)
            self.col_names = [str(a.name) for a in data.domain.attributes]
            if hasattr(data, 'col_type'):
                self.col_type = data.col_type
        else:
            self.init_attr_values(())
        self.update_preview()
        self.update_row_names()
        self.unconditional_commit()

    def init_attr_values(self, candidates):
        """Reset axis settings and fill the row-names combo.

        Parameters
        ----------
        candidates : sequence of variables
            Variables offered as sources of row names; the first one, if
            any, becomes the default selection.
        """
        self.col_type = ""
        self.col_names = None

        if candidates:
            self.row_type = candidates[0].name
            self.selected_meta = 1
        else:
            self.row_type = ""
            self.selected_meta = 0
        self.row_names = None

        self.row_names_combo.clear()
        # Index 0 means "no row names"; candidates start at index 1.
        self.row_names_combo.addItem('(None)')
        for var in candidates:
            self.row_names_combo.addItem(self.icons[var], var.name)
        self.row_names_combo.setCurrentIndex(self.selected_meta)

    def update_row_names(self):
        """Recompute row names from the selected meta column and commit."""
        if self.selected_meta:
            self.row_names = list(
                self.data[:, -self.selected_meta].metas.flatten())
        else:
            self.row_names = None

        if self.model:
            self.model.headerDataChanged.emit(
                Qt.Vertical, 0, self.model.rowCount() - 1)
        self.commit()

    def update_preview(self):
        """Rebuild the preview model from the attributes of the input."""
        this = self

        class MyTableModel(TableModel):
            def headerData(self, section, orientation, role):
                # Vertical display headers come from the widget's row names;
                # everything else is delegated to the base model.
                if orientation == Qt.Vertical and role == Qt.DisplayRole:
                    if this.row_names:
                        return this.row_names[section]
                    return None
                return super().headerData(section, orientation, role)

        if self.data:
            domain = Domain(self.data.domain.attributes)
            preview_data = Table(domain, self.data)
            self.model = MyTableModel(preview_data)
        else:
            self.model = None
        self.view.setModel(self.model)

    def apply(self):
        """Callback of the controls: commit the current settings."""
        self.commit()

    def commit(self):
        """Build the relation from the input and send it.

        ``None`` is sent when there is no (or empty) input, so that the
        output does not keep a relation built from data that is gone.
        """
        if not self.data:
            self.Outputs.relation.send(None)
            return

        domain = self.data.domain
        metadata_cols = list(domain.class_vars) + list(domain.metas)
        # One dict per row, mapping each class/meta variable to its value.
        metadata = [
            {var: var.to_val(value)
             for var, value in zip(metadata_cols, values.list)}
            for values in self.data[:, metadata_cols]
        ]

        if self.transpose:
            # Rows and columns swap roles; the per-row metadata therefore
            # describes the columns of the transposed matrix.
            relation = fusion.Relation(
                self.data.X.T,
                name=self.relation_name,
                row_type=fusion.ObjectType(self.col_type or 'Unknown'),
                row_names=self.col_names,
                col_type=fusion.ObjectType(self.row_type or 'Unknown'),
                col_names=self.row_names,
                col_metadata=metadata)
        else:
            relation = fusion.Relation(
                self.data.X,
                name=self.relation_name,
                row_type=fusion.ObjectType(self.row_type or 'Unknown'),
                row_names=self.row_names,
                row_metadata=metadata,
                col_type=fusion.ObjectType(self.col_type or 'Unknown'),
                col_names=self.col_names,
            )
        self.Outputs.relation.send(Relation(relation))
class OWBatchNorm(OWWidget):
    """Widget that applies ``SCBatchNormalizer`` to the input table.

    The user chooses a link method, whether to skip zero expressions and
    which class/meta variables are treated as batch variables.
    """

    name = "Batch Effect Removal"
    description = "Batch effect normalization on Single Cell data set."
    icon = "icons/BatchEffectRemoval.svg"
    priority = 230

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        data = Output("Data", Table)

    class Error(OWWidget.Error):
        # The message is the text of the caught exception.
        general_error = Msg("{}")
        discrete_attributes = Msg("Data with discrete attributes "
                                  "can not be processed.")

    class Warning(OWWidget.Warning):
        missing_values = Msg("Missing values have been replaced with 0.")
        negative_values = Msg("Unable to use current settings due "
                              "to negative values in data.")

    resizing_enabled = False
    want_main_area = False

    settingsHandler = PerfectDomainContextHandler()
    batch_vars = ContextSetting([])
    link_method = Setting(LinkMethod.IDENTITY_LINK)
    skip_zeros = Setting(False)
    auto_commit = Setting(True)

    def __init__(self, parent=None):
        """Build the info, method and batch-variable selection boxes."""
        super().__init__(parent)
        self.data = None

        # Info
        infobox = gui.widgetBox(self.controlArea, "Info")
        self.info_label = gui.widgetLabel(infobox, "No data on input.")

        # Link method
        method_box = gui.widgetBox(self.controlArea, "Method")
        gui.comboBox(method_box, self, "link_method",
                     items=LinkMethod.items(),
                     callback=self.__link_method_changed)
        gui.separator(method_box)
        # The lambda looks up ``self.commit`` when invoked rather than now.
        # NOTE(review): presumably needed because gui.auto_commit below
        # rebinds ``commit`` - confirm.
        self.skip_zeros_check = gui.checkBox(
            method_box, self, "skip_zeros", "Skip zero expressions",
            enabled=self.link_method != LinkMethod.LOG_LINK,
            callback=lambda: self.commit())

        # Batch Variable Selection
        header_shema = (("selected", ""), ("variable", "Variable"),
                        ("count", "#"), ("score", "Score"))
        header_labels = [label for _, label in header_shema]
        header = namedtuple("header", [tag for tag, _ in header_shema])
        # Maps each column tag to its column index.
        self.Header = header(*range(len(header_labels)))

        batch_box = gui.widgetBox(self.controlArea, "Batch Variable Selection")
        self.view = QTreeView()
        self.model = QStandardItemModel()
        self.model.itemChanged.connect(self.__selected_batch_vars_changed)
        self.model.setHorizontalHeaderLabels(header_labels)
        batch_box.layout().addWidget(self.view)
        self._setup_view()

        gui.auto_commit(self.controlArea, self, "auto_commit",
                        "Apply", "Apply Automatically")

    def __link_method_changed(self):
        """Sync the skip-zeros checkbox with the link method and commit."""
        enable = self.link_method != LinkMethod.LOG_LINK
        self.skip_zeros_check.setEnabled(enable)
        if not enable:
            # With the log link the option is forced on and locked.
            self.skip_zeros_check.setChecked(True)
        self.commit()

    def __selected_batch_vars_changed(self, item):
        """Add or remove the item's variable from ``batch_vars`` and commit.

        Membership is checked first so that a repeated signal can neither
        add a duplicate nor raise ``ValueError`` on removal.
        """
        var = item.data(VariableRole)
        if item.checkState():
            if var not in self.batch_vars:
                self.batch_vars.append(var)
        elif var in self.batch_vars:
            self.batch_vars.remove(var)
        self.commit()

    def _setup_view(self):
        """Attach the model to the tree view and configure its columns."""
        self.view.setModel(self.model)
        self.view.setSelectionMode(QTreeView.NoSelection)
        self.view.setSortingEnabled(True)
        self.view.setRootIsDecorated(False)
        self.view.setItemDelegateForColumn(self.Header.count,
                                           IntegralDelegate(self))
        self.view.setItemDelegateForColumn(self.Header.score,
                                           RealDelegate(self))
        self.view.header().setSectionResizeMode(QHeaderView.ResizeToContents)
        self.view.header().setStretchLastSection(False)
        self.view.header().setSectionResizeMode(self.Header.variable,
                                                QHeaderView.Stretch)
        self.view.setFocus()

    @Inputs.data
    def set_data(self, data):
        """Accept a new input table (or ``None``) and recompute the output."""
        self.closeContext()
        self.clear()
        self.data = data
        self._setup_info_label()
        # May reset self.data to None when the data cannot be processed.
        self._check_data()
        self.openContext(data)
        if self.data is not None:
            # Re-resolve stored variables by name in the new domain.
            self.batch_vars = [data.domain[v.name] for v in self.batch_vars]
            self._setup_model()
        self.commit()

    def clear(self):
        """Forget the selected batch variables and empty the model."""
        self.batch_vars = []
        if self.model:
            n_rows = self.model.rowCount()
            self.model.removeRows(0, n_rows)

    def _setup_info_label(self):
        """Show the numbers of rows, attributes and metas in the info box."""
        text = "No data on input."
        if self.data is not None:
            domain, attrs = self.data.domain, self.data.domain.attributes
            text = "{} cells, {} genes\n".format(len(self.data), len(attrs))
            text += "{} meta features".format(len(domain.metas)) \
                if len(domain.metas) else "(no meta features)"
        self.info_label.setText(text)

    def _check_data(self):
        """Reject data with discrete attributes; replace NaNs in ``X``."""
        self.clear_messages()
        if self.data and self.data.domain.has_discrete_attributes():
            self.data = None
            self.Error.discrete_attributes()
        if self.data and np.isnan(self.data.X).any():
            # NOTE(review): this assigns to ``X`` of the table received on
            # the input - confirm that changing the sender's table is
            # acceptable.
            self.data.X = np.nan_to_num(self.data.X)
            self.Warning.missing_values()

    def _setup_model(self):
        """Add one row per primitive class/meta variable to the model."""
        estimator = ScBatchScorer()
        for var in self.data.domain.class_vars + self.data.domain.metas:
            if not var.is_primitive():
                continue
            try:
                score = float(estimator.score_data(self.data, var))
            except Exception:
                # A variable that cannot be scored is still listed.
                score = np.nan
            self.model.appendRow([
                self.__selected_item(var),
                self.__variable_item(var),
                self.__count_item(var),
                self.__score_item(score)
            ])

    def __selected_item(self, var):
        """Return the checkable item holding ``var`` under VariableRole."""
        item = QStandardItem()
        item.setData(var, VariableRole)
        item.setCheckable(True)
        select = var in self.batch_vars
        item.setCheckState(Qt.Checked if select else Qt.Unchecked)
        item.setEditable(False)
        return item

    def __variable_item(self, var):
        """Return the item showing the variable's name and icon."""
        item = QStandardItem()
        item.setData(var.name, Qt.DisplayRole)
        item.setData(gui.attributeIconDict[var], Qt.DecorationRole)
        item.setEditable(False)
        return item

    def __count_item(self, var):
        """Return the item showing the number of values (discrete only)."""
        item = QStandardItem()
        if var.is_discrete:
            item.setData(len(var.values), Qt.DisplayRole)
        item.setEditable(False)
        return item

    def __score_item(self, score):
        """Return the item showing the variable's score."""
        item = QStandardItem()
        item.setData(score, Qt.DisplayRole)
        item.setEditable(False)
        return item

    def commit(self):
        """Normalize the data with the current settings and send the result.

        The input is passed through unchanged (with a warning) when it has
        negative values and zero skipping is enabled; ``None`` is sent when
        there is no data or normalization raises.
        """
        data = None
        self.Error.general_error.clear()
        self.Warning.negative_values.clear()
        if self.data is not None:
            if (self.data.X < 0).any() and self.skip_zeros:
                self.Warning.negative_values()
                data = self.data
            else:
                try:
                    data = SCBatchNormalizer(
                        LinkMethod.items()[self.link_method],
                        self.skip_zeros, self.batch_vars)(self.data)
                except Exception as e:
                    self.Error.general_error(str(e))
                    data = None
        self.Outputs.data.send(data)

    def send_report(self):
        """Report the chosen method and the selected batch variables."""
        method = LinkMethod.items()[self.link_method]
        if self.skip_zeros:
            method += " (Skip zero expressions)"
        variables = ", ".join([v.name for v in self.batch_vars]) \
            if self.batch_vars else "None"
        self.report_items("", [("Method", method),
                               ("Batch variable selection", variables)])
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """File-loading widget with an embedded domain editor.

    Reads a table from a recent or browsed file, shows a short description
    and lets the user edit the domain before the table is sent on.
    """

    name = "领域编辑器2"
    icon = "icons/gear.svg"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."
    priority = 10
    category = "Data"
    keywords = ["file", "load", "read", "open"]

    class Outputs:
        data = Output("领域背景", Table, doc="专业领域背景的介绍")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    domain_editor = SettingProvider(DomainEditor)

    # Warning messages shown by the widget.
    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    # Error messages shown by the widget.
    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        sheet_error = widget.Msg("Error listing available sheets.")
        unknown = widget.Msg("Read error:\n{}")

    def __init__(self):
        """Build the file selector, info box, domain editor and Apply button."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        # The control area is laid out on a grid.
        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=20, orientation=layout)
        vbox = gui.radioButtons(None, self, "source", box=True, addSpace=True,
                                callback=self.load_data, addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)  # grid cell (0, 0)

        # Horizontal box holding the combo of recent files.
        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        # Selecting an entry in the combo loads that file.
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)  # grid cell (0, 1)

        file_button = gui.button(None, self, '...',
                                 callback=self.browse_file, autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None, self, "Reload",
                                   callback=self.load_data, autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Info box
        box = gui.widgetBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, '请设置领域特征')

        # Box with the domain table; editing is implemented by DomainEditor.
        box = gui.widgetBox(self.controlArea, "双击进行编辑")
        self.domain_editor = DomainEditor(self)
        # The editor's model is used below to enable the Apply button.
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        # Apply button
        box = gui.hBox(self.controlArea)
        self.apply_button = gui.button(box, self, "应用",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Any change in the editor's model enables the Apply button.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()  # populate the combo of recent files
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)  # accept drag-and-drop

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        # Load the data from a zero-delay timer instead of directly.
        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred widget size."""
        return QSize(600, 550)

    def select_file(self, n):
        """Make the ``n``-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def browse_file(self, in_demos=False):
        """Let the user pick a file and load it.

        Parameters
        ----------
        in_demos : bool
            Start browsing in the sample datasets directory instead of the
            directory of the last used file.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation datasets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        # Only formats that can read and declare extensions are offered.
        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]
        filename, reader, _ = open_filename_dialog(start_file, None, readers)
        if not filename:
            return
        self.add_path(filename)
        if reader is not None:
            self.recent_paths[0].file_format = reader.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Reload the current source; on failure show the error, send None."""
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)  # disabled until something changes
        self.clear_messages()
        self.set_file_list()

        # _try_load returns a callable that reports the error, or None.
        error = self._try_load()
        if error:
            error()
            self.data = None
            self.Outputs.data.send(None)
            self.info.setText("无数据.")

    def _try_load(self):
        """Read the table with the current reader and send it.

        Returns
        -------
        callable or None
            A callable that shows the appropriate error message when loading
            failed, ``None`` on success.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found

        try:
            self.reader = self._get_reader()
            assert self.reader is not None
        except Exception:
            return self.Error.missing_reader

        try:
            self._update_sheet_combo()
        except Exception:
            return self.Error.sheet_error

        with catch_warnings(record=True) as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.info.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data
        return None

    def _get_reader(self):
        """Return a reader for the current source.

        Returns
        -------
        FileFormat
            For a local file, an instance of the format stored with the
            most recent path or, failing that, the one chosen by
            ``FileFormat.get_reader``; for a non-empty URL, a ``UrlReader``.
            ``None`` is returned when the URL is empty.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if self.recent_paths and self.recent_paths[0].file_format:
                qname = self.recent_paths[0].file_format
                reader_class = class_from_qualified_name(qname)
                reader = reader_class(path)
            else:
                reader = FileFormat.get_reader(path)
            # Return reader instance that can be used to read the file
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        elif self.source == self.URL:
            # NOTE(review): ``url_combo`` is not created in ``__init__``;
            # this branch raises AttributeError, which ``_try_load`` reports
            # as a missing reader - confirm whether URLs should be supported.
            url = self.url_combo.currentText().strip()
            if url:
                return UrlReader(url)
        return None

    def _update_sheet_combo(self):
        """Refresh the sheet selector for the current reader."""
        if len(self.reader.sheets) < 2:
            self.reader.select_sheet(None)
            return

        # NOTE(review): ``sheet_combo`` is not created in ``__init__``; for
        # files with several sheets this raises AttributeError, which
        # ``_try_load`` reports as a sheet error - confirm intended handling.
        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()

    def _select_active_sheet(self):
        """Select the reader's sheet in the combo, or fall back to the first."""
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
        else:
            self.sheet_combo.setCurrentIndex(0)

    def _describe(self, table):
        """Return the HTML text shown in the info box for ``table``."""
        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        if len(descs) == 2:
            # With both present, the name is emphasized.
            descs[0] = "<b>{}</b>".format(descs[0])
        if descs:
            text += "<p>{}</p>".format("<br/>".join(descs))

        text += "<p>{} 个实例数据(s), {} 个输入特征(s), {} 个元特征(s)".\
            format(len(table), len(domain.attributes), len(domain.metas))
        if domain.has_continuous_class:
            text += "<br/>回归模型 ."
        elif domain.has_discrete_class:
            text += "<br/>分类模型; 共分为 {} 类.".\
                format(len(domain.class_var.values))
        elif table.domain.class_vars:
            text += "<br/>多目标模型; {} 个目标".format(len(table.domain.class_vars))
        else:
            text += "<br/>无目标值."
        text += "</p>"

        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            text += '<p>First entry: {}<br/>Last entry: {}</p>'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp'])
        return text

    def storeSpecificSettings(self):
        """Store a copy of ``variables`` on the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore ``variables`` from the current context, if stored."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Build a table in the edited domain and send it to the output."""
        if self.data is None:
            table = None
        else:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if not (domain.variables or domain.metas):
                table = None
            else:
                # Columns for attributes, class variables and metas.
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                # Carry over the name, the row ids and the attributes dict.
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})

        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Report the data source, its format and a summary of the data."""
        def get_ext_name(filename):
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~" + os.path.sep + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            # ``sheet_combo`` is not created in ``__init__``; without this
            # guard the report would fail with AttributeError.
            sheet_combo = getattr(self, "sheet_combo", None)
            if sheet_combo is not None and sheet_combo.isVisible():
                name += " ({})".format(sheet_combo.currentText())
            self.report_items("File", [("File name", name),
                                       ("Format", get_ext_name(name))])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                # No reader for this file: the drop is not accepted.
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It make sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)
class OWConcordance(OWWidget):
    """Widget showing the occurrences of a query word with their context.

    The query word is either typed into the line edit or received on the
    "Query Word" input; in the latter case the line edit is disabled.
    Rows selected in the concordance view determine the documents sent to
    the "Selected Documents" output.
    """
    name = "Concordance"
    description = "Display the context of the word."
    icon = "icons/Concordance.svg"
    priority = 520

    class Inputs:
        corpus = Input("Corpus", Corpus)
        query_word = Input("Query Word", Topic)

    class Outputs:
        selected_documents = Output("Selected Documents", Corpus)
        concordances = Output("Concordances", Corpus)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)
    autocommit = Setting(True)
    context_width = Setting(5)
    word = ContextSetting("", exclude_metas=False)
    selected_rows = Setting([], schema_only=True)

    class Warning(OWWidget.Warning):
        multiple_words_on_input = Msg("Multiple query words on input. "
                                      "Only the first one is considered!")

    def __init__(self):
        super().__init__()

        self.corpus = None              # Corpus
        self.n_matching = ''            # Info on docs matching the word
        self.n_tokens = ''              # Info on tokens
        self.n_types = ''               # Info on types (unique tokens)
        self.is_word_on_input = False   # True while a query word is on input

        # Info attributes
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Tokens: %(n_tokens)s')
        gui.label(info_box, self, 'Types: %(n_types)s')
        gui.label(info_box, self, 'Matching: %(n_matching)s')

        # Width parameter
        gui.spin(self.controlArea, self, 'context_width', 3, 10, box=True,
                 label="Number of words:", callback=self.set_width)

        gui.rubber(self.controlArea)

        # Search
        c_box = gui.widgetBox(self.mainArea, orientation="vertical")
        self.input = gui.lineEdit(
            c_box, self, 'word', orientation=Qt.Horizontal,
            sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
                                   QSizePolicy.Fixed),
            label='Query:', callback=self.set_word, callbackOnType=True)
        self.input.setFocus()

        # Concordances view
        self.conc_view = QTableView()
        self.model = ConcordanceModel()
        self.conc_view.setModel(self.model)
        self.conc_view.setWordWrap(False)
        self.conc_view.setSelectionBehavior(QTableView.SelectRows)
        self.conc_view.setSelectionModel(DocumentSelectionModel(self.model))
        self.conc_view.setItemDelegate(HorizontalGridDelegate())
        self.conc_view.selectionModel().selectionChanged.connect(
            self.selection_changed)
        self.conc_view.horizontalHeader().hide()
        self.conc_view.setShowGrid(False)
        self.mainArea.layout().addWidget(self.conc_view)
        self.set_width()

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit', 'Commit',
                        'Auto commit is on')

    def sizeHint(self):  # pragma: no cover
        """Return the preferred initial size of the widget."""
        return QSize(600, 400)

    def set_width(self):
        """Push the context width to the model, keeping the selection."""
        sel = self.conc_view.selectionModel().selection()
        self.model.set_width(self.context_width)
        if sel:
            self.conc_view.selectionModel().select(
                sel,
                QItemSelectionModel.SelectCurrent | QItemSelectionModel.Rows)

    def selection_changed(self):
        """Store the sorted, de-duplicated selected rows and commit."""
        selection = self.conc_view.selectionModel().selection()
        self.selected_rows = sorted(
            {cell.row() for cell in selection.indexes()})
        self.commit()

    def set_selection(self, selection):
        """Select the given row numbers in the concordance view."""
        if selection:
            sel = QItemSelection()
            for row in selection:
                index = self.conc_view.model().index(row, 0)
                sel.select(index, index)
            self.conc_view.selectionModel().select(
                sel,
                QItemSelectionModel.SelectCurrent | QItemSelectionModel.Rows)

    @Inputs.corpus
    def set_corpus(self, data=None):
        """Handle a new (or removed) corpus on the input."""
        self.closeContext()
        self.corpus = data
        if data is None:    # data removed, clear selection
            self.selected_rows = []

        # A word supplied on the input takes precedence over the stored one.
        if not self.is_word_on_input:
            self.word = ""
            self.openContext(self.corpus)

        self.model.set_corpus(self.corpus)
        self.set_word()

    @Inputs.query_word
    def set_word_from_input(self, topic):
        """Handle the "Query Word" input; only its first word is used."""
        self.Warning.multiple_words_on_input.clear()
        if self.is_word_on_input:   # word changed, clear selection
            self.selected_rows = []
        self.is_word_on_input = topic is not None and len(topic) > 0
        self.input.setEnabled(not self.is_word_on_input)
        if self.is_word_on_input:
            if len(topic) > 1:
                self.Warning.multiple_words_on_input()
            self.word = topic.metas[0, 0]
            self.set_word()

    def set_word(self):
        """Apply the current query word to the model and refresh outputs."""
        self.selected_rows = []
        self.model.set_word(self.word)
        self.update_widget()
        self.commit()

    def handleNewSignals(self):
        """Re-apply the stored row selection to the view."""
        self.set_selection(self.selected_rows)

    def resize_columns(self):
        """Give the left and right context columns equal widths."""
        # setColumnWidth() takes integer pixel widths; true division would
        # yield a float, which is rejected with a TypeError on Python >= 3.10.
        col_width = (self.conc_view.width() -
                     self.conc_view.columnWidth(1)) // 2 - 12
        self.conc_view.setColumnWidth(0, col_width)
        self.conc_view.setColumnWidth(2, col_width)

    def resizeEvent(self, event):  # pragma: no cover
        super().resizeEvent(event)
        self.resize_columns()

    def update_widget(self):
        """Resize the view to its contents and refresh the info labels."""
        self.conc_view.resizeColumnToContents(1)
        self.resize_columns()
        self.conc_view.resizeRowsToContents()

        if self.corpus is not None:
            self.n_matching = '{}/{}'.format(
                self.model.matching_docs() if self.word else 0,
                len(self.corpus))
            self.n_tokens = self.model.n_tokens
            self.n_types = self.model.n_types
        else:
            self.n_matching = ''
            self.n_tokens = ''
            self.n_types = ''

    def commit(self):
        """Send the selected documents and the concordance table."""
        selected_docs = sorted(
            {self.model.word_index[row][0] for row in self.selected_rows})
        concordance = self.model.get_data()
        if selected_docs:
            selected = self.corpus[selected_docs]
            self.Outputs.selected_documents.send(selected)
        else:
            self.Outputs.selected_documents.send(None)
        self.Outputs.concordances.send(concordance)

    def send_report(self):
        """Add the query summary and the concordance table to the report."""
        view = self.conc_view
        model = self.conc_view.model()
        self.report_items("Concordances", (
            ("Query", model.word),
            ("Tokens", model.n_tokens),
            ("Types", model.n_types),
            ("Matching", self.n_matching),
        ))
        self.report_table(view)
class TestPerfectDomainContextHandler(TestCase):
    """Tests for PerfectDomainContextHandler."""

    def setUp(self):
        self.domain = Domain(
            attributes=[ContinuousVariable('c1'),
                        DiscreteVariable('d1', values='abc'),
                        DiscreteVariable('d2', values='def')],
            class_vars=[DiscreteVariable('d3', values='ghi')],
            metas=[ContinuousVariable('c2'),
                   DiscreteVariable('d4', values='jkl')]
        )
        # Encoded domain as passed to new_context/match: variable types only.
        self.args = (self.domain,
                     (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
                     (('d3', Discrete),),
                     (('c2', Continuous), ('d4', Discrete)))
        # Same, but with the value lists of discrete variables
        # (the encoding used with MATCH_VALUES_ALL).
        self.args_match_all = (self.domain,
                               (('c1', Continuous),
                                ('d1', list('abc')),
                                ('d2', list('def'))),
                               (('d3', list('ghi')),),
                               (('c2', Continuous),
                                ('d4', list('jkl'))))
        self.handler = PerfectDomainContextHandler()
        # Skip reading stored defaults.
        self.handler.read_defaults = lambda: None
        self.handler.bind(SimpleWidget)
        self.widget = SimpleWidget()
        self.handler.initialize(self.widget)

    def test_new_context(self):
        context = self.handler.new_context(*self.args)
        _, attrs, class_vars, metas = self.args

        self.assertEqual(context.attributes, attrs)
        self.assertEqual(context.class_vars, class_vars)
        self.assertEqual(context.metas, metas)

    def test_open_context(self):
        context = Context()
        context.attributes = ()
        context.class_vars = ()
        self.handler.new_context = Mock(return_value=context)
        self.handler.open_context(self.widget, self.domain)
        self.handler.new_context.assert_called_with(*self.args)

    def test_encode_domain_simple(self):
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous),
                                 ('d1', Discrete),
                                 ('d2', Discrete)))
        self.assertEqual(class_vars, (('d3', Discrete),))
        self.assertEqual(metas, (('c2', Continuous), ('d4', Discrete)))

    def test_encode_domain_match_values(self):
        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous),
                                 ('d1', list('abc')),
                                 ('d2', list('def'))))
        self.assertEqual(class_vars, (('d3', list('ghi')),))
        self.assertEqual(metas, (('c2', Continuous), ('d4', list('jkl'))))

    def test_match_simple(self):
        domain, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        self.assertEqual(self.handler.match(context, *self.args),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra meta.
        metas2 = list(metas)
        metas2.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs, class_vars, metas2),
            self.handler.NO_MATCH)

    def test_match_values(self):
        domain, attrs, class_vars, metas = self.args_match_all
        context = self._create_context(attrs, class_vars, metas)

        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        self.assertEqual(self.handler.match(context, *self.args_match_all),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute; this must check attrs3, not attrs2 again.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

    def test_encode_setting(self):
        _, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.setting, "d1")
        self.assertEqual(encoded_setting, ("d1", -2))

        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.text, "d1")
        self.assertEqual(encoded_setting, ("d1", -1))

        encoded_setting = self.handler.encode_setting(
            context, SimpleWidget.with_metas, "d4")
        self.assertEqual(encoded_setting, ("d4", 1))

    def _create_context(self, attrs, class_vars, metas):
        """Return a bare Context carrying the given encoded domain parts."""
        context = Context()
        context.attributes = attrs
        context.class_vars = class_vars
        context.metas = metas
        return context
def test_migrate_removes_invalid_contexts(self):
    """Migrating settings keeps only the perfect-domain context."""
    # A context produced by a different handler type.
    foreign_context = ClassValuesContextHandler().new_context([0, 1, 2])
    # A context produced by PerfectDomainContextHandler from empty lists.
    empty_parts = [[]] * 4
    matching_context = PerfectDomainContextHandler().new_context(
        *empty_parts)

    settings = {'context_settings': [foreign_context, matching_context]}
    self.widget.migrate_settings(settings, 2)

    remaining = settings['context_settings']
    self.assertEqual(remaining, [matching_context])
class OWCorpusViewer(OWWidget):
    """Widget listing corpus documents and showing the selected ones.

    Documents can be filtered with a regular expression applied to the
    features chosen under "Search features"; matching and non-matching
    documents are sent to separate outputs.
    """
    name = "Corpus Viewer"
    description = "Display corpus contents."
    icon = "icons/CorpusViewer.svg"
    priority = 500

    class Inputs:
        corpus = Input("Corpus", Corpus, replaces=["Data"])

    class Outputs:
        matching_docs = Output("Matching Docs", Corpus, default=True)
        other_docs = Output("Other Docs", Corpus)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    search_indices = ContextSetting(
        [], exclude_metas=False)   # features included in search
    display_indices = ContextSetting(
        [], exclude_metas=False)  # features for display
    display_features = ContextSetting([], exclude_metas=False)
    regexp_filter = ContextSetting("")

    selection = [0]  # TODO: DataHashContextHandler

    show_tokens = Setting(False)
    autocommit = Setting(True)

    class Warning(OWWidget.Warning):
        no_feats_search = Msg('No features included in search.')
        no_feats_display = Msg('No features selected for display.')

    def __init__(self):
        super().__init__()

        self.corpus = None              # Corpus
        self.corpus_docs = None         # Documents generated from Corpus
        self.output_mask = []           # Output corpus indices
        self.doc_webview = None         # WebView for showing content
        self.search_features = []       # two copies are needed since Display allows drag & drop
        self.display_list_indices = [0]

        # Info attributes
        self.update_info()
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Documents: %(n_documents)s')
        gui.label(info_box, self, 'Preprocessed: %(is_preprocessed)s')
        gui.label(info_box, self, ' ◦ Tokens: %(n_tokens)s')
        gui.label(info_box, self, ' ◦ Types: %(n_types)s')
        gui.label(info_box, self, 'POS tagged: %(is_pos_tagged)s')
        gui.label(info_box, self, 'N-grams range: %(ngram_range)s')
        gui.label(info_box, self, 'Matching: %(n_matching)s')

        # Search features
        self.search_listbox = gui.listBox(
            self.controlArea, self, 'search_indices', 'search_features',
            selectionMode=QListView.ExtendedSelection,
            box='Search features', callback=self.search_features_changed)

        # Display features
        display_box = gui.widgetBox(self.controlArea, 'Display features')
        self.display_listbox = gui.listBox(
            display_box, self, 'display_list_indices', 'display_features',
            selectionMode=QListView.ExtendedSelection,
            callback=self.show_docs, enableDragDrop=True)
        self.show_tokens_checkbox = gui.checkBox(
            display_box, self, 'show_tokens', 'Show Tokens && Tags',
            callback=self.show_docs)

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data',
                        'Auto send is on')

        # Search
        self.filter_input = gui.lineEdit(
            self.mainArea, self, 'regexp_filter',
            orientation=Qt.Horizontal,
            sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
                                   QSizePolicy.Fixed),
            label='RegExp Filter:')
        self.filter_input.textChanged.connect(self.refresh_search)

        # Main area
        self.splitter = QSplitter(
            orientation=Qt.Horizontal,
            childrenCollapsible=False,
        )

        # Document list
        self.doc_list = QTableView()
        self.doc_list.setSelectionBehavior(QTableView.SelectRows)
        self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
        self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.doc_list.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)
        self.doc_list.horizontalHeader().setVisible(False)
        self.splitter.addWidget(self.doc_list)

        self.doc_list_model = QStandardItemModel(self)
        self.doc_list.setModel(self.doc_list_model)
        self.doc_list.selectionModel().selectionChanged.connect(self.show_docs)

        # Document contents
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
        self.doc_webview.loadFinished.connect(self.highlight_docs)

        self.mainArea.layout().addWidget(self.splitter)

    def copy_to_clipboard(self):
        """Copy the text selected in the document view to the clipboard."""
        text = self.doc_webview.selectedText()
        QApplication.clipboard().setText(text)

    @Inputs.corpus
    def set_data(self, corpus=None):
        """Handle a new (or removed) corpus on the input."""
        self.closeContext()
        self.reset_widget()
        self.corpus = corpus
        self.search_features = []
        if corpus is not None:
            domain = self.corpus.domain
            # Enable/disable tokens checkbox
            if not self.corpus.has_tokens():
                self.show_tokens_checkbox.setCheckState(False)
            self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())

            self.search_features = list(
                filter_visible(chain(domain.variables, domain.metas)))
            self.display_features = list(
                filter_visible(chain(domain.variables, domain.metas)))
            self.search_indices = list(range(len(self.search_features)))
            self.display_indices = list(range(len(self.display_features)))
            self.selection = [0]
            # Opening the context may overwrite the defaults set above.
            self.openContext(self.corpus)
            self.display_list_indices = self.display_indices
            self.regenerate_docs()
            self.list_docs()
            self.update_info()
            self.set_selection()
            self.show_docs()
        self.commit()

    def reset_widget(self):
        """Clear the corpus, list boxes, filter, warnings and web view."""
        # Corpus
        self.corpus = None
        self.corpus_docs = None
        self.output_mask = []
        self.display_features = []
        # Widgets
        self.search_listbox.clear()
        self.display_listbox.clear()
        self.filter_input.clear()
        self.update_info()
        # Models/vars
        self.search_features.clear()
        self.search_indices.clear()
        self.display_indices.clear()
        self.doc_list_model.clear()
        # Warnings
        self.Warning.clear()
        # WebView
        self.doc_webview.setHtml('')

    def list_docs(self):
        """ List documents into the left scrolling area """
        if self.corpus_docs is None:
            return
        search_keyword = self.regexp_filter.strip('|')
        try:
            reg = re.compile(search_keyword, re.IGNORECASE)
        except re.error:
            # Invalid (e.g. half-typed) pattern: keep the current list.
            return

        def is_match(x):
            return not bool(search_keyword) or reg.search(x)

        self.output_mask.clear()
        self.doc_list_model.clear()

        for i, (doc, title, content) in enumerate(
                zip(self.corpus, self.corpus.titles, self.corpus_docs)):
            if is_match(content):
                item = QStandardItem()
                item.setData(title, Qt.DisplayRole)
                item.setData(doc, Qt.UserRole)
                self.doc_list_model.appendRow(item)
                self.output_mask.append(i)

    def reset_selection(self):
        """Select the first listed document, or blank the view if none."""
        if self.doc_list_model.rowCount() > 0:
            self.doc_list.selectRow(0)  # Select the first document
        else:
            self.doc_webview.setHtml('')

    def set_selection(self):
        """Select the rows stored in ``self.selection`` in the list view."""
        view = self.doc_list
        if len(self.selection):
            selection = QItemSelection()
            for row in self.selection:
                selection.append(
                    QItemSelectionRange(view.model().index(row, 0),
                                        view.model().index(row, 0)))
            view.selectionModel().select(
                selection, QItemSelectionModel.ClearAndSelect)

    def show_docs(self):
        """ Show the selected documents in the right area """
        HTML = '''
        <!doctype html>
        <html>
        <head>
        <script type="text/javascript" src="resources/jquery-3.1.1.min.js">
        </script>
        <script type="text/javascript" src="resources/jquery.mark.min.js">
        </script>
        <script type="text/javascript" src="resources/highlighter.js">
        </script>
        <meta charset='utf-8'>
        <style>

        table {{ border-collapse: collapse; }}
        mark {{ background: #FFCD28; }}

        tr > td {{
            padding-bottom: 3px;
            padding-top: 3px;
        }}

        body {{
            font-family: Helvetica;
            font-size: 10pt;
        }}

        .line {{ border-bottom: 1px solid #000; }}
        .separator {{ height: 5px; }}

        .variables {{
            vertical-align: top;
            padding-right: 10px;
        }}

        .token {{
            padding: 3px;
            border: 1px #B0B0B0 solid;
            margin-right: 5px;
            margin-bottom: 5px;
            display: inline-block;
        }}

        img {{
            max-width: 100%;
        }}

        </style>
        </head>
        <body>
        {}
        </body>
        </html>
        '''
        self.display_indices = self.display_list_indices
        if self.corpus is None:
            return

        self.Warning.no_feats_display.clear()
        if len(self.display_indices) == 0:
            self.Warning.no_feats_display()

        if self.show_tokens:
            tokens = list(self.corpus.ngrams_iterator(include_postags=True))

        marked_search_features = [
            f for i, f in enumerate(self.search_features)
            if i in self.search_indices
        ]

        html = '<table>'
        selected_rows = self.doc_list.selectionModel().selectedRows()
        selection = [i.row() for i in selected_rows]
        # Keep the previous selection when nothing is selected.
        if selection:
            self.selection = selection
        for doc_count, index in enumerate(selected_rows):
            if doc_count > 0:   # add split
                html += '<tr class="line separator"><td/><td/></tr>' \
                        '<tr class="separator"><td/><td/></tr>'

            row_ind = index.data(Qt.UserRole).row_index
            for ind in self.display_indices:
                feature = self.display_features[ind]
                mark = 'class="mark-area"' \
                    if feature in marked_search_features else ''
                value = str(index.data(Qt.UserRole)[feature.name])
                is_image = feature.attributes.get('type', '') == 'image'
                if is_image and value != '?':
                    value = '<img src="{}"></img>'.format(value)
                html += '<tr><td class="variables"><strong>{}:</strong></td>' \
                        '<td {}>{}</td></tr>'.format(
                            feature.name, mark, value)

            if self.show_tokens:
                html += '<tr><td class="variables"><strong>Tokens & Tags:</strong></td>' \
                        '<td>{}</td></tr>'.format(
                            ''.join('<span class="token">{}</span>'.format(token)
                                    for token in tokens[row_ind]))

        html += '</table>'
        base = QUrl.fromLocalFile(__file__)
        self.doc_webview.setHtml(HTML.format(html), base)

    def search_features_changed(self):
        """Rebuild the searchable documents and re-run the filter."""
        self.regenerate_docs()
        self.refresh_search()

    def regenerate_docs(self):
        """Rebuild ``corpus_docs`` from the selected search features."""
        self.corpus_docs = None
        self.Warning.no_feats_search.clear()
        if self.corpus is not None:
            feats = [self.search_features[i] for i in self.search_indices]
            if len(feats) == 0:
                self.Warning.no_feats_search()
            self.corpus_docs = self.corpus.documents_from_features(feats)

    def refresh_search(self):
        """Re-filter the document list, reset the selection and commit."""
        if self.corpus is not None:
            self.list_docs()
            self.reset_selection()
            self.update_info()
            self.commit()

    @Slot()
    def highlight_docs(self):
        """Ask the page's ``mark`` function to highlight the filter."""
        search_keyword = self.regexp_filter.\
            strip('|').replace('\\', '\\\\')  # escape one \ to two for mark.js
        if search_keyword:
            # mark is undefined when clearing the view (`setHtml('')`). Maybe
            # set and template html with all the scripts, ... but no contents?
            self.doc_webview.runJavaScript(
                '''
                    if (typeof mark !== "undefined") {{
                        mark("{}");
                    }}
                '''.format(search_keyword)
            )

    def update_info(self):
        """Refresh the attributes shown by the labels in the Info box."""
        if self.corpus is not None:
            self.n_documents = len(self.corpus)
            self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(),
                                             self.n_documents)
            self.n_tokens = sum(map(len, self.corpus.tokens)) \
                if self.corpus.has_tokens() else 'n/a'
            self.n_types = len(self.corpus.dictionary) \
                if self.corpus.has_tokens() else 'n/a'
            self.is_preprocessed = self.corpus.has_tokens()
            self.is_pos_tagged = self.corpus.pos_tags is not None
            self.ngram_range = '{}-{}'.format(*self.corpus.ngram_range)
        else:
            self.n_documents = ''
            self.n_matching = ''
            self.n_tokens = ''
            self.n_types = ''
            self.is_preprocessed = ''
            self.is_pos_tagged = ''
            self.ngram_range = ''

    def commit(self):
        """Send the matching and the remaining documents."""
        if self.corpus is not None:
            matched = self.corpus[self.output_mask]
            output_mask = set(self.output_mask)
            unmatched_mask = [
                i for i in range(len(self.corpus)) if i not in output_mask
            ]
            unmatched = self.corpus[unmatched_mask]
            self.Outputs.matching_docs.send(matched)
            self.Outputs.other_docs.send(unmatched)
        else:
            self.Outputs.matching_docs.send(None)
            self.Outputs.other_docs.send(None)
class TestPerfectDomainContextHandler(TestCase):
    """Tests for PerfectDomainContextHandler."""

    def setUp(self):
        self.domain = Domain(attributes=[
            ContinuousVariable('c1'),
            DiscreteVariable('d1', values='abc'),
            DiscreteVariable('d2', values='def')
        ],
                             class_vars=[DiscreteVariable('d3', values='ghi')],
                             metas=[
                                 ContinuousVariable('c2'),
                                 DiscreteVariable('d4', values='jkl')
                             ])
        # Encoded domain as passed to new_context/match: variable types only.
        self.args = (self.domain,
                     (('c1', Continuous), ('d1', Discrete), ('d2', Discrete)),
                     (('d3', Discrete), ),
                     (('c2', Continuous), ('d4', Discrete)))
        # Same, but with the value lists of discrete variables
        # (the encoding used with MATCH_VALUES_ALL).
        self.args_match_all = (self.domain,
                               (('c1', Continuous), ('d1', list('abc')),
                                ('d2', list('def'))),
                               (('d3', list('ghi')), ),
                               (('c2', Continuous), ('d4', list('jkl'))))
        self.handler = PerfectDomainContextHandler()
        # Skip reading stored defaults.
        self.handler.read_defaults = lambda: None
        self.handler.bind(SimpleWidget)
        self.widget = SimpleWidget()
        self.handler.initialize(self.widget)

    def test_new_context(self):
        context = self.handler.new_context(*self.args)
        _, attrs, class_vars, metas = self.args

        self.assertEqual(context.attributes, attrs)
        self.assertEqual(context.class_vars, class_vars)
        self.assertEqual(context.metas, metas)

    def test_open_context(self):
        context = Context()
        context.attributes = ()
        context.class_vars = ()
        self.handler.new_context = Mock(return_value=context)
        self.handler.open_context(self.widget, self.domain)
        self.handler.new_context.assert_called_with(*self.args)

    def test_encode_domain_simple(self):
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', Discrete),
                                 ('d2', Discrete)))
        self.assertEqual(class_vars, (('d3', Discrete), ))
        self.assertEqual(metas, (('c2', Continuous), ('d4', Discrete)))

    def test_encode_domain_match_values(self):
        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        attrs, class_vars, metas = self.handler.encode_domain(self.domain)

        self.assertEqual(attrs, (('c1', Continuous), ('d1', list('abc')),
                                 ('d2', list('def'))))
        self.assertEqual(class_vars, (('d3', list('ghi')), ))
        self.assertEqual(metas, (('c2', Continuous), ('d4', list('jkl'))))

    def test_match_simple(self):
        domain, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        self.assertEqual(self.handler.match(context, *self.args),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra meta.
        metas2 = list(metas)
        metas2.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs, class_vars, metas2),
            self.handler.NO_MATCH)

    def test_match_values(self):
        domain, attrs, class_vars, metas = self.args_match_all
        context = self._create_context(attrs, class_vars, metas)

        self.handler.match_values = self.handler.MATCH_VALUES_ALL
        self.assertEqual(self.handler.match(context, *self.args_match_all),
                         self.handler.PERFECT_MATCH)

        # First two attributes swapped.
        attrs2 = list(attrs)
        attrs2[:2] = attrs[1::-1]
        self.assertEqual(
            self.handler.match(context, domain, attrs2, class_vars, metas),
            self.handler.NO_MATCH)

        # One extra attribute; this must check attrs3, not attrs2 again.
        attrs3 = list(attrs)
        attrs3.append(attrs[0])
        self.assertEqual(
            self.handler.match(context, domain, attrs3, class_vars, metas),
            self.handler.NO_MATCH)

    def test_encode_setting(self):
        _, attrs, class_vars, metas = self.args
        context = self._create_context(attrs, class_vars, metas)

        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.setting,
                                                      "d1")
        self.assertEqual(encoded_setting, ("d1", -2))

        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.text, "d1")
        self.assertEqual(encoded_setting, ("d1", -1))

        encoded_setting = self.handler.encode_setting(context,
                                                      SimpleWidget.with_metas,
                                                      "d4")
        self.assertEqual(encoded_setting, ("d4", 1))

    def _create_context(self, attrs, class_vars, metas):
        """Return a bare Context carrying the given encoded domain parts."""
        context = Context()
        context.attributes = attrs
        context.class_vars = class_vars
        context.metas = metas
        return context
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL.

    The loaded table is sent on the "Data" output. The columns can be
    renamed, retyped or moved between roles in the domain editor; the
    "Apply" button then sends a table rebuilt with the edited domain.
    """
    name = "File"
    id = "orange.widgets.data.file"
    description = "Read a data from an input file or network " \
                  "and send the data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["data", "file", "load", "read"]
    outputs = [widget.OutputSignal(
        "Data", Table,
        doc="Attribute-valued data set read from the input file.")]

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]

    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler()

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    # File-dialog filter: one "all readable files" entry followed by one
    # entry per distinct reader, in order of first appearance.
    dlg_formats = (
        "All readable files ({});;".format(
            '*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join("{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
                  for f in sorted(set(FileFormat.readers.values()),
                                  key=list(FileFormat.readers.values()).index)))

    def __init__(self):
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QtGui.QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None, self, "source", box=True, addSpace=True,
                                callback=self.load_data, addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, QtCore.Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(
            None, self, '...', callback=self.browse_file, autoDefault=False)
        file_button.setIcon(self.style().standardIcon(
            QtGui.QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(
            None, self, "Reload", callback=self.load_data, autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QtGui.QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(None, self, "xls_sheet",
                                        callback=self.select_sheet,
                                        sendSelectedValue=True)
        self.sheet_combo.setSizePolicy(
            Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QtGui.QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(
            Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(
            self.sheet_label, QtCore.Qt.AlignLeft)
        self.sheet_box.layout().addWidget(
            self.sheet_combo, QtCore.Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, QtCore.Qt.AlignVCenter)

        self.url_combo = url_combo = QtGui.QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        left, top, right, bottom = url_edit.getTextMargins()
        url_edit.setTextMargins(left + 5, top, right, bottom)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        domain_editor = DomainEditor(self.variables)
        self.editor_model = domain_editor.model()
        box.layout().addWidget(domain_editor)

        box = gui.hBox(self.controlArea)
        gui.button(
            box, self, "Browse documentation data sets",
            callback=lambda: self.browse_file(True), autoDefault=False)
        gui.rubber(box)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)

        self.apply_button = gui.button(
            box, self, "Apply", callback=self.apply_domain_edit)
        self.apply_button.hide()
        self.apply_button.setFixedWidth(170)
        # The Apply button only appears once the domain has been edited.
        self.editor_model.dataChanged.connect(self.apply_button.show)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)
        QtCore.QTimer.singleShot(0, self.load_data)

        self.setAcceptDrops(True)

    def sizeHint(self):
        """Return the preferred initial size of the widget."""
        return QtCore.QSize(600, 550)

    def select_file(self, n):
        """Load the n-th entry of the recent-files combo."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def select_sheet(self):
        """Remember the chosen sheet for the current file and reload."""
        self.recent_paths[0].sheet = self.sheet_combo.currentText()
        self.load_data()

    def _url_set(self):
        """Switch the source to URL and load from it."""
        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        If `in_demos` is true, the dialog starts in the sample data sets
        directory; otherwise in the last used path or the home directory.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QtGui.QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation data sets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        filename = QtGui.QFileDialog.getOpenFileName(
            self, 'Open Orange Data File', start_file, self.dlg_formats)
        if not filename:
            return
        self.loaded_file = filename
        self.add_path(filename)
        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read the data from the current source and send it."""
        self.reader = self._get_reader()
        self._update_sheet_combo()

        errors = []
        with catch_warnings(record=True) as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                # Any failure raised by the reader is reported in the info
                # box instead of propagating.
                errors.append("An error occurred:")
                errors.append(str(ex))
                data = None
                self.editor_model.reset()
            self.warning(warnings[-1].message.args[0] if warnings else '')

        if data is None:
            # Forget the previously loaded table so that Apply and the
            # report do not act on stale data.
            self.data = None
            self.send("Data", None)
            self.info.setText("\n".join(errors))
            return

        self.info.setText(self._describe(data))

        add_origin(data, self.loaded_file or self.last_path())
        self.send("Data", data)
        self.editor_model.set_domain(data.domain)
        self.data = data

    def _get_reader(self):
        """
        Returns
        -------
        FileFormat
        """
        if self.source == self.LOCAL_FILE:
            reader = FileFormat.get_reader(self.last_path())
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        elif self.source == self.URL:
            return UrlReader(self.url_combo.currentText())

    def _update_sheet_combo(self):
        """Show the sheet selector only when the file has several sheets."""
        if len(self.reader.sheets) < 2:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Make the combo show the reader's sheet, else the first one."""
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
        else:
            self.sheet_combo.setCurrentIndex(0)

    def _describe(self, table):
        """Return the multi-line description shown in the Info box."""
        domain = table.domain
        text = "{} instance(s), {} feature(s), {} meta attribute(s)".format(
            len(table), len(domain.attributes), len(domain.metas))
        if domain.has_continuous_class:
            text += "\nRegression; numerical class."
        elif domain.has_discrete_class:
            text += "\nClassification; discrete class with {} values.".format(
                len(domain.class_var.values))
        elif table.domain.class_vars:
            text += "\nMulti-target; {} target variables.".format(
                len(table.domain.class_vars))
        else:
            text += "\nData has no target variable."
        # An empty table has no first/last row to index.
        if 'Timestamp' in table.domain and len(table):
            # Google Forms uses this header to timestamp responses
            text += '\n\nFirst entry: {}\nLast entry: {}'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp'])
        return text

    def storeSpecificSettings(self):
        """Save a copy of the edited variables into the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if any."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Rebuild the table according to the domain editor and send it."""
        if self.data is None:
            # Nothing is loaded (e.g. the last load failed).
            self.send("Data", None)
            self.apply_button.hide()
            return

        attributes = []
        class_vars = []
        metas = []
        places = [attributes, class_vars, metas]
        X, y, m = [], [], []
        cols = [X, y, m]  # Xcols, Ycols, Mcols

        def is_missing(x):
            return str(x) in ("nan", "")

        # Original variables paired with their role:
        # 0 = attribute, 1 = class variable, 2 = meta.
        originals = chain(
            [(at, 0) for at in self.data.domain.attributes],
            [(cl, 1) for cl in self.data.domain.class_vars],
            [(mt, 2) for mt in self.data.domain.metas])

        for (name, tpe, place, _vals, _is_con), (orig_var, orig_plc) in \
                zip(self.editor_model.variables, originals):
            if place == 3:
                # Place 3 is not one of the three output roles: skip it.
                continue
            if orig_plc == 2:
                col_data = list(chain(*self.data[:, orig_var].metas))
            else:
                col_data = list(chain(*self.data[:, orig_var]))
            if name == orig_var.name and tpe == type(orig_var):
                var = orig_var
            elif tpe == DiscreteVariable:
                values = list(str(i) for i in set(col_data)
                              if not is_missing(i))
                var = tpe(name, values)
                col_data = [np.nan if is_missing(x) else values.index(str(x))
                            for x in col_data]
            elif tpe == StringVariable and type(orig_var) == DiscreteVariable:
                var = tpe(name)
                col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
                            for x in col_data]
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        domain = Domain(attributes, class_vars, metas)
        X = np.array(X).T if len(X) else np.empty((len(self.data), 0))
        y = np.array(y).T if len(y) else None
        # String metas need an object array.
        dtpe = object if any(isinstance(meta, StringVariable)
                             for meta in domain.metas) else float
        m = np.array(m, dtype=dtpe).T if len(m) else None
        table = Table.from_numpy(domain, X, y, m, self.data.W)
        self.send("Data", table)
        self.apply_button.hide()

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Report the data source, its format and the data itself."""
        def get_ext_name(filename):
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~/" + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            if self.sheet_combo.isVisible():
                name += " ({})".format(self.sheet_combo.currentText())
            self.report_items("File", [("File name", name),
                                       ("Format", get_ext_name(name))])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(OSX_NSURL_toLocalFile(urls[0]) or
                                      urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(OSX_NSURL_toLocalFile(urls[0]) or
                          urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()
class OWSelectRows(widget.OWWidget):
    """Split the input table into rows that pass a list of conditions and
    rows that do not.

    Every condition occupies one row of ``cond_list``: column 0 holds the
    variable combo, column 1 the operator combo and column 2 the value
    editor(s).  ``conditions`` stores them as
    ``(variable name, operator index, values)`` tuples.
    """

    name = "Select Rows"
    id = "Orange.widgets.data.file"
    description = "Select rows from the data based on values of variables."
    icon = "icons/SelectRows.svg"
    priority = 100
    category = "Data"
    author = "Peter Juvan, Janez Demšar"
    author_email = "janez.demsar(@at@)fri.uni-lj.si"
    inputs = [("Data", Table, "set_data")]
    outputs = [("Matching Data", Table, widget.Default),
               ("Unmatched Data", Table)]

    want_main_area = False

    settingsHandler = PerfectDomainContextHandler()
    conditions = ContextSetting([])
    update_on_change = Setting(True)
    purge_attributes = Setting(True)
    purge_classes = Setting(True)
    auto_commit = Setting(True)

    # Operator captions per variable type; the index of a caption is the
    # operator code stored in ``conditions``.  The last entry of every list
    # ("is defined") takes no value.
    operator_names = {
        ContinuousVariable: [
            "equals", "is not", "is below", "is at most", "is greater than",
            "is at least", "is between", "is outside", "is defined"
        ],
        DiscreteVariable: ["is", "is not", "is one of", "is defined"],
        StringVariable: [
            "equals", "is not", "is before", "is equal or before", "is after",
            "is equal or after", "is between", "is outside", "contains",
            "begins with", "ends with", "is defined"
        ]
    }

    def __init__(self):
        """Build the condition table, the info labels and the purge/commit
        controls, then initialise the widget with no data."""
        super().__init__()

        self.old_purge_classes = True

        self.conditions = []
        self.last_output_conditions = None
        self.data = None
        self.data_desc = self.match_desc = self.nonmatch_desc = None

        # Table of conditions.
        cond_box = gui.widgetBox(self.controlArea, 'Conditions', stretch=100)
        table = self.cond_list = QtGui.QTableWidget(cond_box)
        cond_box.layout().addWidget(table)
        table.setShowGrid(False)
        table.setSelectionMode(QtGui.QTableWidget.NoSelection)
        table.setColumnCount(3)
        table.setRowCount(0)
        table.verticalHeader().hide()
        table.horizontalHeader().hide()
        table.resizeColumnToContents(0)
        table.horizontalHeader().setResizeMode(QtGui.QHeaderView.Stretch)
        table.viewport().setBackgroundRole(QtGui.QPalette.Window)

        # Buttons below the table.
        button_row = gui.widgetBox(cond_box, orientation="horizontal")
        self.add_button = gui.button(
            button_row, self, "Add condition", callback=self.add_row)
        self.add_all_button = gui.button(
            button_row, self, "Add all variables", callback=self.add_all)
        self.remove_all_button = gui.button(
            button_row, self, "Remove all", callback=self.remove_all)
        gui.rubber(button_row)

        # Input / output summaries.
        info_row = gui.widgetBox(self.controlArea, '',
                                 orientation="horizontal")
        box_data_in = gui.widgetBox(info_row, 'Data In')
        self.data_in_variables = gui.widgetLabel(box_data_in, " ")
        gui.rubber(box_data_in)

        box_data_out = gui.widgetBox(info_row, 'Data Out')
        self.data_out_rows = gui.widgetLabel(box_data_out, " ")
        gui.rubber(box_data_out)

        # Purging options and the commit button.
        bottom_row = gui.widgetBox(self.controlArea, orientation="horizontal")
        purge_box = gui.widgetBox(bottom_row, 'Purging')
        self.cb_pa = gui.checkBox(
            purge_box, self, "purge_attributes", "Remove unused features",
            callback=self.conditions_changed)
        gui.separator(purge_box, height=1)
        self.cb_pc = gui.checkBox(
            purge_box, self, "purge_classes", "Remove unused classes",
            callback=self.conditions_changed)
        gui.auto_commit(bottom_row, self, "auto_commit", label="Commit",
                        checkbox_label="Commit on change")

        self.set_data(None)
        self.resize(600, 400)

    def add_row(self, attr=None, condition_type=None, condition_value=None):
        """Append a condition row.

        ``attr`` is the index of the variable to preselect (first variable
        when ``None``); ``condition_type`` and ``condition_value`` are passed
        on to preselect the operator and its value(s).
        """
        table_model = self.cond_list.model()
        new_row = table_model.rowCount()
        table_model.insertRow(new_row)

        combo = QtGui.QComboBox(
            minimumContentsLength=12,
            sizeAdjustPolicy=(
                QtGui.QComboBox.AdjustToMinimumContentsLengthWithIcon))
        combo.row = new_row
        domain = self.data.domain
        for variable in chain(domain.variables, domain.metas):
            combo.addItem(*gui.attributeItem(variable))
        combo.setCurrentIndex(attr or 0)
        self.cond_list.setCellWidget(new_row, 0, combo)

        self.remove_all_button.setDisabled(False)
        # An explicit ``attr`` means the row is being restored or bulk-added,
        # so the downstream helpers must not trigger ``conditions_changed``.
        self.set_new_operators(combo, attr is not None,
                               condition_type, condition_value)
        combo.currentIndexChanged.connect(
            lambda _: self.set_new_operators(combo, False))

        self.cond_list.resizeRowToContents(new_row)

    def add_all(self):
        """Replace the current conditions with one row per variable, asking
        for confirmation first when conditions already exist."""
        if self.cond_list.rowCount():
            Mb = QtGui.QMessageBox
            answer = Mb.question(
                self, "Remove existing filters",
                "This will replace the existing filters with "
                "filters for all variables.", Mb.Ok | Mb.Cancel)
            if answer != Mb.Ok:
                return
            self.remove_all()
        domain = self.data.domain
        n_columns = len(domain.variables) + len(domain.metas)
        for index in range(n_columns):
            self.add_row(index)

    def remove_all(self):
        """Remove every condition row and propagate the change."""
        self.remove_all_rows()
        self.conditions_changed()

    def remove_all_rows(self):
        """Empty the condition table without touching ``self.conditions``."""
        self.cond_list.clear()
        self.cond_list.setRowCount(0)
        self.remove_all_button.setDisabled(True)

    def set_new_operators(self, attr_combo, adding_all,
                          selected_index=None, selected_values=None):
        """Create the operator combo for the row owned by ``attr_combo``.

        The combo is filled with the operators for the type of the selected
        variable, and the value editor is rebuilt before the combo is placed
        into column 1.
        """
        combo = QtGui.QComboBox()
        combo.row = attr_combo.row
        combo.attr_combo = attr_combo
        variable = self.data.domain[attr_combo.currentText()]
        combo.addItems(self.operator_names[type(variable)])
        combo.setCurrentIndex(selected_index or 0)
        self.set_new_values(combo, adding_all, selected_values)
        self.cond_list.setCellWidget(combo.row, 1, combo)
        combo.currentIndexChanged.connect(
            lambda _: self.set_new_values(combo, False))

    @staticmethod
    def _get_lineedit_contents(box):
        """Return the texts of all line edits among ``box.controls`` (or of
        ``box`` itself when it has no ``controls``)."""
        texts = []
        for child in getattr(box, "controls", [box]):
            if isinstance(child, QtGui.QLineEdit):
                texts.append(child.text())
        return texts

    @staticmethod
    def _get_value_contents(box):
        """Collect the value(s) entered in a value cell as a tuple.

        Line edits contribute their text, combos their current index and
        tool buttons the 1-based rows of the checked items of their popup;
        ``None`` children are ignored.

        Raises ``TypeError`` for any other kind of child.
        """
        contents = []
        checked_names = []
        for child in getattr(box, "controls", [box]):
            if child is None:
                continue
            if isinstance(child, QtGui.QLineEdit):
                contents.append(child.text())
            elif isinstance(child, QtGui.QComboBox):
                contents.append(child.currentIndex())
            elif isinstance(child, QtGui.QToolButton):
                if child.popup is not None:
                    item_model = child.popup.list_view.model()
                    for row in range(item_model.rowCount()):
                        item = item_model.item(row)
                        if item.checkState():
                            contents.append(row + 1)
                            checked_names.append(item.text())
                    # Refresh the caption shown on the button.
                    child.desc_text = ', '.join(checked_names)
                    child.set_text()
            else:
                raise TypeError('Type %s not supported.' % type(child))
        return tuple(contents)

    class QDoubleValidatorEmpty(QtGui.QDoubleValidator):
        """Double validator that also accepts an empty input."""

        def validate(self, input_, pos):
            if input_:
                return super().validate(input_, pos)
            return (QtGui.QDoubleValidator.Acceptable, input_, pos)

    def set_new_values(self, oper_combo, adding_all, selected_values=None):
        """Rebuild the value editor (column 2) of the row of ``oper_combo``.

        Text already typed into the previous editor is carried over when the
        variable type did not change.  ``conditions_changed`` is called at
        the end unless ``adding_all`` is true.
        """
        # Both helpers add to ``box``, which is re-bound further below before
        # either of them is called.
        def add_textual(contents):
            edit = gui.lineEdit(box, self, None)
            if contents:
                edit.setText(contents)
            edit.setAlignment(QtCore.Qt.AlignRight)
            edit.editingFinished.connect(self.conditions_changed)
            return edit

        def add_numeric(contents):
            edit = add_textual(contents)
            edit.setValidator(OWSelectRows.QDoubleValidatorEmpty())
            return edit

        var = self.data.domain[oper_combo.attr_combo.currentText()]
        box = self.cond_list.cellWidget(oper_combo.row, 2)
        if selected_values is None:
            lc = ["", ""]
        else:
            padded = list(selected_values) + ["", ""]
            lc = [str(value) for value in padded[:2]]
        if box and vartype(var) == box.var_type:
            lc = self._get_lineedit_contents(box) + lc

        oper = oper_combo.currentIndex()
        if oper == oper_combo.count() - 1:
            # Last operator ("is defined") has no value editor.
            self.cond_list.removeCellWidget(oper_combo.row, 2)
        elif var.is_discrete:
            if oper_combo.currentText() == "is one of":
                if selected_values:
                    lc = list(selected_values)
                button = DropDownToolButton(self, var, lc)
                button.var_type = vartype(var)
                self.cond_list.setCellWidget(oper_combo.row, 2, button)
            else:
                combo = QtGui.QComboBox()
                combo.addItems([""] + var.values)
                combo.setCurrentIndex(int(lc[0]) if lc[0] else 0)
                combo.var_type = vartype(var)
                self.cond_list.setCellWidget(oper_combo.row, 2, combo)
                combo.currentIndexChanged.connect(self.conditions_changed)
        else:
            box = gui.widgetBox(self, orientation="horizontal",
                                addToLayout=False)
            box.var_type = vartype(var)
            self.cond_list.setCellWidget(oper_combo.row, 2, box)
            if var.is_continuous:
                box.controls = [add_numeric(lc[0])]
                if oper > 5:
                    # "is between" / "is outside" need a second bound.
                    gui.widgetLabel(box, " and ")
                    box.controls.append(add_numeric(lc[1]))
                gui.rubber(box)
            elif var.is_string:
                box.controls = [add_textual(lc[0])]
                if oper in [6, 7]:
                    gui.widgetLabel(box, " and ")
                    box.controls.append(add_textual(lc[1]))
            else:
                box.controls = []
        if not adding_all:
            self.conditions_changed()

    def set_data(self, data):
        """Input handler: store ``data``, restore the conditions saved for
        its domain (or add one empty condition) and commit."""
        self.closeContext()
        self.data = data
        not_sql = not isinstance(data, SqlTable)
        self.cb_pa.setEnabled(not_sql)
        self.cb_pc.setEnabled(not_sql)
        self.remove_all_rows()
        self.add_button.setDisabled(data is None)
        self.add_all_button.setDisabled(
            data is None or
            len(data.domain.variables) + len(data.domain.metas) > 100)
        if not data:
            self.data_desc = None
            self.commit()
            return

        self.data_desc = report.describe_data_brief(data)
        self.conditions = []
        try:
            self.openContext(data)
        except Exception:
            pass

        if not self.conditions and len(data.domain.variables):
            self.add_row()
        self.update_info(data, self.data_in_variables)

        # Re-create the rows for the restored conditions whose variable
        # still exists in the new domain.
        known_names = [
            var.name for var in data.domain.variables + data.domain.metas]
        for attr, cond_type, cond_value in self.conditions:
            if attr in known_names:
                self.add_row(known_names.index(attr), cond_type, cond_value)
        self.unconditional_commit()

    def conditions_changed(self):
        """Re-read ``self.conditions`` from the table and commit if needed."""
        try:
            self.conditions = []
            table = self.cond_list
            collected = []
            for row in range(table.rowCount()):
                collected.append((
                    table.cellWidget(row, 0).currentText(),
                    table.cellWidget(row, 1).currentIndex(),
                    self._get_value_contents(table.cellWidget(row, 2))))
            self.conditions = collected
            unchanged = (self.last_output_conditions is not None and
                         self.last_output_conditions == self.conditions)
            if self.update_on_change and not unchanged:
                self.commit()
        except AttributeError:
            # Attribute error appears if the signal is triggered when the
            # controls are being constructed
            pass

    def commit(self):
        """Apply the conditions to the data and send both outputs.

        Continuous conditions with an empty value and discrete conditions
        without a chosen value are skipped.  Raises ``ValueError`` for an
        unknown discrete operator index.
        """
        matching_output = self.data
        non_matching_output = None
        if self.data:
            domain = self.data.domain
            active_filters = []
            for attr_name, oper, values in self.conditions:
                attr_index = domain.index(attr_name)
                attr = domain[attr_index]
                if attr.is_continuous:
                    if any(not v for v in values):
                        continue
                    numbers = [float(v) for v in values]
                    cond = data_filter.FilterContinuous(
                        attr_index, oper, *numbers)
                elif attr.is_string:
                    texts = [str(v) for v in values]
                    cond = data_filter.FilterString(attr_index, oper, *texts)
                elif oper == 3:
                    # "is defined"
                    cond = data_filter.FilterDiscrete(attr_index, None)
                else:
                    if not values or not values[0]:
                        continue
                    # Stored indices are 1-based.
                    chosen = [attr.values[i - 1] for i in values]
                    if oper == 0:
                        f_values = {chosen[0]}
                    elif oper == 1:
                        f_values = set(attr.values)
                        f_values.remove(chosen[0])
                    elif oper == 2:
                        f_values = set(chosen)
                    else:
                        raise ValueError("invalid operand")
                    cond = data_filter.FilterDiscrete(attr_index, f_values)
                active_filters.append(cond)

            if active_filters:
                filters = data_filter.Values(active_filters)
                matching_output = filters(self.data)
                filters.negate = True
                non_matching_output = filters(self.data)

            purge_attrs = self.purge_attributes
            purge_classes = self.purge_classes
            if not isinstance(self.data, SqlTable) and \
                    (purge_attrs or purge_classes):
                attr_flags = sum([
                    Remove.RemoveConstant * purge_attrs,
                    Remove.RemoveUnusedValues * purge_attrs
                ])
                class_flags = sum([
                    Remove.RemoveConstant * purge_classes,
                    Remove.RemoveUnusedValues * purge_classes
                ])
                # same settings used for attributes and meta features
                remover = Remove(attr_flags, class_flags, attr_flags)

                matching_output = remover(matching_output)
                non_matching_output = remover(non_matching_output)

        self.send("Matching Data", matching_output)
        self.send("Unmatched Data", non_matching_output)

        self.match_desc = report.describe_data_brief(matching_output)
        self.nonmatch_desc = report.describe_data_brief(non_matching_output)

        self.update_info(matching_output, self.data_out_rows)

    def update_info(self, data, lab1):
        """Show the approximate size of ``data`` in the label ``lab1``; the
        label is cleared when ``data`` is ``None``."""
        def sp(s, capitalize=True):
            # (count or "No"/"no", plural suffix)
            return s or ("No" if capitalize else "no"), "s" * (s != 1)

        if data is None:
            lab1.setText("")
            return
        n_columns = len(data.domain.variables) + len(data.domain.metas)
        lab1.setText("~%s row%s, %s variable%s" %
                     (sp(data.approx_len()) + sp(n_columns)))

    def send_report(self):
        """Report the input data, the conditions and the output sizes."""
        if not self.data:
            self.report_paragraph("No data.")
            return

        # The domains are described separately only when the non-empty
        # descriptions differ in something other than the instance count.
        previous = None
        describe_domain = False
        for desc in (self.data_desc, self.match_desc, self.nonmatch_desc):
            if not desc or not desc["Data instances"]:
                continue
            stripped = desc.copy()
            del stripped["Data instances"]
            if previous is not None and previous != stripped:
                describe_domain = True
            previous = stripped

        phrases = []
        domain = self.data.domain
        for attr_name, oper, values in self.conditions:
            attr = domain[domain.index(attr_name)]
            names = self.operator_names[type(attr)]
            name = names[oper]
            if oper == len(names) - 1:
                phrases.append("{} {}".format(attr, name))
            elif attr.is_discrete:
                if name == "is one of":
                    if len(values) == 1:
                        phrases.append("{} is {}".format(
                            attr, attr.values[values[0] - 1]))
                    elif len(values) > 1:
                        leading = ", ".join(
                            attr.values[v - 1] for v in values[:-1])
                        phrases.append("{} is {} or {}".format(
                            attr, leading, attr.values[values[-1] - 1]))
                else:
                    if not (values and values[0]):
                        continue
                    phrases.append("{} {} {}".format(
                        attr, name, attr.values[values[0] - 1]))
            elif len(values) == 1:
                phrases.append("{} {} {}".format(attr, name, *values))
            else:
                phrases.append("{} {} {} and {}".format(attr, name, *values))

        items = OrderedDict()
        if describe_domain:
            items.update(self.data_desc)
        else:
            items["Instances"] = self.data_desc["Data instances"]
        items["Condition"] = " AND ".join(phrases) or "no conditions"
        self.report_items("Data", items)

        if describe_domain:
            self.report_items("Matching data", self.match_desc)
            self.report_items("Non-matching data", self.nonmatch_desc)
            return

        match_inst = \
            bool(self.match_desc) and self.match_desc["Data instances"]
        nonmatch_inst = \
            bool(self.nonmatch_desc) and self.nonmatch_desc["Data instances"]
        matching_text = \
            "{} instances".format(match_inst) if match_inst else "None"
        self.report_items(
            "Output",
            (("Matching data", matching_text),
             ("Non-matching data",
              nonmatch_inst > 0 and "{} instances".format(nonmatch_inst))))
class OWAlignDatasets(widget.OWWidget): name = "Align Datasets" description = "Alignment of multiple datasets with a diagram of correlation visualization." icon = "icons/AlignDatasets.svg" priority = 240 class Inputs: data = Input("Data", Table) class Outputs: transformed_data = Output("Transformed Data", Table) genes_components = Output("Genes per n. Components", Table) settingsHandler = PerfectDomainContextHandler() axis_labels = ContextSetting(10) source_id = ContextSetting(None) ncomponents = ContextSetting(20) ngenes = ContextSetting(30) scoring = ContextSetting(list(SCORINGS.keys())[0]) quantile_normalization = ContextSetting(False) quantile_normalization_perc = ContextSetting(2.5) dynamic_time_warping = ContextSetting(False) auto_update = Setting(True) auto_commit = Setting(True) graph_name = "plot.plotItem" class Error(widget.OWWidget.Error): no_features = widget.Msg("At least 1 feature is required") no_instances = widget.Msg( "At least 2 data instances are required for each class") no_class = widget.Msg("At least 1 Discrete class variable is required") nan_class = widget.Msg( "Data contains undefined instances for the selected Data source indicator" ) nan_input = widget.Msg("Input data contains non numeric values") sparse_data = widget.Msg("Sparse data is not supported") only_one_dataset = widget.Msg( "Data source indicator attribute column must indicate at least two datasets." 
) def __init__(self): super().__init__() self.data = None self.source_id = None self._mas = None self._Ws = None self._transformed = None self._components = None self._use_genes = None self._shared_correlations = None self._transformed_table = None self._line = False self._feature_model = DomainModel(valid_types=DiscreteVariable, separators=False) self._feature_model.set_domain(None) self._init_mas() self._legend = None form = QFormLayout(labelAlignment=Qt.AlignLeft, formAlignment=Qt.AlignLeft, fieldGrowthPolicy=QFormLayout.AllNonFixedFieldsGrow, verticalSpacing=10) # Data source indicator box = gui.vBox(self.controlArea, "Data source indicator") gui.comboBox( box, self, "source_id", sendSelectedValue=True, callback=self._update_combo_source_id, model=self._feature_model, ) # Canonical correlation analysis box = gui.vBox(self.controlArea, "Canonical correlation analysis") gui.spin(box, self, "ncomponents", 1, MAX_COMPONENTS, callback=self._update_selection_component_spin, keyboardTracking=False, label="Num. of components") # Shared genes box = gui.vBox(self.controlArea, "Shared genes") gui.spin( box, self, "ngenes", 1, MAX_GENES, callback=self._update_ngenes_spin, keyboardTracking=False, ) form.addRow("Num. 
of genes", self.controls.ngenes) gui.comboBox( box, self, "scoring", callback=self._update_scoring_combo, items=list(SCORINGS.keys()), sendSelectedValue=True, editable=False, ) form.addRow("Scoring:", self.controls.scoring) box.layout().addLayout(form) # Post-processing box = gui.vBox(self.controlArea, "Post-processing") gui.doubleSpin( box, self, "quantile_normalization_perc", minv=0, maxv=49, step=5e-1, callback=self._update_quantile_normalization, checkCallback=self._update_quantile_normalization, controlWidth=80, alignment=Qt.AlignRight, label="Quantile normalization", checked="quantile_normalization", ) self.controls.quantile_normalization_perc.setSuffix("%") b = gui.vBox(box) gui.checkBox(b, self, "dynamic_time_warping", callback=self._update_dynamic_time_warping, label="Dynamic time warping") self.controlArea.layout().addStretch() gui.auto_commit(self.controlArea, self, "auto_commit", "Apply", callback=self._invalidate_selection(), checkbox_label="Apply automatically") self.plot = pg.PlotWidget(background="w") axis = self.plot.getAxis("bottom") axis.setLabel("Correlation components") axis = self.plot.getAxis("left") axis.setLabel("Correlation strength") self.plot_horlabels = [] self.plot_horlines = [] self.plot.getViewBox().setMenuEnabled(False) self.plot.getViewBox().setMouseEnabled(False, False) self.plot.showGrid(True, True, alpha=0.5) self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0)) self.mainArea.layout().addWidget(self.plot) @Inputs.data @check_sql_input def set_data(self, data): self.closeContext() self.clear_messages() self.clear() self.information() self.clear_outputs() self._feature_model.set_domain(None) self.data = data if self.data: self._feature_model.set_domain(self.data.domain) if self._feature_model: # if source id is available we assume that it is the feature that describes a dataset source if "Source ID" in self.data.domain: self.source_id = self.data.domain["Source ID"] self.openContext(self.data.domain) if self.source_id is None 
or self.source_id == '': for model in self._feature_model: y = np.array(self.data.get_column_view(model)[0], dtype=np.float64) _, counts = np.unique(y, return_counts=True) if np.isfinite(y).all() and min(counts) > 1: self.source_id = model self._reset_max_components() break if not self.source_id: self.Error.nan_class() return if len(self.data.domain.attributes) == 0: self.Error.no_features() return if len(self.data) == 0: self.Error.no_instances() return if np.isnan(self.data.X).any(): self.Error.nan_input() return y = np.array(self.data.get_column_view(self.source_id)[0], dtype=np.float64) _, counts = np.unique(y, return_counts=True) if min(counts) < 2: self.Error.no_instances() return self._reset_max_components() self.fit() else: self.Error.no_class() self.clear() return def fit(self): if self.data is None: return global MAX_COMPONENTS if self.ncomponents > MAX_COMPONENTS: self.ncomponents = MAX_COMPONENTS X = self.data.X y = self.data.get_column_view(self.source_id)[0] if len(set(y)) < 2: self.Error.only_one_dataset() return self._init_mas() self._Ws = self._mas.fit(X, y) self._shared_correlations = self._mas.shared_correlations if np.isnan(np.sum(self._shared_correlations)): self._shared_correlations = np.array( [interpolate_nans(x) for x in self._shared_correlations]) self._use_genes = self._mas.use_genes self._setup_plot() if self.auto_commit: self.commit() def clear(self): self.data = None self.source_id = None self._mas = None self._Ws = None self._transformed = None self._transformed_table = None self._components = None self._use_genes = None self._shared_correlations = None self._feature_model.set_domain(None) self.clear_plot() def clear_legend(self): if self._legend is None: return scene = self._legend.scene() if scene is None: return scene.removeItem(self._legend) self._legend = None def clear_plot(self): self.clear_legend() self._line = False self.plot_horlabels = [] self.plot_horlines = [] self._mas = None self._setup_plot() def clear_outputs(self): 
self.Outputs.transformed_data.send(None) self.Outputs.genes_components.send(None) def _reset_max_components(self): y = np.array(self.data.get_column_view(self.source_id)[0], dtype=np.float64) _, counts = np.unique(y, return_counts=True) global MAX_COMPONENTS if min(counts) < MAX_COMPONENTS_DEFAULT or len( self.data.domain.attributes) < MAX_COMPONENTS_DEFAULT: MAX_COMPONENTS = min(min(counts), len( self.data.domain.attributes)) - 1 if self.ncomponents > MAX_COMPONENTS: self.ncomponents = MAX_COMPONENTS // 2 self.controls.ncomponents.setMaximum(MAX_COMPONENTS) else: MAX_COMPONENTS = MAX_COMPONENTS_DEFAULT self.ncomponents = 20 self.controls.ncomponents.setMaximum(MAX_COMPONENTS) def _init_mas(self): self._mas = SeuratAlignmentModel( n_components=MAX_COMPONENTS, n_metagenes=self.ngenes, gene_scoring=SCORINGS[self.scoring], ) def get_model(self): if self.data is None: return self.fit() self._setup_plot() self.commit() def _setup_plot(self): self.plot.clear() if self._mas is None: return shared_correlations = self._shared_correlations p = MAX_COMPONENTS # Colors chosen based on: http://colorbrewer2.org/?type=qualitative&scheme=Set1&n=9 colors = [ '#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33', '#a65628', '#f781bf', '#999999' ] self.clear_legend() self._legend = self.plot.addLegend(offset=(-1, 1)) # correlation lines offset = 2 if MAX_COMPONENTS > 2 * offset + 1: smoothed_correlations = smooth_correlations(shared_correlations, offset=offset) else: smoothed_correlations = shared_correlations plotitem = dict() for i, corr in enumerate(smoothed_correlations): plotitem[i] = self.plot.plot( np.arange(p), corr, pen=pg.mkPen(QColor(colors[i]), width=2), antialias=True) # name=self.source_id.values[i] # self.plot.plotItem.legend.addItem(3, "maximum value") for i in range(len(plotitem)): self._legend.addItem( MyLegendItem(pg.ScatterPlotItem(pen=colors[i])), self.source_id.values[i]) # vertical movable line cutpos = self.ncomponents - 1 self._line = 
pg.InfiniteLine(angle=90, pos=cutpos, movable=True, bounds=(0, p - 1)) self._line.setCursor(Qt.SizeHorCursor) self._line.setPen(pg.mkPen(QColor(Qt.black), width=2)) self._line.sigPositionChanged.connect(self._on_cut_changed) self.plot.addItem(self._line) # horizontal lines self.plot_horlines = tuple( pg.PlotCurveItem( pen=pg.mkPen(QColor(colors[i]), style=Qt.DashLine)) for i in range(len(shared_correlations))) self.plot_horlabels = tuple( pg.TextItem(color=QColor('k'), anchor=(0, 1)) for _ in range(len(shared_correlations))) for item in self.plot_horlabels + self.plot_horlines: self.plot.addItem(item) self._set_horline_pos() # self.plot.setRange(xRange=(0.0, p - 1), yRange=(0.0, 1.0)) self.plot.setXRange(0.0, p - 1, padding=0) self.plot.setYRange(0.0, 1.0, padding=0) self._update_axis() def _set_horline_pos(self): cutidx = self.ncomponents - 1 for line, label, curve in zip(self.plot_horlines, self.plot_horlabels, self._shared_correlations): y = curve[cutidx] line.setData([-1, cutidx], 2 * [y]) label.setPos(cutidx, y) label.setPlainText("{:.3f}".format(y)) def _on_cut_changed(self, line): # cut changed by means of a cut line over the scree plot. value = int(round(line.value())) components = value + 1 if not (self.ncomponents == 0 and components == len(self._components)): self.ncomponents = components self._line.setValue(value) self._set_horline_pos() self.commit() def _update_selection_component_spin(self): # cut changed by "ncomponents" spin. 
if self._mas is None: self._invalidate_selection() return if np.floor(self._line.value()) + 1 != self.ncomponents: self._line.setValue(self.ncomponents - 1) self.commit() def _invalidate_selection(self): if self.data is not None: self._transformed = None self.commit() def _update_scoring_combo(self): self.fit() self._invalidate_selection() def _update_dynamic_time_warping(self): self._invalidate_selection() def _update_quantile_normalization(self): self._invalidate_selection() def _update_ngenes_spin(self): self.clear_plot() if self.data is None: return if self._has_nan_classes(): self.Error.nan_class() return self.clear_messages() self.fit() self._invalidate_selection() def _update_combo_source_id(self): self.clear_plot() if self.data is None: return y = np.array(self.data.get_column_view(self.source_id)[0], dtype=np.float64) _, counts = np.unique(y, return_counts=True) if min(counts) < 2: self.Error.no_instances() return self._reset_max_components() if self._has_nan_classes(): self.Error.nan_class() return self.clear_messages() self.fit() self._invalidate_selection() def _update_axis(self): p = MAX_COMPONENTS axis = self.plot.getAxis("bottom") d = max((p - 1) // (self.axis_labels - 1), 1) axis.setTicks([[(i, str(i + 1)) for i in range(0, p, d)]]) def _has_nan_classes(self): y = np.array(self.data.get_column_view(self.source_id)[0], dtype=np.float64) return not np.isfinite(y).all() def commit(self): transformed_table = meta_genes = None if self._mas is not None: # Compute the full transform (MAX_COMPONENTS components) only once. 
if self._transformed is None: X = self.data.X y = self.data.get_column_view(self.source_id)[0] self._transformed = self._mas.transform( X, y, normalize=self.quantile_normalization, quantile=self.quantile_normalization_perc, dtw=self.dynamic_time_warping) attributes = tuple( ContinuousVariable.make("CCA{}".format(x + 1)) for x in range(MAX_COMPONENTS)) dom = Domain(attributes, self.data.domain.class_vars, self.data.domain.metas) # Meta-genes meta_genes = self.data.transform(dom) genes_components = np.zeros( (self.data.X.shape[1], MAX_COMPONENTS)) for key, genes in self._mas.use_genes.items(): for gene in genes: genes_components[gene - 1, key] = genes.index(gene) + 1 genes_components[genes_components == 0] = np.NaN meta_genes.X = genes_components self.meta_genes = Table.from_numpy(Domain(attributes), genes_components) # Transformed data transformed = self._transformed new_domain = add_columns(self.data.domain, attributes=attributes) transformed_table_temp = self.data.transform(new_domain) transformed_table_temp.X[:, -MAX_COMPONENTS:] = transformed self.transformed_table = Table.from_table( dom, transformed_table_temp) ncomponents_attributes = tuple( ContinuousVariable.make("CCA{}".format(x + 1)) for x in range(self.ncomponents)) ncomponents_domain = Domain(ncomponents_attributes, self.data.domain.class_vars, self.data.domain.metas) meta_genes = self.meta_genes.transform( Domain(ncomponents_attributes)) transformed_table = self.transformed_table.transform( ncomponents_domain) self.Outputs.transformed_data.send(transformed_table) self.Outputs.genes_components.send(meta_genes) def send_report(self): if self.data is None: return self.report_items( (("Source ID", self.source_id), ("Selected num. of components", self.ncomponents), ("Selected num. 
of genes", self.ngenes), ("Scoring", self.scoring), ("Quantile normalization", True if self.quantile_normalization else "False"), ("Quantile normalization percentage", self.quantile_normalization_perc if self.quantile_normalization else False), ("Dynamic time warping", True if self.dynamic_time_warping else "False"))) self.report_plot() """
class OWLoadCorpus(OWWidget):
    """Load a corpus from a file and let the user choose which string meta
    variables are used as text features."""

    name = "Corpus"
    description = "Load a corpus of text documents, (optionally) tagged with categories."
    icon = "icons/TextFile.svg"
    priority = 10

    outputs = [(Output.CORPUS, Corpus)]
    want_main_area = False
    resizing_enabled = False

    # File-dialog filter: one entry matching every known extension, followed
    # by one entry per reader in the order the readers are registered.
    dlgFormats = ("All readable files ({});;".format(
        '*' + ' *'.join(FileFormat.readers.keys())) + ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_files = Setting([])
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("Can't read file {} ({})")

    def __init__(self):
        """Build the file chooser, the info label and the two feature
        lists, then select the first recent file."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "Corpus file", orientation=0)
        widget = widgets.FileWidget(
            recent_files=self.recent_files,
            icon_size=(16, 16),
            on_open=self.open_file,
            directory_aliases={
                "Browse documentation corpora ...": get_sample_corpora_dir()
            },
            dialog_format=self.dlgFormats,
            dialog_title='Open Orange Document Corpus',
            allow_empty=False,
            reload_label='Reload',
            browse_label='Browse')
        fbox.layout().addWidget(widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "Corpus info", addSpace=True)
        corp_info = "Corpus of 0 documents."
        self.info_label = gui.label(ibox, self, corp_info)

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "Used text features", addSpace=True)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # Any change of the "used" list re-sends the corpus.
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=True)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # load first file
        widget.select(0)

    def open_file(self, path):
        """Load the corpus at ``path`` and fill both feature lists.

        Any failure while loading is shown through ``Error.read_file``
        instead of being raised.  A falsy ``path`` only empties the lists.
        """
        self.closeContext()
        self.Error.read_file.clear()
        self.used_attrs_model[:] = []
        self.unused_attrs_model[:] = []
        if path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
                self.info_label.setText("Corpus of {} documents.".format(
                    len(self.corpus)))
                self.used_attrs = list(self.corpus.text_features)
                self.openContext(self.corpus)
                self.used_attrs_model.extend(self.used_attrs)
                self.unused_attrs_model.extend([
                    f for f in self.corpus.domain.metas
                    if f.is_string and f not in self.used_attrs_model
                ])
            # ``Exception`` rather than ``BaseException``, so that
            # KeyboardInterrupt / SystemExit are not turned into a
            # "can't read file" message.
            except Exception as err:
                self.Error.read_file(path, str(err))

    def update_feature_selection(self):
        """Apply the "used" list to the corpus and send the corpus."""
        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        def remove_duplicates(items):
            unique = []
            for item in items:
                if item not in unique:
                    unique.append(item)
            return unique

        if self.corpus is not None:
            self.corpus.set_text_features(
                remove_duplicates(self.used_attrs_model))
            self.send(Output.CORPUS, self.corpus)
            self.used_attrs = list(self.used_attrs_model)
class OWMultifile(widget.OWWidget, RelocatablePathsWidgetMixin):
    """Widget that reads several files and outputs one concatenated table.

    The list of files is kept in ``recent_paths``. Every change to that list,
    to the selected sheet or to the label triggers ``load_data``, which reads
    all files again, concatenates them with ``concatenate_data`` and sends the
    result (after applying the domain editor) to the ``data`` output.
    """

    name = "Multifile"
    id = "orangecontrib.spectroscopy.widgets.files"
    icon = "icons/multifile.svg"
    description = "Read data from input files " \
                  "and send a data table to the output."
    priority = 10000
    replaces = [
        "orangecontrib.infrared.widgets.owfiles.OWFiles",
        "orangecontrib.infrared.widgets.owmultifile.OWMultifile"
    ]

    class Outputs:
        data = Output("Data", Table, doc="Concatenated input files.")

    want_main_area = False

    # Bound to the list box created in __init__ (selected rows).
    file_idx = []

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_paths: List[RecentPath]
    variables: list

    sheet = Setting(None, schema_only=True)
    label = Setting("", schema_only=True)
    recent_paths = Setting([], schema_only=True)
    variables = ContextSetting([], schema_only=True)

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File(s) not found.")
        missing_reader = widget.Msg("Missing reader(s).")
        read_error = widget.Msg("Read error(s).")

    domain_editor = SettingProvider(DomainEditor)

    def __init__(self):
        """Build the GUI, list the stored paths and load them."""
        widget.OWWidget.__init__(self)
        RelocatablePathsWidgetMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        # List of (sheet name or None, combo label) pairs; see
        # _update_sheet_combo.
        self.sheets = []

        self.lb = gui.listBox(self.controlArea, self, "file_idx",
                              selectionMode=QListWidget.MultiSelection)
        # Foreground brush of an unmarked list item; captured lazily in
        # load_data so that error colouring can be undone.
        self.default_foreground = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)

        file_button = gui.button(None, self, ' ...',
                                 callback=self.browse_files,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 0)

        remove_button = gui.button(None, self, 'Remove',
                                   callback=self.remove_item)
        clear_button = gui.button(None, self, 'Clear', callback=self.clear)
        layout.addWidget(remove_button, 0, 1)
        layout.addWidget(clear_button, 0, 2)

        reload_button = gui.button(None, self, "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 7)

        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_index = 0
        self.sheet_combo = gui.comboBox(None, self, "sheet_index",
                                        callback=self.select_sheet)
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        # NOTE(review): the sheet box is added to the grid twice (cell 2,1
        # here and cell 0,5 below); presumably only the later placement is
        # effective -- confirm before removing either call.
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        layout.addWidget(self.sheet_box, 0, 5)

        label_box = gui.hBox(None, addToLayout=False, margin=0)
        gui.lineEdit(label_box, self, "label", callback=self.set_label,
                     label="Label", orientation=Qt.Horizontal)
        layout.addWidget(label_box, 0, 6)

        layout.setColumnStretch(3, 2)

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        for rp in self.recent_paths:
            self.lb.addItem(rp.abspath)

        box = gui.hBox(self.controlArea)
        gui.rubber(box)
        if hasattr(DomainEditor, "reset_domain"):  # Orange>=3.21
            gui.button(box, self, "Reset", callback=self.reset_domain_edit)
        self.apply_button = gui.button(box, self, "Apply",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Any edit in the domain editor makes "Apply" available again.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self._update_sheet_combo()
        self.load_data()

    def set_label(self):
        """Reload the data after the label line edit changed."""
        self.load_data()

    def _select_active_sheet(self):
        """Point the sheet combo at ``self.sheet``.

        If ``self.sheet`` is not among the known sheets it is reset to None;
        if no sheet is set, the first combo entry is selected.
        """
        if self.sheet:
            try:
                sheet_list = [s[0] for s in self.sheets]
                idx = sheet_list.index(self.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.sheet = None
        else:
            self.sheet_combo.setCurrentIndex(0)

    def _update_sheet_combo(self):
        """Collect sheet names from all listed files and refresh the combo.

        Each entry is labelled with the sheet name and the number of files
        that provide it. With fewer than two distinct sheets the sheet box is
        hidden and ``self.sheet`` is cleared.
        """
        sheets = Counter()
        for rp in self.recent_paths:
            try:
                reader = _get_reader(rp)
                sheets.update(reader.sheets)
            except Exception:  # pylint: disable=broad-except
                # Best effort: a file whose reader or sheets cannot be
                # obtained simply contributes no sheets. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                pass

        sheets = sorted(sheets.items(), key=lambda x: x[0])

        self.sheets = [(s, s + " (" + str(n) + ")") for s, n in sheets]

        if len(sheets) < 2:
            self.sheet_box.hide()
            self.sheet = None
        else:
            self.sheets.insert(0, (None, "(None)"))
            self.sheet_combo.clear()
            self.sheet_combo.addItems([s[1] for s in self.sheets])
            self._select_active_sheet()
            self.sheet_box.show()

    def select_sheet(self):
        """Store the sheet chosen in the combo and reload the data."""
        self.sheet = self.sheets[self.sheet_combo.currentIndex()][0]
        self.load_data()

    def remove_item(self):
        """Remove the selected files from the list and reload."""
        rows = [index.row() for index in self.lb.selectedIndexes()]
        # Delete from the end so that earlier row numbers stay valid.
        for row in sorted(rows, reverse=True):
            self.recent_paths.pop(row)
            self.lb.takeItem(row)
        self._update_sheet_combo()
        self.load_data()

    def clear(self):
        """Remove all files from the list and reload (sends no data)."""
        self.lb.clear()
        # Empty the list in place; the setting keeps the same list object.
        del self.recent_paths[:]
        self._update_sheet_combo()
        self.load_data()

    def browse_files(self, in_demos=False):
        """Ask the user for one or more files and add them to the list.

        ``in_demos`` is accepted for interface compatibility and is not used.
        """
        start_file = self.last_path() or os.path.expanduser("~/")

        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]
        filenames, reader, _ = open_filename_dialog(
            start_file, None, readers, dialog=QFileDialog.getOpenFileNames)

        self.load_files(filenames, reader)

    def load_files(self, filenames, reader):
        """Add ``filenames`` (to be read with ``reader``) and reload.

        Does nothing when ``filenames`` is empty.
        """
        if not filenames:
            return

        for f in filenames:
            self.add_path(f, reader)
            self.lb.addItem(f)

        self._update_sheet_combo()
        self.load_data()

    def load_data(self):
        """Read every listed file, concatenate the results and send them.

        Files that are missing, have no reader or fail to read are marked in
        the list box (red text, tooltip with the reason) and raise the
        matching widget error. If any error is shown, or nothing was read,
        no data is sent.
        """
        self.closeContext()

        self.Error.file_not_found.clear()
        self.Error.missing_reader.clear()
        self.Error.read_error.clear()

        data_list = []
        fnok_list = []

        def show_error(li, msg):
            li.setForeground(Qt.red)
            li.setToolTip(msg)

        empty_domain = Domain(attributes=[])
        for i, rp in enumerate(self.recent_paths):
            fn = rp.abspath

            # List box rows are kept in the same order as recent_paths.
            li = self.lb.item(i)
            li.setToolTip("")
            if self.default_foreground is None:
                self.default_foreground = li.foreground()
            li.setForeground(self.default_foreground)

            if not os.path.exists(fn):
                show_error(li, "File not found.")
                self.Error.file_not_found()
                continue

            try:
                reader = _get_reader(rp)
            except Exception:  # pylint: disable=broad-except
                reader = None
            # Explicit check instead of ``assert`` so that the validation
            # also runs when Python is started with -O.
            if reader is None:
                show_error(li, "Reader not found.")
                self.Error.missing_reader()
                continue

            try:
                if self.sheet in reader.sheets:
                    reader.select_sheet(self.sheet)
                if isinstance(reader, SpectralFileFormat):
                    xs, vals, additional = reader.read_spectra()
                    if additional is None:
                        additional = Table.from_domain(empty_domain,
                                                       n_rows=len(vals))
                    data_list.append((xs, vals, additional))
                else:
                    data_list.append(reader.read())
                fnok_list.append(fn)
            except Exception as ex:  # pylint: disable=broad-except
                show_error(li, "Read error:\n" + str(ex))
                self.Error.read_error()

        if not data_list \
                or self.Error.file_not_found.is_shown() \
                or self.Error.missing_reader.is_shown() \
                or self.Error.read_error.is_shown():
            self.data = None
            self.domain_editor.set_domain(None)
        else:
            data = concatenate_data(data_list, fnok_list, self.label)
            self.data = data
            self.openContext(data.domain)

        self.apply_domain_edit()  # sends data

    def storeSpecificSettings(self):
        """Copy the edited variables into the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if stored."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Apply the domain editor to the loaded data and send the result.

        Sends None when there is no data or when the edited domain has
        neither variables nor metas. Disables the "Apply" button afterwards.
        """
        if self.data is None:
            table = None
        else:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if not (domain.variables or domain.metas):
                table = None
            else:
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})

        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def reset_domain_edit(self):
        """Reset the domain editor and resend the data."""
        self.domain_editor.reset_domain()
        self.apply_domain_edit()

    def send_report(self):
        """Report the list of files with their format names, and the data."""
        def get_format_name(reader):
            try:
                return reader.DESCRIPTION
            except AttributeError:
                return reader.__class__.__name__

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        files = []
        for rp in self.recent_paths:
            reader = _get_reader(rp)
            files.append([rp.abspath, get_format_name(reader)])

        self.report_table("Files", table=files)

        self.report_data("Data", self.data)

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It makes sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)

    def update_file_list(self, key, value, oldvalue):
        """Relocate the stored paths when the workflow's "basedir" changes."""
        if key == "basedir":
            self._relocate_recent_files()
class OWExplainPredictions(OWWidget, ConcurrentWidgetMixin):
    """Widget showing a force plot that explains a model's predictions.

    The explanation is computed in a background task (``run``) from the
    model, the background data and the data. Results arrive in ``on_done``,
    which draws the plot, re-applies the selection and outputs the scores.
    """

    name = "Explain Predictions"
    description = "Predictions explanation widget."
    keywords = ["explain", "explain prediction", "explain model"]
    icon = "icons/ExplainPredictions.svg"
    priority = 120

    class Inputs:
        model = Input("Model", Model)
        background_data = Input("Background Data", Table)
        data = Input("Data", Table)

    class Outputs:
        selected_data = Output("Selected Data", Table, default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
        scores = Output("Scores", Table)

    class Error(OWWidget.Error):
        domain_transform_err = Msg("{}")
        unknown_err = Msg("{}")
        not_enough_data = Msg("At least two instances are needed.")

    class Information(OWWidget.Information):
        data_sampled = Msg("Data has been sampled.")

    buttons_area_orientation = Qt.Vertical

    settingsHandler = PerfectDomainContextHandler()
    target_index = ContextSetting(0)
    order_index = ContextSetting(0)
    annot_index = ContextSetting(0)
    show_tooltip = Setting(True)
    highlight_feature = Setting(True)
    selection_ranges = Setting([], schema_only=True)
    auto_send = Setting(True)
    visual_settings = Setting({}, schema_only=True)

    graph_name = "graph.plotItem"
    ANNOTATIONS = ["None", "Enumeration"]

    def __init__(self):
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self.__results: Optional[RunnerResults] = None
        self.model: Optional[Model] = None
        self.background_data: Optional[Table] = None
        self.data: Optional[Table] = None
        # cached instance indices after instance ordering
        self.__data_idxs: Optional[np.ndarray] = None
        # selection stored in the workflow; applied once results arrive
        self.__pending_selection: List[Tuple[float, float]] = \
            self.selection_ranges

        self.graph: ForcePlot = None
        self._target_combo: QComboBox = None
        self._order_combo: QComboBox = None
        self._annot_combo: QComboBox = None

        self.setup_gui()

        initial_settings = self.graph.parameter_setter.initial_settings
        VisualSettingsDialog(self, initial_settings)

    def setup_gui(self):
        """Create the plot, the control area and the buttons area."""
        self._add_plot()
        self._add_controls()
        self._add_buttons()

    def _add_plot(self):
        """Create the force plot in the main area and wire its selection."""
        box = gui.vBox(self.mainArea)
        self.graph = ForcePlot(self)
        self.graph.set_show_tooltip(self.show_tooltip)
        self.graph.set_highlight_feature(self.highlight_feature)
        self.graph.selectionChanged.connect(self.__on_selection_changed)
        box.layout().addWidget(self.graph)

    def __on_selection_changed(self, selection: List[Tuple[float, float]]):
        """Store the new selection ranges and update the outputs."""
        self.selection_ranges = selection
        self.commit()

    def _add_controls(self):
        """Create the target/order/annotation combos and the check boxes."""
        box = gui.vBox(self.controlArea, "Target class")
        self._target_combo = gui.comboBox(box, self, "target_index",
                                          callback=self.__on_target_changed,
                                          contentsLength=12)

        box = gui.vBox(self.controlArea, "Instance order")
        self._order_combo = gui.comboBox(box, self, "order_index",
                                         callback=self.__on_order_changed,
                                         searchable=True, contentsLength=12)
        model = VariableListModel()
        model[:] = INSTANCE_ORDERINGS
        self._order_combo.setModel(model)

        box = gui.vBox(self.controlArea, "Annotation")
        self._annot_combo = gui.comboBox(box, self, "annot_index",
                                         callback=self.__on_annot_changed,
                                         searchable=True, contentsLength=12)
        model = VariableListModel()
        model[:] = self.ANNOTATIONS
        self._annot_combo.setModel(model)

        box = gui.vBox(self.controlArea, "", margin=True,
                       contentsMargins=(8, 4, 8, 4))
        gui.checkBox(box, self, "show_tooltip", "Show tooltips",
                     callback=self.__on_show_tooltip_changed)
        gui.checkBox(box, self, "highlight_feature",
                     "Highlight feature on hover",
                     callback=self.__on_highlight_feature_changed)

        gui.rubber(self.controlArea)

    def __on_target_changed(self):
        """Drop the selection and redraw for the newly chosen target."""
        self.selection_ranges = []
        self.setup_plot()
        self.commit()

    def __on_order_changed(self):
        """Drop the selection and redraw with the newly chosen ordering."""
        self.selection_ranges = []
        self.setup_plot()
        self.commit()

    def __on_annot_changed(self):
        """Update the axis annotations if there is something plotted."""
        if not self.__results or not self.data:
            return
        self._set_plot_annotations()

    def __on_show_tooltip_changed(self):
        self.graph.set_show_tooltip(self.show_tooltip)

    def __on_highlight_feature_changed(self):
        self.graph.set_highlight_feature(self.highlight_feature)

    def _add_buttons(self):
        """Add the zoom/select tools and the auto-send button."""
        plot_gui = OWPlotGUI(self)
        plot_gui.box_zoom_select(self.buttonsArea)
        gui.auto_send(self.buttonsArea, self, "auto_send")

    @Inputs.data
    @check_sql_input
    def set_data(self, data: Optional[Table]):
        """Store the data, rebuild the controls and open the context."""
        self.closeContext()
        self.data = data
        self._check_data()
        self._setup_controls()
        self.openContext(self.data.domain if self.data else None)

    @Inputs.background_data
    @check_sql_input
    def set_background_data(self, data: Optional[Table]):
        self.background_data = data

    @Inputs.model
    def set_model(self, model: Optional[Model]):
        self.model = model

    def _check_data(self):
        """Discard data with fewer than two instances and show an error."""
        self.Error.not_enough_data.clear()
        if self.data and len(self.data) < 2:
            self.data = None
            self.Error.not_enough_data()

    @staticmethod
    def _with_domain_variables(fixed_items, domain):
        """Chain ``fixed_items`` with metas, class vars and attributes.

        Each non-empty group of variables is preceded by a separator.
        """
        separator = [VariableListModel.Separator]
        return chain(
            fixed_items,
            separator if domain.metas else [],
            domain.metas,
            separator if domain.class_vars else [],
            domain.class_vars,
            separator if domain.attributes else [],
            domain.attributes,
        )

    def _setup_controls(self):
        """Reset the combos and fill them according to the current data."""
        self._target_combo.clear()
        self._target_combo.setEnabled(True)

        self.order_index = 0
        self.annot_index = 0
        self._order_combo.clear()
        self._annot_combo.clear()
        orderings = INSTANCE_ORDERINGS
        annotations = self.ANNOTATIONS

        if self.data:
            domain = self.data.domain
            if domain.has_discrete_class:
                self._target_combo.addItems(domain.class_var.values)
                self.target_index = 0
            elif domain.has_continuous_class:
                # no target to choose from for a numeric class
                self.target_index = -1
                self._target_combo.setEnabled(False)
            orderings = self._with_domain_variables(
                INSTANCE_ORDERINGS, domain)
            annotations = self._with_domain_variables(
                self.ANNOTATIONS, domain)

        self._order_combo.model()[:] = orderings
        self._annot_combo.model()[:] = annotations

    def handleNewSignals(self):
        """Reset the widget state and start computing the explanation."""
        self.clear()
        self.start(run, self.data, self.background_data, self.model)
        self.commit()

    def clear(self):
        """Cancel a running task and reset results, messages and the plot."""
        self.__results = None
        self.cancel()
        self.Error.domain_transform_err.clear()
        self.Error.unknown_err.clear()
        self.Information.data_sampled.clear()
        self.selection_ranges = []
        self.graph.clear_all()
        self.graph.set_axis(None)
        self.__data_idxs = None

    def setup_plot(self):
        """Draw the force plot for the current target and ordering.

        Does nothing beyond clearing the plot when results or data are
        missing.
        """
        self.graph.clear_all()
        self.__data_idxs = None
        if not self.__results or not self.data:
            return

        results = self.__results
        order = self._order_combo.model()[self.order_index]
        values_idxs = get_instance_ordering(
            results.values[self.target_index],
            results.predictions[results.mask, self.target_index],
            self.data[results.mask],
            order
        )

        # map plot positions back to row indices of self.data
        data_idxs = np.arange(len(self.data))
        self.__data_idxs = data_idxs[results.mask][values_idxs]

        x_data, pos_y_data, neg_y_data, pos_labels, neg_labels = \
            prepare_force_plot_data_multi_inst(
                results.values[self.target_index][values_idxs],
                results.base_value[self.target_index],
                self.model.domain
            )

        # The first three orderings get a descriptive name; for any other
        # entry the combo item itself is used in the label.
        order = {
            0: "hierarchical clustering",
            1: "output value",
            2: "original ordering",
        }.get(self.order_index, order)
        x_label = f"Instances ordered by {order}"

        target = self.model.domain.class_var
        if self.model.domain.has_discrete_class:
            target = f"{target} = {target.values[self.target_index]}"
        y_label = f"Output value ({target})"

        self.graph.set_data(x_data, pos_y_data, neg_y_data,
                            pos_labels, neg_labels, x_label, y_label,
                            results.transformed_data[self.__data_idxs])
        self._set_plot_annotations()

    def _set_plot_annotations(self):
        """Set the axis ticks according to the chosen annotation."""
        annotator = self._annot_combo.model()[self.annot_index]
        if isinstance(annotator, Variable):
            ticks = [[(i, str(row[annotator].value)) for i, row in
                      enumerate(self.data[self.__data_idxs])]]
            self.graph.set_axis(ticks)
        elif annotator == "None":
            self.graph.set_axis([])
        elif annotator == "Enumeration":
            ticks = [[(i, str(idx + 1)) for i, idx in
                      enumerate(self.__data_idxs)]]
            self.graph.set_axis(ticks)
        else:
            raise NotImplementedError(annotator)

    def on_partial_result(self, _):
        pass

    def on_done(self, results: Optional[RunnerResults]):
        """Store the results, then draw, apply selection and send scores."""
        self.__results = results
        if results is not None and not all(results.mask):
            self.Information.data_sampled()
        self.setup_plot()
        self.apply_selection()
        self.output_scores()

    def on_exception(self, ex: Exception):
        if isinstance(ex, DomainTransformationError):
            self.Error.domain_transform_err(ex)
        else:
            self.Error.unknown_err(ex)

    def onDeleteWidget(self):
        self.shutdown()
        super().onDeleteWidget()

    def apply_selection(self):
        """Apply the current (or stored pending) selection to the plot."""
        selection_ranges = self.selection_ranges or self.__pending_selection
        if selection_ranges:
            self.graph.apply_selection(selection_ranges)
            self.__on_selection_changed(selection_ranges)
        self.__pending_selection = []

    def commit(self):
        """Send the selected and the annotated data."""
        selected = None
        selected_indices = []

        # Without cached instance indices there is nothing plotted, hence
        # nothing that could be selected.
        if self.__results and self.__data_idxs is not None:
            selection = list(set(
                chain.from_iterable(
                    range(int(np.ceil(start)), int(np.floor(stop) + 1))
                    for start, stop in self.selection_ranges)
            ))
            selected_indices = sorted(self.__data_idxs[selection])

        if self.data and selected_indices:
            selected = self.data[selected_indices]
        annotated = create_annotated_table(self.data, selected_indices)
        self.Outputs.selected_data.send(selected)
        self.Outputs.annotated_data.send(annotated)

    def output_scores(self):
        """Send a table of explanation values for the current target."""
        scores = None
        if self.__results is not None:
            mask = self.__results.mask
            data = self.__results.transformed_data[mask]
            domain = data.domain
            attrs = [ContinuousVariable(f"S({a.name})")
                     for a in domain.attributes]
            domain = Domain(attrs, domain.class_vars, domain.metas)
            scores = self.__results.values[self.target_index]
            scores = Table(domain, scores, data.Y, data.metas)
            scores.name = "Feature Scores"
        self.Outputs.scores.send(scores)

    def send_report(self):
        """Report the target class and the plot (needs all three inputs)."""
        if not self.data or not self.background_data or not self.model:
            return
        items = {"Target class": "None"}
        if self.model.domain.has_discrete_class:
            class_var = self.model.domain.class_var
            items["Target class"] = class_var.values[self.target_index]
        self.report_items(items)
        self.report_plot()

    def set_visual_settings(self, key: Tuple[str, str, str], value: Any):
        self.visual_settings[key] = value
        self.graph.parameter_setter.set_parameter(key, value)
class OWKaplanMeier(OWWidget):
    """Widget that plots estimated curves for time/event data.

    The user picks a time variable, an event variable and an optional
    grouping variable; one ``EstimatedFunctionCurve`` is drawn per group.
    Rows falling into the intervals selected on the plot are sent to the
    output.
    """

    name = 'Kaplan-Meier Plot'
    # TODO
    description = ''
    # TODO
    icon = ''
    priority = 0

    show_confidence_interval: bool
    show_confidence_interval = Setting(False)

    show_median_line: bool
    show_median_line = Setting(False)

    show_censored_data: bool
    show_censored_data = Setting(False)

    settingsHandler = PerfectDomainContextHandler()
    time_var = ContextSetting(None)
    event_var = ContextSetting(None)
    group_var: Optional[DiscreteVariable] = ContextSetting(None)

    graph = SettingProvider(KaplanMeierPlot)

    auto_commit: bool = Setting(False, schema_only=True)

    class Inputs:
        data = Input('Data', Table)

    class Outputs:
        selected_data = Output('Data', Table)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data: Optional[Table] = None
        self.plot_curves = None

        time_var_model = DomainModel(valid_types=(ContinuousVariable, ))
        event_var_model = DomainModel(valid_types=DomainModel.PRIMITIVE)
        group_var_model = DomainModel(placeholder='(None)',
                                      valid_types=(DiscreteVariable, ))

        box = gui.vBox(self.controlArea, 'Time', margin=0)
        gui.comboBox(box, self, 'time_var', model=time_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Event', margin=0)
        gui.comboBox(box, self, 'event_var', model=event_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Group', margin=0)
        gui.comboBox(box, self, 'group_var', model=group_var_model,
                     callback=self.on_controls_changed)

        box = gui.vBox(self.controlArea, 'Display options')
        gui.checkBox(
            box,
            self,
            'show_confidence_interval',
            label='Confidence intervals',
            callback=self.on_display_option_changed,
        )
        gui.checkBox(
            box,
            self,
            'show_median_line',
            label='Median',
            callback=self.on_display_option_changed,
        )
        gui.checkBox(
            box,
            self,
            'show_censored_data',
            label='Censored data',
            callback=self.on_display_option_changed,
        )

        self.graph: KaplanMeierPlot = KaplanMeierPlot(parent=self)
        self.graph.selection_changed.connect(self.commit)
        self.mainArea.layout().addWidget(self.graph)

        plot_gui = OWPlotGUI(self)
        plot_gui.box_zoom_select(self.controlArea)

        gui.rubber(self.controlArea)

        self.commit_button = gui.auto_commit(self.controlArea, self,
                                             'auto_commit', '&Commit',
                                             box=False)

    @Inputs.data
    def set_data(self, data: Optional[Table]):
        """Handle a new, removed or empty input table.

        When the input is removed (or empty) the previous table, curves and
        selection are dropped and None is sent, instead of silently keeping
        the stale state.
        """
        self.closeContext()

        self.data = data if data else None
        domain = self.data.domain if self.data is not None else None

        self.controls.time_var.model().set_domain(domain)
        self.controls.event_var.model().set_domain(domain)
        self.controls.group_var.model().set_domain(domain)
        self.time_var = None
        self.event_var = None
        self.group_var = None
        self.graph.selection = {}

        if self.data is not None:
            self.openContext(domain)

        # With no time/event variable this yields no curves, which clears
        # the plot through the same path as a freshly loaded table.
        self.graph.curves = dict(enumerate(self.generate_plot_curves()))
        self.graph.update_plot(**self._get_plot_options())
        self.commit()

    def _get_plot_options(self):
        """Return the display settings as keyword arguments for the plot."""
        return {
            'confidence_interval': self.show_confidence_interval,
            'median': self.show_median_line,
            'censored': self.show_censored_data,
        }

    def on_display_option_changed(self) -> None:
        """Redraw the plot after a display check box was toggled."""
        self.graph.update_plot(**self._get_plot_options())

    def on_controls_changed(self):
        """Rebuild the curves after a variable combo changed."""
        if not self.data:
            return
        self.graph.curves = dict(enumerate(self.generate_plot_curves()))
        self.graph.clear_selection()
        self.graph.update_plot(**self._get_plot_options())
        self.commit()

    def _get_discrete_var_color(self, index: Optional[int]):
        """Return the colour of group value ``index`` as a list, or None."""
        if self.group_var is not None and index is not None:
            return list(self.group_var.colors[index])
        return None

    def generate_plot_curves(self) -> List[EstimatedFunctionCurve]:
        """Create one curve per non-empty group (or one for all data).

        Returns an empty list when the time or event variable is not set.
        """
        if self.time_var is None or self.event_var is None:
            return []

        time, _ = self.data.get_column_view(self.time_var)
        events, _ = self.data.get_column_view(self.event_var)

        if not self.group_var:
            return [EstimatedFunctionCurve(time, events)]

        groups, _ = self.data.get_column_view(self.group_var)
        labels = self.group_var.values
        group_indexes = list(range(len(labels)))
        colors = [self._get_discrete_var_color(index)
                  for index in group_indexes]
        # one boolean row mask per group value
        masks = groups == np.reshape(group_indexes, (-1, 1))

        return [
            EstimatedFunctionCurve(time[mask], events[mask],
                                   color=color, label=label)
            for mask, color, label in zip(masks, colors, labels)
            if mask.any()
        ]

    def commit(self):
        """Send the rows whose time lies in the selected interval(s)."""
        if not self.graph.selection:
            self.Outputs.selected_data.send(None)
            return

        time, _ = self.data.get_column_view(self.time_var)
        if self.group_var is None:
            time_interval = self.graph.selection[0].x
            start, end = time_interval[0], time_interval[-1]
            selection = np.argwhere(
                (time >= start) & (time <= end)
            ).reshape(-1).astype(int)
        else:
            selection = []
            group, _ = self.data.get_column_view(self.group_var)
            # NOTE(review): selection keys are compared with group value
            # indices, while curve ids enumerate only non-empty groups
            # (see generate_plot_curves) -- presumably these can differ
            # when a group value has no rows; confirm.
            for group_id, time_interval in self.graph.selection.items():
                start, end = time_interval.x[0], time_interval.x[-1]
                selection += (np.argwhere(
                    (time >= start) & (time <= end) & (group == group_id)
                ).reshape(-1).astype(int).tolist())
            selection = sorted(selection)

        self.Outputs.selected_data.send(self.data[selection, :])

    def sizeHint(self):
        return QSize(1280, 620)
class OWFile(widget.OWWidget, RecentPathsWComboMixin): name = "文件(File)" id = "orange.widgets.data.file" description = "从输入文件或网络读取数据并将数据表发送到输出。" icon = "icons/File.svg" priority = 10 category = "数据(Data)" keywords = ["file", "load", "read", "open", "wenjian"] class Outputs: data = Output("数据(Data)", Table, doc="Attribute-valued dataset read from the input file.", replaces=['Data']) want_main_area = False buttons_area_orientation = None SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())] SIZE_LIMIT = 1e7 LOCAL_FILE, URL = range(2) settingsHandler = PerfectDomainContextHandler( match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL) # pylint seems to want declarations separated from definitions recent_paths: List[RecentPath] recent_urls: List[str] variables: list # Overload RecentPathsWidgetMixin.recent_paths to set defaults recent_paths = Setting([ RecentPath("", "sample-datasets", "iris.tab"), RecentPath("", "sample-datasets", "titanic.tab"), RecentPath("", "sample-datasets", "housing.tab"), RecentPath("", "sample-datasets", "heart_disease.tab"), RecentPath("", "sample-datasets", "brown-selected.tab"), RecentPath("", "sample-datasets", "zoo.tab"), ]) recent_urls = Setting([]) source = Setting(LOCAL_FILE) sheet_names = Setting({}) url = Setting("") variables = ContextSetting([]) domain_editor = SettingProvider(DomainEditor) class Information(widget.OWWidget.Information): no_file_selected = Msg("No file selected.") class Warning(widget.OWWidget.Warning): file_too_big = Msg("The file is too large to load automatically." 
" Press Reload to load.") load_warning = Msg("Read warning:\n{}") performance_warning = Msg( "Categorical variables with >100 values may decrease performance.") renamed_vars = Msg("Some variables have been renamed " "to avoid duplicates.\n{}") multiple_targets = Msg("Most widgets do not support multiple targets") class Error(widget.OWWidget.Error): file_not_found = Msg("File not found.") missing_reader = Msg("Missing reader.") sheet_error = Msg("Error listing available sheets.") unknown = Msg("Read error:\n{}") UserAdviceMessages = [ widget.Message( "Use CSV File Import widget for advanced options " "for comma-separated files", "use-csv-file-import"), widget.Message( "This widget loads only tabular data. Use other widgets to load " "other data types like models, distance matrices and networks.", "other-data-types") ] def __init__(self): super().__init__() RecentPathsWComboMixin.__init__(self) self.domain = None self.data = None self.loaded_file = "" self.reader = None readers = [ f for f in FileFormat.formats if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None) ] def group_readers_per_addon_key(w): # readers from Orange.data.io should go first def package(w): package = w.qualified_name().split(".")[:-1] package = package[:2] if ".".join(package) == "Orange.data": return ["0"] # force "Orange" to come first return package return package(w), w.DESCRIPTION self.available_readers = sorted(set(readers), key=group_readers_per_addon_key) layout = QGridLayout() layout.setSpacing(4) gui.widgetBox(self.controlArea, orientation=layout, box='数据源') vbox = gui.radioButtons(None, self, "source", box=True, callback=self.load_data, addToLayout=False) rb_button = gui.appendRadioButton(vbox, "文件:", addToLayout=False) layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter) box = gui.hBox(None, addToLayout=False, margin=0) box.setSizePolicy(Policy.Expanding, Policy.Fixed) self.file_combo.setSizePolicy(Policy.Expanding, Policy.Fixed) self.file_combo.setMinimumSize(QSize(100, 1)) 
self.file_combo.activated[int].connect(self.select_file) box.layout().addWidget(self.file_combo) layout.addWidget(box, 0, 1) file_button = gui.button(None, self, '...', callback=self.browse_file, autoDefault=False) file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon)) file_button.setSizePolicy(Policy.Maximum, Policy.Fixed) layout.addWidget(file_button, 0, 2) reload_button = gui.button(None, self, "重新加载", callback=self.load_data, autoDefault=False) reload_button.setIcon(self.style().standardIcon( QStyle.SP_BrowserReload)) reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed) layout.addWidget(reload_button, 0, 3) self.sheet_box = gui.hBox(None, addToLayout=False, margin=0) self.sheet_combo = QComboBox() self.sheet_combo.activated[str].connect(self.select_sheet) self.sheet_combo.setSizePolicy(Policy.Expanding, Policy.Fixed) self.sheet_combo.setMinimumSize(QSize(50, 1)) self.sheet_label = QLabel() self.sheet_label.setText('Sheet') self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed) self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft) self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter) layout.addWidget(self.sheet_box, 2, 1) self.sheet_box.hide() rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False) layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter) self.url_combo = url_combo = QComboBox() url_model = NamedURLModel(self.sheet_names) url_model.wrap(self.recent_urls) url_combo.setLineEdit(LineEditSelectOnFocus()) url_combo.setModel(url_model) url_combo.setSizePolicy(Policy.Ignored, Policy.Fixed) url_combo.setEditable(True) url_combo.setInsertPolicy(url_combo.InsertAtTop) url_edit = url_combo.lineEdit() l, t, r, b = url_edit.getTextMargins() url_edit.setTextMargins(l + 5, t, r, b) layout.addWidget(url_combo, 3, 1, 1, 3) url_combo.activated.connect(self._url_set) # whit completer we set that combo box is case sensitive when # matching the history completer = QCompleter() 
completer.setCaseSensitivity(Qt.CaseSensitive) url_combo.setCompleter(completer) layout = QGridLayout() layout.setSpacing(4) gui.widgetBox(self.controlArea, orientation=layout, box='文件类型') box = gui.hBox(None, addToLayout=False, margin=0) box.setSizePolicy(Policy.Expanding, Policy.Fixed) self.reader_combo = QComboBox(self) self.reader_combo.setSizePolicy(Policy.Expanding, Policy.Fixed) self.reader_combo.setMinimumSize(QSize(100, 1)) self.reader_combo.activated[int].connect(self.select_reader) box.layout().addWidget(self.reader_combo) layout.addWidget(box, 0, 1) box = gui.vBox(self.controlArea, "信息") self.infolabel = gui.widgetLabel(box, '未加载数据.') box = gui.widgetBox(self.controlArea, "列(双击编辑)") self.domain_editor = DomainEditor(self) self.editor_model = self.domain_editor.model() box.layout().addWidget(self.domain_editor) box = gui.hBox(box) gui.button(box, self, "重置", callback=self.reset_domain_edit, autoDefault=False) gui.rubber(box) self.apply_button = gui.button(box, self, "应用", callback=self.apply_domain_edit) self.apply_button.setEnabled(False) self.apply_button.setFixedWidth(170) self.editor_model.dataChanged.connect( lambda: self.apply_button.setEnabled(True)) hBox = gui.hBox(self.controlArea) gui.rubber(hBox) gui.button(hBox, self, "浏览文档数据集", callback=lambda: self.browse_file(True), autoDefault=False) gui.rubber(hBox) self.set_file_list() # Must not call open_file from within __init__. 
open_file # explicitly re-enters the event loop (by a progress bar) self.setAcceptDrops(True) if self.source == self.LOCAL_FILE: last_path = self.last_path() if last_path and os.path.exists(last_path) and \ os.path.getsize(last_path) > self.SIZE_LIMIT: self.Warning.file_too_big() return QTimer.singleShot(0, self.load_data) @staticmethod def sizeHint(): return QSize(600, 550) def select_file(self, n): assert n < len(self.recent_paths) super().select_file(n) if self.recent_paths: self.source = self.LOCAL_FILE self.load_data() self.set_file_list() def select_sheet(self): self.recent_paths[0].sheet = self.sheet_combo.currentText() self.load_data() def select_reader(self, n): if self.source != self.LOCAL_FILE: return # ignore for URL's if self.recent_paths: path = self.recent_paths[0] if n == 0: # default path.file_format = None self.load_data() elif n <= len(self.available_readers): reader = self.available_readers[n - 1] path.file_format = reader.qualified_name() self.load_data() else: # the rest include just qualified names path.file_format = self.reader_combo.itemText(n) self.load_data() def _url_set(self): url = self.url_combo.currentText() pos = self.recent_urls.index(url) url = url.strip() if not urlparse(url).scheme: url = 'http://' + url self.url_combo.setItemText(pos, url) self.recent_urls[pos] = url self.source = self.URL self.load_data() def browse_file(self, in_demos=False): if in_demos: start_file = get_sample_datasets_dir() if not os.path.exists(start_file): QMessageBox.information(None, "文件", "无法找到文件") return else: start_file = self.last_path() or os.path.expanduser("~/") filename, reader, _ = open_filename_dialog(start_file, None, self.available_readers) if not filename: return self.add_path(filename) if reader is not None: self.recent_paths[0].file_format = reader.qualified_name() self.source = self.LOCAL_FILE self.load_data() # Open a file, create data from it and send it over the data channel def load_data(self): # We need to catch any exception type 
since anything can happen in # file readers self.closeContext() self.domain_editor.set_domain(None) self.apply_button.setEnabled(False) self.clear_messages() self.set_file_list() error = self._try_load() if error: error() self.data = None self.sheet_box.hide() self.Outputs.data.send(None) self.infolabel.setText("无数据") def _try_load(self): self._initialize_reader_combo() # pylint: disable=broad-except if self.source == self.LOCAL_FILE: if self.last_path() is None: return self.Information.no_file_selected elif not os.path.exists(self.last_path()): return self.Error.file_not_found else: url = self.url_combo.currentText().strip() if not url: return self.Information.no_file_selected def mark_problematic_reader(): self.reader_combo.setItemData(self.reader_combo.currentIndex(), QBrush(Qt.red), Qt.ForegroundRole) try: self.reader = self._get_reader() # also sets current reader index assert self.reader is not None except MissingReaderException: mark_problematic_reader() return self.Error.missing_reader except Exception as ex: mark_problematic_reader() log.exception(ex) return lambda x=ex: self.Error.unknown(str(x)) try: self._update_sheet_combo() except Exception: return self.Error.sheet_error with log_warnings() as warnings: try: data = self.reader.read() except Exception as ex: mark_problematic_reader() log.exception(ex) return lambda x=ex: self.Error.unknown(str(x)) if warnings: self.Warning.load_warning(warnings[-1].message.args[0]) self.infolabel.setText(self._describe(data)) self.loaded_file = self.last_path() add_origin(data, self.loaded_file) self.data = data self.openContext(data.domain) self.apply_domain_edit() # sends data return None def _get_reader(self) -> FileFormat: if self.source == self.LOCAL_FILE: path = self.last_path() self.reader_combo.setEnabled(True) if self.recent_paths and self.recent_paths[0].file_format: qname = self.recent_paths[0].file_format qname_index = { r.qualified_name(): i for i, r in enumerate(self.available_readers) } if qname in 
qname_index: self.reader_combo.setCurrentIndex(qname_index[qname] + 1) else: # reader may be accessible, but not in self.available_readers # (perhaps its code was moved) self.reader_combo.addItem(qname) self.reader_combo.setCurrentIndex( len(self.reader_combo) - 1) try: reader_class = class_from_qualified_name(qname) except Exception as ex: raise MissingReaderException( f'Can not find reader "{qname}"') from ex reader = reader_class(path) else: self.reader_combo.setCurrentIndex(0) reader = FileFormat.get_reader(path) if self.recent_paths and self.recent_paths[0].sheet: reader.select_sheet(self.recent_paths[0].sheet) return reader else: url = self.url_combo.currentText().strip() return UrlReader(url) def _update_sheet_combo(self): if len(self.reader.sheets) < 2: self.sheet_box.hide() self.reader.select_sheet(None) return self.sheet_combo.clear() self.sheet_combo.addItems(self.reader.sheets) self._select_active_sheet() self.sheet_box.show() def _select_active_sheet(self): try: idx = self.reader.sheets.index(self.reader.sheet) self.sheet_combo.setCurrentIndex(idx) except ValueError: # Requested sheet does not exist in this file self.reader.select_sheet(None) self.sheet_combo.setCurrentIndex(0) def _initialize_reader_combo(self): self.reader_combo.clear() filters = [format_filter(f) for f in self.available_readers] self.reader_combo.addItems([DEFAULT_READER_TEXT] + filters) self.reader_combo.setCurrentIndex(0) self.reader_combo.setDisabled(True) # additional readers may be added in self._get_reader() @staticmethod def _describe(table): def missing_prop(prop): if prop: return f"({prop * 100:.1f}% 个缺失值)" else: return "(无缺失值)" domain = table.domain text = "" attrs = getattr(table, "attributes", {}) descs = [ attrs[desc] for desc in ("Name", "Description") if desc in attrs ] if len(descs) == 2: descs[0] = f"<b>{descs[0]}</b>" if descs: text += f"<p>{'<br/>'.join(descs)}</p>" text += f"<p>{len(table)} 条数据" missing_in_attr = missing_prop(table.has_missing_attribute() and 
table.get_nan_frequency_attribute()) missing_in_class = missing_prop(table.has_missing_class() and table.get_nan_frequency_class()) text += f"<br/>特征数目: {len(domain.attributes)} {missing_in_attr}" if domain.has_continuous_class: text += f"<br/>回归; 数值类 {missing_in_class}" elif domain.has_discrete_class: text += "<br/>分类: 分类种类共 " \ f"{len(domain.class_var.values)} 个 {missing_in_class}" elif table.domain.class_vars: text += "<br/>Multi-target; " \ f"{len(table.domain.class_vars)} target variables " \ f"{missing_in_class}" else: text += "<br/>Data has no target variable." text += f"<br/>元属性: { len(domain.metas)}" text += "</p>" if 'Timestamp' in table.domain: # Google Forms uses this header to timestamp responses text += f"<p>First entry: {table[0, 'Timestamp']}<br/>" \ f"Last entry: {table[-1, 'Timestamp']}</p>" return text def storeSpecificSettings(self): self.current_context.modified_variables = self.variables[:] def retrieveSpecificSettings(self): if hasattr(self.current_context, "modified_variables"): self.variables[:] = self.current_context.modified_variables def reset_domain_edit(self): self.domain_editor.reset_domain() self.apply_domain_edit() def _inspect_discrete_variables(self, domain): for var in chain(domain.variables, domain.metas): if var.is_discrete and len(var.values) > 100: self.Warning.performance_warning() def apply_domain_edit(self): self.Warning.performance_warning.clear() self.Warning.renamed_vars.clear() if self.data is None: table = None else: domain, cols, renamed = \ self.domain_editor.get_domain(self.data.domain, self.data, deduplicate=True) if not (domain.variables or domain.metas): table = None elif domain is self.data.domain: table = self.data else: X, y, m = cols table = Table.from_numpy(domain, X, y, m, self.data.W) table.name = self.data.name table.ids = np.array(self.data.ids) table.attributes = getattr(self.data, 'attributes', {}) self._inspect_discrete_variables(domain) if renamed: self.Warning.renamed_vars(f"Renamed: {', 
'.join(renamed)}") self.Warning.multiple_targets( shown=table is not None and len(table.domain.class_vars) > 1) self.Outputs.data.send(table) self.apply_button.setEnabled(False) def get_widget_name_extension(self): _, name = os.path.split(self.loaded_file) return os.path.splitext(name)[0] def send_report(self): def get_ext_name(filename): try: return FileFormat.names[os.path.splitext(filename)[1]] except KeyError: return "unknown" if self.data is None: self.report_paragraph("File", "No file.") return if self.source == self.LOCAL_FILE: home = os.path.expanduser("~") if self.loaded_file.startswith(home): # os.path.join does not like ~ name = "~" + os.path.sep + \ self.loaded_file[len(home):].lstrip("/").lstrip("\\") else: name = self.loaded_file if self.sheet_combo.isVisible(): name += f" ({self.sheet_combo.currentText()})" self.report_items("File", [("File name", name), ("Format", get_ext_name(name))]) else: self.report_items("Data", [("Resource", self.url), ("Format", get_ext_name(self.url))]) self.report_data("Data", self.data) @staticmethod def dragEnterEvent(event): """Accept drops of valid file urls""" urls = event.mimeData().urls() if urls: try: FileFormat.get_reader(urls[0].toLocalFile()) event.acceptProposedAction() except MissingReaderException: pass def dropEvent(self, event): """Handle file drops""" urls = event.mimeData().urls() if urls: self.add_path(urls[0].toLocalFile()) # add first file self.source = self.LOCAL_FILE self.load_data() def workflowEnvChanged(self, key, value, oldvalue): """ Function called when environment changes (e.g. while saving the scheme) It make sure that all environment connected values are modified (e.g. relative file paths are changed) """ self.update_file_list(key, value, oldvalue)
class OWCorpus(OWWidget):
    """Widget that loads a corpus from a file or builds one from an input table.

    The widget lets the user pick which string meta attributes are used as
    text features and sends the resulting corpus to its ``Corpus`` output.
    """

    name = "语料库"
    description = "加载文档语料库."
    icon = "icons/TextFile.svg"
    priority = 100
    replaces = ["orangecontrib.text.widgets.owloadcorpus.OWLoadCorpus"]

    class Inputs:
        data = Input('Data', Table)

    class Outputs:
        corpus = Output('Corpus', Corpus)

    want_main_area = False
    resizing_enabled = True

    # File-dialog filter string: one "all readable" entry followed by one
    # entry per reader, in the order readers first appear in the registry.
    dlgFormats = (
        "所有可读文档 ({});;".format(
            '*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_files = Setting([
        "book-excerpts.tab",
        "grimm-tales-selected.tab",
        "election-tweets-2016.tab",
        "friends-transcripts.tab",
        "andersen.tab",
        "chinese-example.tab",
    ])
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("无法读取文件 {} ({})")
        no_text_features_used = Msg("至少使用一个文本特征")
        corpus_without_text_features = Msg("语料库没有文本特征")

    def __init__(self):
        """Build the controls and load the first recent file."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "语料库文件", orientation=0)
        self.file_widget = widgets.FileWidget(
            recent_files=self.recent_files,
            icon_size=(16, 16),
            on_open=self.open_file,
            dialog_format=self.dlgFormats,
            dialog_title='打开语料库文档',
            reload_label='重新加载',
            browse_label='浏览',
            allow_empty=False,
            minimal_width=250,
        )
        fbox.layout().addWidget(self.file_widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "语料库信息", addSpace=True)
        self.info_label = gui.label(ibox, self, "")
        self.update_info()

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "已使用的文本特征", addSpace=False)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # Any change of the "used" list re-applies the feature selection.
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "未使用的文本特征", addSpace=False)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # Documentation Data Sets & Report
        box = gui.hBox(self.controlArea)
        self.browse_documentation = gui.button(
            box, self, "浏览语料库文档",
            callback=lambda: self.file_widget.browse(
                get_sample_corpora_dir()),
            autoDefault=False,
        )

        # load first file
        self.file_widget.select(0)

    def sizeHint(self):
        """Return the preferred initial widget size."""
        return QSize(400, 300)

    @Inputs.data
    def set_data(self, data):
        """Handle the ``Data`` input.

        While a table is present on the input, the file controls are
        disabled and the corpus is built from that table; when the input is
        removed, the selected file is reloaded.
        """
        have_data = data is not None

        # Enable/Disable command when data from input
        self.file_widget.setEnabled(not have_data)
        self.browse_documentation.setEnabled(not have_data)

        if have_data:
            self.open_file(data=data)
        else:
            self.file_widget.reload()

    def open_file(self, path=None, data=None):
        """Load a corpus from ``data`` (preferred) or from ``path``.

        Parameters
        ----------
        path : str, optional
            File to read with ``Corpus.from_file``.
        data : Table, optional
            Table to convert with ``Corpus.from_table``.

        If neither argument is given (or both are falsy) nothing is loaded.
        When reading ``path`` fails, the ``read_file`` error is shown, the
        widget is reset to the "no corpus" state and ``None`` is sent.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                corpus = Corpus.from_file(path)
                corpus.name = os.path.splitext(os.path.basename(path))[0]
            except Exception as err:  # pylint: disable=broad-except
                # Readers can fail in arbitrary ways.  Do not fall through:
                # the code below dereferences self.corpus, which is either
                # None (first load) or a stale corpus from a previous file.
                self.Error.read_file(path, str(err))
                self.corpus = None
                self.update_info()
                self.Outputs.corpus.send(None)
                return
            self.corpus = corpus
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        # Extending the "used" model emits rowsInserted, which is connected
        # to update_feature_selection in __init__.
        self.used_attrs_model.extend(self.used_attrs)
        self.unused_attrs_model.extend(
            [f for f in self.corpus.domain.metas
             if f.is_string and f not in self.used_attrs_model])

    def update_info(self):
        """Refresh the info label with a summary of the current corpus."""
        def describe(corpus):
            dom = corpus.domain
            text_feats = sum(m.is_string for m in dom.metas)
            other_feats = len(dom.attributes) + len(dom.metas) - text_feats
            text = \
                "{} 个文档, {} 个文本特征, {} 个其他特征.". \
                format(len(corpus), text_feats, other_feats)
            if dom.has_continuous_class:
                text += "<br/>回归; 数值类."
            elif dom.has_discrete_class:
                text += "<br/>分类; 离散值含有 {} 种值.". \
                    format(len(dom.class_var.values))
            elif corpus.domain.class_vars:
                text += "<br/>多目标; {} 个目标变量.".format(
                    len(corpus.domain.class_vars))
            else:
                text += "<br/>数据没有目标变量"
            text += "</p>"
            return text

        if self.corpus is None:
            self.info_label.setText("没有加载语料库")
        else:
            self.info_label.setText(describe(self.corpus))

    def update_feature_selection(self):
        """Apply the "used" list to the corpus and send the result.

        Sends ``None`` instead of the corpus when the corpus has no
        columns, no documents or no text features.
        """
        self.Error.no_text_features_used.clear()

        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        def remove_duplicates(l):
            unique = []
            for i in l:
                if i not in unique:
                    unique.append(i)
            return unique

        if self.corpus is not None:
            self.corpus.set_text_features(
                remove_duplicates(self.used_attrs_model))
            self.used_attrs = list(self.used_attrs_model)

            if len(self.unused_attrs_model) > 0 \
                    and not self.corpus.text_features:
                self.Error.no_text_features_used()

            # prevent sending "empty" corpora
            dom = self.corpus.domain
            empty = not (dom.variables or dom.metas) \
                or len(self.corpus) == 0 \
                or not self.corpus.text_features
            self.Outputs.corpus.send(self.corpus if not empty else None)

    def send_report(self):
        """Add the file name, document count and feature lists to the report."""
        def describe(features):
            if len(features):
                return ', '.join([f.name for f in features])
            else:
                return '(无)'

        if self.corpus is not None:
            domain = self.corpus.domain
            self.report_items('Corpus', (
                ("File", self.file_widget.get_selected_filename()),
                ("Documents", len(self.corpus)),
                ("Used text features", describe(self.used_attrs_model)),
                ("Ignored text features", describe(self.unused_attrs_model)),
                ('Other features', describe(domain.attributes)),
                ('Target', describe(domain.class_vars)),
            ))
class OWVcfFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a VCF file and outputs it as a data table.

    The loaded variants can be filtered by a quality threshold and by a
    frequency threshold before they are sent to the ``Data`` output.
    """

    name = "VCF File"
    id = "orangecontrib.variants.widgets.vcf"
    description = "Read data from a VCF file."
    icon = "icons/VCFFile.svg"
    priority = 10
    category = "Variants"
    keywords = ["data", "vcf", "file", "load", "read"]

    class Outputs:
        data = Output(
            "Data", Table,
            doc="Attribute-valued data set read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Files larger than this many bytes are not loaded automatically.
    SIZE_LIMIT = 1e7

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "small.vcf"),
    ])

    quality = Setting(1)
    cb_qual = Setting(True)
    frequency = Setting(1)
    cb_freq = Setting(True)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")

    def __init__(self):
        """Build the controls and schedule loading of the last used file."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.variants = None
        self.table = None
        self.loaded_file = ""

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        label = gui.widgetLabel(self, " File: ")
        layout.addWidget(label, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None, self, '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None, self, "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        def enable_apply():
            # self.apply_button is created further down; this closure is
            # only invoked later, from the spin-box callbacks.
            self.apply_button.setEnabled(True)

        box = gui.vBox(self.controlArea, "Filtering")
        _, qspin = gui.spin(box, self, 'quality', 0, 999, step=1,
                            label='Quality threshold (QT)',
                            callback=enable_apply,
                            checked='cb_qual',
                            checkCallback=enable_apply)
        qspin.setToolTip("Minimum quality to use reads.")
        _, fspin = gui.spin(box, self, 'frequency', 0, 999, step=1,
                            label='Frequency threshold (FT)',
                            callback=enable_apply,
                            checked='cb_freq',
                            checkCallback=enable_apply)
        fspin.setToolTip("Keep only variants with at least this many "
                         "occurrences of alternative alleles.")

        gui.rubber(self.controlArea)

        box = gui.hBox(self.controlArea)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)
        gui.rubber(box)

        self.apply_button = gui.button(box, self, "Apply",
                                       callback=self.apply)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        last_path = self.last_path()
        if last_path and os.path.exists(last_path) and \
                os.path.getsize(last_path) > self.SIZE_LIMIT:
            self.Warning.file_too_big()
            return

        # Defer loading until the event loop is running.
        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred initial widget size."""
        return QSize(500, 200)

    def select_file(self, n):
        """Make the ``n``-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.load_data()
            self.set_file_list()

    def browse_file(self):
        """Let the user choose a file in a dialog and load it."""
        start_file = self.last_path() or os.path.expanduser("~/")
        dialog_formats = "VCF files (*.vcf);;All files (*)"
        filename, _ = QFileDialog.getOpenFileName(
            self, 'Open Orange Data File', start_file, dialog_formats)
        if not filename:
            return
        self.add_path(filename)
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read the most recent path and send the (filtered) data.

        On a missing file or a reader failure the widget state is reset,
        ``None`` is sent and the info label describes the problem.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        # pylint: disable=broad-except
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()
        if not self.last_path() or not os.path.exists(self.last_path()):
            if self.last_path():
                self.Error.file_not_found()
            # Drop previously loaded data so that the report and the info
            # label cannot describe a file that is no longer selected.
            self.variants = self.table = None
            self.Outputs.data.send(None)
            self.info.setText("No data.")
            return

        error = None
        variants = None
        with catch_warnings(record=True) as warnings:
            try:
                variants = VariantData(self.last_path())
            except Exception as ex:
                log.exception(ex)
                error = ex
            # Show the last recorded warning, or clear the message.
            self.warning(warnings[-1].message.args[0] if warnings else '')

        if error:
            self.variants = self.table = None
            self.Outputs.data.send(None)
            self.info.setText("An error occurred:\n{}".format(error))
            return

        self.loaded_file = self.last_path()
        self.variants = variants
        self.apply()  # sends data

    def update_info(self):
        """Show sample/variant counts before and after filtering."""
        def plural(count):
            return '' if count == 1 else 's'

        text = ""
        if self.variants is not None:
            nsamples, nvariants = self.variants.gt.T.shape
            text += ("<p>Before filtering:<br/>"
                     " {} sample{}, {} variant{}</p>").format(
                         nsamples, plural(nsamples),
                         nvariants, plural(nvariants))
        if self.table is not None:
            nsamples, nvariants = self.table.X.shape
            size = self.table.X.size
            # An empty matrix (everything filtered out) would otherwise
            # give a 0/0 division and display "nan%".
            below = np.isnan(self.table.X).sum() / size * 100 if size else 0.0
            text += ("<p>After filtering:<br/>"
                     " {} sample{}, {} variant{}<br/>"
                     " {:.2f}% reads below QT</p>").format(
                         nsamples, plural(nsamples),
                         nvariants, plural(nvariants), below)
        self.info.setText(text)

    def apply(self):
        """Filter the loaded variants with the enabled thresholds and send."""
        if self.variants is None:
            self.table = None
        else:
            # A disabled threshold is passed to get_data as None.
            q = self.quality if self.cb_qual else None
            f = self.frequency if self.cb_freq else None
            self.table = self.variants.get_data(q, f)
        self.update_info()
        self.Outputs.data.send(self.table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the file name, enabled filters and data summary to the report."""
        if self.table is None:
            self.report_paragraph("VCF File", "No file.")
            return

        home = os.path.expanduser("~")
        if self.loaded_file.startswith(home):
            # os.path.join does not like ~
            name = "~" + os.path.sep + \
                   self.loaded_file[len(home):].lstrip("/").lstrip("\\")
        else:
            name = self.loaded_file
        self.report_items("VCF File", [
            ("File name", name),
        ])

        parameters = [("Quality", self.quality, self.cb_qual),
                      ("Frequency", self.frequency, self.cb_freq)]
        self.report_items(
            "Filtering parameters",
            [(label, value) for label, value, enabled in parameters
             if enabled])

        self.report_data("Data", self.table)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                # No reader for this file: leave the event unaccepted.
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.load_data()
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or a URL.

    The loaded domain can be edited in an embedded domain editor before the
    table is sent to the ``Data`` output.
    """

    name = "File"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["data", "file", "load", "read"]
    outputs = [
        widget.OutputSignal(
            "Data", Table,
            doc="Attribute-valued data set read from the input file.")
    ]

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    # Files larger than this many bytes are not loaded automatically.
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler()

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    # File-dialog filter string: one "all readable" entry followed by one
    # entry per reader, in the order readers first appear in the registry.
    dlg_formats = (
        "All readable files ({});;".format(
            '*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join(
            "{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
            for f in sorted(set(FileFormat.readers.values()),
                            key=list(FileFormat.readers.values()).index)))

    domain_editor = SettingProvider(DomainEditor)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")

    def __init__(self):
        """Build the controls and schedule loading of the last used source."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None, self, "source", box=True,
                                addSpace=True, callback=self.load_data,
                                addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(None, self, '...',
                                 callback=self.browse_file,
                                 autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(None, self, "Reload",
                                   callback=self.load_data,
                                   autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        # Sheet selector; hidden unless the reader reports several sheets.
        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(
            None, self, "xls_sheet",
            callback=self.select_sheet,
            sendSelectedValue=True,
        )
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        self.url_combo = url_combo = QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        l, t, r, b = url_edit.getTextMargins()
        url_edit.setTextMargins(l + 5, t, r, b)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)

        box = gui.vBox(self.controlArea, "Info")
        self.info = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.widgetBox(self.controlArea,
                            "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        box = gui.hBox(self.controlArea)
        gui.button(box, self, "Browse documentation data sets",
                   callback=lambda: self.browse_file(True),
                   autoDefault=False)
        gui.rubber(box)
        box.layout().addWidget(self.report_button)
        self.report_button.setFixedWidth(170)

        self.apply_button = gui.button(box, self, "Apply",
                                       callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        # Editing the domain enables the Apply button.
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                self.Warning.file_too_big()
                return

        # Defer loading until the event loop is running.
        QTimer.singleShot(0, self.load_data)

    def sizeHint(self):
        """Return the preferred initial widget size."""
        return QSize(600, 550)

    def select_file(self, n):
        """Make the ``n``-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def select_sheet(self):
        """Store the chosen sheet on the current recent path and reload."""
        self.recent_paths[0].sheet = self.sheet_combo.currentText()
        self.load_data()

    def _url_set(self):
        """Switch the source to URL and load it."""
        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user choose a file in a dialog and load it.

        Parameters
        ----------
        in_demos : bool
            When true, the dialog starts in the documentation data sets
            directory instead of the last used path.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation data sets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        filename, _ = QFileDialog.getOpenFileName(
            self, 'Open Orange Data File', start_file, self.dlg_formats)
        if not filename:
            return
        self.add_path(filename)
        self.source = self.LOCAL_FILE
        self.load_data()

    def _send_no_data(self, message):
        """Reset the loaded data, send ``None`` and show ``message``."""
        self.data = None
        self.send("Data", None)
        self.info.setText(message)
        self.sheet_box.hide()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Read the selected source and send the resulting table.

        On a missing file, a missing source or a reader failure the widget
        state is reset, ``None`` is sent and the info label describes the
        problem.
        """
        # We need to catch any exception type since anything can happen in
        # file readers
        # pylint: disable=broad-except
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        # Only a local source can be "not found"; a stale recent path must
        # not block loading from a URL.
        if self.source == self.LOCAL_FILE and self.last_path() \
                and not os.path.exists(self.last_path()):
            self.Error.file_not_found()
            self._send_no_data("No data.")
            return

        error = None
        try:
            self.reader = self._get_reader()
        except Exception as ex:
            error = ex
        else:
            if self.reader is None:
                self._send_no_data("No data.")
                return

        data = None
        if not error:
            with catch_warnings(record=True) as warnings:
                try:
                    # Listing sheets may already touch the file, so it is
                    # covered by the same error handling as the read.
                    self._update_sheet_combo()
                    data = self.reader.read()
                except Exception as ex:
                    log.exception(ex)
                    error = ex
                # Show the last recorded warning, or clear the message.
                self.warning(warnings[-1].message.args[0] if warnings else '')

        if error:
            self._send_no_data("An error occurred:\n{}".format(error))
            return

        self.info.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data

    def _get_reader(self):
        """Return a reader for the current source.

        Returns
        -------
        FileFormat or None
            ``None`` when no local path or no URL is selected.
        """
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if not path:
                return None
            reader = FileFormat.get_reader(path)
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        elif self.source == self.URL:
            url = self.url_combo.currentText().strip()
            if url:
                return UrlReader(url)
        return None

    def _update_sheet_combo(self):
        """Fill and show the sheet selector, or hide it for < 2 sheets."""
        if len(self.reader.sheets) < 2:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Select the reader's sheet in the combo, or fall back to the first."""
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
            else:
                self.sheet_combo.setCurrentIndex(idx)
                return
        self.sheet_combo.setCurrentIndex(0)

    def _describe(self, table):
        """Return an HTML summary of ``table`` for the info label."""
        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [attrs[desc]
                 for desc in ("Name", "Description") if desc in attrs]
        if len(descs) == 2:
            descs[0] = "<b>{}</b>".format(descs[0])
        if descs:
            text += "<p>{}</p>".format("<br/>".join(descs))

        text += "<p>{} instance(s), {} feature(s), {} meta attribute(s)".\
            format(len(table), len(domain.attributes), len(domain.metas))
        if domain.has_continuous_class:
            text += "<br/>Regression; numerical class."
        elif domain.has_discrete_class:
            text += "<br/>Classification; discrete class with {} values.".\
                format(len(domain.class_var.values))
        elif table.domain.class_vars:
            text += "<br/>Multi-target; {} target variables.".format(
                len(table.domain.class_vars))
        else:
            text += "<br/>Data has no target variable."
        text += "</p>"

        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            text += '<p>First entry: {}<br/>Last entry: {}</p>'.format(
                table[0, 'Timestamp'], table[-1, 'Timestamp'])
        return text

    def storeSpecificSettings(self):
        """Copy the edited variables into the current context."""
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        """Restore the edited variables from the current context, if stored."""
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def apply_domain_edit(self):
        """Build a table from the edited domain and send it."""
        if self.data is not None:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            X, y, m = cols
            # The arrays are transposed before being handed to
            # Table.from_numpy; empty parts become an empty matrix / None.
            X = np.array(X).T if len(X) else np.empty((len(self.data), 0))
            y = np.array(y).T if len(y) else None
            # Metas need an object array as soon as one of them is a string.
            has_strings = any(isinstance(var, StringVariable)
                              for var in domain.metas)
            dtpe = object if has_strings else float
            m = np.array(m, dtype=dtpe).T if len(m) else None
            table = Table.from_numpy(domain, X, y, m, self.data.W)
            table.name = self.data.name
            table.ids = np.array(self.data.ids)
            table.attributes = getattr(self.data, 'attributes', {})
        else:
            table = self.data

        self.send("Data", table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the source, its format and the data summary to the report."""
        def get_ext_name(filename):
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~" + os.path.sep + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            # Resolve the format before the sheet suffix is appended;
            # otherwise the " (sheet)" text becomes part of the extension.
            file_format = get_ext_name(name)
            if self.sheet_combo.isVisible():
                name += " ({})".format(self.sheet_combo.currentText())
            self.report_items("File", [("File name", name),
                                       ("Format", file_format)])
        else:
            # NOTE(review): self.url is a Setting that is not assigned
            # anywhere in this class -- confirm it is kept in sync with
            # the URL combo elsewhere.
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    def dragEnterEvent(self, event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(
                    OSX_NSURL_toLocalFile(urls[0]) or urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                # No reader for this file: leave the event unaccepted.
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(
                OSX_NSURL_toLocalFile(urls[0])
                or urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()
class OWCorpusViewer(OWWidget):
    """Widget that lists the documents of a corpus and displays the selected ones.

    The document list can be narrowed with a regular-expression filter that is
    matched against the text built from the chosen "search features". The
    selected documents are rendered as HTML in a web view and are sent to the
    outputs as matching / non-matching / annotated corpora.
    """
    name = "Corpus Viewer"
    description = "Display corpus contents."
    icon = "icons/CorpusViewer.svg"
    priority = 500

    class Inputs:
        corpus = Input("Corpus", Corpus, replaces=["Data"])

    class Outputs:
        matching_docs = Output("Matching Docs", Corpus, default=True)
        other_docs = Output("Other Docs", Corpus)
        corpus = Output("Corpus", Corpus)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL
    )

    search_indices = ContextSetting([], exclude_metas=False)   # features included in search
    display_indices = ContextSetting([], exclude_metas=False)  # features for display
    display_features = ContextSetting([], exclude_metas=False)
    selected_documents = ContextSetting([])
    regexp_filter = ContextSetting("")

    show_tokens = Setting(False)
    autocommit = Setting(True)

    class Warning(OWWidget.Warning):
        no_feats_search = Msg('No features included in search.')
        no_feats_display = Msg('No features selected for display.')

    def __init__(self):
        """Build the control area (info, feature lists) and the main area."""
        super().__init__()

        self.corpus = None              # Corpus
        self.corpus_docs = None         # Documents generated from Corpus
        self.doc_webview = None         # WebView for showing content
        self.search_features = []       # two copies are needed since Display allows drag & drop
        self.display_list_indices = [0]
        self.matches = 0                # Matches of the query

        # Info attributes
        self.update_info()
        info_box = gui.widgetBox(self.controlArea, 'Info')
        gui.label(info_box, self, 'Tokens: %(n_tokens)s')
        gui.label(info_box, self, 'Types: %(n_types)s')
        gui.label(info_box, self, 'Matching documents: %(n_matching)s')
        gui.label(info_box, self, 'Matches: %(n_matches)s')

        # Search features
        self.search_listbox = gui.listBox(
            self.controlArea, self, 'search_indices', 'search_features',
            selectionMode=QListView.ExtendedSelection,
            box='Search features', callback=self.search_features_changed)

        # Display features
        display_box = gui.widgetBox(self.controlArea, 'Display features')
        self.display_listbox = gui.listBox(
            display_box, self, 'display_list_indices', 'display_features',
            selectionMode=QListView.ExtendedSelection,
            callback=self.show_docs, enableDragDrop=True)
        self.show_tokens_checkbox = gui.checkBox(
            display_box, self, 'show_tokens', 'Show Tokens && Tags',
            callback=self.show_docs)

        # Auto-commit box
        gui.auto_commit(self.controlArea, self, 'autocommit',
                        'Send data', 'Auto send is on')

        # Search
        self.filter_input = gui.lineEdit(
            self.mainArea, self, 'regexp_filter',
            orientation=Qt.Horizontal,
            sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
                                   QSizePolicy.Fixed),
            label='RegExp Filter:',
            callback=self.refresh_search)

        # Main area
        self.splitter = QSplitter(
            orientation=Qt.Horizontal,
            childrenCollapsible=False,
        )

        # Document list
        self.doc_list = QTableView()
        self.doc_list.setSelectionBehavior(QTableView.SelectRows)
        self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
        self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.doc_list.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        self.doc_list.horizontalHeader().setVisible(False)
        self.splitter.addWidget(self.doc_list)

        self.doc_list_model = QStandardItemModel(self)
        self.doc_list.setModel(self.doc_list_model)
        self.doc_list.selectionModel().selectionChanged.connect(
            self.selection_changed
        )

        # Document contents
        self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)

        self.mainArea.layout().addWidget(self.splitter)

    def copy_to_clipboard(self):
        """Copy the text currently selected in the web view to the clipboard."""
        text = self.doc_webview.selectedText()
        QApplication.clipboard().setText(text)

    @Inputs.corpus
    def set_data(self, corpus=None):
        """Handle a new input corpus (or ``None``) and refresh the whole widget."""
        self.closeContext()
        self.reset_widget()
        self.corpus = corpus
        self.search_features = []
        if corpus is not None:
            domain = self.corpus.domain
            # Enable/disable tokens checkbox
            if not self.corpus.has_tokens():
                self.show_tokens_checkbox.setCheckState(False)
            self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())

            self.search_features = list(
                filter_visible(chain(domain.variables, domain.metas)))
            self.display_features = list(
                filter_visible(chain(domain.variables, domain.metas)))
            self.search_indices = list(range(len(self.search_features)))
            self.display_indices = list(range(len(self.display_features)))
            # By default the first document is selected (if there is one).
            self.selected_documents = [corpus.titles[0]] if \
                corpus.titles is not None and len(corpus.titles) else []
            self.openContext(self.corpus)
            self.display_list_indices = self.display_indices
            self.regenerate_docs()
            self.list_docs()
            self.update_info()
            self.set_selection()
            self.show_docs()
        self.commit()

    def reset_widget(self):
        """Clear the corpus, the list widgets, the models and the web view."""
        # Corpus
        self.corpus = None
        self.corpus_docs = None
        self.display_features = []
        # Widgets
        self.search_listbox.clear()
        self.display_listbox.clear()
        self.filter_input.clear()
        self.update_info()
        # Models/vars
        self.search_features.clear()
        self.search_indices.clear()
        self.display_indices.clear()
        self.doc_list_model.clear()
        # Warnings
        self.Warning.clear()
        # WebView
        self.doc_webview.setHtml('')

    def list_docs(self):
        """ List documents into the left scrolling area """
        if self.corpus_docs is None:
            return
        # TODO: remove search_keyword??
        search_keyword = self.regexp_filter.strip('|')
        matches = 0
        try:
            reg = re.compile(search_keyword, re.IGNORECASE)
        except re.error:
            # An invalid (e.g. half-typed) pattern leaves the list untouched.
            return
        self.doc_list_model.clear()

        for doc, title, content in zip(self.corpus, self.corpus.titles,
                                       self.corpus_docs):
            # Count the matches without materialising them.
            res = sum(1 for _ in reg.finditer(content)) \
                if self.regexp_filter else 0
            if not self.regexp_filter or res:
                matches += res
                item = QStandardItem()
                item.setData(str(title), Qt.DisplayRole)
                item.setData(doc, Qt.UserRole)
                self.doc_list_model.appendRow(item)
        self.matches = matches

    def get_selected_documents_from_view(self) -> Set[str]:
        """
        Returns
        -------
        Set with names of selected documents in the QTableView
        """
        return {
            i.data(Qt.DisplayRole)
            for i in self.doc_list.selectionModel().selectedRows()
        }

    def set_selection(self) -> None:
        """
        Select documents in selected_documents attribute in the view
        """
        view = self.doc_list
        model = view.model()

        previously_selected = self.selected_documents.copy()
        selection = QItemSelection()
        for row in range(model.rowCount()):
            document = model.data(model.index(row, 0), Qt.DisplayRole)
            if document in self.selected_documents:
                selection.append(QItemSelectionRange(
                    view.model().index(row, 0),
                    view.model().index(row, 0)
                ))
        view.selectionModel().select(
            selection, QItemSelectionModel.ClearAndSelect
        )
        if len(selection) == 0:
            # in cases when selection is empty qt's selection_changed is not
            # called and so we need to manually trigger show_docs
            self.show_docs()
        # select emits the selection-changed signal, which calls
        # selection_changed; when filtering, this means that documents which
        # are currently filtered out get removed from self.selected_documents.
        # We still want to keep them so that they are selected again after the
        # user removes the filter.
        self.selected_documents = previously_selected

    def selection_changed(self) -> None:
        """
        Function is called every time the selection changes - when user select
        new range of documents
        """
        self.selected_documents = self.get_selected_documents_from_view()
        self.show_docs()
        self.commit()

    def show_docs(self):
        """ Show the selected documents in the right area """
        # Doubled braces are literal CSS braces; the single `{}` is the body.
        HTML = '''
        <!doctype html>
        <html>
        <head>
        <script type="text/javascript" src="resources/jquery-3.1.1.min.js">
        </script>
        <script type="text/javascript" src="resources/jquery.mark.min.js">
        </script>
        <script type="text/javascript" src="resources/highlighter.js">
        </script>
        <meta charset='utf-8'>
        <style>

        table {{ border-collapse: collapse; }}
        mark {{ background: #FFCD28; }}

        tr > td {{
            padding-bottom: 3px;
            padding-top: 3px;
        }}

        body {{
            font-family: Helvetica;
            font-size: 10pt;
        }}

        .line {{ border-bottom: 1px solid #000; }}
        .separator {{ height: 5px; }}

        .variables {{
            vertical-align: top;
            padding-right: 10px;
        }}

        .content {{
            /* Adopted from https://css-tricks.com/snippets/css/prevent-long-urls-from-breaking-out-of-container/ */

            /* These are technically the same, but use both */
            overflow-wrap: break-word;
            word-wrap: break-word;

            -ms-word-break: break-all;
            /* This is the dangerous one in WebKit, as it breaks things wherever */
            word-break: break-all;
            /* Instead use this non-standard one: */
            word-break: break-word;

            /* Adds a hyphen where the word breaks, if supported (No Blink) */
            -ms-hyphens: auto;
            -moz-hyphens: auto;
            -webkit-hyphens: auto;
            hyphens: auto;
        }}

        .token {{
            padding: 3px;
            border: 1px #B0B0B0 solid;
            margin-right: 5px;
            margin-bottom: 5px;
            display: inline-block;
        }}

        img {{
            max-width: 100%;
        }}

        </style>
        </head>
        <body>
        {}
        </body>
        </html>
        '''
        self.display_indices = self.display_list_indices
        if self.corpus is None:
            return

        self.Warning.no_feats_display.clear()
        if len(self.display_indices) == 0:
            self.Warning.no_feats_display()

        if self.show_tokens:
            tokens = list(self.corpus.ngrams_iterator(include_postags=True))

        # Only values of features that take part in the search get highlighted.
        marked_search_features = [f for i, f in enumerate(self.search_features)
                                  if i in self.search_indices]

        parts = ['<table>']
        selected_rows = self.doc_list.selectionModel().selectedRows()
        for doc_count, index in enumerate(selected_rows):
            if doc_count > 0:   # add split
                parts.append('<tr class="line separator"><td/><td/></tr>'
                             '<tr class="separator"><td/><td/></tr>')

            row_ind = index.data(Qt.UserRole).row_index
            for ind in self.display_indices:
                feature = self.display_features[ind]
                value = str(index.data(Qt.UserRole)[feature.name])
                if feature in marked_search_features:
                    value = self.__mark_text(value)
                value = value.replace('\n', '<br/>')
                is_image = feature.attributes.get('type', '') == 'image'
                if is_image and value != '?':
                    value = '<img src="{}"></img>'.format(value)
                parts.append(
                    '<tr><td class="variables"><strong>{}:</strong></td>'
                    '<td class="content">{}</td></tr>'.format(
                        feature.name, value))

            if self.show_tokens:
                parts.append(
                    '<tr><td class="variables"><strong>Tokens & Tags:</strong></td>'
                    '<td>{}</td></tr>'.format(''.join(
                        '<span class="token">{}</span>'.format(token)
                        for token in tokens[row_ind])))

        parts.append('</table>')
        html = ''.join(parts)
        base = QUrl.fromLocalFile(__file__)
        self.doc_webview.setHtml(HTML.format(html), base)

    def __mark_text(self, text):
        """Wrap every match of the current filter in ``text`` in a <mark> tag."""
        search_keyword = self.regexp_filter.strip('|')
        if not search_keyword:
            return text

        try:
            reg = re.compile(search_keyword, re.IGNORECASE | re.MULTILINE)
        except re.error:
            return text

        matches = list(reg.finditer(text))
        if not matches:
            return text

        text = list(text)
        # Replace from the last match backwards so that the offsets of the
        # earlier matches stay valid while the text grows.
        for m in matches[::-1]:
            text[m.start():m.end()] = list(
                '<mark data-markjs="true">{}</mark>'
                .format("".join(text[m.start():m.end()])))

        return "".join(text)

    def search_features_changed(self):
        """React to a change in the set of features used for searching."""
        self.regenerate_docs()
        self.refresh_search()

    def regenerate_docs(self):
        """Rebuild the searchable document texts from the search features."""
        self.corpus_docs = None
        self.Warning.no_feats_search.clear()
        if self.corpus is not None:
            feats = [self.search_features[i] for i in self.search_indices]
            if len(feats) == 0:
                self.Warning.no_feats_search()
            self.corpus_docs = self.corpus.documents_from_features(feats)

    def refresh_search(self):
        """Re-run the filter, restore the selection and update the outputs."""
        if self.corpus is not None:
            self.list_docs()
            self.set_selection()
            self.update_info()
            self.commit()

    def update_info(self):
        """Refresh the attributes shown by the labels in the Info box."""
        if self.corpus is not None:
            self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(),
                                             len(self.corpus))
            self.n_matches = self.matches if self.matches else 'n/a'
            self.n_tokens = sum(map(len, self.corpus.tokens)) \
                if self.corpus.has_tokens() else 'n/a'
            self.n_types = len(self.corpus.dictionary) \
                if self.corpus.has_tokens() else 'n/a'
        else:
            self.n_matching = ''
            self.n_matches = ''
            self.n_tokens = ''
            self.n_types = ''

    def commit(self):
        """Send selected, non-selected and annotated corpora to the outputs."""
        matched = unmatched = annotated_corpus = None
        corpus = self.corpus
        if corpus is not None:
            # it returns a set of selected documents which are in view
            selected_docs = self.get_selected_documents_from_view()
            titles = corpus.titles
            matched_mask = [
                i for i, t in enumerate(titles) if t in selected_docs
            ]
            unmatched_mask = [
                i for i, t in enumerate(titles) if t not in selected_docs
            ]

            matched = corpus[matched_mask] if len(matched_mask) else None
            unmatched = corpus[unmatched_mask] if len(unmatched_mask) else None
            annotated_corpus = create_annotated_table(corpus, matched_mask)
        self.Outputs.matching_docs.send(matched)
        self.Outputs.other_docs.send(unmatched)
        self.Outputs.corpus.send(annotated_corpus)

    def send_report(self):
        """Add the query and the match statistics to the report."""
        self.report_items((
            ("Query", self.regexp_filter),
            ("Matching documents", self.n_matching),
            ("Matches", self.n_matches)
        ))

    def showEvent(self, event):
        super().showEvent(event)
        self.update_splitter()

    def update_splitter(self):
        """
        Update splitter that document list on the left never take more
        than 1/3 of the space. It is only set on showEvent. If user
        later changes sizes it stays as it is.
        """
        w1, w2 = self.splitter.sizes()
        ws = w1 + w2
        if w2 < 2/3 * ws:
            # QSplitter.setSizes expects integers; floats raise a TypeError
            # with current PyQt/Python versions.
            left = ws // 3
            self.splitter.setSizes([left, ws - left])
class OWFile(widget.OWWidget, RecentPathsWComboMixin):
    """Widget that reads a data table from a local file or from a URL.

    The loaded domain can be edited in the embedded domain editor before the
    table is sent to the ``Data`` output.
    """
    name = "File"
    id = "orange.widgets.data.file"
    description = "Read data from an input file or network " \
                  "and send a data table to the output."
    icon = "icons/File.svg"
    priority = 10
    category = "Data"
    keywords = ["file", "load", "read", "open"]

    class Outputs:
        data = Output("Data", Table,
                      doc="Attribute-valued dataset read from the input file.")

    want_main_area = False

    SEARCH_PATHS = [("sample-datasets", get_sample_datasets_dir())]
    SIZE_LIMIT = 1e7
    LOCAL_FILE, URL = range(2)

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    # pylint seems to want declarations separated from definitions
    recent_paths: List[RecentPath]
    recent_urls: List[str]
    variables: list

    # Overload RecentPathsWidgetMixin.recent_paths to set defaults
    recent_paths = Setting([
        RecentPath("", "sample-datasets", "iris.tab"),
        RecentPath("", "sample-datasets", "titanic.tab"),
        RecentPath("", "sample-datasets", "housing.tab"),
        RecentPath("", "sample-datasets", "heart_disease.tab"),
        RecentPath("", "sample-datasets", "brown-selected.tab"),
        RecentPath("", "sample-datasets", "zoo.tab"),
    ])
    recent_urls = Setting([])
    source = Setting(LOCAL_FILE)
    xls_sheet = ContextSetting("")
    sheet_names = Setting({})
    url = Setting("")

    variables = ContextSetting([])

    domain_editor = SettingProvider(DomainEditor)

    class Warning(widget.OWWidget.Warning):
        file_too_big = widget.Msg(
            "The file is too large to load automatically."
            " Press Reload to load.")
        load_warning = widget.Msg("Read warning:\n{}")

    class Error(widget.OWWidget.Error):
        file_not_found = widget.Msg("File not found.")
        missing_reader = widget.Msg("Missing reader.")
        sheet_error = widget.Msg("Error listing available sheets.")
        unknown = widget.Msg("Read error:\n{}")

    class NoFileSelected:
        # Sentinel returned by _get_reader when there is nothing to load.
        pass

    UserAdviceMessages = [
        widget.Message(
            "Use CSV File Import widget for advanced options "
            "for comma-separated files",
            "use-csv-file-import"),
        widget.Message(
            "This widget loads only tabular data. Use other widgets to load "
            "other data types like models, distance matrices and networks.",
            "other-data-types")
    ]

    def __init__(self):
        """Build the GUI and schedule loading of the last used file."""
        super().__init__()
        RecentPathsWComboMixin.__init__(self)
        self.domain = None
        self.data = None
        self.loaded_file = ""
        self.reader = None

        layout = QGridLayout()
        gui.widgetBox(self.controlArea, margin=0, orientation=layout)
        vbox = gui.radioButtons(None, self, "source", box=True, addSpace=True,
                                callback=self.load_data, addToLayout=False)

        rb_button = gui.appendRadioButton(vbox, "File:", addToLayout=False)
        layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

        box = gui.hBox(None, addToLayout=False, margin=0)
        box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.file_combo.activated[int].connect(self.select_file)
        box.layout().addWidget(self.file_combo)
        layout.addWidget(box, 0, 1)

        file_button = gui.button(
            None, self, '...', callback=self.browse_file, autoDefault=False)
        file_button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
        file_button.setSizePolicy(Policy.Maximum, Policy.Fixed)
        layout.addWidget(file_button, 0, 2)

        reload_button = gui.button(
            None, self, "Reload", callback=self.load_data, autoDefault=False)
        reload_button.setIcon(self.style().standardIcon(
            QStyle.SP_BrowserReload))
        reload_button.setSizePolicy(Policy.Fixed, Policy.Fixed)
        layout.addWidget(reload_button, 0, 3)

        self.sheet_box = gui.hBox(None, addToLayout=False, margin=0)
        self.sheet_combo = gui.comboBox(
            None, self, "xls_sheet", callback=self.select_sheet,
            sendSelectedValue=True,
        )
        self.sheet_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_label = QLabel()
        self.sheet_label.setText('Sheet')
        self.sheet_label.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
        self.sheet_box.layout().addWidget(self.sheet_label, Qt.AlignLeft)
        self.sheet_box.layout().addWidget(self.sheet_combo, Qt.AlignVCenter)
        layout.addWidget(self.sheet_box, 2, 1)
        self.sheet_box.hide()

        rb_button = gui.appendRadioButton(vbox, "URL:", addToLayout=False)
        layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

        self.url_combo = url_combo = QComboBox()
        url_model = NamedURLModel(self.sheet_names)
        url_model.wrap(self.recent_urls)
        url_combo.setLineEdit(LineEditSelectOnFocus())
        url_combo.setModel(url_model)
        url_combo.setSizePolicy(Policy.Ignored, Policy.Fixed)
        url_combo.setEditable(True)
        url_combo.setInsertPolicy(url_combo.InsertAtTop)
        url_edit = url_combo.lineEdit()
        left, top, right, bottom = url_edit.getTextMargins()
        url_edit.setTextMargins(left + 5, top, right, bottom)
        layout.addWidget(url_combo, 3, 1, 3, 3)
        url_combo.activated.connect(self._url_set)
        # with the completer we make the combo box case sensitive when
        # matching the history
        completer = QCompleter()
        completer.setCaseSensitivity(Qt.CaseSensitive)
        url_combo.setCompleter(completer)

        box = gui.vBox(self.controlArea, "Info")
        self.infolabel = gui.widgetLabel(box, 'No data loaded.')
        self.warnings = gui.widgetLabel(box, '')

        box = gui.widgetBox(self.controlArea, "Columns (Double click to edit)")
        self.domain_editor = DomainEditor(self)
        self.editor_model = self.domain_editor.model()
        box.layout().addWidget(self.domain_editor)

        box = gui.hBox(self.controlArea)
        gui.button(
            box, self, "Browse documentation datasets",
            callback=lambda: self.browse_file(True), autoDefault=False)
        gui.rubber(box)

        gui.button(
            box, self, "Reset", callback=self.reset_domain_edit)
        self.apply_button = gui.button(
            box, self, "Apply", callback=self.apply_domain_edit)
        self.apply_button.setEnabled(False)
        self.apply_button.setFixedWidth(170)
        self.editor_model.dataChanged.connect(
            lambda: self.apply_button.setEnabled(True))

        self.set_file_list()
        # Must not call open_file from within __init__. open_file
        # explicitly re-enters the event loop (by a progress bar)

        self.setAcceptDrops(True)

        if self.source == self.LOCAL_FILE:
            last_path = self.last_path()
            if last_path and os.path.exists(last_path) and \
                    os.path.getsize(last_path) > self.SIZE_LIMIT:
                # Large files are loaded only on an explicit Reload.
                self.Warning.file_too_big()
                return

        QTimer.singleShot(0, self.load_data)

    @staticmethod
    def sizeHint():
        return QSize(600, 550)

    def select_file(self, n):
        """Make the n-th recent path current and load it."""
        assert n < len(self.recent_paths)
        super().select_file(n)
        if self.recent_paths:
            self.source = self.LOCAL_FILE
            self.load_data()
            self.set_file_list()

    def select_sheet(self):
        """Store the sheet chosen in the combo and reload the data."""
        self.recent_paths[0].sheet = self.sheet_combo.currentText()
        self.load_data()

    def _url_set(self):
        """Normalize the entered URL, switch the source to URL and load it."""
        url = self.url_combo.currentText()
        pos = self.recent_urls.index(url)
        url = url.strip()

        if not urlparse(url).scheme:
            url = 'http://' + url
            self.url_combo.setItemText(pos, url)
            self.recent_urls[pos] = url

        self.source = self.URL
        self.load_data()

    def browse_file(self, in_demos=False):
        """Let the user pick a file in a dialog and load it.

        If ``in_demos`` is true, the dialog starts in the sample datasets
        directory; otherwise in the last used path or the home directory.
        """
        if in_demos:
            start_file = get_sample_datasets_dir()
            if not os.path.exists(start_file):
                QMessageBox.information(
                    None, "File",
                    "Cannot find the directory with documentation datasets")
                return
        else:
            start_file = self.last_path() or os.path.expanduser("~/")

        readers = [
            f for f in FileFormat.formats
            if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)
        ]
        filename, reader, _ = open_filename_dialog(start_file, None, readers)
        if not filename:
            return
        self.add_path(filename)
        if reader is not None:
            self.recent_paths[0].file_format = reader.qualified_name()

        self.source = self.LOCAL_FILE
        self.load_data()

    # Open a file, create data from it and send it over the data channel
    def load_data(self):
        """Load the current source; on failure show the error and send None."""
        # We need to catch any exception type since anything can happen in
        # file readers
        self.closeContext()
        self.domain_editor.set_domain(None)
        self.apply_button.setEnabled(False)
        self.clear_messages()
        self.set_file_list()

        error = self._try_load()
        if error:
            error()
            self.data = None
            self.sheet_box.hide()
            self.Outputs.data.send(None)
            self.infolabel.setText("No data.")

    def _try_load(self):
        """Try to read the data.

        Returns ``None`` on success (or when nothing is selected), otherwise a
        callable that shows the appropriate error message when called.
        """
        # pylint: disable=broad-except
        if self.last_path() and not os.path.exists(self.last_path()):
            return self.Error.file_not_found

        try:
            self.reader = self._get_reader()
            assert self.reader is not None
        except Exception:
            return self.Error.missing_reader

        if self.reader is self.NoFileSelected:
            self.Outputs.data.send(None)
            return None

        try:
            self._update_sheet_combo()
        except Exception:
            return self.Error.sheet_error

        with catch_warnings(record=True) as warnings:
            try:
                data = self.reader.read()
            except Exception as ex:
                log.exception(ex)
                # Bind the exception as a default; `ex` is unbound after the
                # except block ends.
                return lambda x=ex: self.Error.unknown(str(x))
            if warnings:
                self.Warning.load_warning(warnings[-1].message.args[0])

        self.infolabel.setText(self._describe(data))

        self.loaded_file = self.last_path()
        add_origin(data, self.loaded_file)
        self.data = data
        self.openContext(data.domain)
        self.apply_domain_edit()  # sends data
        return None

    def _get_reader(self) -> FileFormat:
        """Return a reader for the current source, or ``NoFileSelected``."""
        if self.source == self.LOCAL_FILE:
            path = self.last_path()
            if path is None:
                return self.NoFileSelected
            if self.recent_paths and self.recent_paths[0].file_format:
                qname = self.recent_paths[0].file_format
                reader_class = class_from_qualified_name(qname)
                reader = reader_class(path)
            else:
                reader = FileFormat.get_reader(path)
            if self.recent_paths and self.recent_paths[0].sheet:
                reader.select_sheet(self.recent_paths[0].sheet)
            return reader
        else:
            url = self.url_combo.currentText().strip()
            if url:
                return UrlReader(url)
            else:
                return self.NoFileSelected

    def _update_sheet_combo(self):
        """Show the sheet selector only if the reader offers several sheets."""
        if len(self.reader.sheets) < 2:
            self.sheet_box.hide()
            self.reader.select_sheet(None)
            return

        self.sheet_combo.clear()
        self.sheet_combo.addItems(self.reader.sheets)
        self._select_active_sheet()
        self.sheet_box.show()

    def _select_active_sheet(self):
        """Point the sheet combo at the reader's active sheet."""
        if self.reader.sheet:
            try:
                idx = self.reader.sheets.index(self.reader.sheet)
                self.sheet_combo.setCurrentIndex(idx)
            except ValueError:
                # Requested sheet does not exist in this file
                self.reader.select_sheet(None)
        else:
            self.sheet_combo.setCurrentIndex(0)

    @staticmethod
    def _describe(table):
        """Return an HTML summary of ``table`` for the Info box."""
        def missing_prop(prop):
            if prop:
                return f"({prop * 100:.1f}% missing values)"
            else:
                return "(no missing values)"

        domain = table.domain
        text = ""

        attrs = getattr(table, "attributes", {})
        descs = [
            attrs[desc] for desc in ("Name", "Description") if desc in attrs
        ]
        if len(descs) == 2:
            descs[0] = f"<b>{descs[0]}</b>"
        if descs:
            text += f"<p>{'<br/>'.join(descs)}</p>"

        text += f"<p>{len(table)} instance(s)"

        missing_in_attr = missing_prop(table.has_missing_attribute()
                                       and table.get_nan_frequency_attribute())
        missing_in_class = missing_prop(table.has_missing_class()
                                        and table.get_nan_frequency_class())
        text += f"<br/>{len(domain.attributes)} feature(s) {missing_in_attr}"
        if domain.has_continuous_class:
            text += f"<br/>Regression; numerical class {missing_in_class}"
        elif domain.has_discrete_class:
            text += "<br/>Classification; categorical class " \
                f"with {len(domain.class_var.values)} values {missing_in_class}"
        elif table.domain.class_vars:
            text += "<br/>Multi-target; " \
                f"{len(table.domain.class_vars)} target variables " \
                f"{missing_in_class}"
        else:
            text += "<br/>Data has no target variable."
        text += f"<br/>{len(domain.metas)} meta attribute(s)"
        text += "</p>"

        if 'Timestamp' in table.domain:
            # Google Forms uses this header to timestamp responses
            text += f"<p>First entry: {table[0, 'Timestamp']}<br/>" \
                f"Last entry: {table[-1, 'Timestamp']}</p>"
        return text

    def storeSpecificSettings(self):
        self.current_context.modified_variables = self.variables[:]

    def retrieveSpecificSettings(self):
        if hasattr(self.current_context, "modified_variables"):
            self.variables[:] = self.current_context.modified_variables

    def reset_domain_edit(self):
        """Discard the edits made in the domain editor and resend the data."""
        self.domain_editor.reset_domain()
        self.apply_domain_edit()

    def apply_domain_edit(self):
        """Send the data with the domain as edited in the domain editor."""
        if self.data is None:
            table = None
        else:
            domain, cols = self.domain_editor.get_domain(
                self.data.domain, self.data)
            if not (domain.variables or domain.metas):
                # Do not send a table without any columns.
                table = None
            else:
                X, y, m = cols
                table = Table.from_numpy(domain, X, y, m, self.data.W)
                table.name = self.data.name
                table.ids = np.array(self.data.ids)
                table.attributes = getattr(self.data, 'attributes', {})

        self.Outputs.data.send(table)
        self.apply_button.setEnabled(False)

    def get_widget_name_extension(self):
        """Return the loaded file's base name without its extension."""
        _, name = os.path.split(self.loaded_file)
        return os.path.splitext(name)[0]

    def send_report(self):
        """Add the data source, its format and a data summary to the report."""
        def get_ext_name(filename):
            try:
                return FileFormat.names[os.path.splitext(filename)[1]]
            except KeyError:
                return "unknown"

        if self.data is None:
            self.report_paragraph("File", "No file.")
            return

        if self.source == self.LOCAL_FILE:
            home = os.path.expanduser("~")
            if self.loaded_file.startswith(home):
                # os.path.join does not like ~
                name = "~" + os.path.sep + \
                       self.loaded_file[len(home):].lstrip("/").lstrip("\\")
            else:
                name = self.loaded_file
            if self.sheet_combo.isVisible():
                name += f" ({self.sheet_combo.currentText()})"
            # The format is looked up from the real path: the displayed name
            # may carry a " (sheet)" suffix that would spoil the extension.
            self.report_items("File", [("File name", name),
                                       ("Format",
                                        get_ext_name(self.loaded_file))])
        else:
            self.report_items("Data", [("Resource", self.url),
                                       ("Format", get_ext_name(self.url))])

        self.report_data("Data", self.data)

    @staticmethod
    def dragEnterEvent(event):
        """Accept drops of valid file urls"""
        urls = event.mimeData().urls()
        if urls:
            try:
                FileFormat.get_reader(urls[0].toLocalFile())
                event.acceptProposedAction()
            except IOError:
                pass

    def dropEvent(self, event):
        """Handle file drops"""
        urls = event.mimeData().urls()
        if urls:
            self.add_path(urls[0].toLocalFile())  # add first file
            self.source = self.LOCAL_FILE
            self.load_data()

    def workflowEnvChanged(self, key, value, oldvalue):
        """
        Function called when environment changes (e.g. while saving the scheme)
        It make sure that all environment connected values are modified
        (e.g. relative file paths are changed)
        """
        self.update_file_list(key, value, oldvalue)
class OWCorpus(OWWidget):
    """Widget that loads a corpus from a file.

    The user chooses which string meta features are used as text features;
    the corpus is sent to the output whenever that selection changes.
    """
    name = "Corpus"
    description = "Load a corpus of text documents."
    icon = "icons/TextFile.svg"
    priority = 10
    replaces = ["orangecontrib.text.widgets.owloadcorpus.OWLoadCorpus"]

    class Outputs:
        corpus = Output("Corpus", Corpus)

    want_main_area = False
    resizing_enabled = True

    # File-dialog filter: one entry matching all readable extensions followed
    # by one entry per reader, in the order the readers are registered.
    dlgFormats = (
        "All readable files ({});;".format(
            '*' + ' *'.join(FileFormat.readers.keys())) +
        ";;".join("{} (*{})".format(f.DESCRIPTION, ' *'.join(f.EXTENSIONS))
                  for f in sorted(set(FileFormat.readers.values()),
                                  key=list(FileFormat.readers.values()).index)))

    settingsHandler = PerfectDomainContextHandler(
        match_values=PerfectDomainContextHandler.MATCH_VALUES_ALL)

    recent_files = Setting([
        "book-excerpts.tab",
        "grimm-tales-selected.tab",
        "election-tweets-2016.tab",
        "friends-transcripts.tab",
        "andersen.tab",
    ])
    used_attrs = ContextSetting([])

    class Error(OWWidget.Error):
        read_file = Msg("Can't read file {} ({})")

    def __init__(self):
        """Build the GUI and load the first of the recent files."""
        super().__init__()

        self.corpus = None

        # Browse file box
        fbox = gui.widgetBox(self.controlArea, "Corpus file", orientation=0)
        self.file_widget = widgets.FileWidget(
            recent_files=self.recent_files, icon_size=(16, 16),
            on_open=self.open_file, dialog_format=self.dlgFormats,
            dialog_title='Open Orange Document Corpus',
            reload_label='Reload', browse_label='Browse',
            allow_empty=False, minimal_width=250,
        )
        fbox.layout().addWidget(self.file_widget)

        # Corpus info
        ibox = gui.widgetBox(self.controlArea, "Corpus info", addSpace=True)
        self.info_label = gui.label(ibox, self, "")
        self.update_info()

        # Used Text Features
        fbox = gui.widgetBox(self.controlArea, orientation=0)
        ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
        self.used_attrs_model = VariableListModel(enable_dnd=True)
        self.used_attrs_view = VariablesListItemView()
        self.used_attrs_view.setModel(self.used_attrs_model)
        ubox.layout().addWidget(self.used_attrs_view)

        # Any change of the "used" list updates the corpus and the output.
        aa = self.used_attrs_model
        aa.dataChanged.connect(self.update_feature_selection)
        aa.rowsInserted.connect(self.update_feature_selection)
        aa.rowsRemoved.connect(self.update_feature_selection)

        # Ignored Text Features
        ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False)
        self.unused_attrs_model = VariableListModel(enable_dnd=True)
        self.unused_attrs_view = VariablesListItemView()
        self.unused_attrs_view.setModel(self.unused_attrs_model)
        ibox.layout().addWidget(self.unused_attrs_view)

        # Documentation Data Sets & Report
        box = gui.hBox(self.controlArea)
        gui.button(
            box, self, "Browse documentation corpora",
            callback=lambda: self.file_widget.browse(get_sample_corpora_dir()),
            autoDefault=False,
        )
        box.layout().addWidget(self.report_button)

        # load first file
        self.file_widget.select(0)

    def open_file(self, path):
        """Load the corpus at ``path`` and fill the used/ignored feature lists.

        A failure while reading is reported through ``Error.read_file``.
        """
        self.closeContext()
        self.Error.read_file.clear()
        self.used_attrs_model[:] = []
        self.unused_attrs_model[:] = []
        if path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
                self.update_info()
                self.used_attrs = list(self.corpus.text_features)
                self.openContext(self.corpus)
                self.used_attrs_model.extend(self.used_attrs)
                self.unused_attrs_model.extend([
                    f for f in self.corpus.domain.metas
                    if f.is_string and f not in self.used_attrs_model
                ])
            except Exception as err:
                # Exception rather than BaseException, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                self.Error.read_file(path, str(err))

    def update_info(self):
        """Show a short description of the loaded corpus in the info label."""
        def describe(corpus):
            dom = corpus.domain
            text_feats = sum(m.is_string for m in dom.metas)
            other_feats = len(dom.attributes) + len(dom.metas) - text_feats
            text = \
                "{} document(s), {} text feature(s), {} other feature(s).". \
                format(len(corpus), text_feats, other_feats)
            if dom.has_continuous_class:
                text += "<br/>Regression; numerical class."
            elif dom.has_discrete_class:
                text += "<br/>Classification; discrete class with {} values.". \
                    format(len(dom.class_var.values))
            elif corpus.domain.class_vars:
                text += "<br/>Multi-target; {} target variables.".format(
                    len(corpus.domain.class_vars))
            else:
                text += "<br/>Data has no target variable."
            text += "</p>"
            return text

        if self.corpus is None:
            self.info_label.setText("No corpus loaded.")
        else:
            self.info_label.setText(describe(self.corpus))

    def update_feature_selection(self):
        """Apply the "used" list to the corpus and send it to the output."""
        # TODO fix VariablesListItemView so it does not emit
        # duplicated data when reordering inside a single window
        def remove_duplicates(l):
            unique = []
            for i in l:
                if i not in unique:
                    unique.append(i)
            return unique

        if self.corpus is not None:
            self.corpus.set_text_features(
                remove_duplicates(self.used_attrs_model))
            self.used_attrs = list(self.used_attrs_model)

            # prevent sending "empty" corpora
            dom = self.corpus.domain
            empty = not (dom.variables or dom.metas) or len(self.corpus) == 0
            self.Outputs.corpus.send(self.corpus if not empty else None)

    def send_report(self):
        """Add the file name, document count and feature lists to the report."""
        def describe(features):
            if len(features):
                return ', '.join([f.name for f in features])
            else:
                return '(none)'

        if self.corpus is not None:
            domain = self.corpus.domain
            self.report_items('Corpus', (
                ("File", self.file_widget.get_selected_filename()),
                ("Documents", len(self.corpus)),
                ("Used text features", describe(self.used_attrs_model)),
                ("Ignored text features", describe(self.unused_attrs_model)),
                ('Other features', describe(domain.attributes)),
                ('Target', describe(domain.class_vars)),
            ))
class OWScoreDocuments(OWWidget, ConcurrentWidgetMixin):
    """Widget that scores the documents of a corpus against a word list.

    Scores are computed in a background task (``_run``), cached per
    ``(scoring method, aggregation)`` pair in ``self.scores``, shown in a
    sortable/filterable table and appended to the output corpus as
    additional meta columns.
    """

    name = "Score Documents"
    description = ""
    icon = "icons/ScoreDocuments.svg"
    priority = 500

    buttons_area_orientation = Qt.Vertical

    # default order - table sorted in input order
    DEFAULT_SORTING = (-1, Qt.AscendingOrder)

    settingsHandler = PerfectDomainContextHandler()
    auto_commit: bool = Setting(True)
    aggregation: int = Setting(0)

    word_frequency: bool = Setting(True)
    word_appearance: bool = Setting(False)
    embedding_similarity: bool = Setting(False)
    embedding_language: int = Setting(0)

    sort_column_order: Tuple[int, int] = Setting(DEFAULT_SORTING)
    selected_rows: List[int] = ContextSetting([], schema_only=True)
    sel_method: int = ContextSetting(SelectionMethods.N_BEST)
    n_selected: int = ContextSetting(3)

    class Inputs:
        corpus = Input("Corpus", Corpus)
        words = Input("Words", Table)

    class Outputs:
        selected_documents = Output("Selected documents", Corpus, default=True)
        corpus = Output("Corpus", Corpus)

    class Warning(OWWidget.Warning):
        corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")

    class Error(OWWidget.Error):
        custom_err = Msg("{}")

    def __init__(self):
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self._setup_control_area()
        self._setup_main_area()
        self.corpus = None
        self.words = None
        # cache of computed scores, keyed by (scoring method, aggregation),
        # so that the same score is not computed more than once
        self.scores = {}

    def _setup_control_area(self) -> None:
        """Build the scoring, aggregation, selection and auto-commit controls."""
        box = gui.widgetBox(self.controlArea, "Word Scoring Methods")
        for method_attr, (label, _, tooltip) in SCORING_METHODS.items():
            row = gui.hBox(box, margin=0)
            gui.checkBox(
                row, self, method_attr, label=label,
                callback=self.__setting_changed, tooltip=tooltip,
            )
            if method_attr in ADDITIONAL_OPTIONS:
                # separate names: the loop variable is not rebound
                option_attr, options = ADDITIONAL_OPTIONS[method_attr]
                gui.comboBox(
                    row, self, option_attr, items=options,
                    callback=self.__setting_changed,
                )

        box = gui.widgetBox(self.controlArea, "Aggregation")
        gui.comboBox(
            box, self, "aggregation", items=list(AGGREGATIONS),
            callback=self.__setting_changed,
        )

        gui.rubber(self.controlArea)

        # select documents box
        box = gui.vBox(self.buttonsArea, "Select Documents")
        grid = QGridLayout()
        grid.setContentsMargins(0, 0, 0, 0)

        self._sel_method_buttons = QButtonGroup()
        for method, label in enumerate(SelectionMethods.ITEMS):
            button = QRadioButton(label)
            button.setChecked(method == self.sel_method)
            grid.addWidget(button, method, 0)
            self._sel_method_buttons.addButton(button, method)
        self._sel_method_buttons.buttonClicked[int].connect(
            self.__set_selection_method)

        # changing the number of documents also switches to "n best"
        spin = gui.spin(
            box, self, "n_selected", 1, 999, addToLayout=False,
            callback=lambda: self.__set_selection_method(
                SelectionMethods.N_BEST),
        )
        grid.addWidget(spin, 3, 1)
        box.layout().addLayout(grid)

        # autocommit
        gui.auto_send(self.buttonsArea, self, "auto_commit")

    def _setup_main_area(self) -> None:
        """Build the filter line edit and the score table (model/proxy/view)."""
        self._filter_line_edit = QLineEdit(
            textChanged=self.__on_filter_changed, placeholderText="Filter...")
        self.mainArea.layout().addWidget(self._filter_line_edit)

        self.model = model = ScoreDocumentsTableModel(parent=self)
        model.setHorizontalHeaderLabels(["Document"])

        def select_manual():
            self.__set_selection_method(SelectionMethods.MANUAL)

        self.view = view = ScoreDocumentsTableView()
        view.pressedAny.connect(select_manual)
        self.mainArea.layout().addWidget(view)
        # by default data are sorted in the Table order
        header = self.view.horizontalHeader()
        header.sectionClicked.connect(self.__on_horizontal_header_clicked)

        proxy_model = ScoreDocumentsProxyModel()
        proxy_model.setFilterKeyColumn(0)
        # explicit enum instead of ``False`` (same value, but a real
        # Qt.CaseSensitivity member)
        proxy_model.setFilterCaseSensitivity(Qt.CaseInsensitive)
        view.setModel(proxy_model)
        view.model().setSourceModel(self.model)
        self.view.selectionModel().selectionChanged.connect(
            self.__on_selection_change)

    def __on_filter_changed(self) -> None:
        """Apply the stripped filter text to the proxy model."""
        model = self.view.model()
        model.setFilterFixedString(self._filter_line_edit.text().strip())

    def __on_horizontal_header_clicked(self, index: int):
        """Store the new sort column/order and refresh selection and output."""
        header = self.view.horizontalHeader()
        self.sort_column_order = (index, header.sortIndicatorOrder())
        self._select_rows()
        # when sorting change output table must consider the new order
        # call explicitly since selection in table is not changed
        manual_with_rows = (
            self.sel_method == SelectionMethods.MANUAL and self.selected_rows
        )
        if manual_with_rows or self.sel_method == SelectionMethods.ALL:
            # retrieve selection in new order
            self.selected_rows = self.get_selected_indices()
            self._send_output()

    def __on_selection_change(self):
        """Store the rows selected in the view and send the output."""
        self.selected_rows = self.get_selected_indices()
        self._send_output()

    def __set_selection_method(self, method: int):
        """Switch selection method, check its button and reselect rows."""
        self.sel_method = method
        self._sel_method_buttons.button(method).setChecked(True)
        self._select_rows()

    @Inputs.corpus
    def set_data(self, corpus: Corpus) -> None:
        """Handle a new corpus (or ``None``) on the input and rerun scoring."""
        self.closeContext()
        self.Warning.corpus_not_normalized.clear()
        if corpus is None:
            self.corpus = None
            self._clear_and_run()
            return
        if not self._is_corpus_normalized(corpus):
            self.Warning.corpus_not_normalized()
        self.corpus = corpus
        self.selected_rows = []
        self.openContext(corpus)
        # openContext may have restored a different selection method
        self._sel_method_buttons.button(self.sel_method).setChecked(True)
        self._clear_and_run()

    @staticmethod
    # annotation is quoted so it is not evaluated at definition time
    # (Optional may not be imported in this module)
    def _get_word_attribute(words: Table) -> "Optional[List[str]]":
        """
        Extract the list of words from the input table.

        The string variable marked with attribute ``type == "words"`` is
        preferred; otherwise the string variable whose values have the lowest
        average number of whitespace-separated tokens is used.

        Returns
        -------
        Values of the chosen column as a list, or None when the table has no
        string variable.
        """
        attrs = [
            a for a in words.domain.metas + words.domain.variables
            if isinstance(a, StringVariable)
        ]
        if not attrs:
            return None
        words_attr = next(
            (a for a in attrs if a.attributes.get("type", "") == "words"), None)
        if words_attr:
            return words.get_column_view(words_attr)[0].tolist()

        # find the most suitable attribute - one with lowest average text
        # length - counted as a number of words
        def avg_len(attr):
            array_ = words.get_column_view(attr)[0]
            array_ = array_[~isnull(array_)]
            if len(array_) == 0:
                # column with only missing values: rank it last instead of
                # failing with ZeroDivisionError
                return float("inf")
            return sum(len(a.split()) for a in array_) / len(array_)

        attr = min(attrs, key=avg_len)
        return words.get_column_view(attr)[0].tolist()

    @Inputs.words
    def set_words(self, words: Table) -> None:
        """Handle a new words table (or ``None``) and rerun scoring."""
        if words is None or len(words.domain.variables + words.domain.metas) == 0:
            self.words = None
        else:
            self.words = self._get_word_attribute(words)
        self._clear_and_run()

    def _gather_scores(self) -> Tuple[np.ndarray, List[str]]:
        """
        Gather scores and labels for the dictionary that holds scores

        Only the active scorers whose result for the active aggregation is
        already cached are included.

        Returns
        -------
        scores
            Scores table
        labels
            The list with score names for the header and variables names
        """
        if self.corpus is None:
            return np.empty((0, 0)), []
        aggregation = self._get_active_aggregation()
        scorers = self._get_active_scorers()
        methods = [m for m in scorers if (m, aggregation) in self.scores]
        scores = [self.scores[(m, aggregation)] for m in methods]
        scores = np.column_stack(scores) if scores else np.empty(
            (len(self.corpus), 0))
        labels = [SCORING_METHODS[m][0] for m in methods]
        return scores, labels

    def _send_output(self) -> None:
        """
        Create corpus with scores and output it

        Sends the annotated corpus on ``Outputs.corpus`` and the selected rows
        (or None when nothing is selected) on ``Outputs.selected_documents``.
        """
        if self.corpus is None:
            self.Outputs.corpus.send(None)
            self.Outputs.selected_documents.send(None)
            return
        scores, labels = self._gather_scores()
        if labels:
            d = self.corpus.domain
            # class_vars (not class_var) so that multi-target domains keep
            # all targets and stay consistent with the Y passed below
            domain = Domain(
                d.attributes,
                d.class_vars,
                metas=d.metas + tuple(
                    ContinuousVariable(get_unique_names(d, label))
                    for label in labels),
            )
            out_corpus = Corpus(
                domain,
                self.corpus.X,
                self.corpus.Y,
                np.hstack([self.corpus.metas, scores]),
            )
            Corpus.retain_preprocessing(self.corpus, out_corpus)
        else:
            out_corpus = self.corpus
        self.Outputs.corpus.send(
            create_annotated_table(out_corpus, self.selected_rows))
        self.Outputs.selected_documents.send(
            out_corpus[self.selected_rows] if self.selected_rows else None)

    def _fill_table(self) -> None:
        """
        Fill the table in the widget with scores and document names
        """
        if self.corpus is None:
            self.model.clear()
            return
        scores, labels = self._gather_scores()
        labels = ["Document"] + labels
        titles = self.corpus.titles.tolist()

        # clearing selection and sorting to prevent SEGFAULT on model.wrap
        self.view.horizontalHeader().setSortIndicator(-1, Qt.AscendingOrder)
        with disconnected(self.view.selectionModel().selectionChanged,
                          self.__on_selection_change):
            self.view.clearSelection()

        self.model.fill_table(titles, scores)
        self.model.setHorizontalHeaderLabels(labels)
        self.view.update_column_widths()
        if self.model.columnCount() > self.sort_column_order[0]:
            # if there are not enough columns do not apply sorting from
            # settings: sorting can be saved for a score column while scores
            # are still computing - the table is filled with document names
            # before scores are computed
            self.view.horizontalHeader().setSortIndicator(
                *self.sort_column_order)

        self._select_rows()

    def _fill_and_output(self) -> None:
        """Fill the table in the widget and send the output"""
        self._fill_table()
        self._send_output()

    def _clear_and_run(self) -> None:
        """Clear cached scores and commit"""
        self.scores = {}
        self.cancel()
        self._fill_and_output()
        self.commit()

    def __setting_changed(self) -> None:
        """Callback of the scoring/aggregation controls."""
        self.commit()

    def commit(self) -> None:
        """Start computing the active scores that are not cached yet.

        When every active score is already cached the table and outputs are
        refreshed directly.  Nothing is done without both corpus and words.
        """
        self.Error.custom_err.clear()
        self.cancel()
        if self.corpus is None or self.words is None:
            return

        scorers = self._get_active_scorers()
        aggregation = self._get_active_aggregation()
        new_scores = [s for s in scorers if (s, aggregation) not in self.scores]
        if new_scores:
            # map each additional option to its currently selected item
            additional_params = {
                v: items[getattr(self, v)]
                for v, items in ADDITIONAL_OPTIONS.values()
            }
            self.start(
                _run,
                self.corpus,
                self.words,
                new_scores,
                aggregation,
                additional_params,
            )
        else:
            self._fill_and_output()

    def on_done(self, _: None) -> None:
        """Send the output once the background task has finished."""
        self._send_output()

    def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None:
        """Cache one finished score and refresh the table."""
        sc_method, aggregation, scores = result
        self.scores[(sc_method, aggregation)] = scores
        self._fill_table()

    def on_exception(self, ex: Exception) -> None:
        """Show the task's exception and refresh the table and outputs."""
        self.Error.custom_err(ex)
        self._fill_and_output()

    def _get_active_scorers(self) -> List[str]:
        """
        Gather currently active/selected scores

        Returns
        -------
        List with selected scores names
        """
        return [attr for attr in SCORING_METHODS if getattr(self, attr)]

    def _get_active_aggregation(self) -> str:
        """
        Gather currently active/selected aggregation

        Returns
        -------
        Selected aggregation name
        """
        return list(AGGREGATIONS)[self.aggregation]

    @staticmethod
    def _is_corpus_normalized(corpus: Corpus) -> bool:
        """
        Check if corpus is normalized.
        """
        return any(
            isinstance(pp, BaseNormalizer)
            for pp in corpus.used_preprocessor.preprocessors)

    def get_selected_indices(self) -> List[int]:
        """Return source-model rows of the selection, in displayed order."""
        # get indices in table's order - so that the selected output table
        # has the same order
        selected_rows = sorted(self.view.selectionModel().selectedRows(),
                               key=lambda idx: idx.row())
        return [self.view.model().mapToSource(r).row() for r in selected_rows]

    def _select_rows(self):
        """Select rows in the view according to the active selection method.

        Raises
        ------
        NotImplementedError
            When ``sel_method`` is not one of the handled methods.
        """
        proxy_model = self.view.model()
        n_rows, n_columns = proxy_model.rowCount(), proxy_model.columnCount()
        if self.sel_method == SelectionMethods.NONE:
            selection = QItemSelection()
        elif self.sel_method == SelectionMethods.ALL:
            selection = QItemSelection(
                proxy_model.index(0, 0),
                proxy_model.index(n_rows - 1, n_columns - 1))
        elif self.sel_method == SelectionMethods.MANUAL:
            selection = QItemSelection()
            new_sel = []
            # NOTE(review): rows here are source-model rows while n_rows is
            # the proxy's row count - confirm this is intended when a filter
            # hides rows.
            for row in self.selected_rows:
                if row < n_rows:
                    new_sel.append(row)
                    _selection = QItemSelection(
                        self.model.index(row, 0),
                        self.model.index(row, n_columns - 1))
                    selection.merge(
                        proxy_model.mapSelectionFromSource(_selection),
                        QItemSelectionModel.Select,
                    )
            # selected rows must be updated when the same dataset with less
            # rows appears at the input - it is not handled by
            # selectionChanged in cases when all selected rows are missing
            # in the new table
            self.selected_rows = new_sel
        elif self.sel_method == SelectionMethods.N_BEST:
            n_sel = min(self.n_selected, n_rows)
            selection = QItemSelection(
                proxy_model.index(0, 0),
                proxy_model.index(n_sel - 1, n_columns - 1))
        else:
            raise NotImplementedError

        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)
class OWPermutationImportance(OWExplainFeatureBase):
    """Widget showing permutation feature importance of a model.

    The importance of each feature is computed in a background task by
    ``permutation_feature_importance`` with the selected scorer and number
    of permutations, and is shown as mean and standard deviation per feature.
    """

    name = "Feature Importance"
    description = ("Inspect model using Permutation Feature "
                   "Importance technique.")
    keywords = ["explain", "model", "permutation", "feature", "importance"]
    icon = "icons/PermutationImportance.svg"
    priority = 50

    settingsHandler = PerfectDomainContextHandler()
    score_index = ContextSetting(0)
    n_repeats = Setting(5)

    PLOT_CLASS = FeatureImportancePlot

    class Warning(OWExplainFeatureBase.Warning):
        missing_target = Msg("Instances with unknown target values "
                             "were removed from data.")

    # GUI setup
    def _add_controls(self):
        """Add the score combo box and the permutation-count spin box."""
        box = gui.vBox(self.controlArea, "Parameters")
        self._score_combo: QComboBox = gui.comboBox(
            box, self, "score_index", label="Score:",
            items=BUILTIN_SCORERS_ORDER[DiscreteVariable],
            orientation=Qt.Horizontal, contentsLength=12,
            callback=self.__parameter_changed
        )
        gui.spin(
            box, self, "n_repeats", 1, 1000, label="Permutations:",
            controlWidth=50, callback=self.__parameter_changed
        )
        super()._add_controls()

    def __parameter_changed(self):
        """Discard current results and restart the computation."""
        self.clear()
        self.start(self.run, *self.get_runner_parameters())

    def _check_data(self):
        """Warn about and drop instances with unknown target values."""
        self.Warning.missing_target.clear()
        if self.data and np.isnan(self.data.Y).any():
            self.Warning.missing_target()
            self.data = HasClass()(self.data)

    def openContext(self, model: Optional[Model]):
        """Open the settings context for the model's domain (or ``None``)."""
        super().openContext(model.domain if model else None)

    def setup_controls(self):
        """Fill the score combo for the model's class type; prefer "R2"."""
        if self.model and self.model.domain.has_continuous_class:
            class_type = ContinuousVariable
        else:
            class_type = DiscreteVariable
        self._score_combo.clear()
        items = BUILTIN_SCORERS_ORDER[class_type]
        self._score_combo.addItems(items)
        self.score_index = items.index("R2") if "R2" in items else 0

    @staticmethod
    def _release_tuple(version_string: str) -> Tuple[int, ...]:
        """Return the leading dotted-number part of a version as ints.

        E.g. ``"3.32.0.dev0+abc"`` gives ``(3, 32, 0)``; a string that does
        not start with a number gives ``()``.
        """
        import re  # local import: only needed for this compatibility check

        match = re.match(r"\d+(?:\.\d+)*", version_string)
        if match is None:
            return ()
        return tuple(int(part) for part in match.group().split("."))

    def get_runner_parameters(self) -> Tuple[Optional[Table], Optional[Model],
                                             Optional[Type[Score]], int]:
        """Return ``(data, model, scorer class or None, n_repeats)``."""
        score = None
        if self.model:
            # Compare versions numerically; comparing the strings would be
            # lexicographic (e.g. "3.4.0" > "3.31.1").
            if self._release_tuple(version) > (3, 31, 1):
                # Eventually, keep only this branch (remove the ``else``
                # branch below) and upgrade minimal Orange version to 3.32.0.
                # Also remove the Orange.version import
                score = usable_scorers(self.model.domain)[self.score_index]
            else:
                var = self.model.domain.class_var
                score = usable_scorers(var)[self.score_index]
        return self.data, self.model, score, self.n_repeats

    # Plot setup
    def update_scene(self):
        """Plot mean importances (with std), sorted in descending order."""
        super().update_scene()
        if self.results is None:
            return

        importance = self.results.x
        mean = np.mean(importance, axis=1)
        std = np.std(importance, axis=1)
        indices = np.argsort(mean)[::-1]
        names = [self.results.names[i] for i in indices]
        score = self._score_combo.itemText(self.score_index)
        txt = "Increase" if score in ("MSE", "RMSE", "MAE") else "Decrease"
        x_label = f"{txt} in {score}"
        self.setup_plot(mean[indices], names, std[indices], x_label)

    # Selection
    def update_selection(self, attr_names: Set[str]):
        """Store a changed feature selection and commit."""
        if set(self.selection) == attr_names:
            return
        assert self.results is not None
        self.selection = tuple(attr_names)
        self.commit()

    def select_pending(self, pending_selection: Tuple):
        """Apply a stored selection to the plot once results exist."""
        if not pending_selection or self.results is None:
            return

        self.plot.select_from_settings(pending_selection)
        super().select_pending(())

    # Outputs
    def get_selected_data(self) -> Optional[Table]:
        """Return data with the selected attributes, class vars and metas.

        Returns None when nothing is selected or there is no data.
        """
        if not self.selection or not self.data:
            return None
        domain = self.data.domain
        attrs = [a for a in domain.attributes if a.name in self.selection]
        return self.data[:, attrs + list(domain.class_vars + domain.metas)]

    def get_scores_table(self) -> Table:
        """Return a table with "Mean", "Std" and meta "Feature" columns."""
        domain = Domain([ContinuousVariable("Mean"),
                         ContinuousVariable("Std")],
                        metas=[StringVariable("Feature")])
        x = self.results.x
        X = np.vstack((np.mean(x, axis=1), np.std(x, axis=1))).T
        M = np.array(self.results.names)[:, None]
        scores_table = Table(domain, X, metas=M)
        scores_table.name = "Feature Scores"
        return scores_table

    # Misc
    def send_report(self):
        """Report the scorer and permutation count, then the base report."""
        if not self.data or not self.model or not self.data.domain.class_var:
            return
        var_type = type(self.data.domain.class_var)
        items = {
            "Score": BUILTIN_SCORERS_ORDER[var_type][self.score_index],
            "Permutations": self.n_repeats,
        }
        self.report_items(items)
        super().send_report()

    @staticmethod
    def run(data: Table, model: Model, score_class: Type[Score],
            n_repeats: int, state: TaskState) -> Optional[Results]:
        """Compute permutation feature importance for the background task.

        Returns None when data, model or scorer is missing; otherwise a
        ``Results`` with the importances, feature names and an all-True mask
        with one entry per row of the importance array.
        """
        if not data or not model or not score_class:
            return None

        def callback(i: float, status=""):
            # ``i`` is multiplied by 100 before being reported as progress
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        importance, names = permutation_feature_importance(
            model, data, score_class(), n_repeats, callback)
        mask = np.ones(importance.shape[0], dtype=bool)
        return Results(x=importance, names=names, mask=mask)