예제 #1
0
class OWGenes(OWWidget, ConcurrentWidgetMixin):
    """Widget that matches gene identifiers from the input table to Entrez IDs.

    Gene identifiers are read either from the feature (column) names or from
    one string/discrete column of the input table. They are handed to a
    ``GeneMatcher`` that runs as a background task; the results are shown in
    two tables (matched and unknown genes) and used to annotate the output.
    """

    name = "Genes"
    description = "Tool for working with genes"
    icon = "../widgets/icons/OWGeneInfo.svg"
    priority = 40
    want_main_area = True

    # Widget-level settings.
    selected_organism: int = Setting(11)
    search_pattern: str = Setting('')
    exclude_unmatched = Setting(True)
    replace_id_with_symbol = Setting(True)
    auto_commit = Setting(True)

    # Settings stored in a context opened on the input domain.
    settingsHandler = DomainContextHandler()
    selected_gene_col = ContextSetting(None)
    use_attr_names = ContextSetting(True)

    replaces = [
        'orangecontrib.bioinformatics.widgets.OWGeneNameMatcher.OWGeneNameMatcher'
    ]

    class Inputs:
        data_table = Input("Data", Table)

    class Outputs:
        data_table = Output("Data", Table)
        gene_matcher_results = Output("Genes", Table)

    class Information(OWWidget.Information):
        pass

    def sizeHint(self):
        """Return the preferred initial size of the widget window."""
        return QSize(1280, 960)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)

        # ATTRIBUTES #
        self.target_database = ENTREZ_ID

        # input data
        self.input_data = None
        self.input_genes = None
        self.tax_id = None
        self.column_candidates = []

        # input options
        self.organisms = []

        # gene matcher
        self.gene_matcher = None

        # progress bar
        self.progress_bar = None

        # Single-shot timer used to debounce filtering while the user types.
        self._timer = QTimer()
        self._timer.timeout.connect(self._apply_filter)
        self._timer.setSingleShot(True)

        # GUI SECTION #

        # Control area
        self.info_box = widgetLabel(
            widgetBox(self.controlArea, "Info", addSpace=True),
            'No data on input.\n')

        organism_box = vBox(self.controlArea, 'Organism')
        self.organism_select_combobox = comboBox(
            organism_box,
            self,
            'selected_organism',
            callback=self.on_input_option_change)

        self.get_available_organisms()
        self.organism_select_combobox.setCurrentIndex(self.selected_organism)

        box = widgetBox(self.controlArea, 'Gene IDs in the input data')
        self.gene_columns_model = itemmodels.DomainModel(
            valid_types=(StringVariable, DiscreteVariable))
        self.gene_column_combobox = comboBox(
            box,
            self,
            'selected_gene_col',
            label='Stored in data column',
            model=self.gene_columns_model,
            sendSelectedValue=True,
            callback=self.on_input_option_change,
        )

        self.attr_names_checkbox = checkBox(
            box,
            self,
            'use_attr_names',
            'Stored as feature (column) names',
            disables=[(-1, self.gene_column_combobox)],
            callback=self.on_input_option_change,
        )

        # The column selector is only relevant when genes are not column names.
        self.gene_column_combobox.setDisabled(bool(self.use_attr_names))

        output_box = vBox(self.controlArea, 'Output')

        self.exclude_radio = checkBox(output_box,
                                      self,
                                      'exclude_unmatched',
                                      'Exclude unmatched genes',
                                      callback=self.commit)

        self.replace_radio = checkBox(output_box,
                                      self,
                                      'replace_id_with_symbol',
                                      'Replace feature IDs with gene names',
                                      callback=self.commit)

        auto_commit(self.controlArea,
                    self,
                    "auto_commit",
                    "&Commit",
                    box=False)

        rubber(self.controlArea)

        # Main area
        self.filter = lineEdit(self.mainArea,
                               self,
                               'search_pattern',
                               'Filter:',
                               callbackOnType=True,
                               callback=self.handle_filter_callback)
        self.mainArea.layout().addWidget(self.filter)

        # Vertical splitter: matched genes on top, unknown genes below.
        self.splitter = QSplitter()
        self.splitter.setOrientation(Qt.Vertical)

        self.table_model = GeneInfoModel()
        self.table_view = QTableView()
        self.table_view.setAlternatingRowColors(True)
        self.table_view.viewport().setMouseTracking(True)
        self.table_view.setSortingEnabled(True)
        self.table_view.setShowGrid(False)
        self.table_view.verticalHeader().hide()

        self.unknown_model = UnknownGeneInfoModel()

        self.unknown_view = QTableView()
        self.unknown_view.setModel(self.unknown_model)
        self.unknown_view.verticalHeader().hide()
        self.unknown_view.setShowGrid(False)
        self.unknown_view.setSelectionMode(QAbstractItemView.NoSelection)
        self.unknown_view.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)

        self.splitter.addWidget(self.table_view)
        self.splitter.addWidget(self.unknown_view)

        self.splitter.setStretchFactor(0, 90)
        self.splitter.setStretchFactor(1, 10)

        self.mainArea.layout().addWidget(self.splitter)

    def handle_filter_callback(self):
        """Restart the debounce timer; filtering runs 500 ms after the last edit."""
        self._timer.stop()
        self._timer.start(500)

    def _apply_filter(self):
        """Apply the current search pattern to the gene table and commit."""
        # filter only if input data is present and model is populated
        if self.table_model.table is not None:
            self.table_model.update_model(
                filter_pattern=str(self.search_pattern))
            self.commit()

    def __reset_widget_state(self):
        """Detach and clear both gene models and refresh the info box."""
        self.table_view.clearSpans()
        self.table_view.setModel(None)
        self.table_model.clear()
        self.unknown_model.clear()
        self._update_info_box()

    def _update_info_box(self):
        """Show gene counts in the info box, or a placeholder without genes."""
        if self.input_genes and self.gene_matcher:
            num_genes = len(self.gene_matcher.genes)
            known_genes = len(self.gene_matcher.get_known_genes())

            info_text = ('{} genes in input data\n'
                         '{} genes match Entrez database\n'
                         '{} genes with match conflicts\n'.format(
                             num_genes, known_genes, num_genes - known_genes))
        else:
            info_text = 'No data on input.'

        self.info_box.setText(info_text)

    def on_done(self, _):
        """Populate the views from ``self.gene_matcher`` and commit.

        Presumably invoked by ``ConcurrentWidgetMixin`` once the task started
        in ``_run`` has finished; the task result argument is ignored.
        """
        # update info box
        self._update_info_box()

        # set output options
        self.toggle_radio_options()

        # set known genes
        self.table_model.initialize(self.gene_matcher.genes)
        self.table_view.setModel(self.table_model)
        self.table_view.selectionModel().selectionChanged.connect(self.commit)
        self.table_view.setSelectionBehavior(QAbstractItemView.SelectRows)

        self.table_view.setItemDelegateForColumn(
            self.table_model.entrez_column_index,
            LinkStyledItemDelegate(self.table_view))

        # Derive the row height from the style's size for a 20x20 item.
        v_header = self.table_view.verticalHeader()
        option = self.table_view.viewOptions()
        size = self.table_view.style().sizeFromContents(
            QStyle.CT_ItemViewItem, option, QSize(20, 20), self.table_view)

        v_header.setDefaultSectionSize(size.height() + 2)
        v_header.setMinimumSectionSize(5)
        self.table_view.horizontalHeader().setStretchLastSection(True)

        # set unknown genes
        self.unknown_model.initialize(self.gene_matcher.genes)
        self.unknown_view.verticalHeader().setStretchLastSection(True)

        self._apply_filter()

    def get_available_organisms(self):
        """Fill ``self.organisms`` and the organism combo box.

        Organisms come from ``taxonomy.common_taxids()`` and are ordered by
        their name; ``self.organisms`` holds the taxonomy ids in that order.
        """
        available_organism = sorted(((tax_id, taxonomy.name(tax_id))
                                     for tax_id in taxonomy.common_taxids()),
                                    key=lambda x: x[1])

        self.organisms = [tax_id for tax_id, _ in available_organism]
        self.organism_select_combobox.addItems(
            [organism_name for _, organism_name in available_organism])

    def gene_names_from_table(self):
        """Extract gene names from the input table into ``self.input_genes``.

        Names are taken from the feature names when ``use_attr_names`` is set,
        otherwise from the selected column (rows whose value is NaN are
        skipped). ``self.input_genes`` is left empty when there is no input
        data or no usable gene column.
        """
        self.input_genes = []
        if not self.input_data:
            return

        if self.use_attr_names:
            self.input_genes = [
                str(attr.name).strip()
                for attr in self.input_data.domain.attributes
            ]
            return

        if self.selected_gene_col is None:
            self.selected_gene_col = self.gene_column_identifier()

        gene_col = self.selected_gene_col
        if gene_col is None:
            # No string/discrete column is available to read genes from.
            return

        self.input_genes = [
            str(row[gene_col]) for row in self.input_data
            if not np.isnan(row[gene_col])
        ]

    def _update_gene_matcher(self):
        """Create a fresh ``GeneMatcher`` for the current genes and organism."""
        self.gene_names_from_table()

        self.gene_matcher = GeneMatcher(self.get_selected_organism(),
                                        auto_start=False)
        self.gene_matcher.genes = self.input_genes

    def get_selected_organism(self):
        """Return the taxonomy id at the currently selected organism index."""
        return self.organisms[self.selected_organism]

    def _run(self):
        """Start the background matching task if a gene matcher exists."""
        if self.gene_matcher is not None:
            self.start(run_gene_matcher, self.gene_matcher)

    def on_input_option_change(self):
        """Reset the views, rebuild the gene matcher and rerun the matching."""
        self.__reset_widget_state()
        self._update_gene_matcher()
        self._run()

    def gene_column_identifier(self):
        """Return the column most likely to store genes.

        Among the string and discrete variables of ``gene_columns_model`` the
        one with the most unique values is chosen; on ties the last such
        variable in model order wins.

        Returns:
            The chosen variable, or ``None`` when the model offers no string
            or discrete variable.
        """
        # candidates -> (variable, num of unique values)
        candidates = [
            (col, np.unique(self.input_data.get_column_view(col)[0]).size)
            for col in self.gene_columns_model
            if isinstance(col, (DiscreteVariable, StringVariable))
        ]
        if not candidates:
            return None

        # sorted() is stable, so taking the last item keeps the tie-breaking
        # order described in the docstring.
        best_candidate, _ = sorted(candidates, key=lambda x: x[1])[-1]
        return best_candidate

    def find_genes_location(self):
        """Try to locate the genes in the input data when data is first loaded.

        Rules:
            - when the domain has no attributes, look at the columns instead;
            - pick the most suitable column, i.e. the one with the most
              unique values (see ``gene_column_identifier``).
        """
        domain = self.input_data.domain
        if not domain.attributes and self.selected_gene_col is None:
            self.selected_gene_col = self.gene_column_identifier()
            self.use_attr_names = False

    @Inputs.data_table
    def handle_input(self, data):
        """Handle a new input table, or the removal of the input.

        Widget state is always reset first. Without data both outputs are
        cleared; with data the organism and gene location hints are read from
        the table attributes and the matching is started.
        """
        self.closeContext()
        self.input_data = None
        self.input_genes = None
        self.__reset_widget_state()
        self.gene_columns_model.set_domain(None)
        self.selected_gene_col = None

        if not data:
            # Nothing to annotate: make sure no stale output stays downstream.
            self.Outputs.data_table.send(None)
            self.Outputs.gene_matcher_results.send(None)
            return

        self.input_data = data
        self.gene_columns_model.set_domain(self.input_data.domain)

        # check if input table has tax_id, human is used if tax_id is not found
        self.tax_id = str(self.input_data.attributes.get(TAX_ID, '9606'))
        # check for gene location. Default is that genes are attributes in the input table.
        self.use_attr_names = self.input_data.attributes.get(
            GENE_AS_ATTRIBUTE_NAME, self.use_attr_names)

        if self.tax_id in self.organisms and not self.selected_organism:
            self.selected_organism = self.organisms.index(self.tax_id)

        self.openContext(self.input_data.domain)
        self.find_genes_location()
        self.on_input_option_change()

    def commit(self):
        """Send the annotated data table and the gene matcher results.

        The genes selected in the table view are used; with no selection all
        genes that pass the current filter are used. ``None`` is sent on both
        outputs when there is no input data, no gene matcher, no selection
        model yet, or no gene could be matched.
        """
        selection_model = self.table_view.selectionModel()
        if (self.input_data is None or self.gene_matcher is None
                or selection_model is None):
            # Reached e.g. when an output checkbox is toggled before any data
            # has been matched; the view has no selection model until a model
            # has been set on it.
            self.Outputs.data_table.send(None)
            self.Outputs.gene_matcher_results.send(None)
            return

        selection = selection_model.selectedRows(
            self.table_model.entrez_column_index)

        selected_genes = [row.data() for row in selection]
        if not selected_genes:
            selected_genes = self.table_model.get_filtered_genes()

        gene_ids = self.get_target_ids()
        known_genes = [gid for gid in gene_ids if gid != '?']

        table = None
        gm_table = None
        if known_genes:
            if self.use_attr_names:
                table = self._annotate_columns(selected_genes, known_genes)
            else:
                table = self._annotate_rows(gene_ids, selected_genes,
                                            known_genes)

            gm_table = self.gene_matcher.to_data_table(
                selected_genes=selected_genes if selected_genes else None)

        self.Outputs.data_table.send(table)
        self.Outputs.gene_matcher_results.send(gm_table)

    def _annotate_rows(self, gene_ids, selected_genes, known_genes):
        """Build the output table when genes are stored in rows (one column)."""
        input_domain = self.input_data.domain
        if self.target_database in input_domain:
            gene_var = input_domain[self.target_database]
            metas = input_domain.metas
        else:
            gene_var = StringVariable(self.target_database)
            metas = input_domain.metas + (gene_var, )

        domain = Domain(input_domain.attributes, input_domain.class_vars,
                        metas)

        table = self.input_data.transform(domain)
        col, _ = table.get_column_view(gene_var)
        # NOTE(review): gene_ids has one entry per gene collected by
        # gene_names_from_table, which skips rows whose value is NaN. If rows
        # were skipped this assignment presumably fails on a length mismatch
        # -- confirm and handle if such inputs are expected.
        col[:] = gene_ids

        # filter selected rows
        selected_genes_set = set(selected_genes)
        selected_rows = [
            row_index for row_index, row in enumerate(table)
            if str(row[gene_var]) in selected_genes_set
        ]

        # handle table attributes
        table.attributes[TAX_ID] = self.get_selected_organism()
        table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
        table.attributes[GENE_ID_COLUMN] = self.target_database
        table = table[selected_rows] if selected_rows else table

        if self.exclude_unmatched:
            # create filter from selected column for genes
            only_known = table_filter.FilterStringList(gene_var, known_genes)
            # apply filter to the data
            table = table_filter.Values([only_known])(table)

        return table

    def _annotate_columns(self, selected_genes, known_genes):
        """Build the output table when genes are stored as feature names."""
        domain = self.input_data.domain.copy()
        table = self.input_data.transform(domain)

        for gene in self.gene_matcher.genes:
            if gene.input_identifier not in table.domain:
                continue

            variable = table.domain[gene.input_identifier]
            variable.attributes[self.target_database] = (
                str(gene.gene_id) if gene.gene_id else '?')

            if self.replace_id_with_symbol:
                try:
                    variable.name = str(gene.symbol)
                except AttributeError:
                    # TODO: missing gene symbol, need to handle this?
                    pass

        # filter selected columns
        selected_genes_set = set(selected_genes)
        selected = [
            column for column in table.domain.attributes
            if self.target_database in column.attributes
            and str(column.attributes[
                self.target_database]) in selected_genes_set
        ]

        output_attrs = selected if selected else table.domain.attributes

        if self.exclude_unmatched:
            known_genes_set = set(known_genes)
            # .get(): a column that was never annotated above has no target id
            # and is treated as unmatched instead of raising KeyError.
            output_attrs = [
                col for col in output_attrs
                if col.attributes.get(self.target_database) in known_genes_set
            ]

        domain = Domain(output_attrs, table.domain.class_vars,
                        table.domain.metas)

        table = table.from_table(domain, table)

        # handle table attributes
        table.attributes[TAX_ID] = self.get_selected_organism()
        table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
        table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

        return table

    def toggle_radio_options(self):
        """Enable or disable the output checkboxes for the current results."""
        # Renaming features only makes sense when genes are feature names.
        self.replace_radio.setEnabled(bool(self.use_attr_names))

        if self.gene_matcher.genes:
            # enable checkbox if unknown genes are detected
            has_unknown_genes = len(self.gene_matcher.genes) != len(
                self.gene_matcher.get_known_genes())
            self.exclude_radio.setEnabled(has_unknown_genes)
            self.exclude_unmatched = has_unknown_genes

    def get_target_ids(self):
        """Return one id string per gene in the matcher, ``'?'`` if it has none."""
        return [
            str(gene.gene_id) if gene.gene_id else '?'
            for gene in self.gene_matcher.genes
        ]
class OWMarkerGenes(widget.OWWidget):
    """Widget for browsing marker gene tables and outputting a selection.

    Marker gene tables are listed and downloaded through ``serverfiles``. The
    table is split by its first meta column (shown as "Organism"), displayed
    in a filterable tree view, and the selected rows (or all shown rows when
    nothing is selected) are sent on the "Genes" output.
    """

    name = "Marker Genes"
    icon = 'icons/OWMarkerGenes.svg'
    priority = 170

    replaces = [
        'orangecontrib.single_cell.widgets.owmarkergenes.OWMarkerGenes'
    ]

    class Warning(widget.OWWidget.Warning):
        using_local_files = widget.Msg(
            "Can't connect to serverfiles. Using cached files.")

    class Outputs:
        genes = widget.Output("Genes", Table)

    want_main_area = True

    # Widget-level settings.
    selected_group: str = settings.Setting('')
    selected_db_source: str = settings.Setting('')
    filter_text: str = settings.Setting('')
    header_state: bytes = settings.Setting(b'')
    auto_commit = settings.Setting(True)

    # Selection is stored in a context opened on the selected group.
    settingsHandler = MarkerGroupContextHandler()
    selected_genes: Set[tuple] = settings.ContextSetting(set())

    def __init__(self):
        """Build the GUI and load the marker gene data."""
        super().__init__()
        self._data = None
        self._available_db_sources = None
        self.output = None

        # Single-shot timer used to debounce filtering while the user types.
        self._timer = QTimer()
        self._timer.timeout.connect(self._filter_table)
        self._timer.setSingleShot(True)
        self.info.set_input_summary("0")
        self.info.set_output_summary("0")

        box = gui.widgetBox(self.controlArea, 'Database', margin=0)
        self.db_source_index = -1
        self.db_source_cb = gui.comboBox(box, self, 'db_source_index')
        self.db_source_cb.activated[int].connect(self.handle_source_changed)

        box = gui.widgetBox(self.controlArea, 'Organism', margin=0)
        self.group_index = -1
        self.group_cb = gui.comboBox(box, self, 'group_index')
        self.group_cb.activated[int].connect(self.set_group_index)

        gui.rubber(self.controlArea)

        gui.auto_commit(self.controlArea, self, "auto_commit", "Commit",
                        "Commit Automatically")
        # TODO: to avoid this, marker genes table should have 'tax_id' column
        self.map_group_to_taxid = {'Human': '9606', 'Mouse': '10090'}

        filter_line_edit = gui.lineEdit(self.mainArea, self,
                                        "filter_text")  # type: QLineEdit
        filter_line_edit.setPlaceholderText("Filter...")
        filter_line_edit.textEdited.connect(self.call_filter_timer)

        self.view = view = QTreeView(
            rootIsDecorated=False,
            uniformRowHeights=True,
            selectionMode=QTreeView.ExtendedSelection,
            sortingEnabled=True,
        )

        view.viewport().setMouseTracking(True)
        self.mainArea.layout().addWidget(view)

        self._load_data()
        if self.header_state:
            view.header().restoreState(self.header_state)

    @property
    def available_db_sources(self) -> dict:
        """Mapping from database title to its serverfiles info entry."""
        return self._available_db_sources

    @available_db_sources.setter
    def available_db_sources(self, value: dict):
        """Store the sources and repopulate the database combo box.

        The previously selected source is kept when it is still available;
        otherwise the current index is clamped to the valid range.
        """
        self._available_db_sources = value

        items = list(value.keys())
        try:
            idx = items.index(self.selected_db_source)
        except ValueError:
            idx = -1

        self.db_source_cb.clear()
        self.db_source_cb.addItems(items)

        if idx != -1:
            self.db_source_index = idx
            self.selected_db_source = items[idx]
        elif items:
            self.db_source_index = min(max(self.db_source_index, 0),
                                       len(items) - 1)

        self.set_db_source_index(self.db_source_index)

    @property
    def data(self) -> Table:
        """The full marker gene table of the selected database."""
        return self._data

    @data.setter
    def data(self, value: Table):
        """Set the source data.

        The data is then filtered on the first meta column (group). The group
        combo box is repopulated with the group values and the previously
        selected group is kept when it is still present.

        Raises:
            TypeError: if the first meta column is neither string nor discrete.
        """
        self._data = value
        domain = value.domain

        if not domain.metas:
            return

        group = domain.metas[0]
        groupcol, _ = value.get_column_view(group)

        if group.is_string:
            # Sorted so the combo box order is the same on every run.
            group_values = sorted(set(groupcol), key=str)
        elif group.is_discrete:
            group_values = list(group.values)
        else:
            raise TypeError("Invalid column type")

        try:
            idx = group_values.index(self.selected_group)
        except ValueError:
            idx = -1

        self.group_cb.clear()
        self.group_cb.addItems(group_values)

        if idx != -1:
            self.group_index = idx
            self.selected_group = group_values[idx]
        elif group_values:
            self.group_index = min(max(self.group_index, 0),
                                   len(group_values) - 1)

        self.set_group_index(self.group_index)

    def _load_data(self):
        """List the available databases and load the selected one.

        When the server cannot be reached, the locally cached file listing is
        used and a warning is shown. A ``ConnectionError`` raised while
        resolving the local path of the selected file propagates to the
        caller.
        """
        self.Warning.using_local_files.clear()

        found_sources = {}
        try:
            found_sources.update(
                serverfiles.ServerFiles().allinfo(serverfiles_domain))
        except requests.exceptions.ConnectionError:
            found_sources.update(serverfiles.allinfo(serverfiles_domain))
            self.Warning.using_local_files()

        # Keyed by the part of the title after the last ': '.
        self.available_db_sources = {
            item.get('title').split(': ')[-1]: item
            for item in found_sources.values()
        }

        if not self.available_db_sources:
            return

        file_name = self.available_db_sources[
            self.selected_db_source]['filename']

        try:
            serverfiles.update(serverfiles_domain, file_name)
        except requests.exceptions.ConnectionError:
            # try to update file. Ignore network errors.
            pass

        file_path = serverfiles.localpath_download(serverfiles_domain,
                                                   file_name)

        data = Table(file_path)
        # enforce order
        old_domain = data.domain
        new_domain = Domain(
            [],
            metas=[
                old_domain['Organism'],
                old_domain['Name'],
                old_domain['Entrez ID'],
                old_domain['Cell Type'],
                old_domain['Function'],
                old_domain['Reference'],
                old_domain['URL'],
            ],
        )
        data = data.transform(new_domain)
        self.data = data

    def set_selection(self):
        """Select the rows in the view that correspond to ``selected_genes``."""
        selected = self.selected_rows()
        if not selected:
            return

        model = self.view.model()
        if model.rowCount() <= selected[-1]:
            return

        last_column = self.view.header().count() - 1
        selection = QItemSelection()

        for row_index in selected:
            selection.append(
                QItemSelectionRange(model.index(row_index, 0),
                                    model.index(row_index, last_column)))

        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)

    def handle_source_changed(self, source_index):
        """Switch to the database at ``source_index`` and reload the data."""
        self.set_db_source_index(source_index)
        self._load_data()

    def set_db_source_index(self, source_index):
        """Store the selected database index and its combo box text."""
        self.closeContext()
        self.db_source_index = source_index
        self.selected_db_source = self.db_source_cb.itemText(source_index)

    def set_group_index(self, group_index):
        """Store the selected group and rebuild the view for it."""
        self.closeContext()
        self.group_index = group_index
        self.selected_group = self.group_cb.itemText(group_index)
        self._setup()

    def call_filter_timer(self, search_string):
        """Restart the debounce timer; filtering runs 700 ms after the last edit."""
        self._timer.stop()
        if search_string != self.filter_text:
            self.filter_text = search_string
        self._timer.start(700)

    def _filter_table(self):
        """Apply the filter text to the model, restore selection and commit."""
        model = self.view.model()
        if not isinstance(model, SearchableTableModel):
            # The timer can fire before any data has been loaded into the view.
            return

        model.update_model(str(self.filter_text))
        self.set_selection()
        self.update_data_info()
        self.update_model()
        self.commit()

    def update_data_info(self):
        """Update the input/output summaries with shown and selected counts."""
        model = self.view.model()
        self.info.set_input_summary(
            f"Shown : {len(model.source)}/{model.data_length()}")
        self.info.set_output_summary(
            f"Selected: {len(self.selected_genes)}")

    def _setup(self):
        """Create the view model for the selected group and open its context."""
        self.closeContext()
        data = self.data
        group = data.domain.metas[0]
        gvec = data.get_column_view(group)[0]

        if group.is_string:
            mask = gvec == self.group_cb.itemData(self.group_index,
                                                  Qt.DisplayRole)
        else:
            mask = gvec == self.group_index

        data = data[mask]
        # The group column itself is not shown in the view.
        rest = data[:, data.domain.metas[1:]]
        model = SearchableTableModel(rest, parent=self)
        ref_col = rest.domain.metas.index(
            rest.domain[HeaderLabels[HeaderIndex.REFERENCE]])
        self.view.setItemDelegateForColumn(
            ref_col, gui.LinkStyledItemDelegate(self.view))

        self.view.setModel(model)
        self.view.selectionModel().selectionChanged.connect(
            self._on_selection_changed)

        self.openContext(self.selected_group)
        self.call_filter_timer(self.filter_text)
        self.view.hideColumn(HeaderIndex.URL)

    def _on_selection_changed(self, *args):
        """Recompute the output after the selection in the view has changed."""
        self.update_model()
        self.commit()
        self.update_data_info()

    def selected_rows(self):
        """Return row indices in the view model for the stored selected genes.

        Rows are matched on the (gene, cell type, reference) triple.
        """
        if not self.selected_genes:
            return []

        model = self.view.model()
        return [
            row_index for row_index in range(model.rowCount()) if (
                model.index(row_index, HeaderIndex.GENE).data(),
                model.index(row_index, HeaderIndex.CELL_TYPE).data(),
                model.index(row_index, HeaderIndex.REFERENCE).data(),
            ) in self.selected_genes
        ]

    def commit(self):
        """Send the current output table."""
        self.Outputs.genes.send(self.output)

    def update_model(self):
        """Rebuild ``self.output`` and ``self.selected_genes`` from the view.

        The output holds the selected rows, or all rows of ``model.source``
        when nothing (or everything) is selected.
        """
        model = self.view.model()
        if not isinstance(model, SearchableTableModel):
            # No data has been loaded into the view yet.
            return

        selection_model = self.view.selectionModel()
        rows = [mi.row() for mi in selection_model.selectedRows(0)]

        if rows and len(rows) != len(model.source):
            rows = model.mapToSourceRows(rows)
            output = model.source[rows]
        else:
            output = model.source

        gene_indexes = selection_model.selectedRows(HeaderIndex.GENE)
        cell_type_indexes = selection_model.selectedRows(
            HeaderIndex.CELL_TYPE)
        reference_indexes = selection_model.selectedRows(
            HeaderIndex.REFERENCE)

        self.selected_genes = {
            (gene.data(), cell.data(), reference.data())
            for gene, cell, reference in zip(
                gene_indexes, cell_type_indexes, reference_indexes)
        }

        # always false for marker genes data tables in single cell
        output.attributes[GENE_AS_ATTRIBUTE_NAME] = False
        # set taxonomy id in data.attributes
        output.attributes[TAX_ID] = self.map_group_to_taxid.get(
            self.selected_group, '')
        # set column id flag
        output.attributes[GENE_ID_COLUMN] = HeaderLabels[HeaderIndex.GENE]
        output.name = 'Marker Genes'

        self.output = output

    def closeEvent(self, event):
        """Save the header state of the view before the widget closes."""
        self.header_state = bytes(self.view.header().saveState())
        super().closeEvent(event)

    def sizeHint(self):
        """Return the default size hint expanded to at least 750x500."""
        return super().sizeHint().expandedTo(QSize(750, 500))