Beispiel #1
0
def dicty_mutant_gene_sets(tax_id: str):
    """Write Dictybase mutant-phenotype gene sets to GMT files.

    Only taxonomy id ``'44689'`` is handled; for any other ``tax_id`` the
    function does nothing.  One gene set is built per phenotype and the sets
    are written out grouped by hierarchy.

    :param tax_id: taxonomy id of the organism.
    """
    if tax_id != '44689':
        return

    phenotype_sets = []
    matcher = GeneMatcher('44689')

    for raw_name, mutants in phenotypes.phenotype_mutants().items():
        # Commas in the phenotype name are replaced by spaces before the
        # name is used as both id and display name.
        set_name = raw_name.replace(",", " ")

        # Only the first gene reported for each mutant is used.
        matcher.genes = [
            phenotypes.mutant_genes(mutant)[0] for mutant in mutants
        ]

        # Keep only the genes for which the matcher found a gene id.
        matched_ids = {
            str(gene.gene_id)
            for gene in matcher.genes
            if gene.gene_id is not None
        }

        phenotype_sets.append(
            GeneSet(gs_id=set_name,
                    name=set_name,
                    genes=matched_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism=tax_id,
                    link=''))

    for group in GeneSets(phenotype_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}')
 def test_synonym_multiple_matches(self):
     """An input with no unique match must leave ``gene_id`` unset."""
     matcher = GeneMatcher('9606')
     matcher.genes = ['HB1']
     first = matcher.genes[0]

     self.assertEqual(first.input_identifier, 'HB1')
     # The matcher is expected not to settle on a single gene here.
     self.assertIsNone(first.gene_id)
Beispiel #3
0
def cytoband_gene_sets(tax_id: str) -> None:
    """Download the Stanford cytoband GMT file and write it as gene sets.

    Only taxonomy id ``'9606'`` is handled; for any other ``tax_id`` the
    function does nothing.  Each tab-separated line contributes one gene set:
    field 0 is the set id, field 1 the name and the remaining fields are the
    gene symbols passed to the gene matcher.

    :param tax_id: taxonomy id of the organism.
    """
    if tax_id != '9606':
        return

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        raw_lines = stream.read().splitlines()
        band_sets = []

        for raw_line in raw_lines:
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]
            matcher.genes = symbols

            # Keep only the genes for which the matcher found a gene id.
            matched_ids = {
                gene.gene_id
                for gene in matcher.genes
                if gene.gene_id is not None
            }

            band_sets.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=matched_ids if symbols else set(),
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

    for group in GeneSets(band_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}')
Beispiel #4
0
def cytoband_gene_sets(org):
    """Build cytoband gene sets from the Stanford cytoband GMT download.

    Only organism ``'9606'`` is handled; for any other ``org`` the function
    returns ``None``.  Each tab-separated line contributes one gene set:
    field 0 is the set id, field 1 the name and the remaining fields are the
    gene symbols passed to the gene matcher.

    :param org: taxonomy id of the organism.
    :return: a ``GeneSets`` collection, or ``None`` for unsupported organisms.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        raw_lines = stream.read().splitlines()
        band_sets = []

        for raw_line in raw_lines:
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]
            matcher.genes = symbols
            matcher.run_matcher()

            # Keep only matched genes, as integer NCBI ids.
            ncbi_ids = [
                int(gene.ncbi_id)
                for gene in matcher.genes
                if gene.ncbi_id is not None
            ]

            band_sets.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=ncbi_ids if symbols else [],
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

        return GeneSets(band_sets)
Beispiel #5
0
def reactome_gene_sets(org):
    """Build human pathway gene sets from the Reactome pathways archive.

    Only organism ``'9606'`` is handled; for any other ``org`` the function
    returns ``None``.  The zipped GMT file is downloaded into memory and each
    tab-separated line becomes one gene set whose id and name are both taken
    from field 0; fields from index 2 on are the gene symbols.

    :param org: taxonomy id of the organism.
    :return: a ``GeneSets`` collection, or ``None`` for unsupported organisms.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    # Open the archive member in a ``with`` block as well, so that its handle
    # is closed explicitly instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip, \
            myzip.open(REACTOME_FILE_NAME) as f:
        content = f.read().decode().splitlines()

    genesets = []

    for path in content:
        # Split each line once; an empty slice already yields [].
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        # Keep only matched genes, as integer NCBI ids.
        genes = [
            int(gene.ncbi_id)
            for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
Beispiel #6
0
    def send_to_output(self, result):
        """Convert a finished download into a table and send it downstream.

        :param result: a ``(etc_json, table_name)`` pair.

        The table is annotated with Entrez ids (either as an extra column
        matched from the ``'Gene'`` column or on the attributes themselves),
        tagged with the organism and gene-location attributes, and sent to
        the ``etc_data`` output.
        """
        self.progress_bar.finish()
        self.setStatusMessage('')

        etc_json, table_name = result
        genes_in_columns = bool(self.gene_as_attr_name)
        organism = str(self.organism)

        # Build the table and give it its name.
        data = etc_to_table(etc_json, genes_in_columns)
        data.name = table_name

        matcher = GeneMatcher(organism)

        if genes_in_columns:
            # Genes are the attribute (column) names.
            matcher.match_table_attributes(data)
            data.attributes[GENE_ID_ATTRIBUTE] = ENTREZ_ID
        else:
            # Genes are stored in a column named 'Gene', when present.
            if 'Gene' in data.domain:
                data = matcher.match_table_column(
                    data, 'Gene', StringVariable(ENTREZ_ID))
            data.attributes[GENE_ID_COLUMN] = ENTREZ_ID

        # Table-level annotations.
        data.attributes[TAX_ID] = organism
        data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_in_columns

        self.set_cached_indicator()
        self.Outputs.etc_data.send(data)
Beispiel #7
0
    def __init__(self, organism, ontology=None, progress_callback=None):
        """Load the annotation file for ``organism`` and parse it.

        :param organism: taxonomy id; used for the gene matcher and to format
            the annotation file name.
        :param ontology: optional ontology object stored on the instance.
        :param progress_callback: forwarded to the server-files download.
        :raises taxonomy.UnknownSpeciesIdentifier: when the annotation file
            for ``organism`` cannot be found.
        """
        self.ontology = ontology

        #: A dictionary mapping a gene (gene_id) to a list of all annotations of that gene.
        self.gene_annotations = defaultdict(list)

        #: A dictionary mapping a GO term id to a list of annotations that are directly annotated to that term
        self.term_anotations = defaultdict(list)

        self.all_annotations = defaultdict(list)

        self._gene_names = None
        self._gene_names_dict = None
        self.gene_matcher = GeneMatcher(organism)

        #: A list of all :class:`AnnotationRecords` instances.
        self.annotations = []
        self.header = ''
        self.taxid = organism

        # NOTE(review): if `ontology` is a property backed by `_ontology`,
        # this assignment discards the value set at the top -- confirm.
        self._ontology = None

        try:
            path = serverfiles.localpath_download(
                DOMAIN,
                FILENAME_ANNOTATION.format(organism),
                progress_callback=progress_callback)
        except FileNotFoundError as err:
            # Chain explicitly so the original cause stays attached.
            raise taxonomy.UnknownSpeciesIdentifier(organism) from err

        self._parse_file(path)
Beispiel #8
0
def omim_gene_sets(org):
    """Build gene sets from OMIM (Online Mendelian Inheritance in Man) diseases.

    Only organism ``'9606'`` is handled; for any other ``org`` the function
    returns ``None``.  One gene set is created per disease.

    :param org: taxonomy id of the organism.
    :return: a ``GeneSets`` collection, or ``None`` for unsupported organisms.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    disease_sets = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # Keep only matched genes, as integer NCBI ids.
        ncbi_ids = [
            int(gene.ncbi_id)
            for gene in matcher.genes
            if gene.ncbi_id is not None
        ]

        # A link is only produced for diseases that carry an id.
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        disease_sets.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=disease_link))

    return GeneSets(disease_sets)
Beispiel #9
0
def dicty_mutant_gene_sets(org):
    """Build dicty mutant-phenotype gene sets from Dictybase.

    Only organism ``'352472'`` is handled; for any other ``org`` the function
    returns ``None``.  One gene set is created per phenotype, containing the
    integer NCBI ids of the matched genes.

    :param org: taxonomy id of the organism.
    :return: a ``GeneSets`` collection, or ``None`` for unsupported organisms.
    """
    if org != '352472':
        return None

    # Local import: the rest of the file's import block is not visible here.
    import logging
    log = logging.getLogger(__name__)

    gene_sets = []
    gene_matcher = GeneMatcher('352472')

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        # Only the first gene reported for each mutant is used.
        gene_symbols = [
            dicty.phenotypes.mutant_genes(mutant)[0] for mutant in mutants
        ]
        gene_matcher.genes = gene_symbols
        gene_matcher.run_matcher()

        genes = [
            int(gene.ncbi_id)
            for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        if len(gene_symbols) != len(genes):
            # Replaces a bare debug print of the two counts with a record
            # that says what the numbers mean.
            log.warning("Phenotype %r: matched %d of %d gene symbols",
                        phenotype, len(genes), len(gene_symbols))

        gene_sets.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=genes,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(gene_sets)
    def test_taxonomy_change(self):
        """Changing ``tax_id`` must switch the gene database file as well."""

        def db_file_name(matcher):
            # Last path component of the matcher's database path.
            return basename(normpath(matcher.gene_db_path))

        matcher = GeneMatcher('4932')
        self.assertEqual(matcher.tax_id, '4932')
        self.assertEqual(db_file_name(matcher), '4932.sqlite')

        matcher.tax_id = '9606'
        self.assertEqual(matcher.tax_id, '9606')
        self.assertEqual(db_file_name(matcher), '9606.sqlite')
    def test_symbol_match_scenario(self):
        """A gene symbol given as input resolves to its symbol and gene id."""
        matcher = GeneMatcher('9606')
        matcher.genes = ['SCN5A']
        matched = matcher.genes[0]

        expected = {'input_identifier': 'SCN5A',
                    'symbol': 'SCN5A',
                    'gene_id': '6331'}
        self.assertEqual(matched.input_identifier, expected['input_identifier'])
        self.assertEqual(matched.symbol, expected['symbol'])
        self.assertEqual(matched.gene_id, expected['gene_id'])
    def test_match_table_attributes(self):
        """Every attribute of the transposed table gets an Entrez id entry."""
        matcher = GeneMatcher('4932')

        # Transpose so that the values of the 'gene' column become features.
        table = Table.transpose(Table('brown-selected.tab'),
                                feature_names_column='gene')
        matcher.match_table_attributes(table)

        for variable in table.domain.attributes:
            self.assertIn(ENTREZ_ID, variable.attributes)
    def test_different_input_identifier_types(self):
        """Inputs of several identifier kinds must all resolve to a gene."""
        matcher = GeneMatcher('9606')
        matcher.genes = ['CD4', '614535', 'HB-1Y', 'ENSG00000205426']

        populated_fields = ('description', 'tax_id', 'species', 'gene_id')
        for matched in matcher.genes:
            for field in populated_fields:
                self.assertIsNotNone(getattr(matched, field))
Beispiel #14
0
    def find_homologs(
            self, genes: List[Union[str, Gene]]) -> List[Optional[Gene]]:
        """Look up homologs of ``genes`` in the target taxonomy.

        The genes are matched against ``self.source_tax``; each matched gene
        is asked for its homolog in ``self.target_tax`` and the results are
        passed through ``load_gene_summary``.

        :param genes: gene identifiers or ``Gene`` objects.
        :return: the value returned by ``load_gene_summary``.
        """
        matcher = GeneMatcher(self.source_tax)
        matcher.genes = genes

        homolog_ids = [
            matched.homolog_gene(taxonomy_id=self.target_tax)
            for matched in matcher.genes
        ]
        return load_gene_summary(self.target_tax, homolog_ids)
Beispiel #15
0
    def _update_gene_matcher(self):
        """Refresh the input genes and push them into the gene matcher.

        A case-insensitive matcher is created on first use; afterwards the
        existing one is reused and only its genes and organism are updated.
        """
        self.gene_names_from_table()

        no_genes = not self.input_genes
        if no_genes:
            self._update_info_box()

        matcher = self.gene_matcher
        if not matcher:
            matcher = GeneMatcher(self.get_selected_organism(),
                                  case_insensitive=True)
            self.gene_matcher = matcher

        matcher.genes = self.input_genes
        matcher.organism = self.get_selected_organism()
    def test_homologs(self):
        """Gene '920' exposes homology data, including a '10090' homolog."""
        matcher = GeneMatcher('9606')
        matcher.genes = ['920']
        gene = matcher.genes[0]

        # Homology information is present and non-empty.
        self.assertIsNotNone(gene.homologs)
        self.assertGreater(len(gene.homologs), 0)
        self.assertIn('10090', gene.homologs)
        self.assertEqual(gene.homology_group_id, '513')

        # Lookup by taxonomy id; unknown taxonomies yield None.
        self.assertEqual(gene.homolog_gene('10090'), '12504')
        self.assertIsNone(gene.homolog_gene('Unknown_taxonomy'))
Beispiel #17
0
def matchDDBids(genesDDB):
    """Match gene identifiers with ``GeneMatcher`` for organism 44689.

    :param genesDDB: gene identifiers to match.
    :return: dict mapping each input identifier to a
        ``(symbol, entrez, description)`` tuple, each value passed through
        ``parseNoneStr``.
    """
    matcher = GeneMatcher(44689)
    matcher.genes = genesDDB

    return {
        gene.input_identifier: (parseNoneStr(gene.symbol),
                                parseNoneStr(gene.gene_id),
                                parseNoneStr(gene.description))
        for gene in matcher.genes
    }
Beispiel #18
0
def run_gene_matcher(gene_matcher: GeneMatcher, state: TaskState):
    """Run gene matching while reporting progress to ``state``.

    A callback is installed on the matcher; every time it is invoked the
    progress value is advanced by one gene's share of 100.

    :param gene_matcher: matcher whose ``genes`` have already been set.
    :param state: task state receiving status and progress updates.
    """
    total = len(gene_matcher.genes)
    finished = 0

    def report_progress():
        nonlocal finished
        finished += 1
        state.set_progress_value(100 * (finished / total))

    state.set_status("Working ...")
    gene_matcher._progress_callback = report_progress
    gene_matcher.match_genes()
Beispiel #19
0
def reactome_gene_sets(tax_id: str) -> None:
    """Download Reactome pathways and write them as human gene sets.

    Only taxonomy id ``'9606'`` is handled; for any other ``tax_id`` the
    function does nothing.  Each tab-separated line of the GMT file becomes
    one gene set: field 0 is the pathway name, field 1 the pathway id and
    the remaining fields are the gene symbols.  Commas in the name and id
    are replaced by spaces.

    :param tax_id: taxonomy id of the organism.
    """
    if tax_id != '9606':
        return

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    # Open the archive member in a ``with`` block as well, so that its handle
    # is closed explicitly instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip, myzip.open(file_name) as f:
        content = f.read().decode().splitlines()

    genesets = []

    for path in content:
        # Split each line once; an empty slice already yields [].
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]

        # Keep only the genes for which the matcher found a gene id.
        genes = {
            str(gene.gene_id)
            for gene in gene_matcher.genes
            if gene.gene_id is not None
        }

        pathway = fields[0].replace(',', ' ')
        pathway_id = fields[1].replace(',', ' ')

        genesets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway,
                    genes=genes,
                    hierarchy=('Reactome', 'pathways'),
                    organism='9606',
                    link=detail_link.format(pathway_id)))

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
Beispiel #20
0
def name_genes_entrez(gene_names: list, key_entrez: bool, organism: int = ORGANISM) -> dict:
    """
    Pair each gene name with its Entrez ID.

    :param gene_names: Gene names (eg. from dictyBase)
    :param key_entrez: True: Entrez IDs as keys and names as values, False: vice versa
    :param organism: organism ID
    :return: Dict of gene names and matching Entrez IDs; genes without an
        Entrez ID are left out
    """
    matcher = GeneMatcher(organism)
    matcher.genes = gene_names

    # (input name, Entrez ID) for every gene that was matched.
    matched_pairs = [
        (name, entrez)
        for name, entrez in ((gene.input_identifier, gene.gene_id)
                             for gene in matcher.genes)
        if entrez is not None
    ]

    if key_entrez:
        return {entrez: name for name, entrez in matched_pairs}
    return {name: entrez for name, entrez in matched_pairs}
    def send_to_output(self, result):
        """Convert a finished download into a table and send it downstream.

        :param result: a ``(etc_json, table_name)`` pair.

        When genes are stored in rows, the ``'Gene'`` column (if present) is
        matched and a meta column of Entrez ids (``'?'`` for unmatched genes)
        is concatenated to the table; otherwise the attributes themselves are
        matched.  The table is then tagged with organism and gene-location
        attributes and sent to the ``etc_data`` output.
        """
        self.progress_bar.finish()
        self.setStatusMessage('')

        etc_json, table_name = result
        genes_in_columns = bool(self.gene_as_attr_name)
        organism = str(self.organism)

        # Build the table and give it its name.
        data = etc_to_table(etc_json, genes_in_columns)
        data.name = table_name

        matcher = GeneMatcher(organism)

        if genes_in_columns:
            matcher.match_table_attributes(data)
            data.attributes[GENE_ID_ATTRIBUTE] = ENTREZ_ID
        else:
            if 'Gene' in data.domain:
                gene_variable = data.domain['Gene']
                matcher.genes = data.get_column_view(gene_variable)[0]

                # One-column table of Entrez ids, '?' where no id was found.
                id_domain = Domain([], metas=[StringVariable(ENTREZ_ID)])
                id_rows = [[str(gene.gene_id) if gene.gene_id else '?']
                           for gene in matcher.genes]
                id_table = Table(id_domain, id_rows)
                data = Table.concatenate([data, id_table])

            data.attributes[GENE_ID_COLUMN] = ENTREZ_ID

        # Table-level annotations.
        data.attributes[TAX_ID] = organism
        data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_in_columns

        self.set_cached_indicator()
        self.Outputs.etc_data.send(data)
    def Update(self):
        """
        Update (recompute enriched pathways) the widget state.

        Collects query (and optionally reference) gene names from the input
        data, maps the query genes to NCBI ids with ``GeneMatcher`` and
        submits a background task that computes pathway enrichment.  Returns
        immediately without doing anything when there is no input data.
        """
        if not self.data:
            return

        # Clear any previous error / information message with id 0.
        self.error(0)
        self.information(0)

        # XXX: Check data in setData, do not even allow this to be executed if
        # data has no genes
        try:
            genes = self.GeneNamesFromData(self.data)
        except ValueError:
            self.error(0, "Cannot extract gene names from input.")
            genes = []

        # When gene names come from a column and contain commas, each entry
        # is split into several gene names and the results are flattened.
        if not self.useAttrNames and any("," in gene for gene in genes):
            genes = reduce(add, (split_and_strip(gene, ",")
                                 for gene in genes),
                           [])
            self.information(0,
                             "Separators detected in input gene names. "
                             "Assuming multiple genes per instance.")

        self.queryGenes = genes

        # Same extraction and splitting for the optional reference set.
        self.information(1)
        reference = None
        if self.useReference and self.refData:
            reference = self.GeneNamesFromData(self.refData)
            if not self.useAttrNames \
                    and any("," in gene for gene in reference):
                reference = reduce(add, (split_and_strip(gene, ",")
                                         for gene in reference),
                                   [])
                self.information(1,
                                 "Separators detected in reference gene "
                                 "names. Assuming multiple genes per "
                                 "instance.")

        org_code = self.SelectedOrganismCode()

        # Map the query gene names to NCBI ids (stored as strings).
        from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
        gm = GeneMatcher(kegg.to_taxid(org_code))
        gm.genes = genes
        gm.run_matcher()
        mapped_genes = {gene: str(ncbi_id) for gene, ncbi_id in gm.map_input_to_ncbi().items()}

        # Worker executed by the background task submitted below.  `genes` is
        # the name -> NCBI id mapping built above; it returns
        # (pathways, org, unique_genes, unique_ref_genes).
        def run_enrichment(org_code, genes, reference=None, progress=None):
            org = kegg.KEGGOrganism(org_code)
            # Without an explicit reference, all NCBI ids of the organism
            # are used.
            if reference is None:
                reference = org.get_ncbi_ids()

            # This is here just to keep widget working without any major changes.
            # map not needed, geneMatcher will not work on widget level.
            unique_genes = genes
            unique_ref_genes = dict([(gene, gene) for gene in set(reference)])

            # NOTE(review): `taxid` is computed here but not used again in
            # this function -- confirm whether it can be dropped.
            taxid = kegg.to_taxid(org.org_code)
            # Map the taxid back to standard 'common' taxids
            # (as used by 'geneset') if applicable
            r_tax_map = dict((v, k) for k, v in
                             kegg.KEGGGenome.TAXID_MAP.items())
            if taxid in r_tax_map:
                taxid = r_tax_map[taxid]

            # We use the kegg pathway gene sets provided by 'geneset' for
            # the enrichment calculation.

            kegg_api = kegg.api.CachedKeggApi()
            linkmap = kegg_api.link(org.org_code, "pathway")
            converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
            # Upper-cased KEGG gene ids are mapped to the part of the NCBI
            # identifier after the last ':'.
            kegg_sets = relation_list_to_multimap(linkmap, dict((gene.upper(), ncbi.split(':')[-1])
                                                                for ncbi, gene in converted_ids))

            kegg_sets = geneset.GeneSets(input=kegg_sets)

            pathways = pathway_enrichment(
                kegg_sets, unique_genes.values(),
                unique_ref_genes.keys(),
                callback=progress
            )
            # Ensure that pathway entries are pre-cached for later use in the
            # list/tree view
            kegg_pathways = kegg.KEGGPathways()
            kegg_pathways.pre_cache(
                pathways.keys(), progress_callback=progress
            )

            return pathways, org, unique_genes, unique_ref_genes

        # Disable the widget while the task runs.
        self.progressBarInit()
        self.setEnabled(False)
        self.infoLabel.setText("Retrieving...\n")

        # Progress is reported by invoking this widget's "setProgress".
        progress = concurrent.methodinvoke(self, "setProgress", (float,))

        self._enrichTask = concurrent.Task(
            function=lambda:
                run_enrichment(org_code, mapped_genes, reference, progress)
        )
        self._enrichTask.finished.connect(self._onEnrichTaskFinished)
        self._executor.submit(self._enrichTask)
Beispiel #23
0
    def _update_gene_matcher(self):
        """Rebuild the gene matcher from the current input genes.

        A fresh ``GeneMatcher`` is created with ``auto_start=False`` for the
        selected organism and given the genes extracted from the input table.
        """
        self.gene_names_from_table()

        organism = self.get_selected_organism()
        self.gene_matcher = matcher = GeneMatcher(organism, auto_start=False)
        matcher.genes = self.input_genes
Beispiel #24
0
class OWGenes(OWWidget, ConcurrentWidgetMixin):
    # Widget metadata.
    name = "Genes"
    description = "Tool for working with genes"
    icon = "../widgets/icons/OWGeneInfo.svg"
    priority = 40
    want_main_area = True

    # Persisted widget settings.  `selected_organism` is used as an index
    # into `self.organisms` (see `get_selected_organism`).
    selected_organism: int = Setting(11)
    search_pattern: str = Setting('')
    exclude_unmatched = Setting(True)
    replace_id_with_symbol = Setting(True)
    auto_commit = Setting(True)

    # Settings stored per input domain.
    settingsHandler = DomainContextHandler()
    selected_gene_col = ContextSetting(None)
    use_attr_names = ContextSetting(True)

    # Qualified name of the widget this one replaces.
    replaces = [
        'orangecontrib.bioinformatics.widgets.OWGeneNameMatcher.OWGeneNameMatcher'
    ]

    # Input signals.
    class Inputs:
        data_table = Input("Data", Table)

    # Output signals.
    class Outputs:
        data_table = Output("Data", Table)
        gene_matcher_results = Output("Genes", Table)

    # No widget-specific information messages are defined.
    class Information(OWWidget.Information):
        pass

    def sizeHint(self):
        """Return the suggested widget size (1280 x 960)."""
        width, height = 1280, 960
        return QSize(width, height)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)

        # ATTRIBUTES #
        self.target_database = ENTREZ_ID

        # input data
        self.input_data = None
        self.input_genes = None
        self.tax_id = None
        self.column_candidates = []

        # input options
        self.organisms = []

        # gene matcher
        self.gene_matcher = None

        # progress bar
        self.progress_bar = None

        # Single-shot timer that triggers `_apply_filter`; it is (re)started
        # by `handle_filter_callback`.
        self._timer = QTimer()
        self._timer.timeout.connect(self._apply_filter)
        self._timer.setSingleShot(True)

        # GUI SECTION #

        # Control area
        self.info_box = widgetLabel(
            widgetBox(self.controlArea, "Info", addSpace=True),
            'No data on input.\n')

        organism_box = vBox(self.controlArea, 'Organism')
        self.organism_select_combobox = comboBox(
            organism_box,
            self,
            'selected_organism',
            callback=self.on_input_option_change)

        # Fill the combo box, then restore the stored selection.
        self.get_available_organisms()
        self.organism_select_combobox.setCurrentIndex(self.selected_organism)

        box = widgetBox(self.controlArea, 'Gene IDs in the input data')
        # Only string and discrete variables are offered as gene columns.
        self.gene_columns_model = itemmodels.DomainModel(
            valid_types=(StringVariable, DiscreteVariable))
        self.gene_column_combobox = comboBox(
            box,
            self,
            'selected_gene_col',
            label='Stored in data column',
            model=self.gene_columns_model,
            sendSelectedValue=True,
            callback=self.on_input_option_change,
        )

        self.attr_names_checkbox = checkBox(
            box,
            self,
            'use_attr_names',
            'Stored as feature (column) names',
            disables=[(-1, self.gene_column_combobox)],
            callback=self.on_input_option_change,
        )

        # The column selector is disabled while genes are read from the
        # feature names.
        self.gene_column_combobox.setDisabled(bool(self.use_attr_names))

        output_box = vBox(self.controlArea, 'Output')

        # separator(output_box)
        # output_box.layout().addWidget(horizontal_line())
        # separator(output_box)
        self.exclude_radio = checkBox(output_box,
                                      self,
                                      'exclude_unmatched',
                                      'Exclude unmatched genes',
                                      callback=self.commit)

        self.replace_radio = checkBox(output_box,
                                      self,
                                      'replace_id_with_symbol',
                                      'Replace feature IDs with gene names',
                                      callback=self.commit)

        auto_commit(self.controlArea,
                    self,
                    "auto_commit",
                    "&Commit",
                    box=False)

        rubber(self.controlArea)

        # Main area
        # Typing in the filter box calls `handle_filter_callback`.
        self.filter = lineEdit(self.mainArea,
                               self,
                               'search_pattern',
                               'Filter:',
                               callbackOnType=True,
                               callback=self.handle_filter_callback)
        # rubber(self.radio_group)
        self.mainArea.layout().addWidget(self.filter)

        # set splitter
        self.splitter = QSplitter()
        self.splitter.setOrientation(Qt.Vertical)

        # Table view for the gene info model; the model itself is attached
        # to the view in `on_done`.
        self.table_model = GeneInfoModel()
        self.table_view = QTableView()
        self.table_view.setAlternatingRowColors(True)
        self.table_view.viewport().setMouseTracking(True)
        self.table_view.setSortingEnabled(True)
        self.table_view.setShowGrid(False)
        self.table_view.verticalHeader().hide()
        # self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)

        # Second, non-selectable view backed by the unknown-genes model.
        self.unknown_model = UnknownGeneInfoModel()

        self.unknown_view = QTableView()
        self.unknown_view.setModel(self.unknown_model)
        self.unknown_view.verticalHeader().hide()
        self.unknown_view.setShowGrid(False)
        self.unknown_view.setSelectionMode(QAbstractItemView.NoSelection)
        self.unknown_view.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)

        self.splitter.addWidget(self.table_view)
        self.splitter.addWidget(self.unknown_view)

        # Stretch factors 90 / 10 for the two views.
        self.splitter.setStretchFactor(0, 90)
        self.splitter.setStretchFactor(1, 10)

        self.mainArea.layout().addWidget(self.splitter)

    def handle_filter_callback(self):
        """Restart the filter timer, passing 500 as its interval."""
        filter_timer = self._timer
        filter_timer.stop()
        filter_timer.start(500)

    def _apply_filter(self):
        """Apply the current search pattern to the gene table and commit."""
        # Nothing to filter until the model has been populated.
        if self.table_model.table is None:
            return

        pattern = str(self.search_pattern)
        self.table_model.update_model(filter_pattern=pattern)
        self.commit()

    def __reset_widget_state(self):
        """Detach and clear both gene models and refresh the info box."""
        view = self.table_view
        view.clearSpans()
        view.setModel(None)

        for model in (self.table_model, self.unknown_model):
            model.clear()

        self._update_info_box()

    def _update_info_box(self):
        """Show gene counts in the info box, or a 'no data' message."""
        if not (self.input_genes and self.gene_matcher):
            self.info_box.setText('No data on input.')
            return

        total = len(self.gene_matcher.genes)
        matched = len(self.gene_matcher.get_known_genes())

        template = ('{} genes in input data\n'
                    '{} genes match Entrez database\n'
                    '{} genes with match conflicts\n')
        # The third number is simply total minus matched.
        self.info_box.setText(template.format(total, matched, total - matched))

    def on_done(self, _):
        """Populate both gene views once the matching task has finished.

        The task result (second argument) is ignored; everything is read
        from ``self.gene_matcher``.
        """
        # update info box
        self._update_info_box()

        # set output options
        self.toggle_radio_options()

        # set known genes
        self.table_model.initialize(self.gene_matcher.genes)
        self.table_view.setModel(self.table_model)
        # Commit whenever the row selection changes.
        self.table_view.selectionModel().selectionChanged.connect(self.commit)
        self.table_view.setSelectionBehavior(QAbstractItemView.SelectRows)

        # The Entrez id column is drawn with the link-styled delegate.
        self.table_view.setItemDelegateForColumn(
            self.table_model.entrez_column_index,
            LinkStyledItemDelegate(self.table_view))
        # Derive the row height from the style's size for a 20x20 item.
        v_header = self.table_view.verticalHeader()
        option = self.table_view.viewOptions()
        size = self.table_view.style().sizeFromContents(
            QStyle.CT_ItemViewItem, option, QSize(20, 20), self.table_view)

        v_header.setDefaultSectionSize(size.height() + 2)
        v_header.setMinimumSectionSize(5)
        self.table_view.horizontalHeader().setStretchLastSection(True)

        # set unknown genes
        self.unknown_model.initialize(self.gene_matcher.genes)
        self.unknown_view.verticalHeader().setStretchLastSection(True)

        self._apply_filter()

    def get_available_organisms(self):
        """Fill the organism combo box with common taxonomies sorted by name.

        ``self.organisms`` receives the taxonomy ids in the same order as the
        names added to the combo box.
        """
        named_taxa = [(tax_id, taxonomy.name(tax_id))
                      for tax_id in taxonomy.common_taxids()]
        named_taxa.sort(key=lambda pair: pair[1])

        self.organisms = [tax_id for tax_id, _ in named_taxa]
        self.organism_select_combobox.addItems(
            [label for _, label in named_taxa])

    def gene_names_from_table(self):
        """ Extract gene names from the input table into `self.input_genes`.

        Names come either from the attribute (column) names or from the
        selected gene column; `self.input_genes` is left empty when there is
        no input data.  Nothing is returned.
        """
        self.input_genes = []
        if not self.input_data:
            return

        if self.use_attr_names:
            self.input_genes = [
                str(variable.name).strip()
                for variable in self.input_data.domain.attributes
            ]
            return

        # Pick a gene column automatically if none is selected yet.
        if self.selected_gene_col is None:
            self.selected_gene_col = self.gene_column_identifier()

        column = self.selected_gene_col
        # Rows whose value in the gene column is NaN are skipped.
        self.input_genes = [
            str(row[column]) for row in self.input_data
            if not np.isnan(row[column])
        ]

    def _update_gene_matcher(self):
        """Rebuild the gene matcher from the current input genes.

        A fresh ``GeneMatcher`` is created with ``auto_start=False`` for the
        selected organism and given the genes extracted from the input table.
        """
        self.gene_names_from_table()

        organism = self.get_selected_organism()
        self.gene_matcher = matcher = GeneMatcher(organism, auto_start=False)
        matcher.genes = self.input_genes

    def get_selected_organism(self):
        """Return the entry of `self.organisms` at the selected index."""
        available = self.organisms
        return available[self.selected_organism]

    def _run(self):
        """Start the gene matching task, if a matcher has been prepared."""
        if self.gene_matcher is None:
            return
        self.start(run_gene_matcher, self.gene_matcher)

    def on_input_option_change(self):
        """Reset the views, rebuild the gene matcher and start matching.

        Used as the callback of the organism, gene-column and
        'feature names' controls, and called after new input data arrives.
        """
        self.__reset_widget_state()
        self._update_gene_matcher()
        self._run()

    def gene_column_identifier(self):
        """
        Pick the column most likely to hold gene identifiers.

        Every discrete or string variable in the gene-column model is scored
        by its number of unique values in the input data; the variable with
        the highest score is returned.
        """
        # (variable, number of unique values) for each candidate column
        scored = [
            (col, np.unique(self.input_data.get_column_view(col)[0]).size)
            for col in self.gene_columns_model
            if isinstance(col, (DiscreteVariable, StringVariable))
        ]

        # Stable ascending sort: the last entry has the most unique values.
        scored.sort(key=lambda pair: pair[1])
        return scored[-1][0]

    def find_genes_location(self):
        """ Try to locate the genes in the input data when it is first loaded.

            Rules:
                - when the domain has no attributes, look at the columns.
                - choose the most suitable column, i.e. the one with the
                  most unique values, and switch to reading genes from it.

        """
        if self.input_data.domain.attributes:
            # Feature names are available; keep the current settings.
            return

        if self.selected_gene_col is not None:
            return

        self.selected_gene_col = self.gene_column_identifier()
        self.use_attr_names = False

    @Inputs.data_table
    def handle_input(self, data):
        """Receive a new input table and restart gene matching.

        State belonging to the previous input is always cleared. When
        ``data`` is falsy nothing else happens; otherwise the organism and
        the gene location are read from the table attributes (with
        fallbacks), the context is opened and matching is triggered.
        """
        # forget everything that belonged to the previous input
        self.closeContext()
        self.input_data = None
        self.input_genes = None
        self.__reset_widget_state()
        self.gene_columns_model.set_domain(None)
        self.selected_gene_col = None

        if not data:
            return

        self.input_data = data
        table = self.input_data
        self.gene_columns_model.set_domain(table.domain)

        # tax id comes from the table; human ('9606') is used when it is missing
        self.tax_id = str(table.attributes.get(TAX_ID, '9606'))
        # gene location comes from the table; the current setting is kept
        # when the table does not specify it
        self.use_attr_names = table.attributes.get(
            GENE_AS_ATTRIBUTE_NAME, self.use_attr_names)

        organism_known = self.tax_id in self.organisms
        if organism_known and not self.selected_organism:
            self.selected_organism = self.organisms.index(self.tax_id)

        self.openContext(table.domain)
        self.find_genes_location()
        self.on_input_option_change()

    def commit(self):
        """Annotate the input table with target gene ids and send the outputs.

        The genes selected in the table view decide which rows (genes in a
        column) or columns (genes as features) of the input are kept; when
        nothing is selected, the genes returned by
        ``table_model.get_filtered_genes()`` are used instead. If none of the
        input genes has a known id, ``None`` is sent on both outputs.

        Each output is sent exactly once, at the end of the method.
        """
        selection = self.table_view.selectionModel().selectedRows(
            self.table_model.entrez_column_index)

        selected_genes = [row.data() for row in selection]
        if not selected_genes:
            selected_genes = self.table_model.get_filtered_genes()

        gene_ids = self.get_target_ids()
        # '?' marks a gene without a matched id (see get_target_ids)
        known_genes = [gid for gid in gene_ids if gid != '?']

        table = None
        gm_table = None
        if known_genes:
            selected_genes_set = set(selected_genes)

            # Genes are in rows (we have a column with genes).
            if not self.use_attr_names:

                # reuse the target column if the input already has it,
                # otherwise append it as a new meta column
                if self.target_database in self.input_data.domain:
                    gene_var = self.input_data.domain[self.target_database]
                    metas = self.input_data.domain.metas
                else:
                    gene_var = StringVariable(self.target_database)
                    metas = self.input_data.domain.metas + (gene_var, )

                domain = Domain(self.input_data.domain.attributes,
                                self.input_data.domain.class_vars, metas)

                table = self.input_data.transform(domain)
                col, _ = table.get_column_view(gene_var)
                col[:] = gene_ids

                # filter selected rows
                selected_rows = [
                    row_index for row_index, row in enumerate(table)
                    if str(row[gene_var]) in selected_genes_set
                ]

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
                table.attributes[GENE_ID_COLUMN] = self.target_database
                # an empty selection keeps the whole table
                table = table[selected_rows] if selected_rows else table

                if self.exclude_unmatched:
                    # create filter from selected column for genes
                    only_known = table_filter.FilterStringList(
                        gene_var, known_genes)
                    # apply filter to the data
                    table = table_filter.Values([only_known])(table)

                # NOTE: the table is not sent here; the single send at the
                # end of the method covers both branches.

            # Genes are in columns (genes are features).
            else:
                domain = self.input_data.domain.copy()
                table = self.input_data.transform(domain)

                for gene in self.gene_matcher.genes:
                    if gene.input_identifier in table.domain:

                        table.domain[gene.input_identifier].attributes[
                            self.target_database] = (str(gene.gene_id)
                                                     if gene.gene_id else '?')

                        if self.replace_id_with_symbol:
                            try:
                                table.domain[gene.input_identifier].name = str(
                                    gene.symbol)
                            except AttributeError:
                                # TODO: missing gene symbol, need to handle this?
                                pass

                # filter selected columns
                selected = [
                    column for column in table.domain.attributes
                    if self.target_database in column.attributes
                    and str(column.attributes[
                        self.target_database]) in selected_genes_set
                ]

                # an empty selection keeps every column
                output_attrs = selected if selected else table.domain.attributes

                if self.exclude_unmatched:
                    known_genes_set = set(known_genes)
                    # .get(): a column that was never annotated counts as
                    # unmatched instead of raising KeyError
                    output_attrs = [
                        col for col in output_attrs
                        if col.attributes.get(
                            self.target_database) in known_genes_set
                    ]

                domain = Domain(output_attrs, table.domain.class_vars,
                                table.domain.metas)

                table = table.from_table(domain, table)

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
                table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

            gm_table = self.gene_matcher.to_data_table(
                selected_genes=selected_genes if selected_genes else None)

        self.Outputs.data_table.send(table)
        self.Outputs.gene_matcher_results.send(gm_table)

    def toggle_radio_options(self):
        """Enable or disable the option controls to fit the current state.

        The "replace" control is enabled only when ``use_attr_names`` is
        truthy. When the gene matcher holds genes, the "exclude" control and
        the ``exclude_unmatched`` setting are both set to whether at least one
        gene is unknown (the number of genes differs from the number of known
        genes). Nothing else is touched when there is no gene matcher or it
        has no genes.
        """
        self.replace_radio.setEnabled(bool(self.use_attr_names))

        matcher = self.gene_matcher
        if matcher is None:
            # _run() also treats a missing matcher as a valid state
            return

        genes = matcher.genes
        if genes:
            # compute once; used for both the control and the setting
            has_unmatched = len(genes) != len(matcher.get_known_genes())
            # enable checkbox if unknown genes are detected
            self.exclude_radio.setEnabled(has_unmatched)
            self.exclude_unmatched = has_unmatched

    def get_target_ids(self):
        """Return one id string per gene of the matcher, in matcher order.

        A gene with a falsy ``gene_id`` is represented by ``'?'``.
        """
        target_ids = []
        for gene in self.gene_matcher.genes:
            gene_id = gene.gene_id
            target_ids.append(str(gene_id) if gene_id else '?')
        return target_ids
    def runner(self, state: TaskState) -> Table:
        """Build (or reuse) the expression table of the selected collection.

        When ``self.data_table`` is empty, the collection is downloaded via
        ``resdk.tables.RNATables``, converted to a ``Table`` with sample
        metadata (and QC data when ``self.append_qc_data`` is set) as metas,
        its genes are matched, and the result is cached in
        ``self.data_table``. The table is then normalized and returned.

        :param state: task state used for progress/status reporting and for
            checking whether an interruption was requested.
        :return: the normalized table.
        :raises Exception: from the progress callback when an interruption
            has been requested.
        """
        exp_type = self.data_output_options.expression_type[self.exp_type].type
        exp_source = self.data_output_options.expression_sources[
            self.exp_source]
        proc_slug = self.data_output_options.process[self.proc_slug].slug
        collection_id = self.selected_collection_id

        table = self.data_table

        def callback(i: float, status=""):
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        if not table:
            collection = self.res.get_collection_by_id(collection_id)
            coll_table = resdk.tables.RNATables(
                collection,
                expression_source=exp_source,
                expression_process_slug=proc_slug,
                progress_callable=wrap_callback(callback, end=0.5),
            )
            species = coll_table._data[0].output['species']
            sample = coll_table._samples[0]

            state.set_status('Downloading ...')
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
                df_exp = df_exp.rename(index=coll_table.readable_index)
                df_metas = coll_table.meta
                df_metas = df_metas.rename(index=coll_table.readable_index)
                df_qc = None
                if self.append_qc_data:
                    # TODO: check if there is a way to detect if collection
                    #       table contains QC data
                    try:
                        df_qc = coll_table.qc
                        df_qc = df_qc.rename(index=coll_table.readable_index)
                    except ValueError:
                        # QC data is appended on a best-effort basis
                        pass
            finally:
                # close the loop even when the download fails or is
                # interrupted, so it does not leak
                loop.close()

            state.set_status('To data table ...')

            # field names that occur in more than one section of the
            # metadata columns ("section.field")
            duplicates = {
                item
                for item, count in Counter([
                    label.split('.')[1]
                    for label in df_metas.columns.to_list() if '.' in label
                ]).items() if count > 1
            }

            # what happens if there is more nested sections?
            section_name_to_label = {
                section['name']: section['label']
                for section in sample.descriptor_schema.schema
            }

            column_labels = {}
            for field_schema, fields, path in iterate_schema(
                    sample.descriptor, sample.descriptor_schema.schema,
                    path=''):
                path = path[1:]  # this is ugly, but cant go around it
                if path not in df_metas.columns:
                    continue
                label = field_schema['label']
                section_name, field_name = path.split('.')
                # prefix the section label only where the bare field label
                # would be ambiguous
                column_labels[path] = (
                    label if field_name not in duplicates else
                    f'{section_name_to_label[section_name]} - {label}')

            df_exp = df_exp.reset_index(drop=True)
            df_metas = df_metas.astype('object')
            df_metas = df_metas.fillna(np.nan)
            df_metas = df_metas.replace('nan', np.nan)
            df_metas = df_metas.rename(columns=column_labels)
            if df_qc is not None:
                df_metas = pd.merge(df_metas,
                                    df_qc,
                                    left_index=True,
                                    right_index=True)

            xym, domain_metas = vars_from_df(df_metas)
            x, _, m = xym
            x_metas = np.hstack((x, m))
            attrs = [ContinuousVariable(col) for col in df_exp.columns]
            metas = domain_metas.attributes + domain_metas.metas
            domain = Domain(attrs, metas=metas)
            table = Table(domain, df_exp.to_numpy(), metas=x_metas)
            # download and conversion make up the first half; gene matching
            # below continues from 50
            state.set_progress_value(50)

            state.set_status('Matching genes ...')
            progress_steps_gm = iter(
                np.linspace(50, 99, len(coll_table.gene_ids)))

            def gm_callback():
                state.set_progress_value(next(progress_steps_gm))

            tax_id = species_name_to_taxid(species)
            gm = GeneMatcher(tax_id, progress_callback=gm_callback)
            table = gm.match_table_attributes(table, rename=True)
            table.attributes[TableAnnotation.tax_id] = tax_id
            table.attributes[TableAnnotation.gene_as_attr_name] = True
            table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
            self.data_table = table

        state.set_status('Normalizing ...')
        table = self.normalize(table)
        state.set_progress_value(100)

        return table
Beispiel #26
0
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """Download the expressions of matching data objects into one table.

    ``exp_type``, ``proc_type`` and ``input_annotation`` are indices into the
    corresponding lists of ``options``. Data objects are filtered by process
    type, annotation (source, species, build) and, unless the expression
    type is ``'rc'``, by expression type. Each remaining object contributes
    one row; sample descriptor values are stored as string metas and the
    genes (columns) are matched for the species of the annotation.

    Raises ``ResolweDataObjectsNotFound`` when no data object passes the
    filters.
    """
    # resolve option indices into the values they stand for
    exp_type = options.expression[exp_type].type
    proc_type = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    # 'rc' is read from the output field of the same name, every other
    # expression type from the 'exp' field
    raw_counts = exp_type == 'rc'
    file_output_field = exp_type if raw_counts else 'exp'

    def matches_selection(obj) -> bool:
        if obj.process.type != proc_type:
            return False
        output = obj.output
        if not (output['source'] == source and output['species'] == species and output['build'] == build):
            return False
        return raw_counts or output['exp_type'] == exp_type

    # apply filters
    data_objects = [obj for obj in data_objects if matches_selection(obj)]
    if not data_objects:
        raise ResolweDataObjectsNotFound

    # one step per download plus three for the conversion stages below
    total_steps = len(data_objects) + 3
    pending_steps = iter(range(1, total_steps + 1))

    def set_progress():
        state.set_progress_value(100 * (next(pending_steps) / total_steps))

    data_frames = []
    metadata = defaultdict(list)

    state.set_status('Downloading ...')
    for data_object in data_objects:
        set_progress()

        # sample descriptor -> one single-element row per metadata column
        sample = data_object.sample
        general = sample.descriptor.get('general', {})
        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])
        metadata['sample_name'].append([sample.name])
        metadata['expression_type'].append([exp_type.upper()])

        response = res.get_expressions(data_object.id, data_object.output[file_output_field]['file'])
        with io.BytesIO(response.content) as buffer:
            frame = pd.read_csv(buffer, sep='\t', compression='gzip')
        # genes become columns, the sample becomes a single row
        data_frames.append(frame.set_index('Gene').T.reset_index(drop=True))

    state.set_status('Concatenating samples ...')
    df = pd.concat(data_frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(df)
    set_progress()

    state.set_status('Adding metadata ...')
    metas = [StringVariable(label) for label in metadata]
    table = table.transform(
        Domain(table.domain.attributes, table.domain.class_vars, metas))

    for key, value in metadata.items():
        table[:, key] = value
    set_progress()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    table = GeneMatcher(tax_id).match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    set_progress()

    return table
Beispiel #27
0
def panglao_db(file_path: str):
    """Build the PanglaoDB marker-gene table and save it to disk.

    ``file_path`` is a gzip-compressed, tab-separated file. From every line
    the species, gene symbol and cell type fields (columns 0, 1 and 2) are
    used. Entries for 'Mm' (Mouse) and 'Hs' (Human) are collected, matched
    to Entrez ids per organism, extended with a 'Function' column taken from
    the matched gene descriptions, and written to
    ``data/marker_genes/panglao_gene_markers.tab``.
    """
    file_name = 'panglao_gene_markers.tab'
    reference, reference_url = 'PanglaoDB', 'https://panglaodb.se/'

    with gzip.open(file_path, 'rb') as handle:
        content = handle.read().decode('utf-8').strip()

    # positions of the fields used from every line
    species, gene_symbol, cell_type = 0, 1, 2
    organism_mapper = {'Mm': 'Mouse', 'Hs': 'Human'}
    genes_by_organism = defaultdict(list)

    for line in content.split('\n'):
        columns = line.split('\t')

        # the species field may list several organisms separated by spaces
        for org in columns[species].split(' '):
            organism = organism_mapper.get(org)
            if organism is None:
                continue
            genes_by_organism[organism].append([
                organism, columns[gene_symbol], columns[cell_type],
                reference, reference_url
            ])

    domain = Domain(
        [],
        metas=[
            StringVariable('Organism'),
            StringVariable('Name'),
            StringVariable('Cell Type'),
            StringVariable('Reference'),
            StringVariable('URL'),
        ],
    )

    entrez_id_column = StringVariable('Entrez ID')
    description_column = StringVariable('Function')

    def _organism_table(tax_id: str, organism: str):
        # match the marker genes of one organism and append their descriptions
        matcher = GeneMatcher(tax_id)
        markers = Table(domain, genes_by_organism[organism])
        markers = matcher.match_table_column(markers, 'Name',
                                             entrez_id_column)
        functions = Table(
            Domain([], metas=[description_column]),
            [[str(gene.description) if gene.description else '']
             for gene in matcher.genes])
        return Table.concatenate([markers, functions])

    mouse_table = _organism_table('10090', 'Mouse')
    human_table = _organism_table('9606', 'Human')

    # stack both organisms row-wise and write the result
    combined = Table.concatenate([mouse_table, human_table], axis=0)
    combined.save(f'data/marker_genes/{file_name}')
    def test_match_table_column(self):
        """Matching the 'gene' column puts ENTREZ_ID into the table domain."""
        gm = GeneMatcher('4932')

        data = gm.match_table_column(Table('brown-selected.tab'), 'gene')
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn(ENTREZ_ID, data.domain)
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, GENE_INFO_TAGS

# input: organism tax id and the gene symbols to look up
organism = 9606
genes_symbols_to_match = ['HB1', 'BCKDHB', 'TWIST1']

# create the matcher and hand it the symbols
gene_matcher = GeneMatcher(organism)
gene_matcher.genes = genes_symbols_to_match

# run matching process
gene_matcher.run_matcher()

# report the outcome for every input gene
for gene in gene_matcher.genes:
    report = (
        "\ninput name: " + gene.input_name,
        "\nid from ncbi: ", gene.ncbi_id,
        "\nmatch type: ", gene.type_of_match,
    )
    print(*report)

    # without a match, list the ids of the candidates that were considered
    unresolved = gene.ncbi_id is None
    if unresolved and gene.possible_hits:
        print('possible_hits: ', [hit.ncbi_id for hit in gene.possible_hits])
                                            Gene()).homology_group_id
        homologs = [
            gene.gene_id
            for gene in self._homologs_by_group.get(homology_group, [])
            if gene.tax_id == organism
        ]
        if len(homologs) == 1:
            return homologs[0]
        else:
            # Is possible that find more then one gene?
            return None


if __name__ == "__main__":
    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, load_gene_summary
    import Orange

    # Demo: look up the '9606' homolog of every gene matched in the
    # "brown-selected" table (matched with organism '4932') and print the pairs.
    homology = HomoloGene()

    gm = GeneMatcher('4932')
    genes = Orange.data.Table("brown-selected")
    gm.genes = genes

    _homologs = []
    for gene in gm.genes:
        _homologs.append(homology.find_homolog(str(gene.gene_id), '9606'))
    _homologs = load_gene_summary('9606', _homologs)

    for gene, homolog in zip(gm.genes, _homologs):
        print(f'{gene} ----> {homolog}')