Ejemplo n.º 1
0
def omim_gene_sets(org):
    """ Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseases.

    Only the human taxonomy id ``'9606'`` is handled. For any other value
    of ``org`` nothing is built and ``None`` is returned.

    For every disease reported by ``omim.diseases()`` the associated gene
    symbols are run through a :class:`GeneMatcher`; only genes for which
    the matcher reports an NCBI id end up in the gene set.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    collected = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # Genes without an NCBI id are left out of the set.
        ncbi_ids = [
            int(entry.ncbi_id) for entry in matcher.genes
            if entry.ncbi_id is not None
        ]

        # A link is only built when the disease carries an id.
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        collected.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=disease_link))

    return GeneSets(collected)
Ejemplo n.º 2
0
def dicty_mutant_gene_sets(org):
    """ Return dicty mutant phenotype gene sets from Dictybase.

    Only taxonomy id ``'352472'`` is handled. For any other value of
    ``org`` nothing is built and ``None`` is returned.

    One gene set is created per phenotype. The first gene of every mutant
    listed for the phenotype is run through a :class:`GeneMatcher` and only
    genes with an NCBI id are kept.
    """
    if org != '352472':
        return None

    collected = []
    matcher = GeneMatcher('352472')

    phenotype_map = dicty.phenotypes.phenotype_mutants()
    for phenotype, mutants in phenotype_map.items():
        # Only the first gene reported for each mutant is used.
        symbols = [
            dicty.phenotypes.mutant_genes(mutant)[0] for mutant in mutants
        ]
        matcher.genes = symbols
        matcher.run_matcher()

        ncbi_ids = [
            int(entry.ncbi_id) for entry in matcher.genes
            if entry.ncbi_id is not None
        ]

        # Report how many symbols went in and how many ids came out
        # whenever some symbols could not be matched.
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        collected.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=ncbi_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(collected)
Ejemplo n.º 3
0
def cytoband_gene_sets(org):
    """ Create cytoband gene sets from Stanford Microarray Database.

    Only the human taxonomy id ``'9606'`` is handled. For any other value
    of ``org`` nothing is downloaded and ``None`` is returned.

    Every line of the downloaded file is split on tabs: the first field
    becomes the gene set id, the second its name and the remaining fields
    are gene symbols that are run through a :class:`GeneMatcher`.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        collected = []

        for raw_line in stream.read().splitlines():
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]

            matcher.genes = symbols
            matcher.run_matcher()

            ncbi_ids = [
                int(entry.ncbi_id) for entry in matcher.genes
                if entry.ncbi_id is not None
            ]

            # A band without gene symbols always gets an empty gene list.
            band_genes = ncbi_ids if symbols else []

            collected.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=band_genes,
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

        return GeneSets(collected)
Ejemplo n.º 4
0
def reactome_gene_sets(org):
    """ Prepare human pathways gene sets from reactome pathways.

    Only the human taxonomy id ``'9606'`` is handled. For any other value
    of ``org`` nothing is downloaded and ``None`` is returned.

    The archive behind ``REACTOME_DOWNLOAD_LINK`` is fetched into memory
    and the member ``REACTOME_FILE_NAME`` is read as tab-separated text.
    For every line the first field is used both as gene set id and name,
    and the fields from the third one on are gene symbols that are run
    through a :class:`GeneMatcher`. Only genes with an NCBI id are kept.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    # Both the archive and the extracted member are closed as soon as the
    # content has been read; nothing below needs them.
    with ZipFile(memfile, 'r') as myzip:
        with myzip.open(REACTOME_FILE_NAME) as member:
            content = member.read().decode().splitlines()

    genesets = []

    for line in content:
        # Split once per line; an empty slice is already an empty list.
        fields = line.split('\t')

        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        genes = [
            int(gene.ncbi_id) for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
    def send_to_output(self, result):
        """ Convert a finished download into a table and send it to the output.

        ``result`` is unpacked into ``(etc_json, table_name)``. The table is
        built with ``etc_to_table``, annotated with NCBI gene ids through a
        :class:`GeneMatcher` and tagged with the table attributes describing
        where the gene ids live before it is sent on ``Outputs.etc_data``.
        """
        self.progress_bar.finish()
        self.setStatusMessage('')

        etc_json, table_name = result
        genes_as_attr_names = bool(self.gene_as_attr_name)

        # Build the table and give it its name.
        data = etc_to_table(etc_json, genes_as_attr_names)
        data.name = table_name

        tax_id = str(self.orgnism)
        gene_matcher = GeneMatcher(tax_id)

        if genes_as_attr_names:
            # Genes are attribute names: let the matcher annotate them.
            gene_matcher.match_table_attributes(data)
            data.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID
        else:
            if 'Gene' in data.domain:
                gene_names = data.get_column_view(data.domain['Gene'])[0]
                gene_matcher.genes = gene_names
                gene_matcher.run_matcher()

                # One meta column holding the matched id, '?' when missing.
                id_rows = []
                for gene in gene_matcher.genes:
                    id_rows.append(
                        [str(gene.ncbi_id) if gene.ncbi_id else '?'])

                id_domain = Domain([], metas=[StringVariable(NCBI_ID)])
                id_table = Table(id_domain, id_rows)
                data = Table.concatenate([data, id_table])

            data.attributes[GENE_ID_COLUMN] = NCBI_ID

        # Table-level attributes.
        data.attributes[TAX_ID] = tax_id
        data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_as_attr_names

        # Reset cache indicators, then emit the table.
        self.set_cached_indicator()
        self.Outputs.etc_data.send(data)
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, GENE_INFO_TAGS

# Input: the organism (taxonomy id) and the gene symbols to look up.
organism = 9606
genes_symbols_to_match = ['HB1', 'BCKDHB', 'TWIST1']

# Create the matcher and hand it the symbols.
gene_matcher = GeneMatcher(organism)
gene_matcher.genes = genes_symbols_to_match

# Run the matching process.
gene_matcher.run_matcher()

# Report the outcome for every input symbol.
for gene in gene_matcher.genes:
    name_line = "\ninput name: " + gene.input_name
    print(name_line,
          "\nid from ncbi: ", gene.ncbi_id,
          "\nmatch type: ", gene.type_of_match)

    # For genes without an id, list the ids of the candidate hits (if any).
    if gene.ncbi_id is None and gene.possible_hits:
        candidate_ids = [hit.ncbi_id for hit in gene.possible_hits]
        print('possible_hits: ', candidate_ids)
    def Update(self):
        """
        Update (recompute enriched pathways) the widget state.

        Gene names are extracted from the input data and, when a reference
        is enabled and present, from the reference data. When names do not
        come from attribute names, comma-separated entries are expanded into
        individual genes. The query genes are then mapped to NCBI ids with
        ``GeneMatcher`` and the enrichment computation is submitted to the
        executor as a background task whose completion is handled by
        ``_onEnrichTaskFinished``.

        Does nothing when there is no input data.
        """
        if not self.data:
            return

        def expand_multi_gene_names(names):
            # Flatten "a, b" style entries in a single pass. The previous
            # reduce(add, ...) copied the accumulated list once per input
            # name, which is quadratic in the number of names.
            return [part
                    for name in names
                    for part in split_and_strip(name, ",")]

        self.error(0)
        self.information(0)

        # XXX: Check data in setData, do not even allow this to be executed if
        # data has no genes
        try:
            genes = self.GeneNamesFromData(self.data)
        except ValueError:
            self.error(0, "Cannot extract gene names from input.")
            genes = []

        if not self.useAttrNames and any("," in gene for gene in genes):
            genes = expand_multi_gene_names(genes)
            self.information(0,
                             "Separators detected in input gene names. "
                             "Assuming multiple genes per instance.")

        self.queryGenes = genes

        self.information(1)
        reference = None
        if self.useReference and self.refData:
            reference = self.GeneNamesFromData(self.refData)
            if not self.useAttrNames \
                    and any("," in gene for gene in reference):
                reference = expand_multi_gene_names(reference)
                self.information(1,
                                 "Separators detected in reference gene "
                                 "names. Assuming multiple genes per "
                                 "instance.")

        org_code = self.SelectedOrganismCode()

        from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
        gm = GeneMatcher(kegg.to_taxid(org_code))
        gm.genes = genes
        gm.run_matcher()
        # input gene name -> NCBI id (as a string)
        mapped_genes = {gene: str(ncbi_id)
                        for gene, ncbi_id in gm.map_input_to_ncbi().items()}

        def run_enrichment(org_code, genes, reference=None, progress=None):
            # Runs inside the background task; ``genes`` is the
            # name -> NCBI id mapping built above.
            org = kegg.KEGGOrganism(org_code)
            if reference is None:
                reference = org.get_ncbi_ids()

            # This is here just to keep widget working without any major changes.
            # map not needed, geneMatcher will not work on widget level.
            unique_genes = genes
            unique_ref_genes = {gene: gene for gene in set(reference)}

            taxid = kegg.to_taxid(org.org_code)
            # Map the taxid back to standard 'common' taxids
            # (as used by 'geneset') if applicable
            r_tax_map = {v: k for k, v in kegg.KEGGGenome.TAXID_MAP.items()}
            if taxid in r_tax_map:
                taxid = r_tax_map[taxid]

            # We use the kegg pathway gene sets provided by 'geneset' for
            # the enrichment calculation.

            kegg_api = kegg.api.CachedKeggApi()
            linkmap = kegg_api.link(org.org_code, "pathway")
            converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
            # upper-cased KEGG gene id -> bare NCBI id (prefix before ':'
            # stripped)
            kegg_to_ncbi = {gene.upper(): ncbi.split(':')[-1]
                            for ncbi, gene in converted_ids}
            kegg_sets = relation_list_to_multimap(linkmap, kegg_to_ncbi)

            kegg_sets = geneset.GeneSets(input=kegg_sets)

            pathways = pathway_enrichment(
                kegg_sets, unique_genes.values(),
                unique_ref_genes.keys(),
                callback=progress
            )
            # Ensure that pathway entries are pre-cached for later use in the
            # list/tree view
            kegg_pathways = kegg.KEGGPathways()
            kegg_pathways.pre_cache(
                pathways.keys(), progress_callback=progress
            )

            return pathways, org, unique_genes, unique_ref_genes

        self.progressBarInit()
        self.setEnabled(False)
        self.infoLabel.setText("Retrieving...\n")

        progress = concurrent.methodinvoke(self, "setProgress", (float,))

        self._enrichTask = concurrent.Task(
            function=lambda:
                run_enrichment(org_code, mapped_genes, reference, progress)
        )
        self._enrichTask.finished.connect(self._onEnrichTaskFinished)
        self._executor.submit(self._enrichTask)
Ejemplo n.º 8
0
class Annotations:
    """ :class:`Annotations` object holds the annotations.

    :param str organism:
        an organism specifier (e.g. ``'9606'``). Annotations for that organism will be loaded.

    :param ontology: :class:`Ontology` object for annotations
    :type ontology: :class:`Ontology`

    :param progress_callback:
        forwarded to ``serverfiles.localpath_download`` while the
        annotation file is being fetched.

    :raises taxonomy.UnknownSpeciesIdentifier:
        when no annotation file can be found for ``organism``.

    """
    def __init__(self, organism, ontology=None, progress_callback=None):
        self._init_state(organism, ontology)

        try:
            path = serverfiles.localpath_download(
                DOMAIN,
                FILENAME_ANNOTATION.format(organism),
                progress_callback=progress_callback)
        except FileNotFoundError:
            raise taxonomy.UnknownSpeciesIdentifier(organism)

        self._parse_file(path)

    def _init_state(self, organism, ontology=None):
        """ Create the empty containers; no annotation file is touched. """
        # The ontology passed by the caller is kept. Previously it was
        # stored and then immediately overwritten with ``None``.
        self._ontology = ontology

        #: A dictionary mapping a gene (gene_id) to a list of all annotations of that gene.
        self.gene_annotations = defaultdict(list)

        #: A dictionary mapping a GO term id to a list of annotations that are directly annotated to that term
        self.term_anotations = defaultdict(list)

        #: Cache of collected annotations per GO term id (term and subterms).
        self.all_annotations = defaultdict(list)

        self._gene_names = None
        self._gene_names_dict = None
        self.gene_matcher = GeneMatcher(organism)

        #: A list of all :class:`AnnotationRecords` instances.
        self.annotations = []
        self.header = ''
        self.taxid = organism

    @property
    def ontology(self):
        return self._ontology

    @ontology.setter
    def ontology(self, ontology):
        """ Set the ontology to use in the annotations mapping.
        """
        # Cached results depend on the ontology, so drop them.
        self.all_annotations = defaultdict(list)
        self._ontology = ontology

    def _ensure_ontology(self):
        """ Create a default :class:`Ontology` when none has been set. """
        if self.ontology is None:
            self.ontology = Ontology()

    def _parse_file(self, file_path):
        """ Read the header line, then add one annotation per remaining line. """
        with open(file_path, 'r') as anno_file:
            self.header = anno_file.readline()

            # Stream the file instead of materialising every line first.
            for line in anno_file:
                self.add_annotation(AnnotationRecord.from_string(line))

    def add_annotation(self, a):
        """ Add a single :class:`AnnotationRecord` instance to this object.

        Records without a gene id or GO id, and records whose qualifier is
        ``'NOT'``, are ignored.
        """
        if not isinstance(a, AnnotationRecord):
            a = AnnotationRecord(a)
        if not a.gene_id or not a.go_id or a.qualifier == 'NOT':
            return

        self.gene_annotations[int(a.gene_id)].append(a)
        self.term_anotations[a.go_id].append(a)

        self.annotations.append(a)
        # Any cached per-term collection is stale now.
        self.all_annotations = defaultdict(list)

    def map_to_ncbi_id(self, genes):
        """ Run gene name matching and return only known genes.

        Returns a dictionary mapping the input gene to its NCBI id for
        every matched gene that has at least one annotation.
        """
        self.gene_matcher.genes = genes
        self.gene_matcher.run_matcher()

        if self.gene_matcher:
            # ``.get`` keeps the defaultdict from growing an empty entry
            # for every id that has no annotations.
            return {
                input_gene: ncbi_id
                for input_gene, ncbi_id in
                self.gene_matcher.map_input_to_ncbi().items()
                if self.gene_annotations.get(ncbi_id)
            }

    def map_from_ncbi_id(self):
        """ Return the reverse of :meth:`map_to_ncbi_id` (NCBI id to input
        gene) for the genes currently held by the gene matcher.
        """
        if self.gene_matcher:
            return {
                ncbi_id: input_gene
                for input_gene, ncbi_id in
                self.gene_matcher.map_input_to_ncbi().items()
                if self.gene_annotations.get(ncbi_id)
            }

    def _collect_annotations(self, go_id, visited):
        """ Recursive function collects and caches all annotations for id
        """
        if go_id not in self.all_annotations and go_id not in visited:
            if go_id in self.ontology.reverse_alias_mapper:
                annotations = [
                    self.term_anotations.get(alt_id, [])
                    for alt_id in self.ontology.reverse_alias_mapper[go_id]
                ] + [self.term_anotations[go_id]]
            else:
                annotations = [self.term_anotations[go_id]
                               ]  # annotations for this term alone
            visited.add(go_id)

            for typeId, child in self.ontology[go_id].related_to:
                aa = self._collect_annotations(child, visited)
                if isinstance(aa, set):
                    annotations.append(
                        aa)  # if it was already reduced in get_all_annotations
                else:
                    annotations.extend(aa)
            self.all_annotations[go_id] = annotations
        return self.all_annotations[go_id]

    def get_annotations_by_go_id(self, go_id):
        """ Return a set of all annotations (instances of :obj:`AnnotationRecord`)
        for GO term `id` and all its subterms.

        Args:
            go_id (:obj:`str`): GO term id

        """
        self._ensure_ontology()
        term_id = self.ontology.alias_mapper.get(go_id, go_id)
        # A list in the cache is an unreduced intermediate result left by
        # _collect_annotations; only a set is a finished entry.
        if term_id not in self.all_annotations or isinstance(
                self.all_annotations[term_id], list):
            annot_set = set()
            for annots in self._collect_annotations(term_id, set()):
                annot_set.update(annots)
            self.all_annotations[term_id] = annot_set
        return self.all_annotations[term_id]

    def get_genes_by_go_term(self, go_id, evidence_codes=None):
        """ Return a list of genes annotated by specified `evidence_codes`
        to GO term 'id' and all its subterms.

        :param str go_id: GO term id

        :param list-of-strings evidence_codes:
            List of evidence codes to consider when matching annotations
            to terms.

        """
        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = self.get_annotations_by_go_id(go_id)
        return list({
            int(ann.gene_id) for ann in annotations
            if ann.evidence in evidence_codes
        })

    def genes(self):
        """ Return the set of gene ids (as ints) of all held annotations. """
        return {int(ann.gene_id) for ann in self.annotations}

    def get_enriched_terms(self,
                           genes,
                           reference=None,
                           evidence_codes=None,
                           slims_only=False,
                           aspect=None,
                           prob=statistics.Binomial(),
                           use_fdr=True,
                           progress_callback=None):
        """ Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

         Args:
            genes: List of genes
            reference: List of genes (if None all genes included in the annotations will be used).
            evidence_codes: List of evidence codes to consider.
            slims_only: If `True` return only slim terms.
            aspect: Which aspects to use. Use all by default;
                        one of Process (biological process),
                               Function (molecular function) or
                               Component (cellular component)

            prob: object whose ``p_value`` method computes the term p-value.
            use_fdr: adjust the p-values with ``statistics.FDR`` when true.
            progress_callback: called with a percentage while terms are
                processed.

        """

        all_genes = set(genes)

        if not reference:
            # Documented default: every gene that has an annotation.
            # (Falling back to the query genes made the reference equal to
            # the query.)
            reference = self.genes()
        else:
            # Set operations are used below; accept any iterable.
            reference = set(reference)

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        evidence_codes = set(evidence_codes or evidenceDict.keys())
        # ``.get`` avoids inserting empty entries for unknown genes.
        annotations = [
            ann for gene in genes
            for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        ]

        ref_annotations = {
            ann for gene in reference
            for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        }

        annotations_dict = defaultdict(set)
        for ann in annotations:
            annotations_dict[ann.go_id].add(ann)

        self._ensure_ontology()
        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        terms = annotations_dict.keys()
        filtered_terms = [term for term in terms if term in self.ontology]

        if len(terms) != len(filtered_terms):
            term_diff = set(terms) - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)), UserWarning)

        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        milestones = progress_bar_milestones(len(terms), 100)
        # NCBI id -> input gene, as left by the last map_to_ncbi_id call.
        unmatch = self.map_from_ncbi_id()

        for i, term in enumerate(terms):
            if slims_only and term not in self.ontology.slims_subset:
                continue
            all_annotations = self.get_annotations_by_go_id(term).intersection(
                ref_annotations)
            all_annotated_genes = {int(ann.gene_id) for ann in all_annotations}
            mapped_genes = all_genes.intersection(all_annotated_genes)
            mapped_reference_genes = reference.intersection(
                all_annotated_genes)

            res[term] = ([unmatch[gene] for gene in mapped_genes],
                         prob.p_value(len(mapped_genes), len(reference),
                                      len(mapped_reference_genes),
                                      len(genes)), len(mapped_reference_genes))

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / len(terms))

        if use_fdr:
            # FDR works on the p-values in ascending order.
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term: (term_genes, p_adjusted, ref_count)
                for (term, (term_genes, _, ref_count)), p_adjusted in zip(
                    ranked, adjusted)
            }
        return res

    def get_annotated_terms(self,
                            genes,
                            direct_annotation_only=False,
                            evidence_codes=None,
                            progress_callback=None):
        """ Return all terms that are annotated by genes with evidence_codes.

        ``genes`` may be a single gene name or an iterable of names. Names
        the gene matcher cannot map to an annotated NCBI id are skipped.
        The result maps a GO term id to the set of input gene names.
        """

        genes = [genes] if isinstance(genes, str) else genes
        match = self.map_to_ncbi_id(genes)
        unmatch = self.map_from_ncbi_id()
        # ``match`` only holds known genes; unknown ones must not raise.
        genes = {match[gene] for gene in genes if gene in match}

        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = [
            ann for gene in genes
            for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes
        ]

        dd = defaultdict(set)
        for ann in annotations:
            dd[ann.go_id].add(unmatch[int(ann.gene_id)])

        if not direct_annotation_only:
            self._ensure_ontology()
            terms = dd.keys()
            filtered_terms = [term for term in terms if term in self.ontology]
            if len(terms) != len(filtered_terms):
                term_diff = set(terms) - set(filtered_terms)
                warnings.warn(
                    "%s terms in the annotations were not found in the "
                    "ontology." % ",".join(map(repr, term_diff)), UserWarning)

            terms = self.ontology.extract_super_graph(filtered_terms)
            for i, term in enumerate(terms):
                term_annotations = self.get_annotations_by_go_id(
                    term).intersection(annotations)
                dd[term].update(
                    [unmatch[int(ann.gene_id)] for ann in term_annotations])
        return dict(dd)

    def __add__(self, iterable):
        """ Return a new Annotations object with combined annotations
        """
        # The constructor expects an organism and loads its annotation
        # file, so the combined object is assembled directly from the
        # records instead of going through __init__.
        combined = type(self).__new__(type(self))
        combined._init_state(self.taxid, self.ontology)
        combined.header = self.header
        combined.extend(self)
        combined.extend(iterable)
        return combined

    def __iadd__(self, iterable):
        """ Add annotations to this instance
        """
        self.extend(iterable)
        return self

    def __contains__(self, item):
        return item in self.annotations

    def __iter__(self):
        """ Iterate over all AnnotationRecord objects in annotations
        """
        return iter(self.annotations)

    def __len__(self):
        """ Return the number of annotations
        """
        return len(self.annotations)

    def __getitem__(self, index):
        """ Return the i-th annotation record
        """
        return self.annotations[index]

    def __getslice__(self, *args):
        # Lists have no __getslice__ in Python 3; build the slice instead.
        return self.annotations[slice(*args)]

    def add(self, line):
        """ Add one annotation
        """
        self.add_annotation(line)

    def append(self, line):
        """ Add one annotation
        """
        self.add_annotation(line)

    def extend(self, lines):
        """ Add multiple annotations
        """
        for line in lines:
            self.add_annotation(line)
    def _on_dataready(self):
        """ Handle the finished data task and send the expression data.

        The widget is re-enabled, the task result is fetched (a
        ``URLError`` is reported as a widget error and aborts the method),
        the table is restricted to the selected sample annotations,
        NCBI gene ids are attached through a ``GeneMatcher`` and the table
        is tagged with its table attributes before being sent on the
        "Expression Data" output.
        """
        self.setEnabled(True)
        self.setBlocking(False)
        self.progressBarFinished(processEvents=False)

        try:
            data = self._datatask.result()
        except urlrequest.URLError as error:
            self.error(0, ("Error while connecting to the NCBI ftp server! "
                           "'%s'" % error))
            sys.excepthook(type(error), error, getattr(error, "__traceback__"))
            return
        finally:
            # The task is released whether or not it succeeded.
            self._datatask = None

        data_name = data.name
        samples, _ = self.selectedSamples()

        self.warning(0)
        message = None
        from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher

        tax_id = self.currentGds.get('taxid', '')
        gene_matcher = GeneMatcher(tax_id)
        samples = set(samples)

        if self.outputRows:
            def sample_annotations(instance):
                # (name, value) pairs of all metas, plus the class value
                # unless the class variable is literally named 'class'.
                pairs = [(meta.name, instance[meta].value)
                         for meta in data.domain.metas]

                if data.domain.class_var.name != 'class':
                    pairs.append((data.domain.class_var.name,
                                  instance[data.domain.class_var].value))
                return pairs

            # Keep the rows whose annotations are all among the selected.
            mask = [samples.issuperset(sample_annotations(ex)) for ex in data]
            data = data[numpy.array(mask, dtype=bool)]
            gene_matcher.match_table_attributes(data)
            if len(data) == 0:
                message = "No samples with selected sample annotations."
        else:
            # Keep the attributes whose annotations are all among the
            # selected samples.
            kept_attributes = [
                attr for attr in data.domain.attributes
                if samples.issuperset(attr.attributes.items())
            ]
            domain = Domain(kept_attributes,
                            data.domain.class_var,
                            data.domain.metas)

            if len(domain.attributes) == 0:
                message = "No samples with selected sample annotations."

            # Strip attribute annotations of types that were not selected.
            stypes = {sample[0] for sample in samples}
            for attr in domain.attributes:
                attr.attributes = {
                    key: value for key, value in attr.attributes.items()
                    if key in stypes
                }

            data = Table(domain, data)

            if 'gene' in data.domain:
                gene_names = data.get_column_view(data.domain['gene'])[0]
                gene_matcher.genes = gene_names
                gene_matcher.run_matcher()

                # One meta column holding the matched id, '?' when missing.
                id_rows = []
                for gene in gene_matcher.genes:
                    id_rows.append(
                        [str(gene.ncbi_id) if gene.ncbi_id else '?'])

                id_domain = Domain([], metas=[StringVariable(NCBI_ID)])
                id_table = Table(id_domain, id_rows)

                data = Table.concatenate([data, id_table])

        if message is not None:
            self.warning(0, message)

        rows_out = bool(self.outputRows)
        data.attributes[TAX_ID] = tax_id
        data.attributes[GENE_AS_ATTRIBUTE_NAME] = rows_out

        if rows_out:
            data.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID
        else:
            data.attributes[GENE_ID_COLUMN] = NCBI_ID

        data.name = data_name
        self.send("Expression Data", data)

        source_model = self.treeWidget.model().sourceModel()
        row = self.gds.index(self.currentGds)

        source_model.setData(source_model.index(row, 0), " ", Qt.DisplayRole)

        self.updateInfo()
        self.selectionChanged = False