Example #1
    def update_from_obo_library(
        cls,
        prefix: str,
        extension: str = "owl",
        **kwargs,
    ):
        prefix = prefix.lower()
        cache_path = get_resource_path(f"{prefix}.{extension}.pkl")

        if os.path.exists(cache_path):
            with open(cache_path, "rb") as file:
                ontology = pickle.load(file)
        else:
            try:
                import pronto
            except ImportError:
                raise ImportError(
                    "To use the INDRA OWL Client, you must first "
                    "install Pronto with `pip install pronto`."
                )
            ontology = pronto.Ontology.from_obo_library(
                f"{prefix.upper()}.{extension}")
            with open(cache_path, "wb") as file:
                pickle.dump(ontology, file, protocol=pickle.HIGHEST_PROTOCOL)

        cls.update_resource(prefix=prefix, ontology=ontology, **kwargs)
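
A minimal usage sketch for the classmethod above; the enclosing class name
(OwlClient) and import path are assumed from context, not shown in the
snippet:

# Hypothetical usage: OwlClient is assumed to be the class that defines
# update_from_obo_library; adjust the import to the actual codebase.
from indra.databases.obo_client import OwlClient

# Fetches CHEBI.owl from the OBO library (or loads the local pickle cache)
# and regenerates the corresponding INDRA resource file.
OwlClient.update_from_obo_library('chebi', extension='owl')
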
Example #2
def _process_categories():
    """Collect protein category labels from multiple sources."""
    idg_df = pandas.read_csv(get_resource_path('IDG_target_final.csv'))
    tf_df = pandas.read_csv(get_resource_path('transcription_factors.csv'))
    pp_df = pandas.read_csv(get_resource_path('phosphatases.tsv'), sep='\t',
                            header=None)
    categories = {}
    # IDG targets: gene symbol -> IDG family (e.g. kinase, GPCR, ion channel)
    for _, row in idg_df.iterrows():
        categories[row['gene']] = row['idgFamily']

    # Transcription factors: gene symbol is in the second column
    for _, row in tf_df.iterrows():
        categories[row.iloc[1]] = 'Transcription factor'

    # Phosphatases: gene symbol is in the first column
    for _, row in pp_df.iterrows():
        categories[row.iloc[0]] = 'Phosphatase'
    return categories
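
Note that the three loops write into one dict, so later sources win for genes
appearing in multiple files: transcription factor labels override IDG
families, and phosphatase labels override both. A quick check (the gene
symbol is illustrative):

categories = _process_categories()
# e.g. 'Phosphatase' if PTEN appears in the phosphatase list, even if it
# also carries an IDG family label.
print(categories.get('PTEN'))
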
Example #3
def _read_famplex_map():
    """Map (namespace, entry) pairs to FamPlex names from famplex_map.tsv."""
    fname = get_resource_path('famplex_map.tsv')
    raw_map = read_unicode_csv(fname, delimiter='\t')

    famplex_map = {}
    for row in raw_map:
        famplex_map[(row[0], row[1])] = row[2]
    return famplex_map
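
The returned dict is keyed by (namespace, entry) pairs; a lookup sketch with
an illustrative key (the real rows come from famplex_map.tsv, the same file
read in Examples #6 and #10):

famplex_map = _read_famplex_map()
# ('BEL', 'AKT Family') is an illustrative key, not taken from the file.
print(famplex_map.get(('BEL', 'AKT Family')))
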
Example #4
 def add_famplex_nodes(self):
     nodes = []
     for row in read_unicode_csv(get_resource_path(
             os.path.join('famplex', 'entities.csv')),
                                 delimiter=','):
         entity = row[0]
         nodes.append((self.label('FPLX', entity), {'name': entity}))
     self.add_nodes_from(nodes)
Example #5
def _build_chebi_map():
    """Map ChEBI names to ChEBI IDs from bel_chebi_map.tsv."""
    fname = get_resource_path('bel_chebi_map.tsv')
    chebi_name_id = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        chebi_name = row[0]
        chebi_id = row[1]
        chebi_name_id[chebi_name] = chebi_id
    return chebi_name_id
Example #6
def _build_famplex_map():
    """Map BEL entries to INDRA (FamPlex) names from famplex_map.tsv."""
    fname = get_resource_path('famplex_map.tsv')
    bel_to_indra = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        namespace = row[0]
        entry = row[1]
        indra_name = row[2]
        if namespace == 'BEL':
            bel_to_indra[entry] = indra_name
    return bel_to_indra
Example #7
 def add_famplex_hierarchy(self):
     from indra.databases import hgnc_client
     edges = []
     for row in read_unicode_csv(get_resource_path(
             os.path.join('famplex', 'relations.csv')), delimiter=','):
         ns1, id1, rel, ns2, id2 = row
         if ns1 == 'HGNC':
             id1 = hgnc_client.get_hgnc_id(id1)
         edges.append((self.label(ns1, id1),
                       self.label(ns2, id2),
                       {'type': rel}))
     self.add_edges_from(edges)
Example #8
 def add_biomappings(self):
     biomappings_tsv = get_resource_path('biomappings.tsv')
     edges = []
     for source_ns, source_id, _, target_ns, target_id, _ in \
             read_unicode_csv(biomappings_tsv, delimiter='\t'):
         edges.append((self.label(source_ns, source_id),
                       self.label(target_ns, target_id),
                       {'type': 'xref', 'source': 'biomappings'}))
         edges.append((self.label(target_ns, target_id),
                       self.label(source_ns, source_id),
                       {'type': 'xref', 'source': 'biomappings'}))
     self.add_edges_from(edges)
Example #9
def read_selventa_resources():
    fname = get_resource_path('selventa_entries.tsv')
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    selventa_lookup = {}
    for namespace, sid, name, xrefs_str in csv_rows:
        # namespace, name -> ID, xrefs
        if xrefs_str:
            xrefs_dict = {
                x.split(':', 1)[0]: x.split(':', 1)[1]
                for x in xrefs_str.split('|')
            }
        else:
            xrefs_dict = {}
        selventa_lookup[(namespace, name)] = (sid, xrefs_dict)
    return selventa_lookup
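
The xref parsing above splits each pipe-delimited entry on the first colon
only, which preserves identifiers that themselves contain a colon; a
standalone sketch with an illustrative input string:

# Illustrative xref string; splitting with maxsplit=1 keeps the
# 'CHEBI:15377' part of the first entry intact.
xrefs_str = 'CHEBI:CHEBI:15377|MESH:D014867'
xrefs_dict = {
    x.split(':', 1)[0]: x.split(':', 1)[1]
    for x in xrefs_str.split('|')
}
print(xrefs_dict)  # {'CHEBI': 'CHEBI:15377', 'MESH': 'D014867'}
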
Example #10
 def add_famplex_xrefs(self):
     edges = []
     include_refs = {'PF', 'IP', 'GO', 'NCIT', 'ECCODE', 'HGNC_GROUP',
                     'MESH'}
     for row in read_unicode_csv(get_resource_path('famplex_map.tsv'),
                                 delimiter='\t'):
         ref_ns, ref_id, fplx_id = row
         if ref_ns not in include_refs:
             continue
         edges.append((self.label(ref_ns, ref_id),
                       self.label('FPLX', fplx_id),
                       {'type': 'xref', 'source': 'fplx'}))
         edges.append((self.label('FPLX', fplx_id),
                       self.label(ref_ns, ref_id),
                       {'type': 'xref', 'source': 'fplx'}))
     self.add_edges_from(edges)
Example #11
 def add_lspci(self):
     lspci = read_unicode_csv(get_resource_path('lspci.tsv'),
                              delimiter='\t')
     nodes_to_add = []
     edges_to_add = []
     next(lspci)  # Skip the header row
     for (lspcid, name, members_str) in lspci:
         label = self.label('LSPCI', lspcid)
         nodes_to_add.append((label, {'name': name,
                                      'type': 'small_molecule'}))
         members = [member.split(':', maxsplit=1)
                    for member in members_str.split('|')]
         edges_to_add += [(self.label(*member), label, {'type': 'isa'})
                          for member in members]
     self.add_nodes_from(nodes_to_add)
     self.add_edges_from(edges_to_add)
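
The members column is likewise a pipe-delimited list of ns:id pairs split on
the first colon only; a minimal sketch with made-up values:

members_str = 'CHEBI:CHEBI:45783|CAS:112093'
members = [member.split(':', maxsplit=1)
           for member in members_str.split('|')]
print(members)  # [['CHEBI', 'CHEBI:45783'], ['CAS', '112093']]
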
Example #12
    def update_resource(
        cls,
        directory,
        url,
        prefix,
        *args,
        remove_prefix=False,
        allowed_synonyms=None,
        allowed_external_ns=None,
        force: bool = False,
    ):
        """Write the OBO information to files in the given directory."""
        resource_path = get_resource_path(f'{prefix}.json')
        obo_path = os.path.join(directory, f'{prefix}.obo.pkl')
        if os.path.exists(obo_path) and not force:
            with open(obo_path, 'rb') as file:
                g = pickle.load(file)
        else:
            g = obonet.read_obo(url)
            with open(obo_path, 'wb') as file:
                pickle.dump(g, file)

        entries = OboClient.entries_from_graph(
            g,
            prefix=prefix,
            remove_prefix=remove_prefix,
            allowed_synonyms=allowed_synonyms,
            allowed_external_ns=allowed_external_ns,
        )
        entries = prune_standard(entries)

        def sort_key(x):
            val = x['id']
            if not remove_prefix:
                val = val.split(':')[1]
            try:
                val = int(val)
            except ValueError:
                pass
            return val

        entries = sorted(entries, key=sort_key)
        with open(resource_path, 'w') as file:
            json.dump(entries, file, indent=1, sort_keys=True)
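
The nested sort_key orders entries numerically whenever the local part of the
ID parses as an integer, avoiding lexicographic artifacts like '10' sorting
before '2'; the same logic lifted into a standalone sketch:

def sort_key(x, remove_prefix=False):
    # Same logic as the nested helper above, for illustration only.
    val = x['id']
    if not remove_prefix:
        val = val.split(':')[1]
    try:
        val = int(val)
    except ValueError:
        pass
    return val

print(sorted([{'id': 'GO:10'}, {'id': 'GO:2'}], key=sort_key))
# [{'id': 'GO:2'}, {'id': 'GO:10'}]
# Caveat: a file mixing numeric and non-numeric local IDs would yield
# int/str keys, which Python 3 cannot compare.
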
Example #13
 def add_famplex_xrefs(self):
     edges = []
     include_refs = {'PF', 'IP', 'GO', 'NCIT', 'ECCODE', 'HGNC_GROUP',
                     'MESH'}
     for row in read_unicode_csv(get_resource_path('famplex_map.tsv'),
                                 delimiter='\t'):
         ref_ns, ref_id, fplx_id = row
         if ref_ns not in include_refs:
             continue
         edges.append((self.label(ref_ns, ref_id),
                       self.label('FPLX', fplx_id),
                       {'type': 'xref', 'source': 'fplx'}))
         # We avoid FPLX->MESH mappings in this direction due to
         # species-specificity issues
         if ref_ns != 'MESH':
             edges.append((self.label('FPLX', fplx_id),
                           self.label(ref_ns, ref_id),
                           {'type': 'xref', 'source': 'fplx'}))
     self.add_edges_from(edges)
Example #14
    def update_by_prefix(
        cls,
        prefix: str,
        include_relations: bool = False,
        predicate: Optional[Callable[["pyobo.Term"], bool]] = None,
        indra_prefix: Optional[str] = None,
    ):
        """Update the JSON data by looking up the ontology through PyOBO."""
        import pyobo

        terms = iter(pyobo.get_ontology(prefix))
        if predicate:
            terms = filter(predicate, terms)
        terms = sorted(terms, key=attrgetter("identifier"))
        entries = [
            {
                'id': term.identifier,
                'name': term.name,
                'synonyms': [synonym.name for synonym in term.synonyms],
                'xrefs': [
                    dict(namespace=xref.prefix, id=xref.identifier)
                    for xref in term.xrefs
                ],
                'alt_ids': [alt_id.identifier for alt_id in term.alt_ids],
                'relations': _get_pyobo_rels(
                    term,
                    include_relations=include_relations,
                ),
            }
            for term in terms
        ]
        entries = prune_standard(entries)
        indra_prefix = indra_prefix or prefix
        resource_path = get_resource_path(f'{indra_prefix}.json')
        with open(resource_path, 'w') as file:
            json.dump(entries, fp=file, indent=1, sort_keys=True)
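
A hypothetical invocation, assuming this is a classmethod on the PyOBO-backed
client (the class name PyOboClient is inferred, not shown above):

# Regenerate doid.json, keeping only named terms; the predicate is an
# illustrative filter, not part of the original snippet.
PyOboClient.update_by_prefix(
    'doid',
    include_relations=True,
    predicate=lambda term: term.name is not None,
)
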
Example #15
    def update_resource(
        cls,
        prefix: str,
        ontology: "pronto.Ontology",
        skip_obsolete: bool = True,
        remove_prefix: bool = False,
    ):
        prefix = prefix.lower()
        entries = cls.entries_from_ontology(
            prefix=prefix, ontology=ontology, skip_obsolete=skip_obsolete,
            remove_prefix=remove_prefix
        )
        entries = prune_empty_entries(
            entries,
            {"synonyms", "xrefs", "alt_ids", "relations"},
        )
        entries = sorted(
            entries,
            key=itemgetter("id") if remove_prefix else _id_key,
        )

        resource_path = get_resource_path(f"{prefix}.json")
        with open(resource_path, "w") as file:
            json.dump(entries, file, indent=1, sort_keys=True)
Example #16
def _read_hgnc_maps():
    hgnc_file = get_resource_path("hgnc_entries.tsv")
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    entrez_ids_reverse = {}
    mouse_map = {}
    rat_map = {}
    prev_sym_map = {}
    ensembl_ids = {}
    ensembl_ids_reverse = {}
    hgnc_withdrawn_new_ids = {}
    gene_types = {}
    hgnc_to_enzymes = defaultdict(set)
    enzyme_to_hgncs = defaultdict(set)
    # Skip the header
    next(csv_rows)
    for row in csv_rows:
        hgnc_id = row[0][5:]  # Strip the 'HGNC:' prefix
        hgnc_status = row[3]
        if hgnc_status in {'Approved', 'Entry Withdrawn'}:
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            # Note that withdrawn entries don't overlap with approved
            # entries at this point so it's safe to add mappings for
            # withdrawn names
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            descr = row[2]
            m = re.match(r'symbol withdrawn, see \[HGNC:(?: ?)(\d+)\]', descr)
            new_id = m.groups()[0]
            hgnc_withdrawn.append(hgnc_id)
            hgnc_withdrawn_new_ids[hgnc_id] = new_id
        # Uniprot
        uniprot_id = row[6]
        if uniprot_id:
            uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        if entrez_id:
            entrez_ids[hgnc_id] = entrez_id
            entrez_ids_reverse[entrez_id] = hgnc_id
        # Mouse
        mgi_id = row[7]
        if mgi_id:
            mgi_ids = mgi_id.split(', ')
            for mgi_id in mgi_ids:
                if mgi_id.startswith('MGI:'):
                    mgi_id = mgi_id[4:]
                mouse_map[mgi_id] = hgnc_id
        # Rat
        rgd_id = row[8]
        if rgd_id:
            rgd_ids = rgd_id.split(', ')
            for rgd_id in rgd_ids:
                if rgd_id.startswith('RGD:'):
                    rgd_id = rgd_id[4:]
                rat_map[rgd_id] = hgnc_id
        # Previous symbols
        prev_sym_entry = row[9]
        if prev_sym_entry:
            prev_syms = prev_sym_entry.split(', ')
            for prev_sym in prev_syms:
                # If we already mapped this previous symbol to another ID
                if prev_sym in prev_sym_map:
                    # If we already have a list here, we just extend it
                    if isinstance(prev_sym_map[prev_sym], list):
                        prev_sym_map[prev_sym].append(hgnc_id)
                    # Otherwise we create a list and start it with the two
                    # IDs we know the symbol is mapped to
                    else:
                        prev_sym_map[prev_sym] = [
                            prev_sym_map[prev_sym], hgnc_id
                        ]
                # Otherwise we just make a string entry here
                else:
                    prev_sym_map[prev_sym] = hgnc_id
        # Ensembl IDs
        ensembl_id = row[10]
        if ensembl_id:
            ensembl_ids[hgnc_id] = ensembl_id
            ensembl_ids_reverse[ensembl_id] = hgnc_id
        gene_type = row[11]
        if gene_type:
            gene_types[hgnc_id] = gene_type
        enzyme_ids = row[12]
        if enzyme_ids:
            for enzyme_id in enzyme_ids.split(", "):
                hgnc_to_enzymes[hgnc_id].add(enzyme_id)
                enzyme_to_hgncs[enzyme_id].add(hgnc_id)

    for old_id, new_id in hgnc_withdrawn_new_ids.items():
        hgnc_names[old_id] = hgnc_names[new_id]

    return (
        hgnc_names,
        hgnc_ids,
        hgnc_withdrawn,
        uniprot_ids,
        entrez_ids,
        entrez_ids_reverse,
        mouse_map,
        rat_map,
        prev_sym_map,
        ensembl_ids,
        ensembl_ids_reverse,
        gene_types,
        dict(hgnc_to_enzymes),
        dict(enzyme_to_hgncs),
    )
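
Note that prev_sym_map values are heterogeneous: a single HGNC ID string when
a previous symbol maps uniquely, and a list of ID strings when it is
ambiguous. A defensive lookup sketch under that convention (the helper name
is illustrative):

def lookup_previous_symbol(prev_sym_map, symbol):
    """Return a list of HGNC IDs for a previous symbol, or [] if unknown."""
    ids = prev_sym_map.get(symbol)
    if ids is None:
        return []
    # Normalize the string-or-list convention used by _read_hgnc_maps.
    return ids if isinstance(ids, list) else [ids]
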
Example #17
def _load_pubchem_mesh_map():
    """Map PubChem IDs to MeSH IDs from pubchem_mesh_map.tsv."""
    rows = read_unicode_csv(get_resource_path('pubchem_mesh_map.tsv'),
                            delimiter='\t')
    mappings = {row[0]: row[1] for row in rows}
    return mappings