def main():
    graph = get_obo_graph('chiro')
    chebi_mapping = get_id_name_mapping('chebi')
    mappings = {
        prefix: get_id_name_mapping(prefix)
        for prefix in MAPPING_PREFIXES
    }

    triples = []
    for h, data in graph.nodes(data=True):
        if not data:
            continue
        r, t = data['relationship'][0].split()
        r = r[:-len('_of')]
        h_name = chebi_mapping.get(h)
        if h_name is None:
            print(f'Could not find name for chemical {h}')
            continue
        t_namespace = t.split(':')[0].lower()
        t_mapping = mappings[t_namespace]
        t_name = t_mapping.get(t)
        if t_name is None:
            print(f'Could not find name for target {t}')
            continue
        triples.append(('chebi', h, h_name, r, t_namespace, t, t_name))

    with open('chiro_import.tsv', 'w') as file:
        # Header is tab-separated to match the rows; its seven columns
        # correspond to the seven fields appended to each triple above
        print(
            'source_db', 'source_id', 'source_name', 'modulation',
            'target_db', 'target_id', 'target_name',
            sep='\t', file=file,
        )
        for t in sorted(triples):
            print(*t, sep='\t', file=file)
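# Example (illustrative, not from the original source): a quick sanity check
# of the export that main() writes, assuming pandas is installed.
import pandas as pd

df = pd.read_csv('chiro_import.tsv', sep='\t')
print(df['modulation'].value_counts())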
def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None):
    """Upload compiled parts for the given prefix to AWS."""
    if s3_client is None:
        s3_client = boto3.client("s3")
    version = get_version(prefix)
    #: Each artifact is a (description, cache file name, getter) triple; the
    #: getter is called for its side effect of building the cached TSV file.
    artifacts = [
        ("id->name mapping", "names.tsv", get_id_name_mapping),
        ("id->synonyms mapping", "synonyms.tsv", get_id_synonyms_mapping),
        ("xrefs", "xrefs.tsv", get_xrefs_df),
        ("relations", "relations.tsv", get_relations_df),
        ("properties", "properties.tsv", get_properties_df),
        ("alternative identifiers", "alt_ids.tsv", get_id_to_alts),
    ]
    for desc, name, getter in artifacts:
        logger.info("[%s] getting %s", prefix, desc)
        getter(prefix)
        path = prefix_cache_join(prefix, name=name, version=version)
        if not path.exists():
            raise FileNotFoundError(path)
        key = os.path.join(prefix, "cache", name)
        logger.info("[%s] uploading %s", prefix, desc)
        upload_file(path=path, bucket=bucket, key=key, s3_client=s3_client)
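# Example usage (illustrative, not from the original source): upload one
# prefix's artifacts. The bucket name is a placeholder and boto3 must already
# have AWS credentials configured.
client = boto3.client("s3")
upload_artifacts_for_prefix(prefix="doid", bucket="my-obo-cache", s3_client=client)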
def __init__(
    self,
    *,
    graph: BELGraph,
    managers: List,
):
    """Initialize the pathway assigner with several lookup dictionaries.

    :param managers: A ComPath manager or iterable of ComPath managers
    """
    self.graph = graph

    self.pathway_to_symbols = defaultdict(set)
    self.symbol_to_pathways = defaultdict(set)

    # Wrap a single manager in a list, per the docstring
    if not isinstance(managers, list):
        managers = [managers]
    for manager in managers:
        self._add_manager(manager)

    # These won't be loaded any more, so convert to normal dicts
    self.pathway_to_symbols = dict(self.pathway_to_symbols)
    self.symbol_to_pathways = dict(self.symbol_to_pathways)

    hgnc_obo = pyobo.sources.hgnc.get_obo()
    self.hgnc_id_to_symbol = pyobo.get_id_name_mapping('hgnc')

    # Prepare MGI
    self.hgnc_mgi_mapping = hgnc_obo.get_relations_mapping('ro:HOM0000017', 'mgi')
    self.mgi_to_hgnc = {v: k for k, v in self.hgnc_mgi_mapping.items()}
    self.mgi_id_to_symbol = pyobo.get_id_name_mapping('mgi')
    self.mgi_symbol_to_hgnc_symbol = {
        self.mgi_id_to_symbol[mgi_id]: self.hgnc_id_to_symbol[hgnc_id]
        for mgi_id, hgnc_id in self.mgi_to_hgnc.items()
    }

    # Prepare RGD
    self.hgnc_rgd_mapping = hgnc_obo.get_relations_mapping('ro:HOM0000017', 'rgd')
    self.rgd_to_hgnc = {v: k for k, v in self.hgnc_rgd_mapping.items()}
    self.rgd_id_to_symbol = pyobo.get_id_name_mapping('rgd')
    self.rgd_symbol_to_hgnc_symbol = {
        self.rgd_id_to_symbol[rgd_id]: self.hgnc_id_to_symbol[hgnc_id]
        for rgd_id, hgnc_id in self.rgd_to_hgnc.items()
    }

    self.pathway_to_key = defaultdict(set)
    self.key_to_pathway = defaultdict(set)
    self.pmid_to_pathway = defaultdict(set)
    self.pathway_to_pmid = defaultdict(set)
    self.double_annotated = defaultdict(lambda: defaultdict(list))
def get_gilda_terms(prefix: str, url: Optional[str] = None) -> Iterable[gilda.term.Term]:
    """Get gilda terms for the given namespace."""
    id_to_name = get_id_name_mapping(prefix, url=url)
    for identifier, name in tqdm(id_to_name.items(), desc='mapping names'):
        yield gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status='name',
            source=prefix,
        )

    id_to_synonyms = get_id_synonyms_mapping(prefix, url=url)
    for identifier, synonyms in tqdm(id_to_synonyms.items(), desc='mapping synonyms'):
        name = id_to_name[identifier]
        for synonym in synonyms:
            yield gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status='synonym',
                source=prefix,
            )
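# Example (illustrative sketch): index the yielded terms by their normalized
# text, the dict shape gilda's Grounder accepts, then ground a free-text
# string. The 'mesh' prefix and the query string are arbitrary choices.
from collections import defaultdict

from gilda.grounder import Grounder

terms = defaultdict(list)
for term in get_gilda_terms('mesh'):
    terms[term.norm_text].append(term)

grounder = Grounder(dict(terms))
for scored_match in grounder.ground('neurodegeneration'):
    print(scored_match.term.id, scored_match.score)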
def main(
    port: str,
    host: str,
    sql: bool,
    sql_uri: str,
    sql_refs_table: str,
    sql_alts_table: str,
    data: Optional[str],
    test: bool,
    with_gunicorn: bool,
    lazy: bool,
    workers: int,
):
    """Run the resolver app."""
    if test and lazy:
        click.secho('Cannot run in --test and --lazy mode at the same time', fg='red')
        sys.exit(1)

    if test:
        data = [
            (prefix, identifier, name)
            for prefix in ['hgnc', 'chebi', 'doid', 'go']
            for identifier, name in pyobo.get_id_name_mapping(prefix).items()
        ]
        data = pd.DataFrame(data, columns=['prefix', 'identifier', 'name'])

    app = get_app(
        data,
        lazy=lazy,
        sql=sql,
        uri=sql_uri,
        refs_table=sql_refs_table,
        alts_table=sql_alts_table,
    )
    run_app(app=app, host=host, port=port, with_gunicorn=with_gunicorn, workers=workers)
def update_drugbank_mappings():
    """Update mappings from DrugBank to CHEBI/CHEMBL."""
    # Note that for this to work, PyOBO (https://github.com/pyobo/pyobo) has
    # to be installed and the DrugBank download
    # (https://www.drugbank.ca/releases/latest) put into ~/.obo/drugbank/.
    # The DrugBank download requires signing up for an account and waiting
    # for approval.
    import pyobo
    drugbank_chembl = pyobo.get_filtered_xrefs('drugbank', 'chembl.compound')
    drugbank_chebi = pyobo.get_filtered_xrefs('drugbank', 'chebi')
    chebi_drugbank = pyobo.get_filtered_xrefs('chebi', 'drugbank')
    drugbank_names = pyobo.get_id_name_mapping('drugbank')
    rows = []
    for drugbank_id, chembl_id in drugbank_chembl.items():
        rows.append([drugbank_id, 'CHEMBL', chembl_id, 'drugbank'])
    for drugbank_id, chebi_id in drugbank_chebi.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'drugbank'])
    for chebi_id, drugbank_id in chebi_drugbank.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'chebi'])
    for drugbank_id, name in drugbank_names.items():
        rows.append([drugbank_id, 'NAME', name, 'drugbank'])
    fname = os.path.join(path, 'drugbank_mappings.tsv')
    header = ['DRUGBANK_ID', 'NAMESPACE', 'ID', 'SOURCE']
    rows = [header] + sorted(rows)
    write_unicode_csv(fname, rows, delimiter='\t')
def test_get_names(self):
    """Test getting names."""
    id_to_name = get_id_name_mapping('chebi', url=TEST_CHEBI_OBO_PATH, local=True)
    for identifier in id_to_name:
        self.assertFalse(identifier.startswith('CHEBI'))
        self.assertFalse(identifier.startswith('CHEBI:'))
        self.assertFalse(identifier.startswith('chebi:'))
        self.assertFalse(identifier.startswith('chebi'))
def test_get_names(self):
    """Test getting names."""
    with chebi_patch:
        id_to_name = get_id_name_mapping('chebi')
    for identifier in id_to_name:
        self.assertFalse(identifier.startswith('CHEBI'))
        self.assertFalse(identifier.startswith('CHEBI:'))
        self.assertFalse(identifier.startswith('chebi:'))
        self.assertFalse(identifier.startswith('chebi'))
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df()
    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f'mapping {PREFIX}')
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
def main(port: int, host: str, data: Optional[str], test: bool, gunicorn: bool, lazy: bool):
    """Run the resolver app."""
    if test:
        data = [
            (prefix, identifier, name)
            for prefix in ['hgnc', 'chebi', 'doid']
            for identifier, name in pyobo.get_id_name_mapping(prefix).items()
        ]
        data = pd.DataFrame(data, columns=['prefix', 'identifier', 'name'])

    app = get_app(data, lazy=lazy)
    run_app(app=app, host=host, port=port, gunicorn=gunicorn)
def get_gilda_terms(prefix: str, identifiers_are_names: bool = False) -> Iterable[gilda.term.Term]:
    """Get gilda terms for the given namespace."""
    id_to_name = get_id_name_mapping(prefix)
    it = tqdm(id_to_name.items(), desc=f"[{prefix}] mapping", unit_scale=True, unit="name")
    for identifier, name in it:
        yield gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        )

    id_to_synonyms = get_id_synonyms_mapping(prefix)
    it = tqdm(id_to_synonyms.items(), desc=f"[{prefix}] mapping", unit_scale=True, unit="synonym")
    for identifier, synonyms in it:
        name = id_to_name[identifier]
        for synonym in synonyms:
            yield gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            )

    if identifiers_are_names:
        it = tqdm(get_ids(prefix), desc=f"[{prefix}] mapping", unit_scale=True, unit="id")
        for identifier in it:
            yield gilda.term.Term(
                norm_text=normalize(identifier),
                text=identifier,
                db=prefix,
                id=identifier,
                entry_name=None,
                status="identifier",
                source=prefix,
            )
def iter_gilda_prediction_tuples(
    prefix: str,
    relation: str,
    *,
    grounder: Optional[Grounder] = None,
    identifiers_are_names: bool = False,
) -> Iterable[Tuple[str, str, str, str, str, str, str, str, float]]:
    """Iterate over prediction tuples for a given prefix."""
    if grounder is None:
        grounder = gilda.api.grounder
    id_name_mapping = get_id_name_mapping(prefix)
    it = tqdm(id_name_mapping.items(), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="name")
    for identifier, name in it:
        for scored_match in grounder.ground(name):
            target_prefix = scored_match.term.db.lower()
            yield (
                prefix,
                normalize_identifier(prefix, identifier),
                name,
                relation,
                target_prefix,
                normalize_identifier(target_prefix, scored_match.term.id),
                scored_match.term.entry_name,
                "lexical",
                scored_match.score,
            )

    if identifiers_are_names:
        it = tqdm(get_ids(prefix), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="id")
        for identifier in it:
            for scored_match in grounder.ground(identifier):
                target_prefix = scored_match.term.db.lower()
                yield (
                    prefix,
                    normalize_identifier(prefix, identifier),
                    identifier,
                    relation,
                    target_prefix,
                    normalize_identifier(target_prefix, scored_match.term.id),
                    scored_match.term.entry_name,
                    "lexical",
                    scored_match.score,
                )
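# Example (illustrative sketch): collect the prediction tuples into a pandas
# DataFrame for review. The relation label and column names are ad hoc, not
# from the original source.
import pandas as pd

rows = iter_gilda_prediction_tuples('mesh', 'skos:exactMatch')
columns = [
    'source_prefix', 'source_id', 'source_name', 'relation',
    'target_prefix', 'target_id', 'target_name', 'type', 'confidence',
]
pd.DataFrame(rows, columns=columns).to_csv('predictions.tsv', sep='\t', index=False)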
def main(): """Run the MeSH curation pipeline.""" xrefs_df = get_xrefs_df() mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh'] curated_mesh_ids = set(mesh_xrefs_df['source_id']) terms = { identifier: (name, suffix.strip('s')) for identifier, name in pyobo.get_id_name_mapping('mesh').items() if identifier not in curated_mesh_ids and identifier not in BLACKLIST for suffix in SUFFIXES if name.lower().endswith(suffix) } for i, (identifier, (name, suffix)) in enumerate(sorted(terms.items(), key=lambda t: t[1][0]), start=1): print('mesh', identifier, name, suffix, '?', '?', '?', '?', sep='\t')
def get_all_enzymes():
    ec_code_path = Path.home() / '.obo' / 'ec-code' / 'ec-code.obo'
    if not ec_code_path.exists():
        # Calling PyOBO for its side effect of downloading and caching the OBO file
        pyobo.get_id_name_mapping('ec-code')
    obo = obonet.read_obo(str(ec_code_path))
    up_nodes = set()
    for node in obo.nodes:
        if node.startswith('uniprot'):
            up_nodes.add(node[len('uniprot:'):])
    human_ups = {u for u in up_nodes if uniprot_client.is_human(u)}
    enzymes = {uniprot_client.get_gene_name(u) for u in human_ups}
    enzymes = {g for g in enzymes if not hgnc_client.is_kinase(g)}
    enzymes = {g for g in enzymes if not hgnc_client.is_phosphatase(g)}
    logger.info(f'Filtered {len(enzymes)} enzymes in total')
    return enzymes
def main(show_ungrounded: bool, output: Optional[TextIO]):
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])

    terms = {
        identifier: (name, name[:-len(suffix)], suffix.strip('s'))
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES
        if name.lower().endswith(suffix)
    }

    it = sorted(terms.items(), key=lambda t: t[1][0])
    it = tqdm(it, desc='making MeSH curation sheet')
    for identifier, (name, search_text, suffix) in it:
        for row in yield_gilda('mesh', identifier, name, suffix, search_text, show_ungrounded or output is not None):
            print(*row, sep='\t', file=output)
def _get_example(prefix: str) -> Optional[str]:
    if prefix in {'gaz', 'bila', 'pubchem.compound'}:
        return None
    if prefix in pyobo.getters.SKIP:
        return None
    try:
        id_to_name = pyobo.get_id_name_mapping(prefix)
    except (pyobo.getters.NoBuild, ValueError, urllib.error.URLError):
        return None
    if not id_to_name:
        return None
    # random.choice avoids the off-by-one of randint(0, len(x)), which could
    # index one past the end of the list
    rv = random.choice(list(id_to_name))  # noqa:S311
    print('adding', prefix, rv)
    return rv
def iter_gilda_prediction_tuples(prefix: str, relation: str) -> Iterable[PredictionTuple]:
    """Iterate over prediction tuples for a given prefix."""
    provenance = get_script_url(__file__)
    id_name_mapping = pyobo.get_id_name_mapping(prefix)
    for identifier, name in tqdm(id_name_mapping.items(), desc=f'Mapping {prefix}'):
        for scored_match in gilda.ground(name):
            yield PredictionTuple(
                prefix,
                identifier,
                name,
                relation,
                scored_match.term.db.lower(),
                scored_match.term.id,
                scored_match.term.entry_name,
                'lexical',
                scored_match.score,
                provenance,
            )
BIOLOGICAL_ROLE_ID = '24432'
APPLICATION_ROLE_ID = '33232'
BIOCHEMICAL_ROLE_CHEBI_ID = '52206'
PATHWAY_INHIBITOR_CHEBI_ID = '76932'
ENZYME_INHIBITOR_CHEBI_ID = '23924'
AGONIST_CHEBI_ID = '48705'
INVERSE_AGONIST_CHEBI_ID = '90847'
INHIBITOR_CHEBI_ID = '35222'
ANTAGONIST_CHEBI_ID = '48706'

BLACKLIST = [
    '48001',  # protein synthesis inhibitor
    '64106',  # protein kinase agonist
]

chebi_obo = pyobo.get('chebi')
chebi_id_to_name = pyobo.get_id_name_mapping('chebi')

XREFS_DF = get_xrefs_df()
CURATED_ROLE_CHEBI_IDS = {
    source_id[len('CHEBI:'):]
    for source_db, source_id in XREFS_DF[['source_db', 'source_id']].values
    if source_db == 'chebi'
}
IRRELEVANT_ROLE_CHEBI_IDS = set(
    itt.chain.from_iterable(
        chebi_obo.descendants(chebi_id[len('CHEBI:'):])
        for chebi_id in get_irrelevant_roles_df().identifier
        if chebi_id[len('CHEBI:'):] in chebi_obo.hierarchy
    )
)


def _get_inhibitors_reclassification() -> pd.DataFrame:
def populate(self, paths: Optional[Mapping[str, str]] = None):
    """Populate the database.

    :param paths: mapping from taxonomy identifiers to paths to GMT files
    """
    if not paths:
        logger.info('No paths given.')
        paths = {info.taxonomy_id: info.path for info in infos.values()}
        logger.info(f'Using default paths at {paths}.')
    elif not isinstance(paths, dict):
        raise TypeError('Invalid type for paths. Should be a dict.')

    pathways = [
        pathway
        for taxonomy_id, path in paths.items()
        for pathway in parse_wikipathways_gmt(path)
    ]

    versions = {
        version
        for _identifier, version, _revision, _name, _species_name, _entries in pathways
    }
    if len(versions) != 1:
        raise ValueError('got multiple versions')
    version = list(versions)[0]

    taxonomy_name_to_id = get_name_id_mapping('ncbitaxon')

    species_names = {
        SPECIES_REMAPPING.get(species_name, species_name)
        for _identifier, _version, _revision, _name, species_name, _entries in pathways
    }
    species_name_to_species = {}
    for species_name in tqdm(species_names, desc=f'v{version} serializing species'):
        taxonomy_id = taxonomy_name_to_id[species_name]
        species = species_name_to_species[species_name] = Species(taxonomy_id=taxonomy_id, name=species_name)
        self.session.add(species)

    hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')
    if not hgnc_id_to_entrez_id:
        raise ValueError('Mappings from hgnc to ncbigene could not be loaded')
    entrez_id_to_hgnc_id = {v: k for k, v in hgnc_id_to_entrez_id.items()}
    hgnc_id_to_name = get_id_name_mapping('hgnc')

    missing_entrez_ids = set()
    entrez_ids = {
        entrez_id
        for _identifier, _version, _revision, _name, _species, entrez_ids in pathways
        for entrez_id in entrez_ids
    }
    entrez_id_protein = {}
    for entrez_id in tqdm(entrez_ids, desc=f'v{version} serializing proteins'):
        hgnc_id = entrez_id_to_hgnc_id.get(entrez_id)
        if hgnc_id:
            hgnc_symbol = hgnc_id_to_name[hgnc_id]
        else:
            hgnc_symbol = None
        if not hgnc_symbol:
            logger.debug(f"ncbigene:{entrez_id} has no HGNC identifier")
            missing_entrez_ids.add(entrez_id)
        entrez_id_protein[entrez_id] = protein = self.get_or_create_protein(
            entrez_id=entrez_id,
            hgnc_symbol=hgnc_symbol,
            hgnc_id=hgnc_id,
        )
        self.session.add(protein)

    logger.info(f'Proteins: {len(entrez_id_protein)}')
    logger.info(f"Proteins w/o HGNC mapping: {len(missing_entrez_ids)}")

    for (
        wikipathways_id,
        _version,
        revision,
        pathway_name,
        species_name,
        entrez_ids,
    ) in tqdm(pathways, desc=f'v{version} serializing pathways'):
        proteins = [
            entrez_id_protein[entrez_id]
            for entrez_id in entrez_ids
        ]
        pathway = self.get_or_create_pathway(
            identifier=wikipathways_id,
            name=pathway_name.strip(),
            revision=revision,
            species=species_name_to_species[SPECIES_REMAPPING.get(species_name, species_name)],
            proteins=proteins,
        )
        self.session.add(pathway)

    self.session.commit()
def _get_urls(prefix='doid', host='localhost', port=5000):
    identifiers = pyobo.get_id_name_mapping(prefix)
    return [
        f'http://{host}:{port}/resolve/{prefix}:{identifier}'
        for identifier in identifiers
    ]
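# Example (illustrative sketch): smoke-test a locally running resolver by
# hitting the first few URLs. Assumes the requests package is installed and
# a server is already listening on the default host/port.
import requests

for url in _get_urls(prefix='doid')[:5]:
    response = requests.get(url)
    print(response.status_code, url)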
import logging

from tqdm import tqdm

import pybel
import pybel.dsl
from pybel import BELGraph
from pyobo import get_filtered_xrefs, get_id_name_mapping, get_name_id_mapping

from ..compath import CompathManager, CompathPathwayMixin, CompathProteinMixin
from ..utils import get_data_dir

logger = logging.getLogger(__name__)

MODULE_NAME = 'pid'
DIRECTORY = get_data_dir(MODULE_NAME)
URL = 'https://github.com/NCIP/pathway-interaction-database/raw/master/download/NCI-Pathway-Info.xlsx'

chebi_id_to_name = get_id_name_mapping('chebi')
hgnc_name_to_id = get_name_id_mapping('hgnc')
hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')

relation_to_adder = {
    'controls-state-change-of': BELGraph.add_regulates,
}

namespace_to_dsl = {
    'cas': pybel.dsl.Abundance,
    'uniprot': pybel.dsl.Protein,
    'hprd': pybel.dsl.Protein,
    'chebi': pybel.dsl.Abundance,
    'hgnc': pybel.dsl.Protein,
}
def get_terms(version: str) -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df(version=version)
    df.rename(
        inplace=True,
        columns={
            "Aliases for complex": "aliases",
            "Identifiers (and stoichiometry) of molecules in complex": "members",
            "Taxonomy identifier": "taxonomy_id",
            "Cross references": "xrefs",
            "Description": "definition",
            "Recommended name": "name",
            "#Complex ac": "complexportal_id",
        },
    )
    df["aliases"] = df["aliases"].map(lambda s: s.split("|") if pd.notna(s) else [])
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(taxonomy_id_to_name.get)

    slim_df = df[
        [
            "complexportal_id",
            "name",
            "definition",
            "aliases",
            "xrefs",
            "taxonomy_id",
            "taxonomy_name",
            "members",
        ]
    ]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")
    unhandled_xref_type = set()
    for (
        complexportal_id,
        name,
        definition,
        aliases,
        xrefs,
        taxonomy_id,
        taxonomy_name,
        members,
    ) in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == "identity":
                _xrefs.append(reference)
            elif note == "see-also" and reference.prefix == "pubmed":
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
def get_relations_df() -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    for source_id, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_id.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    logger.info('inferring over target hierarchies')
    x = defaultdict(list)
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in xrefs_df.values:
        if source_db != 'chebi':
            continue
        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))
        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))
        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue
            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation, target_type, sub_target_db, sub_target_id, sub_target_name,
                ))
            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation, 'molecular function', 'go', go_id, go_name,
                ))
        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }
    db_to_id_mapping = {
        'chebi': get_id_name_mapping('chebi'),
    }
    #: A set of databases to remove the prefix from
    remove_prefix = {'chebi'}

    rows = []
    for (role_db, role_id), entries in x.items():
        if role_db in remove_prefix and role_id.lower().startswith(f'{role_db}:'.lower()):
            role_id = role_id[len(f'{role_db}:'):]
        # TODO map role_db, role_id to set of sub_role_db, sub_role_id
        sub_role_curies = {(role_db, role_id)}
        for modulation, target_type, target_db, target_id, target_name in entries:
            chemical_curies = set(itt.chain.from_iterable(
                db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
                for sub_role_db, sub_role_id in sub_role_curies
            ))
            if not chemical_curies:
                logger.debug('no inference for %s:%s', role_db, role_id)
                continue
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db,
                    chemical_id,
                    db_to_id_mapping[chemical_db][chemical_id],
                    modulation,
                    target_type,
                    target_db,
                    target_id,
                    target_name,
                ))

    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
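# Example (illustrative, not from the original source): materialize the
# assembled relations and export them as a TSV for downstream curation.
# The output filename is a placeholder.
df = get_relations_df()
df.to_csv('relations.tsv', sep='\t', index=False)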
def main(
    port: str,
    host: str,
    sql: bool,
    sql_uri: str,
    sql_refs_table: str,
    sql_alts_table: str,
    sql_defs_table: str,
    name_data: Optional[str],
    alts_data: Optional[str],
    defs_data: Optional[str],
    test: bool,
    with_gunicorn: bool,
    lazy: bool,
    workers: int,
):
    """Run the resolver app."""
    if test and lazy:
        click.secho('Cannot run in --test and --lazy mode at the same time', fg='red')
        sys.exit(1)

    from .resolver import get_app

    if test:
        import pandas as pd
        from pyobo import get_alts_to_id, get_id_definition_mapping, get_id_name_mapping

        prefixes = ['hgnc', 'chebi', 'doid', 'go']
        name_data = pd.DataFrame(
            [
                (prefix, identifier, name)
                for prefix in prefixes
                for identifier, name in get_id_name_mapping(prefix).items()
            ],
            columns=['prefix', 'identifier', 'name'],
        )
        alts_data = pd.DataFrame(
            [
                (prefix, alt, identifier)
                for prefix in prefixes
                for alt, identifier in get_alts_to_id(prefix).items()
            ],
            columns=['prefix', 'alt', 'identifier'],
        )
        defs_data = pd.DataFrame(
            [
                (prefix, identifier, definition)
                for prefix in prefixes
                for identifier, definition in get_id_definition_mapping(prefix).items()
            ],
            columns=['prefix', 'identifier', 'definition'],
        )

    app = get_app(
        name_data=name_data,
        alts_data=alts_data,
        defs_data=defs_data,
        lazy=lazy,
        sql=sql,
        uri=sql_uri,
        refs_table=sql_refs_table,
        alts_table=sql_alts_table,
        defs_table=sql_defs_table,
    )
    run_app(app=app, host=host, port=port, with_gunicorn=with_gunicorn, workers=workers)