def _process_annotations(
    data,
    remove_ungrounded: bool = False,
    skip_namespaces: Optional[Collection[str]] = None,
) -> None:
    """Process the annotations in a PyBEL edge data dictionary, in place.

    :param data: A PyBEL edge data dictionary. Its ``ANNOTATIONS`` entry is
        rewritten: CellLine annotations are re-grounded, TextLocation labels
        are normalized, category names are remapped, and remaining entity
        namespaces are normalized via :func:`_process_concept`.
    :param remove_ungrounded: If true, drop CellLine annotations that could
        not be grounded instead of keeping the original entity.
    :param skip_namespaces: Namespaces passed through to
        :func:`_process_concept` to be skipped during normalization.
    """
    cell_line_entities = data[ANNOTATIONS].get('CellLine')
    if cell_line_entities:
        ne = []
        for entity in cell_line_entities:
            if entity[NAMESPACE] == 'CellLine':
                _namespaces = [
                    'efo',
                    # 'clo',  # FIXME implement CLO in PyOBO then uncomment
                ]
                g_prefix, g_identifier, g_name = pyobo.ground(_namespaces, entity[IDENTIFIER])
                if g_prefix and g_identifier:
                    ne.append(Entity(namespace=g_prefix, identifier=g_identifier, name=g_name))
                elif not remove_ungrounded:
                    logger.warning('could not ground CellLine: "%s"', entity[IDENTIFIER])
                    ne.append(entity)
            else:
                # FIX: entities that already carry a non-CellLine (i.e. already
                # grounded) namespace were previously discarded because this
                # branch was missing; keep them untouched.
                ne.append(entity)
        data[ANNOTATIONS]['CellLine'] = ne

    # fix text locations: replace known identifiers with canonical labels
    text_location = data[ANNOTATIONS].get('TextLocation')
    if text_location:
        data[ANNOTATIONS]['TextLocation'] = [
            text_location_labels.get(entity.identifier, entity)
            for entity in text_location
        ]

    # remap category names (unknown categories pass through unchanged)
    data[ANNOTATIONS] = {
        _BEL_ANNOTATION_PREFIX_CATEGORY_MAP.get(category, category): entities
        for category, entities in data[ANNOTATIONS].items()
    }

    # fix namespaces that were categories before
    for category, entities in data[ANNOTATIONS].items():
        if category in CATEGORY_BLACKLIST:
            continue
        ne = []
        for entity in entities:
            if not isinstance(entity, dict):
                raise TypeError(f'entity should be a dict. got: {entity}')
            nn = _BEL_ANNOTATION_PREFIX_MAP.get(entity[NAMESPACE])
            if nn is not None:
                entity[NAMESPACE] = nn
            _process_concept(concept=entity, skip_namespaces=skip_namespaces)
            ne.append(entity)
        data[ANNOTATIONS][category] = ne
def test_ground(self):
    """Test that a ChEBI entry can be grounded both by name and by synonym."""
    queries = ("Fusilade II", "fluazifop-P-butyl")
    for query in queries:
        with self.subTest(query=query):
            with mock_id_name_mapping, mock_id_synonyms_mapping:
                prefix, identifier, name = pyobo.ground("chebi", query)
                # Both the primary name and the synonym should resolve to the
                # same CURIE and canonical name.
                self.assertIsNotNone(prefix)
                self.assertIsNotNone(identifier)
                self.assertIsNotNone(name)
                self.assertEqual("chebi", prefix)
                self.assertEqual("132964", identifier)
                self.assertEqual("fluazifop-P-butyl", name)
def test_ground(self):
    """Test that a ChEBI entry can be grounded both by name and by synonym."""
    for query in ['Fusilade II', 'fluazifop-P-butyl']:
        with self.subTest(query=query), mock_id_name_mapping, mock_id_synonyms_mapping:
            result = pyobo.ground('chebi', query)
            prefix, identifier, name = result
            # Name and synonym should both resolve to the same record.
            self.assertEqual('chebi', prefix)
            self.assertEqual('132964', identifier)
            self.assertEqual('fluazifop-P-butyl', name)
            self.assertIsNotNone(prefix)
            self.assertIsNotNone(identifier)
            self.assertIsNotNone(name)
def get_identifier(
    namespace: str,
    name: str,
) -> Union[Tuple[str, None, str], Tuple[str, str, str]]:
    """Look up the identifier for a namespace/name pair.

    :param namespace: A (possibly legacy BEL) namespace such as ``SFAM``,
        ``SCHEM``, or any Bioregistry-normalizable prefix.
    :param name: The label to resolve within the namespace.
    :return: A ``(prefix, identifier, name)`` triple; ``identifier`` is
        ``None`` when the lookup failed.
    :raises ValueError: If the namespace cannot be normalized by the
        Bioregistry.
    """
    # Legacy BEL family/complex namespaces map onto FamPlex.
    if namespace in {'SFAM', 'SCOMP'}:
        return 'fplx', bel_fplx.get(name), name
    # Legacy chemical namespaces are grounded against ChEBI.
    if namespace in {'SCHEM', 'CHEBI'}:
        prefix, identifier, name = pyobo.ground('chebi', name)
        return prefix or namespace, identifier, name

    norm_namespace = bioregistry.normalize_prefix(namespace)
    if norm_namespace is None:
        raise ValueError(f'could not normalize {namespace}')
    namespace = norm_namespace

    # Skip namespaces that already failed once (memoized negative cache).
    if namespace in MISSING_NAMESPACE:
        return namespace, None, name

    try:
        name_id_mapping = pyobo.get_name_id_mapping(namespace)
    except Exception:
        # FIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit. Exception is the widest safe net here.
        logger.info('missing namespace: %s', namespace)
        MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    if name_id_mapping:
        identifier = name_id_mapping.get(name)
        if identifier:
            return namespace, identifier, name
        if (namespace, name) not in MISSING:
            MISSING.add((namespace, name))
            logger.debug('missing lookup for %s ! %s', namespace, name)
    elif namespace not in MISSING_NAMESPACE:
        logger.info('empty namespace: %s', namespace)
        MISSING_NAMESPACE.add(namespace)
    # FIX: unconditional fallthrough return so the function can never
    # implicitly return None.
    return namespace, None, name
def _process_annotations(data, remove_ungrounded: bool = False) -> None:
    """Process the annotations in a PyBEL edge data dictionary, in place.

    Each ``{category: {name: polarity}}`` annotation is split into grounded
    entries (written back under ``ANNOTATIONS``, keyed by CURIE) and
    ungrounded entries (kept under ``FREE_ANNOTATIONS`` unless
    ``remove_ungrounded`` is true).

    :param data: A PyBEL edge data dictionary (modified in place).
    :param remove_ungrounded: If true, discard annotations that could not be
        grounded instead of storing them under ``FREE_ANNOTATIONS``.
    """
    grounded_category_curie_polarity = []    # (category, Entity, polarity)
    ungrounded_category_name_polarity = []   # (category, name, polarity)
    for category, names in data[ANNOTATIONS].items():
        if category == 'CellLine':
            _namespaces = [
                'efo',
                # 'clo',  # FIXME implement CLO import and add here
            ]
            for name, polarity in names.items():
                g_prefix, g_identifier, g_name = pyobo.ground(_namespaces, name)
                if g_prefix and g_identifier:
                    grounded_category_curie_polarity.append((
                        category,
                        Entity(namespace=g_prefix, identifier=g_identifier, name=g_name),
                        polarity,
                    ))
                else:
                    ungrounded_category_name_polarity.append((category, name, polarity))
        elif category in _BEL_ANNOTATION_PREFIX_MAP:
            # Legacy BEL category: remap both the prefix and the category name.
            norm_prefix = _BEL_ANNOTATION_PREFIX_MAP[category]
            norm_category = _BEL_ANNOTATION_PREFIX_CATEGORY_MAP.get(category, category)
            for name, polarity in names.items():
                _, identifier, _ = pyobo.ground(norm_prefix, name)
                if identifier:
                    grounded_category_curie_polarity.append((
                        norm_category,
                        Entity(namespace=norm_prefix, identifier=identifier, name=name),
                        polarity,
                    ))
                else:
                    ungrounded_category_name_polarity.append((norm_category, name, polarity))
        else:
            # FIX: normalize_prefix(category) was previously evaluated twice
            # (once in the elif condition, once in the branch body); call once.
            norm_prefix = normalize_prefix(category)
            if norm_prefix:
                for name, polarity in names.items():
                    _, identifier, _ = pyobo.ground(norm_prefix, name)
                    if identifier:
                        grounded_category_curie_polarity.append((
                            category,
                            Entity(namespace=norm_prefix, identifier=identifier, name=name),
                            polarity,
                        ))
                    else:
                        ungrounded_category_name_polarity.append((category, name, polarity))
            else:
                if category not in _UNHANDLED_ANNOTATION:
                    logger.warning('unhandled annotation: %s', category)
                    _UNHANDLED_ANNOTATION.add(category)
                if isinstance(names, dict):
                    for name, polarity in names.items():
                        ungrounded_category_name_polarity.append((category, name, polarity))
                else:
                    # Scalar annotation value: treat as a single positive entry.
                    ungrounded_category_name_polarity.append((category, names, True))

    annotations = defaultdict(dict)
    for category, entity, polarity in grounded_category_curie_polarity:
        annotations[category][entity.curie] = polarity
    data[ANNOTATIONS] = dict(annotations)

    if not remove_ungrounded and ungrounded_category_name_polarity:
        free_annotations = defaultdict(dict)
        for category, name, polarity in ungrounded_category_name_polarity:
            free_annotations[category][name] = polarity
        data[FREE_ANNOTATIONS] = dict(free_annotations)
#: Prefixes searched, in order, when grounding disease names.
_DISEASE_PREFIXES = ['mondo', 'doid', 'efo', 'hp', 'mesh']


def _ground_disease(text: str):
    """Ground a disease name, retrying comma-inverted forms.

    HMDD disease labels are often written index-style (e.g.
    ``"carcinoma, hepatocellular"``); retry with the parts swapped, then
    with all comma-separated parts reversed.

    :return: A ``(prefix, identifier, name)`` triple; ``identifier`` is
        ``None`` when grounding failed.
    """
    prefix, identifier, name = pyobo.ground(_DISEASE_PREFIXES, text)
    if identifier is None and ', ' in text:
        i = text.index(', ')
        left, right = text[:i], text[i + 2:]
        swapped = f'{right} {left}'
        prefix, identifier, name = pyobo.ground(_DISEASE_PREFIXES, swapped)
        if identifier is None and ', ' in swapped:
            reversed_parts = ' '.join(z.strip() for z in text.split(',')[::-1])
            prefix, identifier, name = pyobo.ground(_DISEASE_PREFIXES, reversed_parts)
    return prefix, identifier, name


def get_bel() -> pybel.BELGraph:
    """Get the HMDD data as a BEL graph.

    Downloads the HMDD table (columns: category, mir, disease, pmid,
    description), grounds miRNAs against miRBase and diseases against
    several disease vocabularies, and adds a ``regulates`` edge per row
    where both endpoints grounded.
    """
    path = ensure_path(PREFIX, URL)
    df = pd.read_csv(
        path,
        sep='\t',
        dtype=str,
        encoding="ISO-8859-1",
    )

    # Ground each unique miRNA name against miRBase.
    failed_mirnas = 0
    mirna_to_dsl = {}
    mirnas = df['mir'].unique()
    it = tqdm(mirnas, desc='mapping miRNA names')
    for text in it:
        _, identifier, name = pyobo.ground('mirbase', text)
        if identifier is None:
            it.write(f'[mirbase] could not ground: {text}')
            failed_mirnas += 1
            continue
        mirna_to_dsl[text] = pybel.dsl.MicroRna(
            namespace='mirbase',
            identifier=identifier,
            name=name,
        )
    logger.info(f'failed on {failed_mirnas}/{len(mirnas)} miRNAs')

    # Ground each unique disease name (with comma-inversion retries).
    failed_diseases = 0
    disease_to_dsl = {}
    diseases = df['disease'].unique()
    it = tqdm(diseases, desc='mapping disease names')
    for text in it:
        prefix, identifier, name = _ground_disease(text)
        if identifier is None:
            it.write(f'could not ground {text}')
            failed_diseases += 1
            continue
        disease_to_dsl[text] = pybel.dsl.Pathology(
            namespace=prefix,
            identifier=identifier,
            name=name,
        )
    logger.info(f'failed on {failed_diseases}/{len(diseases)} diseases')

    # Rows whose miRNA or disease failed to ground are skipped.
    rv = pybel.BELGraph(name='HMDD', version=VERSION)
    for _category, mir, disease, pmid, text in df.values:
        source = mirna_to_dsl.get(mir)
        target = disease_to_dsl.get(disease)
        if not source or not target:
            continue
        rv.add_regulates(
            source,
            target,
            citation=pmid,
            evidence=text,
        )
    return rv