Ejemplo n.º 1
0
def _process_annotations(
    data,
    remove_ungrounded: bool = False,
    skip_namespaces: Optional[Collection[str]] = None,
) -> None:
    """Process the annotations in a PyBEL edge data dictionary.

    The mapping stored under ``data[ANNOTATIONS]`` is rewritten in place in
    four steps:

    1. Entities in the ``CellLine`` category that still carry the
       ``CellLine`` namespace are grounded with :func:`pyobo.ground`.
    2. ``TextLocation`` entities are swapped for their entry in
       ``text_location_labels`` when one exists.
    3. Category names are remapped via
       ``_BEL_ANNOTATION_PREFIX_CATEGORY_MAP``.
    4. For every category not in ``CATEGORY_BLACKLIST``, entity namespaces
       are remapped via ``_BEL_ANNOTATION_PREFIX_MAP`` and each entity is
       handed to ``_process_concept``.

    :param data: An edge data dictionary; it must contain the
        ``ANNOTATIONS`` key.
    :param remove_ungrounded: If true, ``CellLine`` entities that cannot be
        grounded are dropped. Otherwise a warning is logged and the entity
        is kept unchanged.
    :param skip_namespaces: Passed through unchanged to ``_process_concept``.
    :raises TypeError: If an entity in a non-blacklisted category is not a
        ``dict``.
    """
    cell_line_entities = data[ANNOTATIONS].get('CellLine')
    if cell_line_entities:
        # Loop-invariant, so it is built once instead of once per entity.
        cell_line_prefixes = [
            'efo',
            # 'clo',  # FIXME implement CLO in PyOBO then uncomment
        ]
        cell_lines = []
        for entity in cell_line_entities:
            if entity[NAMESPACE] != 'CellLine':
                # Only the placeholder ``CellLine`` namespace needs grounding.
                # Entities in any other namespace are kept untouched; dropping
                # them (as this loop used to) would discard annotations, e.g.
                # when the same data is processed a second time.
                cell_lines.append(entity)
                continue

            g_prefix, g_identifier, g_name = pyobo.ground(
                cell_line_prefixes, entity[IDENTIFIER])
            if g_prefix and g_identifier:
                cell_lines.append(
                    Entity(namespace=g_prefix,
                           identifier=g_identifier,
                           name=g_name))
            elif not remove_ungrounded:
                logger.warning('could not ground CellLine: "%s"',
                               entity[IDENTIFIER])
                cell_lines.append(entity)
        data[ANNOTATIONS]['CellLine'] = cell_lines

    # fix text locations
    text_location = data[ANNOTATIONS].get('TextLocation')
    if text_location:
        # NOTE(review): attribute access assumes these are ``Entity``-like
        # objects exposing ``.identifier`` - confirm against callers.
        data[ANNOTATIONS]['TextLocation'] = [
            text_location_labels.get(entity.identifier, entity)
            for entity in text_location
        ]

    # remap category names
    data[ANNOTATIONS] = {
        _BEL_ANNOTATION_PREFIX_CATEGORY_MAP.get(category, category): entities
        for category, entities in data[ANNOTATIONS].items()
    }

    # fix namespaces that were categories before
    # Only values of existing keys are reassigned below, so mutating the
    # mapping while iterating over it is safe.
    for category, entities in data[ANNOTATIONS].items():
        if category in CATEGORY_BLACKLIST:
            continue

        processed = []
        for entity in entities:
            if not isinstance(entity, dict):
                raise TypeError(f'entity should be a dict. got: {entity}')
            remapped_namespace = _BEL_ANNOTATION_PREFIX_MAP.get(entity[NAMESPACE])
            if remapped_namespace is not None:
                entity[NAMESPACE] = remapped_namespace

            _process_concept(concept=entity, skip_namespaces=skip_namespaces)
            processed.append(entity)
        data[ANNOTATIONS][category] = processed
Ejemplo n.º 2
0
 def test_ground(self):
     """Check that ChEBI grounding resolves both a synonym and the primary name."""
     queries = ("Fusilade II", "fluazifop-P-butyl")
     for query in queries:
         # Nested ``with`` blocks are equivalent to the comma-separated form.
         with self.subTest(query=query):
             with mock_id_name_mapping:
                 with mock_id_synonyms_mapping:
                     grounded = pyobo.ground("chebi", query)
                     prefix, identifier, name = grounded

                     # Every component must be present ...
                     for component in (prefix, identifier, name):
                         self.assertIsNotNone(component)

                     # ... and must point at the same ChEBI record.
                     self.assertEqual("chebi", prefix)
                     self.assertEqual("132964", identifier)
                     self.assertEqual("fluazifop-P-butyl", name)
Ejemplo n.º 3
0
 def test_ground(self):
     """Ground a ChEBI entry from either of two query strings and verify the result."""
     expected = ('chebi', '132964', 'fluazifop-P-butyl')
     for query in ('Fusilade II', 'fluazifop-P-butyl'):
         subtest = self.subTest(query=query)
         with subtest, mock_id_name_mapping, mock_id_synonyms_mapping:
             prefix, identifier, name = pyobo.ground('chebi', query)
             actual = (prefix, identifier, name)

             # First make sure nothing came back empty ...
             for value in actual:
                 self.assertIsNotNone(value)

             # ... then compare prefix, identifier and name in that order.
             for wanted, got in zip(expected, actual):
                 self.assertEqual(wanted, got)
Ejemplo n.º 4
0
def get_identifier(
        namespace: str,
        name: str) -> Union[Tuple[str, None, str], Tuple[str, str, str]]:
    """Look up the identifier for a name in the given namespace.

    ``SFAM``/``SCOMP`` names are looked up in ``bel_fplx`` and ``SCHEM``/
    ``CHEBI`` names are grounded against ChEBI with :func:`pyobo.ground`.
    Every other namespace is normalized with
    ``bioregistry.normalize_prefix`` and resolved through
    ``pyobo.get_name_id_mapping``.

    Namespaces whose mapping cannot be loaded or is empty are added to
    ``MISSING_NAMESPACE`` so later calls return early. Failed name lookups
    are added to ``MISSING`` so each one is only logged once.

    :param namespace: The namespace (prefix) the name belongs to.
    :param name: The name to look up.
    :returns: A ``(prefix, identifier, name)`` triple. ``identifier`` is
        ``None`` when the name could not be resolved.
    :raises ValueError: If ``bioregistry`` cannot normalize the namespace.
    """
    if namespace in {'SFAM', 'SCOMP'}:
        return 'fplx', bel_fplx.get(name), name
    if namespace in {'SCHEM', 'CHEBI'}:
        prefix, identifier, grounded_name = pyobo.ground('chebi', name)
        # When grounding fails the grounded name is presumably falsy, just
        # like the prefix; fall back to the caller's name so the third
        # element stays a string, as the return annotation promises.
        return prefix or namespace, identifier, grounded_name or name

    norm_namespace = bioregistry.normalize_prefix(namespace)
    if norm_namespace is None:
        raise ValueError(f'could not normalize {namespace}')
    namespace = norm_namespace

    if namespace in MISSING_NAMESPACE:
        return namespace, None, name
    try:
        name_id_mapping = pyobo.get_name_id_mapping(namespace)
    except Exception:
        # Deliberately best-effort: any failure to load the mapping marks the
        # namespace as missing. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        logger.info('missing namespace: %s', namespace)
        MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    if name_id_mapping:
        identifier = name_id_mapping.get(name)
        if identifier:
            return namespace, identifier, name
        if (namespace, name) not in MISSING:
            # Remember the miss so it is only logged the first time.
            MISSING.add((namespace, name))
            logger.debug('missing lookup for %s ! %s', namespace, name)
    elif namespace not in MISSING_NAMESPACE:
        logger.info('empty namespace: %s', namespace)
        MISSING_NAMESPACE.add(namespace)

    return namespace, None, name
Ejemplo n.º 5
0
def _process_annotations(data, remove_ungrounded: bool = False) -> None:
    """Process the annotations in a PyBEL edge data dictionary.

    Each ``category -> {name: polarity}`` entry under ``data[ANNOTATIONS]``
    is grounded with :func:`pyobo.ground`:

    - ``CellLine`` names are grounded against a fixed list of prefixes and
      take the prefix and name returned by the grounder.
    - Categories found in ``_BEL_ANNOTATION_PREFIX_MAP`` use the mapped
      prefix and are stored under the category name given by
      ``_BEL_ANNOTATION_PREFIX_CATEGORY_MAP`` (or the original category).
    - Categories that ``normalize_prefix`` recognizes use the normalized
      prefix.
    - Anything else is logged once per category as unhandled and treated as
      ungrounded.

    Afterwards ``data[ANNOTATIONS]`` is replaced by a
    ``category -> {curie: polarity}`` dictionary of the grounded entries.
    Unless ``remove_ungrounded`` is set, entries that could not be grounded
    are stored as ``category -> {name: polarity}`` under
    ``data[FREE_ANNOTATIONS]``.

    :param data: An edge data dictionary; it must contain the
        ``ANNOTATIONS`` key.
    :param remove_ungrounded: If true, ungrounded annotations are discarded
        and ``data[FREE_ANNOTATIONS]`` is not written.
    """
    grounded_category_curie_polarity = []
    ungrounded_category_name_polarity = []

    # Loop-invariant, so it is built once instead of once per category.
    cell_line_prefixes = [
        'efo',
        # 'clo',  # FIXME implement CLO import and add here
    ]

    for category, names in data[ANNOTATIONS].items():
        if category == 'CellLine':
            for name, polarity in names.items():
                g_prefix, g_identifier, g_name = pyobo.ground(cell_line_prefixes, name)
                if g_prefix and g_identifier:
                    grounded_category_curie_polarity.append((
                        category, Entity(namespace=g_prefix, identifier=g_identifier, name=g_name), polarity,
                    ))
                else:
                    ungrounded_category_name_polarity.append((category, name, polarity))
            continue

        if category in _BEL_ANNOTATION_PREFIX_MAP:
            norm_prefix = _BEL_ANNOTATION_PREFIX_MAP[category]
            norm_category = _BEL_ANNOTATION_PREFIX_CATEGORY_MAP.get(category, category)
        else:
            # Normalize once; the result serves both as the branch test and
            # as the grounding prefix.
            norm_prefix = normalize_prefix(category)
            norm_category = category

            if not norm_prefix:
                if category not in _UNHANDLED_ANNOTATION:
                    # Warn only the first time a category is seen.
                    logger.warning('unhandled annotation: %s', category)
                    _UNHANDLED_ANNOTATION.add(category)

                if isinstance(names, dict):
                    for name, polarity in names.items():
                        ungrounded_category_name_polarity.append((category, name, polarity))
                else:
                    # A non-dict value is stored whole, with positive polarity.
                    ungrounded_category_name_polarity.append((category, names, True))
                continue

        # Shared by the prefix-map and the normalized-prefix cases: the entity
        # keeps the original name and takes the normalized prefix.
        for name, polarity in names.items():
            _, identifier, _ = pyobo.ground(norm_prefix, name)
            if identifier:
                grounded_category_curie_polarity.append((
                    norm_category, Entity(namespace=norm_prefix, identifier=identifier, name=name), polarity,
                ))
            else:
                ungrounded_category_name_polarity.append((norm_category, name, polarity))

    # Build the result locally and assign once, so ``data`` never holds a
    # half-populated mapping if an error occurs part-way through.
    grounded = {}
    for category, entity, polarity in grounded_category_curie_polarity:
        grounded.setdefault(category, {})[entity.curie] = polarity
    data[ANNOTATIONS] = grounded

    if not remove_ungrounded and ungrounded_category_name_polarity:
        free = {}
        for category, name, polarity in ungrounded_category_name_polarity:
            free.setdefault(category, {})[name] = polarity
        data[FREE_ANNOTATIONS] = free
Ejemplo n.º 6
0
def get_bel() -> pybel.BELGraph:
    """Build a BEL graph of miRNA-disease regulation from the HMDD table.

    The table is read from the file returned by ``ensure_path(PREFIX, URL)``.
    miRNA names are grounded to miRBase and disease names to the first
    matching disease/phenotype vocabulary. Rows whose miRNA or disease could
    not be grounded are left out of the graph.

    :returns: A graph named ``HMDD`` with one regulation edge per usable row.
    """
    # Columns unpacked below: category, mir, disease, pmid, description
    df = pd.read_csv(
        ensure_path(PREFIX, URL),
        sep='\t',
        dtype=str,
        encoding="ISO-8859-1",
    )

    # --- miRNAs -----------------------------------------------------------
    mirnas = df['mir'].unique()
    mirna_to_dsl = {}
    failed_mirnas = 0
    progress = tqdm(mirnas, desc='mapping miRNA names')
    for text in progress:
        _, identifier, name = pyobo.ground('mirbase', text)
        if identifier is not None:
            mirna_to_dsl[text] = pybel.dsl.MicroRna(
                namespace='mirbase',
                identifier=identifier,
                name=name,
            )
        else:
            progress.write(f'[mirbase] could not ground: {text}')
            failed_mirnas += 1

    logger.info(f'failed on {failed_mirnas}/{len(mirnas)} miRNAs')

    # --- diseases ---------------------------------------------------------
    disease_prefixes = ['mondo', 'doid', 'efo', 'hp', 'mesh']
    diseases = df['disease'].unique()
    disease_to_dsl = {}
    failed_diseases = 0
    progress = tqdm(diseases, desc='mapping disease names')
    for text in progress:
        prefix, identifier, name = pyobo.ground(disease_prefixes, text)
        if identifier is None and ', ' in text:
            # Retry with the part after the first ", " moved to the front,
            # e.g. "A, B" -> "B A".
            left, _, right = text.partition(', ')
            swapped = f'{right} {left}'
            prefix, identifier, name = pyobo.ground(disease_prefixes, swapped)
            if identifier is None and ', ' in swapped:
                # More than one comma: retry with every piece in reverse order.
                pieces = reversed(text.split(','))
                reordered = ' '.join(piece.strip() for piece in pieces)
                prefix, identifier, name = pyobo.ground(disease_prefixes, reordered)

        if identifier is not None:
            disease_to_dsl[text] = pybel.dsl.Pathology(
                namespace=prefix,
                identifier=identifier,
                name=name,
            )
        else:
            progress.write(f'could not ground {text}')
            failed_diseases += 1

    logger.info(f'failed on {failed_diseases}/{len(diseases)} diseases')

    # --- graph ------------------------------------------------------------
    rv = pybel.BELGraph(name='HMDD', version=VERSION)
    for _category, mir, disease, pmid, text in df.values:
        source = mirna_to_dsl.get(mir)
        target = disease_to_dsl.get(disease)
        # Both ends must have been grounded for the row to become an edge.
        if source and target:
            rv.add_regulates(
                source,
                target,
                citation=pmid,
                evidence=text,
            )
    return rv