def test_prefixes():
    """Check contraction and expansion without passing explicit curie maps.

    Covers the IRI -> CURIE and CURIE -> IRI round trip, and the two ways an
    uncontractable string is handled: an empty list when ``strict=False`` and
    a ``NoPrefix`` error when ``strict=True``.
    """
    assert contract_uri(bp_iri) == [bp_id]
    assert expand_uri(bp_id) == bp_iri
    assert contract_uri("FAKE", strict=False) == []

    # Strict mode has to refuse a string that matches no prefix.
    raised = False
    try:
        contract_uri("FAKE", strict=True)
    except NoPrefix:
        raised = True
    assert raised
Beispiel #2
0
def contract(uri) -> str:
    """
    Contract a URI to a single CURIE using the module-level ``cmaps``.

    The candidate CURIEs are ordered and the first one is chosen, so the
    same item is picked on every call. Note that None (not a str) is
    returned when the URI has no candidate CURIE.
    """
    candidates = contract_uri(str(uri), cmaps=cmaps)
    if not candidates:
        return None
    return sorted(candidates)[0]
def test_prefixes_cmaps():
    """Check contraction and expansion with two explicit, overlapping maps.

    The 'GO' and 'OBO' prefixes both match ``bp_iri``, so the test covers
    the default single-result contraction, the ``shortest=False`` variant
    that returns both CURIEs, expansion of either CURIE, and the
    strict / non-strict handling of an uncontractable string.
    """
    cmaps = [
        {'GO': 'http://purl.obolibrary.org/obo/GO_'},
        {'OBO': 'http://purl.obolibrary.org/obo/'},
    ]

    assert contract_uri(bp_iri, cmaps) == [bp_id]

    # With shortest=False every matching prefix contributes a CURIE.
    every_curie = contract_uri(bp_iri, cmaps, shortest=False)
    assert len(every_curie) == 2
    for expected in (obo_bp_id, bp_id):
        assert expected in every_curie

    # Both CURIE spellings expand back to the same IRI.
    for curie in (bp_id, obo_bp_id):
        assert expand_uri(curie, cmaps) == bp_iri

    assert contract_uri("FAKE", cmaps, strict=False) == []

    raised = False
    try:
        contract_uri("FAKE", cmaps, strict=True)
    except NoPrefix:
        raised = True
    assert raised
Beispiel #4
0
def get_descendants(graph: Graph,
                    node: str,
                    edge: Optional[URIRef] = RDFS['subClassOf'],
                    reflexive: Optional[bool] = True) -> Set[str]:
    """
    Collect the CURIEs of all transitive subjects of ``edge`` for ``node``.

    :param graph: graph to traverse
    :param node: CURIE of the starting node; it is strictly expanded to an IRI
    :param edge: predicate to follow (rdfs:subClassOf by default)
    :param reflexive: when false, the starting node itself is left out
    :return: set of strictly contracted CURIEs; literals are skipped
    """
    start = URIRef(expand_uri(node, strict=True))
    return {
        contract_uri(str(sub), strict=True)[0]
        for sub in graph.transitive_subjects(edge, start)
        # keep the start node only in reflexive mode, and never keep literals
        if (reflexive or not (start == sub)) and not isinstance(sub, Literal)
    }
Beispiel #5
0
    def contract_uri(self, iri) -> str:
        """Contract a given IRI.

        Turn an IRI into a CURIE, with special handling for genenames.org
        gene symbol report links and a fallback to the custom curie map.

        Args:
            iri: IRI as string

        Returns:
            str: the resolved CURIE. An IRI that cannot be contracted and a
            value that already is a CURIE are returned unchanged; any other
            value is returned prefixed with ':'.

        """
        if 'http://www.genenames.org/cgi-bin/gene_symbol_report?match=' in iri:
            # The gene symbol is whatever follows the last '='.
            symbol = iri.split('=')[-1]
            if symbol in self.gene_info_map:
                return f"NCBIGene:{self.gene_info_map[symbol]['NCBI']}"
            # Unpacking requires exactly one CURIE from the custom map.
            [only_curie] = contract_uri(iri, cmaps=[CUSTOM_CMAP])
            return only_curie

        if self.is_iri(iri):
            # First try contracting without explicit maps, then the custom map.
            default_hits = contract_uri(iri)
            if default_hits:
                return default_hits[0]
            custom_hits = contract_uri(iri, cmaps=[CUSTOM_CMAP])
            return custom_hits[0] if custom_hits else iri

        if self.is_curie(iri):
            return iri

        return f":{iri}"
Beispiel #6
0
def get_ancestors(graph: Graph,
                  node: str,
                  edge: Optional[URIRef] = RDFS['subClassOf'],
                  root: Optional[str] = None,
                  reflexive: Optional[bool] = True) -> Set[str]:
    """
    Collect the CURIEs of all transitive objects of ``edge`` for ``node``.

    :param graph: graph to traverse
    :param node: CURIE of the starting node; it is strictly expanded to an IRI
    :param edge: predicate to follow (rdfs:subClassOf by default)
    :param root: optional CURIE of a root node; when given it is always part
                 of the result
    :param reflexive: when false, the starting node itself is left out
    :return: set of strictly contracted CURIEs; literals and blank nodes
             are skipped
    """
    start = URIRef(expand_uri(node, strict=True))

    if root is None:
        root_iri = None
        remembered = {}
    else:
        root_iri = URIRef(expand_uri(root, strict=True))
        # Pre-seed the traversal's "remember" mapping with the root --
        # presumably so the walk does not continue past it (confirm against
        # rdflib's transitive_objects).
        remembered = {root_iri: 1}

    ancestors = {
        contract_uri(str(obj), strict=True)[0]
        for obj in graph.transitive_objects(start, edge, remembered)
        if not isinstance(obj, (Literal, BNode))
        and (reflexive or not (start == obj))
    }

    # The root always belongs to the returned set when one was supplied.
    if root_iri is not None:
        ancestors.add(contract_uri(str(root_iri), strict=True)[0])

    return ancestors
Beispiel #7
0
def get_leaf_nodes(graph: Graph,
                   node: str,
                   edge: Optional[URIRef] = RDFS['subClassOf']) -> Set[str]:
    """
    Yield the CURIE of every leaf found below ``node`` along ``edge``.

    A node counts as a leaf when the graph has no subject pointing at it
    via ``edge``. Despite the return annotation this is a generator, and no
    de-duplication is done: a leaf is yielded once per path that reaches it.

    :param graph: graph to traverse
    :param node: CURIE of the starting node, or a URIRef (used as-is)
    :param edge: predicate to follow (rdfs:subClassOf by default)
    """
    if isinstance(node, URIRef):
        current = node
    else:
        current = URIRef(expand_uri(node, strict=True))

    children = list(graph.subjects(edge, current))
    if not children:
        yield contract_uri(str(current), strict=True)[0]
        return

    for child in children:
        yield from get_leaf_nodes(graph, child, edge)
Beispiel #8
0
def shorten_iri_to_curie(iri: str, curie_to_iri_map: list = None):
    """
    Shorten an IRI to a CURIE using the given prefix maps.

    :param iri: the IRI to shorten; values that already start with 'owl:'
                or 'OIO:' are returned unchanged
    :param curie_to_iri_map: list of prefix maps handed to
                             ``prefixcommons.contract_uri``; defaults to an
                             empty list (a fresh one per call)
    :return: the CURIE, or None when the IRI could not be contracted
    :raises AssertionError: if contraction yields more than one CURIE
    """
    # Use a None sentinel rather than a shared mutable default list.
    if curie_to_iri_map is None:
        curie_to_iri_map = []
    if iri.startswith(('owl:', 'OIO:')):
        return iri
    # Rewrite percent-encoded "GO:" / "HP:" path segments before contracting.
    if "/GO/GO%3A" in iri:  # hack for fixing issue #410
        iri = iri.replace("/GO/GO%3A", "/GO/")
    if "/HPO/HP%3A" in iri:  # hack for fixing issue #665
        iri = iri.replace("/HPO/HP%3A", "/HP/")
    curie_list = prefixcommons.contract_uri(iri, curie_to_iri_map)
    assert len(curie_list) in [0, 1]
    if not curie_list:
        return None
    curie_id = curie_list[0]
    # deal with IRIs like 'https://identifiers.org/umls/ATC/L01AX02' which get converted to CURIE 'UMLS:ATC/L01AX02'
    umls_match = REGEX_UMLS_CURIE.match(curie_id)
    if umls_match is not None:
        curie_id = umls_match[1] + ':' + umls_match[2]
    return curie_id
Beispiel #9
0
def contract(uri) -> str:
    """
    Contract a URI to a CURIE.

    When several CURIEs are possible they are ordered and the first one is
    returned, so the choice is the same on every call.

    Parameters
    ----------
    uri: Union[rdflib.term.URIRef, str]
        A URI

    Returns
    -------
    str
        The first CURIE in sorted order, or None when the URI has no
        candidate CURIE

    """
    candidates = contract_uri(str(uri), cmaps=cmaps)
    return sorted(candidates)[0] if len(candidates) > 0 else None
Beispiel #10
0
def shorten_iri_to_curie(iri: str, curie_to_iri_map: list) -> str:
    """
    Shorten an IRI to a CURIE using the given prefix maps.

    :param iri: the IRI to shorten; must not be None
    :param curie_to_iri_map: list of prefix maps handed to
                             ``prefixcommons.contract_uri``
    :return: the CURIE, or None when the IRI could not be contracted
    :raises ValueError: if ``iri`` is None
    :raises AssertionError: if contraction yields more than one CURIE
    """
    if iri is None:
        raise ValueError('cannot shorten an IRI with value None')

    matches = prefixcommons.contract_uri(iri, curie_to_iri_map)
    match_count = len(matches)
    if match_count == 0:
        return None
    if match_count == 1:
        return matches[0]

    assert False, "somehow got a list after calling prefixcommons.contract on URI: " + iri + "; list is: " + str(
        matches)
    # Only reached when assertions are disabled (python -O).
    # NOTE: the UMLS-specific CURIE rewrite (REGEX_UMLS_CURIE) was commented
    # out in this version and is therefore not applied here.
    return None
def _process_hpo_data(file_path: str) -> Dict[str, List[str]]:
    """
    Build disease-phenotype metadata keyed by MONDO disease and phenotype.

    MONDO is loaded from ``../data/mondo.owl.gz``. The tab-delimited
    annotation file is then read row by row, each row's disease id is mapped
    to a MONDO class through owl:equivalentClass, and the onset, frequency
    and severity columns are collected per (MONDO id, phenotype id) pair.

    A row is skipped when:
      - it has fewer than nine columns,
      - no MONDO class is equivalent to its disease id,
      - the MONDO curie contains 'hgnc', or
      - it is an Orphanet disease without an OMIM equivalent that the code
        treats as a disease group.

    When the same key occurs more than once, empty fields are filled from
    the later row and real conflicts keep the first value seen.

    :param file_path: local path or http(s) URL of the annotation file
    :return: dict mapping "<mondo curie>-<phenotype id>" to
             [onset, freq, severity]
    """
    logger.info("loading mondo into memory")
    mondo = Graph()
    # Close the gzip handle once parsing is done instead of leaking it.
    with gzip.open("../data/mondo.owl.gz", 'rb') as mondo_file:
        mondo.parse(mondo_file, format='xml')
    logger.info("finished loading mondo")

    # Each entry is a (mondo_curie, pheno_id, onset, freq, severity) tuple.
    mondo_merged_lines: List[tuple] = []
    disease_info: Dict[str, List[str]] = {}

    if file_path.startswith("http"):
        context_manager = closing(requests.get(file_path))
    else:
        context_manager = open(file_path, "r")

    # https://stackoverflow.com/a/35371451
    with context_manager as file:
        if file_path.startswith("http"):
            # For a remote file, iterate over the decoded response lines.
            file = file.content.decode('utf-8').splitlines()
        reader = csv.reader(file, delimiter='\t', quotechar='\"')
        counter = 0
        for row in reader:
            try:
                (db, num, name, severity, pheno_id, publist, eco, onset,
                 freq) = row[0:9]
            except ValueError:
                logger.warning("Too few values in row {}".format(row))
                continue

            # Align Id prefixes
            if db == 'MIM': db = 'OMIM'
            if db == 'ORPHA': db = 'Orphanet'
            if db == 'ORPHANET': db = 'Orphanet'

            disease_id = "{}:{}".format(db, num)
            disease_iri = URIRef(expand_uri(disease_id, strict=True))
            mondo_curie = None
            mondo_iri = None
            # Take the first equivalent class whose curie is a MONDO id.
            for subj in mondo.subjects(OWL['equivalentClass'], disease_iri):
                curie = contract_uri(str(subj), strict=True)[0]
                if curie.startswith('MONDO'):
                    mondo_curie = curie
                    mondo_iri = subj
                    break
            if mondo_curie is None:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning("No mondo id for {}".format(disease_id))
                continue

            # Check whether the MONDO class is also equivalent to an OMIM id.
            has_omim = False
            for obj in mondo.objects(mondo_iri, OWL['equivalentClass']):
                try:
                    curie = contract_uri(str(obj), strict=True)[0]
                except NoPrefix:
                    continue
                if curie.startswith('OMIM'):
                    has_omim = True

            # use scigraph instead of the above
            # mondo_node = monarch.get_clique_leader(disease_id)
            # mondo_curie = mondo_node['id']
            if mondo_curie is not None and 'hgnc' in mondo_curie:
                # to keep these, likely decipher IDs
                # mondo_curie = disease_id
                continue

            # NOTE(review): objects(mondo_iri, subClassOf) are the
            # superclasses of mondo_iri, not its subclasses -- confirm this
            # is the intended test for a "disease group".
            if disease_id.startswith('Orphanet') \
                    and has_omim is False \
                    and len(list(mondo.objects(mondo_iri, RDFS['subClassOf']))) > 0:
                # disease is a disease group, skip
                logger.info(
                    "{} is a disease group, skipping".format(disease_id))
                continue

            mondo_merged_lines.append(
                (mondo_curie, pheno_id, onset, freq, severity))

            counter += 1
            if counter % 10000 == 0:
                logger.info("processed {} rows".format(counter))

    logger.info("processed {} rows".format(counter))

    for line in mondo_merged_lines:
        key = "{}-{}".format(line[0], line[1])
        values = [line[2], line[3], line[4]]
        if key in disease_info and disease_info[key] != values:
            logger.warning("Metadata for {} and {} mismatch: {} vs {}".format(
                line[0], line[1], values, disease_info[key]))
            # attempt to merge by collapsing freq, onset, severity
            # that is empty in one disease but not another
            # conflicts will defer to the disease first inserted
            # (merged_disease_info aliases the stored list, so the
            # assignment below updates disease_info[key] in place)
            merged_disease_info = disease_info[key]
            for index, val in enumerate(values):
                if val == disease_info[key][index] \
                        or val == '' and disease_info[key][index] != '':
                    continue
                elif val != '' and disease_info[key][index] == '':
                    merged_disease_info[index] = val
                else:
                    logger.warning("Cannot merge {} and {} for {}".format(
                        values, disease_info[key], line[0]))
        else:
            disease_info[key] = values

    return disease_info