def load_mesh(
        kb_name='mesh',
        path='C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\mesh_dis.xml'
    ):
        """
        Create a KnowledgeBase object with entities and relations from a MeSH
        RDF/XML file.

        Every ``rdf:Description`` child of the document root yields one
        KBEntity: ``meshv:identifier`` is its id, ``skos:prefLabel`` its
        canonical name, each non-empty ``skos:altLabel`` an alias, and each
        ``meshv:broaderDescriptor`` a non-symmetric 'subClassOf' relation whose
        target is the last '/'-separated segment of the ``rdf:resource`` URI.

        A Description that lacks an identifier or preferred label, or that
        holds a ``meshv:broaderDescriptor`` without ``rdf:resource``, is
        reported on stdout and left out of the KB; loading then continues.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param path: location of the MeSH RDF/XML file
        :return: the populated KnowledgeBase
        """
        kb = KnowledgeBase()
        kb.name = kb_name

        # parse the file; retry with a huge_tree parser when the default
        # parser reports a syntax error
        try:
            tree = etree.parse(path)
        except etree.XMLSyntaxError:
            p = etree.XMLParser(huge_tree=True)
            tree = etree.parse(path, parser=p)

        root = tree.getroot()
        ns = root.nsmap
        desc_iter = root.findall('rdf:Description', ns)

        # fully-qualified attribute names, built once instead of per element
        about_attr = '{' + ns['rdf'] + '}about'
        resource_attr = '{' + ns['rdf'] + '}resource'

        for desc in desc_iter:
            # get mesh id and preferred label; without both no entity is built
            id_elem = desc.find('meshv:identifier', ns)
            name_elem = desc.find('skos:prefLabel', ns)
            if id_elem is None or name_elem is None:
                print(f'skipping {desc.get(about_attr)} in load_mesh')
                continue

            entity_id = id_elem.text
            entity = KBEntity(entity_id, None, [], '')
            entity.canonical_name = name_elem.text

            # get alt labels
            for label in desc.findall('skos:altLabel', ns):
                if label.text is not None:
                    entity.aliases.append(label.text)

            # get broader descriptors
            relations = []
            malformed = False
            for sc_rel in desc.findall('meshv:broaderDescriptor', ns):
                resource = sc_rel.get(resource_attr)
                if resource is None:
                    # a broader link without a target makes the whole entry
                    # unusable; nothing of it is added to the kb
                    malformed = True
                    break
                target_research_entity_id = resource.split('/')[-1].strip()
                relations.append(
                    KBRelation(relation_type='subClassOf',
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=False))
            if malformed:
                print(f'skipping {entity_id} in load_mesh')
                continue

            # register relations; each one's position in kb.relations is
            # recorded on the entity
            for rel in relations:
                kb.add_relation(rel)
                rel_index = len(kb.relations) - 1
                entity.relation_ids.append(rel_index)

            # add entity to kb
            kb.add_entity(entity)

        return kb
    def import_mesh(name, mesh_filename):
        """
        Create a KnowledgeBase object with entities from MeSH file

        The lines of the file are grouped into chunks by KBLoader._chunkify
        (using KBLoader.MESH_ENTITY_START_TAG); each chunk becomes one
        KBEntity built from its "KEY = value" lines.

        :param name: name assigned to the returned KnowledgeBase
        :param mesh_filename: path of the MeSH file to read
        :return: the populated KnowledgeBase
        """
        # initialize the KB
        kb = KnowledgeBase()
        kb.name = name

        def _make_mesh_entity(entity_chunk):
            """
            Make a KBEntity from each MeSH chunk

            Recognised keys: UI (id), MH / SH (canonical name, also added as
            an alias), ENTRY / PRINT ENTRY (alias, text before the first "|")
            and MS (definition). All other lines are ignored.

            :param entity_chunk: iterable of lines belonging to one record
            :return: the KBEntity described by the chunk
            """
            entity = KBEntity()
            for line in entity_chunk:
                # Split on the FIRST " = " only: a value may itself contain
                # " = " and such lines must not be dropped.
                key, separator, value = line.partition(" = ")
                if not separator:
                    continue
                if key == 'UI':
                    entity.research_entity_id = value
                elif key in ('MH', 'SH'):
                    entity.canonical_name = value
                    entity.aliases.append(value)
                elif key in ('ENTRY', 'PRINT ENTRY'):
                    entity.aliases.append(value.split("|")[0])
                elif key == 'MS':
                    entity.definition = value
            return entity

        for chunk in KBLoader._chunkify(file_util.read_lines(mesh_filename),
                                        KBLoader.MESH_ENTITY_START_TAG):
            kb.add_entity(_make_mesh_entity(chunk))
        return kb
    def import_dbpedia(kb_name, kb_filename, entities_count=0):
        """
        Build a KnowledgeBase from a dbpedia turtle file.

        Each triple whose subject URI starts with
        'http://dbpedia.org/resource/' produces one entity: the URI is its
        id, the remainder of the URI (underscores turned into spaces) is its
        canonical name and only alias, and the triple's object is its
        definition. Triples with any other subject are ignored.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param kb_filename: turtle file to load; must contain '.ttl'
        :param entities_count: stop after this many entities have been added;
            a value of 0 or less means no limit
        :return: the populated KnowledgeBase
        """
        knowledge_base = KnowledgeBase()
        knowledge_base.name = kb_name

        # only the "turtle" format is allowed for this kb.
        assert ('.ttl' in kb_filename)
        local_filename = file_util.cache_file(kb_filename)

        # load every triple of the turtle file into an rdflib graph
        graph = rdflib.Graph()
        graph.parse(local_filename, format='turtle')
        logging.warning('done parsing dbpedia .ttl files.')

        resource_prefix = 'http://dbpedia.org/resource/'
        added = 0
        for subj, _pred, obj in graph:
            entity = KBEntity()
            subject_uri = str(subj)
            entity.research_entity_id = subject_uri
            if not subject_uri.startswith(resource_prefix):
                continue

            readable_name = subject_uri[len(resource_prefix):].replace(
                '_', ' ')
            entity.canonical_name = readable_name
            entity.aliases.append(readable_name)
            entity.definition = str(obj)

            # verify and add entity to the KB.
            knowledge_base.add_entity(entity)
            added += 1

            # honour the optional cap on the number of entities
            if entities_count > 0 and added >= entities_count:
                break
        return knowledge_base
    def create_umls_kbs(self, entities):
        """
        Build one KnowledgeBase per name in constants.TRAINING_KBS from the
        given entity records, dump each to 'kb-<name>.json' under
        self.OUTPUT_KB_DIR and then pass it to self.add_context_to_kb.

        :param entities: dict keyed by KB name; each value is a dict of
            entity records holding 'research_entity_id', 'canonical_name',
            'aliases', 'definition' (sequence of strings, joined with spaces)
            and 'relations' (4-tuples of source id pair, target id pair,
            relation type, symmetric flag)
        :return: None
        """
        for kb_name in constants.TRAINING_KBS:
            sys.stdout.write("\tCreating KB %s\n" % kb_name)
            kb = KnowledgeBase()
            kb.name = kb_name

            for record in entities[kb_name].values():
                entity = KBEntity(record['research_entity_id'],
                                  record['canonical_name'],
                                  record['aliases'],
                                  ' '.join(record['definition']))

                for source, target, rel_type, symmetric in record['relations']:
                    # relation endpoints are written as "<first>:<second>"
                    endpoint_ids = [
                        '{}:{}'.format(source[0], source[1]),
                        '{}:{}'.format(target[0], target[1]),
                    ]
                    kb.add_relation(
                        KBRelation(rel_type, endpoint_ids, symmetric))
                    # the relation just added sits at the end of kb.relations
                    entity.relation_ids.append(len(kb.relations) - 1)

                kb.add_entity(entity)

            # write plain KB to json
            out_path = os.path.join(self.OUTPUT_KB_DIR,
                                    'kb-{}.json'.format(kb_name))
            kb.dump(kb, out_path)

            # add context to kb and write to file
            self.add_context_to_kb(kb)
    def load_nci(
        kb_name='nci',
        path="C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\nci_dis_subset.rdf"
    ):
        """
        Create a KnowledgeBase object with entities and relations from an NCI
        RDF/XML file.

        Every ``rdf:Description`` child of the document root yields one
        KBEntity: the stripped ``ns1:NHC0`` text is its id, ``rdfs:label`` its
        canonical name, ``ns1:P97`` (when present) its definition, each
        non-empty ``ns1:P90`` an alias, and each ``rdfs:subClassOf`` a
        non-symmetric 'subClassOf' relation whose target is the part of the
        ``rdf:resource`` URI after the last '#'.

        A Description without a usable ``ns1:NHC0`` or without ``rdfs:label``
        is reported on stdout and skipped; a ``rdfs:subClassOf`` without
        ``rdf:resource`` is reported and only that relation is skipped.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param path: location of the NCI RDF/XML file
        :return: the populated KnowledgeBase
        """
        # initialize the KB
        kb = KnowledgeBase()
        kb.name = kb_name

        # parse the file; retry with a huge_tree parser when the default
        # parser reports a syntax error
        try:
            tree = etree.parse(path)
        except etree.XMLSyntaxError:
            p = etree.XMLParser(huge_tree=True)
            tree = etree.parse(path, parser=p)

        root = tree.getroot()
        ns = root.nsmap

        # get descriptions (queried once and reused for the loop below)
        desc_iter = root.findall('rdf:Description', ns)
        print('LEN OF RESOURCES', len(desc_iter))

        # fully-qualified attribute names, built once instead of per element
        about_attr = '{' + ns['rdf'] + '}about'
        resource_attr = '{' + ns['rdf'] + '}resource'

        for desc in desc_iter:
            # get nci id and label; without both no entity is built
            id_elem = desc.find('ns1:NHC0', ns)
            label_elem = desc.find('rdfs:label', ns)
            if id_elem is None or id_elem.text is None or label_elem is None:
                print(f'skipping {desc.get(about_attr)} in load_nci')
                continue

            entity_id = id_elem.text.strip()
            entity = KBEntity(entity_id, None, [], '')
            entity.canonical_name = label_elem.text

            # get definition
            definition = desc.find('ns1:P97', ns)
            if definition is not None:
                entity.definition = definition.text

            # get alt labels
            for label in desc.findall('ns1:P90', ns):
                if label.text is not None:
                    entity.aliases.append(label.text)

            # get subclass relations
            relations = []
            for sc_rel in desc.findall('rdfs:subClassOf', ns):
                resource = sc_rel.get(resource_attr)
                if resource is None:
                    # no target given: drop this relation, keep the entity
                    print(f'skipping element {sc_rel.attrib}')
                    continue
                target_research_entity_id = resource.split('#')[-1].strip()
                relations.append(
                    KBRelation(relation_type='subClassOf',
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=False))

            # register relations; each one's position in kb.relations is
            # recorded on the entity
            for rel in relations:
                kb.add_relation(rel)
                rel_index = len(kb.relations) - 1
                entity.relation_ids.append(rel_index)

            # add entity to kb
            kb.add_entity(entity)

        return kb
    def import_owl_kb(kb_name, kb_filename):
        """
        Create a KnowledgeBase object with entities and relations from an OWL file

        Labels found on top-level ``rdf:Description`` elements are indexed by
        their ``rdf:about`` value first. Every top-level ``owl:Class`` with a
        non-empty ``rdf:about`` then becomes one KBEntity:

        * labels are gathered in this order: rdfs:label, indexed description
          labels, skos pref/alt/hidden labels, oboInOwl exact/related/narrow/
          broad synonyms (skos and oboInOwl only when those prefixes are
          declared on the root element);
        * the first label is the canonical name (the id is used when there is
          no label); aliases are the lower-cased labels, de-duplicated in
          first-seen order;
        * the definition is the lower-cased text of skos:definition and
          obo:IAO_0000115 elements, joined by spaces;
        * each rdfs:subClassOf carrying an rdf:resource attribute becomes a
          non-symmetric 'subClassOf' relation.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param kb_filename: ontology file; must end in '.owl' or '.rdf'
        :return: the populated KnowledgeBase
        """

        # get the description label for this resource id
        def get_label(elem):
            """Return elem's text, else the first indexed label of the
            resource it references, else None."""
            if elem.text is not None:
                return elem.text
            r_id = elem.get(resource_attr)
            if r_id in descriptions:
                return descriptions[r_id][0]
            return None

        def resolved_labels(node, tags):
            """Collect get_label() results for node's children, tag by tag,
            leaving out children that resolve to no label."""
            found = []
            for tag in tags:
                for child in node.findall(tag, ns):
                    text = get_label(child)
                    if text is not None:
                        found.append(text)
            return found

        assert kb_filename.endswith(('.owl', '.rdf'))

        # initialize the KB
        kb = KnowledgeBase()
        kb.name = kb_name

        # parse the file; retry with a huge_tree parser when the default
        # parser reports a syntax error
        try:
            tree = etree.parse(kb_filename)
        except etree.XMLSyntaxError:
            p = etree.XMLParser(huge_tree=True)
            tree = etree.parse(kb_filename, parser=p)

        root = tree.getroot()
        ns = root.nsmap

        # drop the default (un-prefixed) namespace from the prefix map
        if None in ns:
            del ns[None]

        description_nodes = root.findall('rdf:Description', ns)

        # fully-qualified attribute names, built once instead of per element
        about_attr = '{' + ns['rdf'] + '}about'
        resource_attr = '{' + ns['rdf'] + '}resource'

        # optional vocabularies are only queried when their prefix is declared
        synonym_tags = []
        if 'oboInOwl' in ns:
            synonym_tags = [
                'oboInOwl:hasExactSynonym',
                'oboInOwl:hasRelatedSynonym',
                'oboInOwl:hasNarrowSynonym',
                'oboInOwl:hasBroadSynonym',
            ]

        description_tags = ['rdfs:label']
        class_label_tags = []
        definition_tags = []
        if 'skos' in ns:
            description_tags.append('skos:prefLabel')
            class_label_tags += [
                'skos:prefLabel', 'skos:altLabel', 'skos:hiddenLabel'
            ]
            definition_tags.append('skos:definition')
        description_tags += synonym_tags
        class_label_tags += synonym_tags
        if 'obo' in ns:
            definition_tags.append('obo:IAO_0000115')

        # get description dict
        descriptions = dict()
        for desc in description_nodes:
            resource_id = desc.get(about_attr)
            try:
                labels = [
                    child.text
                    for tag in description_tags
                    for child in desc.findall(tag, ns)
                    if child.text is not None
                ]
                if labels:
                    descriptions[resource_id] = labels
            except AttributeError:
                continue

        # parse OWL classes
        for cl in root.findall('owl:Class', ns):
            research_entity_id = cl.get(about_attr)
            # classes without an rdf:about id are not imported
            if research_entity_id is None or research_entity_id == '':
                continue

            # instantiate an entity.
            entity = KBEntity(research_entity_id, None, [], '')

            # list of KBRelations to add
            relations = []

            try:
                # labels, in priority order (the first one becomes the name)
                labels = resolved_labels(cl, ['rdfs:label'])
                if entity.research_entity_id in descriptions:
                    labels += descriptions[entity.research_entity_id]
                labels += resolved_labels(cl, class_label_tags)

                # set canonical_name and aliases
                if labels:
                    entity.canonical_name = labels[0]
                    # dict.fromkeys de-duplicates while keeping first-seen
                    # order, so the alias list is the same on every run
                    # (a set's iteration order is not)
                    entity.aliases = list(
                        dict.fromkeys(lab.lower() for lab in labels))

                # if no name available (usually entity from external KB), replace name with id
                if entity.canonical_name is None:
                    entity.canonical_name = entity.research_entity_id

                # get definition
                for tag in definition_tags:
                    for definition in cl.findall(tag, ns):
                        if definition.text is not None:
                            entity.definition += definition.text.lower() + ' '
                entity.definition = entity.definition.strip()

                # get subclass relations
                for sc_rel in cl.findall('rdfs:subClassOf', ns):
                    target_research_entity_id = sc_rel.get(resource_attr)
                    # subClassOf without rdf:resource yields no relation
                    if target_research_entity_id is not None:
                        relations.append(
                            KBRelation(relation_type='subClassOf',
                                       entity_ids=[
                                           entity.research_entity_id,
                                           target_research_entity_id
                                       ],
                                       symmetric=False))
            except AttributeError:
                # best effort: keep whatever was collected before the error
                pass

            # add relations to entity and to kb
            for rel in relations:
                kb.add_relation(rel)
                rel_index = len(kb.relations) - 1
                entity.relation_ids.append(rel_index)

            # add entity to kb
            kb.add_entity(entity)

        return kb
    def import_obo_kb(kb_name, kb_filename):
        """
        Create a KnowledgeBase object with entities and relations from an OBO file

        The lines of the file are grouped into chunks by KBLoader._chunkify
        (using KBLoader.OBO_ENTITY_START_TAG); each chunk becomes one KBEntity.
        Within a chunk, 'id: ' sets the id, 'name: ' the canonical name (also
        added as an alias, underscores turned into spaces), 'def: ' the
        definition and 'synonym: ' an alias (both taken from between the first
        and last double quote), 'is_a: ' and 'relationship: ' add relations.
        A fixed list of other properties is ignored.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param kb_filename: OBO file where KB is located
        :return: the populated KnowledgeBase
        :raises AssertionError: for a relation line that appears before the
            chunk's id, a relation line with too few fields, an unknown
            relationship type, or an unknown OBO property. Raised explicitly
            so the checks also apply when Python runs with -O.
        """
        # properties don't map naturally to the unified schema.
        ignored_prefixes = (
            'intersection_of: ',
            'is_obsolete: ',
            'comment: ',
            'disjoint_from: ',
            'alt_id: ',
            'xref: ',
            'property_value: has_rank',
            'subset: ',
            'xref_analog',
            'xylem',
            'related_synonym',
            'exact_synonym',
            'broad_synonym',
            'narrow_synonym',
            'namespace',
            'consider',
            'replaced_by',
            'union_of',
        )

        def quoted_text(text):
            """Return what lies between the first and the last double quote."""
            return text[text.index('"') + 1:text.rindex('"')]

        # initialize the KB
        kb = KnowledgeBase()
        kb.name = kb_name

        for chunk in KBLoader._chunkify(file_util.read_lines(kb_filename),
                                        KBLoader.OBO_ENTITY_START_TAG):
            # instantiate an empty entity.
            entity = KBEntity()

            # list of KBRelations to add
            relations = []

            for line in chunk:
                if line.startswith('id: '):
                    # research_entity_id
                    entity.research_entity_id = line[len('id: '):]
                elif line.startswith('name: '):
                    # canonical_name
                    entity.canonical_name = line[len('name: '):].replace(
                        '_', ' ')
                    entity.aliases.append(entity.canonical_name)
                elif line.startswith('def: '):
                    # definition
                    entity.definition = quoted_text(line)
                elif line.startswith('synonym: '):
                    # other aliases
                    entity.aliases.append(quoted_text(line))
                elif line.startswith('is_a: '):
                    # is_a relationships
                    if not entity.research_entity_id:
                        raise AssertionError(
                            'relation before entity id: ' + line)
                    splits = line.strip().split(' ')
                    if len(splits) <= 1:
                        raise AssertionError('malformed is_a line: ' + line)
                    target_research_entity_id = splits[1]
                    # NOTE(review): is_a is stored as symmetric here, while
                    # the XML loaders in this file store 'subClassOf' as
                    # non-symmetric. Left unchanged; confirm the intent.
                    relation = KBRelation(relation_type='is_a',
                                          entity_ids=[
                                              entity.research_entity_id,
                                              target_research_entity_id
                                          ],
                                          symmetric=True)
                    relations.append(relation)
                elif line.startswith('relationship: '):
                    # other relationships
                    if not entity.research_entity_id:
                        raise AssertionError(
                            'relation before entity id: ' + line)
                    splits = line.split(' ')
                    if len(splits) <= 2:
                        raise AssertionError(
                            'malformed relationship line: ' + line)
                    relation_type = splits[1]
                    target_research_entity_id = splits[2]
                    # is the relation symmetric?
                    if relation_type in KBLoader.OBO_ASYM_RELATION_SET:
                        symmetric = False
                    elif relation_type in KBLoader.OBO_SYM_RELATION_SET:
                        symmetric = True
                    else:
                        # unknown relation type: must abort here, otherwise
                        # `symmetric` would be unset or left over from an
                        # earlier line
                        logging.info('unknown relation type: ' + relation_type)
                        raise AssertionError(
                            'unknown relation type: ' + relation_type)
                    relation = KBRelation(relation_type=relation_type,
                                          entity_ids=[
                                              entity.research_entity_id,
                                              target_research_entity_id
                                          ],
                                          symmetric=symmetric)
                    relations.append(relation)
                elif line.startswith(ignored_prefixes):
                    # deliberately ignored property
                    pass
                else:
                    # unknown obo property.
                    logging.info('unknown OBO property: ' + line)
                    raise AssertionError('unknown OBO property: ' + line)

            # add relations to entity and to kb
            for rel in relations:
                kb.add_relation(rel)
                rel_index = len(kb.relations) - 1
                entity.relation_ids.append(rel_index)

            # add entity to kb
            kb.add_entity(entity)

        return kb