Exemple #1
0
 def _make_mesh_entity(entity_chunk):
     """
     Make a KBEntity from one MeSH chunk.

     Every line is expected in ``KEY = VALUE`` form. Lines that do not
     contain the `` = `` separator are skipped. Recognised keys:

     * ``UI``: stored as the research entity id
     * ``MH`` / ``SH``: stored as the canonical name and added as an alias
     * ``ENTRY`` / ``PRINT ENTRY``: the text before the first ``|`` is
       added as an alias
     * ``MS``: stored as the definition

     Any other key is ignored.

     :param entity_chunk: iterable of text lines belonging to one MeSH record
     :return: the populated KBEntity
     """
     entity = KBEntity()
     for line in entity_chunk:
         # Split on the FIRST separator only: a value may itself contain
         # " = " (e.g. inside a scope note) and must not cause the whole
         # line to be dropped.
         key, separator, value = line.partition(" = ")
         if not separator:
             continue
         if key == 'UI':
             entity.research_entity_id = value
         elif key in ('MH', 'SH'):
             entity.canonical_name = value
             entity.aliases.append(value)
         elif key in ('ENTRY', 'PRINT ENTRY'):
             # only the leading term is kept; the "|"-separated tail is dropped
             entity.aliases.append(value.split("|")[0])
         elif key == 'MS':
             entity.definition = value
     return entity
    def import_dbpedia(kb_name, kb_filename, entities_count=0):
        """
        Build a KnowledgeBase from a dbpedia turtle (.ttl) file.

        Each triple of the parsed graph whose subject starts with
        ``http://dbpedia.org/resource/`` yields one entity: the subject URI
        is its id, the remainder of the URI (underscores turned into spaces)
        is its canonical name and only alias, and the triple's object is its
        definition. The triple's predicate is not inspected.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param kb_filename: location of the turtle file; must contain '.ttl'.
            It is passed through file_util.cache_file before parsing.
        :param entities_count: when greater than zero, stop after this many
            entities have been added; zero (the default) means no limit
        :return: the populated KnowledgeBase
        """
        resource_prefix = 'http://dbpedia.org/resource/'

        # set up the empty KB.
        kb = KnowledgeBase()
        kb.name = kb_name

        # turtle is the only supported serialization for this kb.
        assert ('.ttl' in kb_filename)
        local_path = file_util.cache_file(kb_filename)

        # load all triples into memory.
        graph = rdflib.Graph()
        graph.parse(local_path, format='turtle')
        logging.warning('done parsing dbpedia .ttl files.')

        added = 0
        for subject, _predicate, obj in graph:
            entity = KBEntity()
            uri = str(subject)
            entity.research_entity_id = uri
            if not uri.startswith(resource_prefix):
                # not a dbpedia resource; nothing is added for this triple.
                continue

            name = uri[len(resource_prefix):].replace('_', ' ')
            entity.canonical_name = name
            entity.aliases.append(name)
            entity.definition = str(obj)

            # the KB validates the entity as part of adding it.
            kb.add_entity(entity)
            added += 1
            if 0 < entities_count <= added:
                break
        return kb
    def import_obo_kb(kb_name, kb_filename):
        """
        Create a KnowledgeBase object with entities and relations from an OBO file.

        The lines of the file are grouped into per-entity chunks with
        KBLoader._chunkify (using KBLoader.OBO_ENTITY_START_TAG). Each chunk
        becomes one KBEntity; its ``is_a`` and ``relationship`` lines become
        KBRelations that are added to the KB and referenced from the entity
        by their index in ``kb.relations``.

        :param kb_name: name assigned to the returned KnowledgeBase
        :param kb_filename: OBO file where KB is located
        :return: the populated KnowledgeBase
        :raises AssertionError: on an unknown OBO property or relation type,
            on a relation line that appears before the entity's ``id`` line,
            or on a relation line with too few fields
        :raises ValueError: if a ``def`` or ``synonym`` line contains no
            double quote
        """
        # properties that don't map naturally to the unified schema.
        ignored_prefixes = (
            'intersection_of: ',
            'is_obsolete: ',
            'comment: ',
            'disjoint_from: ',
            'alt_id: ',
            'xref: ',
            'property_value: has_rank',
            'subset: ',
            'xref_analog',
            'xylem',
            'related_synonym',
            'exact_synonym',
            'broad_synonym',
            'narrow_synonym',
            'namespace',
            'consider',
            'replaced_by',
            'union_of',
        )

        def quoted_text(text):
            # text between the first and the last double quote of the line.
            return text[text.index('"') + 1:text.rindex('"')]

        # initialize the KB
        kb = KnowledgeBase()
        kb.name = kb_name

        for chunk in KBLoader._chunkify(file_util.read_lines(kb_filename),
                                        KBLoader.OBO_ENTITY_START_TAG):
            # instantiate an empty entity.
            entity = KBEntity()

            # list of KBRelations to add
            relations = []

            # NOTE: validation failures below are raised explicitly instead
            # of via `assert`, so they are not stripped under `python -O`.
            for line in chunk:
                if line.startswith('id: '):
                    # research_entity_id
                    entity.research_entity_id = line[len('id: '):]
                elif line.startswith('name: '):
                    # canonical_name
                    entity.canonical_name = line[len('name: '):].replace(
                        '_', ' ')
                    entity.aliases.append(entity.canonical_name)
                elif line.startswith('def: '):
                    # definition
                    entity.definition = quoted_text(line)
                elif line.startswith('synonym: '):
                    # other aliases
                    entity.aliases.append(quoted_text(line))
                elif line.startswith('is_a: '):
                    # is_a relationships
                    if not entity.research_entity_id:
                        raise AssertionError(
                            'is_a line before entity id: ' + line)
                    splits = line.strip().split(' ')
                    if len(splits) <= 1:
                        raise AssertionError('malformed is_a line: ' + line)
                    target_research_entity_id = splits[1]
                    # NOTE(review): is_a is recorded as symmetric although the
                    # relation looks directional; kept as-is, confirm intent.
                    relations.append(
                        KBRelation(relation_type='is_a',
                                   entity_ids=[
                                       entity.research_entity_id,
                                       target_research_entity_id
                                   ],
                                   symmetric=True))
                elif line.startswith('relationship: '):
                    # other relationships
                    if not entity.research_entity_id:
                        raise AssertionError(
                            'relationship line before entity id: ' + line)
                    # strip first (as the is_a branch does) so trailing
                    # whitespace never ends up inside the target id.
                    splits = line.strip().split(' ')
                    if len(splits) <= 2:
                        raise AssertionError(
                            'malformed relationship line: ' + line)
                    relation_type = splits[1]
                    target_research_entity_id = splits[2]
                    # is the relation symmetric?
                    if relation_type in KBLoader.OBO_ASYM_RELATION_SET:
                        symmetric = False
                    elif relation_type in KBLoader.OBO_SYM_RELATION_SET:
                        symmetric = True
                    else:
                        # unknown relation type
                        logging.info('unknown relation type: ' + relation_type)
                        raise AssertionError(
                            'unknown relation type: ' + relation_type)
                    relations.append(
                        KBRelation(relation_type=relation_type,
                                   entity_ids=[
                                       entity.research_entity_id,
                                       target_research_entity_id
                                   ],
                                   symmetric=symmetric))
                elif line.startswith(ignored_prefixes):
                    # deliberately skipped, see ignored_prefixes above.
                    pass
                else:
                    # unknown obo property.
                    logging.info('unknown OBO property: ' + line)
                    raise AssertionError('unknown OBO property: ' + line)

            # add relations to entity and to kb
            for rel in relations:
                kb.add_relation(rel)
                # the relation just added is the last one in kb.relations.
                rel_index = len(kb.relations) - 1
                entity.relation_ids.append(rel_index)

            # add entity to kb
            kb.add_entity(entity)

        return kb