Beispiel #1
0
def test_kb_serialize_vocab(nlp):
    """Check that a custom entity string survives a KB round trip to disk.

    An entity ID that is unknown to ``nlp.vocab`` is added to a KB built on
    that vocab. The KB is written to a temporary directory and read back into
    a second KB created with a fresh ``Vocab``; the entity string must then be
    present in the second KB's string store.
    """
    entity_id = "MyFunnyID"
    assert entity_id not in nlp.vocab.strings

    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not source_kb.contains_entity(entity_id)
    source_kb.add_entity(entity_id, freq=342, entity_vector=[3])
    assert source_kb.contains_entity(entity_id)
    # Adding the entity must also register its string in the KB's vocab.
    assert entity_id in source_kb.vocab.strings

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        # Plain write followed by a read into a KB with an empty vocab.
        source_kb.to_disk(kb_path)
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)
        assert entity_id in restored_kb.vocab.strings
print(len(desc_enc))

# Register aliases (synonyms) for the entities. The full names go in first:
# each one resolves to exactly one QID, so there is no ambiguity and the
# prior probability is 1.
for qid, name in name_dict.items():
    kb.add_alias(alias=name, entities=[qid],
                 probabilities=[1])  # 100% prior probability P(entity|alias)

aliases = {}
words = []
# Additional aliases come from a tab-separated file: QID in the first column,
# alias name in the second. The encoding is given explicitly so decoding does
# not depend on the platform's locale default.
with open('disease_alieases.tsv', 'r', encoding='utf-8') as fr:
    for row in fr:
        row = row.strip().split('\t')
        if len(row) < 2:
            # Blank line or a line without a name column: nothing to register.
            continue
        qid = row[0]
        name = row[1]
        # Only entities already present in the KB can receive an alias.
        if kb.contains_entity(qid):
            aliases[name] = qid
            kb.add_alias(
                alias=name, entities=[qid],
                probabilities=[1])  # 100% prior probability P(entity|alias)

print("Checking KB ...")
print(kb.contains_entity('MONDO:0000001'))

annots = parse_annots('dailymed_disease3_L.jsonl')

entity_labels = []
# Walk every annotated record and each of its spans.
for text, res in annots.items():
    t = res['text']
    for span in res['spans']:
        s = span['start']
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe
    new_model_name (str): name of the sub-directory created under output_dir;
        also stored as the model's meta name
    cg_threshold (float): accepted for interface compatibility; not read by
        this function
    n_iter (int): accepted for interface compatibility; not read by this
        function
    verbose (bool): when False, the Printer's loading animation is hidden


    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

    Note: the "probabilities" field of aliases.jsonl is not read here; every
    alias gets a uniform prior over those of its entities that are in the KB.
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # Accept plain strings as well as Path objects for the KB directory.
    kb_dir = Path(kb_dir)

    if output_dir is not None:
        output_dir = Path(output_dir) / new_model_name
        # exist_ok avoids the race between checking for the directory and
        # creating it.
        output_dir.mkdir(parents=True, exist_ok=True)

    entities = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    aliases = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # set up the data
    entity_ids = [e["id"] for e in entities]
    # A missing description is treated as an empty text.
    descriptions = [e.get("description", "") for e in entities]
    # Every entity is registered with the same fixed frequency.
    entity_freq = 100

    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # The entity vector is the vector of the tokenized description Doc.
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        for entity, embedding in zip(entity_ids, embeddings):
            # Duplicate ids in the input are skipped; the first one wins.
            if not kb.contains_entity(entity):
                kb.add_entity(entity, entity_freq, embedding)

        for a in aliases:
            # Keep only the entities that made it into the KB.
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"], entities=ents, probabilities=prior_prob)

        msg.good("Done adding entities and aliases to kb")

    msg.divider("Create ANN Index")

    cg = CandidateGenerator().fit(kb.get_alias_strings(), verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    # NOTE(review): output_dir is checked for None above but used
    # unconditionally here - confirm whether None is a supported value.
    nlp.to_disk(output_dir)
    # Reload from the directory just written.
    nlp.from_disk(output_dir)
Beispiel #4
0
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe
    new_model_name (str): name of the sub-directory created under output_dir;
        also stored as the model's meta name
    cg_threshold (float): accepted for interface compatibility; not read by
        this function
    n_iter (int): accepted for interface compatibility; not read by this
        function
    verbose (bool): when False, the Printer's loading animation is hidden


    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    An entity may also carry an optional "label" key, which is mapped through
    kb_type_vs_index and stored in the KB's freq field.

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # Accept plain strings as well as Path objects for the KB directory.
    kb_dir = Path(kb_dir)

    if output_dir is not None:
        output_dir = Path(output_dir) / new_model_name
        # exist_ok avoids the race between checking for the directory and
        # creating it.
        output_dir.mkdir(parents=True, exist_ok=True)

    # Both files are read fully up front so the progress bars know their totals.
    entity_records = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    alias_records = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # Vector used for entities that have no description.
    empty_vector = nlp.make_doc('').vector

    for entity in tqdm(entity_records,
                       desc='Adding entities to KB',
                       total=len(entity_records)):
        entity_id = entity['id']
        # Duplicate ids in the input are skipped; the first one wins.
        if kb.contains_entity(entity_id):
            continue
        if 'description' in entity:
            embedding = nlp.make_doc(entity['description']).vector
        else:
            embedding = empty_vector
        # A missing or falsy label stays 0; any other label is translated to
        # its index.
        label = entity.get('label', 0)
        if label:
            label = kb_type_vs_index[label]
        # TODO: Add a proper "label" field (repurposed freq field as the type label)
        kb.add_entity(entity=entity_id, freq=label, entity_vector=embedding)

    for alias in tqdm(alias_records,
                      desc="Setting kb entities and aliases",
                      total=len(alias_records)):
        # Keep only the entities that made it into the KB.
        known_entities = [e for e in alias["entities"] if kb.contains_entity(e)]
        num_entities = len(known_entities)
        if num_entities == 0:
            continue
        # The file's probabilities are used only when they line up
        # one-to-one with the retained entities. Otherwise (key missing, or
        # some entities were filtered out) a uniform prior is used.
        file_probabilities = alias.get('probabilities') or []
        if len(file_probabilities) == num_entities:
            prior_probabilities = file_probabilities
        else:
            prior_probabilities = [1.0 / num_entities] * num_entities
        kb.add_alias(alias=alias["alias"],
                     entities=known_entities,
                     probabilities=prior_probabilities)

    msg.divider("Create ANN Index")
    alias_strings = kb.get_alias_strings()
    cg = CandidateGenerator().fit(alias_strings, verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    # NOTE(review): output_dir is checked for None above but used
    # unconditionally here - confirm whether None is a supported value.
    nlp.to_disk(output_dir)
    # Reload from the directory just written.
    nlp.from_disk(output_dir)