Example #1
person_string = nlp.vocab.strings[person_hash]
print(person_string)

########## As you might observe, the nlp object is universal (not dependent on the text); it's the doc object that
# carries the info about our particular use case.
# The hashes it generates are universal too: even if we look up the hash of a common English word like "ball"
# that doesn't appear in the doc text, we still get a value back.
# Looking up a string always returns its hash; the reverse lookup (hash -> string) only works for strings
# the vocab already knows (or that we add explicitly).
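# A minimal sketch of the two lookup directions (assumes nlp is already loaded as above):
ball_hash = nlp.vocab.strings["ball"]  # string -> hash always works (it's just a hash function)
print(ball_hash)
nlp.vocab.strings.add("ball")          # add the string explicitly so the reverse lookup is guaranteed to work
print(nlp.vocab.strings[ball_hash])    # hash -> string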

#### creating Doc and Span objects from the built-in classes
from spacy.tokens import Doc, Span
# suppose the text is the same as in the previous example
words = ["David", "Bowie", "is", "a", "PERSON", "!"]
spaces_after_all_tokens = [True, True, True, True, False, False]

doc = Doc(nlp.vocab, words, spaces_after_all_tokens)
print(doc.text)

span = Span(doc, 0, 2, label="PERSON")
print(span.text, span.label_)

print(type(doc.ents))
doc.ents = [span]
# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

############# Spacy's word vectors (300d) and similarities ##############
# you'll need the large or medium model for this
# Load the en_core_web_md model
nlp = spacy.load('en_core_web_md')
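# A quick sketch of what the 300d vectors give us (assumes en_core_web_md is installed):
doc1 = nlp("I like pizza")
doc2 = nlp("I enjoy pasta")
print(doc1.similarity(doc2))   # similarity score as a float
print(doc1[2].vector.shape)    # (300,) -- the word vector for "pizza"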
Example #2
def test_issue912(en_vocab, text, tag, lemma):
    """Test base-forms are preserved."""
    doc = Doc(en_vocab, words=[text])
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma
def main(json_loc: Path,
         train_file: Path,
         dev_file: Path,
         test_file: Path,
         test_split=0.189,
         train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  #how many relations span more tokens than allowed
    error_count_rel = 0  #how often the label is something other than ARG0, ARG1, ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  #one recipe
            span_starts = set()

            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]
                             ]  #list containing all words
                    spaces = [
                        t["ws"] for t in example["tokens"]
                    ]  #list containing ws is behind word (ws = True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)

                    # Parse the entities
                    spans = example[
                        "spans"]  #list of dicts containing entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  #every detected span
                        entity = doc.char_span(
                            span["start"], span["end"], label=span["label"]
                        )  #"start" = wievielter character ist start character des spans im doc
                        span_end_to_start[span["token_end"]] = span[
                            "token_start"]  #end_token of span as key for start_token (start token = wievielter token in doc)
                        entities.append(entity)  #appended to list
                        span_starts.add(span["token_start"])  #added to set
                        ents_dict[span["token_start"]] = (span["label"],
                                                          span["token_start"])
                    doc.ents = entities  #entity list assigned as doc entities

                    # Parse the relations
                    rels = {}

                    # create token combinations
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRĂ„P"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:

                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}

                                            else:
                                                pass
                                        #DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                rels[(x1, x2)] = {
                                                }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        #VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:

                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}

                                    else:
                                        pass
                                #DIFF_FRONT_BACK 2b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        rels[(x1, x2)] = {
                                        }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...

                    relations = example[
                        "relations"]  #relations is list of dict
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span
                        # but we want the first token
                        start = span_end_to_start[relation[
                            "head"]]  #wievielter token ist start token des head
                        end = span_end_to_start[relation[
                            "child"]]  #wievielter token ist start token des child
                        label = relation["label"]

                        #DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][
                                        0]]  #assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1

                        #DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[
                                label]  #MAP_LABELS = dict containing label as key

                        # Positive relations are being added
                        try:
                            if label not in rels[(
                                    start, end
                            )]:  #check if label already exists for token combination
                                rels[(
                                    start, end
                                )][label] = 1.0  #initialize label as new key with value 1.0
                                pos += 1  #positive case
                        except KeyError:
                            long_rel_count += 1  #only raised if the relation exists in the annotation but isn't a valid token combination (too long / not starting from a verb)

                    # The annotation is complete, so fill in zeros where the data is missing
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRĂ„P"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                #DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                            #DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0

                                        #DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                #DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                                #DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0  #span combination with label as key gets 0 as value
                        #VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        #DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                    #DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                                #DIFF_FRONT_BACK 3b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        #DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        #DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                    #print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}

                    # only keep documents with at least 1 positive case (if the doc isn't annotated, relations is an empty list)
                    if pos > 0:

                        recipe_id = example["_input_hash"]

                        if len(docs["train"]) < round(
                                train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(
                                test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg

                except KeyError as e:
                    msg.fail(
                        f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                    )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
Example #4
    def __call__(self, text):
        words = text.split(" ")
        return Doc(self.vocab, words=words)
Example #5
def test_issue600():
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    doc = Doc(vocab, words=["hello"])
    doc[0].tag_ = "NN"
Example #6
def test_matcher_match_end(matcher):
    words = ["I", "like", "java"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
Example #7
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
Example #8
def test_vectors_lexeme_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    lex = vocab[text[0]]
    assert lex.similarity(doc) == doc.similarity(lex)
    assert -1.0 < lex.similarity(doc) < 1.0
Example #9
def test_vectors_span_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    with pytest.warns(UserWarning):
        assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
        assert -1.0 < doc[0:2].similarity(doc) < 1.0
Example #10
def test_vectors_token_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    assert doc[0].similarity(doc) == doc.similarity(doc[0])
    assert -1.0 < doc[0].similarity(doc) < 1.0
Example #11
def test_vectors_lexeme_span_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    lex = vocab[text[0]]
    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
    assert -1.0 < doc.similarity(doc[1:3]) < 1.0
Example #12
def test_vectors_span_vector(vocab, text):
    span = Doc(vocab, words=text)[0:2]
    assert list(span.vector)
    assert span.vector_norm
Example #13
def test_vectors_doc_vector(vocab, text):
    doc = Doc(vocab, words=text)
    assert list(doc.vector)
    assert doc.vector_norm
def leer_texto(texto):
    """Funcion auxiliar para leer un archivo de texto"""
    with open(texto, 'r', encoding="latin-1") as text:
        return text.read()

    #we create our <procesador> with the spacy.load function and pass the name of our model as the parameter (note: procesador is a callable!)


#procesador = spacy.load('es_core_news_lg')

#we call the leer_texto() method with the name of the file we want to process,
#and store the resulting string in the variable <texto>

#texto = leer_texto("Sirena.txt")
#we run our procesador on the text
texto_procesado = Doc(Vocab()).from_disk("texto_procesado")
#texto_procesado.to_disk("texto_procesado")
##procesador.to_disk("procesador")

#We now have an object called texto_procesado, in which every word of the text has been [tokenized]
#We can pick a random word of a given [tag] type from among the tokens.
#we use the following template to create the corresponding method:
"""
            def NOMBRE_METODO():
                return random.choice([token.orth_ for token in texto_procesado if token.tag_ == 'TIPO DE TAG'])
"""
#NOMBRE_METODO = the name we want to give the method for that kind of word
#TIPO DE TAG = a tag describing the word type
# the list of types:
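
# A hypothetical instance of the template above (the method name and the 'NOUN' tag value are
# illustrative and depend on the model's tag set):
import random

def sustantivo_aleatorio():
    return random.choice([token.orth_ for token in texto_procesado if token.tag_ == 'NOUN'])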

Example #15
def test_retokenize_disallow_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[1:1])
Example #16
def test_vectors_doc_doc_similarity(vocab, text1, text2):
    doc1 = Doc(vocab, words=text1)
    doc2 = Doc(vocab, words=text2)
    assert doc1.similarity(doc2) == doc2.similarity(doc1)
    assert -1.0 < doc1.similarity(doc2) < 1.0
Example #17
def read_data(nlp,
              conllu_file,
              text_file,
              raw_text=True,
              oracle_segments=False,
              max_doc_length=None,
              limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.'''
    if not raw_text and not oracle_segments:
        raise ValueError(
            "At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                if '.' in id_:
                    continue
                if '-' in id_:
                    continue
                id_ = int(id_) - 1
                head = int(head) - 1 if head != '0' else id_
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                docs.append(
                    Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(
                    sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
Example #18
def test_vectors_similarity_DD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc1 = Doc(vocab, words=[word1, word2])
    doc2 = Doc(vocab, words=[word2, word1])
    assert isinstance(doc1.similarity(doc2), float)
    assert doc1.similarity(doc2) == doc2.similarity(doc1)
Example #19
def test_matcher_match_middle(matcher):
    words = ["I", "like", "Google", "Now", "best"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
Example #20
def test_vectors_similarity_DS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    assert isinstance(doc.similarity(doc[:2]), float)
    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
Example #21
def test_matcher_no_match(matcher):
    doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
    assert matcher(doc) == []
Example #22
if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

acronym_dictionary = dict()
entities = set()

lt = LoopTimer(update_after=1, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():

    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
        for ent in sentence.ents:
            entities.add(ent.text.lower())
            definition_span = find_definition_candidate(sentence, ent)
            if definition_span is not None:
                acronym_string = ent.text.lower()
                acronym_orth = [token.orth_ for token in ent]

                d_string = definition_span.text.lower()
                d_orth = [token.orth_.lower() for token in definition_span]
                d_orth_with_ws = [token.text_with_ws.lower() for token in definition_span]
                d_lemma = [token.lemma_.lower() for token in definition_span]
                d_lemma_with_ws = [f"{token.lemma_.lower()}{token.whitespace_}" for token in definition_span]
Example #23
def test_issue589():
    vocab = Vocab()
    vocab.strings.set_frozen(True)
    doc = Doc(vocab, words=["whata"])
    assert doc
def get_spans(word_seqs):
    vocab = Vocab()
    docs = [Doc(vocab, words=words) for words in word_seqs]
    return [doc[:] for doc in docs]
Example #25
def test_issue743():
    doc = Doc(Vocab(), ["hello", "world"])
    token = doc[0]
    s = set([token])
    items = list(s)
    assert items[0] is token
Example #26
def doc(vocab):
    return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
from spacy.tokens import Doc, Span
from spacy.lang.en import English

nlp = English()

# Create doc
doc = Doc(nlp.vocab,
          words=["I", "like", "David", "Bowie"],
          spaces=[True, True, True, False])
print(doc)
# create span for David Bowie
span = Span(doc, 2, 4, label=nlp.vocab.strings["PERSON"])

# add span to doc entities
doc.ents = [span]

# print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])
Example #28
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == len(words) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # Test that IOB stays consistent with provided IOB
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    with doc.retokenize() as retokenizer:
        attrs = {"ent_type": "ent-abc", "ent_iob": 1}
        retokenizer.merge(doc[0:3], attrs=attrs)
        retokenizer.merge(doc[3:5], attrs=attrs)
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # if no parse/heads, the first word in the span is the root and provides
    # default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-fg"
    ents[6] = "I-ent-fg"
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-de"
    ents[6] = "I-ent-de"
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"
Example #29
def test_matcher_match_start(matcher):
    doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
    assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
Example #30
    def __call__(self, text):
        words = text.split()
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)