Example #1
from spacy.lang.es import Spanish
from spacy_lookup import Entity

nlp = Spanish()
# spacy_lookup matches keywords with flashtext, case-insensitively by default,
# so 'pera en Dulce' also matches 'pera en dulce'; note that the unaccented
# 'tentacion' will likely not match 'tentación' in the text below
entity = Entity(nlp,
                keywords_list=['pera en Dulce', 'manzana', 'tentacion'],
                label='FOOD')
nlp.add_pipe(entity, name='Food')  # spaCy v2 add_pipe API (component object)
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')
text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for token in doc:
    print(token.text, token._.is_entity, token.ent_type_)
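
The ent_type_ column above indicates that spacy_lookup writes its matches into
the standard entity annotations, so the grouped spans should also be available
on doc.ents; a quick sketch (not verified against the library):

# Grouped entity spans; assuming the lookup components write to doc.ents,
# this should print pairs like ('manzana', 'FOOD')
for ent in doc.ents:
    print(ent.text, ent.label_)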
Example #2
from spacy.lang.es import Spanish


def getSentences(text):
    # Note: this rebuilds the pipeline on every call; construct nlp once at
    # module level if the function is called repeatedly
    nlp = Spanish()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # spaCy v2 API
    document = nlp(text)
    return [sent.text.strip() for sent in document.sents]
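
A quick usage sketch with an illustrative sample text:

sentences = getSentences(
    "Me gusta la manzana. También me gusta la pera en dulce. ¿Y a ti?")
for s in sentences:
    print(s)
# Expected: one line per sentence, surrounding whitespace stripped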
Example #3
from spacy.lang.es import Spanish
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# COUNTRIES (a list of country names) and CAPITALS (a dict mapping country
# name -> capital city) are assumed to be loaded elsewhere; stand-in
# definitions follow this example
nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))  # spaCy v2 signature


def countries_component(doc):
    # Create an entity Span with the label "LOC" for each match
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline (spaCy v2 add_pipe API)
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in a dictionary mapping countries
# to their capital cities
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span attribute extension "capital" with the
# getter get_capital
Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, the label and the
# "capital" attribute for each entity
doc = nlp("La República Checa podría ayudar a la República Eslovaca "
          "a proteger su espacio aéreo")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
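
The example leaves COUNTRIES and CAPITALS undefined; minimal stand-ins with
illustrative data (they must be defined before the matcher.add call above):

# Illustrative stand-ins; a real setup would load fuller data, e.g. from JSON
COUNTRIES = ["República Checa", "República Eslovaca"]
CAPITALS = {"República Checa": "Praga", "República Eslovaca": "Bratislava"}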
Example #4
    # (Excerpt: presumably the tail of build_annotation_file; imports for os,
    # SequenceTagger and Spanish are not shown in this snippet.)
    # Output the annotation file: "<doc>.txt" -> "<doc>.ann"
    annotation_filepath = './evaluation/NER/' + doc[:-3] + 'ann'

    with open(annotation_filepath, 'w', encoding='utf-8') as annotation_file:
        annotation_file.write(output)  # the with statement closes the file


if __name__ == "__main__":
    # Load the trained flair SequenceTagger model
    model = SequenceTagger.load(
        'resources/taggers/medium_updated/final-model.pt')

    ## Create a Spanish sentence segmenter with spaCy
    # spaCy's sentence segmentation is non-destructive, i.e. empty lines are
    # preserved, so character offsets of a given word/entity still index into
    # the original text
    nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")  # spaCy v2 API
    nlp.add_pipe(sentencizer)

    test_dir = "./data/datasets/test-background-set-to-publish/"

    if not os.path.exists("./evaluation/NER/"):
        os.makedirs("./evaluation/NER/")

    # For each document in test_dir, build the corresponding annotation file
    # with the predicted entities
    for doc in os.listdir(test_dir):
        doc_filepath = test_dir + doc
        build_annotation_file(doc, doc_filepath, model)
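
build_annotation_file itself is not part of this excerpt. A hypothetical
sketch of its prediction step, assuming the flair v0.x API (Sentence,
model.predict, sentence.get_spans('ner')) and BRAT-style standoff output,
which the .ann extension suggests:

from flair.data import Sentence

def predict_standoff(text, model, nlp):
    # Hypothetical helper: segment the document with the spaCy sentencizer,
    # tag each sentence with the flair model, and emit BRAT-style lines
    # ("T1<TAB>LABEL start end<TAB>text")
    lines, idx = [], 1
    for sent in nlp(text).sents:
        sentence = Sentence(sent.text)
        model.predict(sentence)
        for span in sentence.get_spans('ner'):
            # flair offsets are sentence-relative; shift by the sentence's
            # start_char so they index into the full document text
            start = sent.start_char + span.start_pos
            end = sent.start_char + span.end_pos
            lines.append(f"T{idx}\t{span.tag} {start} {end}\t{span.text}")
            idx += 1
    return "\n".join(lines)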