Example #1
def test_issue1242():
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1
Example #2
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
Example #3
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
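The test above only checks that passing n_threads triggers a DeprecationWarning. As a minimal sketch of the replacement (assuming spaCy v3, where batching is controlled with batch_size and multiprocessing with n_process):
from spacy.lang.en import English

nlp = English()
texts = ["Hello world", "This is a test"]
# batch_size controls how many texts are buffered per batch;
# n_process=1 keeps everything in the main process.
docs = list(nlp.pipe(texts, batch_size=50, n_process=1))
assert len(docs) == 2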
Example #4
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1
Example #5
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries) # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
Example #6
def test_issue1494():
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected
Example #7
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline

    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
Example #8
def test_issue1506():
    def string_generator():
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "I erase some hbdsaj lemmas."
        for _ in range(10001):
            yield "I erase lemmas."
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "It's sentence produced by that bug."

    nlp = English()
    for i, d in enumerate(nlp.pipe(string_generator())):
        # We should run cleanup more than once to actually clean up the data.
        # On the first run, cleanup only marks strings as "not hit".
        if i == 10000 or i == 20000 or i == 30000:
            gc.collect()
        for t in d:
            str(t.lemma_)
Example #9
def test_issue1488():
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text
Example #10
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
    p_iter (iter): iter over strings to normalize and tokenize.
    p_batch_size (int): number of batches.
    p_thread_count (int): number of threads running.

    Returns:
    iter: iter over normalized and tokenized string.
    """

    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)

    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
Example #11
def identify_KEEL_LAID_in_text(text):
    nlp = English()
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)
    tokens_in_doc_count = len(doc)

    #
    # START - spaCy patterns
    #

    matcher.add("KEEL_LAID", [[{
        "LOWER": {
            "IN": ["kjølstrukk", "kjølstrukket"]
        }
    }]])

    matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}])

    #
    # END - spaCy patterns
    #

    result = []

    for match_id, token_start, token_end in matcher(doc):

        match_id_as_string = nlp.vocab.strings[match_id]
        final_token_start = token_start
        final_token_end = token_end

        spacy_pattern_detection = doc[token_start:token_end]
        spacy_pattern_detection_as_lower_text = spacy_pattern_detection.text.lower(
        )

        #
        # Expand?
        #

        if match_id_as_string == "DATE" and token_start > 0:

            # At this point, DATE is just a year string. Example: 2021

            prev_word_1_token_number = token_start - 1
            prev_word_1_token = doc[prev_word_1_token_number]
            if prev_word_1_token.text in ("januar", "februar", "mars", "april",
                                          "mai", "juni", "juli", "august",
                                          "september", "oktober", "november",
                                          "desember"):
                final_token_start = prev_word_1_token_number  # expanding

                # Expand more?
                prev_word_2_token_number = token_start - 2
                prev_word_2_token = doc[prev_word_2_token_number]
                prev_word_3_token_number = token_start - 3
                prev_word_3_token = doc[prev_word_3_token_number]

                if prev_word_2_token.text == "." and is_int(
                        prev_word_3_token.text):
                    final_token_start = prev_word_3_token_number  # expanding

                    #
                    # convert token_span to char_span.
                    # char_span is needed to display correctly with displacy.render().
                    #
                    span = doc[final_token_start:final_token_end]
                    span_char_start = span[0].idx
                    span_char_end = span[-1].idx + len(span[-1].text)

                    # return result
                    identified_entity = {
                        'start': span_char_start,
                        'end': span_char_end,
                        'label': match_id_as_string
                    }
                    result.append(identified_entity)

                    #
                    # Identify prefix or suffix
                    #

                    if final_token_start > 0:

                        prev_word_1_token_number = final_token_start - 1
                        prev_word_1_token = doc[prev_word_1_token_number]

                        if prev_word_1_token.text.lower() == "før":

                            # Prefix detected.

                            #
                            # convert token_span to char_span.
                            # char_span is needed to display correctly with displacy.render().
                            #
                            span = doc[
                                prev_word_1_token_number:final_token_start]
                            span_char_start = span[0].idx
                            span_char_end = span[-1].idx + len(span[-1].text)

                            # return result
                            identified_entity = {
                                'start': span_char_start,
                                'end': span_char_end,
                                'label': "DATE_PREFIX"
                            }
                            result.append(identified_entity)

                    if ((final_token_end + 1) < tokens_in_doc_count):

                        next_word_1_token_number = final_token_end
                        next_word_1_token = doc[next_word_1_token_number]
                        next_word_2_token_number = final_token_end + 1
                        next_word_2_token = doc[next_word_2_token_number]

                        if (next_word_1_token.text.lower() == "eller" and
                                next_word_2_token.text.lower() == "senere"):

                            # Suffix detected.

                            #
                            # convert token_span to char_span.
                            # char_span is needed to display correctly with displacy.render().
                            #
                            span = doc[next_word_1_token_number:(
                                next_word_1_token_number + 2)]
                            span_char_start = span[0].idx
                            span_char_end = span[-1].idx + len(span[-1].text)

                            # return result
                            identified_entity = {
                                'start': span_char_start,
                                'end': span_char_end,
                                'label': "DATE_SUFFIX"
                            }
                            result.append(identified_entity)

        elif match_id_as_string == "KEEL_LAID":

            #
            # convert token_span to char_span.
            # char_span is needed to display correctly with displacy.render().
            #
            span = doc[final_token_start:final_token_end]
            span_char_start = span[0].idx
            span_char_end = span[-1].idx + len(span[-1].text)

            # return result
            identified_entity = {
                'start': span_char_start,
                'end': span_char_end,
                'label': match_id_as_string
            }
            result.append(identified_entity)

    return result
Example #12
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "I like green eggs.",
        "Here is another one.",
        "I eat ham.",
    ]
    batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([TAG]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Try to unlearn the first 'N' tag with negative annotation
    neg_ex = Example.from_dict(nlp.make_doc(test_text),
                               {"tags": ["!N", "V", "J", "N"]})

    for i in range(20):
        losses = {}
        nlp.update([neg_ex], sgd=optimizer, losses=losses)

    # test the "untrained" tag
    doc3 = nlp(test_text)
    assert doc3[0].tag_ != "N"
Example #13
def test_overfitting_IO(use_upper):
    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.00001

    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"
        # Ensure that the predictions are still the same, even after adding a new label
        ner2 = nlp2.get_pipe("ner")
        assert ner2.model.attrs["has_upper"] == use_upper
        ner2.add_label("RANDOM_NEW_LABEL")
        doc3 = nlp2(test_text)
        ents3 = doc3.ents
        assert len(ents3) == 1
        assert ents3[0].text == "London"
        assert ents3[0].label_ == "LOC"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # test that kb_id is preserved
    test_text = "I like London and London."
    doc = nlp.make_doc(test_text)
    doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)]
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    doc = nlp.get_pipe("ner")(doc)
    ents = doc.ents
    assert len(ents) == 2
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    # ent added by ner has kb_id == 0
    assert ents[1].text == "London"
    assert ents[1].label_ == "LOC"
    assert ents[1].kb_id == 0
Example #14
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn.functional import softmax

sys.path.append("../")
from model.modeling_classification import BertForSequenceClassification, BertForTokenClassification
from model.tokenization import BertTokenizer

logger = logging.getLogger(__name__)
MaskedTokenInstance = collections.namedtuple("MaskedTokenInstance",
                                             ["tokens", "info"])
MaskedItemInfo = collections.namedtuple(
    "MaskedItemInfo",
    ["current_pos", "sen_doc_pos", "sen_right_id", "doc_ground_truth"])
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids


class SC(nn.Module):
    def __init__(self,
                 mask_rate,
                 top_sen_rate,
Example #15
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
Example #16
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    nlp = English()
    nlp_plain = English()
    # load both vec and hashvec tables
    with make_tempdir() as tmpdir:
        p = tmpdir / "test.hashvec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_hashvec_str)
        convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret")
        p = tmpdir / "test.vec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_vec_str)
        convert_vectors(nlp_plain, p, truncate=0, prune=-1)

    word = "der"
    # ngrams: full padded word + padded 2-grams + padded 3-grams
    ngrams = nlp.vocab.vectors._get_ngrams(word)
    assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]
    # rows: 2 rows per ngram
    rows = OPS.xp.asarray(
        [
            h % nlp.vocab.vectors.shape[0] for ngram in ngrams
            for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
        ],
        dtype="uint32",
    )
    assert_equal(
        OPS.to_numpy(rows),
        numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]),
    )
    assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count
    # all vectors are equivalent for plain static table vs. hash ngrams
    for word in nlp_plain.vocab.vectors:
        word = nlp_plain.vocab.strings.as_string(word)
        assert_almost_equal(nlp.vocab[word].vector,
                            nlp_plain.vocab[word].vector,
                            decimal=3)

        # every word has a vector
        assert nlp.vocab[word * 5].has_vector

    # n_keys is -1 for floret
    assert nlp_plain.vocab.vectors.n_keys > 0
    assert nlp.vocab.vectors.n_keys == -1

    # check that single and batched vector lookups are identical
    words = [s for s in nlp_plain.vocab.vectors]
    single_vecs = OPS.to_numpy(
        OPS.asarray([nlp.vocab[word].vector for word in words]))
    batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words))
    assert_equal(single_vecs, batch_vecs)

    # an empty key returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab[""].vector),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )
    # an empty batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
    )
    # an empty key within a batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )

    # the loaded ngram vector table cannot be modified
    # except for clear: warning, then return without modifications
    vector = list(range(nlp.vocab.vectors.shape[1]))
    orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.set_vector("the", vector)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab[word].vector = vector
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.add("the", row=6)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.resize(shape=(100, 10))
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.raises(ValueError):
        nlp.vocab.vectors.clear()

    # data and settings are serialized correctly
    with make_tempdir() as d:
        nlp.vocab.to_disk(d)
        vocab_r = Vocab()
        vocab_r.from_disk(d)
        assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes()
        assert_equal(OPS.to_numpy(nlp.vocab.vectors.data),
                     OPS.to_numpy(vocab_r.vectors.data))
        assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg())
        assert_almost_equal(
            OPS.to_numpy(nlp.vocab[word].vector),
            OPS.to_numpy(vocab_r[word].vector),
            decimal=6,
        )
Example #17
#import packages
from gensim.summarization import keywords
from matplotlib import pyplot
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))
from spacy.lang.en import English
nlp = English()
nlp.max_length = 10000000
import lyricsgenius
import pandas as pd
from textblob import TextBlob
import requests, json
import numpy as np
import matplotlib.ticker as plticker
import seaborn as sns
import time
from datetime import date
import matplotlib.dates as mdates

#define Genius API authentication
api_key = 'SfbYPF1AJ0-lnm6Km8_sIJoebvrIFfRyAGoZqxnRfkZIvP5ceGwBNZa4g0DHayP-'
genius = lyricsgenius.Genius(api_key)
BASE_URL = "https://api.genius.com"


def main():
    artist_name = ""
    while True:
Example #18
def test_partial_links():
    # Test that having some entities on the doc without gold links doesn't crash
    TRAIN_DATA = [(
        "Russ Cochran his reprints include EC Comics.",
        {
            "links": {
                (0, 12): {
                    "Q2146908": 1.0
                }
            },
            "entities": [(0, 12, "PERSON")],
            "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)
    patterns = [
        {
            "label": "PERSON",
            "pattern": [{
                "LOWER": "russ"
            }, {
                "LOWER": "cochran"
            }]
        },
        {
            "label": "ORG",
            "pattern": [{
                "LOWER": "ec"
            }, {
                "LOWER": "comics"
            }]
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # this will run the pipeline on the examples and shouldn't crash
    results = nlp.evaluate(train_examples)
    assert "PERSON" in results["ents_per_type"]
    assert "PERSON" in results["nel_f_per_type"]
    assert "ORG" in results["ents_per_type"]
    assert "ORG" not in results["nel_f_per_type"]
Example #19
def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB - assign the same prior weight to the two Russ Cochrans
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    assert isinstance(entity_linker, EntityLinker)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
    assert "Q2146908" in entity_linker.kb.vocab.strings

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim(
        "nO") == entity_linker.kb.entity_vector_length

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [{
        "label": "PERSON",
        "pattern": [{
            "LOWER": "russ"
        }, {
            "LOWER": "cochran"
        }]
    }]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        for ent in doc.ents:
            predictions.append(ent.kb_id_)
    assert predictions == GOLD_entities

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert "Q2146908" in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert "Q2146908" in entity_linker2.vocab.strings
        assert "Q2146908" in entity_linker2.kb.vocab.strings
        predictions = []
        for text, annotation in TRAIN_DATA:
            doc2 = nlp2(text)
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
Example #20
def test_issue7065_b():
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
    links = {
        (0, 6): {
            "Q7304": 1.0,
            "Q270853": 0.0
        },
        (10, 24): {
            "Q7304": 0.0,
            "Q270853": 1.0
        },
    }
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(doc, {
        "entities": entities,
        "links": links,
        "sent_starts": sent_starts
    })
    train_examples = [example]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",
            entities=["Q270853"],
            probabilities=[1.0],
        )
        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",
            entities=["Q7304"],
            probabilities=[1.0],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Add a custom rule-based component to mimic NER
    patterns = [
        {
            "label": "PERSON",
            "pattern": [{
                "LOWER": "mahler"
            }]
        },
        {
            "label":
            "WORK",
            "pattern": [
                {
                    "LOWER": "symphony"
                },
                {
                    "LOWER": "no"
                },
                {
                    "LOWER": "."
                },
                {
                    "LOWER": "8"
                },
            ],
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # test the trained model - this should not throw E148
    doc = nlp(text)
    assert doc
Example #21
def nlp():
    return English()
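Example #21 looks like a pytest fixture with its decorator stripped; a minimal sketch of how such a fixture is usually declared and consumed (the test function below is hypothetical):
import pytest
from spacy.lang.en import English

@pytest.fixture
def nlp():
    return English()

def test_blank_english(nlp):  # hypothetical consumer of the fixture
    doc = nlp("Hello world")
    assert len(doc) == 2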
Example #22
def test_no_gold_ents(patterns):
    # test that annotating components work
    TRAIN_DATA = [(
        "Kirby is pink",
        {
            "links": {
                (0, 5): {
                    "Q613241": 1.0
                }
            },
            "entities": [(0, 5, "CHARACTER")],
            "sent_starts": [1, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    # Create a ruler to mark entities
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Apply ruler to examples. In a real pipeline this would be an annotating component.
    for eg in train_examples:
        eg.predicted = ruler(eg.predicted)

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        # Placeholder
        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
        mykb.add_alias("pink", ["pink"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker",
                                 config={"use_gold_ents": False},
                                 last=True)
    entity_linker.set_kb(create_kb)
    assert entity_linker.use_gold_ents is False

    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # this will run the pipeline on the examples and shouldn't crash
    nlp.evaluate(train_examples)
Example #23
def test_nel_to_bytes():
    # Test that a pipeline with an EL component can be converted to bytes
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran",
                     entities=["Q2146908"],
                     probabilities=[0.8])
        return kb

    nlp_1 = English()
    nlp_1.add_pipe("ner")
    entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True)
    entity_linker_1.set_kb(create_kb)
    assert entity_linker_1.kb.contains_alias("Russ Cochran")
    assert nlp_1.pipe_names == ["ner", "entity_linker"]

    nlp_bytes = nlp_1.to_bytes()
    nlp_2 = English()
    nlp_2.add_pipe("ner")
    nlp_2.add_pipe("entity_linker", last=True)
    assert nlp_2.pipe_names == ["ner", "entity_linker"]
    assert not nlp_2.get_pipe("entity_linker").kb.contains_alias(
        "Russ Cochran")
    nlp_2 = nlp_2.from_bytes(nlp_bytes)
    kb_2 = nlp_2.get_pipe("entity_linker").kb
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_2.get_vector("Q2146908") == [6, -4, 3]
    assert_almost_equal(
        kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
Example #24
def __init__(self):
    self.nlp = English()
Example #25
def create_spacy_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)
Example #26
response = sound_stuff.start_transcription_job(job_name, object_url, 'mp3')
print(response)
print('...done')

print('Waiting on transcription task...')
sound_stuff.wait_for_transaction_job(job_name)
print('...done')

print("Loading Text File...")
text = sound_stuff.load_transcript_from_job(job_name)
print("...done")

print("Extracting sentences...")
from spacy.lang.en import English

nlp = English()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
doc = nlp(text)
sentences = [sentence.text for sentence in doc.sents]
print("...done")

file_utils.write_json('sentences.json', sentences, 3)

print("Loading summarizer...")
summarizer = models.get_summarizer_model()
print("...done")

print("Summarizing...")

summary_indices = {}
Example #27
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and the spaces flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
Example #28
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
Example #29
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/country_text.txt") as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)
Example #30
def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
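getSentences above relies on the spaCy v2 API (create_pipe plus the since-removed Span.string attribute); a minimal sketch of the same helper written against the v3 API, offered as an assumed upgrade path:
from spacy.lang.en import English

def get_sentences(text):
    nlp = English()
    nlp.add_pipe("sentencizer")  # v3: components are added by registered name
    doc = nlp(text)
    # Span.string was removed; Span.text holds the sentence text
    return [sent.text.strip() for sent in doc.sents]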
Example #31
def read_corpus(path,
                path_to_translations=None,
                path_to_translatable_ids=None,
                path_to_generated_questions=None):
    translations = {}
    translate_count = 0
    if path_to_translations is not None:
        # INITIALIZE TOKENIZER FOR TRANSLATIONS
        # import sys
        # sys.setdefaultencoding("utf-8")
        import spacy
        nlp = spacy.load('en')
        from spacy.lang.en import English
        tokenizer = English().Defaults.create_tokenizer(nlp)

        def fun_proc(t):
            t = tokenizer(u'{}'.format(t.decode("utf-8")))
            t = ' '.join([str(i) for i in t]).lower().strip()
            return t

        # STORE TRANSLATIONS IN A DICT
        fopen = gzip.open if path_to_translations.endswith(".gz") else open
        with fopen(path_to_translations) as f:
            for l in f:
                if len(l.strip()) > 0:
                    i, t, b = l.strip().split('\t')
                    i, t = i.strip(), t.strip()
                    translations[i] = fun_proc(t)

        if path_to_translatable_ids is not None:
            # READ TRANSLATABLE IDS AND POP ALL THE OTHER IDS FROM TRANSLATIONS
            translatables = []
            with open(path_to_translatable_ids, 'r') as fid:
                for l in fid:
                    if len(l.strip()) > 0:
                        i = l.strip().split()[0]
                        translatables.append(i)
                translations_keys = list(translations.keys())  # copy keys so we can pop below
                for i in translations_keys:
                    if i not in translatables:
                        translations.pop(i)

    raw_corpus = {}

    # AR edit.
    # We add all the generated titles as additional items with id <orig-id>_qgen
    # Later we imply a truth label for the pairs (<orig-id>, <orig-id>_qgen)
    if path_to_generated_questions is not None:
        with open(path_to_generated_questions) as f:
            for l in f:
                qid, dist, q = l.strip().split('\t')
                key = '{}_qgen'.format(qid)
                assert (key not in raw_corpus)
                raw_corpus[key] = q.strip().split(), []
        print('Read {} generated questions'.format(len(raw_corpus)))

    empty_cnt = 0
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            id, title, body = line.split("\t")
            if len(title) == 0:
                print(id)
                empty_cnt += 1
                continue
            if id in translations:
                translate_count += 1
                title = translations[id]
            title = title.strip().split()
            body = body.strip().split()
            raw_corpus[id] = (title, body)
    say("{} empty titles ignored.\n".format(empty_cnt))
    say("{} titles translated.\n".format(translate_count))
    return raw_corpus
Example #32
def load_sentencizer_and_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    return nlp, tokenizer
Example #33
def create_api_response_for_post_identify_build_date_in_text_service_english_chapter_input(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()

    forward_result = []

    ### TEMP AREA
    temp_detection_dictionary = {}
    temp_check_before_reset = {}
    ###

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # new line and temp reset
        temp_detection_dictionary.clear()
        temp_check_before_reset.clear()

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        if "regulation_year" in result_text_service_url:
            metadata_from_url['regulation_year'] = result_text_service_url[
                'regulation_year']
        if "regulation_month" in result_text_service_url:
            metadata_from_url['regulation_month'] = result_text_service_url[
                'regulation_month']
        if "regulation_day" in result_text_service_url:
            metadata_from_url['regulation_day'] = result_text_service_url[
                'regulation_day']
        if "regulation_id" in result_text_service_url:
            metadata_from_url['regulation_id'] = result_text_service_url[
                'regulation_id']
        if "chapter_number" in result_text_service_url:
            metadata_from_url['chapter_number'] = result_text_service_url[
                'chapter_number']
        if "section_number" in result_text_service_url:
            metadata_from_url['section_number'] = result_text_service_url[
                'section_number']
        if "part_number" in result_text_service_url:
            metadata_from_url['part_number'] = result_text_service_url[
                'part_number']
        if "sub_part_number" in result_text_service_url:
            metadata_from_url['sub_part_number'] = result_text_service_url[
                'sub_part_number']

        # add chapter_title and section_title
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[
                ent_start:
                ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")")

            #
            # Statement builder
            #

            if ent_label == "WATER_VESSEL":
                if "START_detected" not in temp_detection_dictionary:
                    temp_detection_dictionary["START_detected"] = True
                else:  # restart with new term
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                    temp_detection_dictionary["START_detected"] = True

            elif ent_label == "CONSTRUCT":
                if ("START_detected" in temp_detection_dictionary and
                        "CONSTRUCT_detected" not in temp_detection_dictionary):
                    temp_detection_dictionary["CONSTRUCT_detected"] = True
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE_PREFIX":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_PREFIX_value"
                        not in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary and
                        "DATE_SEPARATOR_value" not in temp_detection_dictionary
                        and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary["DATE_PREFIX_value"] = ent_text
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary
                        and "DATE_value_1_token_end"
                        not in temp_detection_dictionary):
                    temp_detection_dictionary["DATE_value_1"] = ent_text
                    temp_detection_dictionary[
                        "DATE_value_1_token_end"] = ent_token_span_end

                elif ("START_detected" in temp_detection_dictionary
                      and "CONSTRUCT_detected" in temp_detection_dictionary
                      and "DATE_value_1" in temp_detection_dictionary
                      and "DATE_SEPARATOR_value" in temp_detection_dictionary
                      and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary['DATE_value_2'] = ent_text
                    # because this is the last value in a statement:
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE_SEPARATOR":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary and
                        "DATE_value_1_token_end" in temp_detection_dictionary
                        and "DATE_SEPARATOR_value"
                        not in temp_detection_dictionary):
                    # Q: Is the separator the next term after value 1?
                    if temp_detection_dictionary[
                            "DATE_value_1_token_end"] == ent_token_span_start:
                        # A: Yes, this separator is the first word after value 1
                        temp_detection_dictionary[
                            "DATE_SEPARATOR_value"] = ent_text
                    else:  # reset
                        # A: No. Reject value and reset.
                        temp_check_before_reset = dict(
                            temp_detection_dictionary)
                        temp_detection_dictionary.clear()
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            #
            # Statement concluder
            # Q: Do we have what we need to build a statement?
            #

            # The statement builder has restarted.
            # Check what we have for a statement before continuing.
            if len(temp_check_before_reset) > 0:
                # If we have a double value statement
                if ("START_detected" in temp_check_before_reset
                        and "CONSTRUCT_detected" in temp_check_before_reset
                        and "DATE_value_1" in temp_check_before_reset
                        and "DATE_SEPARATOR_value" in temp_check_before_reset
                        and "DATE_value_2" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    detection_with_url_metadata[
                        "date_separator"] = temp_check_before_reset[
                            "DATE_SEPARATOR_value"]
                    detection_with_url_metadata[
                        "date_value_2"] = temp_check_before_reset[
                            "DATE_value_2"]
                    forward_result.append(detection_with_url_metadata)
                # If we have a single value statement
                elif ("START_detected" in temp_check_before_reset
                      and "CONSTRUCT_detected" in temp_check_before_reset
                      and "DATE_value_1" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    forward_result.append(detection_with_url_metadata)
                temp_check_before_reset.clear()

            # Conclude on current detections
            if ("START_detected" in temp_detection_dictionary
                    and "CONSTRUCT_detected" in temp_detection_dictionary
                    and "DATE_value_1" in temp_detection_dictionary
                    and "DATE_SEPARATOR_value" in temp_detection_dictionary
                    and "DATE_value_2" in temp_detection_dictionary):
                # We have a full statement.
                # Add it and reset.
                detection_with_url_metadata = dict(metadata_from_url)
                if "DATE_PREFIX_value" in temp_detection_dictionary:
                    detection_with_url_metadata[
                        "date_context"] = temp_detection_dictionary[
                            "DATE_PREFIX_value"]
                detection_with_url_metadata[
                    "date_value_1"] = temp_detection_dictionary["DATE_value_1"]
                detection_with_url_metadata[
                    "date_separator"] = temp_detection_dictionary[
                        "DATE_SEPARATOR_value"]
                detection_with_url_metadata[
                    "date_value_2"] = temp_detection_dictionary["DATE_value_2"]
                forward_result.append(detection_with_url_metadata)
                temp_detection_dictionary.clear()

            else:
                # get next ent
                next_ent_index_number = ent_index_number + 1
                next_ent_label = ""
                if next_ent_index_number <= last_index_number_of_ents:
                    next_ent = ents[next_ent_index_number]
                    next_ent_label = next_ent["label"]
                # Q: Do we have enough for a new statement?
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary):
                    # A: Yes, we have enough for a new statement.
                    # Is the next ent relevant?
                    if ("DATE_SEPARATOR_value" not in temp_detection_dictionary
                            and next_ent_label == "DATE_SEPARATOR"):
                        continue  # we want the next ent
                    elif ("DATE_SEPARATOR_value" in temp_detection_dictionary
                          and "DATE_value_2" not in temp_detection_dictionary):
                        continue  # we know that the next value is a date
                    else:  # add the statement and move on
                        detection_with_url_metadata = dict(metadata_from_url)
                        if "DATE_PREFIX_value" in temp_detection_dictionary:
                            detection_with_url_metadata[
                                "date_context"] = temp_detection_dictionary[
                                    "DATE_PREFIX_value"]
                        detection_with_url_metadata[
                            "date_value_1"] = temp_detection_dictionary[
                                "DATE_value_1"]
                        forward_result.append(detection_with_url_metadata)
                        temp_detection_dictionary.clear()

    return forward_result
Example #34
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
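The matcher.add call above uses the spaCy v2 signature (key, callback, *patterns); under spaCy v3 the patterns go in a single list, roughly as in this sketch:
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", [pattern1, pattern2])  # v3: matcher.add(key, list_of_patterns)
doc = nlp("I bought an iPhone X and an iPhone 11")
print([doc[start:end].text for _, start, end in matcher(doc)])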
Example #35
def test_tagger_requires_labels():
    nlp = English()
    nlp.add_pipe("tagger")
    with pytest.raises(ValueError):
        nlp.initialize()
Example #36
from gensim import corpora
import gensim
import nltk
import string
import spacy
import en_core_web_sm

filepath = 'Enter filepath'
filename = 'Enter filename'

dataframe = pd.read_csv(filepath + filename)
print(len(dataframe))

import re
from spacy.lang.en import English
parser = English()


# Create tokens by removing stopwords and punctuation using spaCy
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    stop_list = [
        '<', '>', '</i>', '<i>', '<b>', '</b>', '=', '<i', '<b', '</i', '</b',
        '<sub>', '</sub>', '<sub'
    ]
    parser.Defaults.stop_words.update(stop_list)
    #print(tokens)
    for token in tokens:
        if token.orth_.isspace():
            continue
Example #37
def test_beam_overfitting_IO(neg_key):
    # Simple test to try and quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001

    # test the scores from the beam
    test_text = "I like London"
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0
    assert len(nlp(test_text).ents) == 1

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0

    # Try to unlearn the entity by using negative annotations
    neg_doc = nlp.make_doc(test_text)
    neg_ex = Example(neg_doc, neg_doc)
    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
    neg_train_examples = [neg_ex]

    for i in range(20):
        losses = {}
        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)

    # test the "untrained" model
    assert len(nlp(test_text).ents) == 0
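The beam test above relies on a module-level TRAIN_DATA fixture plus Example, Span, util, and make_tempdir from spaCy's test helpers, none of which appear in this excerpt. A plausible shape for TRAIN_DATA, assumed here for illustration only, is:

# Illustrative fixture only (assumption); the real TRAIN_DATA is defined elsewhere in the test module.
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]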
Example #38
0
        with open(args.jsonlines_path, 'r') as f:
            lines = f.readlines()
        docs = [json.loads(line) for line in lines]
        tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input(docs)
        predicted_clusters, _, _ = runner.predict(model, tensor_examples)

        if args.output_path:
            with open(args.output_path, 'w') as f:
                for i, doc in enumerate(docs):
                    doc['predicted_clusters'] = predicted_clusters[i]
                    f.write(json.dumps(doc) + "\n")
            #print(f'Saved prediction in {args.output_path}')
    else:
        # Interactive input
        model.to(model.device)
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        while True:
            input_str = str(input('Input document:'))
            bert_tokenizer, spacy_tokenizer = data_processor.tokenizer, nlp
            doc = get_document_from_string(input_str, args.seg_len, bert_tokenizer, nlp)
            tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input([doc])
            predicted_clusters, _, _ = runner.predict(model, tensor_examples)

            subtokens = util.flatten(doc['sentences'])
            #print('---Predicted clusters:')
            for cluster in predicted_clusters[0]:
                mentions_str = [' '.join(subtokens[m[0]:m[1]+1]) for m in cluster]
                mentions_str = [m.replace(' ##', '') for m in mentions_str]
                mentions_str = [m.replace('##', '') for m in mentions_str]
                #print(mentions_str)  # Print out strings
Example #39
0
import sys
import datetime

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import English

import pandas as pd
import numpy as np
import re

from stop_words import STOP_WORDS

nlp = spacy.load('en')
tokenizer = English().Defaults.create_tokenizer(nlp)

data = pd.read_csv('Data.csv', encoding='latin1')
data = data['Text'][0:10]

s = datetime.datetime.now()
# remove stop words USING REGEX
# remove_list = re.compile('[^a-zA-Z0-9@ :\.\/]')
# remove_list2 = re.compile('(\n|\.$|(\.?=\s+)|(:(?!\/\/)))')
# remove_list3 = re.compile('(\s)(the|this|that|there|to|is|are|am|on|in|out|do|a|an|be|just|from|with|so|as|just|for|by|â€Â|)(?!\w)' )
# #txt = ' '.join(re.sub("[0-9]+","NUM",txt).split()

# def remove_stop_word(data):
#     l = len(data)
#     for index in range(0,l):
#         data.loc[index] = re.sub(remove_list, '', (data.loc[index]).lower())
#         data.loc[index] = re.sub(remove_list2, '', (data.loc[index]))
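The regex-based removal above is commented out. A hedged alternative sketch using the spaCy tokenizer and the imported STOP_WORDS could look like this; it assumes the intent is simple stop-word filtering over the Text column, which the original does not state explicitly.

# Hedged sketch (assumption): stop-word removal with the spaCy tokenizer instead of regexes.
def remove_stop_words(text):
    tokens = tokenizer(str(text))
    kept = [tok.text for tok in tokens if tok.text.lower() not in STOP_WORDS]
    return " ".join(kept)

data = data.apply(remove_stop_words)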
Example #40
0
from spacy.lang.en import English

nlp = English()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
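The patterns above are Doc objects ready for a PhraseMatcher. A minimal usage sketch, assuming the spaCy v2-style add() used elsewhere in this document (the sample sentence is made up), might be:

# Minimal sketch: register the Doc patterns with a PhraseMatcher and run it.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("PEOPLE", None, *patterns)  # spaCy v2-style add

doc = nlp("Angela Merkel met David Bowie backstage.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)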
Example #41
0
def identify_build_date_in_text(text):
    nlp = English()
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    #
    # START - spaCy patterns
    #

    # WATER_VESSEL
    water_vessel_pattern = [{"LOWER": {"IN": ["vessels"]}}]
    matcher.add("WATER_VESSEL", None, water_vessel_pattern)

    # DATE
    matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}])

    # CONSTRUCT
    matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}])

    #
    # END - spaCy patterns
    #

    result = []

    for match_id, token_start, token_end in matcher(doc):

        match_id_as_string = nlp.vocab.strings[match_id]
        final_token_start = token_start
        final_token_end = token_end

        if match_id_as_string == "DATE" and token_start > 0:

            # At this point, DATE is just a year string. Example: 2021

            # Expand DATE?
            prev_word_1_token_number = token_start - 1
            prev_word_1_token = doc[prev_word_1_token_number]
            if prev_word_1_token.text.lower() in ("january", "february",
                                                  "march", "april", "may",
                                                  "june", "july", "august",
                                                  "september", "october",
                                                  "november", "december"):
                final_token_start = prev_word_1_token_number  # expanding
                # Expand more?
                prev_word_2_token_number = token_start - 2
                prev_word_2_token = doc[prev_word_2_token_number]
                if is_int(prev_word_2_token.text):
                    final_token_start = prev_word_2_token_number  # expanding

            prev_word_on_date_token_number = final_token_start - 1
            prev_word_on_date_token = doc[prev_word_on_date_token_number]

            # Does the DATE have a DATE_SEPARATOR?
            if prev_word_on_date_token.text in ("and", "to"):
                prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx
                prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len(
                    prev_word_on_date_token.text)
                identified_entity = {
                    'start': prev_word_on_date_char_span_start_number,
                    'end': prev_word_on_date_char_span_end_number,
                    'label': "DATE_SEPARATOR"
                }
                result.append(identified_entity)

            # Does the DATE have a DATE_PREFIX?
            elif prev_word_on_date_token.text in ("between", "before",
                                                  "after"):
                # DATE_PREFIX detected
                prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx
                prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len(
                    prev_word_on_date_token.text)
                identified_entity = {
                    'start': prev_word_on_date_char_span_start_number,
                    'end': prev_word_on_date_char_span_end_number,
                    'label': "DATE_PREFIX"
                }
                result.append(identified_entity)

        #
        # convert token_span to char_span.
        # char_span is needed to display correctly with displacy.render().
        #
        span = doc[final_token_start:final_token_end]
        span_char_start = span[0].idx
        span_char_end = span[-1].idx + len(span[-1].text)

        # append the match to the result
        identified_entity = {
            'start': span_char_start,
            'end': span_char_end,
            'label': match_id_as_string
        }
        result.append(identified_entity)

    return result
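identify_build_date_in_text() leans on an is_int() helper that is defined elsewhere, and its char-offset output is shaped for displacy's manual mode. Both are sketched below under those assumptions; the is_int() body and the sample sentence are illustrative, not taken from the original.

# Hedged sketch: a plausible is_int() helper (assumption; the real one is defined elsewhere).
def is_int(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

# Rendering the char-span results with displacy's manual entity mode.
# Assumes Matcher and English are already imported in the surrounding module.
from spacy import displacy

text = "The vessels were constructed between March 1952 and 1954."
ents = identify_build_date_in_text(text)
displacy.render({"text": text, "ents": ents}, style="ent", manual=True)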
Example #42
0
#!/usr/bin/env python
# coding: utf-8

# In[101]:

import spacy

# In[123]:

nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import English
nlp2 = English()

# In[124]:


def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))

    else:
        print('No entities found')


# In[125]:

doc = nlp(u'Hi how are you?')

# In[126]: