def test_issue1242():
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I. Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                  token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
def test_issue1494():
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline
    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
def test_issue1506():
    def string_generator():
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "I erase some hbdsaj lemmas."
        for _ in range(10001):
            yield "I erase lemmas."
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "It's sentence produced by that bug."

    nlp = English()
    for i, d in enumerate(nlp.pipe(string_generator())):
        # We should run cleanup more than once to actually clean up the data.
        # The first run only marks strings as not hit.
        if i == 10000 or i == 20000 or i == 30000:
            gc.collect()
        for t in d:
            str(t.lemma_)
def test_issue1488():
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iterator over strings to normalize and tokenize.
        p_batch_size (int): number of strings per batch.
        p_thread_count (int): number of threads to run.

    Returns:
        iter: iterator over the normalized and tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)
    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)
    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
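# Hedged usage sketch for normalize_batch() above (not from the original source):
# it relies on the module-level NLP cache and NlpEnglish alias assumed by that
# function, and the sample strings below are made up for illustration.
if __name__ == "__main__":
    NLP = None  # reset the cache so normalize_batch() builds its own tokenizer
    for line in normalize_batch(["Hello, World!", "spaCy is great."], p_batch_size=2):
        print(line)  # e.g. "hello , world !"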
def identify_KEEL_LAID_in_text(text): nlp = English() doc = nlp(text) matcher = Matcher(nlp.vocab) tokens_in_doc_count = len(doc) # # START - spaCy patterns # matcher.add("KEEL_LAID", [[{ "LOWER": { "IN": ["kjølstrukk", "kjølstrukket"] } }]]) matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}]) # # END - spaCy patterns # result = [] for match_id, token_start, token_end in matcher(doc): match_id_as_string = nlp.vocab.strings[match_id] final_token_start = token_start final_token_end = token_end spacy_pattern_detection = doc[token_start:token_end] spacy_pattern_detection_as_lower_text = spacy_pattern_detection.text.lower( ) # # Expand? # if match_id_as_string == "DATE" and token_start > 0: # At this point, DATE is just a year string. Example: 2021 prev_word_1_token_number = token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text in ("januar", "februar", "mars", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "desember"): final_token_start = prev_word_1_token_number # expanding # Expand more? prev_word_2_token_number = token_start - 2 prev_word_2_token = doc[prev_word_2_token_number] prev_word_3_token_number = token_start - 3 prev_word_3_token = doc[prev_word_3_token_number] if prev_word_2_token.text == "." and is_int( prev_word_3_token.text): final_token_start = prev_word_3_token_number # expanding # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) # # Identify prefix or suffix # if final_token_start > 0: prev_word_1_token_number = final_token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text.lower() == "før": # Prefix detected. # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[ prev_word_1_token_number:final_token_start] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': "DATE_PREFIX" } result.append(identified_entity) if ((final_token_end + 1) < tokens_in_doc_count): next_word_1_token_number = final_token_end next_word_1_token = doc[next_word_1_token_number] next_word_2_token_number = final_token_end + 1 next_word_2_token = doc[next_word_2_token_number] if (next_word_1_token.text.lower() == "eller" and next_word_2_token.text.lower() == "senere"): # Suffix detected. # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[next_word_1_token_number:( next_word_1_token_number + 2)] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': "DATE_SUFFIX" } result.append(identified_entity) elif match_id_as_string == "KEEL_LAID": # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). 
# span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) return result
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] < 0.00001
    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "I like green eggs.",
        "Here is another one.",
        "I eat ham.",
    ]
    batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
    # Try to unlearn the first 'N' tag with negative annotation
    neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]})
    for i in range(20):
        losses = {}
        nlp.update([neg_ex], sgd=optimizer, losses=losses)
    # test the "untrained" tag
    doc3 = nlp(test_text)
    assert doc3[0].tag_ != "N"
def test_overfitting_IO(use_upper):
    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.00001
    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"
    # Ensure that the predictions are still the same, even after adding a new label
    ner2 = nlp2.get_pipe("ner")
    assert ner2.model.attrs["has_upper"] == use_upper
    ner2.add_label("RANDOM_NEW_LABEL")
    doc3 = nlp2(test_text)
    ents3 = doc3.ents
    assert len(ents3) == 1
    assert ents3[0].text == "London"
    assert ents3[0].label_ == "LOC"
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
    # test that kb_id is preserved
    test_text = "I like London and London."
    doc = nlp.make_doc(test_text)
    doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)]
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    doc = nlp.get_pipe("ner")(doc)
    ents = doc.ents
    assert len(ents) == 2
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    # ent added by ner has kb_id == 0
    assert ents[1].text == "London"
    assert ents[1].label_ == "LOC"
    assert ents[1].kb_id == 0
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn.functional import softmax

sys.path.append("../")
from model.modeling_classification import BertForSequenceClassification, BertForTokenClassification
from model.tokenization import BertTokenizer

logger = logging.getLogger(__name__)

MaskedTokenInstance = collections.namedtuple("MaskedTokenInstance", ["tokens", "info"])
MaskedItemInfo = collections.namedtuple(
    "MaskedItemInfo",
    ["current_pos", "sen_doc_pos", "sen_right_id", "doc_ground_truth"])

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids


class SC(nn.Module):
    def __init__(self, mask_rate, top_sen_rate,
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): nlp = English() nlp_plain = English() # load both vec and hashvec tables with make_tempdir() as tmpdir: p = tmpdir / "test.hashvec" with open(p, "w") as fileh: fileh.write(floret_vectors_hashvec_str) convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret") p = tmpdir / "test.vec" with open(p, "w") as fileh: fileh.write(floret_vectors_vec_str) convert_vectors(nlp_plain, p, truncate=0, prune=-1) word = "der" # ngrams: full padded word + padded 2-grams + padded 3-grams ngrams = nlp.vocab.vectors._get_ngrams(word) assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"] # rows: 2 rows per ngram rows = OPS.xp.asarray( [ h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], dtype="uint32", ) assert_equal( OPS.to_numpy(rows), numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]), ) assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count # all vectors are equivalent for plain static table vs. hash ngrams for word in nlp_plain.vocab.vectors: word = nlp_plain.vocab.strings.as_string(word) assert_almost_equal(nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3) # every word has a vector assert nlp.vocab[word * 5].has_vector # n_keys is -1 for floret assert nlp_plain.vocab.vectors.n_keys > 0 assert nlp.vocab.vectors.n_keys == -1 # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy( OPS.asarray([nlp.vocab[word].vector for word in words])) batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words)) assert_equal(single_vecs, batch_vecs) # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), numpy.zeros((nlp.vocab.vectors.shape[0], )), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) # an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), numpy.zeros((nlp.vocab.vectors.shape[0], )), ) # the loaded ngram vector table cannot be modified # except for clear: warning, then return without modifications vector = list(range(nlp.vocab.vectors.shape[1])) orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.set_vector("the", vector) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab[word].vector = vector assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.vectors.add("the", row=6) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.vectors.resize(shape=(100, 10)) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.raises(ValueError): nlp.vocab.vectors.clear() # data and settings are serialized correctly with make_tempdir() as d: nlp.vocab.to_disk(d) vocab_r = Vocab() vocab_r.from_disk(d) assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes() assert_equal(OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data)) assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg()) assert_almost_equal( OPS.to_numpy(nlp.vocab[word].vector), OPS.to_numpy(vocab_r[word].vector), decimal=6, )
#import packages
from gensim.summarization import keywords
from matplotlib import pyplot
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))
from spacy.lang.en import English
nlp = English()
nlp.max_length = 10000000
import lyricsgenius
import pandas as pd
from textblob import TextBlob
import requests, json
import numpy as np
import matplotlib.ticker as plticker
import seaborn as sns
import time
from datetime import date
import matplotlib.dates as mdates

#define Genius API authentication
api_key = 'SfbYPF1AJ0-lnm6Km8_sIJoebvrIFfRyAGoZqxnRfkZIvP5ceGwBNZa4g0DHayP-'
genius = lyricsgenius.Genius(api_key)
BASE_URL = "https://api.genius.com"


def main():
    artist_name = ""
    while True:
def test_partial_links():
    # Test that having some entities on the doc without gold links doesn't crash
    TRAIN_DATA = [(
        "Russ Cochran his reprints include EC Comics.",
        {
            "links": {(0, 12): {"Q2146908": 1.0}},
            "entities": [(0, 12, "PERSON")],
            "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # this will run the pipeline on the examples and shouldn't crash
    results = nlp.evaluate(train_examples)
    assert "PERSON" in results["ents_per_type"]
    assert "PERSON" in results["nel_f_per_type"]
    assert "ORG" in results["ents_per_type"]
    assert "ORG" not in results["nel_f_per_type"]
def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() vector_length = 3 assert "Q2146908" not in nlp.vocab.strings # Convert the texts to docs to make sure we have doc.ents set for the training examples train_examples = [] for text, annotation in TRAIN_DATA: doc = nlp(text) train_examples.append(Example.from_dict(doc, annotation)) def create_kb(vocab): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5], ) return mykb # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) assert entity_linker.model.get_dim("nO") == vector_length assert entity_linker.model.get_dim( "nO") == entity_linker.kb.entity_vector_length for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["entity_linker"] < 0.001 # adding additional components that are required for the entity_linker nlp.add_pipe("sentencizer", first=True) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data patterns = [{ "label": "PERSON", "pattern": [{ "LOWER": "russ" }, { "LOWER": "cochran" }] }] ruler = nlp.add_pipe("entity_ruler", before="entity_linker") ruler.add_patterns(patterns) # test the trained model predictions = [] for text, annotation in TRAIN_DATA: doc = nlp(text) for ent in doc.ents: predictions.append(ent.kb_id_) assert predictions == GOLD_entities # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) assert nlp2.pipe_names == nlp.pipe_names assert "Q2146908" in nlp2.vocab.strings entity_linker2 = nlp2.get_pipe("entity_linker") assert "Q2146908" in entity_linker2.vocab.strings assert "Q2146908" in entity_linker2.kb.vocab.strings predictions = [] for text, annotation in TRAIN_DATA: doc2 = nlp2(text) for ent in doc2.ents: predictions.append(ent.kb_id_) assert predictions == GOLD_entities # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions texts = [ "Russ Cochran captured his first major title with his son as caddie.", "Russ Cochran his reprints include EC Comics.", "Russ Cochran has been publishing comic art.", "Russ Cochran was a member of University of Kentucky's golf team.", ] batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] no_batch_deps = [ doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts] ] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps)
def test_issue7065_b():
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
    links = {
        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
    }
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(doc, {
        "entities": entities,
        "links": links,
        "sent_starts": sent_starts,
    })
    train_examples = [example]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",
            entities=["Q270853"],
            probabilities=[1.0],
        )
        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",
            entities=["Q7304"],
            probabilities=[1.0],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # Add a custom rule-based component to mimic NER
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
        {
            "label": "WORK",
            "pattern": [
                {"LOWER": "symphony"},
                {"LOWER": "no"},
                {"LOWER": "."},
                {"LOWER": "8"},
            ],
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # test the trained model - this should not throw E148
    doc = nlp(text)
    assert doc
def nlp():
    return English()
def test_no_gold_ents(patterns):
    # test that annotating components work
    TRAIN_DATA = [(
        "Kirby is pink",
        {
            "links": {(0, 5): {"Q613241": 1.0}},
            "entities": [(0, 5, "CHARACTER")],
            "sent_starts": [1, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    # Create a ruler to mark entities
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Apply ruler to examples. In a real pipeline this would be an annotating component.
    for eg in train_examples:
        eg.predicted = ruler(eg.predicted)

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        # Placeholder
        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
        mykb.add_alias("pink", ["pink"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker",
                                 config={"use_gold_ents": False},
                                 last=True)
    entity_linker.set_kb(create_kb)
    assert entity_linker.use_gold_ents is False

    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # this will run the pipeline on the examples and shouldn't crash
    nlp.evaluate(train_examples)
def test_nel_to_bytes():
    # Test that a pipeline with an EL component can be converted to bytes
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
        return kb

    nlp_1 = English()
    nlp_1.add_pipe("ner")
    entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True)
    entity_linker_1.set_kb(create_kb)
    assert entity_linker_1.kb.contains_alias("Russ Cochran")
    assert nlp_1.pipe_names == ["ner", "entity_linker"]
    nlp_bytes = nlp_1.to_bytes()

    nlp_2 = English()
    nlp_2.add_pipe("ner")
    nlp_2.add_pipe("entity_linker", last=True)
    assert nlp_2.pipe_names == ["ner", "entity_linker"]
    assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran")
    nlp_2 = nlp_2.from_bytes(nlp_bytes)
    kb_2 = nlp_2.get_pipe("entity_linker").kb
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_2.get_vector("Q2146908") == [6, -4, 3]
    assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
def __init__(self):
    self.nlp = English()
def create_spacy_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)
response = sound_stuff.start_transcription_job(job_name, object_url, 'mp3')
print(response)
print('...done')

print('Waiting on transcription task...')
sound_stuff.wait_for_transaction_job(job_name)
print('...done')

print("Loading Text File...")
text = sound_stuff.load_transcript_from_job(job_name)
print("...done")

print("Extracting sentences...")
from spacy.lang.en import English
nlp = English()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
doc = nlp(text)
sentences = [sentence.text for sentence in doc.sents]
print("...done")
file_utils.write_json('sentences.json', sentences, 3)

print("Loading summarizer...")
summarizer = models.get_summarizer_model()
print("...done")

print("Summarizing...")
summary_indices = {}
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from ____ import ____

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = ____(____, words=words, spaces=spaces)
print(doc.text)
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/country_text.txt") as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)
def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
def read_corpus(path,
                path_to_translations=None,
                path_to_translatable_ids=None,
                path_to_generated_questions=None):
    translations = {}
    translate_count = 0
    if path_to_translations is not None:
        # INITIALIZE TOKENIZER FOR TRANSLATIONS
        # import sys
        # sys.setdefaultencoding("utf-8")
        import spacy
        nlp = spacy.load('en')
        from spacy.lang.en import English
        tokenizer = English().Defaults.create_tokenizer(nlp)

        def fun_proc(t):
            t = tokenizer(u'{}'.format(t.decode("utf-8")))
            t = ' '.join([str(i) for i in t]).lower().strip()
            return t

        # STORE TRANSLATIONS IN A DICT
        fopen = gzip.open if path_to_translations.endswith(".gz") else open
        with fopen(path_to_translations) as f:
            for l in f:
                if len(l.strip()) > 0:
                    i, t, b = l.strip().split('\t')
                    i, t = i.strip(), t.strip()
                    translations[i] = fun_proc(t)

    if path_to_translatable_ids is not None:
        # READ TRANSLATABLE IDS AND POP ALL THE OTHER IDS FROM TRANSLATIONS
        translatables = []
        with open(path_to_translatable_ids, 'r') as fid:
            for l in fid:
                if len(l.strip()) > 0:
                    i = l.strip().split()[0]
                    translatables.append(i)
        translations_keys = translations.keys()
        for i in translations_keys:
            if i not in translatables:
                translations.pop(i)

    raw_corpus = {}
    # AR edit.
    # We add all the generated titles as additional items with id <orig-id>_qgen
    # Later we imply a truth label for the pairs (<orig-id>, <orig-id>_qgen)
    if path_to_generated_questions is not None:
        with open(path_to_generated_questions) as f:
            for l in f:
                qid, dist, q = l.strip().split('\t')
                key = '{}_qgen'.format(qid)
                assert (key not in raw_corpus)
                raw_corpus[key] = q.strip().split(), []
        print('Read {} generated questions'.format(len(raw_corpus)))

    empty_cnt = 0
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            id, title, body = line.split("\t")
            if len(title) == 0:
                print(id)
                empty_cnt += 1
                continue
            if id in translations:
                translate_count += 1
                title = translations[id]
            title = title.strip().split()
            body = body.strip().split()
            raw_corpus[id] = (title, body)
    say("{} empty titles ignored.\n".format(empty_cnt))
    say("{} titles translated.\n".format(translate_count))
    return raw_corpus
def load_sentencizer_and_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    return nlp, tokenizer
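# Hedged usage sketch for the helper above (not from the original source); it uses
# the spaCy 2.x API seen throughout this file, and the sample text is made up.
nlp, tokenizer = load_sentencizer_and_tokenizer()
doc = nlp("This is one sentence. This is another.")
sentences = [sent.text for sent in doc.sents]  # sentence boundaries from the sentencizer
tokens = [t.text for t in tokenizer("This is one sentence.")]  # tokenization only, no pipeline
print(sentences, tokens)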
def create_api_response_for_post_identify_build_date_in_text_service_english_chapter_input( title_dictionary, forward_filtered_result_with_only_the_things_we_are_looking_for): nlp = English() forward_result = [] ### TEMP AREA temp_detection_dictionary = {} temp_check_before_reset = {} ### for line in forward_filtered_result_with_only_the_things_we_are_looking_for: # new line and temp reset temp_detection_dictionary.clear() temp_check_before_reset.clear() # Get NLP data from line text_service_url = line['title'] text = line['text'] ents = line['ents'] # discovered enteties in the line last_index_number_of_ents = len(ents) - 1 doc = nlp(text) # Get metadata from URL result_text_service_url = get_data_from_text_service_item_url( text_service_url) metadata_from_url = {} if "regulation_year" in result_text_service_url: metadata_from_url['regulation_year'] = result_text_service_url[ 'regulation_year'] if "regulation_month" in result_text_service_url: metadata_from_url['regulation_month'] = result_text_service_url[ 'regulation_month'] if "regulation_day" in result_text_service_url: metadata_from_url['regulation_day'] = result_text_service_url[ 'regulation_day'] if "regulation_id" in result_text_service_url: metadata_from_url['regulation_id'] = result_text_service_url[ 'regulation_id'] if "chapter_number" in result_text_service_url: metadata_from_url['chapter_number'] = result_text_service_url[ 'chapter_number'] if "section_number" in result_text_service_url: metadata_from_url['section_number'] = result_text_service_url[ 'section_number'] if "part_number" in result_text_service_url: metadata_from_url['part_number'] = result_text_service_url[ 'part_number'] if "sub_part_number" in result_text_service_url: metadata_from_url['sub_part_number'] = result_text_service_url[ 'sub_part_number'] # add chapter_title and section_title if "chapter_title" in title_dictionary: metadata_from_url['chapter_title'] = title_dictionary[ 'chapter_title'] if "section_title_in_dictionary" in title_dictionary: section_title_dictionary = title_dictionary[ 'section_title_in_dictionary'] if text_service_url in section_title_dictionary: metadata_from_url['section_title'] = section_title_dictionary[ text_service_url] # For each ent in line for ent_index_number, ent in enumerate(ents): ent_label = ent['label'] ent_start = ent['start'] ent_end = ent['end'] ent_text = text[ ent_start: ent_end] # same as: doc[ent_token_span.start:ent_token_span.end] ent_doc = nlp(ent_text) words_in_doc_count = len(ent_doc) ent_token_span = doc.char_span(ent_start, ent_end) ent_token_span_start = ent_token_span.start ent_token_span_end = ent_token_span.end #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")") # # Statment builder # if ent_label == "WATER_VESSEL": if "START_detected" not in temp_detection_dictionary: temp_detection_dictionary["START_detected"] = True else: # restart with new term temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() temp_detection_dictionary["START_detected"] = True elif ent_label == "CONSTRUCT": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" not in temp_detection_dictionary): temp_detection_dictionary["CONSTRUCT_detected"] = True else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE_PREFIX": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_PREFIX_value" not 
in temp_detection_dictionary and "DATE_value_1" not in temp_detection_dictionary and "DATE_SEPARATOR_value" not in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): temp_detection_dictionary["DATE_PREFIX_value"] = ent_text else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" not in temp_detection_dictionary and "DATE_value_1_token_end" not in temp_detection_dictionary): temp_detection_dictionary["DATE_value_1"] = ent_text temp_detection_dictionary[ "DATE_value_1_token_end"] = ent_token_span_end elif ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): temp_detection_dictionary['DATE_value_2'] = ent_text # because this is the last value in a statment: temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE_SEPARATOR": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_value_1_token_end" in temp_detection_dictionary and "DATE_SEPARATOR_value" not in temp_detection_dictionary): # Q: Is the separator the next term after value 1? if temp_detection_dictionary[ "DATE_value_1_token_end"] == ent_token_span_start: # A: Yes, this separator is the first word after value 1 temp_detection_dictionary[ "DATE_SEPARATOR_value"] = ent_text else: # reset # A: No. Reject value and reset. temp_check_before_reset = dict( temp_detection_dictionary) temp_detection_dictionary.clear() else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() # # Statment concluder # Q: Do we have what we need to build a statment? # # The statment builder have restarted. # Check what we have for a statment before continuing. 
if len(temp_check_before_reset) > 0: # If we have a double value statement if ("START_detected" in temp_check_before_reset and "CONSTRUCT_detected" in temp_check_before_reset and "DATE_value_1" in temp_check_before_reset and "DATE_SEPARATOR_value" in temp_check_before_reset and "DATE_value_2" in temp_check_before_reset): detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_check_before_reset: detection_with_url_metadata[ "date_context"] = temp_check_before_reset[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_check_before_reset[ "DATE_value_1"] detection_with_url_metadata[ "date_separator"] = temp_check_before_reset[ "DATE_SEPARATOR_value"] detection_with_url_metadata[ "date_value_2"] = temp_check_before_reset[ "DATE_value_2"] forward_result.append(detection_with_url_metadata) # If we have a single value statment elif ("START_detected" in temp_check_before_reset and "CONSTRUCT_detected" in temp_check_before_reset and "DATE_value_1" in temp_check_before_reset): detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_check_before_reset: detection_with_url_metadata[ "date_context"] = temp_check_before_reset[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_check_before_reset[ "DATE_value_1"] forward_result.append(detection_with_url_metadata) temp_check_before_reset.clear() # Conclude on current detections if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" in temp_detection_dictionary): # we have a full statment. # add and reset. detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_detection_dictionary: detection_with_url_metadata[ "date_context"] = temp_detection_dictionary[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_detection_dictionary["DATE_value_1"] detection_with_url_metadata[ "date_separator"] = temp_detection_dictionary[ "DATE_SEPARATOR_value"] detection_with_url_metadata[ "date_value_2"] = temp_detection_dictionary["DATE_value_2"] forward_result.append(detection_with_url_metadata) temp_detection_dictionary.clear() else: # get next ent next_ent_index_number = ent_index_number + 1 next_ent_label = "" if next_ent_index_number <= last_index_number_of_ents: next_ent = ents[next_ent_index_number] next_ent_label = next_ent["label"] # Q: Do we have enough for a new statment? if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary): # A: Yes, we have enough for a new statment. # Is the next ent relevant? if ("DATE_SEPARATOR_value" not in temp_detection_dictionary and next_ent_label == "DATE_SEPARATOR"): continue # we want the next ent elif ("DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): continue # we know that the next value is a date else: # add the statment and move on detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_detection_dictionary: detection_with_url_metadata[ "date_context"] = temp_detection_dictionary[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_detection_dictionary[ "DATE_value_1"] forward_result.append(detection_with_url_metadata) temp_detection_dictionary.clear() return forward_result
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
def test_tagger_requires_labels():
    nlp = English()
    nlp.add_pipe("tagger")
    with pytest.raises(ValueError):
        nlp.initialize()
from gensim import corpora
import gensim
import nltk
import string
import spacy
import en_core_web_sm
import pandas as pd  # assumed import: pd.read_csv is used below but was missing here

filepath = 'Enter filepath'
filename = 'Enter filename'
dataframe = pd.read_csv(filepath + filename)
print(len(dataframe))

import re
from spacy.lang.en import English

parser = English()


#Creating tokens by removing stopwords, punctuation using SpaCy
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    stop_list = [
        '<', '>', '</i>', '<i>', '<b>', '</b>', '=', '<i', '<b', '</i',
        '</b', '<sub>', '</sub>', '<sub'
    ]
    parser.Defaults.stop_words.update(stop_list)
    #print(tokens)
    for token in tokens:
        if token.orth_.isspace():
            continue
def test_beam_overfitting_IO(neg_key):
    # Simple test to try and quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()
    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001
    # test the scores from the beam
    test_text = "I like London"
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0
    assert len(nlp(test_text).ents) == 1
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0
    # Try to unlearn the entity by using negative annotations
    neg_doc = nlp.make_doc(test_text)
    neg_ex = Example(neg_doc, neg_doc)
    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
    neg_train_examples = [neg_ex]
    for i in range(20):
        losses = {}
        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)
    # test the "untrained" model
    assert len(nlp(test_text).ents) == 0
    with open(args.jsonlines_path, 'r') as f:
        lines = f.readlines()
    docs = [json.loads(line) for line in lines]
    tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input(docs)
    predicted_clusters, _, _ = runner.predict(model, tensor_examples)
    if args.output_path:
        with open(args.output_path, 'w') as f:
            for i, doc in enumerate(docs):
                doc['predicted_clusters'] = predicted_clusters[i]
                f.write(json.dumps(doc) + "\n")
        # print(f'Saved prediction in {args.output_path}')
else:
    # Interactive input
    model.to(model.device)
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    while True:
        input_str = str(input('Input document:'))
        bert_tokenizer, spacy_tokenizer = data_processor.tokenizer, nlp
        doc = get_document_from_string(input_str, args.seg_len, bert_tokenizer, nlp)
        tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input([doc])
        predicted_clusters, _, _ = runner.predict(model, tensor_examples)
        subtokens = util.flatten(doc['sentences'])
        # print('---Predicted clusters:')
        for cluster in predicted_clusters[0]:
            mentions_str = [' '.join(subtokens[m[0]:m[1]+1]) for m in cluster]
            mentions_str = [m.replace(' ##', '') for m in mentions_str]
            mentions_str = [m.replace('##', '') for m in mentions_str]
            # print(mentions_str)  # Print out strings
import sys
import datetime
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import English
import pandas as pd
import numpy as np
import re
from stop_words import STOP_WORDS

nlp = spacy.load('en')
tokenizer = English().Defaults.create_tokenizer(nlp)
data = pd.read_csv('Data.csv', encoding='latin1')
data = data['Text'][0:10]
s = datetime.datetime.now()

# remove stop words USING REGEX
# remove_list = re.compile('[^a-zA-z0-9@ :\.\/]')
# remove_list2 = re.compile('(\n|\.$|(\.?=\s+)|(:(?!\/\/)))')
# remove_list3 = re.compile('(\s)(the|this|that|there|to|is|are|am|on|in|out|do|a|an|be|just|from|with|so|as|just|for|by|â€Â|)(?!\w)' )
# #txt = ' '.join(re.sub("[0-9]+","NUM",txt).split()
# def remove_stop_word(data):
#     l = len(data)
#     for index in range(0,l):
#         data.loc[index] = re.sub(remove_list, '', (data.loc[index]).lower())
#         data.loc[index] = re.sub(remove_list2, '', (data.loc[index]))
from spacy.lang.en import English

nlp = English()
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
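# A hedged sketch (not from the original source) of how these patterns would
# typically be fed to a PhraseMatcher, using the same spaCy 2.x matcher.add()
# signature as the COUNTRY example above; the sample sentence is made up.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("PERSON", None, *patterns)
doc = nlp("Angela Merkel met David Bowie backstage.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])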
def identify_build_date_in_text(text): nlp = English() doc = nlp(text) matcher = Matcher(nlp.vocab) # # START - spaCy patterns # # WATER_VESSEL water_vessel_pattern = [{"LOWER": {"IN": ["vessels"]}}] matcher.add("WATER_VESSEL", None, water_vessel_pattern) # DATE matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}]) # CONSTRUCT matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}]) # # END - spaCy patterns # result = [] for match_id, token_start, token_end in matcher(doc): match_id_as_string = nlp.vocab.strings[match_id] final_token_start = token_start final_token_end = token_end if match_id_as_string == "DATE" and token_start > 0: # At this point, DATE is just a year string. Example: 2021 # Expand DATE? prev_word_1_token_number = token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text.lower() in ("january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"): final_token_start = prev_word_1_token_number # expanding # Expand more? prev_word_2_token_number = token_start - 2 prev_word_2_token = doc[prev_word_2_token_number] if is_int(prev_word_2_token.text): final_token_start = prev_word_2_token_number # expanding prev_word_on_date_token_number = final_token_start - 1 prev_word_on_date_token = doc[prev_word_on_date_token_number] # Does the DATE have a DATE_SEPARATOR? if prev_word_on_date_token.text in ("and", "to"): prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len( prev_word_on_date_token.text) identified_entity = { 'start': prev_word_on_date_char_span_start_number, 'end': prev_word_on_date_char_span_end_number, 'label': "DATE_SEPARATOR" } result.append(identified_entity) # Does the DATE have a DATE_SEPARATOR? elif prev_word_on_date_token.text in ("between", "before", "after"): # DATE_PREFIX detected prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len( prev_word_on_date_token.text) identified_entity = { 'start': prev_word_on_date_char_span_start_number, 'end': prev_word_on_date_char_span_end_number, 'label': "DATE_PREFIX" } result.append(identified_entity) # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) return result
#!/usr/bin/env python
# coding: utf-8

# In[101]:

import spacy

# In[123]:

nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import English
nlp2 = English()

# In[124]:

def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

# In[125]:

doc = nlp(u'Hi how are you?')

# In[126]: