Example #1
def test_doc_array_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])
    example = doc.vocab["example"]
    assert example.orth != example.shape
    feats_array = doc.to_array((ORTH, SHAPE))
    assert feats_array[0][0] != feats_array[0][1]
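For reference, a minimal sketch (not part of the original test) of what this to_array call returns: one row per token and one column per requested attribute, stored as uint64 hashes.

from spacy.attrs import ORTH, SHAPE
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["An", "example", "sentence"])
feats_array = doc.to_array((ORTH, SHAPE))
assert feats_array.shape == (3, 2)    # 3 tokens x 2 requested attributes
assert feats_array.dtype == "uint64"  # attribute values are stored as hashes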
Example #2
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
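A minimal usage sketch for this helper (not part of the original code): the en_vocab fixture is assumed, and heads are given as offsets relative to each token, which is how Doc.from_array consumes the HEAD column filled in above.

doc = get_doc(
    en_vocab,
    words=["I", "like", "cheese"],
    pos=["PRON", "VERB", "NOUN"],
    heads=[1, 0, -1],  # "I" -> "like", "like" is its own head, "cheese" -> "like"
    deps=["nsubj", "ROOT", "dobj"],
)
assert doc[0].head.text == "like"
assert doc[1].dep_ == "ROOT"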
Example #3
def test_doc_array_to_from_string_attrs(en_vocab, attrs):
    """Test that both Doc.to_array and Doc.from_array accept string attrs,
    as well as single attrs and sequences of attrs.
    """
    words = ["An", "example", "sentence"]
    doc = Doc(en_vocab, words=words)
    Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs))
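As the docstring says, attrs may be attribute IDs or their string names, and either a single attr or a sequence; a small sketch of the equivalent call forms (not part of the original test):

from spacy.attrs import LEMMA, ORTH
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["An", "example", "sentence"])
# IDs and string names produce the same (3, 2) uint64 array
assert (doc.to_array([ORTH, LEMMA]) == doc.to_array(["ORTH", "LEMMA"])).all()
# a single attribute yields a one-dimensional array
assert doc.to_array("ORTH").shape == (3,)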
Example #4
    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
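The original example also defines a __call__ that applies the matcher and fills in these extensions; a rough sketch of that logic (the field names pulled from the REST Countries payload are assumptions, not verbatim from the original):

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []  # collect the new entity spans so they can be added to doc.ents
        for _, start, end in matches:
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
        doc.ents = list(doc.ents) + spans
        return doc

    def has_country(self, tokens):
        # getter shared by the Doc and Span extensions registered above
        return any(t._.get('is_country') for t in tokens)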
Example #5
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
Example #6
def test_serialize_empty_doc(en_vocab):
    doc = Doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text
Example #7
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
Example #8
def test_doc_api_similarity_match():
    doc = Doc(Vocab(), words=["a"])
    assert doc.similarity(doc[0]) == 1.0
    assert doc.similarity(doc.vocab["a"]) == 1.0
    doc2 = Doc(doc.vocab, words=["a", "b", "c"])
    with pytest.warns(ModelsWarning):
        assert doc.similarity(doc2[:1]) == 1.0
        assert doc.similarity(doc2) == 0.0
Example #9
def test_doc_to_json_underscore(doc):
    Doc.set_extension("json_test1", default=False)
    Doc.set_extension("json_test2", default=False)
    doc._.json_test1 = "hello world"
    doc._.json_test2 = [1, 2, 3]
    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
    assert "_" in json_doc
    assert json_doc["_"]["json_test1"] == "hello world"
    assert json_doc["_"]["json_test2"] == [1, 2, 3]
Example #10
def test_underscore_dir(en_vocab):
    """Test that dir() correctly returns extension attributes. This enables
    things like tab-completion for the attributes in doc._."""
    Doc.set_extension("test_dir", default=None)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert "_" in dir(doc)
    assert "test_dir" in dir(doc._)
    assert "test_dir" not in dir(doc[0]._)
    assert "test_dir" not in dir(doc[0:2]._)
Example #11
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example #12
def test_spans_override_sentiment(en_tokenizer):
    """Test span.sentiment property's default averaging behaviour"""
    text = "good stuff bad stuff"
    tokens = en_tokenizer(text)
    tokens.vocab[tokens[0].text].sentiment = 3.0
    tokens.vocab[tokens[2].text].sentiment = -2.0
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.user_span_hooks["sentiment"] = lambda span: 10.0
    assert doc[:2].sentiment == 10.0
    assert doc[-2:].sentiment == 10.0
    assert doc[:-1].sentiment == 10.0
Example #13
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
Example #14
def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])

    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
Example #15
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
Example #16
def test_underscore_docstring(en_vocab):
    """Test that docstrings are available for extension methods, even though
    they're partials."""

    def test_method(doc, arg1=1, arg2=2):
        """I am a docstring"""
        return (arg1, arg2)

    Doc.set_extension("test_docstrings", method=test_method)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
Example #17
def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": [dep1, dep2]},
        )
    assert doc[0].dep == dep1
    assert doc[1].dep == dep2
Example #18
def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"
Example #19
def test_underscore_mutable_defaults_list(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Doc.set_extension("mutable", default=[])
    doc1 = Doc(en_vocab, words=["one"])
    doc2 = Doc(en_vocab, words=["two"])
    doc1._.mutable.append("foo")
    assert len(doc1._.mutable) == 1
    assert doc1._.mutable[0] == "foo"
    assert len(doc2._.mutable) == 0
    doc1._.mutable = ["bar", "baz"]
    doc1._.mutable.append("foo")
    assert len(doc1._.mutable) == 3
    assert len(doc2._.mutable) == 0
Example #20
def test_sbd_serialization_projective(EN):
    """
    test that before and after serialization, the sentence boundaries are the same.
    """

    example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
    EN.tagger(example)
    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])

    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())

    assert example.to_bytes() == example_serialized.to_bytes()
    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
Example #21
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.is_nered
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.is_nered
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.is_nered
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered
Example #22
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
Example #23
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example #24
def test_serialize_after_adding_entity():
    # Re issue #514
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()

    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)

    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]

    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]

    byte_string = doc.to_bytes()
Example #25
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    assert not Doc(en_vocab, words=["Los"])[0].is_stop
    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        attrs = {"is_stop": [True, False]}
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].is_stop
    assert not doc[1].is_stop
Example #26
def main(output_dir=None):
    nlp = English()  # start off with blank English class

    Doc.set_extension('overlap', method=overlap_tokens)
    doc1 = nlp(u"Peach emoji is where it has always been.")
    doc2 = nlp(u"Peach is the superior emoji.")
    print("Text 1:", doc1.text)
    print("Text 2:", doc2.text)
    print("Overlapping tokens:", doc1._.overlap(doc2))

    Doc.set_extension('to_html', method=to_html)
    doc = nlp(u"This is a sentence about Apple.")
    # add entity manually for demo purposes, to make it work without a model
    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
    print("Text:", doc.text)
    doc._.to_html(output=output_dir, style='ent')
Example #27
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
Example #28
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    heads_deps = numpy.asarray(
        [
            [1, 397],
            [4, 436],
            [2, 426],
            [1, 402],
            [0, 8206900633647566924],
            [18446744073709551615, 440],
            [18446744073709551614, 442],
        ],
        dtype="uint64",
    )
    doc = Doc(Vocab(), words="Just what I was looking for .".split())
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    assert len(list(doc.sents)) == 1
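The two very large values in the first column above are just negative head offsets (-1 and -2) stored in an unsigned 64-bit array; a quick check of that equivalence (not part of the original test):

import numpy

offsets = numpy.asarray([-1, -2], dtype="int64").astype("uint64")
assert offsets[0] == 18446744073709551615  # -1 wraps to 2**64 - 1
assert offsets[1] == 18446744073709551614  # -2 wraps to 2**64 - 2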
Example #29
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
Example #30
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
Example #31
def doc(vocab):
    return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
Example #32
def test_issue2219(en_vocab):
    vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
    add_vecs_to_vocab(en_vocab, vectors)
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(en_vocab, words=[word1, word2])
    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
Example #33
def set_doc_extensions():
    for attr, attr_info in _doc_extensions.items():
        try:
            Doc.set_extension(attr, **attr_info)
        except ValueError:  # the attribute has already been set; ignore the error
            pass
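An equivalent guard that avoids catching the exception, assuming _doc_extensions is the same mapping used above (a sketch, not the original code):

from spacy.tokens import Doc

def set_doc_extensions():
    for attr, attr_info in _doc_extensions.items():
        # Doc.has_extension avoids the ValueError raised when re-registering an attribute
        if not Doc.has_extension(attr):
            Doc.set_extension(attr, **attr_info)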
Example #34
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
from sentence_transformers import SentenceTransformer

from . import util


def get_vector(sent):
    doc = sent.doc
    model_name = doc._.sentence_bert_model_name
    model = SentenceBert.get_model(model_name)
    vector = model.encode([sent.text])[0]
    return vector


# create an extension where the model will be used
Doc.set_extension('sentence_bert_model_name', default=None, force=True)

# set the extension both on doc and span level. This will contain the computed vector
Token.set_extension('sentence_bert', getter=get_vector, force=True)
Span.set_extension('sentence_bert', getter=get_vector, force=True)
Doc.set_extension('sentence_bert', getter=get_vector, force=True)


# the pipeline stage factory
@Language.factory('sentence_bert',
                  default_config={
                      'model_name': None,
                      'debug': True
                  })
def sentence_bert_factory(nlp, name, model_name, debug):
    if model_name:
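The snippet is cut off inside the factory body; a minimal sketch of how such a spaCy v3 factory could be completed (the component body and the names below are assumptions, not the original implementation):

from spacy.language import Language
from spacy.tokens import Doc

Doc.set_extension('sentence_bert_model_name', default=None, force=True)

@Language.factory('sentence_bert_sketch',
                  default_config={'model_name': None, 'debug': True})
def create_sentence_bert_sketch(nlp, name, model_name, debug):
    def component(doc):
        # record the configured model; the getters registered above can then
        # encode text lazily whenever doc._.sentence_bert is accessed
        doc._.sentence_bert_model_name = model_name
        return doc
    return component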
Example #35
import numpy as np
import spacy
from spacy.tokens import Doc, Span, Token

from .base_parser import BaseParser, PTB_TOKEN_ESCAPE

__all__ = ['BeneparComponent', 'NonConstituentException']

# None is not allowed as a default extension value!
NOT_PARSED_SENTINEL = object()
Doc.set_extension('_constituent_data', default=NOT_PARSED_SENTINEL)


class NonConstituentException(Exception):
    pass


#%%
class ConstituentData():
    def __init__(self, starts, ends, labels, loc_to_constituent, label_vocab):
        self.starts = starts
        self.ends = ends
        self.labels = labels
        self.loc_to_constituent = loc_to_constituent
        self.label_vocab = label_vocab


class PartialConstituentData():
    def __init__(self):
        self.starts = [np.array([], dtype=int)]
        self.ends = [np.array([], dtype=int)]
Example #36
def doc_from_bytes(nlp, bytes):
    """Return a Doc deserialised from the bytes produced by `doc.to_bytes()`."""
    doc = Doc(nlp.vocab).from_bytes(bytes)
    language.set_hooks(doc)
    return doc
Example #37
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
Example #38
def test_matcher_no_match(matcher):
    doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
    assert matcher(doc) == []
Example #39
def test_matcher_match_start(matcher):
    doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
    assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
Example #40
 def __call__(self, text):
     words = [f"{self.prefix}{word}" for word in text.split(" ")]
     return Doc(self.vocab, words=words)
Example #41
 def __init__(self, wikigraph: WikiGraph) -> None:
     Doc.set_extension("wiki_spans", default=[])
     Span.set_extension("wiki_pages", default=[])
     self._wg = wikigraph
Example #42
def test_doc_scalar_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])
    example = doc.vocab["example"]
    assert example.orth != example.shape
    feats_array = doc.to_array(ORTH)
    assert feats_array.shape == (3, )
Example #43
def docs(vocab):
    return [
        Doc(vocab, words=["hello", "world"]),
        Doc(vocab, words=["this", "is", "another"]),
    ]
Example #44
def test_common_vocab_lex_attrs(NLP):
    doc = Doc(NLP.vocab, words=["Lorem", "IPSUM", "dolor", "."])
    assert doc[0].is_title
    assert doc[1].is_upper
    assert doc[2].is_lower
    assert doc[3].is_punct
Example #45
 def __call__(self, text):
     words = text.split()
     # All tokens 'own' a subsequent space character in this tokenizer
     spaces = [True] * len(words)
     return Doc(self.vocab, words=words, spaces=spaces)
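For context, a sketch of how a whitespace tokenizer like this is usually wired into a pipeline (the class name is illustrative, not from the original):

from spacy.lang.en import English
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split()
        # all tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = English()
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("Hello world !")
assert [t.text for t in doc] == ["Hello", "world", "!"]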
Example #46
def test_matcher_match_middle(matcher):
    words = ["I", "like", "Google", "Now", "best"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
Example #47
from itertools import combinations
from spacy.tokens import Doc
from spacy.tokens import Token

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences

Doc.set_extension('referential_cohesion_all', default=[], force=True)


class ReferentialCohesionAllSentencesAnalyzer:
    name = 'referential cohesion all sentences analyzer'

    def __init__(self, language: str = 'es') -> None:
        '''
        This constructor will initialize the object that processes referential cohesion for all sentences in a text. It goes after the sentencizer.

        Parameters:
        language: The language that this pipeline will be used in.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self.language = language
        self.sentence_analyzer = None

    def __call__(self, doc: Doc) -> Doc:
        '''
Example #48
def test_matcher_match_end(matcher):
    words = ["I", "like", "java"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
Example #49
 def __call__(self):
     return Doc(self.vocab,
                words=self.all_input_tokens,
                spaces=self.all_spaces)
Example #50
def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=["a", "b", "c"])
    doc2 = Doc(Vocab(), words=["a", "c", "e"])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]
Example #51
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
Example #52
    def __init__(self):
        """Initialise the pipeline component.
        """
        if not Doc.has_extension("flesch_kincaid_grade_level"):
            Doc.set_extension("flesch_kincaid_grade_level",
                              getter=self.fk_grade)

        if not Doc.has_extension("flesch_kincaid_reading_ease"):
            Doc.set_extension("flesch_kincaid_reading_ease",
                              getter=self.fk_ease)

        if not Doc.has_extension("dale_chall"):
            Doc.set_extension("dale_chall", getter=self.dale_chall)

        if not Doc.has_extension("smog"):
            Doc.set_extension("smog", getter=self.smog)

        if not Doc.has_extension("coleman_liau_index"):
            Doc.set_extension("coleman_liau_index", getter=self.coleman_liau)

        if not Doc.has_extension("automated_readability_index"):
            Doc.set_extension("automated_readability_index", getter=self.ari)

        if not Doc.has_extension("forcast"):
            Doc.set_extension("forcast", getter=self.forcast)
Example #53
 def __call__(self,text):
   t=text.replace("\r","").replace("(","(").replace(")",")").replace("[","[").replace("]","]").replace("{","{").replace("}","}")
   u=self.model(t) if t else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   morphs=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   ent_iobs=[]
   ent_types=[]
   bunsetu=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feats,head,deprel,_,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     morphs.append(feats)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
     i=misc.find("NE=")
     if i<0:
       ent_iobs.append(2)
       ent_types.append(0)
     else:
       j=misc.find("|",i)
       if j<0:
         j=len(misc)
       if misc[i+3:i+4]=="B":
         ent_iobs.append(3)
       else:
         ent_iobs.append(1)
       ent_types.append(vs.add(misc[i+5:j]))
     bunsetu.append("I")
     if misc.startswith("BunsetuBILabel="):
       bunsetu[-1]=misc[15:16]
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms,ent_iobs,ent_types)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM,ENT_IOB,ENT_TYPE],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(morphs):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc
Example #54
def test_serialize_doc_roundtrip_bytes(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc_b = doc.to_bytes()
    new_doc = Doc(en_vocab).from_bytes(doc_b)
    assert new_doc.to_bytes() == doc_b
Example #55
def test_wp_start(wp_tokens, span, expected_start):
    doc = Doc(Vocab(), words=wp_tokens[1:-1])
    doc._.pytt_word_pieces_ = wp_tokens
    doc._.pytt_alignment = align_word_pieces([w.text for w in doc], wp_tokens)
    assert doc[span]._.pytt_start == expected_start
Example #56
from pprint import pprint
import json
import pickle
from time import time
import tablib
from spacy.tokens import Token, Doc
import role_pattern_nlp
import db
import util
from config import config

with open(config['pattern_eval_sheet'], 'rb') as f:
    pattern_eval_data = tablib.Dataset().load(f.read())

Token.set_extension('has_valence', default=False)
Doc.set_extension('sentence_id', default=None)


def pattern_fitness(pattern, matches, pos_matches, neg_matches):
    true_pos = [m for m in pos_matches if util.match_is_in_list(m, matches)]
    true_neg = [
        m for m in neg_matches if not util.match_is_in_list(m, matches)
    ]
    false_pos = [m for m in neg_matches if util.match_is_in_list(m, matches)]
    false_neg = [
        m for m in pos_matches if not util.match_is_in_list(m, matches)
    ]
    n_true_pos = len(true_pos)
    n_true_neg = len(true_neg)
    n_false_pos = len(false_pos)
    n_false_neg = len(false_neg)
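The function is truncated after the raw counts; a sketch of how precision, recall and F1 might be derived from them (an assumption about the fitness definition, not the original code):

def precision_recall_f1(n_true_pos, n_false_pos, n_false_neg):
    precision = n_true_pos / (n_true_pos + n_false_pos) if (n_true_pos + n_false_pos) else 0.0
    recall = n_true_pos / (n_true_pos + n_false_neg) if (n_true_pos + n_false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1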
Example #57
def test_vectors_doc_vector(vocab, text):
    doc = Doc(vocab, words=text)
    assert list(doc.vector)
    assert doc.vector_norm
Example #58
 def __call__(self, text):
     words = text.rstrip().split(' ')
     spaces = [True] * len(words)
     return Doc(self.vocab, words=words, spaces=spaces)
Example #59
def test_vectors_span_vector(vocab, text):
    span = Doc(vocab, words=text)[0:2]
    assert list(span.vector)
    assert span.vector_norm
Example #60
        lines = "".join(["-" for i in range(len(string))])
        print(lines)
        print(string)
        print(lines)

        learn_rules = rules[category]
        for it in range(0, iterations):
            """
            =============================
                FIND PHRASES BY RULES
            =============================
            """
            patterns = list()
            lt = LoopTimer(update_after=500, avg_length=10000, target=db_size)
            for abstract_id, row in infoDF.iterrows():
                doc = Doc(vocab).from_disk(
                    os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
                patterns.extend(
                    find_phrases_by_rule(doc, learn_rules, phrase_boundaries))
                n = lt.update(f"Find Phrases - {len(patterns)}")

            print()
            """
            =============================
                    BUILD MATCHER
            =============================
            """
            matcher = Matcher(vocab)
            lt = LoopTimer(update_after=10000,
                           avg_length=10000,
                           target=len(patterns))
            for p_id, pattern in enumerate(patterns):