def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
Example #2
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
Example #3
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
Example #4
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #5
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", None, pattern)
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #6
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # This assertion previously had to be commented out because it caused a segmentation fault
    assert len(matches) == 1
    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attributes on the Token. We'll be overwriting these based on
        # the matches, so we're only setting a default value, not a getter.
        # Note that set_extension() needs a default, getter or method; calling it
        # with only a name raises a ValueError.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital', default=False)
        Token.set_extension('country_latlng', default=False)
        Token.set_extension('country_flag', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
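For context, a component like this usually pairs the __init__ above with a __call__ that applies the PhraseMatcher and fills in the registered extensions, plus the has_country getter referenced for Doc and Span. The sketch below follows the attribute names from the snippet and the shape of spaCy's countries example; it is an illustration, not necessarily the original implementation.

    def __call__(self, doc):
        # Apply the PhraseMatcher and turn each match into an entity Span
        matches = self.matcher(doc)
        spans = []
        for match_id, start, end in matches:
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            country = self.countries[entity.text]
            # Fill in the token-level extensions registered in __init__
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', country.get('capital'))
                token._.set('country_latlng', country.get('latlng'))
                token._.set('country_flag', country.get('flag'))
        doc.ents = list(doc.ents) + spans  # keep any existing entities
        return doc

    def has_country(self, tokens):
        # Getter shared by Doc and Span: is any contained token tagged as a country?
        return any(token._.get('is_country') for token in tokens)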
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example #9
def test_underscore_mutable_defaults_dict(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Token.set_extension("mutable", default={})
    token1 = Doc(en_vocab, words=["one"])[0]
    token2 = Doc(en_vocab, words=["two"])[0]
    token1._.mutable["foo"] = "bar"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "bar"
    assert len(token2._.mutable) == 0
    token1._.mutable["foo"] = "baz"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "baz"
    token1._.mutable["x"] = []
    token1._.mutable["x"].append("y")
    assert len(token1._.mutable) == 2
    assert token1._.mutable["x"] == ["y"]
    assert len(token2._.mutable) == 0
Example #10
def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
Example #11
    def __init__(self, nlp: Language, path: str, lang: str='en_US'):
        path = Path.cwd() / path
        
        if not isinstance(nlp, Language):
            raise ValueError('nlp must be a spaCy Language instance.') from None

        if not path.exists():
            raise NotADirectoryError('{} does not exist.'.format(path)) from None

        dic_path, aff_path = (
            path / '{}.dic'.format(lang),
            path / '{}.aff'.format(lang),
        )

        self.hobj = HunSpell(str(dic_path), str(aff_path))  # HunSpell expects plain string paths

        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
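The get_suggestion getter referenced above is not shown in the snippet. A minimal sketch of it and of the component's __call__, assuming the pyhunspell HunSpell object stored in self.hobj, could look like this:

    def __call__(self, doc):
        # Spell-check each token once and store the boolean result
        for token in doc:
            token._.hunspell_spell = self.hobj.spell(token.text)
        return doc

    def get_suggestion(self, token):
        # Lazily ask HunSpell for suggestions when token._.hunspell_suggest is accessed
        return self.hobj.suggest(token.text)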
    def __init__(self, nlp):

        self.nlp = nlp

        Doc.set_extension("outgroup_entities", default=[], force=True)
        Doc.set_extension("ingroup_entities", default=[], force=True)
        Token.set_extension("outgroup", default=False, force=True)
        Token.set_extension("ingroup", default=False, force=True)

        self.outgroups = Matcher(nlp.vocab)

        self.outgroups.add("OUTGROUP", None,
                           [{'ENT_TYPE': {"IN": Group_ID.GROUP}}, {"_": {"ATTRIBUTE": "outgroup"}}])

        self.ingroups = Matcher(nlp.vocab)

        self.ingroups.add("INGROUP", None,
                          [{'ENT_TYPE': {"IN": Group_ID.GROUP}}, {"_": {"ATTRIBUTE": "ingroup"}}])
Example #13
def load_spacy_model(team_file, players_file):
    nlp = spacy.load('en')

    # Teams
    teams = get_teams(team_file)
    teams = teams[0]

    # Players
    player_list = get_players(players_file)

    component = NFLTeamRecognizer(nlp, teams)
    nlp.add_pipe(component, last=True)
    component = NFLPlayerRecognizer(nlp, player_list)
    nlp.add_pipe(component, last=True)
    Token.set_extension('template_tag', default=None)
    Span.set_extension('record_type', default=None)

    return nlp
Example #14
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):

        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name,
                                default=self.ANOT_NONE)
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name,
                               getter=self.is_full_name_getter)
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Example #15
def add_custom_properties(nlp):
    def is_symbol_getter(token):
        return (len(token) == 1
                and unicodedata.category(token.text).startswith('S'))

    # Override the tokenizer's default special case so 'a' keeps the norm 'a'
    # (instead of being normalized to 'going to')
    special_case = [{ORTH: u'a', NORM: u'a'}]
    nlp.tokenizer.add_special_case(u'a', special_case)
    # Naively normalize "'s" to 'is' (even though it could also indicate possession).
    # The ORTH value must match the string being added; only NORM may differ.
    special_case = [{ORTH: u"'s", NORM: u'is'}]
    nlp.tokenizer.add_special_case(u"'s", special_case)
    # Avoid ('am' -> 'a.m.')
    special_case = [{ORTH: u'am', NORM: u'am'}]
    nlp.tokenizer.add_special_case(u"am", special_case)

    # Add custom token attribute for symbols
    # token._.is_symbol now returns True if the token is a unicode symbol
    Token.set_extension('is_symbol', getter=is_symbol_getter, force=True)
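A quick usage sketch for add_custom_properties; it assumes the en_core_web_sm model is installed, and the sample sentence is arbitrary:

import spacy

nlp = spacy.load('en_core_web_sm')
add_custom_properties(nlp)

doc = nlp(u"The ticket costs 20 € and the train leaves at 9 am")
print([t.text for t in doc if t._.is_symbol])  # currency and other unicode symbols, e.g. ['€']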
Example #16
    def __init__(self, nlp, lang=None, measures=None):
        """Initialise the pipeline component.
        """
        super(Readability, self).__init__(nlp, lang=lang)
        lang = lang or nlp.lang
        # take only supported measures
        if measures:
            self.measures = {
                metric: MEASURE_PARAMETERS[lang][metric]
                for metric in set(MEASURE_PARAMETERS[lang]) & set(measures)
            }
        else:
            self.measures = MEASURE_PARAMETERS[lang]

        for metric in ["total_sentences", "total_words", "total_syllables", "total_letters"]:
            Doc.set_extension(metric, default=None, force=True)

        Token.set_extension("letters_count", default=None, force=True)

        for metric in self.measures.keys():
            if not Doc.has_extension(metric):
                Doc.set_extension(metric, getter=getattr(self, metric))
Example #17
def main():
    nlp = spacy.load('en_core_web_sm')

    fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana")
    pid_getter = lambda token: token.text in (u'123a123', u'1234', u'123123123')
    Token.set_extension("is_fruit", getter=fruit_getter)
    Token.set_extension("is_pid", getter=pid_getter)
    doc = nlp(u"I have an apple, a pear, and a watermelon")
    doc2 = nlp(u'123a123 SKF-23-Pump Handle Made to last')
    assert doc[3]._.is_fruit
    for token in doc:
        if token._.is_fruit:
            print('found: {}'.format(token.text))
    for token in doc2:
        if token._.is_pid:
            print('{} is a product id'.format(token.text))

    print('Done')
def main():
    Token.set_extension("extract", default=False)
    Token.set_extension("weight", default=0.0)
    Token.set_extension("dist_cit", default=0)
    Token.set_extension("dist_cit_norm", default=0.0)

    exp_dir = "/Users/masterman/NLP/PhD/aac/experiments/aac_generate_kw_trace"
    features_data_filename = os.path.join(exp_dir, "feature_data.json.gz")

    contexts = FeaturesReader(features_data_filename, 10)

    render_all(get_spacy_parse(contexts))
Example #19
    def add_tagger(self, tagger, name, additional_fields=[]):
        r''' Add any kind of a tagger for tokens.

        Args:
            tagger (`object/function`):
                Any object/function that takes a spacy doc as an input, does something
                and returns the same doc.
            name (`str`):
                Name for this component in the pipeline.
            additional_fields (`List[str]`):
                Fields to be added to the `_` properties of a token.
        '''
        self.nlp.add_pipe(tagger, name='tag_' + name, first=True)
        # Add custom fields needed for this usecase
        Token.set_extension('to_skip', default=False, force=True)

        # Add any additional fields that are required
        for field in additional_fields:
            Token.set_extension(field, default=False, force=True)
Example #20
    def __init__(self, nlp):
        self.load_dicts()
        # Token.set_extension('is_neg', default=False, force=True)
        # Token.set_extension('is_pos', default=False, force=True)
        Token.set_extension("is_neg", getter=self.is_neg_getter, force=True)
        Token.set_extension("is_pos", getter=self.is_pos_getter, force=True)
        Token.set_extension("is_negated", getter=self.is_negated_getter, force=True)
        Token.set_extension("span_sent", default=None, force=True)
        Doc.set_extension("has_neg", getter=self.has_neg, force=True)
        Doc.set_extension("has_pos", getter=self.has_pos, force=True)
        Span.set_extension("has_neg", getter=self.has_neg, force=True)
        Span.set_extension("has_pos", getter=self.has_pos, force=True)
Example #21
def _install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in ["bunsetsu", "tag"]:
        Token.set_extension(getattr(K.morph, k), getter=token_to_knp_span(k))
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["element", "list_"]:
            key = getattr(getattr(K, k), feature)
            Span.set_extension(key, default=None, force=True)
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["spans", "list_"]:
            key = getattr(getattr(K, k), feature)
            Doc.set_extension(key, getter=get_all_knp_features_from_sents(k, feature))
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans, getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent, getter=get_knp_parent(k))
        Span.set_extension(
            getattr(KNP_USER_KEYS, k).children, getter=get_knp_children(k)
        )
def test_dependency_matcher_span_user_data(en_tokenizer):
    doc = en_tokenizer("a b c d e")
    for token in doc:
        token.head = doc[0]
        token.dep_ = "a"
    Token.set_extension("is_c", default=False)
    doc[2]._.is_c = True
    pattern = [
        {"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}},
    ]
    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("C", [pattern])
    doc_matches = matcher(doc)
    offset = 1
    span_matches = matcher(doc[offset:])
    for doc_match, span_match in zip(sorted(doc_matches), sorted(span_matches)):
        assert doc_match[0] == span_match[0]
        for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
            assert doc_t_i == span_t_i + offset
    def __init__(self, nlp, patterns, patterns_by_class, default_label=None):
        """
        Initialise the Spacy pipeline component

        Set up the extensions on the Tokens and Spans.


        :param nlp: Spacy NLP engine
        :param patterns: List of dicts of patterns to match on
        :param patterns_by_class: List of dicts of patterns to match on, grouped by entity type
        :param default_label: default label to use on matched entities.
        """
        self.nlp = nlp
        if default_label is None:
            self.default_label = "CUSTOM"
        else:
            self.default_label = default_label
        _ = self.nlp.tokenizer.vocab[self.default_label]  # add string to vocab
        self.nlp.get_pipe("ner").add_label(
            self.default_label)  # add string to vocab
        self.patterns = patterns
        self.patterns_by_class = patterns_by_class
        # initialise the matcher and add patterns
        self.keyword_processor = KeywordProcessor()
        for k, v in self.patterns_by_class.items():
            _ = self.nlp.tokenizer.vocab[k]  # add string to vocab
            self.nlp.get_pipe("ner").add_label(k)  # add string to vocab
            self.keyword_processor.add_keywords_from_list(
                self.patterns_by_class[k])
        try:
            Token.set_extension("original_label", default=None)
        except ValueError:  # do not force overwrite if extension already set
            pass
        try:
            Span.set_extension(
                "original_label",
                getter=lambda span: list(
                    set([token._.original_label for token in span])),
            )
        except ValueError:  # do not force overwrite if extension already set
            pass
        # no callback function on the matcher patterns.
        logging.debug("PMC flashtext-based pattern matcher added.")
    def __init__(self, nlp, label='AzureResource'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID
        patterns = [nlp(org) for org in azureResources]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('AzureResource', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_azure_resource', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_azure_resource == True.
        Doc.set_extension('has_azure_resource', getter=self.has_azure_resource)
        Span.set_extension('has_azure_resource',
                           getter=self.has_azure_resource)
Example #25
    def __init__(self,
                 nlp,
                 pattern_id='EmailAddrTagger',
                 attrs=('has_email_addr', 'is_email_addr', 'email_addr'),
                 force_extension=False):
        """Initialise the pipeline component.

        nlp (Language): The shared nlp object. Used to initialise the matcher
            with the shared `Vocab`, and create `Doc` match patterns.
        pattern_id (unicode): ID of match pattern, defaults to 'EmailAddrTagger'. Can be
            changed to avoid ID clashes.
        attrs (tuple): Attributes to set on the ._ property. Defaults to
            ('has_email_addr', 'is_email_addr', 'email_addr').
        force_extension (bool): Force creation of extension objects.
        RETURNS (callable): A spaCy pipeline component.
        """
        self._has_email_addr, self._is_email_addr, self._email_addr = attrs
        self.matcher = Matcher(nlp.vocab)

        # Add email address rule to matcher
        self._email_addr_re = re.compile(email_expr,
                                         re.VERBOSE | re.I | re.UNICODE)
        email_addr_mask = lambda text: bool(self._email_addr_re.match(text))
        email_addr_flag = nlp.vocab.add_flag(email_addr_mask)
        self.matcher.add('email_addr', None, [{email_addr_flag: True}])

        # Add attributes
        Doc.set_extension(self._has_email_addr,
                          getter=self.has_email_addr,
                          force=force_extension)
        Doc.set_extension(self._email_addr,
                          getter=self.iter_email_addr,
                          force=force_extension)
        Span.set_extension(self._has_email_addr,
                           getter=self.has_email_addr,
                           force=force_extension)
        Span.set_extension(self._email_addr,
                           getter=self.iter_email_addr,
                           force=force_extension)
        Token.set_extension(self._is_email_addr,
                            default=False,
                            force=force_extension)
def merge_compounds(doc):
    """
    pipeline component to merge compound linked terms in a doc
    
    """

    Token.set_extension("compound_merge", default=False, force=True)

    def get_compound(chunk):

        """
        function which returns compound words of a token
        input: list of a token's left children
        output: the left most compound term
        """

        for token in list(chunk.root.lefts):
            if token.dep_ == "compound":
                return token

    with doc.retokenize() as retokenizer:

        for chunk in doc.noun_chunks:
            if chunk.root.dep_ == "compound":
                continue

            left_token = get_compound(chunk)

            if left_token:
                #             print(doc[left_token.i : chunk.end])

                entity_type = ""
                if left_token.ent_type:
                    entity_type = left_token.ent_type
                else:
                    entity_type = chunk.root.ent_type_

                attrs = {"ENT_TYPE": entity_type,
                         "_": {"compound_merge": True}}
                retokenizer.merge(doc[left_token.i: chunk.end], attrs=attrs)

    return doc
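A short usage sketch for merge_compounds; it assumes an English model with a dependency parser (en_core_web_sm here), and whether a given sentence actually yields a compound arc depends on that model:

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(merge_compounds, last=True)  # spaCy v2-style function component

doc = nlp("The credit card payment failed.")
print([(t.text, t._.compound_merge) for t in doc])
# tokens merged from a compound chain (e.g. "credit card payment") carry compound_merge == True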
Example #27
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
    def __init__(self,
                 nlp,
                 name="medspacy_concept_tagger",
                 attr_name="concept_tag"):
        """Create a new ConceptTagger.
        Params:
            nlp: A spaCy Language model.
            attr_name (str): The name of the attribute to set to tokens.
        """
        self.nlp = nlp
        self.name = name
        self.attr_name = attr_name
        self.target_matcher = TargetMatcher(nlp, add_ents=False)
        self.rules = []

        # If the token attribute hasn't been set, add it now
        try:
            Token.set_extension(attr_name, default="")
        except ValueError:  # extension is already registered
            pass
Example #30
    def _set_extensions(self):
        """Sets the default extensions if they do not exist yet."""
        for obj in Doc, Span, Token:
            if not obj.has_extension(self.ext_names["conll_str"]):
                obj.set_extension(self.ext_names["conll_str"], default=None)
            if not obj.has_extension(self.ext_names["conll"]):
                obj.set_extension(self.ext_names["conll"], default=None)

            if PD_AVAILABLE and not self.disable_pandas:
                if not obj.has_extension(self.ext_names["conll_pd"]):
                    obj.set_extension(self.ext_names["conll_pd"], default=None)

        # Adds fields from the CoNLL-U format that are not available in spaCy
        # However, ConllParser might set these fields when it has read CoNLL_str->spaCy
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)
    def __call__(self, doc):
        """
        Runs the document through the Table Matcher Component.  Uses regex patterns to identify terms that
        likely came from a table in the unstructured text.
        :param doc: the Doc to process
        :return: the Doc, with token._.feature_is_from_table set to True on tokens inside table-like lines
        """
        logging.debug("Called Table Matcher Component")
        TABLE_PATTERN = re.compile(r'^(.*?)[ \t]{3,}\d+')
        Token.set_extension('feature_is_from_table', default=False, force=True)

        for match in re.finditer(TABLE_PATTERN, doc.text):
            start, end = match.span()
            span = doc.char_span(start, end)
            if span is None:
                continue
            for token in span:
                token._.set('feature_is_from_table', True)

        return doc
    def __init__(self,
                 data_dir=DATA_DIR,
                 lefff_file_name=LEFFF_FILE_NAME,
                 after_melt=False):
        LOGGER.info('New LefffLemmatizer instantiated.')
        # register your new attribute token._.lefff_lemma
        if not Token.get_extension(self.name):
            Token.set_extension(self.name, default=None)
        else:
            LOGGER.info('Token {} already registered'.format(self.name))
        # In memory lemma mapping
        self.lemma_dict = {}
        self.after_melt = after_melt
        with io.open(os.path.join(data_dir, lefff_file_name),
                     encoding='utf-8') as lefff_file:
            LOGGER.info('Reading lefff data...')
            for line in lefff_file:
                els = line.split('\t')
                self.lemma_dict[(els[0], els[1])] = els[2]
        LOGGER.info('Successfully loaded lefff lemmatizer')
Example #33
    def __init__(self, spacy_pipeline):
        self.nlp = spacy_pipeline
        Token.set_extension('feature_is_time_unit', default=False)
        self.nlp.entity.add_label('time_unit')
        self.time_matcher = Matcher(self.nlp.vocab)

        self.time_matcher.add(
            'UNIT_OF_TIME', None,
            [{'LOWER': 'sec'}], [{'LOWER': 'second'}], [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}], [{'LOWER': 'minute'}], [{'LOWER': 'minutes'}],
            [{'LOWER': 'hr'}], [{'LOWER': 'hour'}],
            [{'LOWER': 'day'}], [{'LOWER': 'days'}],
            [{'LOWER': 'week'}], [{'LOWER': 'weeks'}],
            [{'LOWER': 'month'}], [{'LOWER': 'months'}],
            [{'LOWER': 'year'}], [{'LOWER': 'years'}], [{'LOWER': 'yrs'}])
    def __init__(self, use_spacy=False, spacy_extensions={}):
        """
        spacy_extensions looks like {"Tokens": [{"name": "mask", "kwargs": {"default": False}}]}
        (each value is a list of extension settings, as the loop below expects)
        """
        if use_spacy:
            import spacy

            if spacy_extensions:
                from spacy.tokens import Token

                allowed_keys = ["Tokens"]
                for key, settings_list in spacy_extensions.items():
                    # This code sucks, but you get the idea
                    for settings in settings_list:
                        if key in allowed_keys:
                            Token.set_extension(settings["name"],
                                                **settings["kwargs"])

            self.nlp = spacy.load("en_core_web_sm")
        else:
            pass
Example #35
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
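A usage sketch for this component. The class name TechCompanyRecognizer is an assumption borrowed from spaCy's example (the snippet does not show it), and the matching __call__ that sets is_tech_org and doc.ents is also not shown above:

import spacy

nlp = spacy.blank("en")
component = TechCompanyRecognizer(nlp, companies=["Alphabet Inc.", "Google", "Netflix"])
nlp.add_pipe(component, last=True)  # spaCy v2-style pipeline registration

doc = nlp("Alphabet Inc. is the company behind Google")
print(doc._.has_tech_org)
print([(t.text, t._.is_tech_org) for t in doc])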
Example #36
def test_span_as_doc_user_data(doc):
    """Test that the user_data can be preserved (but not by default)."""
    my_key = "my_info"
    my_value = 342
    doc.user_data[my_key] = my_value
    Token.set_extension("is_x", default=False)
    doc[7]._.is_x = True

    span = doc[4:10]
    span_doc_with = span.as_doc(copy_user_data=True)
    span_doc_without = span.as_doc()

    assert doc.user_data.get(my_key, None) is my_value
    assert span_doc_with.user_data.get(my_key, None) is my_value
    assert span_doc_without.user_data.get(my_key, None) is None
    for i in range(len(span_doc_with)):
        if i != 3:
            assert span_doc_with[i]._.is_x is False
        else:
            assert span_doc_with[i]._.is_x is True
    assert not any([t._.is_x for t in span_doc_without])
Example #37
def install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in [
            K.bunsetsu.element,
            K.tag.element,
            K.bunsetsu.list_,
            K.morph.list_,
            K.tag.list_,
    ]:
        Span.set_extension(k, default=None, force=True)
    for k in ["bunsetsu", "morph", "tag"]:
        Doc.set_extension(getattr(K, k).list_,
                          getter=get_all_knp_list_from_sents(k))
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans,
                           getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent,
                           getter=get_knp_parent(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).children,
                           getter=get_knp_children(k))
Example #38
class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    if not Token.get_extension('inf'):
        Token.set_extension('inf', default='')
    if not Token.get_extension('reading'):
        Token.set_extension('reading', default='')
    if not Token.get_extension('sudachi'):
        Token.set_extension('sudachi', default='')
    if not Token.get_extension('bunsetu_index'):
        Token.set_extension('bunsetu_index', default='')
    if not Token.get_extension('bunsetu_bi_label'):
        Token.set_extension('bunsetu_bi_label', default='')
    if not Token.get_extension('bunsetu_position_type'):
        Token.set_extension('bunsetu_position_type', default='')

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return SudachiTokenizer(nlp)

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        return None
Example #39
    def __init__(self, nlp, terms_dict, label='EMP_TYPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of terms is long, it's very efficient
        self.matcher = PhraseMatcher(nlp.vocab)
        for match_label in terms_dict.keys():
            patterns = [nlp(term) for term in terms_dict[match_label]]
            # patterns = [nlp(term) for term in terms]

            self.matcher.add(match_label, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_emp_type', default=False, force=True)
        Token.set_extension('is_part_time', default=False, force=True)
        Token.set_extension('is_full_time', default=False, force=True)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_emp_type == True.
        Doc.set_extension('has_emp_type', getter=self.has_emp_type, force=True)
        Span.set_extension('has_emp_type', getter=self.has_emp_type, force=True)
        Doc.set_extension('has_part_time', getter=self.has_part_time, force=True)
        Span.set_extension('has_part_time', getter=self.has_part_time, force=True)
        Doc.set_extension('has_full_time', getter=self.has_full_time, force=True)
        Span.set_extension('has_full_time', getter=self.has_full_time, force=True)
Example #40
def test_matcher_subset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 2
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 2

    # IS_SUBSET acts like "IN" for attrs other than MORPH
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # IS_SUBSET with an empty list matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUBSET": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # IS_SUBSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A"]
    doc[1]._.ext = ["C", "D"]
    assert len(matcher(doc)) == 2
Example #41
def custom_extensions(doc):

    lemmatizer = GermaLemma()
    negation_words = set(["nie", "keinsterweise", "keinerweise", "niemals", "nichts", "kaum", "keinesfalls", "ebensowenig", "nicht", "kein", "keine", "weder"])
    negation_cconj = set(['aber', 'jedoch', 'doch', 'sondern'])

    def lemma_getter(token):
        # if " " in token.text:
        #     return token.lemma_.lower()
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except Exception:
            # GermaLemma only handles a few POS classes; fall back to spaCy's lemma
            return token.lemma_.lower()

    def is_negation_getter(token):
        return token._.lemma in negation_words

    def is_sentence_break_getter(token):
        return token._.lemma in negation_cconj

    Token.set_extension("lemma", getter=lemma_getter, force=True)
    Token.set_extension("is_negation", getter=is_negation_getter, force=True)
    Token.set_extension("is_sentence_break", getter=is_sentence_break_getter, force=True)
    return doc
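A short usage sketch for custom_extensions; it assumes the de_core_news_sm model and the germalemma package are installed:

import spacy

nlp = spacy.load("de_core_news_sm")
nlp.add_pipe(custom_extensions, last=True)  # spaCy v2-style function component

doc = nlp("Das ist nicht gut, aber es geht.")
print([(t.text, t._.lemma, t._.is_negation, t._.is_sentence_break) for t in doc])
# "nicht" should come back with is_negation == True and "aber" with is_sentence_break == True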
Example #42
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example #43
def main(test_data_dir, experiment_dir, corpus):
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)

    treebank_code = nlp.meta["treebank"]
    for section in ("test", "dev"):
        if section == "dev":
            section_dir = "conll17-ud-development-2017-03-19"
        else:
            section_dir = "conll17-ud-test-2017-05-09"
        text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
        udpipe_path = (
            test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
        )
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")

        header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
        print("\t".join(header))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            input_path = inputs[input_type]
            output_path = (
                experiment_dir / corpus / "{section}.conllu".format(section=section)
            )

            parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)

            accuracy = print_results(input_type, test_scores)
            acc_path = (
                experiment_dir
                / corpus
                / "{section}-accuracy.json".format(section=section)
            )
            srsly.write_json(acc_path, accuracy)
Example #44
def get_token_conllu(token, i):
    # Reconstructed preamble (the original snippet was truncated before the `fields`
    # list): compute the 1-based head index and start the list of output lines.
    if token.head.i == token.i:
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    lines = []
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)


Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)


##################
# Initialization #
##################


def load_nlp(corpus, config):
    lang = corpus.split("_")[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp