def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", None, pattern)
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
def __init__(self, nlp, label="GPE"): """Initialise the pipeline component. The shared nlp instance is used to initialise the matcher with the shared vocab, get the label ID and generate Doc objects as phrase match patterns. """ # Make request once on initialisation and store the data r = requests.get("https://restcountries.eu/rest/v2/all") r.raise_for_status() # make sure requests raises an error if it fails countries = r.json() # Convert API response to dict keyed by country name for easy lookup # This could also be extended using the alternative and foreign language # names provided by the API self.countries = {c["name"]: c for c in countries} self.label = nlp.vocab.strings[label] # get entity label ID # Set up the PhraseMatcher with Doc patterns for each country name patterns = [nlp(c) for c in self.countries.keys()] self.matcher = PhraseMatcher(nlp.vocab) self.matcher.add("COUNTRIES", None, *patterns) # Register attribute on the Token. We'll be overwriting this based on # the matches, so we're only setting a default value, not a getter. # If no default value is set, it defaults to None. Token.set_extension("is_country", default=False) Token.set_extension("country_capital", default=False) Token.set_extension("country_latlng", default=False) Token.set_extension("country_flag", default=False) # Register attributes on Doc and Span via a getter that checks if one of # the contained tokens is set to is_country == True. Doc.set_extension("has_country", getter=self.has_country) Span.set_extension("has_country", getter=self.has_country)
def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    # Make request once on initialisation and store the data
    r = requests.get('https://restcountries.eu/rest/v2/all')
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()
    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID
    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries.keys()]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)
    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # If no default value is set, it defaults to None.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital')
    Token.set_extension('country_latlng')
    Token.set_extension('country_flag')
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
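# Hedged usage sketch (not part of the original source): assuming the two
# __init__ variants above belong to a callable pipeline component class,
# here called CountriesComponent purely for illustration, wiring it into a
# spaCy v2-style pipeline would look roughly like this. The model name
# "en_core_web_sm" is also an assumption.
#
#     import spacy
#
#     nlp = spacy.load("en_core_web_sm")
#     component = CountriesComponent(nlp)  # fetches the country data once
#     nlp.add_pipe(component, last=True)   # spaCy v2-style registration
#     doc = nlp("Some text about Colombia and the Czech Republic.")
#     print(doc._.has_country)
#     print([(t.text, t._.country_capital) for t in doc if t._.is_country])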
def add_token_extension(force=False):
    """
    Extend spaCy's :py:class:`spacy.tokens.Token` with attributes for
    sentiment specific data.

    This should be called only once during the runtime of the application.
    If multiple calls cannot be avoided, use ``force=True`` to prevent spaCy
    from rejecting the redundant setting.
    """
    Token.set_extension('topic', default=None, force=force)
    Token.set_extension('rating', default=None, force=force)
    Token.set_extension('is_negation', default=False, force=force)
    Token.set_extension('is_intensifier', default=False, force=force)
    Token.set_extension('is_diminisher', default=False, force=force)
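# Hedged usage sketch (added for illustration, not from the original source):
# the helper above is meant to run once at startup; if it may run again, for
# example between test cases, pass force=True so the existing extensions are
# overwritten instead of raising an error.
#
#     add_token_extension()            # first call, registers the attributes
#     add_token_extension(force=True)  # safe to call again later
#     # afterwards every Token exposes ._.topic, ._.rating, ._.is_negation, ...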
def process(self, docOfSentence, nlp, optionalObject=None):
    docOfSentence, sentenceEnding = self.removeAllSpacesAndPunctiationMarksAtEndOfSentence(
        docOfSentence)
    Token.set_extension("isMainClause", default=False, force=True)
    Token.set_extension("predicateIsAtBegin", default=True, force=True)
    Token.set_extension("shouldBeLowercase", default=False, force=True)
    Token.set_extension("belongsToPreviousPart", default=False, force=True)
    Span.set_extension("isMainClause", default=False, force=True)
    Span.set_extension("predicateIsAtBegin", default=True, force=True)
    Span.set_extension("shouldBeLowercase", default=False, force=True)
    Span.set_extension("belongsToPreviousPart", default=False, force=True)

    # Split the sentence into its main and subordinate clauses
    allSentenceParts = self.splitAndCategorizeSentenceParts(docOfSentence)
    if allSentenceParts is None:
        return None

    # Link the subordinate clauses to their main clauses
    relatedMainAndDependentClauses = self.relateSentenceToEachOther(
        allSentenceParts)
    if relatedMainAndDependentClauses is None:
        return None

    # Swap the main and subordinate clauses
    possibleVariations = self.changeMainAndDependentClauses(
        relatedMainAndDependentClauses, sentenceEnding)

    # Return the variations
    return possibleVariations
def __init__(self, spacy_pipeline, labels):
    """
    :param spacy_pipeline: An existing spaCy pipeline
    :param labels: The subset of labels from the gold annotations to restrict labeling to.
    """
    super().__init__(
        component_name=self.name,
        dependencies=self.dependencies
    )
    self.nlp = spacy_pipeline
    self.labels = labels
    self.failed_overlay_count = 0
    self.failed_identifying_span_count = 0
    Token.set_extension('gold_label', default="O", force=True)
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
    assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
def test_matcher_superset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1
    # IS_SUPERSET with more than one value only matches for MORPH
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0
    # IS_SUPERSET with one value is the same as ==
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1
    # IS_SUPERSET with an empty value matches everything
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 3
    # IS_SUPERSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1
def __init__(self, nlp, path=HUNSPELL_PROFILE):
    if path in DEFAULT_DICTIONARY_PATHS:
        default_path = DEFAULT_DICTIONARY_PATHS[path]
        dic_path, aff_path = (
            os.path.join(default_path, 'en_US.dic'),
            os.path.join(default_path, 'en_US.aff'),
        )
    else:
        assert len(path) == 2, 'Include two paths: dic_path and aff_path'
        dic_path, aff_path = path
    self.hobj = HunSpell(dic_path, aff_path)
    Token.set_extension('hunspell_spell', default=None)
    Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
def __init__(self, nlp, semantic):
    elements = semantic.get_all_values()
    self.label = nlp.vocab.strings[self.name]
    patterns = [nlp(org) for org in elements]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add(self.name, None, *patterns)
    Token.set_extension(self.extension, default=False, force=True)
    Doc.set_extension(self.extension, getter=self.has_quantifier, force=True)
    Span.set_extension(self.extension, getter=self.has_quantifier, force=True)
def __init__(self, spacy_pipeline):
    self.nlp = spacy_pipeline
    Token.set_extension('feature_is_volume_unit', default=False)
    self.nlp.entity.add_label('volume_unit')
    self.volume_matcher = Matcher(self.nlp.vocab)
    self.volume_matcher.add('UNIT_OF_VOLUME', None,
                            [{'LOWER': 'ml'}],
                            [{'ORTH': 'dL'}],
                            [{'LOWER': 'cc'}],
                            [{'ORTH': 'L'}])
def sentences_gen(labels):
    for label in labels:
        doc = nlp(gendocs(label))
        for i, sent in enumerate(doc.sents):
            res = []
            for j, token in enumerate(sent):
                Token.set_extension('lemma', getter=lemma_getter, force=True)
                if not token.is_punct and not token.is_digit and not token.is_space:
                    tok = token._.lemma.lower()
                    tok = tok.replace('.', '')
                    res.append(tok)
            # print(sent)
            yield res
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
def darcolor(doc):
    try:
        Token.set_extension('plot', default={})
    except:
        pass
    for token in doc:
        node_label = '{0} [{1}] /{2})'.format(token.orth_, token.i, token.pos_)
        token._.plot['label'] = node_label
        if token.pos_ == 'VERB':
            token._.plot['color'] = 'green'
        elif token.pos_ == 'PROPN':
            token._.plot['color'] = 'red'
        elif token.pos_ == 'NOUN':
            token._.plot['color'] = 'blue'
    return doc
def test_custom_attribute(text):
    from spacy.tokens import Token

    fruit_getter = lambda token: token.text in ("apple", "pear", "banana")
    Token.set_extension("is_fruit", getter=fruit_getter)
    doc = dframcy.nlp(text)
    dataframe = dframcy.to_dataframe(doc, columns=["id", "start", "end"],
                                     custom_attributes=["is_fruit"])
    results = pd.DataFrame({
        "token_start": [0, 2, 7, 10],
        "token_end": [1, 6, 9, 15],
        "token_is_fruit": [False, False, False, True],
    })
    assert_frame_equal(dataframe, results)
def test_call_lexicon_component(self):
    """
    Test running a doc through the lexicon component and properly overlaying
    features from the lexicon.
    :return:
    """
    lexicon_component = LexiconComponent(self.nlp, self.lexicon)
    self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), False)
    self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'), False)
    doc = lexicon_component(self.doc)
    self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), True)
    self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'), True)
def __init__(self, nlp, pattern_list, match_id='FALSE_DATE', label='FALSE_DATE',
             regex_pat=regex_pat):
    # register a new token extension to flag false_date tokens
    self.label = nlp.vocab.strings[label]  # get entity label ID
    self.orig_label = nlp.vocab.strings['DATE']  # get entity label ID for date
    Token.set_extension('is_false_date', default=False, force=True)
    self.matcher = Matcher(nlp.vocab)
    self.matcher.add(match_id, None, pattern_list)
    self.regex_pat = regex_pat
    self.nlp = nlp
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_false_date == True.
    Doc.set_extension('has_false_date', getter=self.has_false_date, force=True)
    Span.set_extension('has_false_date', getter=self.has_false_date, force=True)
def _task_add_metadata_per_doc(self, key, data, default):
    logger.debug('worker `%s`: adding metadata per document' % self.name)

    attr_name = 'meta_' + key
    Token.set_extension(attr_name, default=default)

    for doc in self._docs:
        meta_vals = data.get(doc._.label, [default] * len(doc))
        assert sum(doc.user_data['mask']) == len(meta_vals)
        for t, v, m in zip(doc, meta_vals, doc.user_data['mask']):
            if m:
                setattr(t._, attr_name, v)

    if key not in self._metadata_attrs:
        self._metadata_attrs[key] = default
def __init__(self, spacy_pipeline):
    self.nlp = spacy_pipeline
    Token.set_extension('feature_is_mass_unit', default=False)
    self.nlp.entity.add_label('mass_unit')
    self.mass_matcher = Matcher(self.nlp.vocab)
    self.mass_matcher.add('UNIT_OF_MASS', None,
                          [{'LOWER': 'mcg'}],
                          [{'LOWER': 'microgram'}],
                          [{'LOWER': 'micrograms'}],
                          [{'ORTH': 'mg'}],
                          [{'LOWER': 'milligram'}],
                          [{'LOWER': 'g'}],
                          [{'LOWER': 'kg'}],
                          [{'ORTH': 'mEq'}])
def __init__(self, nlp, pattern_id='IPTagger', attrs=('has_ipv4', 'is_ipv4', 'ipv4'),
             force_extension=False, subnets_to_keep=4):
    """Initialise the pipeline component.

    nlp (Language): The shared nlp object. Used to initialise the matcher
        with the shared `Vocab`, and create `Doc` match patterns.
    pattern_id (unicode): ID of match pattern, defaults to 'IPTagger'. Can be
        changed to avoid ID clashes.
    attrs (tuple): Attributes to set on the ._ property. Defaults to
        ('has_ipv4', 'is_ipv4', 'ipv4').
    force_extension (bool): Force creation of extension objects.
    subnets_to_keep (int): Number of subnets to include in lemmatization.
    RETURNS (callable): A spaCy pipeline component.
    """
    self._has_ipv4, self._is_ipv4, self._ipv4 = attrs
    self.matcher = Matcher(nlp.vocab)
    if (subnets_to_keep < 1) or (subnets_to_keep > 4):
        raise ValueError('Subnets_to_keep must be in the range 1-4')
    self.subnets_to_keep = subnets_to_keep
    # Add IPv4 rule to matcher
    self._ipv4_re = re.compile(ipv4_expr, re.VERBOSE | re.I | re.UNICODE)
    ipv4_mask = lambda text: bool(self._ipv4_re.match(text))
    ipv4_flag = nlp.vocab.add_flag(ipv4_mask)
    self.matcher.add('IPV4', None, [{ipv4_flag: True}])
    # Add attributes
    # Need to force since extensions are global by default
    Doc.set_extension(self._has_ipv4, getter=self.has_ipv4, force=force_extension)
    Doc.set_extension(self._ipv4, getter=self.iter_ipv4, force=force_extension)
    Span.set_extension(self._has_ipv4, getter=self.has_ipv4, force=force_extension)
    Span.set_extension(self._ipv4, getter=self.iter_ipv4, force=force_extension)
    Token.set_extension(self._is_ipv4, default=False, force=force_extension)
def __init__(
    self,
    nlp: Language,
    merge_spans: bool = True,
    lookup: Optional[Dict[str, str]] = None,
    pattern_id: str = "EMOJI",
    attrs: Tuple[str, str, str, str] = DEFAULT_ATTRS,
    force_extension: bool = True,
) -> None:
    """Initialise the pipeline component.

    nlp (Language): The shared nlp object. Used to initialise the matcher
        with the shared `Vocab`, and create `Doc` match patterns.
    attrs (tuple): Attributes to set on the ._ property. Defaults to
        ('has_emoji', 'is_emoji', 'emoji_desc', 'emoji').
    pattern_id (unicode): ID of match pattern, defaults to 'EMOJI'. Can be
        changed to avoid ID clashes.
    merge_spans (bool): Merge spans containing multi-character emoji. Will
        only merge combined emoji resulting in one icon, not sequences.
    lookup (dict): Optional lookup table that maps emoji unicode strings to
        custom descriptions, e.g. translations or other annotations.
    RETURNS (callable): A spaCy pipeline component.
    """
    self._has_emoji, self._is_emoji, self._emoji_desc, self._emoji = attrs
    self.merge_spans = merge_spans
    self.lookup = lookup or {}
    self.matcher = PhraseMatcher(nlp.vocab)
    emoji_patterns = list(nlp.tokenizer.pipe(EMOJI.keys()))
    self.matcher.add(pattern_id, None, *emoji_patterns)
    # Add attributes
    Doc.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
    Doc.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
    Span.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
    Span.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
    Token.set_extension(self._is_emoji, default=False, force=force_extension)
    Token.set_extension(self._emoji_desc, getter=self.get_emoji_desc, force=force_extension)
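# Hedged usage sketch (an assumption, not from the original source): if the
# __init__ above belongs to a callable emoji-tagging component class, here
# called EmojiComponent only for illustration, it could be attached to a
# spaCy v2-style pipeline like this, including a custom lookup entry:
#
#     nlp = spacy.load("en_core_web_sm")
#     emoji = EmojiComponent(nlp, lookup={"👍": "thumbs up"})
#     nlp.add_pipe(emoji, first=True)
#     doc = nlp("This is a test 👍")
#     print(doc._.has_emoji)
#     print([(t.text, t._.emoji_desc) for t in doc if t._.is_emoji])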
def _get_dep_noun(tag: Token) -> str: f: Dict[str, Any] = tag._.knp_morph_tag._.knp_tag_element.features if "係" not in f: return "dep" k = f["係"] if f["係"] != "未格" or "解析格" not in f else f["解析格"] + "格" x = { "隣": "nmod", "文節内": "compound", "ガ格": "nsubj", "ヲ格": "obj", "ガ2格": "dislocated", } if k in x: return x[k] elif k == "ノ格": if tag.head.pos in {VERB, ADJ}: return "nsubj" elif tag.pos in {DET, PRON}: tag.pos = DET return "det" else: return "nummod" if tag.pos == NUM else "nmod" elif "並列タイプ" in f: if tag.head.pos in {VERB, ADJ}: return "obl" else: return "conj" return "obl"
def __init__(self, nlp, lemma_sequences, attribute, label, name, merge=False):
    self.name = name
    self.nlp = nlp
    self.label = label
    self.attribute = attribute
    self.matcher = Matcher(self.nlp.vocab)
    self.merge = merge
    # Build patterns from sequences of lemmas read from the lexicon file
    for lemmas in lemma_sequences:
        pattern = []
        for lemma in lemmas.split():
            pattern.append({LEMMA: lemma})
        self.matcher.add(label, None, pattern)
    Token.set_extension(attribute, default=False, force=True)
def __init__(self, nlp, attr_name="concept_tag"): """Create a new ConceptTagger. Params: nlp: A spaCy Language model. attr_name (str): The name of the attribute to set to tokens. """ self.nlp = nlp self.attr_name = attr_name self.target_matcher = TargetMatcher(nlp, add_ents=False) self.rules = [] # If the token attribute hasn't been set, add it now try: Token.set_extension(attr_name, default="") except: pass
def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='',
             attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
    self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
    self.keyword_processor = KeywordProcessor()
    self.keyword_processor.add_keywords_from_list(keywords_list)
    self.keyword_processor.add_keywords_from_dict(keywords_dict)
    if keywords_file:
        self.keyword_processor.add_keyword_from_file(keywords_file)
    self.label = label
    # Add attributes
    Doc.set_extension(self._has_entities, getter=self.has_entities)
    Doc.set_extension(self._entities, getter=self.iter_entities)
    Span.set_extension(self._has_entities, getter=self.has_entities)
    Span.set_extension(self._entities, getter=self.iter_entities)
    Token.set_extension(self._is_entity, default=False)
    Token.set_extension(self._entity_desc, getter=self.get_entity_desc)
def __init__(self):
    # register Token attributes if they are not registered already
    from spacy.tokens import Token
    for attr_name in [
        "speaker",
        "start_time",
        "end_time",
        "confidence",
        "entity_linking",
        "addressee",
    ]:
        if not Token.has_extension(attr_name):
            Token.set_extension(attr_name, default=None)

    # register Span attributes if they are not registered already
    from spacy.tokens import Span
    if not Span.has_extension("speaker"):
        Span.set_extension("speaker", getter=self.span_speaker)
    if not Span.has_extension("start_time"):
        Span.set_extension("start_time", getter=self.span_start_time)
    if not Span.has_extension("end_time"):
        Span.set_extension("end_time", getter=self.span_end_time)
    if not Span.has_extension("confidence"):
        Span.set_extension("confidence", getter=self.span_average_confidence)
    if not Span.has_extension("entity_linking"):
        Span.set_extension("entity_linking", getter=self.span_entity_linking)
    if not Span.has_extension("addressee"):
        Span.set_extension("addressee", getter=self.span_addressee)

    # minimalist spaCy pipeline (used only for its tokenizer)
    self.tokenizer = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

    # custom spaCy pipeline (that adds forced alignment attributes and ensures
    # that a new sentence starts at every speaker change)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp.add_pipe(self.placeholder, name="forced_alignment", first=True)
    self.nlp.add_pipe(self.start_sentence_at_speaker_change, after="forced_alignment")
def __call__(self, doc):
    NSUBJ = 429
    DOBJ = 416
    # doc._.set("word_pair", None)
    for sentence in doc.sents:
        word_pairs = []
        for chunk in sentence.noun_chunks:
            if not chunk.root.head.pos == VERB or not (
                    chunk.root.dep == NSUBJ or chunk.root.dep == DOBJ):
                continue
            if not (chunk.root.head.is_stop and chunk.root.is_stop):
                continue
            noun_norm = chunk.root.text if chunk.root.pos == PRON else chunk.root.lemma_
            noun = Token(noun_norm, chunk.root)
            verb = Token(chunk.root.head.lemma_, chunk.root.head)
            word_pair = WordPair(verb, noun)
            word_pair.noun_chunk = chunk
            # word pair vectorized
            word_pair = self.nlp.w2v(word_pair)
            if not word_pair.has_vector:
                continue
            # word pair clustered
            word_pair = self.v2c(word_pair)
            # SPS identification
            word_pair.sps = self.word_freq.sps(word_pair.verb.cluster)
            # SA identification
            word_pair.sa = 0
            for word in chunk:
                word_pair.sa += self.word_freq.sa(word_pair.verb.cluster, word.cluster)
            word_pair.sa = word_pair.sa / len(chunk)
            word_pairs.append(word_pair)
        sentence._.set("word_pairs", word_pairs)
    return doc
def test_underscore_mutable_defaults_dict(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Token.set_extension("mutable", default={})
    token1 = Doc(en_vocab, words=["one"])[0]
    token2 = Doc(en_vocab, words=["two"])[0]
    token1._.mutable["foo"] = "bar"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "bar"
    assert len(token2._.mutable) == 0
    token1._.mutable["foo"] = "baz"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "baz"
    token1._.mutable["x"] = []
    token1._.mutable["x"].append("y")
    assert len(token1._.mutable) == 2
    assert token1._.mutable["x"] == ["y"]
    assert len(token2._.mutable) == 0
def __init__(self, nlp):
    # register a new token extension to flag bad HTML
    Token.set_extension('bad_html', default=False, force=True)
    self.matcher = Matcher(nlp.vocab)
    self.matcher.add('BAD_HTML', None,
                     [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
                     [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
def in_compound(tok: Token):
    """Returns true if the spaCy token is part of a compound phrase"""
    if tok.dep_ == "compound":
        return True
    elif tok.i > 0 and tok.nbor(-1).dep_ == "compound":
        return True
    return False
def __init__(self, data_dir=DATA_DIR, lefff_file_name=LEFFF_FILE_NAME, after_melt=False):
    LOGGER.info('New LefffLemmatizer instantiated.')
    # register your new attribute token._.lefff_lemma
    Token.set_extension('lefff_lemma', default=None)
    # In-memory lemma mapping
    self.lemma_dict = {}
    self.after_melt = after_melt
    with io.open(os.path.join(data_dir, lefff_file_name), encoding='utf-8') as lefff_file:
        LOGGER.info('Reading lefff data...')
        for line in lefff_file:
            els = line.split('\t')
            self.lemma_dict[(els[0], els[1])] = els[2]
    LOGGER.info('Successfully loaded lefff lemmatizer')
def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
def __init__(self, nlp, companies=tuple(), label='ORG'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    self.label = nlp.vocab.strings[label]  # get entity label ID
    # Set up the PhraseMatcher – it can now take Doc objects as patterns,
    # so even if the list of companies is long, it's very efficient
    patterns = [nlp(org) for org in companies]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('TECH_ORGS', None, *patterns)
    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    Token.set_extension('is_tech_org', default=False)
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_tech_org == True.
    Doc.set_extension('has_tech_org', getter=self.has_tech_org)
    Span.set_extension('has_tech_org', getter=self.has_tech_org)
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
def main(test_data_dir, experiment_dir, corpus):
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False
    nlp = load_nlp(experiment_dir, corpus)
    treebank_code = nlp.meta["treebank"]
    for section in ("test", "dev"):
        if section == "dev":
            section_dir = "conll17-ud-development-2017-03-19"
        else:
            section_dir = "conll17-ud-test-2017-05-09"
        text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
        udpipe_path = (
            test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
        )
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
        header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
        print("\t".join(header))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            input_path = inputs[input_type]
            output_path = (
                experiment_dir / corpus / "{section}.conllu".format(section=section)
            )
            parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
            accuracy = print_results(input_type, test_scores)
            acc_path = (
                experiment_dir
                / corpus
                / "{section}-accuracy.json".format(section=section)
            )
            srsly.write_json(acc_path, accuracy)
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)


Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)


##################
# Initialization #
##################


def load_nlp(corpus, config):
    lang = corpus.split("_")[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp