from spacy.attrs import intify_attrs
from spacy.util import filter_spans


def tag_noun_chunks(doc):
    # entities
    spans = list(doc.ents)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            start = span.start
            end = span.end
            retokenizer.merge(doc[start:end],
                              attrs=intify_attrs({'ent_type': 'NOUN_CHUNK'}, string_store))
            # retokenizer.merge(span)
    # noun chunks
    spans = list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            start = span.start
            end = span.end
            retokenizer.merge(doc[start:end],
                              attrs=intify_attrs({'ent_type': 'NOUN_CHUNK'}, string_store))
from spacy.attrs import intify_attrs, IS_ALPHA, LEMMA


def test_attrs_idempotence(text):
    int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10})
    # running the already-intified dict through again must be a no-op
    assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
import pytest

from spacy.attrs import intify_attrs, ENT_IOB


def test_attrs_ent_iob_intify():
    # string keys
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}
    int_attrs = intify_attrs({"ENT_IOB": "I"})
    assert int_attrs == {ENT_IOB: 1}
    int_attrs = intify_attrs({"ENT_IOB": "O"})
    assert int_attrs == {ENT_IOB: 2}
    int_attrs = intify_attrs({"ENT_IOB": "B"})
    assert int_attrs == {ENT_IOB: 3}

    # integer attribute-ID keys behave the same
    int_attrs = intify_attrs({ENT_IOB: ""})
    assert int_attrs == {ENT_IOB: 0}
    int_attrs = intify_attrs({ENT_IOB: "I"})
    assert int_attrs == {ENT_IOB: 1}
    int_attrs = intify_attrs({ENT_IOB: "O"})
    assert int_attrs == {ENT_IOB: 2}
    int_attrs = intify_attrs({ENT_IOB: "B"})
    assert int_attrs == {ENT_IOB: 3}

    # anything outside the IOB alphabet is rejected
    with pytest.raises(ValueError):
        int_attrs = intify_attrs({"ENT_IOB": "XX"})
    with pytest.raises(ValueError):
        int_attrs = intify_attrs({ENT_IOB: "XX"})
from spacy.attrs import intify_attrs


def spacy_dependency_parse(self, charter_abstract):
    """
    Execute the spaCy NLP pipeline for a charter abstract.

    :param charter_abstract: the charter abstract
    :return: the spaCy doc object
    """
    # Attention during sentence segmentation: the German spaCy model tends to
    # recognise some elements too eagerly as sentences, see
    # https://github.com/explosion/spaCy/issues/1756 and
    # https://spacy.io/usage/linguistic-features#sbd
    # Lemmatisation issues:
    # - merge NEs into a single token: https://github.com/explosion/spaCy/issues/2193
    # - lemmatise NEs: https://github.com/explosion/spaCy/issues/1809
    # - spaCy German lemmatiser (1): https://github.com/explosion/spaCy/issues/2486
    # - spaCy German lemmatiser (2): https://github.com/explosion/spaCy/issues/2668
    doc = self.nlp(charter_abstract.decode('utf-8'))
    # get named entities
    entities = [(ent.start, ent.end, ent.label, ent.lemma_) for ent in doc.ents]
    # merge and retokenize named entities
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for start, end, label, lemma in entities:
            retokenizer.merge(
                doc[start:end],
                attrs=intify_attrs({'ent_type': label, 'lemma': lemma}, string_store))
    return doc
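A minimal sketch of how this method might be wired up, since the enclosing class is not part of the snippet. The class name, model name, and example text are assumptions; the .decode call implies the abstract arrives as UTF-8 encoded bytes.

import spacy

class CharterParser:
    def __init__(self):
        # German model, matching the German-specific issues referenced above
        self.nlp = spacy.load('de_core_news_sm')

# attach the function above as a method (hypothetical wiring)
CharterParser.spacy_dependency_parse = spacy_dependency_parse

parser = CharterParser()
doc = parser.spacy_dependency_parse('Der Abt verkauft den Hof.'.encode('utf-8'))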
from spacy.attrs import intify_attrs


def merge_entities(doc):
    """
    Merge named entities into single tokens in ``doc``, *in-place*.
    Can be used as a stand-alone function, or as part of a spaCy language pipeline::

        >>> spacy_lang = textacy.load_spacy('en')
        >>> spacy_lang.add_pipe(merge_entities, after='ner')
        >>> doc = spacy_lang('The entity in this sentence is Burton DeWilde.')
        >>> doc[-2]
        Burton DeWilde

    Args:
        doc (``SpacyDoc``)

    Returns:
        ``SpacyDoc``: Input ``doc`` with merged entities.
    """
    try:
        # retokenizer was added to spacy in v2.0.11
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for ent in doc.ents:
                retokenizer.merge(
                    doc[ent.start:ent.end],
                    attrs=intify_attrs({'ent_type': ent.label}, string_store))
    except AttributeError:
        # fall back to the deprecated Doc.merge() API on older spaCy versions
        ents = [(ent.start_char, ent.end_char, ent.label) for ent in doc.ents]
        for start_char, end_char, label in ents:
            doc.merge(start_char, end_char, ent_type=label)
    return doc
from spacy.attrs import intify_attrs, IS_ALPHA, ORTH


def test_attrs_do_deprecated(text):
    # "F" is the deprecated shorthand for ORTH
    int_attrs = intify_attrs(
        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
    )
    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
from spacy.attrs import intify_attrs
from spacy.util import filter_spans


def tag_chunks_spans(doc, spans, span_type):
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            start = span.start
            end = span.end
            retokenizer.merge(doc[start:end],
                              attrs=intify_attrs({'ent_type': span_type}, string_store))
from spacy.attrs import intify_attrs
from spacy.util import filter_spans


def tag_chunks(doc):
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            start = span.start
            end = span.end
            retokenizer.merge(doc[start:end],
                              attrs=intify_attrs({'ent_type': 'ENTITY'}, string_store))
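A quick end-to-end sketch of the tagging helpers above. The model and example text are illustrative and assume en_core_web_sm is installed.

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The quick brown fox jumps over the lazy dog.')
tag_chunks(doc)
# noun chunks such as 'The quick brown fox' are now single tokens
# carrying ent_type_ == 'ENTITY'
print([(token.text, token.ent_type_) for token in doc])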
from typing import Iterable

from spacy import attrs
from spacy.tokens import Doc, Span


def merge_spans(spans: Iterable[Span], doc: Doc) -> None:
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Args:
        spans (Iterable[:class:`spacy.tokens.Span`])
        doc (:class:`spacy.tokens.Doc`)
    """
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(
                doc[span.start:span.end],
                attrs=attrs.intify_attrs({"ent_type": span.label}, string_store),
            )
from spacy import attrs


def merge_spans(spans, doc):
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Args:
        spans (Iterable[``spacy.Span``])
        doc (``spacy.Doc``)
    """
    try:
        # retokenizer was added to spacy in v2.0.11
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for span in spans:
                retokenizer.merge(
                    doc[span.start:span.end],
                    attrs=attrs.intify_attrs({'ent_type': span.label}, string_store))
    except AttributeError:
        # fall back to the deprecated Doc.merge() API on older spaCy versions
        spans = [(span.start_char, span.end_char, span.label) for span in spans]
        for start_char, end_char, label in spans:
            doc.merge(start_char, end_char, ent_type=label)
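A quick usage sketch that works with either merge_spans variant above. The model and sentence are illustrative and assume en_core_web_sm is installed.

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Burton DeWilde lives in New York City.')
merge_spans(doc.ents, doc)
# multi-word entities such as 'New York City' are now single tokens
print([token.text for token in doc])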
from spacy.attrs import intify_attrs, LEMMA, NORM, ORTH


def test_attrs_key(text):
    assert intify_attrs({"ORTH": text}) == {ORTH: text}
    assert intify_attrs({"NORM": text}) == {NORM: text}
    assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}
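The merge helpers above pass the doc's StringStore as strings_map, which is the behaviour these tests pin down: intify_attrs maps stringy attribute names to spaCy's integer attribute IDs and interns string values through the store. A short illustration (the 'ENTITY' label is arbitrary):

from spacy.attrs import intify_attrs, ENT_TYPE
from spacy.strings import StringStore

strings = StringStore()
int_attrs = intify_attrs({'ent_type': 'ENTITY'}, strings_map=strings)
# both the key and the value are now integers
assert int_attrs == {ENT_TYPE: strings['ENTITY']}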