Example #1
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
Example #2
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])
Example #3
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
Example #5
 def to_doc(self, vocab, is_parsed=False):
     words = [morph.surface for morph in self.morphs] + [EOS]
     spaces = [morph.trailing_space for morph in self.morphs] + [False]
     doc = Doc(vocab, words=words, spaces=spaces)
     root_label = None
     for token, morph in zip(doc, self.morphs):
         token.tag_ = morph.pos
         token._.pos_detail = morph.pos_detail
         token._.inf = morph.inf
         token.lemma_ = morph.lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
         if is_parsed and morph.dep_label:
             if morph.id == morph.dep_morph.id:
                 root_label = morph.dep_label
                 token.dep_ = root_label if root_label.find('as_') >= 0 else '{}_as_{}'.format(root_label, morph.pos)
                 token.head = doc[-1]
             else:
                 token.dep_ = morph.dep_label
                 token.head = doc[morph.dep_morph.id]
     doc[-1].tag_ = 'X'  # work around: lemma_ must be set after tag_ (spaCy's bug)
     doc[-1].lemma_ = EOS
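     # if a root morph was found, make the trailing EOS token the syntactic root; the real root tokens were attached to it in the loop above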
     if root_label:
         doc[-1].head = doc[-1]
         doc[-1].dep_ = 'root'
         doc.is_parsed = True
     return doc
Example #6
def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)
Example #7
def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
    """
    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
    doc.is_parsed = True
    assert list(doc[0:3].noun_chunks) == []
Example #8
    def __call__(self, text):
        """Convert input text to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        udpipe_sents = self.model(text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self.get_tokens_with_heads(udpipe_sents)
        if not tokens:
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL XPOS tags, custom for each UD treebank
            #tags.append(self.vocab.strings.add(token.xpostag or ""))
            tags.append(self.vocab.strings.add(token.feats or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
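        # HEAD values are offsets relative to each token; negative offsets wrap around when cast into the uint64 array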
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #9
    def __call__(self, text):
        """Convert a StanfordNLP Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.dependency_relation
                                               or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #10
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
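     # HEAD is stored as an offset relative to the current token; the root keeps offset 0 and the ROOT relation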
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
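   # spaCy v2 exposes writable is_tagged/is_parsed flags; spaCy v3 removed them, so fall back to setting per-token morphology instead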
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   t=Tree(u)
   t._makeChunks()
   bunsetu=["I"]*len(doc)
   for s in t._cabocha._sentences:
     for w in s:
       try:
         bunsetu[w[0]-1]="B"
       except:
         pass
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc
Example #11
def knp_dependency_parser(doc: Doc) -> Doc:
    tag_spans: Iterable[Span] = doc._.get(KNP_USER_KEYS.tag.spans)
    for tag in tag_spans:
        parent: Optional[Span] = tag._.get(KNP_USER_KEYS.tag.parent)
        if parent is not None:
            tag[0].head = parent[0]
        else:
            tag[0].head = tag[0]
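        # chain the remaining tokens of the tag span to the token immediately before them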
        for p, c in zip(tag, tag[1:]):
            c.head = p
    doc.is_parsed = True
    return doc
Example #12
    def __call__(self, text):
        """Convert a StanfordNLP Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            raise ValueError("No tokens available.")
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while not span.startswith(token.text):
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.dependency_relation
                                               or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD, LEMMA]
        array = numpy.array(list(zip(pos, tags, deps, heads, lemmas)),
                            dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #13
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
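     # use the Gloss= value from MISC (up to the next "|") as the token's NORM, falling back to the surface form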
     i=misc.find("Gloss=")
     if i<0:
       norms.append(vs.add(form))
     else:
       j=misc.find("|",i)
       norms.append(vs.add(misc[i+6:] if j<0 else misc[i+6:j]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example #14
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
Example #16
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    if spacy_version >= 3:
        doc2[0].set_morph("Feat=Val")
    else:
        doc1.is_parsed = True
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        if spacy_version < 3:
            doc2.is_tagged = True
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
Example #17
def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
        matcher.add("TEST3", [doc3])
        assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
        matcher.add("TEST4", [doc2])
        assert not record.list
Example #18
 def __call__(self,text):
   u=self.model(text) if text else ""
   if not self.convUD:
     return u
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   heads=[]
   deps=[]
   spaces=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,dummy_feats,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     if deprel=="root" or deprel=="ROOT":
       heads.append(0)
       deps.append(r)
     elif head=="0":
       heads.append(0)
       deps.append(vs.add(deprel))
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     pass
   return doc
Example #19
def knp_dependency_parser(doc: Doc) -> Doc:
    tag_spans: Iterable[Span] = doc._.get(KNP_USER_KEYS.tag.spans)
    s = []
    for tag in tag_spans:
        for c in tag[1:]:
            c.head = tag[0]
            c.dep_ = _get_child_dep(c)
        parent: Optional[Span] = tag._.get(KNP_USER_KEYS.tag.parent)
        if parent is not None:
            tag[0].head = parent[0]
            tag[0].dep_ = _get_dep(tag[0])
        else:
            tag[0].head = tag[0]
            tag[0].dep_ = "ROOT"
        s.append(tag[0])
    s = _modify_head_punct(s)
    s = _modify_head_flat(s)
    s = _modify_head_conj(s)
    doc.is_parsed = True
    return doc
Example #20
 def __call__(self,text):
   from suparkanbun.tradify import tradify
   t=""
   for c in text:
     if c in self.simplify:
       t+=self.simplify[c]
     else:
       t+=c
   if self.danku!=None:
     u=t.replace("\n","")
     t=""
     while len(u)>500:
       s=self.danku(u[0:500])
       r=""
       for c,p in s:
         r+=c
         if p=="S" or p=="E":
           r+="\n"
       r="\n".join(r.split("\n")[0:-2])+"\n"
       t+=r
       u=u[len(r.replace("\n","")):]
     s=self.danku(u)
     for c,p in s:
       t+=c
       if p=="S" or p=="E":
         t+="\n"
   if len(t)<500:
     p=self.tagger(t.replace("\n",""))
   else:
     p=[]
     u=""
     for s in t.strip().split("\n"):
       u+=s
       if len(u)>400:
         p+=self.tagger(u)
         u=""
     if len(u)>0:
       p+=self.tagger(u)
   u=self.supar.predict([[c for c in s] for s in t.strip().split("\n")],lang=None)
   t=text.replace("\n","")
   i=0
   w=[]
   for s in u.sentences:
     v=[]
     for h,d in zip(s.values[6],s.values[7]):
       j=t[i]
       k=tradify[j] if j in tradify else j
       v.append({"form":j,"lemma":k,"pos":p[i][1],"head":h,"deprel":d})
       i+=1
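     # merge a "compound" token whose head is the next token and whose POS matches it into that token, then shift later head indices down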
     for j in reversed(range(0,len(v)-1)):
       if v[j]["deprel"]=="compound" and v[j]["head"]==j+2 and v[j]["pos"]==v[j+1]["pos"]:
         k=v.pop(j)
         v[j]["form"]=k["form"]+v[j]["form"]
         v[j]["lemma"]=k["lemma"]+v[j]["lemma"]
         for k in range(0,len(v)):
           if v[k]["head"]>j+1:
             v[k]["head"]-=1
     w.append(list(v))
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for s in w:
     for i,t in enumerate(s):
       form=t["form"]
       words.append(form)
       lemmas.append(vs.add(t["lemma"]))
       p=t["pos"].split(",")
       xpos=",".join(p[0:4])
       pos.append(vs.add(p[4]))
       tags.append(vs.add(xpos))
       feats.append(p[5])
       if t["deprel"]=="root":
         heads.append(0)
         deps.append(r)
       else:
         heads.append(t["head"]-i-1)
         deps.append(vs.add(t["deprel"]))
       spaces.append(False)
       g=self.gloss(form,xpos)
       if g!=None:
         norms.append(vs.add(g))
       else:
         norms.append(vs.add(form))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example #21
def doc_not_parsed(en_tokenizer):
    text = "This is a sentence. This is another sentence. And a third."
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.is_parsed = False
    return doc
Example #22
    def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
        """Convert input text to a spaCy Doc.

        text: The text to process. It can be presegmented or pretokenized:
            str             : raw text,
            List[str]       : presegmented text,
            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        udpipe_sents = self.model(text=text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
        if not tokens:
            return Doc(vocab=self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self._check_aligned(text=text, tokens=tokens)
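        # if the tokens cannot be aligned with the original text, rebuild the text from the token forms and the NO_SPACE markers in MISC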
        if not is_aligned:
            text = ""
            for token in tokens:
                text += token.form
                if NO_SPACE not in token.misc:
                    text += " "
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL XPOS tags, custom for each UD treebank
            tags.append(self.vocab.strings.add(token.xpostag or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        try:
            attrs = [POS, TAG, DEP, HEAD]
            array = numpy.array(list(zip(pos, tags, deps, heads)),
                                dtype="uint64")
            doc = Doc(self.vocab, words=words,
                      spaces=spaces).from_array(attrs, array)
        except ValueError as e:
            if '[E167]' in str(e):
                raise ValueError(
                    "Could not properly assign morphology features. "
                    f"Please update the tag map for '{self.model._lang}'"
                    " language. See "
                    "https://spacy.io/usage/adding-languages#tag-map "
                    "for details. A quick workaround is to use the keyword "
                    "argument ignore_tag_map=True when loading UDPipeLanguage."
                )
            else:
                raise e
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array(attrs=[LEMMA], array=lemma_array)
        doc.is_tagged = bool(any(pos) and any(tags))
        doc.is_parsed = bool(any(deps))
        return doc
Example #23
 def __call__(self, text):
     u = self.model(text) if text else ""
     vs = self.vocab.strings
     r = vs.add("ROOT")
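     # map the tagger's language-specific POS tags to Universal POS tags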
     p = {
         "ACAUS": "VERB",
         "ACOND": "SCONJ",
         "ADV": "ADV",
         "ALIM": "SCONJ",
         "APREC": "SCONJ",
         "ART": "DET",
         "CCIRC": "SCONJ",
         "CFOC": "PART",
         "CONJ": "CCONJ",
         "COP": "PRON",
         "CPRET": "AUX",
         "CREL": "SCONJ",
         "EXIST": "VERB",
         "FUT": "AUX",
         "IMOD": "ADV",
         "NEG": "ADV",
         "NPROP": "PROPN",
         "NUM": "NUM",
         "PDEM": "DET",
         "PPOS": "DET",
         "PREP": "ADP",
         "PTC": "PART",
         "PUNCT": "PUNCT"
     }
     words = []
     lemmas = []
     pos = []
     tags = []
     heads = []
     deps = []
     spaces = []
     norms = []
     for s in u.split("\n"):
         if s.startswith('<norm xml:id="u'):
             id = s[15:s.index('"', 16)]
             i = s.index(' orig="')
             form = s[i + 7:s.index('"', i + 8)]
             words.append(form)
             i = s.find(' lemma="')
             lemmas.append(
                 vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)]))
             i = s.find(' norm="')
             norms.append(
                 vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)]))
             i = s.index(' func="')
             dep = s[i + 7:s.index('"', i + 8)]
             if dep == "root":
                 heads.append(0)
                 deps.append(r)
             else:
                 i = s.find(' head="#u')
                 heads.append(
                     0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) -
                     int(id))
                 deps.append(vs.add(dep))
             i = s.index(' pos="')
             xpos = s[i + 6:s.index('"', i + 7)]
             tags.append(vs.add(xpos))
             upos = "X"
             if xpos in p:
                 upos = p[xpos]
             elif xpos.startswith("A"):
                 upos = "AUX"
             elif xpos.startswith("N"):
                 upos = "ADJ" if dep in {"amod", "acl"} else "NOUN"
             elif xpos.startswith("P"):
                 upos = "PRON"
             elif xpos.startswith("V"):
                 upos = "VERB"
             pos.append(vs.add(upos))
             spaces.append(False)
         elif s.startswith("</norm_group>"):
             spaces[-1] = True
     doc = Doc(self.vocab, words=words, spaces=spaces)
     a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)),
                     dtype="uint64")
     doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
     try:
         doc.is_tagged = True
         doc.is_parsed = True
     except:
         pass
     return doc
Example #24
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        if not text:
            return Doc(self.vocab)
        elif text.isspace():
            return Doc(self.vocab, words=[text], spaces=[False])

        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        heads = []
        lemmas = []
        offset = 0
        token_texts = [t.text for t in snlp_tokens]
        is_aligned = True
        try:
            words, spaces = self.get_words_and_spaces(token_texts, text)
        except ValueError:
            words = token_texts
            spaces = [True] * len(words)
            is_aligned = False
            warnings.warn(
                "Due to multiword token expansion or an alignment "
                "issue, the original text has been replaced by space-separated "
                "expanded tokens.",
                stacklevel=4,
            )
        offset = 0
        for i, word in enumerate(words):
            if word.isspace() and word != snlp_tokens[i + offset].text:
                # insert a space token
                pos.append(self.vocab.strings.add("SPACE"))
                tags.append(self.vocab.strings.add("_SP"))
                deps.append(self.vocab.strings.add(""))
                lemmas.append(self.vocab.strings.add(word))

                # increment any heads left of this position that point beyond
                # this position to the right (already present in heads)
                for j in range(0, len(heads)):
                    if j + heads[j] >= i:
                        heads[j] += 1

                # decrement any heads right of this position that point beyond
                # this position to the left (yet to be added from snlp_heads)
                for j in range(i + offset, len(snlp_heads)):
                    if j + snlp_heads[j] < i + offset:
                        snlp_heads[j] -= 1

                # initial space tokens are attached to the following token,
                # otherwise attach to the preceding token
                if i == 0:
                    heads.append(1)
                else:
                    heads.append(-1)

                offset -= 1
            else:
                token = snlp_tokens[i + offset]
                assert word == token.text

                pos.append(self.vocab.strings.add(token.upos or ""))
                tags.append(
                    self.vocab.strings.add(token.xpos or token.feats or ""))
                deps.append(self.vocab.strings.add(token.deprel or ""))
                heads.append(snlp_heads[i + offset])
                lemmas.append(self.vocab.strings.add(token.lemma or ""))

        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not is_aligned or not all(ents):
            warnings.warn(
                f"Can't set named entities because of multi-word token "
                f"expansion or because the character offsets don't map to "
                f"valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) or any(tags):
            doc.is_tagged = True
        if any(deps) or any(heads):
            doc.is_parsed = True
        return doc
Example #25
def correct_dep(doc, rewrite_ne_as_proper_noun):
    complex_tokens = []
    last_head = -1
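    # restore POS tags encoded in dependency labels of the form '<dep>_as_<tag>' and strip the suffix from the label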
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p >= 0:
            tag = label[p + 4:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = label[0:p]

    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error

    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count

    if len(complex_tokens) > 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) > 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []
    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]

    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end > start and prev_start < end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents

    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed

    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma

    new_doc.noun_chunks_iterator = noun_chunks  # TODO work around for spaCy 2.0.12

    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS),
                len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('<', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('>', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)

    return new_doc
Example #27
 def __call__(self,text):
   t=text.replace("\r","").replace("(","（").replace(")","）").replace("[","［").replace("]","］").replace("{","｛").replace("}","｝")
   u=self.model(t) if t else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   morphs=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   ent_iobs=[]
   ent_types=[]
   bunsetu=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feats,head,deprel,_,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     morphs.append(feats)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
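     # decode named-entity annotations from the NE= field in MISC (spaCy ENT_IOB codes: 1=I, 2=O, 3=B)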
     i=misc.find("NE=")
     if i<0:
       ent_iobs.append(2)
       ent_types.append(0)
     else:
       j=misc.find("|",i)
       if j<0:
         j=len(misc)
       if misc[i+3:i+4]=="B":
         ent_iobs.append(3)
       else:
         ent_iobs.append(1)
       ent_types.append(vs.add(misc[i+5:j]))
     bunsetu.append("I")
     if misc.startswith("BunsetuBILabel="):
       bunsetu[-1]=misc[15:16]
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms,ent_iobs,ent_types)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM,ENT_IOB,ENT_TYPE],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(morphs):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc
Example #28
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not all(ents):
            warnings.warn(
                f"Can't set named entities because the character offsets don't "
                f"map to valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #29
def serialize_spacy_doc(orig_doc, converted_sentences):
    words = []
    spaces = []
    total_attrs = []
    attrs_ = list(attrs.NAMES)
    attrs_.remove('SENT_START')  # this clashes with HEAD (see the spaCy documentation)
    attrs_.remove('SPACY')  # we don't want to override the spaces we assign later on

    for orig_span, converted_sentence in zip(orig_doc.sents,
                                             converted_sentences):
        # remove redundant dummy-root-node
        converted = {
            iid: tok
            for iid, tok in converted_sentence.items() if iid != 0
        }
        orig = orig_span.as_doc()

        # get attributes of original doc
        orig_attrs = orig.to_array(attrs_)

        # append copied attributes for new nodes
        new_nodes_attrs = []
        for iid, tok in converted.items():
            if int(iid) != iid:
                new_node_attrs = list(orig_attrs[int(iid) - 1])

                # here we fix the relative head this node points to;
                # if the offset is negative, cast it to its unsigned equivalent
                relative = int(iid) - (len(orig_attrs) + len(new_nodes_attrs) +
                                       1)
                new_node_attrs[attrs_.index('HEAD')] = relative + (
                    2**NUM_OF_BITS if relative < 0 else 0)

                new_nodes_attrs.append(new_node_attrs)
        if new_nodes_attrs:
            new_attrs = np.append(orig_attrs, new_nodes_attrs, axis=0)
        else:
            new_attrs = orig_attrs
        total_attrs = np.append(total_attrs, new_attrs,
                                axis=0) if len(total_attrs) > 0 else new_attrs

        # fix whitespace in case of new nodes: take the original spaces and change the last one if new nodes exist,
        #   then add a space for each new node except the last
        spaces += [t.whitespace_ if not ((i + 1 == len(orig)) and (len(new_nodes_attrs) > 0)) else ' ' for i, t in enumerate(orig)] + \
                  [' ' if i + 1 < len(converted.keys()) else '' for i, iid in enumerate(converted.keys()) if int(iid) != iid]
        spaces[-1] = ' '
        words += [t.get_conllu_field("form") for iid, t in converted.items()]

    # form new doc including new nodes and set attributes
    spaces[-1] = ''
    new_doc = Doc(orig_doc.vocab, words=words, spaces=spaces)
    new_doc.from_array(attrs_, total_attrs)

    j = 0
    for converted_sentence in converted_sentences:
        converted = {
            iid: tok
            for iid, tok in converted_sentence.items() if iid != 0
        }

        # store spacy ids for head indices extraction later on
        spacy_ids = {
            iid: (spacy_i + j)
            for spacy_i, iid in enumerate(converted.keys())
        }

        # set new info for all tokens per their head lists
        for i, bart_tok in enumerate(converted.values()):
            spacy_tok = new_doc[i + j]
            for head, rel in bart_tok.get_new_relations():
                # extract spacy correspondent head id
                head_tok = new_doc[
                    spacy_ids[head.get_conllu_field("id")] if head.
                    get_conllu_field("id") != 0 else spacy_tok.i]
                # parse stringish label
                is_state_head_node = ((head_tok.text == "STATE") and (head.get_conllu_field("id") != int(head.get_conllu_field("id")))) or \
                                     (bart_tok.get_conllu_field("id") != int(bart_tok.get_conllu_field("id")))
                new_rel, src, unc, alt = parse_bart_label(
                    rel, is_state_head_node=is_state_head_node)
                # add info to token
                spacy_tok._.parent_list.append({
                    'head': head_tok,
                    'rel': new_rel,
                    'src': src,
                    'alt': alt,
                    'unc': unc
                })

            # fix sentence boundaries, need to turn off is_parsed bool as it prevents setting the boundaries
            new_doc.is_parsed = False
            spacy_tok.is_sent_start = False if i != 0 else True
            new_doc.is_parsed = True

        j += len(converted)

    return new_doc