Example 1
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
Example 2
 def read_conlldoc(self, inputdoc):
     words = list()
     sentbounds = list()
     #    pos = list()
     tags = list()
     #    lemmas = list()
     for sent in string2doc(inputdoc, hide_fields=HIDDEN_FIELDS):
         for i, tok in enumerate(sent):
             if i == 0:
                 sentbounds.append(True)
             else:
                 sentbounds.append(False)
             words.append(tok.word)
             tags.append(self.nlp.vocab.strings.add(tok.xpos))
             # pos.append(self.nlp.vocab.strings.add(conv_table.get(tok.xpos, "_")))
     #            lemmas.append(self.nlp.vocab.strings.add(tok.lemma))
     # attrs = [POS, TAG]
     attrs = [TAG]
     # arr = np.array(list(zip(pos, tags)), dtype="uint64")
     arr = np.array(tags, dtype="uint64")
     sdoc = Doc(self.nlp.vocab, words=words).from_array(attrs, arr)
     for i, sb in enumerate(sentbounds):
         if sb:
             sdoc[i].is_sent_start = True
         else:
             # these must be set to False, since,
             # if left as None, spaCy will add further sentbounds
             sdoc[i].is_sent_start = False
     #    lemma_array = np.array([[lemma] for lemma in lemmas], dtype="uint64")
     #    sdoc.from_array([LEMMA], lemma_array)
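     # Flag the Doc as tagged (spaCy v2) only if at least one non-empty XPOS tag was seen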
     if any(tags):
         sdoc.is_tagged = True
     return sdoc
Example 3
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
Example 4
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])
Example 5
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"])
    doc[0].tag_ = "A"
    doc[1].tag_ = "B"
    doc.is_tagged = True
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
Example 6
    def __call__(self, text):
        """Convert input text to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        udpipe_sents = self.model(text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self.get_tokens_with_heads(udpipe_sents)
        if not tokens:
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL XPOS tags, custom for each UD treebank
            #tags.append(self.vocab.strings.add(token.xpostag or ""))
            tags.append(self.vocab.strings.add(token.feats or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example 7
    def __call__(self, text):
        """Convert a StanfordNLP Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.dependency_relation
                                               or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example 8
 def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
     words = [x.surface for x in dtokens]
     spaces = [x.space for x in dtokens]
     doc = Doc(self.vocab, words=words, spaces=spaces)
     for token, dtoken in zip(doc, dtokens):
         token.lemma_ = dtoken.lemma
         token.tag_ = dtoken.pos
         token._.set(self.key_fstring, dtoken.fstring)
     doc.is_tagged = True
     return doc
Example 9
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
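   # spaCy v2: mark the Doc as tagged/parsed; spaCy v3 removed these flags,
   # so fall back to storing the CoNLL-U FEATS as per-token morphology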
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   t=Tree(u)
   t._makeChunks()
   bunsetu=["I"]*len(doc)
   for s in t._cabocha._sentences:
     for w in s:
       try:
         bunsetu[w[0]-1]="B"
       except:
         pass
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc
Example 10
    def __call__(self, text):
        result = self.tokenizer.tokenize(text=text)
        morph_spaces = []
        last_morph = None
        for m in result:
            if m.surface():
                if m.part_of_speech()[0] == '空白':
                    if last_morph:
                        morph_spaces.append((last_morph, True))
                        last_morph = None
                    else:
                        morph_spaces.append((m, False))
                elif last_morph:
                    morph_spaces.append((last_morph, False))
                    last_morph = m
                else:
                    last_morph = m
        if last_morph:
            morph_spaces.append((last_morph, False))

        # the last space is removed by JapaneseReviser at the final stage of the pipeline
        words = [m.surface() for m, spaces in morph_spaces]
        spaces = [space for m, space in morph_spaces]
        doc = Doc(self.nlp.vocab if self.nlp else Vocab(),
                  words=words,
                  spaces=spaces)
        next_tag = morph_tag(
            morph_spaces[0][0].part_of_speech()[0:4]) if len(doc) else ''
        for token, (morph, spaces) in zip(doc, morph_spaces):
            tag = next_tag
            next_tag = morph_tag(morph_spaces[token.i + 1][0].part_of_speech()
                                 [0:4]) if token.i < len(doc) - 1 else ''
            token.tag_ = tag
            token.pos = TAG_MAP[tag][POS]
            # TODO separate lexical rules to resource files
            if morph.normalized_form() == '為る' and tag == '動詞-非自立可能':
                token.pos_ = 'AUX'
            elif tag == '名詞-普通名詞-サ変可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
            elif tag == '名詞-普通名詞-サ変形状詞可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
                elif next_tag == '助動詞' or next_tag.find('形状詞') >= 0:
                    token.pos_ = 'ADJ'
            token.lemma_ = morph.normalized_form()
            token._.inf = ','.join(morph.part_of_speech()[4:])
            token._.reading = morph.reading_form()
            if self.enable_ex_sudachi:
                token._.sudachi = morph
        if self.use_sentence_separator:
            separate_sentences(doc)
        doc.is_tagged = True
        return doc
Example 11
    def __init__(self,
                 qid: str,
                 text: List[str],
                 head: Tuple[int, int],
                 tail: Tuple[int, int],
                 head_type: str = None,
                 tail_type: str = None,
                 ner: List[str] = None,
                 pos: List[str] = None,
                 dep: List[str] = None,
                 dep_heads: List[str] = None,
                 vid: int = 0,
                 annotator=None,
                 metas: Dict[str, any] = None) -> None:
        Target.__init__(self,
                        qid=qid,
                        text=None,
                        vid=vid,
                        annotator=annotator,
                        metas=metas)

        self.head = head
        self.tail = tail
        self.head_type = head_type
        self.tail_type = tail_type

        self.ner = ner
        self.pos = pos
        self.dep = dep
        self.dep_heads = dep_heads

        self.annotator = annotator

        vocab = annotator.model.vocab

        words = text
        spaces = [True] * len(words)
        tags = [vocab.strings.add(p) for p in pos]
        # deps = [vocab.strings.add(d) for d in dep]
        # heads = dep_heads
        ent_types = [vocab.strings.add(e) for e in ner]
        # attrs = [ENT_TYPE, POS, TAG, DEP, HEAD]
        attrs = [ENT_TYPE, TAG]
        # array = numpy.array(list(zip(ent_types, pos, tags, deps, heads)), dtype="uint64")
        array = numpy.array(list(zip(ent_types, tags)), dtype="uint64")
        doc = Doc(vocab, words=words, spaces=spaces).from_array(attrs, array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        # if any(deps):
        #     doc.is_parsed = True
        self.doc = doc
Example 12
    def __call__(self, text):
        """Convert a StanfordNLP Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            raise ValueError("No tokens available.")
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while not span.startswith(token.text):
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.dependency_relation
                                               or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD, LEMMA]
        array = numpy.array(list(zip(pos, tags, deps, heads, lemmas)),
                            dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example 13
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Gloss=")
     if i<0:
       norms.append(vs.add(form))
     else:
       j=misc.find("|",i)
       norms.append(vs.add(misc[i+6:] if j<0 else misc[i+6:j]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example 14
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
Example 15
    def __call__(self, text: str) -> Doc:
        dtokens = self.detailed_tokens(text)
        words = [x.surface for x in dtokens]
        spaces = [x.space for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token.tag_ = dtoken.pos
            token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
            token._.set(self.key_fstring, dtoken.fstring)

        with doc.retokenize() as retokenizer:
            for match in RE_URL.finditer(doc.text):
                span = doc.char_span(*match.span())
                if span:
                    retokenizer.merge(span)
        doc.is_tagged = True
        return doc
Example 16
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    if spacy_version >= 3:
        doc2[0].set_morph("Feat=Val")
    else:
        doc1.is_parsed = True
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        if spacy_version < 3:
            doc2.is_tagged = True
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
Example 17
def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
        matcher.add("TEST3", [doc3])
        assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
        matcher.add("TEST4", [doc2])
        assert not record.list
Example 18
 def __call__(self,text):
   u=self.model(text) if text else ""
   if not self.convUD:
     return u
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   heads=[]
   deps=[]
   spaces=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,dummy_feats,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     if deprel=="root" or deprel=="ROOT":
       heads.append(0)
       deps.append(r)
     elif head=="0":
       heads.append(0)
       deps.append(vs.add(deprel))
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD],a)
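   # Setting is_tagged/is_parsed only works on spaCy v2; on spaCy v3, where these
   # Doc flags were removed, the failure is silently ignored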
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     pass
   return doc
Example 19
 def __call__(self,doc):
   vs=self.vocab.strings
   words=[]
   pos=[]
   tags=[]
   spaces=[]
   for i,(form,xpos) in enumerate(self.pos_tag([t.orth_ for t in doc])):
     if form.strip()=="":
       if len(spaces)>0:
         spaces[-1]=True
     else:
       words.append(form)
       spaces.append(doc[i].whitespace_!="")
       tags.append(vs.add(xpos))
       pos.append(self.tag_map[xpos][POS] if xpos in self.tag_map else X)
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(pos,tags)),dtype="uint64")
   doc.from_array([POS,TAG],a)
   if not SPACY_V3:
     doc.is_tagged=True
   return doc
Example 20
def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos
Example 21
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not all(ents):
            warnings.warn(
                f"Can't set named entities because the character offsets don't "
                f"map to valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example 22
 def __call__(self,text):
   from suparkanbun.tradify import tradify
   t=""
   for c in text:
     if c in self.simplify:
       t+=self.simplify[c]
     else:
       t+=c
   if self.danku!=None:
     u=t.replace("\n","")
     t=""
     while len(u)>500:
       s=self.danku(u[0:500])
       r=""
       for c,p in s:
         r+=c
         if p=="S" or p=="E":
           r+="\n"
       r="\n".join(r.split("\n")[0:-2])+"\n"
       t+=r
       u=u[len(r.replace("\n","")):]
     s=self.danku(u)
     for c,p in s:
       t+=c
       if p=="S" or p=="E":
         t+="\n"
   if len(t)<500:
     p=self.tagger(t.replace("\n",""))
   else:
     p=[]
     u=""
     for s in t.strip().split("\n"):
       u+=s
       if len(u)>400:
         p+=self.tagger(u)
         u=""
     if len(u)>0:
       p+=self.tagger(u)
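   # Dependency-parse each segmented line with SuPar, treating every character as a token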
   u=self.supar.predict([[c for c in s] for s in t.strip().split("\n")],lang=None)
   t=text.replace("\n","")
   i=0
   w=[]
   for s in u.sentences:
     v=[]
     for h,d in zip(s.values[6],s.values[7]):
       j=t[i]
       k=tradify[j] if j in tradify else j
       v.append({"form":j,"lemma":k,"pos":p[i][1],"head":h,"deprel":d})
       i+=1
     for j in reversed(range(0,len(v)-1)):
       if v[j]["deprel"]=="compound" and v[j]["head"]==j+2 and v[j]["pos"]==v[j+1]["pos"]:
         k=v.pop(j)
         v[j]["form"]=k["form"]+v[j]["form"]
         v[j]["lemma"]=k["lemma"]+v[j]["lemma"]
         for k in range(0,len(v)):
           if v[k]["head"]>j+1:
             v[k]["head"]-=1
     w.append(list(v))
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for s in w:
     for i,t in enumerate(s):
       form=t["form"]
       words.append(form)
       lemmas.append(vs.add(t["lemma"]))
       p=t["pos"].split(",")
       xpos=",".join(p[0:4])
       pos.append(vs.add(p[4]))
       tags.append(vs.add(xpos))
       feats.append(p[5])
       if t["deprel"]=="root":
         heads.append(0)
         deps.append(r)
       else:
         heads.append(t["head"]-i-1)
         deps.append(vs.add(t["deprel"]))
       spaces.append(False)
       g=self.gloss(form,xpos)
       if g!=None:
         norms.append(vs.add(g))
       else:
         norms.append(vs.add(form))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example 23
    def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
        """Convert input text to a spaCy Doc.

        text: The text to process. It can be presegmented or pretokenized:
            str             : raw text,
            List[str]       : presegmented text,
            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        udpipe_sents = self.model(text=text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
        if not tokens:
            return Doc(vocab=self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self._check_aligned(text=text, tokens=tokens)
        if not is_aligned:
            text = ""
            for token in tokens:
                text += token.form
                if NO_SPACE not in token.misc:
                    text += " "
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL XPOS tags, custom for each UD treebank
            tags.append(self.vocab.strings.add(token.xpostag or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        try:
            attrs = [POS, TAG, DEP, HEAD]
            array = numpy.array(list(zip(pos, tags, deps, heads)),
                                dtype="uint64")
            doc = Doc(self.vocab, words=words,
                      spaces=spaces).from_array(attrs, array)
        except ValueError as e:
            if '[E167]' in str(e):
                raise ValueError(
                    "Could not properly assign morphology features. "
                    f"Please update the tag map for '{self.model._lang}'"
                    " language. See "
                    "https://spacy.io/usage/adding-languages#tag-map "
                    "for details. A quick workaround is to use the keyword "
                    "argument ignore_tag_map=True when loading UDPipeLanguage."
                )
            else:
                raise e
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array(attrs=[LEMMA], array=lemma_array)
        doc.is_tagged = bool(any(pos) and any(tags))
        doc.is_parsed = bool(any(deps))
        return doc
Example 24
 def __call__(self, text):
     u = self.model(text) if text else ""
     vs = self.vocab.strings
     r = vs.add("ROOT")
     p = {
         "ACAUS": "VERB",
         "ACOND": "SCONJ",
         "ADV": "ADV",
         "ALIM": "SCONJ",
         "APREC": "SCONJ",
         "ART": "DET",
         "CCIRC": "SCONJ",
         "CFOC": "PART",
         "CONJ": "CCONJ",
         "COP": "PRON",
         "CPRET": "AUX",
         "CREL": "SCONJ",
         "EXIST": "VERB",
         "FUT": "AUX",
         "IMOD": "ADV",
         "NEG": "ADV",
         "NPROP": "PROPN",
         "NUM": "NUM",
         "PDEM": "DET",
         "PPOS": "DET",
         "PREP": "ADP",
         "PTC": "PART",
         "PUNCT": "PUNCT"
     }
     words = []
     lemmas = []
     pos = []
     tags = []
     heads = []
     deps = []
     spaces = []
     norms = []
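     # Walk the XML-like tagger output: each <norm ...> element describes one token,
     # and </norm_group> marks that a space follows the preceding token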
     for s in u.split("\n"):
         if s.startswith('<norm xml:id="u'):
             id = s[15:s.index('"', 16)]
             i = s.index(' orig="')
             form = s[i + 7:s.index('"', i + 8)]
             words.append(form)
             i = s.find(' lemma="')
             lemmas.append(
                 vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)]))
             i = s.find(' norm="')
             norms.append(
                 vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)]))
             i = s.index(' func="')
             dep = s[i + 7:s.index('"', i + 8)]
             if dep == "root":
                 heads.append(0)
                 deps.append(r)
             else:
                 i = s.find(' head="#u')
                 heads.append(
                     0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) -
                     int(id))
                 deps.append(vs.add(dep))
             i = s.index(' pos="')
             xpos = s[i + 6:s.index('"', i + 7)]
             tags.append(vs.add(xpos))
             upos = "X"
             if xpos in p:
                 upos = p[xpos]
             elif xpos.startswith("A"):
                 upos = "AUX"
             elif xpos.startswith("N"):
                 upos = "ADJ" if dep in {"amod", "acl"} else "NOUN"
             elif xpos.startswith("P"):
                 upos = "PRON"
             elif xpos.startswith("V"):
                 upos = "VERB"
             pos.append(vs.add(upos))
             spaces.append(False)
         elif s.startswith("</norm_group>"):
             spaces[-1] = True
     doc = Doc(self.vocab, words=words, spaces=spaces)
     a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)),
                     dtype="uint64")
     doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
     try:
         doc.is_tagged = True
         doc.is_parsed = True
     except:
         pass
     return doc
Example 25
def correct_dep(doc, rewrite_ne_as_proper_noun):
    complex_tokens = []
    last_head = -1
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p >= 0:
            tag = label[p + 4:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # workaround: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = label[0:p]

    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # workaround: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error

    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count

    if len(complex_tokens) > 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) > 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []
    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # workaround: lemma_ must be set after tag_ (spaCy's bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]

    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end > start and prev_start < end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents

    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed

    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'  # workaround: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma

    new_doc.noun_chunks_iterator = noun_chunks  # TODO: workaround for spaCy 2.0.12

    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS),
                len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('<', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('>', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)

    return new_doc
Example 26
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        if not text:
            return Doc(self.vocab)
        elif text.isspace():
            return Doc(self.vocab, words=[text], spaces=[False])

        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        heads = []
        lemmas = []
        offset = 0
        token_texts = [t.text for t in snlp_tokens]
        is_aligned = True
        try:
            words, spaces = self.get_words_and_spaces(token_texts, text)
        except ValueError:
            words = token_texts
            spaces = [True] * len(words)
            is_aligned = False
            warnings.warn(
                "Due to multiword token expansion or an alignment "
                "issue, the original text has been replaced by space-separated "
                "expanded tokens.",
                stacklevel=4,
            )
        offset = 0
        for i, word in enumerate(words):
            if word.isspace() and word != snlp_tokens[i + offset].text:
                # insert a space token
                pos.append(self.vocab.strings.add("SPACE"))
                tags.append(self.vocab.strings.add("_SP"))
                deps.append(self.vocab.strings.add(""))
                lemmas.append(self.vocab.strings.add(word))

                # increment any heads left of this position that point beyond
                # this position to the right (already present in heads)
                for j in range(0, len(heads)):
                    if j + heads[j] >= i:
                        heads[j] += 1

                # decrement any heads right of this position that point beyond
                # this position to the left (yet to be added from snlp_heads)
                for j in range(i + offset, len(snlp_heads)):
                    if j + snlp_heads[j] < i + offset:
                        snlp_heads[j] -= 1

                # initial space tokens are attached to the following token,
                # otherwise attach to the preceding token
                if i == 0:
                    heads.append(1)
                else:
                    heads.append(-1)

                offset -= 1
            else:
                token = snlp_tokens[i + offset]
                assert word == token.text

                pos.append(self.vocab.strings.add(token.upos or ""))
                tags.append(
                    self.vocab.strings.add(token.xpos or token.feats or ""))
                deps.append(self.vocab.strings.add(token.deprel or ""))
                heads.append(snlp_heads[i + offset])
                lemmas.append(self.vocab.strings.add(token.lemma or ""))

        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not is_aligned or not all(ents):
            warnings.warn(
                f"Can't set named entities because of multi-word token "
                f"expansion or because the character offsets don't map to "
                f"valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) or any(tags):
            doc.is_tagged = True
        if any(deps) or any(heads):
            doc.is_parsed = True
        return doc
Example 27
 def __call__(self,text):
   t=text.replace("\r","").replace("(","(").replace(")",")").replace("[","[").replace("]","]").replace("{","{").replace("}","}")
   u=self.model(t) if t else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   morphs=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   ent_iobs=[]
   ent_types=[]
   bunsetu=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feats,head,deprel,_,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     morphs.append(feats)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
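     # Map NE= annotations from the MISC column to spaCy's internal IOB codes
     # (2 = outside an entity, 3 = begin, 1 = inside)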
     i=misc.find("NE=")
     if i<0:
       ent_iobs.append(2)
       ent_types.append(0)
     else:
       j=misc.find("|",i)
       if j<0:
         j=len(misc)
       if misc[i+3:i+4]=="B":
         ent_iobs.append(3)
       else:
         ent_iobs.append(1)
       ent_types.append(vs.add(misc[i+5:j]))
     bunsetu.append("I")
     if misc.startswith("BunsetuBILabel="):
       bunsetu[-1]=misc[15:16]
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms,ent_iobs,ent_types)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM,ENT_IOB,ENT_TYPE],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(morphs):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc