Example #1
def test_issue5048(en_vocab):
    words = ["This", "is", "a", "sentence"]
    pos_s = ["DET", "VERB", "DET", "NOUN"]
    spaces = [" ", " ", " ", ""]
    deps_s = ["dep", "adj", "nn", "atm"]
    tags_s = ["DT", "VBZ", "DT", "NN"]

    strings = en_vocab.strings

    for w in words:
        strings.add(w)
    deps = [strings.add(d) for d in deps_s]
    pos = [strings.add(p) for p in pos_s]
    tags = [strings.add(t) for t in tags_s]

    attrs = [POS, DEP, TAG]
    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")

    doc = Doc(en_vocab, words=words, spaces=spaces)
    doc.from_array(attrs, array)
    v1 = [(token.text, token.pos_, token.tag_) for token in doc]

    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
    assert v1 == v2
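The pattern in this example (add the strings to the vocab, pack the resulting hash IDs into a uint64 array, then call Doc.from_array) recurs throughout this page. A minimal, self-contained sketch of the same roundtrip, using a blank Vocab instead of the en_vocab fixture:

import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import POS, TAG

vocab = Vocab()
words = ["This", "is", "a", "sentence"]
# from_array expects hash IDs (uint64), not the strings themselves
pos_ids = [vocab.strings.add(p) for p in ["DET", "VERB", "DET", "NOUN"]]
tag_ids = [vocab.strings.add(t) for t in ["DT", "VBZ", "DT", "NN"]]
array = numpy.array(list(zip(pos_ids, tag_ids)), dtype="uint64")

doc = Doc(vocab, words=words)
doc.from_array([POS, TAG], array)
assert [t.pos_ for t in doc] == ["DET", "VERB", "DET", "NOUN"]
assert [t.tag_ for t in doc] == ["DT", "VBZ", "DT", "NN"]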
Example #2
    def get_doc(self,
                words=[],
                pos=None,
                heads=None,
                deps=None,
                tags=None,
                ents=None):
        """Create Doc object from given vocab, words and annotations."""

        vocab = Vocab()
        pos = pos or [""] * len(words)
        tags = tags or [""] * len(words)
        heads = heads or [0] * len(words)
        deps = deps or [""] * len(words)
        for value in deps + tags + pos:
            vocab.strings.add(value)

        doc = Doc(vocab, words=words)
        attrs = doc.to_array([POS, HEAD, DEP])
        for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
            attrs[i, 0] = doc.vocab.strings[p]
            attrs[i, 1] = head
            attrs[i, 2] = doc.vocab.strings[dep]
        doc.from_array([POS, HEAD, DEP], attrs)
        if ents:
            doc.ents = [
                Span(doc, start, end, label=doc.vocab.strings[label])
                for start, end, label in ents
            ]
        if tags:
            for token in doc:
                token.tag_ = tags[token.i]
        return doc
Example #3
def get_doc(vocab, words, pos, heads, deps):
    assert len(pos) == len(words)
    assert len(heads) == len(words)
    assert len(deps) == len(words)

    headings = []
    values = []
    annotations = [pos, heads, deps]
    possible_headings = [POS, HEAD, DEP]
    for a, annot in enumerate(annotations):
        headings.append(possible_headings[a])
        if annot is not heads:
            values.extend(annot)
    for value in values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)

    attrs = doc.to_array(headings)

    j = 0
    for annot in annotations:
        if annot is heads:
            for i in range(len(words)):
                attrs[i, j] = heads[i]
        else:
            for i in range(len(words)):
                attrs[i, j] = doc.vocab.strings[annot[i]]
        j += 1

    doc.from_array(headings, attrs)

    return doc
Example #4
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
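For illustration, a hypothetical call to this helper (en_vocab is assumed to be a spaCy Vocab, e.g. the en_vocab test fixture; note that the HEAD column written here holds offsets relative to each token, so 1 means "the head is the next token"):

doc = get_doc(
    en_vocab,
    words=["green", "eggs"],
    pos=["ADJ", "NOUN"],
    heads=[1, 0],            # relative offsets: "green" -> "eggs", "eggs" is its own head (root)
    deps=["amod", "ROOT"],
)
assert doc[0].head.text == "eggs"
assert [t.dep_ for t in doc] == ["amod", "ROOT"]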
Example #5
    def __call__(self, text):
        """Convert input text to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        udpipe_sents = self.model(text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self.get_tokens_with_heads(udpipe_sents)
        if not tokens:
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL xpostags, custom for each UD treebank
            #tags.append(self.vocab.strings.add(token.xpostag or ""))
            tags.append(self.vocab.strings.add(token.feats or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #6
    def __call__(self, text):
        """Convert a StanfordNLP Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.dependency_relation
                                               or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #7
def get_doc(vocab,
            words=[],
            pos=None,
            heads=None,
            deps=None,
            tags=None,
            ents=None,
            lemmas=None):
    """Create Doc object from given vocab, words and annotations."""
    if deps and not heads:
        heads = [0] * len(deps)
    headings = []
    values = []
    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
    for a, annot in enumerate(annotations):
        if annot is not None:
            if len(annot) != len(words):
                raise ValueError(Errors.E189)
            headings.append(possible_headings[a])
            if annot is not heads:
                values.extend(annot)
    for value in values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)

    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)

        j = 0
        for annot in annotations:
            if annot:
                if annot is heads:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = heads[i]
                        else:
                            attrs[i, j] = heads[i]
                else:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = doc.vocab.strings[annot[i]]
                        else:
                            attrs[i, j] = doc.vocab.strings[annot[i]]
                j += 1
        doc.from_array(headings, attrs)

    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc
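The attrs.ndim == 1 branches above exist because Doc.to_array returns a flat array when only one attribute is requested; a quick sketch of that behaviour (assuming a blank Vocab):

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import POS, TAG

doc = Doc(Vocab(), words=["a", "b"])
assert doc.to_array([POS]).ndim == 1       # a single attribute gives shape (n_tokens,)
assert doc.to_array([POS, TAG]).ndim == 2  # several attributes give shape (n_tokens, n_attrs)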
Example #8
def remove_tokens_on_match(doc):
    indexes = []
    for index, token in enumerate(doc):
        if token.pos_ in ('PUNCT', 'NUM', 'SYM'):
            indexes.append(index)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = numpy.delete(np_array, indexes, axis = 0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    return doc2
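A hypothetical usage sketch (the en_core_web_sm model name is an assumption; any pipeline that sets POS tags will do). The deleted rows of the attribute array line up with the dropped tokens, so the remaining rows still match the kept words:

import spacy

nlp = spacy.load("en_core_web_sm")    # assumed to be installed
doc = nlp("It costs 3,000 dollars!")
cleaned = remove_tokens_on_match(doc)
print([t.text for t in cleaned])      # numbers, symbols and punctuation removed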
Example #9
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Translit=")
     norms.append(vs.add(form if i<0 else misc[i+9:]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   t=Tree(u)
   t._makeChunks()
   bunsetu=["I"]*len(doc)
   for s in t._cabocha._sentences:
     for w in s:
       try:
         bunsetu[w[0]-1]="B"
       except:
         pass
   doc.user_data["bunsetu_bi_labels"]=bunsetu
   return doc
Example #10
def test_doc_from_array_morph(en_vocab):
    # fmt: off
    words = ["I", "live", "in", "New", "York", "."]
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on
    doc = Doc(en_vocab, words=words, morphs=morphs)
    attrs = [MORPH]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [str(t.morph) for t in new_doc] == morphs
    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
Example #11
def test_doc_from_array_sent_starts(en_vocab):
    # fmt: off
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
        new_doc.from_array(attrs, arr)
        assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.has_annotation("DEP")
Example #12
    def remove_tokens_on_match(self, doc):
        indexes = []
        for inx, token in enumerate(doc):
            if not token.is_stop and token.tag_ == "NN":
                indexes.append(inx)

        np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
        np_array = numpy.delete(np_array, indexes, axis=0)
        doc2 = Doc(
            doc.vocab,
            words=[t.text for i, t in enumerate(doc) if i not in indexes])
        doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)

        return doc2
Example #13
 def __call__(self,text):
   u=self.model(text,raw=True) if text else ""
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     feats.append(feat)
     if deprel=="root":
       heads.append(0)
       deps.append(r)
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
     i=misc.find("Gloss=")
     if i<0:
       norms.append(vs.add(form))
     else:
       j=misc.find("|",i)
       norms.append(vs.add(misc[i+6:] if j<0 else misc[i+6:j]))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example #14
def make_doc_from_text_chunks(
    text: str,
    lang: Union[str, Language],
    chunk_size: int = 100000,
) -> Doc:
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Although this function's performance is *pretty good*, it's inherently
    less performant than just processing the entire text in one shot.
    Only use it if necessary!

    Args:
        text: Text document to be chunked and processed by spaCy.
        lang: A 2-letter language code (e.g. "en"),
            the name of a spaCy model for the desired language, or
            an already-instantiated spaCy language pipeline.
        chunk_size: Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller).
            For best performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        A single processed document, initialized from components accumulated chunk by chunk.
    """
    if isinstance(lang, str):
        lang = core.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            errors.type_invalid_msg("lang", type(lang), Union[str, Language]))

    words: List[str] = []
    spaces: List[bool] = []
    np_arrays = []
    cols = [
        attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB,
        attrs.ENT_TYPE
    ]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i:i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))
        i += chunk_size
    # now, initialize the doc from words and spaces
    # then load attribute values from the concatenated np array
    doc = Doc(lang.vocab, words=words, spaces=spaces)
    doc = doc.from_array(cols, np.concatenate(np_arrays, axis=0))

    return doc
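A hypothetical call, assuming long_text is a very large string and the "en_core_web_sm" model is installed:

doc = make_doc_from_text_chunks(long_text, lang="en_core_web_sm", chunk_size=100_000)
print(len(doc), doc[:10])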
Example #15
def doc_cleaning(doc: Doc):
    np_array = doc.to_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY])
    words = [t.text for i, t in enumerate(doc)]
    cleaned_words = list()
    for w in words:
        if w != 'PairDrug1' and w != 'PairDrug2':
            w = number_substitution(w)
        if w == '%':
            w = ' '
        cleaned_words.append(w)
    doc2 = Doc(doc.vocab, words=cleaned_words)
    doc2.from_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY],
        np_array)
    return doc2
Example #16
def test_issue2203(en_vocab):
    """Test that lemmas are set correctly in doc.from_array."""
    words = ["I", "'ll", "survive"]
    tags = ["PRP", "MD", "VB"]
    lemmas = ["-PRON-", "will", "survive"]
    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
    lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
    doc = Doc(en_vocab, words=words)
    # Work around lemma corruption problem and set lemmas after tags
    doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
    doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
    assert [t.tag_ for t in doc] == tags
    assert [t.lemma_ for t in doc] == lemmas
    # We need to serialize both tag and lemma, since this is what causes the bug
    doc_array = doc.to_array(["TAG", "LEMMA"])
    new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array)
    assert [t.tag_ for t in new_doc] == tags
    assert [t.lemma_ for t in new_doc] == lemmas
Example #17
def remove_tokens_on_match(doc):
    tokens_to_remove = [
        "NNP", "NN", "NNS", "CD", "UH", "JJ"
    ]

    indexes = []
    for index, token in enumerate(doc):
        # print(index, token.text, token.tag_, token.dep_)
        for tag in tokens_to_remove:
            if (token.tag_ == tag):
                indexes.append(index)
                # print("REMOVE: ",token.text, tag, dep)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = numpy.delete(np_array, indexes, axis = 0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    # NB: this returns the cleaned text, not a Doc
    doc2 = doc2.text
    return doc2
Example #19
def test_example_constructor(en_vocab):
    words = ["I", "like", "stuff"]
    tags = ["NOUN", "VERB", "NOUN"]
    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
    predicted = Doc(en_vocab, words=words)
    reference = Doc(en_vocab, words=words)
    reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
    example = Example(predicted, reference)
    tags = example.get_aligned("TAG", as_string=True)
    assert tags == ["NOUN", "VERB", "NOUN"]
Example #20
def substitution(doc: Doc, index: int, value: int) -> Doc:
    np_array = doc.to_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY])
    words = [t.text for i, t in enumerate(doc)]
    #print(words[index])
    if value == -1:
        item = 'NoPair'
    elif value == 0:
        item = "Drug"
    elif value == 1:
        item = "PairDrug1"
    elif value == 2:
        item = "PairDrug2"
    else:
        raise ValueError(f"unexpected value: {value}")
    words[index] = item
    doc2 = Doc(doc.vocab, words=words)
    doc2.from_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY],
        np_array)
    return doc2
Example #21
 def __call__(self,text):
   u=self.model(text) if text else ""
   if not self.convUD:
     return u
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   heads=[]
   deps=[]
   spaces=[]
   for t in u.split("\n"):
     if t=="" or t.startswith("#"):
       continue
     s=t.split("\t")
     if len(s)!=10:
       continue
     id,form,lemma,upos,xpos,dummy_feats,head,deprel,dummy_deps,misc=s
     words.append(form)
     lemmas.append(vs.add(lemma))
     pos.append(vs.add(upos))
     tags.append(vs.add(xpos))
     if deprel=="root" or deprel=="ROOT":
       heads.append(0)
       deps.append(r)
     elif head=="0":
       heads.append(0)
       deps.append(vs.add(deprel))
     else:
       heads.append(int(head)-int(id))
       deps.append(vs.add(deprel))
     spaces.append(False if "SpaceAfter=No" in misc else True)
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     pass
   return doc
Example #22
 def __call__(self,doc):
   vs=self.vocab.strings
   words=[]
   pos=[]
   tags=[]
   spaces=[]
   for i,(form,xpos) in enumerate(self.pos_tag([t.orth_ for t in doc])):
     if form.strip()=="":
       if len(spaces)>0:
         spaces[-1]=True
     else:
       words.append(form)
       spaces.append(doc[i].whitespace_!="")
       tags.append(vs.add(xpos))
       pos.append(self.tag_map[xpos][POS] if xpos in self.tag_map else X)
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(pos,tags)),dtype="uint64")
   doc.from_array([POS,TAG],a)
   if not SPACY_V3:
     doc.is_tagged=True
   return doc
Example #23
def test_doc_from_array_heads_in_bounds(en_vocab):
    """Test that Doc.from_array doesn't set heads that are out of bounds."""
    words = ["This", "is", "a", "sentence", "."]
    doc = Doc(en_vocab, words=words)
    for token in doc:
        token.head = doc[0]

    # correct
    arr = doc.to_array(["HEAD"])
    doc_from_array = Doc(en_vocab, words=words)
    doc_from_array.from_array(["HEAD"], arr)

    # head before start
    arr = doc.to_array(["HEAD"])
    arr[0] = -1
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
    arr[0] = 5
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
Example #24
def test_doc_from_array_sent_starts(en_vocab):
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    # fmt: off
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    for i, (dep, head) in enumerate(zip(deps, heads)):
        doc[i].dep_ = dep
        doc[i].head = doc[head]
        if head == i:
            doc[i].is_sent_start = True
    doc.is_parsed

    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        new_doc.from_array(attrs, arr)

    attrs = [SENT_START, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.is_parsed

    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.is_parsed
Example #25
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_,
            doc2[2].ent_type_) == expected
Example #26
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    heads_deps = numpy.asarray(
        [
            [1, 397],
            [4, 436],
            [2, 426],
            [1, 402],
            [0, 8206900633647566924],
            [18446744073709551615, 440],
            [18446744073709551614, 442],
        ],
        dtype="uint64",
    )
    doc = Doc(Vocab(), words="Just what I was looking for .".split())
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    assert len(list(doc.sents)) == 1
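The huge values in the HEAD column above are simply negative relative offsets reinterpreted as unsigned 64-bit integers; a quick check of that encoding:

import numpy

offsets = numpy.array([-1, -2], dtype="int64").astype("uint64")
# -1 -> 2**64 - 1 and -2 -> 2**64 - 2, matching the last two HEAD values above
assert list(offsets) == [18446744073709551615, 18446744073709551614]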
Example #28
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle or binary
    format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written to disk.
            If 'pickle', use ``pickle`` in python's stdlib; if 'binary', use
            the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: When writing docs in binary format, spaCy's built-in
               ``spacy.Doc.to_bytes()`` method is used, but when reading the data
               back in :func:`read_spacy_docs()`, experimental and *unofficial*
               work-arounds are used to allow for all the docs in ``data`` to be
               read from the same file. If spaCy changes, this code could break,
               so use this functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string name
            by which it can be loaded, used to process the docs written to disk
            at ``filepath``. Note that this is only applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary", or if ``lang`` is not
            provided when ``format="binary"``
    """
    if format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            for spacy_doc in pickle.load(f):
                yield spacy_doc
    elif format == "binary":
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!")
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, str):
            vocab = spacier.core.load_spacy_lang(lang).vocab
        else:
            raise ValueError(
                "lang = '{}' is invalid; must be a str or `spacy.Language`")
        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which is
                # vexing for user data. As a best guess, we *know* that within
                # keys, we must have tuples. In values we just have to hope
                # users don't mind getting a list instead of a tuple.
                if "user_data_keys" in msg:
                    user_data_keys = msgpack.loads(msg["user_data_keys"],
                                                   use_list=False)
                    user_data_values = msgpack.loads(msg["user_data_values"])
                    user_data = {
                        key: value
                        for key, value in zip(user_data_keys, user_data_values)
                    }
                else:
                    user_data = None

                text = msg["text"]
                attrs = msg["array_body"]
                words = []
                spaces = []
                start = 0
                for i in range(attrs.shape[0]):
                    end = start + int(attrs[i, 0])
                    has_space = int(attrs[i, 1])
                    words.append(text[start:end])
                    spaces.append(bool(has_space))
                    start = end + has_space

                spacy_doc = Doc(vocab,
                                words=words,
                                spaces=spaces,
                                user_data=user_data)
                spacy_doc = spacy_doc.from_array(msg["array_head"][2:],
                                                 attrs[:, 2:])
                if "sentiment" in msg:
                    spacy_doc.sentiment = msg["sentiment"]
                if "tensor" in msg:
                    spacy_doc.tensor = msg["tensor"]
                yield spacy_doc
    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))
Example #29
 def __call__(self,text):
   from suparkanbun.tradify import tradify
   t=""
   for c in text:
     if c in self.simplify:
       t+=self.simplify[c]
     else:
       t+=c
   if self.danku!=None:
     u=t.replace("\n","")
     t=""
     while len(u)>500:
       s=self.danku(u[0:500])
       r=""
       for c,p in s:
         r+=c
         if p=="S" or p=="E":
           r+="\n"
       r="\n".join(r.split("\n")[0:-2])+"\n"
       t+=r
       u=u[len(r.replace("\n","")):]
     s=self.danku(u)
     for c,p in s:
       t+=c
       if p=="S" or p=="E":
         t+="\n"
   if len(t)<500:
     p=self.tagger(t.replace("\n",""))
   else:
     p=[]
     u=""
     for s in t.strip().split("\n"):
       u+=s
       if len(u)>400:
         p+=self.tagger(u)
         u=""
     if len(u)>0:
       p+=self.tagger(u)
   u=self.supar.predict([[c for c in s] for s in t.strip().split("\n")],lang=None)
   t=text.replace("\n","")
   i=0
   w=[]
   for s in u.sentences:
     v=[]
     for h,d in zip(s.values[6],s.values[7]):
       j=t[i]
       k=tradify[j] if j in tradify else j
       v.append({"form":j,"lemma":k,"pos":p[i][1],"head":h,"deprel":d})
       i+=1
     for j in reversed(range(0,len(v)-1)):
       if v[j]["deprel"]=="compound" and v[j]["head"]==j+2 and v[j]["pos"]==v[j+1]["pos"]:
         k=v.pop(j)
         v[j]["form"]=k["form"]+v[j]["form"]
         v[j]["lemma"]=k["lemma"]+v[j]["lemma"]
         for k in range(0,len(v)):
           if v[k]["head"]>j+1:
             v[k]["head"]-=1
     w.append(list(v))
   vs=self.vocab.strings
   r=vs.add("ROOT")
   words=[]
   lemmas=[]
   pos=[]
   tags=[]
   feats=[]
   heads=[]
   deps=[]
   spaces=[]
   norms=[]
   for s in w:
     for i,t in enumerate(s):
       form=t["form"]
       words.append(form)
       lemmas.append(vs.add(t["lemma"]))
       p=t["pos"].split(",")
       xpos=",".join(p[0:4])
       pos.append(vs.add(p[4]))
       tags.append(vs.add(xpos))
       feats.append(p[5])
       if t["deprel"]=="root":
         heads.append(0)
         deps.append(r)
       else:
         heads.append(t["head"]-i-1)
         deps.append(vs.add(t["deprel"]))
       spaces.append(False)
       g=self.gloss(form,xpos)
       if g!=None:
         norms.append(vs.add(g))
       else:
         norms.append(vs.add(form))
   doc=Doc(self.vocab,words=words,spaces=spaces)
   a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
   doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
   try:
     doc.is_tagged=True
     doc.is_parsed=True
   except:
     for i,j in enumerate(feats):
       if j!="_" and j!="":
         doc[i].set_morph(j)
   return doc
Example #30
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        if not text:
            return Doc(self.vocab)
        elif text.isspace():
            return Doc(self.vocab, words=[text], spaces=[False])

        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        heads = []
        lemmas = []
        offset = 0
        token_texts = [t.text for t in snlp_tokens]
        is_aligned = True
        try:
            words, spaces = self.get_words_and_spaces(token_texts, text)
        except ValueError:
            words = token_texts
            spaces = [True] * len(words)
            is_aligned = False
            warnings.warn(
                "Due to multiword token expansion or an alignment "
                "issue, the original text has been replaced by space-separated "
                "expanded tokens.",
                stacklevel=4,
            )
        offset = 0
        for i, word in enumerate(words):
            if word.isspace() and word != snlp_tokens[i + offset].text:
                # insert a space token
                pos.append(self.vocab.strings.add("SPACE"))
                tags.append(self.vocab.strings.add("_SP"))
                deps.append(self.vocab.strings.add(""))
                lemmas.append(self.vocab.strings.add(word))

                # increment any heads left of this position that point beyond
                # this position to the right (already present in heads)
                for j in range(0, len(heads)):
                    if j + heads[j] >= i:
                        heads[j] += 1

                # decrement any heads right of this position that point beyond
                # this position to the left (yet to be added from snlp_heads)
                for j in range(i + offset, len(snlp_heads)):
                    if j + snlp_heads[j] < i + offset:
                        snlp_heads[j] -= 1

                # initial space tokens are attached to the following token,
                # otherwise attach to the preceding token
                if i == 0:
                    heads.append(1)
                else:
                    heads.append(-1)

                offset -= 1
            else:
                token = snlp_tokens[i + offset]
                assert word == token.text

                pos.append(self.vocab.strings.add(token.upos or ""))
                tags.append(
                    self.vocab.strings.add(token.xpos or token.feats or ""))
                deps.append(self.vocab.strings.add(token.deprel or ""))
                heads.append(snlp_heads[i + offset])
                lemmas.append(self.vocab.strings.add(token.lemma or ""))

        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not is_aligned or not all(ents):
            warnings.warn(
                f"Can't set named entities because of multi-word token "
                f"expansion or because the character offsets don't map to "
                f"valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) or any(tags):
            doc.is_tagged = True
        if any(deps) or any(heads):
            doc.is_parsed = True
        return doc
Example #31
 def __call__(self, text):
     u = self.model(text) if text else ""
     vs = self.vocab.strings
     r = vs.add("ROOT")
     p = {
         "ACAUS": "VERB",
         "ACOND": "SCONJ",
         "ADV": "ADV",
         "ALIM": "SCONJ",
         "APREC": "SCONJ",
         "ART": "DET",
         "CCIRC": "SCONJ",
         "CFOC": "PART",
         "CONJ": "CCONJ",
         "COP": "PRON",
         "CPRET": "AUX",
         "CREL": "SCONJ",
         "EXIST": "VERB",
         "FUT": "AUX",
         "IMOD": "ADV",
         "NEG": "ADV",
         "NPROP": "PROPN",
         "NUM": "NUM",
         "PDEM": "DET",
         "PPOS": "DET",
         "PREP": "ADP",
         "PTC": "PART",
         "PUNCT": "PUNCT"
     }
     words = []
     lemmas = []
     pos = []
     tags = []
     heads = []
     deps = []
     spaces = []
     norms = []
     for s in u.split("\n"):
         if s.startswith('<norm xml:id="u'):
             id = s[15:s.index('"', 16)]
             i = s.index(' orig="')
             form = s[i + 7:s.index('"', i + 8)]
             words.append(form)
             i = s.find(' lemma="')
             lemmas.append(
                 vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)]))
             i = s.find(' norm="')
             norms.append(
                 vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)]))
             i = s.index(' func="')
             dep = s[i + 7:s.index('"', i + 8)]
             if dep == "root":
                 heads.append(0)
                 deps.append(r)
             else:
                 i = s.find(' head="#u')
                 heads.append(
                     0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) -
                     int(id))
                 deps.append(vs.add(dep))
             i = s.index(' pos="')
             xpos = s[i + 6:s.index('"', i + 7)]
             tags.append(vs.add(xpos))
             upos = "X"
             if xpos in p:
                 upos = p[xpos]
             elif xpos.startswith("A"):
                 upos = "AUX"
             elif xpos.startswith("N"):
                 upos = "ADJ" if dep in {"amod", "acl"} else "NOUN"
             elif xpos.startswith("P"):
                 upos = "PRON"
             elif xpos.startswith("V"):
                 upos = "VERB"
             pos.append(vs.add(upos))
             spaces.append(False)
         elif s.startswith("</norm_group>"):
             spaces[-1] = True
     doc = Doc(self.vocab, words=words, spaces=spaces)
     a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)),
                     dtype="uint64")
     doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
     try:
         doc.is_tagged = True
         doc.is_parsed = True
     except:
         pass
     return doc
Example #32
def serialize_spacy_doc(orig_doc, converted_sentences):
    words = []
    spaces = []
    total_attrs = []
    attrs_ = list(attrs.NAMES)
    attrs_.remove('SENT_START')  # this clashes with HEAD (see spaCy documentation)
    attrs_.remove(
        'SPACY')  # we don't want to override the spaces we assign later on

    for orig_span, converted_sentence in zip(orig_doc.sents,
                                             converted_sentences):
        # remove redundant dummy-root-node
        converted = {
            iid: tok
            for iid, tok in converted_sentence.items() if iid != 0
        }
        orig = orig_span.as_doc()

        # get attributes of original doc
        orig_attrs = orig.to_array(attrs_)

        # append copied attributes for new nodes
        new_nodes_attrs = []
        for iid, tok in converted.items():
            if int(iid) != iid:
                new_node_attrs = list(orig_attrs[int(iid) - 1])

                # here we fix the relative head it points to; if it is a negative
                # number we cast it to its unsigned (two's-complement) equivalent
                relative = int(iid) - (len(orig_attrs) + len(new_nodes_attrs) +
                                       1)
                new_node_attrs[attrs_.index('HEAD')] = relative + (
                    2**NUM_OF_BITS if relative < 0 else 0)

                new_nodes_attrs.append(new_node_attrs)
        if new_nodes_attrs:
            new_attrs = np.append(orig_attrs, new_nodes_attrs, axis=0)
        else:
            new_attrs = orig_attrs
        total_attrs = np.append(total_attrs, new_attrs,
                                axis=0) if len(total_attrs) > 0 else new_attrs

        # fix whitespaces in case of new nodes: take original spaces. change the last one if there are new nodes.
        #   add spaces for each new nodes, except for last
        spaces += [t.whitespace_ if not ((i + 1 == len(orig)) and (len(new_nodes_attrs) > 0)) else ' ' for i, t in enumerate(orig)] + \
                  [' ' if i + 1 < len(converted.keys()) else '' for i, iid in enumerate(converted.keys()) if int(iid) != iid]
        spaces[-1] = ' '
        words += [t.get_conllu_field("form") for iid, t in converted.items()]

    # form new doc including new nodes and set attributes
    spaces[-1] = ''
    new_doc = Doc(orig_doc.vocab, words=words, spaces=spaces)
    new_doc.from_array(attrs_, total_attrs)

    j = 0
    for converted_sentence in converted_sentences:
        converted = {
            iid: tok
            for iid, tok in converted_sentence.items() if iid != 0
        }

        # store spacy ids for head indices extraction later on
        spacy_ids = {
            iid: (spacy_i + j)
            for spacy_i, iid in enumerate(converted.keys())
        }

        # set new info for all tokens per their head lists
        for i, bart_tok in enumerate(converted.values()):
            spacy_tok = new_doc[i + j]
            for head, rel in bart_tok.get_new_relations():
                # extract spacy correspondent head id
                head_tok = new_doc[
                    spacy_ids[head.get_conllu_field("id")] if head.
                    get_conllu_field("id") != 0 else spacy_tok.i]
                # parse stringish label
                is_state_head_node = ((head_tok.text == "STATE") and (head.get_conllu_field("id") != int(head.get_conllu_field("id")))) or \
                                     (bart_tok.get_conllu_field("id") != int(bart_tok.get_conllu_field("id")))
                new_rel, src, unc, alt = parse_bart_label(
                    rel, is_state_head_node=is_state_head_node)
                # add info to token
                spacy_tok._.parent_list.append({
                    'head': head_tok,
                    'rel': new_rel,
                    'src': src,
                    'alt': alt,
                    'unc': unc
                })

            # fix sentence boundaries, need to turn off is_parsed bool as it prevents setting the boundaries
            new_doc.is_parsed = False
            spacy_tok.is_sent_start = False if i != 0 else True
            new_doc.is_parsed = True

        j += len(converted)

    return new_doc
Example #33
def _spacy_decode(self, x):
    doc = Doc(self.nlp.vocab, words=x['words'])
    return doc.from_array([
        DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END,
        SENT_START, ORTH, POS, ENT_IOB
    ], x['arr'].reshape(x['shape']))