Example #1
def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent.rstrip().split("\n"):
            # unpack the ten tab-separated CoNLL-U columns
            (
                index,
                word,
                lemma,
                upos,
                xpos,
                feats,
                head,
                deprel,
                deps,
                misc,
            ) = tok.split("\t")
            mytokens.append(
                Token(
                    id=index,
                    word=word,
                    lemma=lemma,
                    # don't write out gold pos
                    # upos=upos, xpos=xpos,
                    feats=str(Morph.from_parzu(xpos + "|" + feats)),
                    head=head,
                    deprel=deprel,
                    deps=deps,
                    misc=misc,
                ))
        self.data.append(Sentence(mytokens))
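The ten-way unpacking corresponds to the ten tab-separated CoNLL-U columns that ParZu emits per token. A hypothetical sample line, just to make the field order concrete:

line = "1\tHund\tHund\tNOUN\tNN\tNom.Sg.Masc\t2\tsubj\t_\t_"
index, word, lemma, upos, xpos, feats, head, deprel, deps, misc = line.split("\t")
assert (word, xpos, head) == ("Hund", "NN", "2")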
Example #2
def postprocess(self):
    self.data = list()
    for sent_doc in self.output_data:
        self.data.append(
            Sentence(
                Token(word=str(tok),
                      xpos=tok.tag_,
                      upos=tok.pos_,
                      lemma=tok.lemma_) for tok in sent_doc))
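The attribute names (tag_, pos_, lemma_) match spaCy's Token API, so output_data is presumably an iterable of spaCy Doc or Span objects. A minimal sketch of how it might be filled (the model name is an assumption):

import spacy

nlp = spacy.load("de_core_news_sm")  # hypothetical model choice
doc = nlp("Das ist ein Beispiel.")
output_data = list(doc.sents)  # one Span per sentence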
Example #3
def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        senttokens = list()
        for tok in sent.split("\n"):
            token, tag = tok.split("\t")
            # map the fine-grained RFTagger tag to its STTS equivalent
            stts = rftag2stts(tag)
            senttokens.append(
                Token(word=token,
                      xpos=stts,
                      feats=str(Morph.from_rftag(tag))))
        self.data.append(Sentence(senttokens))
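Morph.from_rftag and rftag2stts suggest the input is RFTagger output: one word<TAB>tag pair per line, with a fine-grained, dot-separated morphological tag. A hypothetical sample sentence in that shape:

sent = "Das\tPRO.Dem.Subst.Nom.Sg.Neut\nist\tVFIN.Sein.3.Sg.Pres.Ind"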
Example #4
def postprocess(self):
    self.data = list()
    senttokens = list()
    for token in self.output_data:
        if token == "</s>":  # sentence boundary marker
            self.data.append(Sentence(senttokens))
            senttokens = list()
        else:
            tok, tag, lemma = token.split("\t")
            senttokens.append(Token(word=tok, xpos=tag, lemma=lemma))
    if senttokens:  # add last sentence
        self.data.append(Sentence(senttokens))
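The word<TAB>tag<TAB>lemma triples with </s> markers in between match TreeTagger's output format when the tokenizer passes SGML sentence boundaries through. A hypothetical sample:

output_data = [
    "Der\tART\tdie",
    "Hund\tNN\tHund",
    "bellt\tVVFIN\tbellen",
    "</s>",
]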
Example #5
def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(
                Token(
                    id=str(rel.dep().index()),
                    word=rel.dep().word(),
                    # don't write out gold pos
                    #   xpos=rel.dep().tag(),
                    head=str(rel.gov().index()),
                    deprel=str(rel.reln()),
                ) for rel in sent.typedDependencies()))
Example #6
def postprocess(self):
    self.data = list()
    for sent in self.output_data.sentences:
        self.data.append(
            Sentence(
                Token(
                    id=tok.index,
                    word=tok.text,
                    lemma=tok.lemma,
                    feats=tok.feats,
                    head=str(tok.governor),
                    deprel=tok.dependency_relation,
                ) for tok in sent.words))
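governor and dependency_relation match the Word objects of the stanfordnlp package (the pre-Stanza release), so output_data is presumably one of its Document objects. A hedged sketch of the call site:

import stanfordnlp  # the attribute names above match this package's Word API

nlp = stanfordnlp.Pipeline(lang="de")  # hypothetical pipeline configuration
output_data = nlp("Das ist ein Beispiel.")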
Example #7
def postprocess(self):
    self.data = list()
    for sent in self.output_data.sents:
        self.data.append(
            Sentence(
                Token(
                    word=tok.text,
                    lemma=tok.lemma_,
                    # upos=tok.pos_,
                    #   xpos=tok.tag_,
                    head=str(tok.head.i - sent[0].i + 1),
                    deprel=tok.dep_,
                ) for tok in sent))
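The head computation converts spaCy's document-wide token index into a 1-based, sentence-internal one: if the sentence starts at document token 7 (sent[0].i == 7) and a token's head sits at document token 9, the emitted head is 9 - 7 + 1 = 3, i.e. the third token of the sentence. Note that for the sentence root, whose spaCy head is the token itself, this yields the token's own index rather than the conventional CoNLL-U root head 0.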
Example #8
def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent:
            # each token is a (text, RFTagger morph tag, STTS tag, lemma) tuple
            text, rftmorph, stts, lemma = tok
            mytokens.append(
                Token(
                    word=text,
                    xpos=stts,
                    feats=str(Morph.from_rftag(rftmorph)),
                    lemma=lemma,
                ))
        self.data.append(Sentence(mytokens))
Example #9
import sys

def retokenize(input_data) -> Document:
    # input_data is the raw tool output (read from stdin by the caller)
    doc = Document(input_data)

    # for tok in Doc
    #  if tok is APPRART
    #   use map to find new tokens
    #   replace xpos with APPR + ART
    #   set feats on ART
    for sent in doc.sentences:
        new_tok_locations = list()
        for i, tok in enumerate(sent):
            if tok.xpos == "APPRART":
                try:
                    # a lookup miss returns None, which fails to unpack below
                    appr, art = APPRARTMAP.get(tok.word.lower())
                    if tok.word[0].isupper():
                        appr = appr.capitalize()
                except TypeError:
                    print(tok, file=sys.stderr)
                    sys.exit(1)

                new_tok = Token(word=art,
                                lemma="der",
                                xpos="ART",
                                feats=tok.feats,
                                head=tok.head)

                tok.word = appr
                tok.xpos = "APPR"
                tok.feats = "_"
                # assuming that the article will point to the noun,
                # set the preposition to point to the article
                tok.head = new_tok.id

                sent.tokens.insert(i + 1, new_tok)
                # remember where new tokens were added!
                # -> all head references afterwards must be updated!
                new_tok_locations.append(i)

            # TODO also handle PTKA="am", as in "am besten" -> "an dem besten"

        # then renumber tokens per sentence before sending to output
        for i, tok in enumerate(sent):
            tok.id = str(i + 1)
            if tok.head.isdigit():
                for new_tok_index in new_tok_locations:
                    if int(tok.head) > new_tok_index:
                        tok.head = str(int(tok.head) + 1)

    return doc
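APPRARTMAP is not shown in this listing; judging from its use, it maps a German contracted preposition-article (STTS tag APPRART) to a (preposition, article) pair. A hypothetical excerpt:

APPRARTMAP = {
    "am": ("an", "dem"),
    "ans": ("an", "das"),
    "beim": ("bei", "dem"),
    "im": ("in", "dem"),
    "ins": ("in", "das"),
    "vom": ("von", "dem"),
    "zum": ("zu", "dem"),
    "zur": ("zu", "der"),
}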
Example #10
def postprocess(self):
    self.data = list()
    for sent in self.output_data.rstrip().split("\n\n"):
        mytokens = list()
        for token_entry in sent.split("\n"):
            tok, tag, lemma = token_entry.split("\t")
            maintag = tag.split(".")[0]
            # minor correction: splitting on "." truncates the STTS
            # sentence-final punctuation tag "$." to "$", so restore it
            stts = "$." if maintag == "$" else maintag
            mytokens.append(
                Token(
                    word=tok,
                    xpos=stts,
                    lemma=lemma,
                    feats=str(Morph.from_tigertag(tag)),
                ))
        self.data.append(Sentence(mytokens))
Example #11
def postprocess(self):
    self.data = list()
    for sent in self.output_data.sentence:
        self.data.append(
            Sentence(
                Token(word=tok.word, xpos=tok.pos) for tok in sent.token))
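The singular attribute names (output_data.sentence, sent.token, tok.pos) look like protobuf repeated fields, as in CoreNLP's protocol-buffer Document objects; presumably output_data is such a parsed response.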
Example #12
def postprocess(self):
    """re-format output_data so that it conforms to eval format"""
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(Token(word=tok, xpos=tag) for tok, tag in sent))
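All twelve examples target the same small data model. Its definition is not part of this listing, but judging from the constructor calls above, a minimal sketch could look like this (field names are taken from the calls; the types, defaults, and the __iter__ are assumptions, with "_" as the usual CoNLL-U placeholder):

from dataclasses import dataclass
from typing import Iterable, List


@dataclass
class Token:
    # CoNLL-U style fields; "_" as empty-value default is an assumption
    id: str = "_"
    word: str = "_"
    lemma: str = "_"
    upos: str = "_"
    xpos: str = "_"
    feats: str = "_"
    head: str = "_"
    deprel: str = "_"
    deps: str = "_"
    misc: str = "_"


class Sentence:
    def __init__(self, tokens: Iterable[Token]):
        self.tokens: List[Token] = list(tokens)

    def __iter__(self):
        return iter(self.tokens)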