def postprocess(self):
    # split ParZu's tab-separated, ten-column CoNLL output into
    # Token/Sentence objects
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent.rstrip().split("\n"):
            (
                index,
                word,
                lemma,
                upos,
                xpos,
                feats,
                head,
                deprel,
                deps,
                misc,
            ) = tok.split("\t")
            mytokens.append(
                Token(
                    id=index,
                    word=word,
                    lemma=lemma,
                    # don't write out gold pos
                    # upos=upos,
                    xpos=xpos,
                    feats=str(Morph.from_parzu(xpos + "|" + feats)),
                    head=head,
                    deprel=deprel,
                    deps=deps,
                    misc=misc,
                ))
        self.data.append(Sentence(mytokens))

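# None of the postprocess() variants in this section define the Token and
# Sentence containers they construct. Below is a minimal sketch of what those
# classes would need to support, inferred from the constructor calls here
# (keyword fields, omitted fields, generator arguments); the dataclass layout
# and the CoNLL-style "_" defaults are assumptions, not the repo's actual
# definitions:
from dataclasses import dataclass


@dataclass
class Token:
    word: str
    id: str = "_"
    lemma: str = "_"
    upos: str = "_"
    xpos: str = "_"
    feats: str = "_"
    head: str = "_"
    deprel: str = "_"
    deps: str = "_"
    misc: str = "_"


class Sentence:
    def __init__(self, tokens):
        # accept any iterable, including the generator expressions used below
        self.tokens = list(tokens)

    def __iter__(self):
        return iter(self.tokens)
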
def postprocess(self):
    # each sent_doc is a spaCy Doc/Span; .tag_, .pos_ and .lemma_ are its
    # per-token annotations
    self.data = list()
    for sent_doc in self.output_data:
        self.data.append(
            Sentence(
                Token(word=str(tok),
                      xpos=tok.tag_,
                      upos=tok.pos_,
                      lemma=tok.lemma_) for tok in sent_doc))

def postprocess(self):
    # RFTagger output: one "token<TAB>fine-grained-tag" entry per line;
    # the tag is mapped down to STTS plus morphological features
    self.data = list()
    for sent in self.output_data:
        senttokens = list()
        for tok in sent.split("\n"):
            token, tag = tok.split("\t")
            stts = rftag2stts(tag)
            senttokens.append(
                Token(word=token,
                      xpos=stts,
                      feats=str(Morph.from_rftag(tag))))
        self.data.append(Sentence(senttokens))

def postprocess(self):
    # flat "token<TAB>tag<TAB>lemma" stream with "</s>" marking sentence
    # boundaries
    self.data = list()
    senttokens = list()
    for token in self.output_data:
        if token == "</s>":
            self.data.append(Sentence(senttokens))
            senttokens = list()
        else:
            tok, tag, lemma = token.split("\t")
            senttokens.append(Token(word=tok, xpos=tag, lemma=lemma))
    if senttokens:
        # add last sentence in case the stream did not end with "</s>"
        self.data.append(Sentence(senttokens))

def postprocess(self):
    # read governor/dependent index pairs off the parser's typed-dependency
    # relations (Java-style accessors)
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(
                Token(
                    id=str(rel.dep().index()),
                    word=rel.dep().word(),
                    # don't write out gold pos
                    # xpos=rel.dep().tag(),
                    head=str(rel.gov().index()),
                    deprel=str(rel.reln()),
                ) for rel in sent.typedDependencies()))

def postprocess(self):
    # stanfordnlp-style document: sentences hold parsed words carrying
    # governor/dependency_relation attributes
    self.data = list()
    for sent in self.output_data.sentences:
        self.data.append(
            Sentence(
                Token(
                    id=tok.index,
                    word=tok.text,
                    lemma=tok.lemma,
                    feats=tok.feats,
                    head=str(tok.governor),
                    deprel=tok.dependency_relation,
                ) for tok in sent.words))

def postprocess(self):
    # spaCy Doc: head indices are absolute token offsets, so convert them
    # to 1-based, sentence-internal indices
    # (NB: spaCy roots are their own head, so a root token gets its own
    # index here rather than 0)
    self.data = list()
    for sent in self.output_data.sents:
        self.data.append(
            Sentence(
                Token(
                    word=tok.text,
                    lemma=tok.lemma_,
                    # upos=tok.pos_,
                    # xpos=tok.tag_,
                    head=str(tok.head.i - sent[0].i + 1),
                    deprel=tok.dep_,
                ) for tok in sent))

def postprocess(self):
    # tokens arrive as (text, RFTagger-morph-tag, STTS-tag, lemma) tuples
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent:
            text, rftmorph, stts, lemma = tok
            mytokens.append(
                Token(
                    word=text,
                    xpos=stts,
                    feats=str(Morph.from_rftag(rftmorph)),
                    lemma=lemma,
                ))
        self.data.append(Sentence(mytokens))

import sys


def retokenize(input_data) -> Document:
    # get input from stdin
    doc = Document(input_data)
    # for each token in the Document:
    #   if the token is an APPRART (contracted preposition + article),
    #   look up the uncontracted pair in APPRARTMAP,
    #   replace the single APPRART token with an APPR and an ART token,
    #   and move the morphological features onto the ART
    for sent in doc.sentences:
        new_tok_locations = list()
        for i, tok in enumerate(sent):
            if tok.xpos == "APPRART":
                try:
                    appr, art = APPRARTMAP.get(tok.word.lower())
                    if tok.word[0].isupper():
                        appr = appr.capitalize()
                except TypeError:
                    # .get() returned None: the form is missing from the map
                    print(tok, file=sys.stderr)
                    sys.exit(1)
                new_tok = Token(word=art,
                                lemma="der",
                                xpos="ART",
                                feats=tok.feats,
                                head=tok.head)
                tok.word = appr
                tok.xpos = "APPR"
                tok.feats = "_"
                # assuming that the article will point to the noun,
                # set the preposition to point to the article
                tok.head = new_tok.id
                sent.tokens.insert(i + 1, new_tok)
                # remember where new tokens were added!
                # -> all head references afterwards must be updated!
                new_tok_locations.append(i)
        # TODO also handle PTKA="am", as in "am besten" -> "an dem besten"
        # then renumber tokens per sentence before sending to output
        for i, tok in enumerate(sent):
            tok.id = str(i + 1)
            if tok.head.isdigit():
                for new_tok_index in new_tok_locations:
                    if int(tok.head) > new_tok_index:
                        tok.head = str(int(tok.head) + 1)
    return doc

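# Hypothetical shape of the APPRARTMAP table assumed by retokenize() above:
# it must map a contracted German preposition+article (STTS tag APPRART) to
# its uncontracted (preposition, article) pair. The real table is not part
# of this section; these entries only illustrate the expected structure.
APPRARTMAP = {
    "am": ("an", "dem"),
    "ans": ("an", "das"),
    "beim": ("bei", "dem"),
    "im": ("in", "dem"),
    "ins": ("in", "das"),
    "vom": ("von", "dem"),
    "zum": ("zu", "dem"),
    "zur": ("zu", "der"),
}
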
def postprocess(self):
    self.data = list()
    for sent in self.output_data.rstrip().split("\n\n"):
        mytokens = list()
        for token_entry in sent.split("\n"):
            tok, tag, lemma = token_entry.split("\t")
            maintag = tag.split(".")[0]
            # small correction: a bare "$" main tag is not valid STTS
            stts = "$." if maintag == "$" else maintag
            mytokens.append(
                Token(
                    word=tok,
                    xpos=stts,
                    lemma=lemma,
                    feats=str(Morph.from_tigertag(tag)),
                ))
        self.data.append(Sentence(mytokens))

def postprocess(self):
    # protobuf-style response: repeated sentence/token fields carrying
    # word and pos attributes
    self.data = list()
    for sent in self.output_data.sentence:
        self.data.append(
            Sentence(
                Token(word=tok.word, xpos=tok.pos) for tok in sent.token))

def postprocess(self):
    """Re-format output_data so that it conforms to the eval format."""
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(Token(word=tok, xpos=tag) for tok, tag in sent))

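# For reference, a sketch of how the Sentence/Token objects collected in
# self.data could be serialized back to tab-separated CoNLL-style columns.
# The repo's actual writer and its exact eval format are not shown here, so
# the CoNLL-U column order below is an assumption:
def to_conll(sentences):
    lines = []
    for sentence in sentences:
        for tok in sentence:
            lines.append("\t".join([
                tok.id, tok.word, tok.lemma, tok.upos, tok.xpos,
                tok.feats, tok.head, tok.deprel, tok.deps, tok.misc,
            ]))
        lines.append("")  # sentences are separated by a blank line
    return "\n".join(lines)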