def postprocess(self):
    """Convert ``self.output_data`` into ``Sentence`` objects on ``self.data``.

    Every element of ``self.output_data`` is treated as one sentence: a
    string of newline-separated rows, each holding exactly ten
    tab-separated columns (index, word, lemma, upos, xpos, feats, head,
    deprel, deps, misc).  A row with any other number of columns raises
    ``ValueError`` from the unpacking.

    ``self.data`` is reset first and then filled one sentence at a time.
    """
    self.data = []
    for block in self.output_data:
        # Trailing whitespace is dropped so the final newline does not
        # produce an empty row.
        rows = block.rstrip().split("\n")
        tokens = [
            Token(
                id=index,
                word=word,
                lemma=lemma,
                # The upos column is read but intentionally not written out
                # (original note: "don't write out gold pos").
                xpos=xpos,
                # The morphology is built from the xpos and feats columns
                # joined by "|".
                feats=str(Morph.from_parzu(xpos + "|" + feats)),
                head=head,
                deprel=deprel,
                deps=deps,
                misc=misc,
            )
            for (
                index,
                word,
                lemma,
                _upos,
                xpos,
                feats,
                head,
                deprel,
                deps,
                misc,
            ) in (row.split("\t") for row in rows)
        ]
        self.data.append(Sentence(tokens))
def postprocess(self):
    """Convert ``self.output_data`` into ``Sentence`` objects on ``self.data``.

    Every element of ``self.output_data`` is treated as one sentence: a
    string of newline-separated rows of the form ``word<TAB>tag``.  A row
    that does not split into exactly two tab-separated fields raises
    ``ValueError``.

    For each row the tag is passed to ``rftag2stts`` to obtain the
    ``xpos`` value and to ``Morph.from_rftag`` to obtain the ``feats``
    string.
    """
    self.data = []
    for block in self.output_data:
        tokens = [
            Token(
                word=form,
                xpos=rftag2stts(tag),
                feats=str(Morph.from_rftag(tag)),
            )
            for form, tag in (row.split("\t") for row in block.split("\n"))
        ]
        self.data.append(Sentence(tokens))
def postprocess(self):
    """Convert ``self.output_data`` into ``Sentence`` objects on ``self.data``.

    ``self.output_data`` is iterated as a collection of sentences, each of
    which is an iterable of 4-item entries unpacked as
    ``(text, rftmorph, stts, lemma)``.  An entry of any other length
    raises ``ValueError`` from the unpacking.
    """
    self.data = []
    for entries in self.output_data:
        tokens = [
            Token(
                word=text,
                xpos=stts,
                # The morphological tag is converted via Morph.from_rftag
                # and stored in its string form.
                feats=str(Morph.from_rftag(rftmorph)),
                lemma=lemma,
            )
            for text, rftmorph, stts, lemma in entries
        ]
        self.data.append(Sentence(tokens))
def postprocess(self):
    """Convert ``self.output_data`` into ``Sentence`` objects on ``self.data``.

    ``self.output_data`` is a single string here: sentences are separated
    by blank lines (``"\\n\\n"``) and each row inside a sentence has three
    tab-separated fields, ``word<TAB>tag<TAB>lemma``.  A row with any other
    number of fields raises ``ValueError``.

    The part of the tag before the first ``"."`` becomes ``xpos``; the
    full tag is passed to ``Morph.from_tigertag`` for ``feats``.
    """
    self.data = []
    for block in self.output_data.rstrip().split("\n\n"):
        tokens = []
        for row in block.split("\n"):
            form, tag, lemma = row.split("\t")
            main_tag = tag.partition(".")[0]
            # Small correction: a tag beginning with "$." is itself cut at
            # its dot, leaving a bare "$"; restore it to "$.".
            if main_tag == "$":
                xpos = "$."
            else:
                xpos = main_tag
            tokens.append(
                Token(
                    word=form,
                    xpos=xpos,
                    lemma=lemma,
                    feats=str(Morph.from_tigertag(tag)),
                )
            )
        self.data.append(Sentence(tokens))
def compare_morph(g, a):
    """Score the morphological features of ``a`` against those of ``g``.

    Both arguments must expose a ``feats`` attribute, which is parsed with
    ``Morph(from_string=...)``.  For every feature name in ``MORPH_FEATS``:

    * if the feature is present in ``g``, the score is ``1`` when ``a``
      has the same value for it and ``0`` otherwise (a feature missing
      from ``a`` therefore counts as wrong);
    * if the feature is absent from ``g``, the score is ``np.nan``.

    Returns:
        A ``(mean, scores)`` tuple, where ``scores`` maps each feature
        name to its score and ``mean`` is ``pd.Series(...).mean()`` over
        those scores in ``MORPH_FEATS`` order.
    """
    gold = Morph(from_string=g.feats).feats
    predicted = Morph(from_string=a.feats).feats

    scores = {}
    ordered_scores = []
    for name in MORPH_FEATS:
        if feat_missing := name not in gold:
            # Not in gold: we ignore that feature.
            score = np.nan
        elif gold.get(name) == predicted.get(name):
            # Correct: the feature exists and its value matches.
            score = 1
        else:
            score = 0
        scores[name] = score
        ordered_scores.append(scores[name])

    return pd.Series(ordered_scores).mean(), scores
chunks.append(chunk) else: chunks = [] chunk = Chunk(idx, dst) elif line == "EOS": chunks.append(chunk) for k, v in srcs_dict.items(): chunks[k].update_srsc(v) sents.append(chunks) srcs_dict.clear() else: morph = Morph(line) chunk.update_morph(morph) for sent in sents: for m in sent: dst = m.dst srcs = m.srcs verb = None for morph in m.morphs: if morph.pos == "動詞": verb = morph.base break if verb: if dst != -1: subs = []
sentence_list_temp = [] temp = [] temp1 = line[:-1].split(" ") num_list.append(temp1[1]) dst_list.append(temp1[2][:-1]) elif "\t" in line: item = line.strip().split("\t") try: surf = item[0] items = item[1].split(",") except IndexError: next if item == ['記号,空白,*,*,*,*,\u3000,\u3000,']: surf = "\u3000" one_morph.append(Morph(surf, items[6], items[0], items[1])) sentence_list_temp.append(surf) elif "EOS" in line: temp = [] if len(sentence_list_temp) > 0: for item in sentence_list_temp: temp.append(item) sentence_list.append("".join(temp)) morph_list.append(one_morph) one_morph = [] sentence_list_temp = [] temp = [] if len(morph_list) == 0: one_sent = [] dst_list = []
import CaboCha
from common import Morph

# Script: read ./data/neko.txt.cabocha, collect the morphemes of every
# sentence as Morph objects, and print those of the second sentence.
# NOTE(review): the input is presumably CaboCha lattice output (header
# lines starting with "*", "surface<TAB>features" token lines, "EOS"
# sentence terminators) -- confirm against the file.

# All sentences read so far; each one is a list of Morph objects.
all_sent = []
# Morphemes of the sentence currently being read.
sent = []

with open("./data/neko.txt.cabocha") as f:
    for line in f:
        # A tab-less line starting with "*" carries no token, so skip it.
        # The original used a bare `next` here, which is a no-op rather
        # than `continue`.  Lines that do contain a tab are still
        # processed, so a token whose surface is "*" is not lost.
        if line.startswith("*") and "\t" not in line:
            continue

        if "\t" in line:
            item = line.strip().split("\t")
            if len(item) < 2:
                # No feature column after stripping.  This is also how the
                # full-width-space token shows up: strip() removes its
                # surface ("\u3000") and the tab, leaving only the feature
                # string.  Such lines are left out.  Previously a bare
                # `next` let other malformed lines fall through and reuse
                # the previous token's features.
                continue
            surf = item[0]
            items = item[1].split(",")
            # Morph arguments are presumably (surface, base, pos, pos1),
            # matching the attributes printed below -- confirm in common.
            sent.append(Morph(surf, items[6], items[0], items[1]))
        elif "EOS" in line:
            # End of sentence: keep it only if it contains morphemes.
            if sent:
                all_sent.append(sent)
            sent = []

# Print the second collected sentence (index 1); this raises IndexError
# when fewer than two non-empty sentences were read.
for item in all_sent[1]:
    print('surface=%s\tbase=%s\tpos=%s\tpos1=%s' % (item.surface, item.base, item.pos, item.pos1))