def compare_morph(g, a):
    """Score how well the morphological features of ``a`` match those of ``g``.

    Both arguments must expose a ``feats`` attribute. Each one is parsed with
    ``Morph(from_string=...)`` and the resulting ``.feats`` containers (used
    here via ``in`` and ``.get``) are compared for every name in
    ``MORPH_FEATS``. ``g`` is the reference ("gold") side.

    Per-feature score:
      * ``1``      -- the feature is present in ``g`` and ``a`` has an equal value
      * ``0``      -- the feature is present in ``g`` but the values differ
      * ``np.nan`` -- the feature is absent from ``g`` and is left unscored

    Returns:
        A ``(mean_score, scores_by_feature)`` tuple. ``mean_score`` is
        computed with ``pandas.Series.mean()``, which skips NaN entries by
        default; ``scores_by_feature`` maps each feature name to its score.
    """
    gold_feats = Morph(from_string=g.feats).feats
    other_feats = Morph(from_string=a.feats).feats

    scores_by_feature = {}
    scores = []
    for name in MORPH_FEATS:
        if name not in gold_feats:
            # Not in gold: ignore this feature (NaN does not affect the mean).
            scores_by_feature[name] = np.nan
            scores.append(np.nan)
            continue

        # Correct only if the feature exists in gold and the values match.
        matched = gold_feats.get(name) == other_feats.get(name)
        scores_by_feature[name] = 1 if matched else 0
        scores.append(scores_by_feature[name])

    return pd.Series(scores).mean(), scores_by_feature
chunks.append(chunk) else: chunks = [] chunk = Chunk(idx, dst) elif line == "EOS": chunks.append(chunk) for k, v in srcs_dict.items(): chunks[k].update_srsc(v) sents.append(chunks) srcs_dict.clear() else: morph = Morph(line) chunk.update_morph(morph) for sent in sents: for m in sent: dst = m.dst srcs = m.srcs verb = None for morph in m.morphs: if morph.pos == "動詞": verb = morph.base break if verb: if dst != -1: subs = []
sentence_list_temp = [] temp = [] temp1 = line[:-1].split(" ") num_list.append(temp1[1]) dst_list.append(temp1[2][:-1]) elif "\t" in line: item = line.strip().split("\t") try: surf = item[0] items = item[1].split(",") except IndexError: next if item == ['記号,空白,*,*,*,*,\u3000,\u3000,']: surf = "\u3000" one_morph.append(Morph(surf, items[6], items[0], items[1])) sentence_list_temp.append(surf) elif "EOS" in line: temp = [] if len(sentence_list_temp) > 0: for item in sentence_list_temp: temp.append(item) sentence_list.append("".join(temp)) morph_list.append(one_morph) one_morph = [] sentence_list_temp = [] temp = [] if len(morph_list) == 0: one_sent = [] dst_list = []
import CaboCha
from common import Morph

# Reads a CaboCha-parsed text file and collects, per sentence, one Morph
# object for every morpheme line. Afterwards prints the morphemes of the
# second collected sentence (index 1).

all_sent = []  # list of sentences; each sentence is a list of Morph objects
sent = []      # morphemes of the sentence currently being read

with open("./data/neko.txt.cabocha") as f:
    for line in f:
        # Lines starting with "*" that carry no tab-separated field hold no
        # morpheme, so skip them. A "*" line that does contain a tab is
        # still processed below, exactly as the original script did.
        if line.startswith("*") and "\t" not in line:
            continue

        if "\t" in line:
            item = line.strip().split("\t")
            # A usable morpheme line has a surface form and a feature field.
            # The original wrote `next` here, which is a no-op expression
            # rather than `continue`, so a malformed line could fall through
            # and reuse `items` from an earlier line. Skip such lines for
            # real. This also covers the full-width-space token: strip()
            # removes its U+3000 surface together with the tab, leaving a
            # single field.
            if len(item) < 2:
                continue
            surf = item[0]
            items = item[1].split(",")
            # Morph arguments: surface, items[6], items[0], items[1]
            # (printed below as surface / base / pos / pos1).
            sent.append(Morph(surf, items[6], items[0], items[1]))
        elif "EOS" in line:
            # End of sentence: keep non-empty sentences only.
            if sent:
                all_sent.append(sent)
            sent = []

# NOTE(review): raises IndexError if fewer than two sentences were collected.
for morph in all_sent[1]:
    print('surface=%s\tbase=%s\tpos=%s\tpos1=%s' % (morph.surface, morph.base, morph.pos, morph.pos1))