예제 #1
0
 def postprocess(self):
     """Build ``self.data`` from the ten-column rows in ``self.output_data``.

     Every element of ``self.output_data`` is treated as one sentence:
     trailing whitespace is stripped and the text is split on newlines,
     one token per line.  Each line must split on tabs into exactly ten
     fields (index, word, lemma, upos, xpos, feats, head, deprel, deps,
     misc); otherwise the tuple unpacking raises ``ValueError``.

     The ``feats`` value of each ``Token`` is the string form of
     ``Morph.from_parzu(xpos + "|" + feats)``.  The upos/xpos columns are
     read but deliberately not passed on to the ``Token``.
     """
     self.data = []
     for raw_sentence in self.output_data:
         sentence_tokens = []
         for row in raw_sentence.rstrip().split("\n"):
             (index, word, lemma, _upos, xpos, feats,
              head, deprel, deps, misc) = row.split("\t")
             # Gold POS tags are intentionally left out of the token; xpos
             # is only used as input for the morphology string.
             morph_string = str(Morph.from_parzu(xpos + "|" + feats))
             sentence_tokens.append(
                 Token(
                     id=index,
                     word=word,
                     lemma=lemma,
                     feats=morph_string,
                     head=head,
                     deprel=deprel,
                     deps=deps,
                     misc=misc,
                 )
             )
         self.data.append(Sentence(sentence_tokens))
예제 #2
0
 def postprocess(self):
     """Build ``self.data`` from word/tag rows in ``self.output_data``.

     Every element of ``self.output_data`` is split on newlines, one token
     per line, and each line must split on a tab into exactly two fields
     (word, tag); otherwise the unpacking raises ``ValueError``.

     For each token, ``xpos`` is ``rftag2stts(tag)`` and ``feats`` is the
     string form of ``Morph.from_rftag(tag)``.
     """
     self.data = []
     for raw_sentence in self.output_data:
         tokens = []
         for line in raw_sentence.split("\n"):
             word, rftag = line.split("\t")
             stts_tag = rftag2stts(rftag)
             morph_string = str(Morph.from_rftag(rftag))
             tokens.append(Token(word=word, xpos=stts_tag, feats=morph_string))
         self.data.append(Sentence(tokens))
예제 #3
0
 def postprocess(self):
     """Build ``self.data`` from the 4-item entries in ``self.output_data``.

     Every element of ``self.output_data`` is iterated as one sentence, and
     each of its entries must unpack into exactly four values
     (text, rftmorph, stts, lemma); otherwise ``ValueError`` is raised.

     For each token, ``xpos`` is the stts value and ``feats`` is the string
     form of ``Morph.from_rftag(rftmorph)``.
     """
     self.data = []
     for analysed_sentence in self.output_data:
         tokens = [
             Token(
                 word=form,
                 xpos=stts_tag,
                 feats=str(Morph.from_rftag(rf_morph)),
                 lemma=base_form,
             )
             for form, rf_morph, stts_tag, base_form in analysed_sentence
         ]
         self.data.append(Sentence(tokens))
예제 #4
0
 def postprocess(self):
     """Build ``self.data`` from the text block in ``self.output_data``.

     ``self.output_data`` is a single string: trailing whitespace is
     stripped, sentences are separated by blank lines (``"\\n\\n"``) and
     tokens by newlines.  Each token line must split on tabs into exactly
     three fields (word, tag, lemma); otherwise ``ValueError`` is raised.

     ``xpos`` is the part of the tag before its first ``"."``; ``feats`` is
     the string form of ``Morph.from_tigertag(tag)`` on the full tag.
     """
     self.data = []
     for block in self.output_data.rstrip().split("\n\n"):
         tokens = []
         for entry in block.split("\n"):
             word, full_tag, lemma = entry.split("\t")
             main_tag = full_tag.partition(".")[0]
             # Small correction: a bare "$" main tag is rewritten to "$.".
             if main_tag == "$":
                 xpos_tag = "$."
             else:
                 xpos_tag = main_tag
             tokens.append(
                 Token(
                     word=word,
                     xpos=xpos_tag,
                     lemma=lemma,
                     feats=str(Morph.from_tigertag(full_tag)),
                 )
             )
         self.data.append(Sentence(tokens))
예제 #5
0
def compare_morph(g, a):
    """Score the morphological features of ``a`` against those of ``g``.

    Both arguments must expose a ``feats`` attribute that is parsed with
    ``Morph(from_string=...)``.  For every name in ``MORPH_FEATS``:

    * if the feature is present in ``g``: 1 when ``a`` has the same value,
      otherwise 0 (a feature missing from ``a`` counts as a mismatch);
    * if the feature is absent from ``g``: ``np.nan``, so it is left out
      of the mean.

    Returns:
        A tuple ``(mean_score, per_feature)`` where ``per_feature`` maps
        each feature name to 1, 0 or ``np.nan`` and ``mean_score`` is the
        ``pandas.Series`` mean over those values.
    """
    gold = Morph(from_string=g.feats).feats
    predicted = Morph(from_string=a.feats).feats

    per_feature = {}
    scores = []
    for name in MORPH_FEATS:
        if feat_missing := name not in gold:
            # Not annotated in gold: ignore this feature.
            outcome = np.nan
        elif gold.get(name) == predicted.get(name):
            outcome = 1
        else:
            outcome = 0
        per_feature[name] = outcome
        scores.append(outcome)

    mean_score = pd.Series(scores).mean()
    return mean_score, per_feature
예제 #6
0
            chunks.append(chunk)
        else:
            chunks = []

        chunk = Chunk(idx, dst)

    elif line == "EOS":
        chunks.append(chunk)
        for k, v in srcs_dict.items():
            chunks[k].update_srsc(v)

        sents.append(chunks)
        srcs_dict.clear()

    else:
        morph = Morph(line)
        chunk.update_morph(morph)

for sent in sents:
    for m in sent:
        dst = m.dst
        srcs = m.srcs

        verb = None
        for morph in m.morphs:
            if morph.pos == "動詞":
                verb = morph.base
                break
        if verb:
            if dst != -1:
                subs = []
예제 #7
0
                sentence_list_temp = []
                temp = []
            temp1 = line[:-1].split(" ")
            num_list.append(temp1[1])
            dst_list.append(temp1[2][:-1])

        elif "\t" in line:
            item = line.strip().split("\t")
            try:
                surf = item[0]
                items = item[1].split(",")
            except IndexError:
                next
            if item == ['記号,空白,*,*,*,*,\u3000,\u3000,']:
                surf = "\u3000"
            one_morph.append(Morph(surf, items[6], items[0], items[1]))
            sentence_list_temp.append(surf)

        elif "EOS" in line:
            temp = []
            if len(sentence_list_temp) > 0:
                for item in sentence_list_temp:
                    temp.append(item)
                sentence_list.append("".join(temp))
                morph_list.append(one_morph)
                one_morph = []
                sentence_list_temp = []
                temp = []
            if len(morph_list) == 0:
                one_sent = []
                dst_list = []
예제 #8
0
파일: 40.py 프로젝트: tmu-nlp/100knock2021
import CaboCha
from common import Morph

# Read the CaboCha dump and collect one list of Morph objects per sentence.
all_sent = []
sent = []
with open("./data/neko.txt.cabocha") as f:
    for line in f:
        # Lines beginning with "* " are skipped explicitly; they carry no
        # tab-separated morpheme.  A line whose surface form is "*" itself
        # starts with "*\t" and is still parsed below.
        if line.startswith("* "):
            continue
        if "\t" in line:
            item = line.strip().split("\t")
            if len(item) < 2:
                # No feature column after stripping.  This is also how the
                # full-width space token ("\u3000\t記号,空白,...") shows up:
                # strip() removes its surface and the tab, leaving a single
                # field.  Skip the line instead of falling through with the
                # previous line's surf/items (the former bare `next` was a
                # no-op expression, not `continue`).
                continue
            surf = item[0]
            items = item[1].split(",")
            # Morph(surface, base, pos, pos1): base is feature column 6,
            # pos and pos1 are feature columns 0 and 1.
            sent.append(Morph(surf, items[6], items[0], items[1]))
        elif "EOS" in line:
            # End of sentence: keep only non-empty sentences.
            if sent:
                all_sent.append(sent)
                sent = []

# Print the morphemes of the sentence at index 1.
for item in all_sent[1]:
    print('surface=%s\tbase=%s\tpos=%s\tpos1=%s' %
          (item.surface, item.base, item.pos, item.pos1))