Esempio n. 1
0
def lu_rev():
    """Recompute ``surface_forms`` for every LU and save the LU list as JSON.

    For each entry of the module-level ``kolu`` list, every annotation
    returned by ``kfn.annotation(lu_id)`` is scanned for target denotations
    (``obj`` equal to ``'target'`` or ``'Target'``).  For each usable target
    the value returned by ``get_eojeol(span, text)`` is collected as a
    surface form, provided the annotated substring ``text[begin:end]`` is
    contained in it; otherwise the running mismatch counter is increased.

    Side effects:
        * sets ``entry['surface_forms']`` on every element of ``kolu``
          (the entries are modified in place);
        * prints progress and the cumulative mismatch count to stdout;
        * overwrites ``./resource/KFN_lus.json``.
    """
    new_kolu = []
    err = 0  # cumulative over all LUs, not reset per LU
    for lu_entry in kolu:
        luid = lu_entry['lu_id']
        annos = kfn.annotation(luid)
        print('###', luid, lu_entry['lu'])

        surface_forms = set()
        for anno in annos:
            denos = anno['denotations']
            text = anno['text']
            for deno in denos:
                if deno['obj'] not in ('target', 'Target'):
                    continue
                target_span = deno['span']
                b, e = target_span['begin'], target_span['end']
                # Exclude annotation errors: a broken span carries a string
                # offset instead of an integer one.
                if isinstance(b, str):
                    continue
                # presumably the whole eojeol (space-delimited word) that
                # contains the span -- confirm against get_eojeol
                target = get_eojeol(target_span, text)
                target_anno = text[b:e]
                if target_anno in target:
                    print('target_anno:', target_anno, 'target:', target)
                    surface_forms.add(target)
                else:
                    err += 1

        # Sorted so that the saved file is identical from run to run
        # (set iteration order of strings changes between interpreter runs).
        lu_entry['surface_forms'] = sorted(surface_forms)
        new_kolu.append(lu_entry)
        print('error:', err)

    # ensure_ascii=False emits raw Korean text, so the file encoding must be
    # pinned instead of depending on the platform default.
    with open('./resource/KFN_lus.json', 'w', encoding='utf-8') as f:
        json.dump(new_kolu, f, ensure_ascii=False, indent=4)
Esempio n. 2
0
def remove_annotation_error():
    """Drop annotation ids with a broken target span from every LU.

    Loads the LU list from ``./resource/KFN_lus.json.bak2``.  For each LU the
    annotations returned by ``kfn.annotation(lu_id)`` are inspected; an
    annotation is considered broken when one of its target denotations
    (``obj`` equal to ``'target'`` or ``'Target'``) has a string as
    ``span['begin']``.  Every broken annotation id is removed from the LU's
    ``ko_annotation_id`` list, and the cleaned LU list is written to
    ``./resource/KFN_lus.json``.

    Progress and each broken annotation id are printed to stdout.
    """
    with open('./resource/KFN_lus.json.bak2', 'r', encoding='utf-8') as f:
        klu = json.load(f)

    for lu_entry in klu:
        luid = lu_entry['lu_id']
        annos = kfn.annotation(luid)
        print('###', luid, lu_entry['lu'])

        # Collect ALL broken ids of this LU; a single scalar would remember
        # only the last one and could not represent an id equal to 0.
        bad_ids = set()
        for anno in annos:
            for deno in anno['denotations']:
                if deno['obj'] not in ('target', 'Target'):
                    continue
                # Annotation error: the span offset is a string.
                if isinstance(deno['span']['begin'], str):
                    bad_id = anno['ko_annotation_id']
                    print(bad_id)
                    bad_ids.add(bad_id)

        if bad_ids:
            kept = [aid for aid in lu_entry['ko_annotation_id']
                    if aid not in bad_ids]
            # Slice assignment keeps the same list object, and ids that are
            # not listed on the LU are simply ignored instead of raising.
            lu_entry['ko_annotation_id'][:] = kept
        print('')

    # ensure_ascii=False emits raw non-ASCII text; pin the encoding.
    with open('./resource/KFN_lus.json', 'w', encoding='utf-8') as f:
        json.dump(klu, f, ensure_ascii=False, indent=4)
Esempio n. 3
0
def gen_list_training_test():
    """Collect candidate test annotation ids and save them as JSON.

    Training/test data condition: the FE annotation is assumed to be free of
    errors (taken on trust).

    An annotation id becomes a test candidate when
      1) its LU is ambiguous, i.e. the lexeme (the part of ``lu`` before the
         first ``'.'``) occurs in more than one entry of ``kolu``; and
      2) the annotation has at least one denotation that is not the target,
         i.e. it carries a frame element.

    The de-duplicated ids are written to ``./anno_ids_for_test_v1.json``;
    counts and progress are printed to stdout.
    """
    from collections import Counter

    lexs = [entry['lu'].split('.')[0] for entry in kolu]

    # Test data condition 1) LUs with ambiguity.
    # One counting pass instead of calling list.count() per element.
    lex_counts = Counter(lexs)
    ambi = [lex for lex in lexs if lex_counts[lex] > 1]
    print('전체 LU 개수:', len(lexs))
    print('모호성 있는 LU 개수:', len(ambi))
    ambi = set(ambi)

    # Test data condition 2) LUs that have FEs.
    felus = []
    for entry in kolu:
        lex = entry['lu'].split('.')[0]
        luid = entry['lu_id']
        if lex not in ambi:
            continue
        print('### luid:', luid, 'added:', len(felus))
        for anno in kfn.annotation(luid):
            for deno in anno['denotations']:
                # Both spellings mark the target, so neither counts as an FE.
                if deno['obj'] not in ('target', 'Target'):
                    aid = anno['ko_annotation_id']
                    print('aid:', aid)
                    felus.append(aid)
        print('')

    print(len(felus))
    felus = list(set(felus))
    print(len(felus))
    with open('./anno_ids_for_test_v1.json', 'w', encoding='utf-8') as f:
        json.dump(felus, f, ensure_ascii=False, indent=4)
import json
import kfn
import pprint
from nltk.corpus import framenet as fn

# Usage walkthrough of the kfn helper module; every step prints its result.

# Fetch all lexical units (LUs) and report how many there are.
lus = kfn.lus()
print(len(lus))

# Fetch the LUs for one lemma (rebinds ``lus`` to this narrower result).
lus = kfn.lus_by_lemma('나누다')
print(lus)

# Fetch the full record of the first LU in that result by its lu_id.
lu = kfn.lu(lus[0]['lu_id'])
pprint.pprint(lu)

# Use the LU's 'fid' value to look up the frame in NLTK's FrameNet corpus
# reader and show the frame's name and definition.
frame_id = lu['fid']
f = fn.frame(frame_id)
print(f.name)
print(f.definition)

# Fetch the annotations attached to the same LU by its lu_id.
annotations = kfn.annotation(lus[0]['lu_id'])
print(annotations)
Esempio n. 5
0
def _index_annotations(annos):
    """Group annotations by ``ko_annotation_id``, preserving input order."""
    index = {}
    for anno in annos:
        index.setdefault(anno['ko_annotation_id'], []).append(anno)
    return index


def _write_conll(anno, frame, writer, label, id_key='sent_id', verbose=True):
    """Convert one annotation with ``get_conll`` and pass it to ``writer``.

    ``writer`` is called as ``writer(conll, sentence_id, text)`` where the
    sentence id is ``str(anno[id_key])``.  Any ``Exception`` is reported as
    ``'<label>: error'`` and swallowed so one bad annotation does not stop the
    export; ``KeyboardInterrupt`` is not an ``Exception`` subclass and still
    propagates.  On success ``'<label>: success'`` is printed when ``verbose``.
    """
    try:
        conll = get_conll(anno, frame)
        writer(conll, str(anno[id_key]), anno['text'])
    except Exception:
        print(str(label) + ': error')
        return
    if verbose:
        print(str(label) + ': success')


def gen_data():
    """Export the annotations of every LU in ``kolu`` through the writers.

    The test ids are read from the ``'test'`` list of ``./aids_list.json``.
    The frame name is taken as the third ``'.'``-separated field of ``lu``.

    First pass, per LU:
      * annotations whose id is a test id go to ``write_test``;
      * all other annotations go to ``write_training_for_all``;
      * every id in ``sejong_annotation_id`` is resolved with
        ``get_sejong_anno`` and also goes to ``write_training_for_all``.

    Second pass, per LU:
      * annotations whose id is not a test id go to ``write_training``.

    Failures on individual annotations are printed and skipped.
    """
    with open('./aids_list.json', 'r', encoding='utf-8') as f:
        aids_list = json.load(f)
    # A set gives constant-time membership tests for the per-id checks below.
    test_ids = set(aids_list['test'])

    for lu_entry in kolu:
        frame = lu_entry['lu'].split('.')[2]
        aids = lu_entry['ko_annotation_id']
        luid = lu_entry['lu_id']
        print('training', lu_entry['lu'])
        print(lu_entry['lu'])

        annos_by_id = _index_annotations(kfn.annotation(luid))
        for aid in aids:
            if aid in test_ids:
                writer = write_test
            else:
                writer = write_training_for_all
            for anno in annos_by_id.get(aid, ()):
                _write_conll(anno, frame, writer, aid)

        for sid in lu_entry['sejong_annotation_id']:
            try:
                sejong_anno = get_sejong_anno(sid)
            except Exception:
                print(str(sid) + ': error')
                continue
            # Sejong annotations use their annotation id as the sentence id.
            _write_conll(sejong_anno, frame, write_training_for_all, sid,
                         id_key='ko_annotation_id')

    for lu_entry in kolu:
        frame = lu_entry['lu'].split('.')[2]
        aids = lu_entry['ko_annotation_id']
        luid = lu_entry['lu_id']
        print('test', lu_entry['lu'])

        annos_by_id = _index_annotations(kfn.annotation(luid))
        for aid in aids:
            if aid in test_ids:
                continue
            for anno in annos_by_id.get(aid, ()):
                # This pass reports failures only.
                _write_conll(anno, frame, write_training, aid, verbose=False)
Esempio n. 6
0
def gen_tt_data():
    """Split annotation ids into training and test lists and save them.

    Candidate test ids are read from ``./anno_ids_for_test_v1.json``.  For
    every LU in ``kolu``:

      * the first annotation (in ``kfn.annotation`` order) whose id is a
        candidate is added to the test list -- at most one per LU;
      * every annotation whose id is not a candidate is added to the
        training list.

    Candidate annotations after the first one of an LU end up in neither
    list.  The result is written to ``./aids_list.json`` as
    ``{'training': [...], 'test': [...]}``; progress is printed per LU.
    """
    with open('./anno_ids_for_test_v1.json', 'r', encoding='utf-8') as f:
        # A set gives constant-time membership tests inside the loops below.
        candidate_ids = set(json.load(f))

    training_list = []
    test_list = []
    for lu_entry in kolu:
        luid = lu_entry['lu_id']
        print(luid)
        annos = kfn.annotation(luid)

        # Pick only the first candidate annotation of this LU for testing.
        for anno in annos:
            aid = anno['ko_annotation_id']
            if aid in candidate_ids:
                test_list.append(aid)
                break

        training_list.extend(
            anno['ko_annotation_id'] for anno in annos
            if anno['ko_annotation_id'] not in candidate_ids)

        # Progress is reported once per LU.
        print('test_data:', len(test_list))
        print('')

    result = {'training': training_list, 'test': test_list}
    with open('./aids_list.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)