def lu_rev():
    """Attach the observed target surface forms to every LU and save them.

    For each entry of the module-level ``kolu`` list, the annotations returned
    by ``kfn.annotation(lu_id)`` are scanned for target denotations (label
    ``'target'`` or ``'Target'``). For every target whose span offsets are
    usable, ``get_eojeol(span, text)`` is called and its result is recorded as
    a surface form when the annotated slice ``text[begin:end]`` is contained
    in it; otherwise the running mismatch counter is incremented.

    Side effects:
        * adds/overwrites the ``'surface_forms'`` key on each ``kolu`` entry
          (the entries are mutated in place);
        * prints progress and the running mismatch count for each LU;
        * writes the updated LU list to ``./resource/KFN_lus.json``.
    """
    new_kolu = []
    err = 0  # running count of targets whose annotated slice is not in the get_eojeol result
    for lu in kolu:
        surface_forms = []
        luid = lu['lu_id']
        annos = kfn.annotation(luid)
        print('###', luid, lu['lu'])
        for anno in annos:
            denos = anno['denotations']
            text = anno['text']
            for deno in denos:
                if deno['obj'] not in ('target', 'Target'):
                    continue
                target_span = deno['span']
                b, e = target_span['begin'], target_span['end']
                # Exclude annotation errors: a string offset cannot be used
                # to slice the sentence.
                if isinstance(b, str):
                    continue
                # NOTE(review): get_eojeol presumably returns the whole
                # space-delimited word containing the span - confirm.
                target = get_eojeol(target_span, text)
                target_anno = text[b:e]
                if target_anno in target:
                    print('target_anno:', target_anno, 'target:', target)
                    surface_forms.append(target)
                else:
                    err += 1
        # De-duplicate while keeping first-seen order so the saved file is
        # identical from run to run (set ordering of strings is not stable).
        lu['surface_forms'] = list(dict.fromkeys(surface_forms))
        new_kolu.append(lu)
        print('error:', err)
    # ensure_ascii=False emits raw Korean text, so the encoding must be
    # explicit instead of depending on the platform locale.
    with open('./resource/KFN_lus.json', 'w', encoding='utf-8') as f:
        json.dump(new_kolu, f, ensure_ascii=False, indent=4)
def remove_annotation_error():
    """Drop annotation IDs with a broken target span from every LU.

    Loads the LU list from ``./resource/KFN_lus.json.bak2``. For each LU, the
    annotations returned by ``kfn.annotation(lu_id)`` are scanned; an
    annotation counts as erroneous when one of its target denotations (label
    ``'target'`` or ``'Target'``) has a string ``begin`` offset. Every
    erroneous annotation ID is removed from the LU's ``'ko_annotation_id'``
    list, and the result is written to ``./resource/KFN_lus.json``.

    All erroneous IDs of an LU are removed, not only the last one found, and
    an ID that is not listed on the LU is ignored rather than raising.

    Side effects: prints the LU being processed and each erroneous ID found.
    """
    with open('./resource/KFN_lus.json.bak2', 'r', encoding='utf-8') as f:
        klu = json.load(f)
    for lu in klu:
        luid = lu['lu_id']
        annos = kfn.annotation(luid)
        print('###', luid, lu['lu'])
        bad_ids = set()
        for anno in annos:
            for deno in anno['denotations']:
                if deno['obj'] not in ('target', 'Target'):
                    continue
                # Annotation error: the begin offset is a string.
                if isinstance(deno['span']['begin'], str):
                    bad_id = anno['ko_annotation_id']
                    print(bad_id)
                    bad_ids.add(bad_id)
        if bad_ids:
            # Slice assignment keeps the same list object on the LU entry.
            lu['ko_annotation_id'][:] = [
                aid for aid in lu['ko_annotation_id'] if aid not in bad_ids
            ]
        print('')
    # ensure_ascii=False emits raw Korean text, so the encoding must be
    # explicit instead of depending on the platform locale.
    with open('./resource/KFN_lus.json', 'w', encoding='utf-8') as f:
        json.dump(klu, f, ensure_ascii=False, indent=4)
def gen_list_training_test():
    """Collect candidate test annotation IDs and save them as JSON.

    Condition for training/test data: the FE annotation has no errors (this
    is simply trusted here, not checked).

    Test-data condition 1) the LU is ambiguous: its lemma (the part of
    ``lu`` before the first ``'.'``) is shared by more than one entry of the
    module-level ``kolu`` list.
    Test-data condition 2) the annotation has at least one denotation that
    is not the target. Both ``'target'`` and ``'Target'`` are treated as the
    target label, so a capitalised target is not mistaken for a frame
    element.

    Side effects: prints progress and counts, and writes the de-duplicated
    annotation IDs to ``./anno_ids_for_test_v1.json``.
    """
    from collections import Counter

    # Condition 1: lemmas that occur in more than one LU.
    lexs = [lu['lu'].split('.')[0] for lu in kolu]
    lex_counts = Counter(lexs)
    ambi = {lex for lex, count in lex_counts.items() if count > 1}
    print('전체 LU 개수:', len(lexs))
    # Counts LU entries (not distinct lemmas) whose lemma is ambiguous.
    print('모호성 있는 LU 개수:', sum(lex_counts[lex] for lex in ambi))

    # Condition 2: annotations of those LUs that carry a frame element.
    felus = []
    for lu in kolu:
        lex = lu['lu'].split('.')[0]
        luid = lu['lu_id']
        if lex not in ambi:
            continue
        print('### luid:', luid, 'added:', len(felus))
        for anno in kfn.annotation(luid):
            for deno in anno['denotations']:
                if deno['obj'] not in ('target', 'Target'):
                    aid = anno['ko_annotation_id']
                    print('aid:', aid)
                    felus.append(aid)
        print('')
    print(len(felus))
    # De-duplicate while keeping first-seen order for a reproducible file.
    felus = list(dict.fromkeys(felus))
    print(len(felus))
    with open('./anno_ids_for_test_v1.json', 'w', encoding='utf-8') as f:
        json.dump(felus, f, ensure_ascii=False, indent=4)
import json
import kfn
import pprint
from nltk.corpus import framenet as fn

# Walk-through of the kfn lookup calls; everything below runs at import time.

# Fetch every LU and report how many there are.
lus = kfn.lus()
print(len(lus))

# Fetch the LUs registered for a single lemma.
lus = kfn.lus_by_lemma('나누다')
print(lus)

# Fetch the full record of the first match through its lu_id.
first_lu_id = lus[0]['lu_id']
lu = kfn.lu(first_lu_id)
pprint.pprint(lu)

# Look the LU's frame id up in the NLTK FrameNet corpus and show the frame.
frame_id = lu['fid']
f = fn.frame(frame_id)
print(f.name)
print(f.definition)

# Fetch the annotations attached to the same lu_id.
annotations = kfn.annotation(first_lu_id)
print(annotations)
def _write_annotation(label, anno, frame, id_key, writer, report_success=True):
    """Convert one annotation with ``get_conll`` and pass it to *writer*.

    Args:
        label: ID shown in the ``': success'`` / ``': error'`` progress line.
        anno: annotation dict; ``anno[id_key]`` and ``anno['text']`` are read.
        frame: frame name passed to ``get_conll``.
        id_key: key of *anno* whose value (as ``str``) is used as sentence ID.
        writer: callable invoked as ``writer(conll, sent_id, sent)``.
        report_success: when false, nothing is printed on success.

    Failures are best effort: any ``Exception`` is reported as
    ``'<label>: error'`` and swallowed. ``KeyboardInterrupt`` is not an
    ``Exception`` subclass, so it still propagates.
    """
    try:
        conll = get_conll(anno, frame)
        writer(conll, str(anno[id_key]), anno['text'])
    except Exception:
        print(str(label) + ': error')
    else:
        if report_success:
            print(str(label) + ': success')


def gen_data():
    """Write the training and test files from the annotations of every LU.

    Reads the ``'test'`` ID list from ``./aids_list.json`` and makes two
    passes over the module-level ``kolu`` list:

    1. Each annotation whose ID is listed on the LU goes to ``write_test``
       when the ID is a test ID, otherwise to ``write_training_for_all``.
       The LU's ``'sejong_annotation_id'`` entries are fetched with
       ``get_sejong_anno`` and also go to ``write_training_for_all``.
    2. Each non-test annotation goes to ``write_training``.

    The frame passed to ``get_conll`` is the third ``'.'``-separated field
    of the LU name. Annotations that fail to convert or write are reported
    and skipped.
    """
    # NOTE(review): the parsed content is not used in this function; the load
    # is kept so a missing or malformed file still fails up front - confirm
    # whether it can be dropped.
    with open('./resource/KFN_annotations_from_sejong.json', 'r',
              encoding='utf-8') as f:
        json.load(f)
    with open('./aids_list.json', 'r', encoding='utf-8') as f:
        aids_list = json.load(f)
    # A set makes the per-annotation membership test O(1).
    test_ids = set(aids_list['test'])

    for lu in kolu:
        frame = lu['lu'].split('.')[2]
        aids = lu['ko_annotation_id']
        luid = lu['lu_id']
        print('training', lu['lu'])
        print(lu['lu'])
        annos = kfn.annotation(luid)
        for aid in aids:
            writer = write_test if aid in test_ids else write_training_for_all
            for anno in annos:
                if aid == anno['ko_annotation_id']:
                    _write_annotation(aid, anno, frame, 'sent_id', writer)

        for sid in lu['sejong_annotation_id']:
            try:
                sejong_anno = get_sejong_anno(sid)
            except Exception:
                print(str(sid) + ': error')
                continue
            # Sejong annotations use their annotation ID as the sentence ID.
            _write_annotation(sid, sejong_anno, frame, 'ko_annotation_id',
                              write_training_for_all)

    for lu in kolu:
        frame = lu['lu'].split('.')[2]
        aids = lu['ko_annotation_id']
        luid = lu['lu_id']
        print('test', lu['lu'])
        annos = kfn.annotation(luid)
        for aid in aids:
            if aid in test_ids:
                continue
            for anno in annos:
                if aid == anno['ko_annotation_id']:
                    _write_annotation(aid, anno, frame, 'sent_id',
                                      write_training, report_success=False)
def gen_tt_data():
    """Split annotation IDs into training and test lists and save them.

    Loads the candidate test IDs from ``./anno_ids_for_test_v1.json``. For
    each entry of the module-level ``kolu`` list, using the annotations
    returned by ``kfn.annotation(lu_id)``:

    * the first annotation ID that is a candidate is added to the test list
      (at most one per LU);
    * every annotation ID that is not a candidate is added to the training
      list.

    Candidate IDs after the first one of an LU end up in neither list.

    Side effects: prints each LU ID with the running test-list size, and
    writes ``{'training': [...], 'test': [...]}`` to ``./aids_list.json``.
    """
    with open('./anno_ids_for_test_v1.json', 'r', encoding='utf-8') as f:
        aids = json.load(f)
    # A set makes the per-annotation membership test O(1).
    candidates = set(aids)

    training_list = []
    test_list = []
    for lu in kolu:
        luid = lu['lu_id']
        print(luid)
        anno_ids = [anno['ko_annotation_id'] for anno in kfn.annotation(luid)]

        # Only the first candidate of each LU becomes test data.
        for aid in anno_ids:
            if aid in candidates:
                test_list.append(aid)
                break
        training_list.extend(
            aid for aid in anno_ids if aid not in candidates
        )

        print('test_data:', len(test_list))
        print('')

    result = {'training': training_list, 'test': test_list}
    with open('./aids_list.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)