import os
from load_icd import load_icd, load_additional_icd, clean
from tqdm import tqdm
import pkuseg

# pkuseg segmenter loaded with the 'medicine' model, created once at import.
seg = pkuseg.pkuseg(model_name='medicine')

# Each line of train.txt is split on a tab: field 0 is the input text and
# field 1 is a "##"-separated list of target terms.
with open("train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

x_list = []
y_list = []
for line in lines:
    stripped = line.strip()
    if not stripped:
        # A blank line (e.g. a trailing newline at end of file) has no
        # second field and would raise IndexError below; skip it.
        continue
    fields = stripped.split("\t")
    x_list.append(clean(fields[0]))
    y_list.append(fields[1].split("##"))
y_list = [[clean(icd) for icd in y] for y in y_list]

icd2str, match_list = load_icd()
str2standard = load_additional_icd()
print(match_list[0:5])


def jaccard(x, y):
    """Return the Jaccard similarity of two sets.

    The score is the size of the intersection divided by the size of the
    union.

    Args:
        x: A set (any object providing ``union`` and ``intersection``).
        y: A set or other iterable accepted by those two methods.

    Returns:
        A float between 0 and 1. When both inputs are empty the union is
        empty as well; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    union_size = len(x.union(y))
    if union_size == 0:
        return 0.0
    return len(x.intersection(y)) / union_size


def acc(pred_y, true_y): term_count = 0
# NOTE(review): the statements down to the "__main__" guard are the tail of a
# function whose header is outside this view. Their nesting was reconstructed
# from a collapsed line -- confirm indentation against the full file.
for item in sorted_dist[:output_y_num]:
    pred_y.append(item[0])
output.append({x:pred_y})
pickle.dump({x:pred_y}, f)
# lock.acquire()
# q.put(output)
# lock.release()


if __name__ == "__main__":
    # Driver: clean the first tab-separated field of every line of train.txt
    # and hand fixed-size slices of the result to worker processes.
    NUM_WORKERS = 10    # number of processes started
    CHUNK_SIZE = 800    # entries of x_list given to each process
    # Fifth positional argument of find_distance_xy_mp; presumably its
    # output_y_num parameter (see the commented-out call below) -- confirm.
    OUTPUT_Y_NUM = 200

    with open("train.txt", "r", encoding="utf-8") as f:
        x_list = [clean(line.strip().split("\t")[0]) for line in f]

    q = mp.Queue()
    lock = mp.Lock()
    # find_distance_xy_mp(11, q, x_list[:2], output_y_num = 200, metrics = 'wmd')

    # NOTE(review): only the first NUM_WORKERS * CHUNK_SIZE (8000) entries of
    # x_list are dispatched; entries beyond that are not processed, and a
    # shorter x_list gives the later workers an empty slice.
    processes = [
        mp.Process(
            target=find_distance_xy_mp,
            args=(
                lock,
                i,
                q,
                x_list[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE],
                OUTPUT_Y_NUM,
                'wrd',
            ),
        )
        for i in range(NUM_WORKERS)
    ]
    # Start every worker before joining any, so they run concurrently.
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # out = []
    # for k in range(20):
    #     out1 = q.get()
    #     with open('./wmd_200/wmd_200'+str(k)+'.pkl', 'wb') as f: