Ejemplo n.º 1
0
import os
from load_icd import load_icd, load_additional_icd, clean
from tqdm import tqdm
import pkuseg


# pkuseg segmenter loaded with the 'medicine' model.
seg = pkuseg.pkuseg(model_name='medicine')

# Every line of train.txt is split on a tab: column 0 is the input text and
# column 1 holds one or more labels joined by "##".
with open("train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

rows = [raw.strip().split("\t") for raw in lines]

# Cleaned input texts, one per training line.
x_list = [clean(row[0]) for row in rows]
# Cleaned label lists, aligned index-for-index with x_list.
y_list = [[clean(icd) for icd in row[1].split("##")] for row in rows]

# Lookup tables come from the project-local load_icd module; their exact
# structure is not visible here.
icd2str, match_list = load_icd()
str2standard = load_additional_icd()
print(match_list[:5])


def jaccard(x, y):
    """Return the Jaccard similarity ``|x ∩ y| / |x ∪ y|`` of two sets.

    Args:
        x: A set (any object providing ``union`` and ``intersection``).
        y: The collection to compare against ``x``.

    Returns:
        A float between 0.0 (disjoint) and 1.0 (identical). Two empty
        inputs are treated as identical and yield 1.0 rather than raising
        ``ZeroDivisionError``.
    """
    union_size = len(x.union(y))
    if union_size == 0:
        # Both inputs are empty: the ratio is 0/0, so fall back to the
        # usual convention that identical sets have similarity 1.
        return 1.0
    return len(x.intersection(y)) / union_size

def acc(pred_y, true_y):
    term_count = 0
Ejemplo n.º 2
0
            for item in sorted_dist[:output_y_num]:
                pred_y.append(item[0])
            output.append({x:pred_y})
            pickle.dump({x:pred_y}, f)
    # lock.acquire()
    # q.put(output)
    # lock.release()

if __name__ == "__main__":
    # Read the training file; only the first tab-separated column of each
    # line is used in this script.
    with open("train.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
    y_list = []
    x_list = [clean(line.strip().split("\t")[0]) for line in lines]

    q = mp.Queue()
    Lock = mp.Lock()
    # Single-process debug invocation, kept for reference:
    # find_distance_xy_mp(11, q, x_list[:2], output_y_num = 200, metrics = 'wmd')

    # Fan out over 10 worker processes. Worker i receives the consecutive
    # slice x_list[i*800:(i+1)*800], so items past index 7999 are never
    # dispatched. The trailing 200 and 'wrd' are passed positionally
    # (presumably output_y_num and metrics, judging by the call above).
    processes = [
        mp.Process(
            target=find_distance_xy_mp,
            args=(Lock, i, q, x_list[i * 800:(i + 1) * 800], 200, 'wrd'),
        )
        for i in range(10)
    ]
    # Start every worker before waiting on any of them.
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # out = []
    # for k in range(20):
    #     out1 = q.get()
    #     with open('./wmd_200/wmd_200'+str(k)+'.pkl', 'wb') as f: