def gen_sepsis_json_data():
    """Build the per-patient, per-time feature-record dict for sepsis patients.

    Reads the raw vital CSV (args.vital_file), replaces each raw value by its
    precomputed rank from feature_value_order_dict, re-bases each record's
    time against the patient's sepsis time, and writes the result to
    result_dir/sepsis_time_record_dict.json.
    """
    vital_file = args.vital_file
    patient_time_record_dict = dict()
    feature_index_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_index_dict.json'))
    index_feature_list = py_op.myreadjson(os.path.join(args.result_dir, 'index_feature_list.json'))
    feature_value_order_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_value_order_dict.json'))
    # re-key by feature index; time-like columns carry no value-order mapping
    feature_value_order_dict = { str(feature_index_dict[k]):v for k,v in feature_value_order_dict.items()  if 'time' not in k}
    patient_time_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_dict.json'))
    # use a context manager so the CSV handle is closed even on error
    # (the original left `open(vital_file)` to be collected by the GC)
    with open(vital_file) as csv_f:
        for i_line, line in enumerate(csv_f):
            if i_line and i_line % 10000 == 0:
                print('line', i_line)
            if i_line:  # skip the header row
                data = line.strip().split(',')
                patient, time = data[:2]
                time = time_to_min(time)
                if patient not in patient_time_dict:
                    continue
                # drop records after the patient's sepsis time
                if time > patient_time_dict[patient]:
                    continue
                time = int(float(time))
                if patient not in patient_time_record_dict:
                    patient_time_record_dict[patient] = dict()

                data = data[2:]
                vs = dict()
                for idx, val in enumerate(data):
                    if len(val) == 0:  # empty cell: feature not measured
                        continue
                    value_order = feature_value_order_dict[str(idx)]
                    # store the value's rank, rounded to 3 decimals
                    vs[idx] = float('{:3.3f}'.format(value_order[val]))
                # key records by (negative) offset from the sepsis time
                patient_time_record_dict[patient][time - patient_time_dict[patient] - 1] = vs

    with open(os.path.join(args.result_dir, 'sepsis_time_record_dict.json'), 'w') as f:
        f.write(json.dumps(patient_time_record_dict))
def analyze_sepsis():
    """Print statistics on how much pre-onset history positive patients have.

    Counts sepsis-positive patients and, for several look-back windows
    (30/60/120 time steps), how many positives have less recorded history
    than the window.
    """
    sepsis_time_record_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_record_dict.json'))
    sepsis_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print(len(sepsis_time_record_dict))
    n = 0
    d = {
            30: 0,
            60: 0,
            120: 0,

            }
    for p, vd in sepsis_time_record_dict.items():
        # only sepsis-positive patients are counted
        if not sepsis_label_dict[p]:
            continue
        n += 1
        # JSON object keys are strings: compare numerically, not
        # lexicographically, to find the earliest (most negative) offset.
        # (The original `min(vd.keys())` compared the key strings directly.)
        min_t = -int(min(float(k) for k in vd))
        for k in d:
            if min_t < k:
                d[k] += 1
    print(n)
    print(d)
# Beispiel #3
def compare_sepsis():
    """Intersect sepsis labels with the general patient label set and rewrite
    sepsis_label_dict.json / sepsis_time_dict.json restricted to the overlap."""
    print('reading')
    sepsis_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print('reading')
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_label_dict.json'))
    shared = set(sepsis_label_dict) & set(patient_label_dict)
    print(len(shared))
    print(len(shared))
    # binarize labels, keeping only patients present in both dicts
    d = {
        p: (0 if label == 0 else 1)
        for p, label in sepsis_label_dict.items() if p in patient_label_dict
    }
    print(len(d))
    print(sum(d.values()))
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'),
                      d)

    # restrict the sepsis time dict to the same patient set and rewrite it
    sepsis_time_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'sepsis_time_dict.json'))
    sepsis_time_dict = {p: t for p, t in sepsis_time_dict.items() if p in d}
    print(len(sepsis_time_dict))
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'),
                      sepsis_time_dict)
# Beispiel #4
def ana_feat_dist(task):
    n_split = 100
    feature_label_count = np.zeros((143, 2, n_split))
    patient_time_record_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'json_data',
                     '{:s}.json'.format(args.task)))
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.file_dir,
                     'patient_label_dict.{:s}.json'.format(args.task)))
    [[[0. for _ in range(n_split)], [0. for _ in range(n_split)]]
     for i in range(143)]
    for ip, (p, t_dict) in enumerate(patient_time_record_dict.items()):
        if ip % 10000 == 0:
            print ip, len(patient_time_record_dict)

        label = patient_label_dict[p]
        for t, vs in t_dict.items():
            for v in vs:
                feature, value = v
                idx = int(value * n_split)
                feature_label_count[feature, label, idx] += 1
    for f in range(143):
        for l in range(2):
            feature_label_count[feature,
                                label] /= feature_label_count[feature,
                                                              label].sum()
    np.save('../file/feature_label_count.npy', feature_label_count)
# Beispiel #5
def gen_json_data():
    """Build patient_time_record_dict.json from the raw vital CSV.

    Each raw value is replaced by its precomputed rank
    (feature_value_order_dict), grouped features are merged via
    index_group_dict, and times are re-based against each patient's
    reference time before the result is written out.
    """
    vital_file = args.vital_file
    patient_time_record_dict = dict()
    feature_index_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_index_dict.json'))
    feature_value_order_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_value_order_dict.json'))
    # re-key by feature index; 'event' columns carry no value-order mapping
    feature_value_order_dict = {
        str(feature_index_dict[k]): v
        for k, v in feature_value_order_dict.items() if 'event' not in k
    }
    index_group_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'index_group_dict.json'))
    patient_time_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_time_dict.json'))
    mx_time = -100  # largest time seen; used below to detect absolute times
    for i_line, line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print 'line', i_line
        # lines containing 'event_time' (presumably the header) are skipped —
        # TODO confirm only the header matches
        if 'event_time' not in line:
            data = line.strip().split(',')
            patient, time = data[:2]
            time = int(float(time))
            mx_time = max(mx_time, time)
            if patient not in patient_time_record_dict:
                patient_time_record_dict[patient] = dict()
            if time not in patient_time_record_dict[patient]:
                patient_time_record_dict[patient][time] = dict()

            data = data[2:]
            vs = dict()
            for idx, val in enumerate(data):
                if len(val) == 0:  # empty cell: feature not measured
                    continue
                # map a grouped feature onto its group's representative index
                if str(idx) in index_group_dict:
                    idx = index_group_dict[str(idx)]
                value_order = feature_value_order_dict[str(idx)]
                vs[idx] = value_order[val]
            patient_time_record_dict[patient][time].update(vs)

    new_d = dict()
    for p, tr in patient_time_record_dict.items():
        new_d[p] = dict()
        for t, vs in tr.items():
            # a positive mx_time suggests raw absolute times: re-base them
            # against the patient's reference time (minus a 4-step margin)
            if mx_time > 0:
                t = int(t - patient_time_dict[p] - 4)
            if t < -102:  # drop records too far before the reference time
                continue
            # store records as [feature_index, rank] pairs in index order
            nvs = []
            for k in sorted(vs.keys()):
                nvs.append([k, vs[k]])
            new_d[p][t] = nvs
    with open(os.path.join(args.result_dir, 'patient_time_record_dict.json'),
              'w') as f:
        # f.write(json.dumps(new_d, indent=4))
        f.write(json.dumps(new_d))
def analyse_variation_trend(task='task1'):
    '''
    Collect, per feature, the (times, values) series of patients that have
    more than 4 measurements of that feature, and dump them to
    feature_variation_trend_dict.json.  Scanning stops around the 500000th
    CSV line.
    '''
    feature_variation_trend_dict = dict()

    feature_value_order_dict = py_op.myreadjson(os.path.join(args.file_dir, 'feature_value_order_dict.{:s}.json'.format(task)))

    patient_time_dict = py_op.myreadjson(os.path.join(args.file_dir, 'patient_time_dict.json'))

    task_dir = os.path.join(args.data_dir, 'sepsis2_{:s}_training'.format(task))
    vital_file = os.path.join(task_dir, 'sepsis2_{:s}_vital_training.csv'.format(task))
    vital_dict = { } # key-valuelist-dict

    last_patient = ''
    # feature index -> {time: value} accumulated for the current patient
    feature_time_value_dict = dict()
    for i_line,line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print i_line
        if i_line:  # skip the header row
            ctt_list = line.strip().split(',')[2:]
            new_ctt = line.strip().split(',')[:2]
            if task == 'task1':
                # task1 times are absolute: re-base against the patient's
                # reference time (minus a 4-step margin)
                patient, time = new_ctt
                new_time = float(time) - patient_time_dict[patient] - 4.0
                new_ctt = [patient, str(new_time)]

            patient, time = new_ctt
            time = int(float(time))

            if patient != last_patient:
                # flush the finished patient's per-feature series (only
                # features with more than 4 measurements are kept)
                # NOTE(review): the final patient's series is never flushed,
                # and the loop stops at the first patient change past line
                # 500000 — confirm this sampling behavior is intended
                for feature, tv in feature_time_value_dict.items():
                    if len(tv) > 4:
                        ts = sorted(tv.keys())
                        vs = [tv[t] for t in ts]
                        feature_variation_trend_dict[feature] = feature_variation_trend_dict.get(feature, []) + [[ts, vs]]
                if i_line >= 500000:
                    break

                feature_time_value_dict = dict()
                last_patient = patient

            # record every non-empty cell for the current patient
            for idx, value in enumerate(ctt_list):
                if len(value.strip()):
                    value = float(value.strip())
                    if idx not in feature_time_value_dict:
                        feature_time_value_dict[idx] = { }
                    feature_time_value_dict[idx][time] = value



    # py_op.mywritejson(os.path.join(args.file_dir, 'feature_variation_trend_dict.json'), feature_variation_trend_dict)
    with open (os.path.join(args.file_dir, 'feature_variation_trend_dict.json'), 'w') as f:
        f.write(json.dumps(feature_variation_trend_dict))
def feature_change():
    """Print, per feature (0-7), the cumulative distribution of the absolute
    change between consecutive records of positive-labeled patients."""
    print('reading')
    patient_time_record_dict = json.load(open(os.path.join(args.result_dir, 'patient_time_record_dict.json')))
    print(patient_time_record_dict.keys()) 
    patient_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'patient_label_dict.json'))
    feature_list_dict = { str(i): [] for i in range(8) }
    for p in patient_time_record_dict:
        # only positive-labeled patients contribute
        if p in patient_label_dict and patient_label_dict[p]:
            tr = patient_time_record_dict[p]
            last_v = { }
            # NOTE(review): tr.values() iterates in JSON/insertion order, not
            # sorted by time — confirm records are stored chronologically
            for rs in tr.values():
                for i,v in rs.items():
                    if i in last_v:
                        feature_list_dict[i].append(abs(v - last_v[i]))
                    last_v[i] = v
    for f,l in feature_list_dict.items():
        l = sorted(l)
        # thresholds of interest (the original first bound a longer list that
        # was immediately overwritten; that dead assignment is removed)
        ds = [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0]
        ns = []
        for i,delta in enumerate(l):
            # guard against an exhausted threshold list: the original read
            # ds[0] unconditionally and raised IndexError once every
            # threshold had been consumed
            if not ds:
                break
            if delta  > ds[0]:
                # fraction of deltas at or below this threshold
                ns.append(float('{:3.2f}'.format(1.0*i/len(l))))
                ds = ds[1:]
        ns.append(len(l))
        print(f, ns) 
# Beispiel #8
def split_data():
    """Split the patient list 80/20 into train/valid sets and write them out,
    printing positive counts per split and the train size."""
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_label_dict.json'))
    patients = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_list.json'))
    cut = int(len(patients) * 0.8)
    patient_train, patient_valid = patients[:cut], patients[cut:]
    py_op.mywritejson(os.path.join(args.result_dir, 'train.json'),
                      patient_train)
    py_op.mywritejson(os.path.join(args.result_dir, 'valid.json'),
                      patient_valid)
    # positive label counts per split, then the train split size
    print(sum([patient_label_dict[k] for k in patient_train]))
    print(sum([patient_label_dict[k] for k in patient_valid]))
    print(len([patient_label_dict[k] for k in patient_train]))
def ana_patient():
    fo = '/home/yin/comparison'
    for task in os.listdir(fo):
        print '\n', fo
        task_dir = os.path.join(fo, task)
        task_dir = os.path.join(task_dir, os.listdir(task_dir)[-1])
        for fi in os.listdir(task_dir):
            patients = py_op.myreadjson(os.path.join(task_dir, fi))
            print fi, len(patients)
# Beispiel #10
def gen_normal_range_order():
    """For each feature, record the value-order (rank) at which its clinical
    normal range begins and ends, and write the resulting pairs to
    feature_normal_range_order_dict.json."""
    feature_value_order_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_value_order_dict.json'))
    index_vital_list = py_op.myreadjson(os.path.join(args.result_dir, 'index_feature_list.json'))
    vital_normal_range_dict  = py_op.myreadjson(os.path.join(args.result_dir, 'vital_normal_range_dict.json'))
    feature_normal_range_order_dict = { }
    for feature, d in feature_value_order_dict.items():
        # time-like pseudo-features have no normal range
        if 'time' in feature:
            continue
        normal_range = vital_normal_range_dict[feature]  # presumably [low, high] — TODO confirm
        # iterate the feature's observed values in ascending numeric order
        values = sorted(d.keys(), key = lambda s:float(s))
        feature_normal_range_order_dict[feature] = []
        for v in values:
            # first value above the lower bound -> order where the range starts
            if float(v) > normal_range[0] and len(feature_normal_range_order_dict[feature]) == 0:
                feature_normal_range_order_dict[feature].append(d[v])
            # first value above the upper bound -> order where the range ends
            if float(v) > normal_range[1] and len(feature_normal_range_order_dict[feature]) == 1:
                feature_normal_range_order_dict[feature].append(d[v])
                break
    print(feature_normal_range_order_dict) 
    py_op.mywritejson(os.path.join(args.result_dir, 'feature_normal_range_order_dict.json'), feature_normal_range_order_dict)
# Beispiel #11
def stati_mse_diff_distribution(clean_dir,
                                stain_dir,
                                pred_dir,
                                stati_bad=False,
                                result_json='../../data/result/7_311.json',
                                use_max=False):
    '''
    Measure which gray-difference regions the remaining MSE of the generated
    images comes from.  Pixels are bucketed into regions bounded by
    `threshhold_list`.
    stati_bad: if True, only the worst-scoring 10% of images (per
               result_json) are counted
    result_json: JSON file holding the per-image prediction scores
    '''
    if stati_bad:
        result_dict = py_op.myreadjson(result_json)
        # keep the 10% of images with the lowest score
        result_list = sorted([(v, k) for k, v in result_dict.items()
                              ])[:int(0.1 * len(result_dict))]
        result_list = set([k.split('.')[0] for v, k in result_list])
    threshhold_list = [0, 2, 6, 20, 60, 100, 150, 250]
    mse_sum = np.zeros(len(threshhold_list))
    mask_sum = np.zeros(len(threshhold_list))
    for fi in tqdm(os.listdir(pred_dir)):
        if stati_bad:
            if fi.split('.')[0] not in result_list:
                continue
        pred_fi = os.path.join(pred_dir, fi)
        clean_fi = os.path.join(clean_dir, fi.replace('png', 'jpg'))
        stain_fi = os.path.join(stain_dir, fi.replace('.png', '_.jpg'))
        try:
            pred_image = np.array(Image.open(pred_fi).resize(
                (250, 250))).astype(np.float32)
            clean_image = np.array(Image.open(clean_fi).resize(
                (250, 250))).astype(np.float32)
            stain_image = np.array(Image.open(stain_fi).resize(
                (250, 250))).astype(np.float32)
            # bucket pixels by the |clean - stain| gray difference
            mask = get_mask(
                np.abs(clean_image.astype(np.float32) - stain_image),
                threshhold_list, use_max)
        except:
            # best-effort: skip images that are missing or unreadable
            continue
        # accumulate region sizes and the squared error inside each region
        for n in range(len(threshhold_list)):
            mask_sum[n] += (mask == n).sum()
            mse_sum[n] += ((clean_image[mask == n] -
                            pred_image[mask == n])**2).sum()
    print '生成的图片主要的mse分布'
    print '灰度差别 \t 区域占比例 \t mse占比例'
    for n in range(len(threshhold_list) - 1):
        threshhold_min = threshhold_list[n]
        threshhold_max = threshhold_list[n + 1]
        print '[{:d}, {:d}]   \t {:2.2f} \t\t {:2.2f}'.format(
            threshhold_min, threshhold_max, mask_sum[n] / sum(mask_sum),
            mse_sum[n] / sum(mse_sum))
# Beispiel #12
def ana_data_similar():
    def get_master(task):
        if task == 'task2':
            master_file = '/home/yin/contestdata2/DII_sepsis2_task2_evaluation/sepsis2_task2_evaluation_master.csv'
        elif task == 'case1':
            master_file = '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case1_master.csv'
        else:
            master_file = '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case2_master.csv'
        master_pid_dict = dict()
        for i,line in enumerate(open(master_file)):
            if i == 0:
                continue
            pid = line.split(',')[0]
            # master = line.replace(pid+',', '')
            master = line[len(pid) + 1:]
            master = ''.join(master.split())
            master_pid_dict[master] = master_pid_dict.get(master, []) + [pid]
        return master_pid_dict
    task_master_pid_dict = dict()
    task_patient_data = py_op.myreadjson('../result/task_patient_data.json')
    for k in ['case1', 'case2', 'task2']:
        task_master_pid_dict[k] = get_master(k)

    kf = 'case1'
    ks = 'task2'
    ks = 'case2'
    master_set = set(task_master_pid_dict[kf]) & set(task_master_pid_dict[ks])
    cset = set()
    n = 0
    for master in master_set:
        pc = task_master_pid_dict[kf][master]
        pt = task_master_pid_dict[ks][master]
        if len(pc) + len(pt) >= 2:
            for ppc in pc:
                n += 1
                for ppt in pt:
                    ppc_data = set(task_patient_data[kf][ppc])
                    ppt_data = set(task_patient_data[ks][ppt])
                    same = 0
                    for cline in ppc_data:
                        for tline in ppt_data:
                            if cline == tline:
                                # print ppc, ppt
                                # cset.add(ppc)
                                # print cline
                                # print tline
                                same += 1
                    if same > 5:
                        print same, len(ppc_data), len(ppt_data)
                        cset.add(ppc)
    print len(cset), n
# Beispiel #13
def get_cases():
    """Reset all sepsis labels to 0, then mark patients from
    sepsis_onset_time.csv as positive with an onset time derived from the ICU
    in-time, and rewrite the label/time JSON files."""
    sepsis_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print(len(sepsis_label_dict))
    icu_file  = '../data/icustays.csv'
    print('reading icustays.csv')
    icu_data = pd.read_csv(icu_file)
    icu_adm_dict = dict()     # icustay_id -> hadm_id
    icu_intime_dict = dict()  # icustay_id -> ICU in-time (minutes)
    for iline in range(len(icu_data)):
        icu = icu_data.loc[iline, 'icustay_id']
        intime = icu_data.loc[iline, 'intime']
        adm = icu_data.loc[iline, 'hadm_id']
        icu_adm_dict[icu] = adm
        icu_intime_dict[icu] = time_to_min(intime)

    # start from an all-negative labeling
    sepsis_label_dict = { k:0 for k in sepsis_label_dict }
    sepsis_time_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_dict.json'))

    for iline, line in enumerate(open('../data/sepsis_onset_time.csv')):
        icustay_id, h = line.strip().split(',')
        adm = icu_adm_dict[int(icustay_id)]
        sepsis_label_dict[adm] = 1
        # onset time = ICU in-time plus the onset hour offset, in minutes
        time = icu_intime_dict[int(icustay_id)] + 60 * int(h)
        sepsis_time_dict[adm] = time


    # NOTE(review): this loop is disabled by the immediate break — the
    # sepsis3_cases.csv source is deliberately skipped; confirm before
    # removing or re-enabling it
    for iline, line in enumerate(open('../data/sepsis3_cases.csv')):
        break
        if iline:
            icustay_id,intime,outtime,length_of_stay,delta_score,sepsis_onset,sepsis_onset_day,sepsis_onset_hour = line.strip().split(',')
            adm = icu_adm_dict[int(icustay_id)]
            sepsis_label_dict[adm] = 1

            time = time_to_min(sepsis_onset)
            sepsis_time_dict[adm] = time

    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'), sepsis_label_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'), sepsis_time_dict)
def draw_pic():
    """Plot up to ~11 variation-trend series for each of the 143 features and
    save one PNG per feature under ../result/variation_trend/."""
    import numpy as np
    import matplotlib.pyplot as plt
    flc = np.load('../file/feature_label_count.npy')  # loaded but unused below
    fvt = py_op.myreadjson(os.path.join(args.file_dir, 'feature_variation_trend_dict.json'))

    for f in range(143):
        vt = fvt[str(f)]
        print vt
        for i, (t, v) in enumerate(vt):
            plt.plot(t,v)
            if i > 10:  # cap the number of series per figure
                break
        plt.savefig('../result/variation_trend/{:d}.png'.format(f))
        plt.clf()  # reset the figure before the next feature
# Beispiel #15
def test_all():
    """Score every prediction directory under ../data/test_clean, delete
    low-scoring ones, and persist the sorted score map to result.json."""
    test_clean = '../data/test_clean'
    try:
        pred_dict = py_op.myreadjson('../data/result/result.json')
    except:
        # no previous result file: start with an empty score map
        pred_dict = dict()
    for i, pred_clean in enumerate(os.listdir(test_clean)):
        target = os.path.join(test_clean, pred_clean)
        if pred_clean in pred_dict:
            # already scored: only re-apply the deletion threshold
            if pred_dict[pred_clean] < 0.85:
                os.system('rm -r {:s}'.format(target))
            continue
        score = measures.compute_pred_clean_psnr(pred_clean,'../data/AI/testB', '../data/result')
        if score < 0.88:
            os.system('rm -r {:s}'.format(target))
        pred_dict[pred_clean] = score
    # sort the map by score before writing it back
    pred_dict = py_op.mysorteddict(pred_dict, key=lambda s: pred_dict[s])
    py_op.mywritejson('../data/result/result.json', pred_dict)
# Beispiel #16
def stati_gray_stain(clean_dir,
                     stain_dir,
                     pred_dir,
                     stati_bad=True,
                     result_json='../../data/result/7_311.json',
                     use_max=False):
    '''
    Measure the proportion of stain (watermark) pixels within each gray-level
    region of the stained images.
    '''
    if stati_bad:
        result_dict = py_op.myreadjson(result_json)
        # keep the 10% of images with the lowest score
        result_list = sorted([(v, k) for k, v in result_dict.items()
                              ])[:int(0.1 * len(result_dict))]
        result_list = set([k.split('.')[0] for v, k in result_list])
    threshhold_list = [0, 80, 120, 256]
    diff_sum = np.zeros(len(threshhold_list))
    mask_sum = np.zeros(len(threshhold_list))
    for fi in tqdm(os.listdir(pred_dir)):
        if stati_bad:
            if fi.split('.')[0] not in result_list:
                continue
        pred_fi = os.path.join(pred_dir, fi)
        clean_fi = os.path.join(clean_dir, fi.replace('png', 'jpg'))
        stain_fi = os.path.join(stain_dir, fi.replace('.png', '_.jpg'))
        try:
            clean_image = np.array(Image.open(clean_fi).resize(
                (250, 250))).astype(np.float32)
            stain_image = np.array(Image.open(stain_fi).resize(
                (250, 250))).astype(np.float32)
            # bucket pixels by the stained image's gray level
            mask = get_mask(stain_image, threshhold_list, use_max)
        except:
            # log and skip missing/unreadable images
            traceback.print_exc()
            continue
        for n in range(len(threshhold_list)):
            mask_sum[n] += (mask == n).sum()
            # a pixel differing by more than 20 gray levels counts as stain
            diff_sum[n] += (np.abs(clean_image[mask == n] -
                                   stain_image[mask == n]) > 20).sum()
    print '灰度区域 \t 区域占比例 \t 网纹点占比例'
    for n in range(len(threshhold_list) - 1):
        threshhold_min = threshhold_list[n]
        threshhold_max = threshhold_list[n + 1]
        print '[{:d}, {:d}]   \t {:2.2f} \t\t {:2.2f}'.format(
            threshhold_min, threshhold_max, mask_sum[n] / sum(mask_sum),
            diff_sum[n] / sum(diff_sum))
# Beispiel #17
def scp_files(json_file):
    """Copy the 20 worst-scoring images (plus their stained/clean/mask
    variants) to the remote host `ycclab` for inspection."""
    result_dict = py_op.myreadjson(json_file)
    ranked = sorted([(v, k) for k, v in result_dict.items()])
    for score, jpg in ranked[:20]:
        png = jpg.replace('jpg', 'png')
        # down-scale the predicted clean image before shipping it
        image = Image.open('../../data/pred_clean/AI/testB/' + png)
        image = image.resize((250, 250))
        tmp_png = 'tmp.png'
        image.save(tmp_png)
        os.system('scp {:s} ycclab:tmp/bad/{:s}'.format(tmp_png, png))
        os.remove(tmp_png)
        # original clean image
        os.system('scp ../../data/AI/testB/{:s} ycclab:tmp/bad/'.format(jpg))
        # stained input, renamed with a .qs suffix on the remote side
        os.system('scp ../../data/AI/testA/{:s} ycclab:tmp/bad/{:s}'.format(
            jpg.replace('.jpg', '_.jpg'), jpg.replace('.jpg', '.qs.jpg')))
        # predicted mask, renamed with a .rm suffix on the remote side
        os.system('scp ../../data/pred_mask/AI/testB/{:s} ycclab:tmp/bad/{:s}'.format(
            png, png.replace('.png', '.rm.png')))
# Beispiel #18
def ensemble(level=2):
    """Ensemble the high-scoring prediction directories (score > 0.94) into
    the `ensemble_1` output directory, one image per worker task in a
    15-process pool."""
    test_clean = '../data/test_clean/'
    clean_dir = '../data/AI/testB/'
    rgb_prob = np.load('../data/rgb_stati/rgb_prob_{:d}.npy'.format(level))
    obj_dir = 'ensemble_1'
    pred_dict = py_op.myreadjson('../data/result/result.json')
    file_names = os.listdir(clean_dir)
    # only ensemble prediction dirs scoring above 0.94, excluding the
    # ensemble output dir itself
    # pred_dir_list = [os.path.join(test_clean,d) for d in os.listdir(test_clean) if obj_dir not in d and pred_dict.get(d,0)>0.94 and pred_dict.get(d,0)<0.95]
    pred_dir_list = [os.path.join(test_clean,d) for d in os.listdir(test_clean) if obj_dir not in d and pred_dict.get(d,0)>0.94]
    if len(pred_dir_list) == 0:
        return
    # print pred_dir_list
    # return

    pool = multiprocessing.Pool(processes=15)
    for fi,file_name in enumerate(os.listdir(clean_dir)):
        pool.apply_async(write_ensemble_image, (fi, file_name,pred_dir_list, os.path.join(test_clean, obj_dir), rgb_prob, level, pred_dict))
        # write_ensemble_image(fi, file_name,pred_dir_list, os.path.join(test_clean, obj_dir), rgb_prob, level, pred_dict)
    pool.close()
    pool.join()
    print 'processed all'
# Beispiel #19
def gen_sepsis_label_dict():
    """Derive a sepsis label and time for each hospital admission.

    An admission is labeled positive when, within [-48h, +24h] of its
    suspected-infection time, its SOFA score rises by >= 2 over the first
    score seen in that window.  Writes sepsis_time_dict.json and
    sepsis_label_dict.json into args.result_dir.
    """
    sepsis_label_dict = dict()

    sepsis_file = '../data/sepsis3.csv'
    print('reading sepsis3.csv')
    sepsis_data = pd.read_csv(sepsis_file)
    sepsis_infection_dict = dict()  # adm -> suspected infection time (minutes)
    sepsis_set = set()  # admissions passing the exclusion filter
    for iline in range(len(sepsis_data)):
        adm = sepsis_data.loc[iline, 'hadm_id']
        adm = str(adm)
        excluded = sepsis_data.loc[iline, 'excluded']
        suspected_infection_time_poe = sepsis_data.loc[
            iline, 'suspected_infection_time_poe']
        # short strings (e.g. 'nan') mean no infection time was recorded
        if len(str(suspected_infection_time_poe)) > 5:
            sepsis_infection_dict[adm] = time_to_min(
                suspected_infection_time_poe)
            # sepsis_set.add(adm)
            if excluded == 0:
                sepsis_set.add(adm)
        # if len(str(suspected_infection_time_poe)) > 5:
        # sepsis_infection_dict[adm] = time_to_min(suspected_infection_time_poe)
        # print(suspected_infection_time_poe)
        # if excluded == 0 and len(str(suspected_infection_time_poe)) > 0:
        #     sepsis_set.add(adm)
        #     return
    # print(len(sepsis_infection_dict))
    # print(len(sepsis_set))
    print('Infection No: {:d}'.format(len(sepsis_infection_dict)))
    print('Sepsis No: {:d}'.format(len(sepsis_set)))
    # py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'), patient_label_dict)
    # return

    icu_file = '../data/icustays.csv'
    print('reading icustays.csv')
    icu_data = pd.read_csv(icu_file)
    icu_adm_dict = dict()  # icustay_id -> hadm_id
    for iline in range(len(icu_data)):
        icu = icu_data.loc[iline, 'icustay_id']
        adm = icu_data.loc[iline, 'hadm_id']
        icu_adm_dict[icu] = adm

    sofa_file = '../data/sofa.csv'
    print('reading sofa.csv')
    sofa_data = pd.read_csv(sofa_file)

    print('mapping sofa to adm')
    adm_sofa_dict = dict()
    # NOTE(review): this loop is disabled by the immediate break; the mapping
    # is loaded from ../result/adm_sofa_dict.json below instead — confirm the
    # cached file is up to date before relying on it
    for iline in range(len(sofa_data)):
        break
        if iline and iline % 10000 == 0:
            print('mapping sofa to adm', iline, len(sofa_data))
        icu = sofa_data.loc[iline, 'icustay_id']
        sofa = sofa_data.loc[iline, 'sofa_24hours']
        starttime = sofa_data.loc[iline, 'starttime']
        endtime = sofa_data.loc[iline, 'endtime']
        adm = icu_adm_dict[icu]
        adm_sofa_dict[adm] = adm_sofa_dict.get(
            adm, []) + [[sofa, starttime, endtime]]
    # py_op.mywritejson('../result/adm_sofa_dict.json', adm_sofa_dict)
    # return
    adm_sofa_dict = py_op.myreadjson('../result/adm_sofa_dict.json')

    print('set sepsis label')
    pos_num = 0
    for iline, (adm, sofa_list) in enumerate(adm_sofa_dict.items()):
        # print(adm, type(adm))
        if iline and iline % 10000 == 0:
            print('set sepsis label', iline, len(adm_sofa_dict))
        # if adm not in sepsis_infection_dict:
        if adm in sepsis_infection_dict:
            # default: negative label, paired with the infection time
            sepsis_label_dict[adm] = [0, sepsis_infection_dict[adm]]
        else:
            continue
        if adm not in sepsis_set:
            continue

        # sofa_list = sofa_list

        # if time_to_min(sofa_list[0][1]) < sepsis_infection_dict[adm] :
        #     continue

        # print('have data')

        sofa_init = ''
        for sofa in sofa_list:
            starttime = sofa[1]
            endtime = sofa[2]
            time = time_to_min(endtime)
            sofa = int(sofa[0])
            # only SOFA windows within [-48h, +24h] of the infection time count
            if time - sepsis_infection_dict[
                    adm] >= -48 * 60 and time - sepsis_infection_dict[
                        adm] <= 24 * 60:
                if sofa_init == '':
                    # baseline: first SOFA score inside the window
                    sofa_init = sofa
                elif sofa - sofa_init >= 2 and sofa >= 2:
                    # SOFA rise >= 2: label positive and push the sepsis time
                    # forward to this window's end if it is later
                    sepsis_label_dict[adm] = [1, sepsis_infection_dict[adm]]
                    sepsis_infection_dict[adm] = max(
                        time, sepsis_infection_dict[adm])
                    pos_num += 1
                    break

    print('writing sepsis_label_dict')
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'),
                      sepsis_infection_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'),
                      {k: v[0]
                       for k, v in sepsis_label_dict.items()})

    print('There are {:d} positive samples.'.format(pos_num))
    print('There are {:d} negtive samples.'.format(
        len(sepsis_label_dict) - pos_num))
def gen_feature_order_dict():
    '''
    Generate, for each feature, a mapping from raw value to its normalized
    rank (midpoint of the value's index range in the sorted observation list,
    divided by the observation count), and write it to
    feature_value_order_dict.json.
    '''

    feature_value_order_dict = dict()

    # vital information
    vital_file = args.vital_file
    vital_dict = {}  # key-valuelist-dict
    for i_line, line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print i_line
        # if i_line > 10000:
        #     break
        if i_line == 0:
            # header row: commas inside double quotes belong to a column
            # name, so replace them with ';' to keep the split aligned
            new_line = ''
            vis = 0
            for c in line:
                if c == '"':
                    vis = (vis + 1) % 2
                if vis == 1 and c == ',':
                    c = ';'
                new_line += c
            line = new_line
            col_list = line.strip().split(',')[1:]
            for col in col_list:
                vital_dict[col] = []
        else:
            ctt_list = line.strip().split(',')[1:]
            assert len(ctt_list) == len(col_list)
            for col, ctt in zip(col_list, ctt_list):
                if len(ctt):  # skip empty cells
                    vital_dict[col].append(float(ctt))
        # if i_line > 10000:
        #    break
        # if i_line % 10000 == 0:
        #     print i_line

    # add group info: merge similar features into the group's min-index
    # representative and drop the merged columns
    groups = py_op.myreadjson(os.path.join(args.file_dir, 'similar.json'))
    feature_index_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_index_dict.json'))
    index_feature_list = py_op.myreadjson(
        os.path.join(args.file_dir, 'index_feature_list.json'))
    for g in groups:
        for k in g:
            mg = min(g)
            if k != mg:
                kf = index_feature_list[k]
                mf = index_feature_list[mg]
                vital_dict[mf] = vital_dict[mf] + vital_dict[kf]
                vital_dict.pop(kf)
    print 'features', len(vital_dict)

    # feature_count_dict = { k: len(v) for k,v in vital_dict.items() }
    # py_op.mywritejson(os.path.join(args.file_dir, 'feature_count_dict.json'), feature_count_dict)

    ms_list = []
    for col in col_list:
        if col not in vital_dict:  # column was merged into a group
            continue
        value_list = sorted(vital_dict[col])
        value_order_dict = dict()
        value_minorder_dict = dict()  # value -> first index in sorted list
        value_maxorder_dict = dict()  # value -> last index in sorted list
        for i_value, value in enumerate(value_list):
            if value not in value_minorder_dict:
                value_minorder_dict[value] = i_value
            if value == value_list[-1]:
                value_maxorder_dict[value] = len(value_list) - 1
                break
            if value != value_list[i_value + 1]:
                value_maxorder_dict[value] = i_value
        for value in value_maxorder_dict:
            # rank = midpoint of the value's index range, normalized by count
            value_order_dict[value] = (
                value_maxorder_dict[value] +
                value_minorder_dict[value]) / 2.0 / len(value_list)
        feature_value_order_dict[col] = value_order_dict
    py_op.mywritejson(
        os.path.join(args.file_dir, 'feature_value_order_dict.json'),
        feature_value_order_dict)
# Beispiel #21
def main():
    """Train (or resume training of) the LSTM model on patient time-series.

    Reads the preprocessed record/master/label dicts and the train/val patient
    splits from ``args.result_dir`` / ``args.file_dir``, builds the data
    loaders, constructs the LSTM + loss + Adam optimizer, optionally resumes
    from ``args.resume``, and runs train/val epochs via ``train_eval``.

    Relies on module-level names: ``args``, ``py_op``, ``json``, ``os``,
    ``dataloader``, ``DataLoader``, ``cudnn``, ``lstm``, ``loss``, ``torch``,
    ``function``, ``train_eval``.
    """
    p_dict = dict()  # All the parameters
    p_dict['args'] = args
    args.split_nn = args.split_num + args.split_nor * 3
    args.vocab_size = args.split_nn * 145 + 1
    # Fixed: was a Python-2 print statement, which is a syntax error under
    # Python 3 (the rest of this file already uses print()).
    print('vocab_size', args.vocab_size)

    ### load data
    print('read data ...')
    patient_time_record_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_time_record_dict.json'))
    patient_master_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_master_dict.json'))
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_label_dict.json'))

    patient_train = list(
        json.load(open(os.path.join(args.file_dir, args.task, 'train.json'))))
    patient_valid = list(
        json.load(open(os.path.join(args.file_dir, args.task, 'val.json'))))

    if len(patient_train) > len(patient_label_dict):
        # The released split references more patients than we have labels
        # for; fall back to an 80/20 split over the labeled patients.
        # Fixed: list() is required — dict_keys cannot be sliced on Python 3;
        # also removed a dead assignment from patient_time_record_dict.keys()
        # that was immediately overwritten.
        patients = list(patient_label_dict.keys())
        n = int(0.8 * len(patients))
        patient_train = patients[:n]
        patient_valid = patients[n:]

    print('data loading ...')
    train_dataset = dataloader.DataSet(patient_train,
                                       patient_time_record_dict,
                                       patient_label_dict,
                                       patient_master_dict,
                                       args=args,
                                       phase='train')
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_dataset = dataloader.DataSet(patient_valid,
                                     patient_time_record_dict,
                                     patient_label_dict,
                                     patient_master_dict,
                                     args=args,
                                     phase='val')
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True)

    p_dict['train_loader'] = train_loader
    p_dict['val_loader'] = val_loader

    cudnn.benchmark = True
    net = lstm.LSTM(args)
    if args.gpu:
        net = net.cuda()
        p_dict['loss'] = loss.Loss().cuda()
    else:
        p_dict['loss'] = loss.Loss()

    # All model parameters are optimized with a single Adam instance.
    optimizer = torch.optim.Adam(list(net.parameters()), lr=args.lr)
    p_dict['optimizer'] = optimizer
    p_dict['model'] = net

    p_dict['epoch'] = 0
    p_dict['best_metric'] = [0, 0]

    ### resume pretrained model
    if os.path.exists(args.resume):
        print('resume from model ' + args.resume)
        function.load_model(p_dict, args.resume)
        print('best_metric', p_dict['best_metric'])

    if args.phase == 'train':
        for epoch in range(p_dict['epoch'] + 1, args.epochs):
            p_dict['epoch'] = epoch
            # Constant learning rate: re-assert args.lr every epoch in case a
            # resumed optimizer state carried a different value.
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
            train_eval(p_dict, 'train')
            train_eval(p_dict, 'val')
Beispiel #22
0
def main():
    """Train or test the Attention model on the mortality or sepsis task.

    For ``args.task == 'mortality'`` the train/valid/test patient splits are
    loaded from disk when present, otherwise created as a 70/20/10 split of
    the labeled patients.  For ``'sepsis'`` the split comes from
    ``sepsis_split.json`` keyed by ``-args.last_time``.  Builds data loaders,
    the Attention network, loss and Adam optimizer, optionally resumes from
    ``args.resume``, then trains (logging the best AUC) or evaluates.

    Relies on module-level names: ``args``, ``py_op``, ``json``, ``os``,
    ``dataloader``, ``DataLoader``, ``cudnn``, ``attention``, ``loss``,
    ``torch``, ``function``, ``train_eval``.
    """
    p_dict = dict()  # All the parameters
    p_dict['args'] = args
    args.split_nn = 3 * 5
    args.vocab_size = args.split_nn * 145 + 2
    print('vocab_size', args.vocab_size)

    ### load data
    print('read data ...')
    if args.task == 'mortality':

        patient_time_record_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_time_record_dict.json'))
        patient_master_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_master_dict.json'))
        patient_label_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_label_dict.json'))

        if os.path.exists(os.path.join(args.result_dir, 'train.json')):
            patient_train = list(
                json.load(open(os.path.join(args.result_dir, 'train.json'))))
            patient_valid = list(
                json.load(open(os.path.join(args.result_dir, 'valid.json'))))
            patient_test = list(
                json.load(open(os.path.join(args.result_dir, 'test.json'))))
        else:
            patients = sorted(
                set(patient_label_dict.keys()) & set(patient_time_record_dict)
                & set(patient_master_dict))
            print(len(patient_master_dict), len(patient_label_dict),
                  len(patient_time_record_dict))
            print('There are {:d} patients.'.format(len(patients)))
            n_train = int(0.7 * len(patients))
            n_valid = int(0.2 * len(patients))
            patient_train = patients[:n_train]
            patient_valid = patients[n_train:n_train + n_valid]
            patient_test = patients[n_train + n_valid:]

        # Bug fix: `patients` is only bound in the else-branch above, so
        # `patient_master_dict[patients[0]]` raised NameError whenever a
        # precomputed split existed.  All master records have the same
        # length, so take any one of them.
        args.master_size = len(next(iter(patient_master_dict.values())))
    elif args.task == 'sepsis':
        patient_time_record_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_time_record_dict.json'))
        patient_master_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_master_dict.json'))
        patient_label_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_label_dict.json'))
        sepsis_split = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_split.json'))
        print(sepsis_split.keys())
        sepsis_split = sepsis_split[str(-args.last_time)]

        patient_train = sepsis_split['train']
        patient_valid = sepsis_split['valid']
        # Bug fix: the sepsis split has no test set, but `patient_test` is
        # used unconditionally below (NameError).  Reuse the validation
        # patients so the test loader can still be built.
        patient_test = patient_valid
        print('train: {:d}'.format(len(patient_train)))
        print('valid: {:d}'.format(len(patient_valid)))

    print('data loading ...')
    train_dataset = dataloader.DataSet(patient_train,
                                       patient_time_record_dict,
                                       patient_label_dict,
                                       patient_master_dict,
                                       args=args,
                                       phase='train')
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_dataset = dataloader.DataSet(patient_valid,
                                     patient_time_record_dict,
                                     patient_label_dict,
                                     patient_master_dict,
                                     args=args,
                                     phase='val')
    # Evaluation loaders do not need shuffling (was shuffle=True); metrics
    # are aggregated over the whole split and are order-independent.
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True)
    test_dataset = dataloader.DataSet(patient_test,
                                      patient_time_record_dict,
                                      patient_label_dict,
                                      patient_master_dict,
                                      args=args,
                                      phase='val')
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=8,
                             pin_memory=True)

    p_dict['train_loader'] = train_loader
    # In training we validate on the valid split; otherwise on the test split.
    if args.phase == 'train':
        p_dict['val_loader'] = val_loader
    else:
        p_dict['val_loader'] = test_loader

    cudnn.benchmark = True
    net = attention.Attention(args)
    if args.gpu:
        net = net.cuda()
        p_dict['loss'] = loss.Loss().cuda()
    else:
        p_dict['loss'] = loss.Loss()

    # All model parameters are optimized with a single Adam instance.
    optimizer = torch.optim.Adam(list(net.parameters()), lr=args.lr)
    p_dict['optimizer'] = optimizer
    p_dict['model'] = net

    p_dict['epoch'] = 0
    p_dict['best_metric'] = [0, 0]

    ### resume pretrained model
    if os.path.exists(args.resume):
        print('resume from model ' + args.resume)
        function.load_model(p_dict, args.resume)
        print('best_metric', p_dict['best_metric'])

    if args.phase == 'train':
        for epoch in range(p_dict['epoch'] + 1, args.epochs):
            p_dict['epoch'] = epoch
            # Constant learning rate: re-assert args.lr every epoch in case a
            # resumed optimizer state carried a different value.
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
            train_eval(p_dict, 'train')
            train_eval(p_dict, 'val')
        log_info = '# task : {:s}; model: {:s} ; last_time: {:d} ; auc: {:3.4f} \n'.format(
            args.task, args.model, args.last_time, p_dict['best_metric'][0])
        with open('../result/log.txt', 'a') as f:
            f.write(log_info)
    else:
        train_eval(p_dict, 'test')