Example #1
def main():
    dataset = data_loader.DataBowl(args, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data_loader.DataBowl(args, phase='valid')
    valid_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True)
    args.vocab = dataset.vocab
    args.relation = dataset.relation

    # net, loss = model.Net(args), model.Loss()
    net, loss = model.FCModel(args), model.Loss()

    net = _cuda(net, 0)
    loss = _cuda(loss, 0)

    parameters_all = list(net.parameters())

    optimizer = torch.optim.Adam(parameters_all, args.lr)

    best_auc = [0, 0, 0, 0, 0, 0]

    cui_con_dict = {}
    if args.phase == 'train':
        for epoch in range(args.epochs):
            train(train_loader, net, loss, epoch, optimizer, best_auc)
            best_auc, cui_con_dict = test(valid_loader, net, loss, epoch,
                                          best_auc, 'valid', cui_con_dict)
            print(args.words)

        cons_dir = '../result/cons/{:s}/{:d}'.format(
            args.model, args.predict_day)
        py_op.mkdir(cons_dir)
        num = len(os.listdir(cons_dir))
        py_op.mywritejson(os.path.join(cons_dir, '{:d}.json'.format(num)),
                          cui_con_dict)

        print('best auc', best_auc)
        auc = best_auc[0]
        with open('../result/log.txt', 'a') as f:
            f.write('#model {:s} #auc {:3.4f}\n'.format(args.model, auc))

    elif args.phase == 'test':
        net.load_state_dict(torch.load(args.resume))
        test(valid_loader, net, loss, 0, best_auc, 'valid', cui_con_dict)
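The `_cuda` helper is not defined in this snippet; a minimal sketch, assuming it only places a module (or loss) on a GPU when one is available:

import torch

def _cuda(module, device_id=0):
    # Hypothetical helper: move the module to the given GPU if CUDA is
    # available, otherwise keep it on the CPU.
    if torch.cuda.is_available():
        return module.cuda(device_id)
    return module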
Example #2
def generate_feature_mm_dict():
    files = sorted(
        glob(os.path.join(args.data_dir, args.dataset, 'train_groundtruth/*')))
    feature_value_dict = dict()
    for ifi, fi in enumerate(tqdm(files)):
        if 'csv' not in fi:
            continue
        for iline, line in enumerate(open(fi)):
            line = line.strip()
            if iline == 0:
                feat_list = line.split(',')
            else:
                data = line.split(',')
                for iv, v in enumerate(data):
                    if v in ['NA', '']:
                        continue
                    else:
                        feat = feat_list[iv]
                        if feat not in feature_value_dict:
                            feature_value_dict[feat] = []
                        feature_value_dict[feat].append(float(v))
    feature_mm_dict = dict()
    feature_ms_dict = dict()

    feature_range_dict = dict()
    for feat, vs in feature_value_dict.items():
        vs = sorted(vs)
        value_split = []
        for i in range(args.split_num):
            n = int(i * len(vs) / args.split_num)
            value_split.append(vs[n])
        value_split.append(vs[-1])
        feature_range_dict[feat] = value_split

        n = int(len(vs) / args.split_num)
        feature_mm_dict[feat] = [vs[n], vs[-n - 1]]
        feature_ms_dict[feat] = [np.mean(vs), np.std(vs)]

    py_op.mkdir(args.file_dir)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_mm_dict.json'),
        feature_mm_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'),
        feature_ms_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_list.json'),
        feat_list)
    py_op.mywritejson(
        os.path.join(
            args.file_dir, args.dataset +
            '_feature_value_dict_{:d}.json'.format(args.split_num)),
        feature_range_dict)
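The `py_op` utility module is imported elsewhere; plausible minimal implementations of the three helpers assumed above:

import json
import os

def mkdir(path):
    # Create the directory (and any parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)

def mywritejson(path, obj):
    # Dump obj to path as JSON.
    with open(path, 'w') as f:
        json.dump(obj, f, indent=4)

def myreadjson(path):
    # Load and return the JSON content stored at path.
    with open(path) as f:
        return json.load(f)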
Example #3
def generate_ehr_files():

    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_dict.json'))
    hadm_demo_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_demo_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_icd_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_icd_dict.json'))
    hadm_time_drug_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_drug_dict.json'))
    groundtruth_dir = os.path.join(args.data_dir, 'train_groundtruth')
    py_op.mkdir(groundtruth_dir)
    ehr_count_dict = dict()

    for hadm_id in hadm_sid_dict:

        time_drug_dict = hadm_time_drug_dict.get(hadm_id, {})
        icd_list = hadm_icd_dict.get(hadm_id, [])
        demo = hadm_demo_dict[hadm_id]
        demo[0] = demo[0] + '1'
        demo[1] = 'A' + str(int(demo[1] / 9))  # bucket age into 9-year bins
        icd_demo = icd_list + demo

        for icd in icd_demo:
            ehr_count_dict[icd] = ehr_count_dict.get(icd, 0) + 1

        ehr_dict = {'drug': {}, 'icd_demo': icd_demo}

        for setime, drug_list in time_drug_dict.items():
            try:
                stime, etime = setime.split(' -- ')
                start_second = time_to_second(hadm_time_dict[hadm_id])
                # convert to integer hours relative to admission time
                stime = str(int((time_to_second(stime) - start_second) / 3600))
                etime = str(int((time_to_second(etime) - start_second) / 3600))
                setime = stime + ' -- ' + etime
                for drug in drug_list:
                    ehr_count_dict[drug] = ehr_count_dict.get(drug, 0) + 1
                ehr_dict['drug'][setime] = list(set(drug_list))
            except Exception:
                # skip entries with missing or unparsable time ranges
                continue

        py_op.mywritejson(os.path.join(groundtruth_dir, hadm_id + '.json'),
                          ehr_dict)
        # break
    py_op.mywritejson(os.path.join(args.data_dir, 'ehr_count_dict.json'),
                      ehr_count_dict)
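`time_to_second` is also defined outside this snippet; a sketch assuming MIMIC-style 'YYYY-MM-DD HH:MM:SS' timestamps:

import time

def time_to_second(t):
    # Parse a 'YYYY-MM-DD HH:MM:SS' timestamp into seconds since the epoch.
    return time.mktime(time.strptime(t, '%Y-%m-%d %H:%M:%S'))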
Example #4
def sort_pivoted_data():
    sort_dir = os.path.join(args.data_dir, args.dataset, 'sort_pivoted')
    os.system('rm -r ' + sort_dir)
    os.system('mkdir ' + sort_dir)
    merge_dir = os.path.join(args.data_dir, args.dataset, 'merge_pivoted')

    for i_fi, fi in enumerate(tqdm(os.listdir(merge_dir))):
        wf = open(os.path.join(sort_dir, fi), 'w')
        time_line_dict = dict()
        for i_line, line in enumerate(open(os.path.join(merge_dir, fi))):
            if i_line:
                line_data = line.strip().split(',')
                delta = 3  # group rows into 3-hour windows
                ctime = delta * (int(line_data[1]) // delta)
                if ctime not in time_line_dict:
                    time_line_dict[ctime] = []
                time_line_dict[ctime].append(line_data)
            else:
                line_data = line.split(',')[1:]
                line_data[0] = 'time'
                wf.write(','.join(line_data))
        for t in sorted(time_line_dict):
            line_list = time_line_dict[t]
            new_line = line_list[0]
            for line_data in line_list[1:]:
                for iv, v in enumerate(line_data):
                    if len(v.strip()):
                        new_line[iv] = v
            new_line = ','.join(new_line[1:]) + '\n'
            wf.write(new_line)
        wf.close()
    py_op.mkdir('../../data/MIMIC/train_groundtruth')
    py_op.mkdir('../../data/MIMIC/train_with_missing')
    os.system('rm ../../data/MIMIC/train_groundtruth/*.csv')
    os.system(
        'cp ../../data/MIMIC/sort_pivoted/* ../../data/MIMIC/train_groundtruth/'
    )
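The shell calls above (`rm`, `mkdir`, `cp`) assume a POSIX environment; a portable equivalent using only the standard library would be:

import os
import shutil

def reset_dir(path):
    # Remove the directory if it exists, then recreate it empty.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path)

def copy_all(src_dir, dst_dir):
    # Copy every file in src_dir into dst_dir.
    for name in os.listdir(src_dir):
        shutil.copy(os.path.join(src_dir, name), dst_dir)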
Example #5
def merge_pivoted_data(csv_list):
    name_list = ['hadm_id', 'charttime']
    for k, v in variable_map_dict.items():
        if k not in ['age', 'gender']:
            if len(v):
                name_list.append(v)
            elif k in item_id_dict:
                name_list.append(k)
    name_index_dict = {name: i for i, name in enumerate(name_list)}

    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, args.dataset, 'hadm_time_dict.json'))
    icu_hadm_dict = py_op.myreadjson(
        os.path.join(args.data_dir, args.dataset, 'icu_hadm_dict.json'))
    merge_dir = os.path.join(args.data_dir, args.dataset, 'merge_pivoted')
    os.system('rm -r ' + merge_dir)
    os.system('mkdir ' + merge_dir)
    pivoted_dir = os.path.join(args.result_dir, 'mimic/pivoted_sofa')
    py_op.mkdir(pivoted_dir)

    for fi in csv_list:
        print(fi)
        for i_line, line in enumerate(open(os.path.join(args.mimic_dir, fi))):
            if i_line:
                line_data = line.strip().split(',')
                if len(line_data) <= 0:
                    continue
                line_dict = dict()
                for iv, v in enumerate(line_data):
                    if len(v.strip()):
                        name = head[iv]
                        line_dict[name] = v

                if fi == 'pivoted_sofa.csv':
                    icu_id = line_dict.get('icustay_id', 'xxx')
                    if icu_id not in icu_hadm_dict:
                        continue
                    hadm_id = str(icu_hadm_dict[icu_id])
                    line_dict['hadm_id'] = hadm_id
                    line_dict['charttime'] = line_dict['starttime']

                hadm_id = line_dict.get('hadm_id', 'xxx')
                if hadm_id not in hadm_time_dict:
                    continue
                hadm_time = time_to_second(hadm_time_dict[hadm_id])
                now_time = time_to_second(line_dict['charttime'])
                delta_hour = int((now_time - hadm_time) / 3600)
                line_dict['charttime'] = str(delta_hour)

                if fi == 'pivoted_sofa.csv':
                    sofa_file = os.path.join(pivoted_dir, hadm_id + '.csv')
                    if not os.path.exists(sofa_file):
                        with open(sofa_file, 'w') as f:
                            f.write(sofa_head)
                    wf = open(sofa_file, 'a')
                    sofa_line = [str(delta_hour)] + line.split(',')[4:]
                    wf.write(','.join(sofa_line))
                    wf.close()

                assert 'hadm_id' in line_dict
                assert 'charttime' in line_dict
                new_line = []
                for name in name_list:
                    new_line.append(line_dict.get(name, ''))
                new_line = ','.join(new_line) + '\n'
                hadm_file = os.path.join(merge_dir, hadm_id + '.csv')
                if not os.path.exists(hadm_file):
                    with open(hadm_file, 'w') as f:
                        f.write(','.join(name_list) + '\n')
                wf = open(hadm_file, 'a')
                wf.write(new_line)
                wf.close()

            else:
                if fi == 'pivoted_sofa.csv':
                    sofa_head = ','.join(['time'] +
                                         line.replace('"', '').split(',')[4:])
                # "icustay_id","hr","starttime","endtime","pao2fio2ratio_novent","pao2fio2ratio_vent","rate_epinephrine","rate_norepinephrine","rate_dopamine","rate_dobutamine","meanbp_min","gcs_min","urineoutput","bilirubin_max","creatinine_max","platelet_min","respiration","coagulation","liver","cardiovascular","cns","renal","respiration_24hours","coagulation_24hours","liver_24hours","cardiovascular_24hours","cns_24hours","renal_24hours","sofa_24hours"

                head = line.replace('"', '').strip().split(',')
                head = [h.strip() for h in head]
                # print(line)
                for h in head:
                    if h not in name_index_dict:
                        print(h)
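A typical invocation, assuming the pivoted CSV exports live in args.mimic_dir (the first two filenames are illustrative; only pivoted_sofa.csv appears in the code above):

csv_list = ['pivoted_vital.csv', 'pivoted_lab.csv', 'pivoted_sofa.csv']
merge_pivoted_data(csv_list)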
Example #6
def compute_dist_mat():
    files = glob(
        os.path.join(args.result_dir, args.dataset,
                     'imputation_result/*.csv'))  # [:100]
    feature_ms_dict = py_op.myreadjson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'))
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    py_op.mkdir(subtyping_dir)
    hadm_id_list = []
    mean_variables = []
    hadm_variable_dict = {}
    all_values = []

    for i_fi, fi in enumerate(tqdm(files)):
        hadm_id = fi.split('/')[-1].split('.')[0]
        hadm_data = []
        for i_line, line in enumerate(open(fi)):
            if i_line:
                line_data = line.strip().split(',')
                line_data = np.array([float(x) for x in line_data])
                if len(line_data) != n_variables + 1:
                    print(i_fi, fi)
                if line_data[0] < 0:
                    continue
                elif line_data[0] < 24:
                    hadm_data.append(line_data)
                else:
                    break
            else:
                head = line.strip().split(',')[1:]
                assert len(head) == n_variables

        values = np.array(hadm_data, dtype=np.float32)
        values = values[-24:]
        times = values[:, 0]
        values = values[:, 1:]

        assert len(values.shape) == 2
        assert values.shape[1] == n_variables

        hadm_variable_dict[hadm_id] = values
        hadm_id_list.append(hadm_id)
        all_values.append(values)

    all_values = np.concatenate(all_values, 0)
    ms = [all_values.mean(0), all_values.std(0)]

    hadm_dist_matrix = np.zeros((len(hadm_id_list), len(hadm_id_list))) - 1
    for i in tqdm(range(len(hadm_id_list))):
        hadm_dist_matrix[i, i] = 0
        for j in range(i + 1, len(hadm_id_list)):
            if hadm_dist_matrix[i, j] >= 0 or i == j:
                continue
            s1 = hadm_variable_dict[hadm_id_list[i]]
            s2 = hadm_variable_dict[hadm_id_list[j]]
            s1 = norm(s1, ms)
            s2 = norm(s2, ms)
            dist_mat = dist_func(s1, s2)
            # initialize the DTW path table; `inf` is assumed to be a large
            # constant (or np.inf) defined alongside compute_dtw()
            path = np.zeros([dist_mat.shape[0], dist_mat.shape[1], 3]) - inf - 1
            compute_dtw(dist_mat, path, hadm_dist_matrix, i, j)

    py_op.mywritejson(os.path.join(subtyping_dir, 'hadm_id_list.json'),
                      hadm_id_list)
    np.save(os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'),
            hadm_dist_matrix)
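`norm`, `dist_func`, and `compute_dtw` are defined elsewhere; minimal sketches of the first two, assuming z-score normalization and a pairwise Euclidean cost matrix for the DTW step:

import numpy as np

def norm(s, ms):
    # Z-score normalize a (time, variable) matrix using the global
    # per-variable mean and std computed above.
    mean, std = ms
    return (s - mean) / (std + 1e-8)

def dist_func(s1, s2):
    # Pairwise Euclidean distance between the rows of s1 and s2; this is
    # the local cost matrix consumed by compute_dtw().
    diff = s1[:, None, :] - s2[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))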