# Shared imports for the preprocessing snippets below. NLP_Utility, Path (the
# data-repository root), num_pred_diag, set_p_features and the get_*_event
# loaders are assumed to come from the surrounding project.
import datetime
import math
import os
from collections import defaultdict
from os import path

import numpy as np
import pandas as pd

from utility.csv_utility import CsvUtility


def filter_all_event():
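    """Attach a 3-digit ICD-9 prefix column to the merged event table.

    Reads allevents.pkl, adds an (initially empty) 'icd9_3' column and writes
    the result to all_events_icd9.pkl. The loop that actually fills the column
    and drops diagnoses missing from the merged dictionary is commented out
    below.
    """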
    all_events_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    all_events_df['icd9_3'] = ''
    print all_events_df[:5]
    print all_events_df.shape
    # diagnoses_events = all_events_df[all_events_df['event_type'] == 'diagnosis']
    # print diagnoses_events[:5]
    # print diagnoses_events.shape
    # diagnoses_set = set(list(pd.read_csv('../data-repository/merge_diagnoses_dict.csv', header=None).index))
    # print len(diagnoses_set)
    # i=0
    # for index_iter in diagnoses_events.index:
    #     icd_code = diagnoses_events.ix[index_iter, 'event']
    #     assert len(icd_code) >= 3
    #     if len(icd_code) >= 3:
    #         if icd_code[:3] in diagnoses_set:
    #             all_events_df.ix[index_iter, 'icd9_3'] = all_events_df.ix[index_iter, 'event'][:3]
    #         else:
    #             all_events_df.drop(index_iter, axis=0, inplace=True)
    #     sys.stdout.write('\rROW {0} of {1}...'.format(i, diagnoses_events.shape[0]))
    #     i += 1
    # all_events_df.index = np.array(range(all_events_df.shape[0]))
    print all_events_df[:5]
    print all_events_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/all_events_icd9.pkl'), all_events_df,
        'w')
def get_events_together():
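    """Merge diagnosis, lab and medication events into a single table with the
    unified columns (hadm_id, subject_id, charttime, event_type, event) and
    pickle it to data-repository/allevents.pkl."""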
    columns_name = [
        'hadm_id', 'subject_id', 'charttime', 'event_type', 'event'
    ]

    diag_columns = [
        'HADM_ID', 'SUBJECT_ID', 'DISCHTIME', 'DIAGNOSIS', 'ICD9_CODE'
    ]
    diag_events = get_all_diagnoses_event()
    diag_events = diag_events.loc[:, diag_columns]
    diag_events.columns = columns_name
    print diag_events[:5]

    lab_events = get_lab_event()
    lab_columns = ['HADM_ID', 'SUBJECT_ID', 'CHARTTIME', 'FLAG', 'ITEMID']
    lab_events = lab_events.loc[:, lab_columns]
    lab_events.columns = columns_name
    print lab_events[:5]

    medic_events = get_medication_event()
    medic_columns = [
        'HADM_ID', 'SUBJECT_ID', 'STARTDATE', 'DRUG_TYPE', 'FORMULARY_DRUG_CD'
    ]
    medic_events = medic_events.loc[:, medic_columns]
    medic_events.columns = columns_name
    print medic_events[:5]

    all_events = pd.concat([diag_events, lab_events, medic_events],
                           ignore_index=True)
    print all_events[:5]
    print all_events[-5:]
    print all_events.shape

    CsvUtility.write2pickle(path.join(Path, 'data-repository/allevents.pkl'),
                            all_events, 'w')
def get_drug_over(file_name, over_num):
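    """Count, for each formulary drug, the number of distinct admissions it
    appears in; keep drugs with a count above `over_num` and pickle the
    result."""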
    drug_df = pd.read_csv(os.path.join(Path, file_name),
                          dtype=str)[['HADM_ID', 'FORMULARY_DRUG_CD']]
    print drug_df[:5]
    print drug_df.shape
    drug_df.drop_duplicates(inplace=True)
    print drug_df.shape
    drug_count = drug_df['FORMULARY_DRUG_CD'].value_counts()
    drug_df = pd.DataFrame(drug_count[drug_count > over_num])
    drug_df.columns = ['COUNT']
    drug_df.index.name = 'FORMULARY_DRUG_CD'
    print drug_df[:5]
    print 'size:', drug_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/prescription_drug_over.pkl'), drug_df,
        'w')
def get_lab_item_over(file_name, over_num):
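    """Count, for each lab item, the distinct admissions in which it was
    flagged 'abnormal'; keep items with a count above `over_num` and pickle
    the result."""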
    labevent_df = pd.read_csv(os.path.join(Path, file_name),
                              dtype=str)[['HADM_ID', 'ITEMID', 'FLAG']]
    print labevent_df[:5]
    print labevent_df.shape
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    print labevent_df.shape
    labevent_df.drop_duplicates(inplace=True)
    print labevent_df.shape
    item_count = labevent_df['ITEMID'].value_counts()
    item_df = pd.DataFrame(item_count[item_count > over_num])
    item_df.columns = ['COUNT']
    item_df.index.name = 'ITEMID'
    print item_df[:5]
    print 'size:', item_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/lab_item_over.pkl'), item_df, 'w')
def subject_admission_over(filename, over_num):
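    """Count admissions per subject; keep subjects with more than `over_num`
    admissions and pickle the result."""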
    admission_df = pd.read_csv(os.path.join(Path, filename), dtype=str)
    print admission_df[:5]
    print admission_df.shape
    admission_df.drop_duplicates(inplace=True)
    print admission_df.shape
    sub_vc = admission_df['SUBJECT_ID'].value_counts()
    sub_df = pd.DataFrame(sub_vc[sub_vc > over_num])
    sub_df.columns = ['COUNT']
    sub_df.index.name = 'SUBJECT_ID'
    print sub_df[:5]
    print 'size: ', sub_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), sub_df,
        'w')
def icd_procedures_over(filename, over_num):
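    """Count, for each procedure ICD-9 code, the distinct admissions it occurs
    in; keep codes with a count above `over_num` and pickle the result."""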
    procedures_df = pd.read_csv(os.path.join(Path, filename),
                                dtype=str)[['HADM_ID', 'ICD9_CODE']]
    print procedures_df[:5]
    print procedures_df.shape
    procedures_df.drop_duplicates(inplace=True)
    print procedures_df.shape
    procedure_count = procedures_df['ICD9_CODE'].value_counts()
    print procedure_count
    procedure_df = pd.DataFrame(procedure_count[procedure_count > over_num])
    procedure_df.columns = ['COUNT']
    procedure_df.index.name = 'ICD9_CODE'
    print procedure_df[:5]
    print 'size:', procedure_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/icd_procedures_over.pkl'),
        procedure_df, 'w')
def get_sequence():
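    """Build one event sequence per patient from allevents.pkl.

    Events are sorted by subject, admission and time; a sequence is cut when
    the subject changes or more than 365 days pass between events, and it is
    kept only if it contains diagnoses and more than four earlier events. Each
    kept entry stores the patient's static features, the event history, the
    events of the last admission, the per-day event counts and the diagnoses
    to predict (later restricted to the `num_pred_diag` most frequent ones).
    """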
    print 'reading.....'
    all_events = CsvUtility.read_pickle('../data-repository/allevents.pkl',
                                        'r')
    print all_events.shape
    all_events.dropna(axis=0,
                      how='any',
                      subset=['subject_id', 'charttime', 'event', 'hadm_id'],
                      inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.loc[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'icd9_3', 'hadm_id'
    ]]
    print all_events.dtypes
    all_events = all_events.astype({'hadm_id': 'int64'})
    print all_events.dtypes
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'hadm_id', 'charttime', 'event_type', 'event'],
        inplace=True)
    print all_events[:10]
    rows = np.array(all_events)

    prev_time = None
    prev_subject = None
    prev_hadm_id = None
    # temp diagnoses in each admission
    diags = set()
    # temp event sequence in each admission
    temp_event_seq = []
    event_seq = []
    # event sequence for each person
    all_seq = []
    # map the time to the events in all_seq
    all_days = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # static patient features, assumed to be indexed by hadm_id
    # (used as p_features[prev_hadm_id] below)
    p_features = set_p_features()
    # count the length of sequence
    seq_len = 0
    seq_max = 0
    seq_min = 100000
    # debug: print the type of each field in the first row
    for field in rows[0]:
        print type(field)
    for i, row in enumerate(rows):
        # print i, row
        if row[2] == "diagnosis":
            event = row[2][:1] + "_" + str(row[4])
            if not row[2].startswith("E"):
                diag_count[event] += 1
        else:
            event = row[2][:1] + "_" + str(row[3])

        if row[0] is None or row[1] is None or row[5] is None:
            print 'delete None:', row
            continue
        elif type(row[1]) != str and math.isnan(row[1]):
            print 'delete nan:', row
            continue

        elif prev_time is None or prev_subject is None:
            print 'first event'
            pass

        elif (row[0] != prev_subject) or (NLP_Utility.strtime2datetime(
                row[1]) > prev_time + datetime.timedelta(365)):
            print 'change sequence', row, ' pre: ', prev_subject, prev_time
            if len(diags) > 0 and len(event_seq) > 4:
                # pre, suf = calculate_window(event_seq + temp_event_seq, all_days)
                # all_seq.append([p_features, event_seq, temp_event_seq, diags, pre, suf])
                temp_event_seq = [x for x in temp_event_seq if x not in diags]
                for item in event_seq:
                    unique_events.add(item)
                for item in temp_event_seq:
                    unique_events.add(item)
                all_days.append(len(temp_event_seq))
                all_seq.append([
                    p_features[prev_hadm_id], event_seq, temp_event_seq,
                    all_days, diags
                ])
                print '!!!__!!!', prev_subject
                print len(event_seq) + len(temp_event_seq), len(all_days), sum(
                    all_days)
                seq_len += len(all_days)
                seq_max = max(seq_max, len(all_days))
                seq_min = min(seq_min, len(all_days))
            diags = set()
            event_seq = []
            temp_event_seq = []
            all_days = []
        elif prev_hadm_id != row[5]:
            print 'change temp sequence:', row, ' prev: ', prev_hadm_id
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []
            diags = set()
        elif NLP_Utility.strtime2datetime(row[1]) != prev_time:
            # print 'just change time: ', prev_time, rows[1]
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []

        # print 'adding ....'
        temp_event_seq.append(event)

        prev_time = NLP_Utility.strtime2datetime(row[1])
        prev_subject = row[0]
        prev_hadm_id = row[5]

        if row[2] == "diagnosis":
            diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]

    # uniq = open('../data-repository/vocab', 'w')
    # uniq.write(' '.join(unique_events) + '\n')
    # uniq.write(' '.join(predicted_diags))
    # uniq.close()
    print len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        if len(fil_diag) > 0:
            after_del_sequence.append(instance)
            after_del_sequence[-1][-1] = fil_diag
    print 'num of seq: ', len(after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'mean of seq: ', float(seq_len) / len(after_del_sequence)
    CsvUtility.write2pickle('../data-repository/after_sequence.pickle',
                            after_del_sequence, 'w')
    CsvUtility.write2pickle('../data-repository/event_dict.pickle',
                            unique_events, 'w')

    print '************************************************************'

#######################################################################################################


def get_diag_sequence():
    pass

from baseline_method.multi_logistic_model import MultiLogistic
from load_data import load_corpus, reload_corpus
from utility.csv_utility import CsvUtility
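
# Driver for the multi-label logistic-regression baseline: reload the prepared
# corpus, train, evaluate, and pickle the per-label results (re_auc[-1] is
# printed separately, presumably an aggregate AUC).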

if __name__ == '__main__':
    print 'loading data...'
    train_x, train_y, test_x, test_y, idx = reload_corpus()
    print 'loading ready...'
    multi_logs = MultiLogistic(len(train_y[0]))
    print 'training...'
    multi_logs.training(training_x=train_x, training_y=train_y)
    print 'testing...'
    re_auc, re_list = multi_logs.testing(testing_x=test_x, testing_y=test_y)
    print re_auc[:-1]
    print re_auc[-1]
    CsvUtility.write2pickle('../data-repository/model_multilogisticCV.pkl',
                            [idx, re_auc, re_list], 'w')
def get_instance(time_before_diag=90):
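    """Build one training instance per (patient, diagnosis time).

    For every time step that carries at least one diagnosis, the instance
    pairs the events of the preceding `time_before_diag` days with the
    diagnoses recorded at that step; labels are afterwards restricted to the
    `num_pred_diag` most frequent diagnoses. Results are pickled to
    after_instance.pkl, event_instance_dict.pkl and predict_diags_dict.pkl.
    """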
    print 'reading.....'
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    print all_events.shape
    all_events.dropna(axis=0, how='any', inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.loc[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'hadm_id'
    ]]
    print all_events.dtypes
    # all_events = all_events.astype({'hadm_id': 'int64'})
    # print all_events.dtypes
    all_events['subject_id'] = all_events['subject_id'].astype('int64')
    # debug: print the type of each field in the first remaining row
    # (.iloc is positional, so this works even if the label 0 was dropped)
    for rr in all_events.iloc[0, :]:
        print type(rr)
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'charttime', 'event_type', 'event'], inplace=True)
    print all_events[:10]
    rows = np.array(all_events, dtype=str)

    prev_time = None
    prev_subject = None
    # temp diagnoses in each time
    tem_diags = set()
    # temp event sequence in each time
    temp_event_seq = []
    # event sequence for each person
    event_seq = []
    # map the time for each person
    event_days = []
    # first time for each person
    base_time = None
    # all instance
    all_seq = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # count the length of instance
    seq_max = 0
    seq_min = 100000
    # debug: print the type of each field in the first row
    for field in rows[0]:
        print type(field)
    for i, row in enumerate(rows):
        # print i, row
        # if row[2] == "diagnosis":
        #     event = row[2][:1] + "_" + str(row[4])
        # else:
        #     event = row[2][:1] + "_" + str(row[3])
        event = row[2][:1] + "_" + str(row[3])

        # if type(row[1]) != str and math.isnan(row[1]):
        #     print 'delete nan:', row
        #     continue
        if prev_time is None or prev_subject is None:
            print 'first event'
            base_time = NLP_Utility.strtime2datetime(row[1])
        elif row[0] != prev_subject or NLP_Utility.strtime2datetime(
                row[1]) != prev_time:
            if len(tem_diags) > 0:
                # why exclude the diagnoses?
                # temp_event_seq = [x for x in temp_event_seq if x not in tem_diags]
                this_days = (prev_time - base_time).days
                find_days = max(0, this_days - time_before_diag)
                start_position = get_first_index(event_days, find_days)
                t_event_seq = []
                # for i_pos in range(start_position, len(event_days)):
                #     t_event_seq.append(event_seq[i_pos])
                # unique_events.add(event_seq[i_pos])
                t_event_seq += event_seq[start_position:]
                # print len(event_seq[start_position:])
                # for test_event in event_seq[start_position:]:
                #     if test_event.startswith("p_"):
                #         print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
                # for item in temp_event_seq:
                #     # t_event_seq.append(item)
                #     unique_events.add(item)
                all_seq.append([t_event_seq, list(tem_diags)])
                for iter_diag in tem_diags:
                    diag_count[iter_diag] = diag_count[iter_diag] + 1
                seq_max = max(seq_max, len(t_event_seq))
                seq_min = min(seq_min, len(t_event_seq))
            if row[0] != prev_subject:
                # print 'change patient ', row, ' pre: ', prev_subject, row[0]
                event_seq = []
                event_days = []
                base_time = NLP_Utility.strtime2datetime(row[1])
            else:
                # print 'change time ', row, ' pre: ', prev_time, row[1]
                event_seq += temp_event_seq
                # print prev_time
                # print base_time
                # print type((prev_time - base_time).days)
                event_days += [(prev_time - base_time).days
                               ] * len(temp_event_seq)
            tem_diags = set()
            temp_event_seq = []
        # print 'adding ....'
        temp_event_seq.append(event)
        prev_time = NLP_Utility.strtime2datetime(row[1])
        prev_subject = row[0]
        if row[2] == "diagnosis":
            tem_diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]
    print 'num of seq: ', len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        # if len(fil_diag) > 0:
        for item in instance[0]:
            unique_events.add(item)
        after_del_sequence.append(instance)
        after_del_sequence[-1][-1] = fil_diag
        for diag in fil_diag:
            unique_events.add(diag)
    print 'after limit the predict diagnoses, num of seq: ', len(
        after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'number of unique items:', len(unique_events)
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/after_instance.pkl'),
        after_del_sequence, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/event_instance_dict.pkl'),
        unique_events, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/predict_diags_dict.pkl'),
        predicted_diags, 'w')
    print '************************************************************'
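

# `get_first_index` is called in get_instance above but not defined in this
# snippet. A minimal sketch of the assumed behaviour: event_days is a
# non-decreasing list of day offsets, and we need the index of the first entry
# >= find_days so that event_seq[start_position:] covers the look-back window.
# A left bisect does exactly that; the project's real helper may differ.
import bisect


def get_first_index(event_days, find_days):
    # first position whose day offset is >= find_days
    # (returns len(event_days) when every entry is smaller)
    return bisect.bisect_left(event_days, find_days)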