def clean_dgfa():
    """Load ``CASEDDGFA.csv``, drop unused columns and blank invalid flags.

    Returns:
        The loaded DataFrame without the dropped columns. In every flag
        column listed in ``flag_columns`` the rows selected by
        ``out_of_range(column, ['0', '1'])`` are overwritten with NaN.
    """
    unused_columns = [
        'HDMT_ID', 'PCVAMT_ID', 'POMT_ID', 'UA_ID', 'UAMT_ID', 'URMT_ID',
        'SMC_NM', 'SMY_NM', 'SMCP_ID', 'PTIAMT_ID', 'HCY_NM', 'HCMT_ID',
        'HTY_NM', 'HTMT_ID', 'DMY_NM', 'DMMT_ID', 'PADMT_ID', 'CA_TX', 'OT_ID',
        'OT_TX', 'THISHC_ID', 'THISHY_ID', 'THISDI_ID', 'IGUID_FT'
    ]
    # Columns validated against the two accepted codes, in processing order.
    flag_columns = [
        'HD_ID', 'PCVA_ID', 'PCVACI_ID', 'PCVACH_ID', 'PO_ID', 'UR_ID',
        'SM_ID', 'PTIA_ID', 'HC_ID', 'HCHT_ID', 'HCHC_ID', 'HT_ID', 'DM_ID',
        'PAD_ID', 'AL_ID', 'CA_ID'
    ]

    csv_path = gu.get_file_path('CASEDDGFA.csv', under_raw=True)
    df_dgfa = pd.read_csv(csv_path, encoding='utf8')
    df_dgfa = df_dgfa.drop('IPROTOCOL_ID', axis=1)
    df_dgfa = df_dgfa.drop(unused_columns, axis=1)

    for column in flag_columns:
        # A fresh list is passed on every call, exactly like the literal
        # arguments of the unrolled version.
        invalid_rows = out_of_range(df_dgfa[column], ['0', '1'])
        df_dgfa.loc[invalid_rows, column] = np.nan
    return df_dgfa
def clean_dbmrs():
    """Load ``CASEDBMRS(denormalized).csv`` and blank out-of-range scores.

    Returns:
        The loaded DataFrame. For each column in ``allowed_by_column`` the
        rows selected by ``out_of_range(column, allowed_values)`` are
        overwritten with NaN.
    """
    # (column, accepted values) pairs, kept in the original processing order.
    allowed_by_column = [
        ('Feeding', ['0', '5', '10']),
        ('Transfers', ['0', '5', '10', '15']),
        ('Bathing', ['0', '5']),
        ('Toilet_use', ['0', '5', '10']),
        ('Grooming', ['0', '5']),
        ('Mobility', ['0', '5', '10', '15']),
        ('Stairs', ['0', '5', '10']),
        ('Dressing', ['0', '5', '10']),
        ('Bowel_control', ['0', '5', '10']),
        ('Bladder_control', ['0', '5', '10']),
        ('discharged_mrs', ['0', '1', '2', '3', '4', '5', '6']),
    ]

    csv_path = gu.get_file_path('CASEDBMRS(denormalized).csv', under_raw=True)
    df_dbmrs = pd.read_csv(csv_path, encoding='utf8')
    for column, allowed_values in allowed_by_column:
        invalid_rows = out_of_range(df_dbmrs[column], allowed_values)
        df_dbmrs.loc[invalid_rows, column] = np.nan
    return df_dbmrs
def de_casedfahi():
    """Denormalize ``CASEDFAHI.csv`` into one record per patient.

    Rows are grouped by the concatenation of ``ICASE_ID`` and ``IDCASE_ID``.
    Each row's ``FAHIID_ID`` picks the target column through
    ``code_to_column``, and the stored value is
    ``get_hist_value(PARENTS_CD, BRSI_CD)``. A later row for the same patient
    and code overwrites the earlier value. The result is handed to
    ``gu.save_array_to_csv`` under the name ``CASEDFAHI(denormalized)``.
    """
    header = ['ICASE_ID', 'IDCASE_ID', 'FH_HBP', 'FH_DB', 'FH_HD', 'FH_ST']
    code_to_column = {'1': 'FH_HBP', '2': 'FH_DB', '3': 'FH_HD', '4': 'FH_ST'}
    records = {}

    source_path = gu.get_file_path('CASEDFAHI.csv', under_raw=True)
    with open(source_path, 'r', encoding='utf8') as handle:
        for entry in csv.DictReader(handle):
            case_id = entry['ICASE_ID']
            dcase_id = entry['IDCASE_ID']
            patient_key = case_id + dcase_id
            code = entry['FAHIID_ID']
            parents_code = entry['PARENTS_CD']
            brsi_code = entry['BRSI_CD']

            if patient_key not in records:
                # First row for this patient: start from an all-blank record.
                records[patient_key] = {
                    'ICASE_ID': case_id,
                    'IDCASE_ID': dcase_id,
                    'FH_HBP': '',
                    'FH_DB': '',
                    'FH_HD': '',
                    'FH_ST': ''
                }
            # NOTE(review): a code missing from ``code_to_column`` yields the
            # key ``None`` here, as in the original implementation.
            column = code_to_column.get(code)
            records[patient_key][column] = get_hist_value(
                parents_code, brsi_code)

    gu.save_array_to_csv('CASEDFAHI(denormalized)',
                         header,
                         records,
                         under_raw=True)
def de_casedbmrs():
    """Denormalize ``CASEDBMRS.csv`` into one record per patient.

    Rows are grouped by the concatenation of ``ICASE_ID`` and ``IDCASE_ID``.
    Each row's ``BID_NM`` (``'1.00'`` .. ``'11.00'``) picks the target column
    and ``BOTV_NM`` is stored there; a later row for the same patient and
    item overwrites the earlier value. The result is handed to
    ``gu.save_array_to_csv`` under the name ``CASEDBMRS(denormalized)``.
    """
    score_columns = [
        'Feeding', 'Transfers', 'Bathing', 'Toilet_use', 'Grooming',
        'Mobility', 'Stairs', 'Dressing', 'Bowel_control', 'Bladder_control',
        'discharged_mrs'
    ]
    header = ['ICASE_ID', 'IDCASE_ID'] + score_columns
    # '1.00' -> 'Feeding', '2.00' -> 'Transfers', ..., '11.00' ->
    # 'discharged_mrs'
    item_to_column = {
        '%d.00' % number: column
        for number, column in enumerate(score_columns, start=1)
    }
    records = {}

    source_path = gu.get_file_path('CASEDBMRS.csv', under_raw=True)
    with open(source_path, 'r', encoding='utf8') as handle:
        for entry in csv.DictReader(handle):
            case_id = entry['ICASE_ID']
            dcase_id = entry['IDCASE_ID']
            patient_key = case_id + dcase_id
            item = entry['BID_NM']
            value = entry['BOTV_NM']

            if patient_key not in records:
                # First row for this patient: start from an all-blank record.
                blank = {'ICASE_ID': case_id, 'IDCASE_ID': dcase_id}
                for column in score_columns:
                    blank[column] = ''
                records[patient_key] = blank
            # NOTE(review): an item missing from ``item_to_column`` yields
            # the key ``None`` here, as in the original implementation.
            records[patient_key][item_to_column.get(item)] = value

    gu.save_array_to_csv('CASEDBMRS(denormalized)',
                         header,
                         records,
                         under_raw=True)
def clean_mcase():
    """Load ``CASEMCASE.csv``, drop three columns and recode ``GENDER_TX``.

    ``'F'`` becomes ``'0'`` and ``'M'`` becomes ``'1'``; afterwards every
    value matching the regex ``[^0-1]`` is replaced with NaN.

    Returns:
        The cleaned DataFrame without ``IPROTOCOL_ID``, ``CNAME_TX`` and
        ``CID_ID``.
    """
    fn = 'CASEMCASE.csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_mcase = pd.read_csv(read_file_path, encoding='utf8')
    df_mcase = df_mcase.drop(['IPROTOCOL_ID', 'CNAME_TX', 'CID_ID'], axis=1)
    df_mcase['GENDER_TX'] = df_mcase['GENDER_TX'].replace({'F': '0', 'M': '1'})
    # ``np.nan`` instead of ``np.NaN``: the capitalised alias was removed in
    # NumPy 2.0, and the other cleaners in this file already use ``np.nan``.
    df_mcase['GENDER_TX'] = df_mcase['GENDER_TX'].replace(to_replace=r"[^0-1]",
                                                          value=np.nan,
                                                          regex=True)
    return df_mcase
def clean_ctmr():
    """Load ``CASEDCTMR(denormalized).csv`` and recode ``N``/``Y`` values.

    In every column from the third one onwards, ``'N'`` is replaced by
    ``'0'`` and ``'Y'`` by ``'1'``; the first two columns are left alone.

    Returns:
        The recoded DataFrame.
    """
    csv_path = gu.get_file_path('CASEDCTMR(denormalized).csv', under_raw=True)
    df_ctmr = pd.read_csv(csv_path, encoding='utf8')
    # An open-ended slice selects the same columns as ``2:df_ctmr.shape[1]``.
    recoded = df_ctmr.iloc[:, 2:].replace({'N': '0', 'Y': '1'})
    df_ctmr.iloc[:, 2:] = recoded
    return df_ctmr
def de_casedrfur():
    """Denormalize ``CASEDRFUR.csv`` into one record per patient.

    Rows are grouped by the concatenation of ``ICASE_ID`` and ``IDCASE_ID``.
    For each row, ``VERS_FL``, ``VEIHD_FL`` and ``MRS_TX`` are stored in the
    columns ``VERS_<n>``, ``VEIHD_<n>`` and ``MRS_<n>``, where ``<n>`` is the
    row's ``RFUR_NM``. The result is handed to ``gu.save_array_to_csv`` under
    the name ``CASEDRFUR(denormalized)``.
    """
    prefixes = ['VERS', 'VEIHD', 'MRS']
    periods = ['1', '3', '6', '12']
    # VERS_1..VERS_12, then VEIHD_1..VEIHD_12, then MRS_1..MRS_12.
    value_columns = [
        prefix + '_' + period for prefix in prefixes for period in periods
    ]
    header = ['ICASE_ID', 'IDCASE_ID'] + value_columns
    records = {}

    source_path = gu.get_file_path('CASEDRFUR.csv', under_raw=True)
    with open(source_path, 'r', encoding='utf8') as handle:
        for entry in csv.DictReader(handle):
            case_id = entry['ICASE_ID']
            dcase_id = entry['IDCASE_ID']
            patient_key = case_id + dcase_id
            period = entry['RFUR_NM']
            vers_value = entry['VERS_FL']
            veihd_value = entry['VEIHD_FL']
            mrs_value = entry['MRS_TX']

            if patient_key not in records:
                # First row for this patient: start from an all-blank record.
                blank = {'ICASE_ID': case_id, 'IDCASE_ID': dcase_id}
                for column in value_columns:
                    blank[column] = ''
                records[patient_key] = blank

            record = records[patient_key]
            record['VERS_' + period] = vers_value
            record['VEIHD_' + period] = veihd_value
            record['MRS_' + period] = mrs_value

    gu.save_array_to_csv('CASEDRFUR(denormalized)',
                         header,
                         records,
                         under_raw=True)
def clean_rfur():
    """Load ``CASEDRFUR(denormalized).csv`` and normalise its values.

    In the ``VERS_*`` and ``VEIHD_*`` columns ``'N'`` becomes ``'0'`` and
    ``'Y'`` becomes ``'1'``. In each ``MRS_*`` column the rows selected by
    ``out_of_range(column, ['0', ..., '6'])`` are overwritten with NaN.

    Returns:
        The cleaned DataFrame.
    """
    yes_no_columns = [
        'VERS_1', 'VERS_3', 'VERS_6', 'VERS_12', 'VEIHD_1', 'VEIHD_3',
        'VEIHD_6', 'VEIHD_12'
    ]
    mrs_columns = ['MRS_1', 'MRS_3', 'MRS_6', 'MRS_12']

    csv_path = gu.get_file_path('CASEDRFUR(denormalized).csv', under_raw=True)
    df_rfur = pd.read_csv(csv_path, encoding='utf8')

    recoded = df_rfur[yes_no_columns].replace({'N': '0', 'Y': '1'})
    df_rfur[yes_no_columns] = recoded

    for column in mrs_columns:
        # Builds ['0', '1', '2', '3', '4', '5', '6'] afresh for every call.
        accepted = [str(score) for score in range(7)]
        invalid_rows = out_of_range(df_rfur[column], accepted)
        df_rfur.loc[invalid_rows, column] = np.nan
    return df_rfur
def clean_fahi():
    """Load ``CASEDFAHI(denormalized).csv`` and return it unchanged.

    Returns:
        The DataFrame read from the file with UTF-8 decoding.
    """
    csv_path = gu.get_file_path('CASEDFAHI(denormalized).csv', under_raw=True)
    return pd.read_csv(csv_path, encoding='utf8')
# --- Example #10 ---
from tools import genral_utils as gu
import numpy as np
from visualization import plot_utils as pu
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from visualization import plot_utils as pu

if __name__ == '__main__':
    # Names of the three discharge-score columns compared below.
    n = 'Discharge: NIHSS Total'
    b = 'Discharge: Barthel Scale Total'
    m = 'Discharge: Rankin Score'

    # -- Load Data
    data_path = gu.get_file_path('NIH_data.csv', under_raw=False)
    df = pd.read_csv(data_path, encoding='utf8')
    # Keep only the three score columns, exclude rows whose Rankin score
    # equals 6, then discard rows with any missing score.
    df_nbm = df.loc[df[m] != 6, [n, b, m]].dropna()

    # -- Plot
    # Alternative views kept for reference:
    # df_nbm[[m, b]].boxplot(column=[b], by=m)
    # fig = plt.figure(figsize=(15, 5))
    # pu.bubble_plot(df_nbm[[m, b]], [m, b])
    # pu.violin_plot(df_nbm[[m, n]])
    # df_nbm[[n, b]].boxplot(column=[b], by=n)
    pu.scatt_plot(df_nbm)
    plt.show()

    # for i in range(3, 4, 1):
def de_casedfahi():
    """Denormalize ``CASEDFAHI.csv`` into one wide record per patient.

    The file is read twice: once to spread ``PARENTS_CD`` over the columns
    ``FAHIID_PARENTS_1`` .. ``FAHIID_PARENTS_4`` and once to spread
    ``BRSI_CD`` over ``FAHIID_BRSI_1`` .. ``FAHIID_BRSI_4``. The column
    number comes from ``FAHIID_ID`` converted via ``str(int(float(...)))``.
    Patients are keyed by the concatenation of ``ICASE_ID`` and
    ``IDCASE_ID``. The two per-patient dictionaries are merged only when both
    passes produced the same number of patients; otherwise an empty mapping
    is passed on. The result goes to ``gu.save_array_to_csv`` under the name
    ``CASEDFAHI(denormalized)``.
    """
    codes = ['1', '2', '3', '4']

    def collect(value_column, column_prefix):
        # One pass over the file, gathering ``value_column`` per patient.
        ordered_columns = [column_prefix + code for code in codes]
        code_to_column = dict(zip(codes, ordered_columns))
        collected = {}
        source_path = gu.get_file_path('CASEDFAHI.csv', under_raw=True)
        with open(source_path, 'r', encoding='utf8',
                  errors='ignore') as handle:
            for entry in csv.DictReader(handle):
                case_id = entry['ICASE_ID']
                dcase_id = entry['IDCASE_ID']
                patient_key = case_id + dcase_id
                code = str(int(float(entry['FAHIID_ID'])))
                value = entry[value_column]
                if patient_key not in collected:
                    # First row for this patient: all-blank record.
                    blank = {'ICASE_ID': case_id, 'IDCASE_ID': dcase_id}
                    for column in ordered_columns:
                        blank[column] = ''
                    collected[patient_key] = blank
                # NOTE(review): a code outside 1-4 yields the key ``None``
                # here, as in the original implementation.
                collected[patient_key][code_to_column.get(code)] = value
        return collected

    parents_by_patient = collect('PARENTS_CD', 'FAHIID_PARENTS_')
    brsi_by_patient = collect('BRSI_CD', 'FAHIID_BRSI_')

    header = ['ICASE_ID', 'IDCASE_ID', 'FAHIID_PARENTS_1', 'FAHIID_PARENTS_2',
              'FAHIID_PARENTS_3', 'FAHIID_PARENTS_4', 'FAHIID_BRSI_1',
              'FAHIID_BRSI_2', 'FAHIID_BRSI_3', 'FAHIID_BRSI_4']
    merged = {}
    if len(parents_by_patient) == len(brsi_by_patient):
        merged = {
            patient_key: {**parents_record, **brsi_by_patient[patient_key]}
            for patient_key, parents_record in parents_by_patient.items()
        }
    gu.save_array_to_csv('CASEDFAHI(denormalized)', header, merged, under_raw=True)
def de_casedrfur():
    """Denormalize ``CASEDRFUR.csv``, then draw a stored 2-D scatter plot.

    Part 1: rows of ``CASEDRFUR.csv`` are grouped by the concatenation of
    ``ICASE_ID`` and ``IDCASE_ID``. Every column named in
    ``follow_up_fields`` is copied into ``<FIELD>_<n>``, where ``<n>`` is the
    row's ``RFUR_NM`` converted via ``str(int(float(...)))``. Records start
    out blank for the periods 1, 3, 6 and 12 and are passed to
    ``gu.save_array_to_csv`` under the name ``CASEDRFUR(denormalized)``.

    Part 2: a CSV named after ``fn`` is loaded and its first two columns are
    scattered, coloured by the third; the figure is saved to ``t-sne.png``
    and shown.

    NOTE(review): part 2 looks like a separate script that was fused onto
    this function. It refers to ``genral_utils`` by its bare name rather than
    the ``gu`` alias used in part 1 -- confirm that name is in scope.
    """
    # Per-row columns that get fanned out by follow-up period, in CSV order.
    follow_up_fields = [
        'FSTATUS_ID', 'RFUR_DT', 'LOCATION_ID', 'TORG_ID', 'FLU_ID',
        'FLUORG_ID', 'FLUORG_TX', 'FLURESULT_TX', 'DEATH_DT', 'DEATH_ID',
        'DEATHSK_ID', 'DEATHO_TX', 'VE_ID', 'VERS_FL', 'VERSCICH_ID',
        'VERS_DT', 'VERSORG_ID', 'VEIHD_FL', 'VEIHD_ID', 'VEIHD_DT',
        'VEIHDORG_ID', 'MRS_TX', 'TORG_TX', 'VERSORG_TX', 'VEIHDORG_TX'
    ]
    periods = ['1', '3', '6', '12']
    # Header layout: the two ids, then all fields for period 1, then all
    # fields for period 3, and so on.
    title = ['ICASE_ID', 'IDCASE_ID'] + [
        field + '_' + period for period in periods
        for field in follow_up_fields
    ]

    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDRFUR.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8', errors='replace') as csvfile:
        for row in csv.DictReader(csvfile):
            icase_id = row['ICASE_ID']
            idcase_id = row['IDCASE_ID']
            combind_id = icase_id + idcase_id
            rfur_nm = str(int(float(row['RFUR_NM'])))
            # Read every field before touching the record so that a missing
            # column raises KeyError without leaving a half-updated record.
            values = [row[field] for field in follow_up_fields]

            if combind_id not in patients_dic:
                # First row for this patient: all-blank record in header
                # order, with the two id columns filled in.
                record = dict.fromkeys(title, '')
                record['ICASE_ID'] = icase_id
                record['IDCASE_ID'] = idcase_id
                patients_dic[combind_id] = record

            record = patients_dic[combind_id]
            for field, value in zip(follow_up_fields, values):
                record[field + '_' + rfur_nm] = value
    gu.save_array_to_csv('CASEDRFUR(denormalized)', title, patients_dic, under_raw=True)

    n_class = 2

    if n_class == 2:
        id_data, x_data, y_data = genral_utils.get_poor_god(
            'wholeset_Jim_nomissing_validated.csv')
        fn = 'reduced_dimension_30_2c'
    else:
        id_data, x_data, y_data = genral_utils.get_individual(
            'wholeset_Jim_nomissing_validated.csv')
        fn = 'reduced_dimension_30_individual'
    # calculation
    # x_data_train = genral_utils.scale(x_data)
    # t_sne = TSNE(n_components=2, perplexity=30).fit_transform(x_data_train)
    # df = pd.DataFrame(t_sne, columns=['x', 'y'])
    # df['p'] = y_data.values
    # genral_utils.save_dataframe_to_csv(df, fn)

    df = pd.read_csv(genral_utils.get_file_path(fn + '.csv', under_raw=False),
                     encoding='utf8')
    plt.figure()
    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` performs the same
    # positional column selection. ``plt.cm.get_cmap`` was removed in
    # Matplotlib 3.9; ``plt.get_cmap`` accepts the same (name, lut) pair.
    plt.scatter(df.iloc[:, 0],
                df.iloc[:, 1],
                c=df.iloc[:, 2],
                s=0.1,
                cmap=plt.get_cmap("jet", n_class))
    plt.colorbar(ticks=range(n_class))
    plt.title('t-SNE 2D visualization of Taiwan stoke registry data')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.savefig("t-sne.png", dpi=300)
    plt.show()
def clean_case():
    """Load ``CASEDCASE.csv`` from the raw folder and return it cleaned.

    Steps, in order:

    1. drop the columns listed in ``unused_cols``;
    2. replace literal ``'NULL'`` strings with NaN;
    3. for the measurement columns in ``outlier_cols``: replace ``999.9``
       with NaN, blank the rows flagged by ``outliers_iqr``, coerce the
       columns to numeric and fill the remaining gaps with the column mean;
    4. blank every coded column value flagged by ``out_of_range``;
    5. blank ``ICD_TX`` entries flagged by ``not_icd``;
    6. pass all flag columns through ``replace_flg``.

    Returns:
        pandas.DataFrame: the cleaned case table.
    """
    fn = 'CASEDCASE.csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_case = pd.read_csv(read_file_path, encoding='utf8')

    # Dropping unused columns (each label listed once).
    unused_cols = [
        'IPROTOCOL_ID', 'ORG_ID', 'CSTATUS_ID', 'DCTYPE24_ID',
        'PATIENT_ID', 'INPUT_NM', 'AGE_NM', 'EDU_ID', 'PRO_ID', 'PROOT_TX',
        'ITOWN_ID', 'ADDR_TX', 'TELH_TX', 'TELP_TX', 'TELF_TX', 'FTITLE_TX',
        'CASEMEMO_TX', 'IH_FL', 'IH_DT', 'OH_DT', 'ONSETH_NM', 'ONSETM_NM',
        'ONSET_FL', 'OT_DT', 'OTTIH_NM', 'OTTIM_NM', 'OT_FL', 'FLOOK_DT',
        'FLOOKH_NM', 'FLOOKM_NM', 'FLOOK_FL', 'FCT_DT', 'FCTH_NM', 'FCTM_NM',
        'FCTOH_FL', 'IVTPATH_ID', 'IVTPATH_FL', 'IVTPAAH_FL', 'IVTPA_DT',
        'IVTPAH_NM', 'IVTPAM_NM', 'NIVTPA1_FL', 'NIVTPA2_FL', 'NIVTPA3_FL',
        'NIVTPA4_FL', 'NIVTPA5_FL', 'NIVTPA6_FL', 'NIVTPA7_FL', 'NIVTPA8_FL',
        'NIVTPA9_FL', 'NIVTPA10_FL', 'NIVTPA11_FL', 'NIVTPA99_FL',
        'NIVTPA99_TX', 'ICDO_TX', 'TOASTSCAT_TX', 'TOASTSO_FL', 'TOASTSO_TX',
        'CSAHO_TX', 'THD_ID', 'THDO_FL', 'THDOO_FL', 'THDOO_TX', 'TRM_ID',
        'TRMEN_ID', 'TRMOT_FL', 'TRMOT_TX', 'OM_ID', 'OM_FL', 'OMAND_ID',
        'OMLI_ID', 'OMLIOT_FL', 'OMLIOT_TX', 'OMLIOT2_FL', 'OMLIOT2_TX',
        'AM_FL', 'AMLIOT_FL', 'AMLIOT_TX', 'AMLIOT2_FL', 'AMLIOT2_TX',
        'COM_ID', 'COMO_TX', 'DET_ID', 'DETO_TX', 'DETO_FL', 'OFFD_DT',
        'OFFD_ID', 'OFFD_TX', 'OFFDTORG_ID', 'OFFDTORG_TX', 'OFFRE_DT',
        'NIHSIN_DT', 'NIHSINTI_TX', 'NIHSINH_NM', 'NIHSINM_NM', 'NIHSOT_DT',
        'NIHSOTTI_TX', 'NIHSOTH_NM', 'NIHSOTM_NM', 'BRS_DT', 'CT_DT',
        'CTTI_TX', 'CTH_NM', 'CTM_NM', 'CTO_TX', 'MRI_DT', 'MRITI_TX',
        'MRIH_NM', 'MRIM_NM', 'MRIO_TX', 'ECG_ID', 'ECGO_FL', 'ECGO_TX',
        'CREATE_DT', 'CREATESTAFF_ID', 'SYSUPD_DT', 'SYSUPDSTAFF_ID',
        'MODIFY_NM', 'IGUID_FT', 'DETHOH_FL', 'OMAD_FL', 'OMAD_ID'
    ]
    df_case = df_case.drop(unused_cols, axis=1)

    # Replace NULL with NaN. DataFrame.replace returns a new frame, so the
    # result has to be re-bound (the call was previously a no-op).
    df_case = df_case.replace('NULL', np.nan)

    # Blank outliers, then fill the gaps with the column mean.
    # Each column is listed exactly once so the IQR filter is applied a
    # single time per column and no duplicate labels are used for assignment.
    outlier_cols = [
        'HEIGHT_NM', 'WEIGHT_NM', 'SBP_NM', 'DBP_NM', 'BT_NM', 'HR_NM',
        'RR_NM', 'HB_NM', 'HCT_NM', 'PLATELET_NM', 'WBC_NM', 'PTT1_NM',
        'PTT2_NM', 'PTINR_NM', 'ER_NM', 'BUN_NM', 'CRE_NM', 'ALB_NM', 'CRP_NM',
        'HBAC_NM', 'AC_NM', 'UA_NM', 'TCHO_NM', 'TG_NM', 'HDL_NM', 'LDL_NM',
        'GOT_NM', 'GPT_NM', 'OMWA_TX'
    ]
    # 999.9 is treated as "no value" (presumably a missing-data sentinel).
    df_case[outlier_cols] = df_case[outlier_cols].replace(999.9, np.nan)
    # NOTE(review): outliers_iqr runs before the numeric coercion below;
    # confirm it copes with columns that still hold non-numeric values.
    for col in outlier_cols:
        df_case.loc[outliers_iqr(df_case[col]), col] = np.nan
    df_case[outlier_cols] = df_case[outlier_cols].apply(pd.to_numeric,
                                                        errors='coerce')
    df_case[outlier_cols] = Imputer(missing_values=np.nan,
                                    strategy='mean',
                                    axis=0).fit_transform(
                                        df_case[outlier_cols])

    # Replace un-coded values with NaN: (column, accepted codes).
    coded_ranges = [
        ('OPC_ID', ['1', '2', '3']),
        ('GCSE_NM', ['1', '2', '3', '4', '5', '6']),
        ('GCSV_NM', ['1', '2', '3', '4', '5', '6']),
        ('GCSM_NM', ['1', '2', '3', '4', '5', '6']),
        ('ICD_ID', ['1', '2', '3', '4', '99']),
        ('ICDTIA_ID', ['1', '2']),
        ('TOAST_ID', ['1', '2', '3', '4', '5']),
        ('TOASTU_ID', ['1', '2', '3']),
        ('CICH_ID', ['1', '2']),
        ('CSAH_ID', ['1', '2', '3', '4']),
        ('TRMOP_ID', ['1', '2', '3', '4', '5']),
        ('OFF_ID', ['1', '2', '3']),
        ('OFFDT_ID', ['1', '2', '3', '4', '5']),
        ('CD_ID', ['0', '1', '2']),
        ('CDR_ID', ['1', '2', '3', '4']),
        ('CDL_ID', ['1', '2', '3', '4']),
        ('TCCS_ID', ['0', '1']),
        ('TCCSR_ID', ['1', '2', '3']),
        ('TCCSL_ID', ['1', '2', '3']),
        ('TCCSBA_ID', ['1', '2', '3']),
        ('MCDR_ID', ['1', '2', '3']),
        ('MCDL_ID', ['1', '2', '3']),
        ('MCDBA_ID', ['1', '2', '3']),
        ('MCDRI_ID', ['1', '2', '3']),
        ('MCDLI_ID', ['1', '2', '3']),
    ]
    for col, accepted in coded_ranges:
        df_case.loc[out_of_range(df_case[col], accepted), col] = np.nan

    # Blank ICD text entries that not_icd flags.
    df_case.loc[df_case['ICD_TX'].apply(not_icd), 'ICD_TX'] = np.nan

    # Flag columns, grouped by prefix, normalised together by replace_flg.
    toas_cols = [
        'TOASTLE_FL', 'TOASTLI_FL', 'TOASTSCE_FL', 'TOASTSMO_FL',
        'TOASTSRA_FL', 'TOASTSDI_FL', 'TOASTSMI_FL', 'TOASTSANTIP_FL',
        'TOASTSAU_FL', 'TOASTSHY_FL', 'TOASTSPR_FL', 'TOASTSANTIT_FL',
        'TOASTSHO_FL', 'TOASTSHYS_FL', 'TOASTSCA_FL'
    ]
    thd_cols = [
        'THDA_FL', 'THDH_FL', 'THDI_FL', 'THDAM_FL', 'THDV_FL', 'THDE_FL',
        'THDM_FL', 'THDR_FL', 'THDP_FL'
    ]
    trm_cols = [
        'TRMAN_FL', 'TRMAS_FL', 'TRMTI_FL', 'TRMHE_FL', 'TRMWA_FL', 'TRMIA_FL',
        'TRMFO_FL', 'TRMTA_FL', 'TRMSD_FL', 'TRMRE_FL', 'TRMEN_FL', 'TRMAG_FL',
        'TRMCL_FL', 'TRMPL_FL', 'TRMLM_FL', 'TRMIV_FL', 'TRMVE_FL', 'TRMNG_FL',
        'TRMDY_FL', 'TRMICU_FL', 'TRMSM_FL', 'TRMED_FL', 'TRMOP_FL'
    ]
    om_cols = [
        'OMAS_FL', 'OMAG_FL', 'OMTI_FL', 'OMCL_FL', 'OMWA_FL', 'OMPL_FL',
        'OMANH_FL', 'OMAND_FL', 'OMORA_FL', 'OMINS_FL', 'OMLI_FL', 'OMST_FL',
        'OMNS_FL'
    ]
    am_cols = [
        'AMAS_FL', 'AMAG_FL', 'AMTI_FL', 'AMCL_FL', 'AMWA_FL', 'AMPL_FL',
        'AMANH_FL', 'AMAND_FL', 'AMLI_FL'
    ]
    com_cols = [
        'COMPN_FL', 'COMUT_FL', 'COMUG_FL', 'COMPR_FL', 'COMPU_FL', 'COMAC_FL',
        'COMSE_FL', 'COMDE_FL', 'COMO_FL'
    ]
    det_cols = [
        'DETST_FL', 'DETHE_FL', 'DETHO_FL', 'DETHA_FL', 'DETVA_FL', 'DETRE_FL',
        'DETME_FL'
    ]
    cm_cols = ['CT_FL', 'MRI_FL']
    ecg_cols = ['ECGL_FL', 'ECGA_FL', 'ECGQ_FL']
    mcd_cols = ['MCD_ID', 'MRA_FL', 'CTA_FL', 'DSA_FL']
    all_cols = (toas_cols + thd_cols + trm_cols + om_cols + am_cols +
                com_cols + det_cols + cm_cols + ecg_cols + mcd_cols)
    df_case[all_cols] = replace_flg(df_case[all_cols], all_cols)
    return df_case
# plt.scatter(df_0.ix[:,0], df_0.ix[:,1], c='blue', s=0.1, label='Good')
# plt.scatter(df_1.ix[:,0], df_1.ix[:,1], c='red', s=0.1, label='Poor')
# # plt.title('t-SNE 2D visualization of 90-day stroke mRS outcome')
# plt.rcParams["legend.markerscale"] = 10
# plt.legend()
# plt.xlabel('t-SNE 1')
# plt.ylabel('t-SNE 2')
# plt.savefig("t-sne.png", dpi=300)
# plt.show()

# BI vs. NIHSS
# Names of the derived / outcome columns used below.
b = 'bi_total'
n = 'nihss_total'
m = 'discharged_mrs'

# Columns summed row-wise into the ``b`` column.
_bi_item_cols = [
    'Feeding', 'Transfers', 'Bathing', 'Toilet_use', 'Grooming',
    'Mobility', 'Stairs', 'Dressing', 'Bowel_control', 'Bladder_control'
]
# Columns summed row-wise into the ``n`` column.
_nihss_out_cols = [
    'NIHS_1a_out', 'NIHS_1b_out', 'NIHS_1c_out', 'NIHS_2_out',
    'NIHS_3_out', 'NIHS_4_out', 'NIHS_5aL_out', 'NIHS_5bR_out',
    'NIHS_6aL_out', 'NIHS_6bR_out', 'NIHS_7_out', 'NIHS_8_out',
    'NIHS_9_out', 'NIHS_10_out', 'NIHS_11_out'
]

# TSR data
_tsr_csv_path = gu.get_file_path('wholeset_Jim_nomissing.csv',
                                 under_raw=False)
df_3m = pd.read_csv(_tsr_csv_path, encoding='utf8')
df_3m[b] = pd.DataFrame(np.sum(df_3m[_bi_item_cols], axis=1))
df_3m[n] = pd.DataFrame(np.sum(df_3m[_nihss_out_cols], axis=1))
def de_casednihs():
    """Denormalize ``CASEDNIHS.csv`` into one record per patient.

    Every input row carries one item code (``NID_NM``) together with two
    values (``NINV_NM`` and ``NOTV_NM``). Rows that share the same
    ``ICASE_ID`` + ``IDCASE_ID`` are folded into a single dictionary whose
    ``<item>_in`` / ``<item>_out`` entries hold those two values; entries
    for items never seen stay as empty strings. The collected records are
    handed to ``gu.save_array_to_csv`` as ``CASEDNIHS(denormalized)``.
    """
    # (item code in the file, column stem in the output)
    code_item_pairs = [
        ('1.10', 'NIHS_1a'),
        ('1.20', 'NIHS_1b'),
        ('1.30', 'NIHS_1c'),
        ('2.00', 'NIHS_2'),
        ('3.00', 'NIHS_3'),
        ('4.00', 'NIHS_4'),
        ('5.10', 'NIHS_5aL'),
        ('5.20', 'NIHS_5bR'),
        ('6.10', 'NIHS_6aL'),
        ('6.20', 'NIHS_6bR'),
        ('7.00', 'NIHS_7'),
        ('8.00', 'NIHS_8'),
        ('9.00', 'NIHS_9'),
        ('10.00', 'NIHS_10'),
        ('11.00', 'NIHS_11'),
    ]
    test_code = dict(code_item_pairs)
    stems = [stem for _, stem in code_item_pairs]

    # Output header: the two ids, then every *_in column, then every *_out.
    title = ['ICASE_ID', 'IDCASE_ID']
    title += [stem + '_in' for stem in stems]
    title += [stem + '_out' for stem in stems]

    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDNIHS.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            icase_id = row['ICASE_ID']
            idcase_id = row['IDCASE_ID']
            combind_id = icase_id + idcase_id
            nid_nm = row['NID_NM']
            ninv_nm = row['NINV_NM']
            notv_nm = row['NOTV_NM']

            if combind_id not in patients_dic:
                # First row of this patient: start from an all-blank record.
                blank = dict.fromkeys(title, '')
                blank['ICASE_ID'] = icase_id
                blank['IDCASE_ID'] = idcase_id
                patients_dic[combind_id] = blank

            record = patients_dic[combind_id]
            key = test_code.get(nid_nm)
            record[key + '_in'] = ninv_nm
            record[key + '_out'] = notv_nm

    gu.save_array_to_csv('CASEDNIHS(denormalized)',
                         title,
                         patients_dic,
                         under_raw=True)
def clean_nihs():
    """Load ``CASEDNIHS(denormalized).csv`` and blank out-of-range scores.

    For every item the ``_in`` and ``_out`` columns are checked with
    ``out_of_range`` against the string codes ``'0'`` .. ``str(max)``;
    flagged cells are set to NaN.

    Returns:
        pandas.DataFrame: the cleaned table.
    """
    fn = 'CASEDNIHS(denormalized).csv'
    df_nihs = pd.read_csv(gu.get_file_path(fn, under_raw=True),
                          encoding='utf8')

    # (column stem, highest accepted score)
    item_max_scores = [
        ('NIHS_1a', 3),
        ('NIHS_1b', 2),
        ('NIHS_1c', 2),
        ('NIHS_2', 2),
        ('NIHS_3', 3),
        ('NIHS_4', 3),
        ('NIHS_5aL', 4),
        ('NIHS_5bR', 4),
        ('NIHS_6aL', 4),
        ('NIHS_6bR', 4),
        ('NIHS_7', 2),
        ('NIHS_8', 2),
        ('NIHS_9', 3),
        ('NIHS_10', 2),
        ('NIHS_11', 2),
    ]
    for stem, top in item_max_scores:
        for suffix in ('_in', '_out'):
            col = stem + suffix
            # Fresh list per call, as codes are compared as strings.
            accepted = [str(score) for score in range(top + 1)]
            df_nihs.loc[out_of_range(df_nihs[col], accepted), col] = np.nan
    return df_nihs
def de_casedctmr():
    """Denormalize ``CASEDCTMR.csv``, then validate the 3-month mRS table.

    Part 1: every input row carries one finding code (``CTMRIID_NM``) and
    four flags (``CTRIGHT_FL``, ``CTLEFT_FL``, ``MRIRIGHT_FL``,
    ``MRILEFT_FL``). Rows sharing ``ICASE_ID`` + ``IDCASE_ID`` are folded
    into one dictionary per patient. Codes other than ``'11'`` store the
    flags under ``<name>_ctr/_ctl/_mrir/_mril``; code ``'11'`` stores them
    under ``Old_stroke_ctci/_ctch/_mrici/_mrich``. The records are handed to
    ``gu.save_array_to_csv`` as ``CASEDCTMR(denormalized)``.

    Part 2: reads ``TSR_2018_3m_noMissing.csv``, passes it through
    ``mv.mRS_validate`` and saves the result as
    ``TSR_2018_3m_noMissing_validated``.
    """
    # (finding code in the file, column stem in the output)
    code_name_pairs = [
        ('1', 'cortical_ACA'),
        ('2', 'cortical_MCA'),
        ('3', 'subcortical_ACA'),
        ('4', 'subcortical_MCA'),
        ('5', 'PCA_cortex'),
        ('6', 'thalamus'),
        ('7', 'brainstem'),
        ('8', 'cerebellum'),
        ('9', 'Watershed'),
        ('10', 'Hemorrhagic_infarct'),
        ('11', 'Old_stroke'),
    ]
    cm_code = dict(code_name_pairs)
    regular_suffixes = ('_ctr', '_ctl', '_mrir', '_mril')
    old_stroke_suffixes = ('_ctci', '_ctch', '_mrici', '_mrich')

    # Output header: ids first, then for each of the four flag groups the
    # ten regular findings followed by that group's Old_stroke column.
    regular_names = [name for code, name in code_name_pairs if code != '11']
    title = ['ICASE_ID', 'IDCASE_ID']
    for regular_sfx, old_sfx in zip(regular_suffixes, old_stroke_suffixes):
        title.extend(name + regular_sfx for name in regular_names)
        title.append(cm_code['11'] + old_sfx)

    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDCTMR.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            icase_id = row['ICASE_ID']
            idcase_id = row['IDCASE_ID']
            combind_id = icase_id + idcase_id
            ctmriid_nm = row['CTMRIID_NM']
            flags = (row['CTRIGHT_FL'], row['CTLEFT_FL'],
                     row['MRIRIGHT_FL'], row['MRILEFT_FL'])

            if combind_id not in patients_dic:
                # First row of this patient: start from an all-blank record.
                blank = dict.fromkeys(title, '')
                blank['ICASE_ID'] = icase_id
                blank['IDCASE_ID'] = idcase_id
                patients_dic[combind_id] = blank

            record = patients_dic[combind_id]
            key = cm_code.get(ctmriid_nm)
            if ctmriid_nm != '11':
                suffixes = regular_suffixes
            else:
                suffixes = old_stroke_suffixes
            for suffix, flag in zip(suffixes, flags):
                record[key + suffix] = flag

    gu.save_array_to_csv('CASEDCTMR(denormalized)',
                         title,
                         patients_dic,
                         under_raw=True)
    # print(df_fahi.shape)
    # print(df_nihs.shape)
    # print(df_rfur.shape)
    # df_joined = reduce(lambda left, right: pd.merge(left, right, how='outer', on=['ICASE_ID', 'IDCASE_ID']), dfs)
    # print(df_joined.shape)
    #  ===================== convert feature
    # df_withMissing = clnUtil.convert_features(df_joined)
    # gu.save_dataframe_to_csv(df_withMissing, 'TSR_2018_withMissing')

    #  ######################################################## 3-month mRS#############################################
    #  ===================== Remove high missing features
    # df_withMissing = pd.read_csv(gu.get_file_path('TSR_2018_withMissing.csv', under_raw=False), encoding='utf8')
    # df_remove_hing_missing_columns = nomissUtil.remove_missing_intensive_features(df_withMissing)
    #  nomissUtil.plot_missing(df_remove_hing_missing_columns)
    #  ===================== only 3-month followup
    # df_3m = df_remove_hing_missing_columns.drop(['VERS_3', 'VERS_6', 'VERS_12', 'VEIHD_3', 'VEIHD_6', 'VEIHD_12', 'MRS_6', 'MRS_12'], axis=1)
    #  ===================== Remove NaN observations
    # df_3m.dropna(inplace=True)
    #  ===================== Remove dead cases
    # df_3m.drop(df_3m[df_3m.OFF_ID == 2.].index, inplace=True)
    #  ===================== Make dummy variables
    # df_3m = clnUtil.make_dummy(df_3m)
    # gu.save_dataframe_to_csv(df_3m, 'TSR_2018_3m_noMissing')
    #  ===================== validated mRS
    no_missing_path = gu.get_file_path('TSR_2018_3m_noMissing.csv',
                                       under_raw=False)
    df_3m = pd.read_csv(no_missing_path, encoding='utf8')
    df_3m_validated = mv.mRS_validate(df_3m)
    gu.save_dataframe_to_csv(df_3m_validated,
                             'TSR_2018_3m_noMissing_validated')
    print("Done")