コード例 #1
0
def run():
    """Count, per patient, how many lab-test files contain that patient.

    Every file in ``PER_LAB_DIR`` whose name matches ``labtest_*.csv`` is
    read (columns ``USE_LAB_COL_NAME``). Duplicate ``(no, date)`` rows are
    dropped and the remaining dates are counted per ``no``; that series
    becomes one column of an intermediate frame, named after the lab.
    The number of non-null columns per row is then written to the HDF5 file
    ``PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH`` under the key
    ``metadata/patient_count`` as a single ``count`` column indexed by ``no``.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR, PATIENT_COUNT, USE_LAB_COL_NAME, SAMPLE_PATIENT_PATH, DEBUG_PRINT

    # check_directory returns the value that is used as a path prefix below.
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH

    # NOTE(review): range(1, PATIENT_COUNT) ends at PATIENT_COUNT - 1;
    # confirm that the last patient number is meant to be excluded.
    result_df = pd.DataFrame(index=range(1, PATIENT_COUNT))

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    re_per_lab = re.compile(r"^labtest_.*\.csv")
    for file_name in os.listdir(PER_LAB_DIR):
        if not re_per_lab.match(file_name):
            continue

        per_lab_name = file_name.replace('labtest_', '').replace('.csv', '')
        per_lab_df = pd.read_csv(PER_LAB_DIR + file_name,
                                 delimiter=DELIM,
                                 usecols=USE_LAB_COL_NAME)

        # Distinct dates per patient for this lab; assignment aligns on the
        # index, so patients absent from the file get NaN in this column.
        unique_visits = per_lab_df.drop_duplicates(['no', 'date'])
        result_df[per_lab_name] = unique_visits.groupby(['no']).count().date

        if DEBUG_PRINT:
            print("{} is clear".format(file_name))

    # Row-wise count of non-null cells = number of labs the patient appears in.
    result_df = result_df.count(1).to_frame('count')
    result_df.index.name = 'no'

    result_df.to_hdf(output_path,
                     "metadata/patient_count",
                     format='table',
                     data_columns=True,
                     mode='a')
コード例 #2
0
def save_mapping_to_hdf5():
    """Pass the medicine mapping file to ``save_to_hdf5``.

    The source is ``MAPPING_DIR + MEDICINE_MAPPING_PATH`` and the target is
    ``PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH`` with the key
    ``metadata/mapping_table``.

    Raises:
        ValueError: if the medicine mapping file does not exist.
    """
    global MAPPING_DIR, MEDICINE_MAPPING_PATH, PREP_OUTPUT_DIR, PRESCRIBE_OUTPUT_PATH

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    hdf_target = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH
    mapping_source = MAPPING_DIR + MEDICINE_MAPPING_PATH

    if os.path.isfile(mapping_source):
        save_to_hdf5(mapping_source, hdf_target, 'metadata/mapping_table')
    else:
        raise ValueError("There is no medicine_mapping dataframe!")
コード例 #3
0
def run():
    """Build the lab-test HDF5 file from the per-lab CSV files.

    Steps:
      1. Remove any previous output at ``PREP_OUTPUT_DIR + LABTEST_OUTPUT_PATH``.
      2. Load the mapping table via ``get_labtest_map()``, convert columns to
         numeric where possible, and store it under ``metadata/mapping_table``.
      3. For every ``labtest_*.csv`` file in ``PER_LAB_DIR``: read the columns
         ``USE_LAB_COL_NAME``, map ``result`` through
         ``normalize_number(r_avg, r_min, r_max)``, map ``date`` through
         ``convert_month``, and append the frame under ``data/<lab name>``.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR, LAB_COL_NAME, USE_LAB_COL_NAME, LABTEST_OUTPUT_PATH, DEBUG_PRINT

    # check_directory returns the value that is used as a path prefix below.
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + LABTEST_OUTPUT_PATH

    # Start from a clean file: every write below uses mode='a'.
    if os.path.isfile(output_path):
        os.remove(output_path)

    # NOTE(review): errors='ignore' is deprecated in recent pandas releases;
    # kept here to preserve the existing conversion behaviour.
    labtest_mapping_df = get_labtest_map()
    labtest_mapping_df = labtest_mapping_df.apply(pd.to_numeric,
                                                  errors='ignore')
    # Fixed keyword: the original passed `date_columns`, a typo for
    # `data_columns` that to_hdf does not accept.
    labtest_mapping_df.to_hdf(output_path,
                              "metadata/mapping_table",
                              format='table',
                              data_columns=True,
                              mode='a')

    # Raw string avoids the invalid "\." escape of a normal string literal.
    re_per_lab = re.compile(r"^labtest_.*\.csv")
    for file_name in os.listdir(PER_LAB_DIR):
        if not re_per_lab.match(file_name):
            continue

        per_lab_name = file_name.replace('labtest_', '').replace('.csv', '')
        per_lab_df = pd.read_csv(PER_LAB_DIR + file_name,
                                 delimiter=DELIM,
                                 usecols=USE_LAB_COL_NAME)

        # Fetch the three values for this lab from the mapping table.
        r_avg, r_min, r_max = get_labtest_value(labtest_mapping_df,
                                                per_lab_name)
        # normalize_number(...) is passed to Series.map, so it presumably
        # returns a per-value callable (or mapping) -- verify in its module.
        per_lab_df.result = per_lab_df.result.map(
            normalize_number(r_avg, r_min, r_max))
        per_lab_df.date = per_lab_df.date.map(convert_month)

        # Convert columns to numeric where possible before storing.
        per_lab_df = per_lab_df.apply(pd.to_numeric, errors='ignore')
        per_lab_df.to_hdf(output_path,
                          'data/' + per_lab_name,
                          format='table',
                          data_columns=True,
                          mode='a')

        if DEBUG_PRINT:
            print("{} dataframe enters hdf5 file".format(per_lab_name))
コード例 #4
0
def save_mapping_to_hdf5():
    '''
    Save the mapping table in HDF5 format.

    The file ``MAPPING_DIR + KCD_MAPPING_PATH`` is handed to ``save_to_hdf5``
    with the target ``PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH`` and the key
    ``metadata/mapping_table``.

    Raises ValueError when the KCD mapping file does not exist.
    '''
    global MAPPING_DIR, KCD_MAPPING_PATH, PREP_OUTPUT_DIR, DIAGNOSIS_OUTPUT_PATH

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    kcd_source = MAPPING_DIR + KCD_MAPPING_PATH
    hdf_target = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    if os.path.isfile(kcd_source):
        save_to_hdf5(kcd_source, hdf_target, 'metadata/mapping_table')
    else:
        raise ValueError("There is no KCD_OUTPUT file!")
コード例 #5
0
def get_labtest_map():
    """Read the lab-test mapping file into a DataFrame.

    The file is ``MAPPING_DIR + LAB_MAPPING_PATH``, parsed with ``DELIM`` as
    the delimiter.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: if the mapping file does not exist.
    """
    global MAPPING_DIR, LAB_MAPPING_PATH, DELIM

    MAPPING_DIR = check_directory(MAPPING_DIR)
    mapping_file = MAPPING_DIR + LAB_MAPPING_PATH

    if os.path.isfile(mapping_file):
        return pd.read_csv(mapping_file, delimiter=DELIM)

    raise ValueError("There is no labtest_OUTPUT file!")
コード例 #6
0
def _label_ka_result(lab_df):
    """Replace ``lab_df.result`` in place with a class label per row.

    Values below 3.5 become 1, values in [3.5, 5.5] become 0 and values above
    5.5 become 2. Rows matching none of the comparisons (e.g. NaN) are left
    unchanged.
    """
    # Masks are taken from the raw values before any assignment, so the
    # labelling does not depend on the order of the statements below.
    is_low = lab_df.result < 3.5
    is_mid = (lab_df.result >= 3.5) & (lab_df.result <= 5.5)
    is_high = lab_df.result > 5.5

    lab_df.loc[is_low, 'result'] = 1
    lab_df.loc[is_mid, 'result'] = 0
    lab_df.loc[is_high, 'result'] = 2


def get_ka_label_df():
    """Build a per-(patient, date) label table from two lab-test files.

    Reads ``labtest_L3042.csv`` and ``labtest_L8042.csv`` from
    ``PER_LAB_DIR``, maps ``date`` through ``convert_month`` and ``result``
    through ``convert_to_numeric``, then turns each result into a class label
    (1: below 3.5, 0: 3.5 to 5.5, 2: above 5.5). For every ``(no, date)``
    pair the final ``label`` is ``2 * (any label 2) + 1 * (any label 1)``,
    so a pair with both kinds of out-of-range results gets 3.

    The table (columns ``no``, ``date``, ``label``) is written to the HDF5
    file ``PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH`` under ``data/ka_label``.
    Nothing is returned.
    """
    global PER_LAB_DIR, PREP_OUTPUT_DIR, PATIENT_COUNT, USE_LAB_COL_NAME, SAMPLE_PATIENT_PATH, DEBUG_PRINT

    # check_directory returns the value that is used as a path prefix below.
    PER_LAB_DIR = check_directory(PER_LAB_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    output_path = PREP_OUTPUT_DIR + SAMPLE_PATIENT_PATH

    ka_df = pd.read_csv(PER_LAB_DIR + 'labtest_L3042.csv')
    ka_e_df = pd.read_csv(PER_LAB_DIR + 'labtest_L8042.csv')

    for lab_df in (ka_df, ka_e_df):
        lab_df.date = lab_df.date.map(convert_month)
        lab_df.result = lab_df.result.map(convert_to_numeric)
        _label_ka_result(lab_df)

    total_df = pd.concat([ka_df, ka_e_df])
    # One row per (no, date), one column per label, cells = occurrence count.
    total_df = total_df.groupby(['no', 'date',
                                 'result']).size().unstack(fill_value=0)

    # A label that never occurs in the data produces no column; add it as
    # all-zero so the lookups below cannot raise KeyError.
    for label in (1.0, 2.0):
        if label not in total_df.columns:
            total_df[label] = 0

    total_df = (2 * (total_df[2.0] > 0)) + (1 * (total_df[1.0] > 0))
    total_df = total_df.reset_index()
    total_df.columns = ['no', 'date', 'label']

    total_df.to_hdf(output_path,
                    "data/ka_label",
                    format='table',
                    data_columns=True,
                    mode='a')
コード例 #7
0
def get_prescribe_map():
    """Load the medicine mapping file as a dictionary.

    Reads ``MAPPING_DIR + MEDICINE_MAPPING_PATH`` with ``DELIM`` as the
    delimiter.

    Returns:
        dict mapping each ``medi_code`` value to its ``mapping_code`` value.

    Raises:
        ValueError: if the medicine mapping file does not exist.
    """
    global MAPPING_DIR, MEDICINE_MAPPING_PATH
    MAPPING_DIR = check_directory(MAPPING_DIR)
    mapping_file = MAPPING_DIR + MEDICINE_MAPPING_PATH

    if not os.path.isfile(mapping_file):
        raise ValueError("There is no medicine_mapping dataframe!")

    mapping_table = pd.read_csv(mapping_file, delimiter=DELIM)
    code_series = pd.Series(mapping_table.mapping_code.values,
                            index=mapping_table.medi_code)
    return code_series.to_dict()
コード例 #8
0
def run(demographic_path):
    """Convert the demographic Excel sheet and store it in HDF5.

    The sheet at ``demographic_path`` is read, its columns are renamed to
    ``DEMO_COL_NAME``, ``sex`` is encoded as F -> 1 / M -> 0 and ``age`` is
    mapped through ``check_age``. The result goes through the temporary CSV
    file ``TEMP_PATH`` and is passed to ``save_to_hdf5`` with the target
    ``PREP_OUTPUT_DIR + DEMOGRAPHIC_OUTPUT_PATH`` and key ``data/original``.
    The temporary file is removed afterwards.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.
    """
    global PREP_OUTPUT_DIR, DEMOGRAPHIC_OUTPUT_PATH, TEMP_PATH

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    hdf_target = PREP_OUTPUT_DIR + DEMOGRAPHIC_OUTPUT_PATH

    # An existing temp file is treated as a sign that another run is active.
    temp_in_use = os.path.isfile(TEMP_PATH)
    if temp_in_use:
        raise ValueError("data Corruption WARNING! --> maybe other process using TEMP file ")

    demo_table = pd.read_excel(demographic_path)
    demo_table.columns = DEMO_COL_NAME

    sex_codes = {'F': 1, 'M': 0}
    demo_table['sex'] = demo_table['sex'].map(sex_codes)
    demo_table['age'] = demo_table['age'].map(check_age)

    # Stage the frame as CSV because save_to_hdf5 takes a file path.
    demo_table.to_csv(TEMP_PATH, sep=DELIM, index=False)
    save_to_hdf5(TEMP_PATH, hdf_target, 'data/original')
    os.remove(TEMP_PATH)
コード例 #9
0
def run(diagnosis_data_path):
    """Convert the raw diagnosis file and store it in HDF5.

    The file at ``diagnosis_data_path`` is read in chunks of ``CHUNK_SIZE``
    rows (no header row; column names ``KCD_COL_NAME``, columns kept
    ``KCD_USE_COLS``). In every chunk ``KCD_code`` is passed through
    ``strip_space`` and then mapped with the dictionary from
    ``get_diagnosis_map()``, and ``date`` is mapped through ``convert_month``.
    Chunks are appended to the temporary CSV ``TEMP_PATH``, which is then
    passed to ``save_to_hdf5`` (target
    ``PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH``, key ``data``) and removed.
    Finally ``save_mapping_to_hdf5()`` is called.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.
    """
    global DELIM, KCD_COL_NAME, KCD_USE_COLS, CHUNK_SIZE, DIAGNOSIS_OUTPUT_PATH, DEBUG_PRINT, TEMP_PATH, PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    diagnosis_output_path = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    KCD_to_code = get_diagnosis_map()  # mapping dictionary

    # An existing temp file is treated as a sign that another run is active.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(diagnosis_data_path,
                         delimiter=DELIM,
                         header=None,
                         names=KCD_COL_NAME,
                         usecols=KCD_USE_COLS,
                         chunksize=CHUNK_SIZE)
    for idx, chunk in enumerate(chunks):
        # Map raw codes and dates to their encoded form.
        chunk.KCD_code = chunk.KCD_code.map(strip_space)
        chunk.KCD_code = chunk.KCD_code.map(KCD_to_code)
        chunk.date = chunk.date.map(convert_month)

        # Compare by value: `idx is 0` relied on int object identity, which
        # is an implementation detail and raises a SyntaxWarning.
        is_first = idx == 0
        # The first chunk creates the file and writes the header; later
        # chunks are appended without one.
        chunk.to_csv(TEMP_PATH,
                     sep=DELIM,
                     header=KCD_USE_COLS if is_first else False,
                     index=False,
                     mode='w' if is_first else 'a')

        if DEBUG_PRINT:
            print('{} th chunk of output enters temp file'.format(idx))

    save_to_hdf5(TEMP_PATH, diagnosis_output_path, 'data')
    os.remove(TEMP_PATH)  # temp file remove
    save_mapping_to_hdf5()
コード例 #10
0
def get_diagnosis_map():
    '''
    Load the mapping table as a dictionary.

    Reads ``MAPPING_DIR + KCD_MAPPING_PATH`` with ``DELIM`` as the delimiter
    and returns a dict mapping each ``KCD_code`` value to its
    ``mapping_code`` value.

    Raises ValueError when the KCD mapping file does not exist.
    '''
    global MAPPING_DIR, KCD_MAPPING_PATH

    MAPPING_DIR = check_directory(MAPPING_DIR)
    kcd_file = MAPPING_DIR + KCD_MAPPING_PATH

    if not os.path.isfile(kcd_file):
        raise ValueError("There is no KCD_OUTPUT file!")

    kcd_table = pd.read_csv(kcd_file, delimiter=DELIM)
    code_series = pd.Series(kcd_table.mapping_code.values,
                            index=kcd_table.KCD_code.values)
    return code_series.to_dict()
コード例 #11
0
def run(prescribe_lab_path):
    """Convert the raw prescription file and store it in HDF5.

    The file at ``prescribe_lab_path`` is read in chunks of ``CHUNK_SIZE``
    rows. In every chunk, rows whose ``date`` is flagged by
    ``check_not_date_type`` are dropped, the ``medi_name`` and ``date1``
    columns are removed, ``medi_code`` is mapped with the dictionary from
    ``get_prescribe_map()``, ``date`` is mapped through ``convert_month`` and
    ``times`` through ``convert_times_per_month``. Chunks are appended to the
    temporary CSV ``TEMP_PATH``, which is then passed to ``save_to_hdf5``
    (target ``PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH``, key ``data``) and
    removed. Finally ``save_mapping_to_hdf5()`` is called.

    Raises:
        ValueError: if ``TEMP_PATH`` already exists when the function starts.
    """
    global DELIM, CHUNK_SIZE, PRESCRIBE_OUTPUT_PATH, PREP_OUTPUT_DIR, TEMP_PATH

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    prescribe_output_path = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH

    mapping_dict = get_prescribe_map()  # mapping dictionary

    # An existing temp file is treated as a sign that another run is active.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(prescribe_lab_path,
                         delimiter=DELIM,
                         chunksize=CHUNK_SIZE)
    for idx, chunk in enumerate(chunks):
        #### temporary code: start ###
        chunk.drop(chunk[chunk.date.map(check_not_date_type)].index,
                   inplace=True)
        chunk.drop(['medi_name', 'date1'], axis=1, inplace=True)
        #### temporary code: end ###
        chunk['medi_code'] = chunk['medi_code'].map(mapping_dict)
        chunk['date'] = chunk['date'].map(convert_month)
        chunk['times'] = chunk['times'].map(convert_times_per_month)

        # Compare by value: `idx is 0` relied on int object identity, which
        # is an implementation detail and raises a SyntaxWarning.
        is_first = idx == 0
        # The first chunk creates the file and writes the header; later
        # chunks are appended without one.
        chunk.to_csv(TEMP_PATH,
                     sep=DELIM,
                     index=False,
                     header=is_first,
                     mode='w' if is_first else 'a')

        if DEBUG_PRINT:
            print('{} th chunk of output enters temp file'.format(idx))

    save_to_hdf5(TEMP_PATH, prescribe_output_path, 'data')
    os.remove(TEMP_PATH)  # temp file remove
    save_mapping_to_hdf5()