Ejemplo n.º 1
0
# -*- coding: utf-8 -*-
"""MIMIC dataset handling with full dataset
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause
import os
import sys
import pandas as pd
import json
from joblib import Parallel, delayed
# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: the original passed the string literal "__file__" to os.path.dirname,
# which always yields '' — so the appended path resolved against the current
# working directory, not this script's location. Passing the __file__
# variable makes the parent of the script's own directory importable.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from pyhealth.data.base_mimic import parallel_parse_tables
from pyhealth.utils.utility_parallel import unfold_parallel
from pyhealth.utils.utility_parallel import partition_estimators
from pyhealth.utils.utility import read_csv_to_df
import warnings
# silence warnings unless the user asked for them explicitly (python -W ...)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
if __name__ == "__main__":
    # ---- configuration ----------------------------------------------------
    n_jobs = 6  # number of parallel jobs
    duration = 21600  # time window for episode generation (presumably seconds, i.e. 6h — TODO confirm)
    selection_method = 'last'  # which measurement to keep within a window
    mimic_data_loc = 'D:\\mimic-iii-clinical-database-1.4'  # change this to your mimic full data location
    save_dir = os.path.join('outputs', 'mimic')
    # make saving directory if needed
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
Ejemplo n.º 2
0
        'icd9_dgns_cd_8',
        'icd9_dgns_cd_9',
        'icd9_dgns_cd_10',
    ]

    # output locations for the parsed CMS patient data / patient-id list
    patient_data_loc = 'cms_patient_data.json'
    patient_list_loc = 'cms_patient_list.json'

    valid_data_list = []  # keep tracking the stored data
    valid_id_list = []  # keep tracking a list of patient IDs

    valid_sequence_list = []

    # read in tables
    # beneficiary summary file: one row per patient (desynpuf_id is the
    # de-identified CMS patient key)
    patient_df = read_csv_to_df(
        os.path.join('data', 'cms-sample-1',
                     'DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv'))
    patient_id_list = patient_df['desynpuf_id'].tolist()
    # change the format of the date (raw column is YYYYMMDD)
    patient_df['dob'] = pd.to_datetime(patient_df['bene_birth_dt'],
                                       format='%Y%m%d')

    # inpatient claims: one row per claim event
    event_df = read_csv_to_df(
        os.path.join('data', 'cms-sample-1',
                     'DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv'))
    # round-trip through the nullable Int64 dtype so numeric procedure codes
    # lose any float artifacts before being stringified
    # NOTE(review): missing values become the literal string '<NA>' after the
    # second cast — confirm downstream code expects that
    event_df['icd9_prcdr_cd_1'] = event_df['icd9_prcdr_cd_1'].astype('Int64')
    event_df['icd9_prcdr_cd_1'] = event_df['icd9_prcdr_cd_1'].astype(str)

    # change the format of the date (raw column is YYYYMMDD)
    event_df['clm_from_dt'] = pd.to_datetime(event_df['clm_from_dt'],
                                             format='%Y%m%d')
Ejemplo n.º 3
0
# -*- coding: utf-8 -*-
"""MIMIC dataset handling using parallelization on demo data
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause
import os
import sys
import pandas as pd
import json
from joblib import Parallel, delayed
# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: the original passed the string literal "__file__" to os.path.dirname,
# which always yields '' — so the appended path resolved against the current
# working directory, not this script's location. Passing the __file__
# variable makes the parent of the script's own directory importable.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from pyhealth.data.base_mimic import parallel_parse_tables
from pyhealth.utils.utility_parallel import unfold_parallel
from pyhealth.utils.utility_parallel import partition_estimators
from pyhealth.utils.utility import read_csv_to_df
from pyhealth.utils.utility import make_dirs_if_not_exists
import warnings
# silence warnings unless the user asked for them explicitly (python -W ...)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
if __name__ == "__main__":
    # ---- configuration ----------------------------------------------------
    n_jobs = 4  # number of parallel jobs
    duration = 21600  # time window for episode generation (presumably seconds, i.e. 6h — TODO confirm)
    selection_method = 'last'  # which measurement to keep within a window
    save_dir = os.path.join('outputs', 'mimic_demo', 'raw')
    make_dirs_if_not_exists(save_dir)
    patient_data_loc = os.path.join(save_dir, 'patient_data_demo.json')
    valid_data_list = []  # keep tracking the stored data
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
"""MIMIC dataset handling with single thread on demo data
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause
import os
import sys
import time
import pandas as pd
from tqdm import tqdm
import json
# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: the original passed the string literal "__file__" to os.path.dirname,
# which always yields '' — so the appended path resolved against the current
# working directory, not this script's location. Passing the __file__
# variable makes the parent of the script's own directory importable.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from pyhealth.data.base_mimic import MIMIC_Data
from pyhealth.utils.utility import read_csv_to_df
from pyhealth.utils.utility import make_dirs_if_not_exists
import warnings
# silence warnings unless the user asked for them explicitly (python -W ...)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
if __name__ == "__main__":
    # ---- configuration ----------------------------------------------------
    n_jobs = 4  # number of parallel jobs
    duration = 21600  # time window for episode generation (presumably seconds, i.e. 6h — TODO confirm)
    selection_method = 'last'  # which measurement to keep within a window
    save_dir = os.path.join('outputs', 'mimic_demo', 'raw')
    make_dirs_if_not_exists(save_dir)
    patient_data_loc = os.path.join(save_dir, 'patient_data_demo.json')
    valid_data_list = []  # keep tracking the stored data
    valid_id_list = []  # keep tracking a list of patient IDs
Ejemplo n.º 5
0
    def test_01_flow(self):
        """End-to-end flow test: parse the MIMIC-III demo tables in parallel
        and dump the extracted per-patient episode data to JSON under
        ``self.save_dir`` / ``self.patient_data_loc``.

        Fixes over the original: the triplicated debug ``print(os.getcwd())``
        calls are collapsed into one, the dead locals ``patient_data_loc`` /
        ``patient_list_loc`` (the method writes to ``self.patient_data_loc``)
        and the immediately-overwritten empty-list initializers are removed,
        and a stale commented-out path is dropped.
        """
        n_jobs = 2  # number of parallel jobs
        n_samples = 10  # number of patients to split across the workers
        duration = 21600  # time window for episode generation
        selection_method = 'last'

        # make saving directory if needed
        if not os.path.isdir(self.save_dir):
            os.makedirs(self.save_dir)

        # key variables to track in the episode
        var_list = [
            'Capillary refill rate', 'Diastolic blood pressure',
            'Fraction inspired oxygen', 'Glascow coma scale eye opening',
            'Glascow coma scale motor response', 'Glascow coma scale total',
            'Glascow coma scale verbal response', 'Glucose', 'Heart Rate',
            'Height', 'Mean blood pressure', 'Oxygen saturation',
            'Respiratory rate', 'Systolic blood pressure', 'Temperature',
            'Weight', 'pH'
        ]

        # enforce and convert to lower case
        var_list = [item.lower() for item in var_list]

        # debug aid: every data path below is relative to the repo root, so
        # show where the test is actually executing from
        print(os.getcwd())

        # mapping from raw MIMIC item ids to (lower-cased) variable names
        event_mapping_df = read_csv_to_df(
            os.path.join('examples', 'data_generation', 'resources',
                         'itemid_to_variable_map.csv'))
        event_mapping_df['level2'] = event_mapping_df['level2'].str.lower()

        # keep only the item ids that map onto the tracked variables
        key_df = event_mapping_df[event_mapping_df['level2'].isin(var_list)]
        key_items = key_df['itemid'].tolist()

        #################################################################
        # read in tables
        patient_df = read_csv_to_df(
            os.path.join('examples', 'data_generation', 'data',
                         'mimic-iii-clinical-database-demo-1.4',
                         'PATIENTS.csv'))
        patient_id_list = patient_df['subject_id'].tolist()

        admission_df = read_csv_to_df(
            os.path.join('examples', 'data_generation', 'data',
                         'mimic-iii-clinical-database-demo-1.4',
                         'ADMISSIONS.csv'))

        icu_df = read_csv_to_df(
            os.path.join('examples', 'data_generation', 'data',
                         'mimic-iii-clinical-database-demo-1.4',
                         'ICUSTAYS.csv'))

        events_vars = [
            'subject_id',
            'hadm_id',
            'icustay_id',
            'itemid',
            'charttime',
            'value',
            'valueuom',
        ]
        # because MIMIC's header is in upper case
        # however, demo code does not
        # events_vars = [item.upper() for item in events_vars]

        # define datatype to reduce the memory cost
        dtypes_dict = {
            'subject_id': 'int32',
            'hadm_id': 'int32',
            'icustay_id': 'object',
            'itemid': 'int32',
            'charttime': 'object',
            'value': 'object',
            'valueuom': 'object',
        }

        event_df = read_csv_to_df(
            os.path.join('examples', 'data_generation', 'data',
                         'mimic-iii-clinical-database-demo-1.4',
                         'CHARTEVENTS.csv'),
            usecols=events_vars,
            dtype=dtypes_dict,
            low_memory=True)

        # only keep the events we are interested in
        event_df = event_df[event_df['itemid'].isin(key_items)]

        oevent_df = read_csv_to_df(os.path.join(
            'examples', 'data_generation', 'data',
            'mimic-iii-clinical-database-demo-1.4', 'OUTPUTEVENTS.csv'),
                                   usecols=events_vars,
                                   dtype=dtypes_dict,
                                   low_memory=True)

        # only keep the events we are interested in
        oevent_df = oevent_df[oevent_df['itemid'].isin(key_items)]

        # merge chart events and output events into one event stream
        event_df = pd.concat([event_df, oevent_df])
        event_df['charttime'] = pd.to_datetime(event_df['charttime'])

        # Start data processing: compute the per-worker slice offsets
        # (n_patients_list holds chunk sizes; only `starts` is used here)
        n_patients_list, starts, n_jobs = partition_estimators(n_samples,
                                                               n_jobs=n_jobs)

        all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
            delayed(parallel_parse_tables)(
                patient_id_list=patient_id_list[starts[i]:starts[i + 1]],
                patient_df=patient_df,
                admission_df=admission_df,
                icu_df=icu_df,
                event_df=event_df,
                event_mapping_df=event_mapping_df,
                duration=duration,
                selection_method=selection_method,
                var_list=var_list,
                save_dir=self.save_dir) for i in range(n_jobs))

        # transpose the per-worker (data, ids) pairs and flatten each column
        all_results = list(map(list, zip(*all_results)))
        valid_data_list = unfold_parallel(all_results[0], n_jobs)
        valid_id_list = unfold_parallel(all_results[1], n_jobs)  # noqa: F841

        # collect the raw payload of every parsed patient and persist as JSON
        patient_data_list = []
        for p in valid_data_list:
            patient_data_list.append(p.data)

        with open(self.patient_data_loc, 'w') as outfile:
            json.dump(patient_data_list, outfile)

        print(patient_data_list)
Ejemplo n.º 6
0
    # key variables to track in the episode
    var_list = [
        'Capillary refill rate', 'Diastolic blood pressure',
        'Fraction inspired oxygen', 'Glascow coma scale eye opening',
        'Glascow coma scale motor response', 'Glascow coma scale total',
        'Glascow coma scale verbal response', 'Glucose', 'Heart Rate',
        'Height', 'Mean blood pressure', 'Oxygen saturation',
        'Respiratory rate', 'Systolic blood pressure', 'Temperature', 'Weight',
        'pH'
    ]

    # enforce and convert to lower case
    var_list = [item.lower() for item in var_list]

    # mapping from raw MIMIC item ids to (lower-cased) variable names; the
    # 'level2' column carries the human-readable variable name
    event_mapping_df = read_csv_to_df(
        os.path.join('resources', 'itemid_to_variable_map.csv'))
    event_mapping_df['level2'] = event_mapping_df['level2'].str.lower()

    # keep only the mapping rows for tracked variables; key_items ends up as
    # the list of raw item ids of interest
    key_df = event_mapping_df[event_mapping_df['level2'].isin(var_list)]
    key_items = key_df['itemid'].tolist()

    #################################################################
    # read in tables (the full MIMIC-III release ships gzip-compressed CSVs)
    patient_df = read_csv_to_df(os.path.join(mimic_data_loc,
                                             'PATIENTS.csv.gz'))
    patient_id_list = patient_df['subject_id'].tolist()

    admission_df = read_csv_to_df(
        os.path.join(mimic_data_loc, 'ADMISSIONS.csv.gz'))

    icu_df = read_csv_to_df(os.path.join(mimic_data_loc, 'ICUSTAYS.csv.gz'))