Ejemplo n.º 1
0
def mp_in_hospital_mimic(mimic_dir: str, save_dir: str, seed: int,
                         admission_only: bool):
    """
    Build the in-hospital mortality prediction data from the MIMIC dataset.

    Joins the filtered note "TEXT" from NOTEEVENTS.csv with the "HOSPITAL_EXPIRE_FLAG" of the matching admission
    from ADMISSIONS.csv, removes a few phrases that spell out the outcome, and passes the result to
    mimic_utils.save_mimic_split_patient_wise (70/10/20 split over patients for train/val/test sets).

    :param mimic_dir: directory that contains NOTEEVENTS.csv and ADMISSIONS.csv
    :param save_dir: output directory, forwarded to the split writer
    :param seed: seed, forwarded to the split writer
    :param admission_only: forwarded to mimic_utils.filter_notes as admission_text_only; when true the task name
        gets the suffix "_adm"
    """

    # task name, with a suffix for the admission-only variant
    suffix = "_adm" if admission_only else ""
    task_name = f"MP_IN{suffix}"

    # read the two source tables
    admissions = pd.read_csv(os.path.join(mimic_dir, "ADMISSIONS.csv"))
    notes = pd.read_csv(os.path.join(mimic_dir, "NOTEEVENTS.csv"))

    # reduce the notes to the ones relevant for the task
    notes = mimic_utils.filter_notes(notes,
                                     admissions,
                                     admission_text_only=admission_only)

    # attach the label of the matching admission to every note
    flags = admissions[["HADM_ID", "HOSPITAL_EXPIRE_FLAG"]]
    labelled_notes = pd.merge(notes, flags, how="left", on="HADM_ID")

    # notes without a matching admission carry no label and are discarded
    labelled_notes = labelled_notes.dropna(subset=['HOSPITAL_EXPIRE_FLAG'])

    # strip phrases that state the outcome explicitly; the phrases contain no
    # regex metacharacters, so they are matched as plain text
    outcome_phrases = (
        ('patient died', ''),
        ('patient deceased', ''),
        ('\ndeceased\n', '\n'),
    )
    text = labelled_notes['TEXT']
    for phrase, substitute in outcome_phrases:
        text = text.str.replace(phrase, substitute)
    labelled_notes['TEXT'] = text

    mimic_utils.save_mimic_split_patient_wise(
        labelled_notes,
        label_column='HOSPITAL_EXPIRE_FLAG',
        save_dir=save_dir,
        task_name=task_name,
        seed=seed)
def dia_groups_3_digits_mimic(mimic_dir: str, save_dir: str, seed: int, admission_only: bool):
    """
    Extracts information needed for the task from the MIMIC dataset. Namely "TEXT" column from NOTEEVENTS.csv and
    "ICD9_CODE" from DIAGNOSES_ICD.csv. Every ICD9 code is truncated to a group code (first four characters for
    codes starting with "V" or "E", first three characters otherwise) and the distinct group codes of an admission
    are joined with commas into the column "SHORT_CODES".
    Creates 70/10/20 split over patients for train/val/test sets.

    :param mimic_dir: directory that contains DIAGNOSES_ICD.csv, NOTEEVENTS.csv and ADMISSIONS.csv
    :param save_dir: output directory, forwarded to the split writer and to write_icd_codes_to_file
    :param seed: seed, forwarded to the split writer
    :param admission_only: forwarded to mimic_utils.filter_notes as admission_text_only; when true the task name
        gets the suffix "_adm"
    """

    # set task name
    task_name = "DIA_GROUPS_3_DIGITS"

    if admission_only:
        task_name = f"{task_name}_adm"

    # load dataframes
    # Codes are read as strings: left to type inference, codes consisting only of digits can be parsed as
    # integers, which drops leading zeros and shifts the truncated group code.
    mimic_diagnoses = pd.read_csv(os.path.join(mimic_dir, "DIAGNOSES_ICD.csv"),
                                  dtype={"ICD9_CODE": str})
    mimic_notes = pd.read_csv(os.path.join(mimic_dir, "NOTEEVENTS.csv"))
    mimic_admissions = pd.read_csv(os.path.join(mimic_dir, "ADMISSIONS.csv"))

    # filter notes
    mimic_notes = mimic_utils.filter_notes(
        mimic_notes, mimic_admissions, admission_text_only=admission_only)

    # only keep relevant columns
    mimic_diagnoses = mimic_diagnoses[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

    # drop all rows without diagnosis codes
    mimic_diagnoses = mimic_diagnoses.dropna(subset=['ICD9_CODE'])

    # create column SHORT_CODE: four characters for V and E codes, three characters for all other codes
    full_codes = mimic_diagnoses['ICD9_CODE'].astype(str)
    is_v_or_e_code = full_codes.str.startswith("V") | full_codes.str.startswith("E")
    mimic_diagnoses = mimic_diagnoses.assign(
        SHORT_CODE=full_codes.str[:4].where(is_v_or_e_code, full_codes.str[:3]))

    # remove duplicated code groups per admission
    mimic_diagnoses = mimic_diagnoses.drop_duplicates(
        ["HADM_ID", "SHORT_CODE"])

    # store all ICD codes for vectorization
    icd9_codes = mimic_diagnoses.SHORT_CODE.unique().tolist()

    # one row per admission with its group codes joined by commas
    grouped_codes = mimic_diagnoses.groupby(['HADM_ID', 'SUBJECT_ID'])['SHORT_CODE'].apply(
        ",".join).reset_index()

    # rename column
    grouped_codes = grouped_codes.rename(columns={'SHORT_CODE': 'SHORT_CODES'})

    # merge discharge summaries into diagnosis table
    notes_diagnoses_df = pd.merge(
        grouped_codes[['HADM_ID', 'SHORT_CODES']], mimic_notes, how='inner', on='HADM_ID')

    mimic_utils.save_mimic_split_patient_wise(notes_diagnoses_df,
                                              label_column='SHORT_CODES',
                                              save_dir=save_dir,
                                              task_name=task_name,
                                              seed=seed)

    # save file with all occurring codes
    write_icd_codes_to_file(icd9_codes, save_dir)
Ejemplo n.º 3
0
def los_mimic(mimic_dir: str, save_dir: str, seed: int, admission_only: bool):
    """
    Extracts information needed for the task from the MIMIC dataset. Namely "TEXT" column from NOTEEVENTS.csv and
    "ADMITTIME" and "DISCHTIME" from ADMISSIONS.csv.
    The length of stay in days (rounded to one decimal) is mapped to the label "LOS_label":
        <= 3: 0
        > 3 & <= 7: 1
        > 7 & <= 14: 2
        > 14: 3
    Notes of admissions whose "HOSPITAL_EXPIRE_FLAG" is not 0 are removed.
    Creates 70/10/20 split over patients for train/val/test sets.

    :param mimic_dir: directory that contains NOTEEVENTS.csv and ADMISSIONS.csv
    :param save_dir: output directory, forwarded to the split writer
    :param seed: seed, forwarded to the split writer
    :param admission_only: forwarded to mimic_utils.filter_notes as admission_text_only; when true the task name
        gets the suffix "_adm"
    """

    # set task name
    task_name = "LOS_WEEKS"
    if admission_only:
        task_name = f"{task_name}_adm"

    # load dataframes
    mimic_notes = pd.read_csv(os.path.join(mimic_dir, "NOTEEVENTS.csv"))
    mimic_admissions = pd.read_csv(os.path.join(mimic_dir, "ADMISSIONS.csv"))

    # filter notes
    mimic_notes = mimic_utils.filter_notes(mimic_notes,
                                           mimic_admissions,
                                           admission_text_only=admission_only)

    # Calculating the Length of Stay in days per admission
    admit_time = pd.to_datetime(mimic_admissions['ADMITTIME'])
    discharge_time = pd.to_datetime(mimic_admissions['DISCHTIME'])
    los_days = ((discharge_time - admit_time).dt.total_seconds() /
                (24 * 60 * 60)).round(1)

    # Creation of Label: right-inclusive bins (-inf, 3], (3, 7], (7, 14], (14, inf) -> 0, 1, 2, 3.
    # The cast to int fails if a length of stay could not be computed, so missing timestamps do not pass silently.
    los_label = pd.cut(los_days,
                       bins=[float("-inf"), 3, 7, 14, float("inf")],
                       labels=False).astype(int)

    # Keeping the required variables; assign() builds a new frame instead of writing into a slice of the
    # admissions table
    admission_labels = mimic_admissions[["HADM_ID", "HOSPITAL_EXPIRE_FLAG"]].assign(
        LOS_label=los_label)
    mimic_notes = mimic_notes[['HADM_ID', 'TEXT', "ROW_ID", "SUBJECT_ID"]]

    # Merging Mimic Notes data with Admissions data
    notes_adm_df = pd.merge(mimic_notes,
                            admission_labels,
                            how="left",
                            on="HADM_ID")

    # Removing records where the patient died within a given hospitalization
    # (notes without a matching admission have no flag and are removed as well)
    survived = notes_adm_df['HOSPITAL_EXPIRE_FLAG'] == 0
    notes_adm_df = notes_adm_df.loc[
        survived, ["ROW_ID", "SUBJECT_ID", "HADM_ID", "TEXT", "LOS_label"]]

    # The left merge turns the label column into floats as soon as one note has no matching admission;
    # every remaining row has a label, so restore the integer labels here.
    notes_adm_df = notes_adm_df.assign(
        LOS_label=notes_adm_df['LOS_label'].astype(int))

    mimic_utils.save_mimic_split_patient_wise(notes_adm_df,
                                              label_column='LOS_label',
                                              save_dir=save_dir,
                                              task_name=task_name,
                                              seed=seed)
def pro_plus_mimic(mimic_dir: str, save_dir: str, seed: int,
                   admission_only: bool):
    """
    Extracts information needed for the task of procedure prediction from the MIMIC dataset.
    The output data holds as labels all words of assigned procedures and all 3- and 4-digit codes
    (the first three and first four characters of each "ICD9_CODE" in PROCEDURES_ICD.csv). The labels of an
    admission are de-duplicated, sorted and joined with commas into the column "LABELS".
    Creates 70/10/20 split over patients for train/val/test sets.

    :param mimic_dir: directory that contains D_ICD_PROCEDURES.csv, PROCEDURES_ICD.csv, NOTEEVENTS.csv and
        ADMISSIONS.csv
    :param save_dir: output directory, forwarded to the split writer and to write_codes_to_file
    :param seed: seed, forwarded to the split writer
    :param admission_only: forwarded to mimic_utils.filter_notes as admission_text_only; when true the task name
        gets the suffix "_adm"
    """
    # local import so the function does not depend on a module-level import of re
    import re

    # set task name
    task_name = "PRO_PLUS"
    if admission_only:
        task_name = f"{task_name}_adm"

    # load dataframes
    mimic_pro_names = pd.read_csv(os.path.join(mimic_dir,
                                               "D_ICD_PROCEDURES.csv"),
                                  dtype={"ICD9_CODE": str})
    mimic_procedures = pd.read_csv(os.path.join(mimic_dir,
                                                "PROCEDURES_ICD.csv"),
                                   dtype={"ICD9_CODE": str})
    mimic_notes = pd.read_csv(os.path.join(mimic_dir, "NOTEEVENTS.csv"))
    mimic_admissions = pd.read_csv(os.path.join(mimic_dir, "ADMISSIONS.csv"))

    # filter notes
    mimic_notes = mimic_utils.filter_notes(mimic_notes,
                                           mimic_admissions,
                                           admission_text_only=admission_only)

    # only keep relevant columns
    mimic_procedures = mimic_procedures[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

    # drop all rows without procedure codes
    mimic_procedures = mimic_procedures.dropna(subset=['ICD9_CODE', 'HADM_ID'])

    # CREATE LABELS FOR PROCEDURE NAMES

    # remove punctuation and split words of procedure descriptions
    # regex=True is explicit because the default of str.replace differs between pandas versions; the
    # punctuation characters are escaped so that each one is matched literally inside the character class
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
    title_words = mimic_pro_names.LONG_TITLE.str.replace(punctuation_pattern, '', regex=True) \
        .str.lower().str.split()

    # remove stopwords and duplicates
    # the stopword list is loaded once; word order is irrelevant because all labels are sorted further below
    english_stopwords = set(stopwords.words('english'))
    mimic_pro_names["PRO_NAMES"] = title_words.apply(
        lambda words: " ".join(sorted(set(words) - english_stopwords))
        if isinstance(words, list) else "")  # a missing title contributes no words

    # CREATE LABELS FOR 3 AND 4 DIGIT CODES

    # Truncate codes to 3 digits (SHORT_CODE) and to 4 digits (LONG_CODE)
    full_codes = mimic_procedures.ICD9_CODE.astype(str)
    mimic_procedures = mimic_procedures.assign(SHORT_CODE=full_codes.str[:3],
                                               LONG_CODE=full_codes.str[:4])

    # MERGE DESCRIPTION WITH ADMISSION CODES
    admissions_with_pro_names = pd.merge(
        mimic_procedures,
        mimic_pro_names[["ICD9_CODE", "PRO_NAMES"]],
        on="ICD9_CODE",
        how="left")
    # codes without a description contribute no words
    admissions_with_pro_names[
        "PRO_NAMES"] = admissions_with_pro_names.PRO_NAMES.fillna("")

    # GROUP CODES BY ADMISSION
    # one row per admission; each of the three columns is joined with spaces
    combined_df = admissions_with_pro_names.groupby('HADM_ID')[
        ['SHORT_CODE', 'LONG_CODE', 'PRO_NAMES']].agg(" ".join).reset_index()

    # COMBINE 3-DIGIT CODES, 4-DIGIT CODES AND PROCEDURE NAMES

    # combine into one column
    label_text = combined_df["SHORT_CODE"] + " " + combined_df[
        "LONG_CODE"] + " " + combined_df["PRO_NAMES"]

    # remove duplicates, sort and join with comma
    # split() without a separator drops the empty tokens that empty descriptions would otherwise leave behind
    combined_df["LABELS"] = label_text.str.split().apply(
        lambda tokens: ",".join(sorted(set(tokens))))

    # merge discharge summaries into procedures table
    notes_procedures_df = pd.merge(combined_df[['HADM_ID', 'LABELS']],
                                   mimic_notes,
                                   how='inner',
                                   on='HADM_ID')

    # collect all possible tokens aka classes
    all_tokens = set()
    for labels in notes_procedures_df["LABELS"]:
        all_tokens.update(labels.split(","))

    mimic_utils.save_mimic_split_patient_wise(notes_procedures_df,
                                              label_column='LABELS',
                                              save_dir=save_dir,
                                              task_name=task_name,
                                              seed=seed)

    write_codes_to_file(sorted(all_tokens), save_dir)
Ejemplo n.º 5
0
def split_admission_discharge(mimic_dir: str, save_dir: str, seed: int):
    """
    Split every note into an admission part and a discharge part.

    Sections are extracted from the note text by their headings (via extract_section). Notes are kept only if
    at least one of chief complaint / present illness / medical history AND at least one of hospital course /
    discharge diagnosis is non-empty. The admission sections are combined into "TEXT_ADMISSION", the discharge
    sections into "TEXT_DISCHARGE", and the result is passed to mimic_utils.save_mimic_split_patient_wise.

    :param mimic_dir: directory that contains NOTEEVENTS.csv and ADMISSIONS.csv
    :param save_dir: output directory, forwarded to the split writer
    :param seed: seed, forwarded to the split writer
    """

    def join_titled_sections(frame, titled_columns):
        """Concatenate "<title><section text>" for every (title, column) pair, separated by blank lines."""
        pieces = [title + frame[column].astype(str) for title, column in titled_columns]
        combined = pieces[0]
        for piece in pieces[1:]:
            combined = combined + '\n\n' + piece
        return combined

    # set task name
    task_name = "ADM_DIS_MATCH"

    # load dataframes
    mimic_notes = pd.read_csv(os.path.join(mimic_dir, "NOTEEVENTS.csv"),
                              usecols=["ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTDATE", "CATEGORY", "DESCRIPTION",
                                       "TEXT"])

    mimic_admissions = pd.read_csv(os.path.join(mimic_dir, "ADMISSIONS.csv"))

    # filter notes
    mimic_notes = mimic_utils.filter_notes(mimic_notes, mimic_admissions, admission_text_only=False)

    # a heading is either a single string or a list of alternative spellings
    admission_sections = {
        "CHIEF_COMPLAINT": "chief complaint:",
        "PRESENT_ILLNESS": "present illness:",
        "MEDICAL_HISTORY": "medical history:",
        "MEDICATION_ADM": "medications on admission:",
        "ALLERGIES": ["allergy:", "allergies:"],
        "PHYSICAL_EXAM": ["physical exam:", "physical examination:"],
        "FAMILY_HISTORY": "family history:",
        "SOCIAL_HISTORY": "social history:"
    }

    discharge_sections = {
        "PROCEDURE": "procedure:",
        "MEDICATION_DIS": ["discharge medications:", "discharge medication:"],
        "DIAGNOSIS_DIS": ["discharge diagnosis:", "discharge diagnoses:"],
        "CONDITION": "discharge condition:",
        "PERTINENT_RESULTS": "pertinent results:",
        "HOSPITAL_COURSE": "hospital course:"
    }

    # replace linebreak indicators: every newline character becomes the two characters backslash + n
    # (literal replacement, spelled out because the regex default of str.replace differs between pandas versions)
    mimic_notes['TEXT'] = mimic_notes['TEXT'].str.replace("\n", "\\n", regex=False)

    # extract each section by regex
    for key in list(admission_sections.keys()) + list(discharge_sections.keys()):
        section = admission_sections[key] if key in admission_sections else discharge_sections[key]

        # handle multiple heading possibilities
        if isinstance(section, list):
            mimic_notes[key] = None
            for heading in section:
                # later spellings only fill notes for which no earlier spelling produced a value
                mimic_notes.loc[mimic_notes[key].isnull(), key] = extract_section(mimic_notes, heading)
        else:
            mimic_notes[key] = extract_section(mimic_notes, section)

        # turn the backslash + n markers into spaces, trim, and represent missing sections as ""
        mimic_notes[key] = mimic_notes[key].str.replace("\\n", " ", regex=False)
        mimic_notes[key] = mimic_notes[key].str.strip()
        mimic_notes[key] = mimic_notes[key].fillna("")

        # blank out sections that start with "[]"; .loc writes into mimic_notes itself, whereas
        # chained indexing (frame[mask][key] = ...) would only modify a temporary copy
        mimic_notes.loc[mimic_notes[key].str.startswith("[]"), key] = ""

    # filter notes with missing main admission information
    mimic_notes = mimic_notes[(mimic_notes.CHIEF_COMPLAINT != "") | (mimic_notes.PRESENT_ILLNESS != "") |
                              (mimic_notes.MEDICAL_HISTORY != "")]

    # filter notes with missing main information
    mimic_notes = mimic_notes[(mimic_notes.HOSPITAL_COURSE != "") | (mimic_notes.DIAGNOSIS_DIS != "")]

    # add section headers and combine into TEXT_ADMISSION
    mimic_notes = mimic_notes.assign(TEXT_ADMISSION=join_titled_sections(mimic_notes, [
        ("CHIEF COMPLAINT: ", "CHIEF_COMPLAINT"),
        ("PRESENT ILLNESS: ", "PRESENT_ILLNESS"),
        ("MEDICAL HISTORY: ", "MEDICAL_HISTORY"),
        ("MEDICATION ON ADMISSION: ", "MEDICATION_ADM"),
        ("ALLERGIES: ", "ALLERGIES"),
        ("PHYSICAL EXAM: ", "PHYSICAL_EXAM"),
        ("FAMILY HISTORY: ", "FAMILY_HISTORY"),
        ("SOCIAL HISTORY: ", "SOCIAL_HISTORY"),
    ]))

    # add section headers and combine into TEXT_DISCHARGE
    mimic_notes = mimic_notes.assign(TEXT_DISCHARGE=join_titled_sections(mimic_notes, [
        ("MAJOR SURGICAL / INVASIVE PROCEDURE: ", "PROCEDURE"),
        ("PERTINENT RESULTS: ", "PERTINENT_RESULTS"),
        ("HOSPITAL COURSE: ", "HOSPITAL_COURSE"),
        ("DISCHARGE MEDICATIONS: ", "MEDICATION_DIS"),
        ("DISCHARGE DIAGNOSES: ", "DIAGNOSIS_DIS"),
        ("DISCHARGE CONDITION: ", "CONDITION"),
    ]))

    mimic_utils.save_mimic_split_patient_wise(
        df=mimic_notes[['ROW_ID', 'SUBJECT_ID', 'TEXT_ADMISSION', 'TEXT_DISCHARGE']],
        label_column=None,
        column_list=['ID', 'TEXT_ADMISSION', 'TEXT_DISCHARGE'],
        save_dir=save_dir,
        task_name=task_name,
        seed=seed)