Example #1
def filter_sections(report):
    # Remove the INDICATION, HISTORY, COMPARISON and TECHNIQUE sections:
    # they may mention disease keywords (e.g. in the reason for the exam)
    # without implying that the finding is actually present in the image.
    sections, section_names, section_idx = section_text(report)
    filtered = [
        i for i, x in enumerate(section_names)
        if x not in ['indication', 'history', 'comparison', 'technique']
    ]
    report = "".join(sections[i] for i in filtered)
    report = report.replace('\n', '').replace('\r', '')
    return report
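A minimal usage sketch for filter_sections, assuming section_text is the MIMIC-CXR sectioning helper the function above expects to have in scope; the sample report text below is illustrative, not from the original example:

# Illustrative report text; `section_text` must resolve to the sectioning
# helper the example above expects.
raw_report = (
    "INDICATION: Evaluate for pneumonia.\n\n"
    "FINDINGS: No focal consolidation.\n\n"
    "IMPRESSION: No acute cardiopulmonary process.\n"
)
print(filter_sections(raw_report))
# Expected (if the parser recognizes these headers): the findings and
# impression text only, with newlines removed.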
Example #2
def load_mimic_reports(filename, all_ids):
    """ Load the MIMIC-CXR reports from the zip file, using a set of study and subject IDs.
    """
    if not filename.endswith('.zip'):
        raise Exception(
            "Unrecognizable format; expecting a .zip file for the MIMIC reports."
        )

    # load and parse reports from file
    id2data = dict()
    with ZipFile(filename) as zfile:
        for study_id, subject_id in all_ids:
            fn = get_filename_from_ids(study_id, subject_id)
            # read from zipfile
            with zfile.open(fn, 'r') as infile:
                text = infile.read().decode('utf-8')

            # parse sections
            sections, section_names, section_idx = sp.section_text(text)
            findings = sections[section_names.index('findings')]
            impression = sections[section_names.index('impression')]

            findings_start = section_idx[section_names.index('findings')]
            background = text[:findings_start]

            findings = clean_findings(findings)
            impression = clean_impression(impression)
            background = clean_background(background)

            data = {
                'study_id': study_id,
                'subject_id': subject_id,
                'findings': findings,
                'impression': impression,
                'background': background
            }
            id2data[study_id] = data
    return id2data
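A hedged usage sketch for load_mimic_reports; the archive name and ID pair below are placeholders, and the helpers it calls (get_filename_from_ids, clean_findings, clean_impression, clean_background, sp.section_text) are assumed to be defined in the same module as the function above:

# Placeholder archive path and IDs, for illustration only.
all_ids = [(12345678, 87654321)]  # (study_id, subject_id) pairs
id2data = load_mimic_reports('mimic-cxr-reports.zip', all_ids)
print(id2data[12345678]['impression'])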
Example #3
def main(args):
    args = parser.parse_args(args)

    reports_path = Path(args.reports_path)
    output_path = Path(args.output_path)

    if not output_path.exists():
        output_path.mkdir()

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    custom_section_names, custom_indices = sp.custom_mimic_cxr_rules()

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = os.listdir(reports_path)
    p_grp_folders = [
        p for p in p_grp_folders if p.startswith('p') and len(p) == 3
    ]
    p_grp_folders.sort()

    # patient_studies will hold the text for use in NLP labeling
    patient_studies = []

    # study_sections will have an element for each study
    # this element will be a list, each element having text for a specific section
    study_sections = []
    for p_grp in p_grp_folders:
        # get patient folders, usually ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = os.listdir(cxr_path)
        p_folders = [p for p in p_folders if p.startswith('p')]
        p_folders.sort()

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports
            studies = os.listdir(patient_path)
            studies = [
                s for s in studies if s.endswith('.txt') and s.startswith('s')
            ]

            for s in studies:
                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = fp.read()

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    continue

                # split text into sections
                sections, section_names, section_idx = sp.section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    continue

                # grab the *last* section with the given title,
                # prioritizing impression > findings > last_paragraph > comparison

                # "last_paragraph" holds the text up to the end of the report:
                # many reports are simple, with a single section header
                # followed by a few paragraphs, and those paragraphs are
                # grouped into the "last_paragraph" section

                # "comparison" is an unusual fallback, but when no other
                # sections exist the radiologist has usually written the
                # report in the comparison section
                idx = -1
                for sn in ('impression', 'findings', 'last_paragraph',
                           'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find any sections we can use :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of the conclusion section
                    patient_studies.append([s_stem, sections[idx].strip()])

                study_sectioned = [s_stem]
                for sn in ('impression', 'findings', 'last_paragraph',
                           'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)
                study_sections.append(study_sectioned)
    # write distinct files to facilitate modular processing
    if len(patient_studies) > 0:
        # write out a single CSV with the sections
        with open(output_path / 'mimic_cxr_sectioned.csv', 'w') as fp:
            csvwriter = csv.writer(fp)
            # write header
            csvwriter.writerow([
                'study', 'impression', 'findings', 'last_paragraph',
                'comparison'
            ])
            for row in study_sections:
                csvwriter.writerow(row)

        if args.no_split:
            # write all the reports out to a single file
            with open(output_path / 'mimic_cxr_sections.csv', 'w') as fp:
                csvwriter = csv.writer(fp)
                for row in patient_studies:
                    csvwriter.writerow(row)
        else:
            # write ~22 files with ~10k reports each
            n = 0
            jmp = 10000

            while n < len(patient_studies):
                n_fn = n // jmp
                with open(output_path / f'mimic_cxr_{n_fn:02d}.csv',
                          'w') as fp:
                    csvwriter = csv.writer(fp)
                    for row in patient_studies[n:n + jmp]:
                        csvwriter.writerow(row)
                n += jmp
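The main functions above read from a module-level parser that is not shown. A minimal sketch of what it might look like, based only on the attributes the code reads (args.reports_path, args.output_path, args.no_split); the exact flag spellings and help strings are assumptions:

import argparse

# Hypothetical parser; only the destination names come from the code above.
parser = argparse.ArgumentParser(description='Section MIMIC-CXR free-text reports.')
parser.add_argument('--reports_path', required=True,
                    help='path to the extracted MIMIC-CXR report folders (p00, p01, etc)')
parser.add_argument('--output_path', required=True,
                    help='directory for the sectioned CSV output')
parser.add_argument('--no_split', action='store_true',
                    help='write a single CSV instead of ~10k-row chunks')

# e.g. main(['--reports_path', '/data/mimic-cxr/files',
#            '--output_path', '/data/mimic-cxr/sections'])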
Example #4
def main(args):
    args = parser.parse_args(args)

    reports_path = Path(args.reports_path)
    output_path = Path(args.output_path)

    if not output_path.exists():
        output_path.mkdir()

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    custom_section_names, custom_indices = sp.custom_mimic_cxr_rules()

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = os.listdir(reports_path)
    p_grp_folders = [
        p for p in p_grp_folders if p.startswith('p') and len(p) == 3
    ]
    p_grp_folders.sort()

    patient_studies = []
    for p_grp in p_grp_folders:
        # patient folders within this MIMIC-CXR group folder
        cxr_path = reports_path / p_grp
        p_folders = os.listdir(cxr_path)
        p_folders = [p for p in p_folders if p.startswith('p')]
        p_folders.sort()

        # For each patient in this grouping folder
        for p in tqdm(p_folders):
            patient_path = cxr_path / p
            studies = os.listdir(patient_path)
            studies = [
                s for s in studies if s.endswith('.txt') and s.startswith('s')
            ]

            for s in studies:
                with open(patient_path / s, 'r') as fp:
                    text = fp.read()

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    continue

                # split text into sections
                sections, section_names, section_idx = sp.section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    continue

                # grab the *last* section with the given title
                # prioritize impression > findings > last paragraph > comparison

                # note: "comparison" is an unusual fallback, but when no
                # other sections exist the radiologist has usually written
                # the report in the comparison section
                idx = -1
                for sn in ('impression', 'findings', 'last_paragraph',
                           'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find anything :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of this section
                    patient_studies.append([s_stem, sections[idx].strip()])

            # if len(patient_studies) > 0:
            #     with open(output_path / f'{p}.csv', 'w') as fp:
            #         csvwriter = csv.writer(fp)
            #         for row in patient_studies:
            #             csvwriter.writerow(row)

    # write distinct files to facilitate modular processing by chexpert
    if len(patient_studies) > 0:
        n = 0
        jmp = 10000

        while n < len(patient_studies):
            n_fn = n // jmp
            with open(output_path / f'mimic_cxr_{n_fn:03d}.csv', 'w') as fp:
                csvwriter = csv.writer(fp)
                for row in patient_studies[n:n + jmp]:
                    csvwriter.writerow(row)
            n += jmp
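Examples #3 and #4 both call a list_rindex helper that is not shown. A likely minimal implementation, returning the index of the last occurrence of an element in a list, would be:

def list_rindex(l, s):
    """Return the index of the last occurrence of s in the list l."""
    return len(l) - 1 - l[::-1].index(s)

# e.g. list_rindex(['impression', 'findings', 'impression'], 'impression') -> 2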