Example 1
import re
import nltk
import pandas as pd

# nlp (a spaCy pipeline), sent_tokenize_rules, process_section, summaries,
# notes and summary_path are defined elsewhere in the source.
def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections': note_sections})
    section_frame.apply(process_section, args=(note, processed_sections,), axis=1)
    return processed_sections
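The helper above delegates the per-section work to a process_section callback that is not shown. A minimal sketch of what such a callback could look like follows; the cleaning step is an assumption, not the original implementation:

def process_section(row, note, processed_sections):
    # row is one row of section_frame (apply with axis=1); keep non-empty sections
    section = row['sections'].strip()
    if section:
        processed_sections.append(section)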
def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    for c, _ in enumerate(note_sections):
        note_sections[c] = re.sub(r'[0-9]+\.', '', note_sections[c])  # remove '1.', '2.'
        note_sections[c] = re.sub(r'(-){2,}|_{2,}|={2,}', '', note_sections[c])  # remove _____
        note_sections[c] = re.sub(r'dr\.', 'doctor', note_sections[c])
        note_sections[c] = re.sub(r'm\.d\.', 'md', note_sections[c])
    # de-identified [** ... **] spans; repl is defined elsewhere in the source
    regex = r'(\[\*\*[^*]*\*\*\])'
    processed_sections = [re.sub(regex, repl, i) for i in note_sections]
    processed_sections = [nlp(i.strip()) for i in processed_sections if i is not None and len(i.strip()) > 0]
    return processed_sections  # list of spaCy docs
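Both helpers also rely on a spaCy pipeline named nlp and a repl replacement for the de-identified spans; neither is shown, so the following setup is a sketch under assumed names and an assumed replacement string:

import spacy

# hypothetical setup: a small general-purpose English pipeline
# (the original source may load a clinical model instead)
nlp = spacy.load('en_core_web_sm')

def repl(match):
    # hypothetical replacement for de-identified [** ... **] spans
    return 'unk'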
def extract_description(subject_id, episode_id):
    """
    Extract the input to the summariser for a hospital course summary.
    * currently focuses on episodes covered by one and only one discharge summary
    """

    date = summaries[(summaries.SUBJECT_ID == subject_id)
                     & (summaries.HADM_ID == episode_id)].CHARTDATE.iloc[0]

    # extract
    relevent_rows = notes[(notes.SUBJECT_ID == subject_id)
                          & (notes.HADM_ID == episode_id)
                          & (notes.CHARTDATE <= date) &
                          (notes.CATEGORY != 'Discharge summary')]

    text = relevent_rows.TEXT.str.cat(sep=' ')

    # tokenisation
    sents = sent_tokenize_rules(text)

    output = ""

    for sent in sents:

        # convert to lower case
        sent = sent.lower()

        # replace confidential tokens
        sent = re.sub(r"\[\*+.+\*+\]", "unk", sent)

        # remove patterns like "**** CPT codes ****"
        sent = re.sub(r'^\*+.+\*+$', "", sent)

        # replace new line character
        sent = sent.replace('\n', ' ')
        sent = sent.replace('/', ' ')

        doc = nlp(sent)
        output += " ".join([token.text
                            for token in doc if token.text.strip()]) + " "

    return '<sec> ' + output.strip() + '\n' if output else None
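A usage sketch for the function above; the SUBJECT_ID/HADM_ID values and the output path are placeholders, and summaries/notes are assumed to be DataFrames loaded from MIMIC's NOTEEVENTS:

# hypothetical IDs; in practice these would come from iterating over `summaries`
desc = extract_description(subject_id=12345, episode_id=67890)
if desc is not None:
    with open('data/descriptions.txt', 'a') as f:
        f.write(desc)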
def extract_summary(file_name):
    """
    Generate hospital course summary in the required format for LeafNATS
    ----------------
    Args:
        file_name: name of the file for the raw summary
    Returns:
        summary: processed hospital course summary
    """

    # summary_path is defined elsewhere in the source
    with open(summary_path + file_name, 'r') as f:
        summary = f.read()

    sections = sent_tokenize_rules(summary)

    output = ""

    for sec in sections:
        # convert to lower case
        sec = sec.lower()

        # replace confidential tokens
        sec = re.sub(r"\[\*+.+\*+\]", "unk", sec)

        # remove patterns like "**** CPT codes ****"
        sec = re.sub(r'^\*+.+\*+$', "", sec)

        # replace new line character
        sec = sec.replace('\n', ' ')
        sec = sec.replace('/', ' ')

        for sent in nltk.sent_tokenize(sec):
            output += '<s> ' + ' '.join([
                token for token in nltk.word_tokenize(sent) if token.strip()
            ]) + ' </s> '

    return output.strip()
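A driver sketch for extract_summary, assuming each raw summary sits in its own file under summary_path (the file listing and the output path are assumptions):

import os

# hypothetical: convert every raw summary to the LeafNATS <s> ... </s> format
with open('data/summaries_leafnats.txt', 'w') as out:
    for file_name in sorted(os.listdir(summary_path)):
        out.write(extract_summary(file_name) + '\n')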
Example 5
import re
import sys
import pandas as pd
from tqdm import tqdm

# notes, category, discharge and the spaCy pipeline nlp come from earlier in the source script
notes = notes[notes.CATEGORY == category]

# for other notes
if len(sys.argv) < 2:
    print('Please specify the batch number.')
    sys.exit()

batch = sys.argv[1]
other_notes = pd.read_csv('data/notes_batch_{}.csv'.format(batch))

print("start processing: batch {}".format(batch))
to_process = notes if discharge else other_notes

for text in tqdm(to_process.TEXT):
    sents = sent_tokenize_rules(text)
    for sent in sents:
        sent = re.sub("\[\*\*.{0,15}.*?\*\*\]", "unk", sent)
        if not sent or sent.strip() == '\n':
            continue

        sent = sent.replace('\n', ' ')
        sent = sent.replace('/', ' ')

        tokens = nlp(sent)

        for token in tokens:
            # token.string was removed in spaCy v3; token.text plus strip() is equivalent here
            word = token.text.strip().lower()

            if not word:
                continue
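The listing ends before showing what happens to each surviving word. One plausible continuation, sketched as a self-contained fragment under the assumption that the cleaned tokens are re-joined and appended to a per-batch output file (the example sentences and path are hypothetical):

# hypothetical continuation: re-join cleaned tokens and append them to a batch file
processed_sents = []
for raw in ['The patient was admitted with unk .', 'Follow up with doctor .']:
    words = [w.strip().lower() for w in raw.split() if w.strip()]
    processed_sents.append(' '.join(words))

with open('data/processed_batch_example.txt', 'a') as f:
    f.write('\n'.join(processed_sents) + '\n')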