Example #1
def create_report():
    # get the dataframe from the structured data features step
    df_json_encoded = standard_read_from_db('structured_data_features')
    structured_df = pd.read_json(df_json_encoded.decode())
    
    # create patient summary df
    patient_summary_df = make_patient_summary(structured_df)

    # retrieve the top n terms dataframe from each xgboost step
    _, top_n_feat_los_df_json_encoded, _ = xgb_read_from_db('feat_xgb_los')
    top_n_feat_los_df = pd.read_json(top_n_feat_los_df_json_encoded.decode())

    _, top_n_neg_feat_los_df_json_encoded, _ = xgb_read_from_db('neg_feat_xgb_los')
    top_n_neg_feat_los_df = pd.read_json(top_n_neg_feat_los_df_json_encoded.decode())

    _, top_n_med_los_df_json_encoded, _ = xgb_read_from_db('med_xgb_los')
    top_n_med_los_df = pd.read_json(top_n_med_los_df_json_encoded.decode())

    _, top_n_neg_med_los_df_json_encoded, _ = xgb_read_from_db('neg_med_xgb_los')
    top_n_neg_med_los_df = pd.read_json(top_n_neg_med_los_df_json_encoded.decode())

    _, top_n_feat_readm_df_json_encoded, _ = xgb_read_from_db('feat_xgb_readmission')
    top_n_feat_readm_df = pd.read_json(top_n_feat_readm_df_json_encoded.decode())

    _, top_n_neg_feat_readm_df_json_encoded, _ = xgb_read_from_db('neg_feat_xgb_readmission')
    top_n_neg_feat_readm_df = pd.read_json(top_n_neg_feat_readm_df_json_encoded.decode())

    _, top_n_med_readm_df_json_encoded, _ = xgb_read_from_db('med_xgb_readmission')
    top_n_med_readm_df = pd.read_json(top_n_med_readm_df_json_encoded.decode())

    _, top_n_neg_med_readm_df_json_encoded, _ = xgb_read_from_db('neg_med_xgb_readmission')
    top_n_neg_med_readm_df = pd.read_json(top_n_neg_med_readm_df_json_encoded.decode())

    # Create a dictionary of all the top n dataframes.
    # This will be passed into the function that creates the hospital summary. 
    top_n_dict = {
            'top_n_feat_los_df': top_n_feat_los_df,
            'top_n_neg_feat_los_df': top_n_neg_feat_los_df,
            'top_n_med_los_df': top_n_med_los_df,
            'top_n_neg_med_los_df': top_n_neg_med_los_df,
            'top_n_feat_readm_df': top_n_feat_readm_df,
            'top_n_neg_feat_readm_df': top_n_neg_feat_readm_df,
            'top_n_med_readm_df': top_n_med_readm_df,
            'top_n_neg_med_readm_df': top_n_neg_med_readm_df
            }

    # Get the word2vec model for readmissions.
    readmission_word2vec_model_pickle = standard_read_from_db('readmission_word2vec')
    readmission_word2vec_model = pickle.loads(readmission_word2vec_model_pickle)
    
    # create hospital summary df
    hospital_summary_df = make_hospital_summary(structured_df, top_n_dict, readmission_word2vec_model)

    # serialize patient and hospital summary dataframes
    patient_summary_df_json_encoded = patient_summary_df.to_json().encode()
    hospital_summary_df_json_encoded = hospital_summary_df.to_json().encode()

    # Persist serialized dataframes into the database
    summary_report_write_to_db(patient_summary_df_json_encoded, hospital_summary_df_json_encoded)
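
The xgb_read_from_db helper used above is not defined in these examples. From the way its result is unpacked (and from the matching xgb_write_to_db call in a later example), it appears to return a three-tuple of dataframe JSON bytes, top-n features JSON bytes, and a pickled booster. A minimal sketch of the pair, assuming a pymongo-style database handle named db and an assumed document schema:

# Hypothetical sketch -- the real persistence layer is not shown in these examples.
def xgb_write_to_db(collection_name, df_json_encoded, top_n_df_json_encoded, bst_pickle):
    # Store all three xgboost artifacts under one collection name (assumed schema).
    db[collection_name].replace_one({}, {'dataframe': df_json_encoded,
                                         'top_n_features': top_n_df_json_encoded,
                                         'model': bst_pickle}, upsert=True)

def xgb_read_from_db(collection_name):
    # Return the artifacts in the order the callers unpack them.
    doc = db[collection_name].find_one()
    return doc['dataframe'], doc['top_n_features'], doc['model']
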
def tokenize_all_notes():
    notes_encoded = standard_read_from_db('all_notes_cleansed')
    notes = notes_encoded.decode()

    tokens = tokenize(notes)

    tokens_string_encoded = str(tokens).encode()
    standard_write_to_db('word2vec_notes_tokenized', tokens_string_encoded)
def train():
    df_json_encoded = standard_read_from_db('readmission_classifier_tokenized')
    df = pd.read_json(df_json_encoded.decode())

    classifier = train_classifier(df)
    classifier_pickle = pickle.dumps(classifier)

    standard_write_to_db('token_classifier_model', classifier_pickle)
Example #4
def create_entity_columns():
    df_json_encoded = standard_read_from_db('ner_labeled_notes')
    df = pd.read_json(df_json_encoded.decode())

    new_df = get_columns_from_notes(df)

    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db('entity_columns', new_df_json_encoded)
def create_file():
    df_json_encoded = standard_read_from_db('ner_cleaned_notes')
    df = pd.read_json(df_json_encoded.decode())
    # Write one cleaned note per block, closing the file automatically.
    with open('all_note_lines.txt', 'w') as out_file:
        for i, row in df.iterrows():
            notes = row['ner_cleaned_notes'].replace(' ##', '')
            print(notes, file=out_file)
Example #6
def the_callable_that_will_be_used_as_a_task():
    df_json_encoded = standard_read_from_db('the name of the collection used to save the output of the previous step')
    df = pd.read_json(df_json_encoded.decode())

    new_df = some_function_that_does_some_modification_to_a_dataframe(df)

    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db('the name of the collection used to save the output of this step', new_df_json_encoded)
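
Example #6 above is the template every task callable follows: read bytes from the upstream collection, decode and deserialize, transform, re-serialize, and write to a new collection. The standard_read_from_db and standard_write_to_db helpers themselves are not shown; a minimal sketch of what they might look like, again assuming a pymongo database handle named db and a single-document-per-collection layout (both are assumptions, not part of the source):

# Hypothetical sketch of the standard persistence helpers (assumed pymongo backend).
def standard_write_to_db(collection_name, payload_bytes):
    # Overwrite the single document that holds this step's output.
    db[collection_name].replace_one({}, {'payload': payload_bytes}, upsert=True)

def standard_read_from_db(collection_name):
    # Return the raw bytes written by the upstream step.
    return db[collection_name].find_one()['payload']
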
Example #7
def readmission_classifier_clean_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df = pd.read_json(df_json_encoded.decode())
    
    cleaned_notes = clean_notes(df)
    df['readmission_classifier_tokens'] = cleaned_notes

    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_classifier_tokenized', df_json_encoded)
Example #8
def clean_all_notes():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    all_notes = combine_and_clean(df)

    all_notes_encoded = all_notes.encode()
    standard_write_to_db('all_notes_cleansed', all_notes_encoded)
Example #9
def add_tokens_column():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = tokenize_by_sentence(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('ngram_prep_tokenize', df_json_encoded)
Example #10
def readmission_one_hot():
    first_dataframe_json_encoded = standard_read_from_db('first_dataframe')
    first_dataframe_json = first_dataframe_json_encoded.decode()
    first_dataframe = pd.read_json(first_dataframe_json)

    word2vec_pickle = standard_read_from_db('readmission_word2vec')
    word2vec_model = pickle.loads(word2vec_pickle)

    flattened, key_words = find_readmit_similar_terms(word2vec_model)
    df_found_words = add_found_words_column(first_dataframe, key_words)
    df_one_hot = one_hot_encode_found_key_terms(df_found_words)

    df_term_cos_simil = pd.DataFrame()
    df_term_cos_simil['readmission_key_words'] = flattened

    df_one_hot_json_encoded = df_one_hot.to_json().encode()
    df_term_cos_simil_json_encoded = df_term_cos_simil.to_json().encode()
    one_hot_write_to_db(df_one_hot_json_encoded,
                        df_term_cos_simil_json_encoded, 'readmission_one_hot')
def convert_to_likert():
    df_json_encoded = standard_read_from_db(
        'readmission_tensorflow_predictions')
    df = pd.read_json(df_json_encoded.decode())

    likert_values = make_likert_column(df)
    df['readmission_likert'] = likert_values

    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_likert', df_json_encoded)
Example #12
def create_word2vec_model():
    tokens_string = standard_read_from_db('word2vec_notes_tokenized').decode()
    tokens = ast.literal_eval(tokens_string)
    #model = Word2Vec([tokens], size=100, window=10, min_count=2, workers=3)
    #during testing, min_count=2 dropped the 'readmission' token from the vocabulary and caused an error, so min_count was reduced to keep it

    #word2vec optimization opportunity for LDA
    model = Word2Vec([tokens], size=100, window=10, min_count=1, workers=3)
    model_pickled = pickle.dumps(model)
    standard_write_to_db('word2vec', model_pickled)
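
A later step (readmission_one_hot in Example #10) unpickles a word2vec model and expands a set of key words via find_readmit_similar_terms. That helper is not shown, but a hedged sketch of the kind of query it likely performs with gensim (the seed words and topn value are assumptions):

# Hypothetical usage sketch: load the pickled Word2Vec model and query related terms.
model = pickle.loads(standard_read_from_db('word2vec'))
seed_words = ['readmission']  # assumed seed vocabulary
for word in seed_words:
    if word in model.wv:  # skip seeds that never made it into the vocabulary
        print(model.wv.most_similar(word, topn=10))
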
def create_vitals_ngrams():
    df_json_encoded = standard_read_from_db('ngram_prep_tokenize')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = get_vitals_and_generate_ngrams(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('vitals_ngrams', df_json_encoded)
Example #14
def create_structured_data_features():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = add_los_age_and_binary_deathtime_columns(df)
    df = add_readmission_column(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('structured_data_features', df_json_encoded)
Example #15
def run_tpot():
    combined_df_json_encoded = standard_read_from_db('combined_dataframe')
    combined_df_json = combined_df_json_encoded.decode()
    combined_df = pd.read_json(combined_df_json)

    tpot_pipeline_code, score = create_tpot_pipeline(combined_df,
                                                     'readmission')

    tpot_pipeline_code_encoded = tpot_pipeline_code.encode()
    score_encoded = str(score).encode()
    tpot_write_to_db(tpot_pipeline_code_encoded, score_encoded,
                     'tpot_readmission')
def clean_readmission_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    #filter dataframe by readmission==True
    is_readmission = df['readmission'] == True
    df_readmissions = df[is_readmission]

    readmission_notes = combine_and_clean(df_readmissions)

    readmission_notes_encoded = readmission_notes.encode()
    standard_write_to_db('readmission_notes_cleansed',
                         readmission_notes_encoded)
def train_and_predict():
    df_json_encoded = standard_read_from_db('readmission_classifier_tokens')
    df = pd.read_json(df_json_encoded.decode())

    X, y = create_dataset(df)
    classifier = train_classifier(X, y)

    probabilities = make_probability_column(classifier, X)
    df['readmission_classifier_probabilities'] = probabilities

    df_json_encoded = df.to_json().encode()
    classifier_pickle = pickle.dumps(classifier)

    readmission_classifier_write_to_db(df_json_encoded, classifier_pickle)
def make_predictions():
    df_json_encoded = standard_read_from_db('entity_columns')
    df = pd.read_json(df_json_encoded.decode())
    
    bst = train_xgb_model(df)
    
    df = add_predictions_column(df, bst)

    top_n_df = make_top_n_features(bst, df, 3)

    df_json_encoded = df.to_json().encode()
    top_n_df_json_encoded = top_n_df.to_json().encode()
    bst_pickle = pickle.dumps(bst)

    xgb_write_to_db('neg_med_xgb_los', df_json_encoded, top_n_df_json_encoded, bst_pickle)
Example #19
def clean_ner_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    #df = pd.read_parquet('short_ner_test.parquet')

    nemo_tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
        tokenizer_name='nemobert',
        pretrained_model_name='bert-base-uncased',
        tokenizer_model=None)

    cleaned_notes = []
    notes_length = len(df['notes'])
    num_cleaned = 0
    for note in df['notes']:
        note = re.sub(r'\[', '', note)
        note = re.sub(r'\]', '', note)
        note = re.sub(r'\*', '', note)
        note = re.sub(',', '', note)
        note = re.sub(';', '', note)

        cleaned_note = ''

        lines = note.split('\n')
        for line in lines:
            if line != '' and '____' not in line:
                sentences = sent_tokenize(line)
                for sentence in sentences:
                    words = nemo_tokenizer.text_to_tokens(sentence)
                    #Chunked according to this approach:
                    # https://stackoverflow.com/questions/9671224/split-a-python-list-into-other-sublists-i-e-smaller-lists
                    max_len_chunks = [
                        words[x:x + MAX_SEQ_LENGTH]
                        for x in range(0, len(words), MAX_SEQ_LENGTH)
                    ]
                    for chunk in max_len_chunks:
                        new_line = ''
                        for word in chunk:
                            new_line += ' ' + word
                        cleaned_note += '\n' + new_line.strip()
        num_cleaned += 1
        cleaned_notes.append(cleaned_note)
        print(f'{num_cleaned / notes_length * 100:.1f}% notes cleaned')

    df['ner_cleaned_notes'] = cleaned_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_cleaned_notes', df_json_encoded)
Example #20
def run_tpot():
    print('AA1')
    combined_df_json_encoded = standard_read_from_db('combined_dataframe')
    print('AA2')
    combined_df_json = combined_df_json_encoded.decode()
    print('AA3')
    combined_df = pd.read_json(combined_df_json)

    print('AA4')
    tpot_pipeline_code, score = create_tpot_pipeline(combined_df, 'los')

    print('AA5')
    tpot_pipeline_code_encoded = tpot_pipeline_code.encode()
    print('AA6')
    score_encoded = str(score).encode()
    print('AA7')
    tpot_write_to_db(tpot_pipeline_code_encoded, score_encoded, 'tpot_los')
Example #21
def create_labeled_notes_column():
    lines = get_note_lines_from_file()

    df_json_encoded = standard_read_from_db('ner_cleaned_notes')
    df = pd.read_json(df_json_encoded.decode())
    length_array = get_line_length_array(df)

    begin = 0
    end = 0
    labeled_notes = []
    for length in length_array:
        end += length
        note_lines = lines[begin:end]
        begin += length
        note = ''
        for line in note_lines:
            note += line
        labeled_notes.append(note)

    df['labeled_notes'] = labeled_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_labeled_notes',df_json_encoded)
Example #22
def combine():
    #infection_one_hot_df_json_encoded, _ = one_hot_read_from_db('infection_one_hot')
    readmission_one_hot_df_json_encoded, _ = one_hot_read_from_db(
        'readmission_one_hot')
    structured_features_df_json_encoded = standard_read_from_db(
        'structured_data_features')
    #vitals_ngrams_df_json_encoded = standard_read_from_db('vitals_ngrams')
    #ner_processed_df_json_encoded = standard_read_from_db('post_ner_inference')

    #infection_one_hot_df = pd.read_json(infection_one_hot_df_json_encoded.decode())
    readmission_one_hot_df = pd.read_json(
        readmission_one_hot_df_json_encoded.decode())
    structured_features_df = pd.read_json(
        structured_features_df_json_encoded.decode())
    #vitals_ngrams_df = pd.read_json(vitals_ngrams_df_json_encoded.decode())
    #ner_processed_df = pd.read_json(ner_processed_df_json_encoded.decode())

    #combined_df = infection_one_hot_df
    #combined_columns = combined_df.columns

    #for column in readmission_one_hot_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = readmission_one_hot_df[column]

    #combined_columns = combined_df.columns

    combined_df = readmission_one_hot_df
    combined_columns = combined_df.columns

    for column in structured_features_df.columns:
        if column not in combined_columns:
            combined_df[column] = structured_features_df[column]

    combined_columns = combined_df.columns

    #for column in vitals_ngrams_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = vitals_ngrams_df[column]

    #combined_columns = combined_df.columns

    #for column in ner_processed_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = ner_processed_df[column]

    columns_to_remove = [
        'admission_id',
        'admittime',
        'deathtime',
        'dischtime',
        'patient_id',
        'notes',
        #'note_entities_labeled',
        'index',
        #'tokens_in_record',
        #'vitals',
        #'non-vitals',
        #'vitals_ngrams',
    ]

    combined_df.drop(columns_to_remove, axis=1, inplace=True)
    combined_df = combined_df.dropna()

    combined_df_json_encoded = combined_df.to_json().encode()

    standard_write_to_db('combined_dataframe', combined_df_json_encoded)
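
The column-by-column loops above copy into the combined frame only those columns that are not already present. The same merge can be written more compactly with pandas index set operations; a sketch of the equivalent, assuming the row indexes of the two frames line up:

# Append only the columns of structured_features_df that combined_df does not already have.
new_cols = structured_features_df.columns.difference(combined_df.columns)
combined_df = pd.concat([combined_df, structured_features_df[new_cols]], axis=1)
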
Example #23
def create_lda_model():
    notes = standard_read_from_db('all_notes_cleansed').decode()
    tokens = create_ngram_tokens(notes)
    dictionary, corpus, lda_model = make_model(tokens)
    lda_write_to_db(dictionary, corpus, lda_model)
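
The create_ngram_tokens and make_model helpers are not shown. A plausible sketch of make_model using gensim, assuming create_ngram_tokens returns one token list per document and that the topic and pass counts are tunable (both values below are assumptions):

# Hypothetical sketch of make_model, assuming gensim and a list of token lists.
from gensim import corpora, models

def make_model(token_lists):
    dictionary = corpora.Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    lda_model = models.LdaModel(corpus=corpus, id2word=dictionary,
                                num_topics=10, passes=5)
    return dictionary, corpus, lda_model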