def create_report():
    # Get the dataframe from the structured data features step.
    df_json_encoded = standard_read_from_db('structured_data_features')
    structured_df = pd.read_json(df_json_encoded.decode())

    # Create the patient summary dataframe.
    patient_summary_df = make_patient_summary(structured_df)

    # Retrieve the top n terms dataframe from each XGBoost step and collect
    # them into a dictionary to pass into the hospital summary function.
    xgb_collections = [
        ('top_n_feat_los_df', 'feat_xgb_los'),
        ('top_n_neg_feat_los_df', 'neg_feat_xgb_los'),
        ('top_n_med_los_df', 'med_xgb_los'),
        ('top_n_neg_med_los_df', 'neg_med_xgb_los'),
        ('top_n_feat_readm_df', 'feat_xgb_readmission'),
        ('top_n_neg_feat_readm_df', 'neg_feat_xgb_readmission'),
        ('top_n_med_readm_df', 'med_xgb_readmission'),
        ('top_n_neg_med_readm_df', 'neg_med_xgb_readmission'),
    ]
    top_n_dict = {}
    for key, collection in xgb_collections:
        _, top_n_df_json_encoded, _ = xgb_read_from_db(collection)
        top_n_dict[key] = pd.read_json(top_n_df_json_encoded.decode())

    # Get the word2vec model for readmissions.
    readmission_word2vec_model_pickle = standard_read_from_db('readmission_word2vec')
    readmission_word2vec_model = pickle.loads(readmission_word2vec_model_pickle)

    # Create the hospital summary dataframe.
    hospital_summary_df = make_hospital_summary(structured_df, top_n_dict,
                                                readmission_word2vec_model)

    # Serialize the patient and hospital summary dataframes and persist them.
    patient_summary_df_json_encoded = patient_summary_df.to_json().encode()
    hospital_summary_df_json_encoded = hospital_summary_df.to_json().encode()
    summary_report_write_to_db(patient_summary_df_json_encoded,
                               hospital_summary_df_json_encoded)
def tokenize_all_notes():
    notes_encoded = standard_read_from_db('all_notes_cleansed')
    notes = notes_encoded.decode()
    tokens = tokenize(notes)
    tokens_string_encoded = str(tokens).encode()
    standard_write_to_db('word2vec_notes_tokenized', tokens_string_encoded)
def train():
    df_json_encoded = standard_read_from_db('readmission_classifier_tokenized')
    df = pd.read_json(df_json_encoded.decode())
    classifier = train_classifier(df)
    classifier_pickle = pickle.dumps(classifier)
    standard_write_to_db('token_classifier_model', classifier_pickle)
def create_entity_columns():
    df_json_encoded = standard_read_from_db('ner_labeled_notes')
    df = pd.read_json(df_json_encoded.decode())
    new_df = get_columns_from_notes(df)
    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db('entity_columns', new_df_json_encoded)
def create_file():
    df_json_encoded = standard_read_from_db('ner_cleaned_notes')
    df = pd.read_json(df_json_encoded.decode())
    # Write every cleaned note to a flat text file, stripping the BERT
    # subword markers first. Use a context manager so the file is closed.
    with open('all_note_lines.txt', 'w+') as out_file:
        for _, row in df.iterrows():
            notes = row['ner_cleaned_notes'].replace(' ##', '')
            print(notes, file=out_file)
def the_callable_that_will_be_used_as_a_task():
    df_json_encoded = standard_read_from_db(
        'the name of the collection used to save the output of the previous step')
    df = pd.read_json(df_json_encoded.decode())
    new_df = some_function_that_does_some_modification_to_a_dataframe(df)
    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db(
        'the name of the collection used to save the output of this step',
        new_df_json_encoded)
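# A minimal sketch of how a callable like the template above might be
# registered as a task, assuming the pipeline is orchestrated with Airflow's
# PythonOperator. The DAG id, schedule, upstream callable name, and dependency
# shown here are illustrative assumptions, not taken from the project.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG('hypothetical_pipeline_dag',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    previous_step = PythonOperator(
        task_id='previous_step',
        python_callable=some_previous_step_callable)  # hypothetical upstream task
    this_step = PythonOperator(
        task_id='this_step',
        python_callable=the_callable_that_will_be_used_as_a_task)
    previous_step >> this_step  # the upstream step must finish first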
def readmission_classifier_clean_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df = pd.read_json(df_json_encoded.decode())
    cleaned_notes = clean_notes(df)
    df['readmission_classifier_tokens'] = cleaned_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_classifier_tokenized', df_json_encoded)
def clean_all_notes():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    all_notes = combine_and_clean(df)
    all_notes_encoded = all_notes.encode()
    standard_write_to_db('all_notes_cleansed', all_notes_encoded)
def add_tokens_column():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    df = tokenize_by_sentence(df)
    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('ngram_prep_tokenize', df_json_encoded)
def readmission_one_hot():
    first_dataframe_json_encoded = standard_read_from_db('first_dataframe')
    first_dataframe = pd.read_json(first_dataframe_json_encoded.decode())

    word2vec_pickle = standard_read_from_db('readmission_word2vec')
    word2vec_model = pickle.loads(word2vec_pickle)

    flattened, key_words = find_readmit_similar_terms(word2vec_model)
    df_found_words = add_found_words_column(first_dataframe, key_words)
    df_one_hot = one_hot_encode_found_key_terms(df_found_words)

    df_term_cos_simil = pd.DataFrame()
    df_term_cos_simil['readmission_key_words'] = flattened

    df_one_hot_json_encoded = df_one_hot.to_json().encode()
    df_term_cos_simil_json_encoded = df_term_cos_simil.to_json().encode()
    one_hot_write_to_db(df_one_hot_json_encoded, df_term_cos_simil_json_encoded,
                        'readmission_one_hot')
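# find_readmit_similar_terms is defined elsewhere in the project. A plausible
# sketch of the idea, under the assumption that it queries the word2vec model
# for terms near a readmission-related seed word; the seed word, topn value,
# and return shapes here are illustrative guesses, not the project's code.
def find_readmit_similar_terms_sketch(word2vec_model, topn=10):
    seed_word = 'readmission'  # hypothetical seed term
    # most_similar returns (term, cosine similarity) pairs; keep the terms.
    similar_terms = [term for term, _similarity
                     in word2vec_model.wv.most_similar(seed_word, topn=topn)]
    flattened = [seed_word] + similar_terms
    key_words = set(flattened)
    return flattened, key_words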
def convert_to_likert():
    df_json_encoded = standard_read_from_db('readmission_tensorflow_predictions')
    df = pd.read_json(df_json_encoded.decode())
    likert_values = make_likert_column(df)
    df['readmission_likert'] = likert_values
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_likert', df_json_encoded)
def create_word2vec_model():
    tokens_string = standard_read_from_db('word2vec_notes_tokenized').decode()
    tokens = ast.literal_eval(tokens_string)
    # model = Word2Vec([tokens], size=100, window=10, min_count=2, workers=3)
    # Found 'readmission' among the tokens while testing; reduced min_count to 1
    # so rare terms like it stay in the vocabulary and no longer raise an error.
    # word2vec optimization opportunity for LDA
    model = Word2Vec([tokens], size=100, window=10, min_count=1, workers=3)
    model_pickled = pickle.dumps(model)
    standard_write_to_db('word2vec', model_pickled)
def create_vitals_ngrams():
    df_json_encoded = standard_read_from_db('ngram_prep_tokenize')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    df = get_vitals_and_generate_ngrams(df)
    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('vitals_ngrams', df_json_encoded)
def create_structured_data_features():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    df = add_los_age_and_binary_deathtime_columns(df)
    df = add_readmission_column(df)
    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('structured_data_features', df_json_encoded)
def run_tpot():
    combined_df_json_encoded = standard_read_from_db('combined_dataframe')
    combined_df_json = combined_df_json_encoded.decode()
    combined_df = pd.read_json(combined_df_json)
    tpot_pipeline_code, score = create_tpot_pipeline(combined_df, 'readmission')
    tpot_pipeline_code_encoded = tpot_pipeline_code.encode()
    score_encoded = str(score).encode()
    tpot_write_to_db(tpot_pipeline_code_encoded, score_encoded, 'tpot_readmission')
def clean_readmission_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    # Filter the dataframe down to readmission rows before cleaning.
    is_readmission = df['readmission'] == True
    df_readmissions = df[is_readmission]
    readmission_notes = combine_and_clean(df_readmissions)
    readmission_notes_encoded = readmission_notes.encode()
    standard_write_to_db('readmission_notes_cleansed', readmission_notes_encoded)
def train_and_predict():
    df_json_encoded = standard_read_from_db('readmission_classifier_tokens')
    df = pd.read_json(df_json_encoded.decode())
    X, y = create_dataset(df)
    classifier = train_classifier(X, y)
    probabilities = make_probability_column(classifier, X)
    df['readmission_classifier_probabilities'] = probabilities
    df_json_encoded = df.to_json().encode()
    classifier_pickle = pickle.dumps(classifier)
    readmission_classifier_write_to_db(df_json_encoded, classifier_pickle)
def make_predictions():
    df_json_encoded = standard_read_from_db('entity_columns')
    df = pd.read_json(df_json_encoded.decode())
    bst = train_xgb_model(df)
    df = add_predictions_column(df, bst)
    # Extract the 3 most important features from the trained booster.
    top_n_df = make_top_n_features(bst, df, 3)
    df_json_encoded = df.to_json().encode()
    top_n_df_json_encoded = top_n_df.to_json().encode()
    bst_pickle = pickle.dumps(bst)
    xgb_write_to_db('neg_med_xgb_los', df_json_encoded, top_n_df_json_encoded,
                    bst_pickle)
def clean_ner_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)
    # df = pd.read_parquet('short_ner_test.parquet')

    nemo_tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
        tokenizer_name='nemobert',
        pretrained_model_name='bert-base-uncased',
        tokenizer_model=None)

    cleaned_notes = []
    notes_length = len(df['notes'])
    num_cleaned = 0
    for note in df['notes']:
        # Strip characters that interfere with tokenization.
        note = re.sub(r'[\[\]\*,;]', '', note)
        cleaned_note = ''
        for line in note.split('\n'):
            if line != '' and '____' not in line:
                for sentence in sent_tokenize(line):
                    words = nemo_tokenizer.text_to_tokens(sentence)
                    # Chunked according to this approach:
                    # https://stackoverflow.com/questions/9671224/split-a-python-list-into-other-sublists-i-e-smaller-lists
                    max_len_chunks = [
                        words[x:x + MAX_SEQ_LENGTH]
                        for x in range(0, len(words), MAX_SEQ_LENGTH)
                    ]
                    for chunk in max_len_chunks:
                        cleaned_note += '\n' + ' '.join(chunk)
        num_cleaned += 1
        cleaned_notes.append(cleaned_note)
        print(str(num_cleaned / notes_length * 100) + '% notes cleaned')

    df['ner_cleaned_notes'] = cleaned_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_cleaned_notes', df_json_encoded)
def run_tpot():
    combined_df_json_encoded = standard_read_from_db('combined_dataframe')
    combined_df_json = combined_df_json_encoded.decode()
    combined_df = pd.read_json(combined_df_json)
    tpot_pipeline_code, score = create_tpot_pipeline(combined_df, 'los')
    tpot_pipeline_code_encoded = tpot_pipeline_code.encode()
    score_encoded = str(score).encode()
    tpot_write_to_db(tpot_pipeline_code_encoded, score_encoded, 'tpot_los')
def create_labeled_notes_column():
    lines = get_note_lines_from_file()
    df_json_encoded = standard_read_from_db('ner_cleaned_notes')
    df = pd.read_json(df_json_encoded.decode())
    length_array = get_line_length_array(df)

    # Reassemble each note from its labeled lines, using the per-note line
    # counts to slice the flat list of lines back into individual notes.
    begin = 0
    labeled_notes = []
    for length in length_array:
        end = begin + length
        labeled_notes.append(''.join(lines[begin:end]))
        begin = end

    df['labeled_notes'] = labeled_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_labeled_notes', df_json_encoded)
def combine():
    # infection_one_hot_df_json_encoded, _ = one_hot_read_from_db('infection_one_hot')
    readmission_one_hot_df_json_encoded, _ = one_hot_read_from_db(
        'readmission_one_hot')
    structured_features_df_json_encoded = standard_read_from_db(
        'structured_data_features')
    # vitals_ngrams_df_json_encoded = standard_read_from_db('vitals_ngrams')
    # ner_processed_df_json_encoded = standard_read_from_db('post_ner_inference')

    # infection_one_hot_df = pd.read_json(infection_one_hot_df_json_encoded.decode())
    readmission_one_hot_df = pd.read_json(
        readmission_one_hot_df_json_encoded.decode())
    structured_features_df = pd.read_json(
        structured_features_df_json_encoded.decode())
    # vitals_ngrams_df = pd.read_json(vitals_ngrams_df_json_encoded.decode())
    # ner_processed_df = pd.read_json(ner_processed_df_json_encoded.decode())

    # combined_df = infection_one_hot_df
    # combined_columns = combined_df.columns
    # for column in readmission_one_hot_df.columns:
    #     if column not in combined_columns:
    #         combined_df[column] = readmission_one_hot_df[column]
    # combined_columns = combined_df.columns

    # Start from the readmission one-hot dataframe and copy over any columns
    # it does not already have.
    combined_df = readmission_one_hot_df
    combined_columns = combined_df.columns
    for column in structured_features_df.columns:
        if column not in combined_columns:
            combined_df[column] = structured_features_df[column]
    combined_columns = combined_df.columns

    # for column in vitals_ngrams_df.columns:
    #     if column not in combined_columns:
    #         combined_df[column] = vitals_ngrams_df[column]
    # combined_columns = combined_df.columns

    # for column in ner_processed_df.columns:
    #     if column not in combined_columns:
    #         combined_df[column] = ner_processed_df[column]

    columns_to_remove = [
        'admission_id',
        'admittime',
        'deathtime',
        'dischtime',
        'patient_id',
        'notes',
        # 'note_entities_labeled',
        'index',
        # 'tokens_in_record',
        # 'vitals',
        # 'non-vitals',
        # 'vitals_ngrams',
    ]
    combined_df.drop(columns_to_remove, axis=1, inplace=True)
    combined_df = combined_df.dropna()

    combined_df_json_encoded = combined_df.to_json().encode()
    standard_write_to_db('combined_dataframe', combined_df_json_encoded)
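# A note on the column-copy pattern in combine() above: assigning a column
# from one dataframe into another aligns on the index, so the pattern assumes
# every source dataframe shares the index of readmission_one_hot_df. A hedged
# alternative that makes that shared-index assumption explicit is a join
# (a sketch, not the project's code):
#
#   new_columns = structured_features_df.columns.difference(
#       readmission_one_hot_df.columns)
#   combined_df = readmission_one_hot_df.join(structured_features_df[new_columns])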
def create_lda_model():
    notes = standard_read_from_db('all_notes_cleansed').decode()
    tokens = create_ngram_tokens(notes)
    dictionary, corpus, lda_model = make_model(tokens)
    lda_write_to_db(dictionary, corpus, lda_model)
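# The standard_read_from_db / standard_write_to_db helpers used throughout are
# defined elsewhere in the project. A minimal sketch of the contract they
# appear to implement, assuming a MongoDB backend that stores each step's
# output as one binary blob per collection; the connection settings, database
# name, and field name are illustrative assumptions.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # hypothetical connection settings
db = client['pipeline_db']  # hypothetical database name

def standard_write_to_db_sketch(collection_name, data_encoded):
    # Replace any previous result for this step with the new serialized blob.
    # Note: a single BSON document is capped at 16 MB, so very large
    # dataframes would need GridFS or similar instead.
    db[collection_name].delete_many({})
    db[collection_name].insert_one({'data': data_encoded})

def standard_read_from_db_sketch(collection_name):
    # Return the serialized blob written by the upstream step.
    return db[collection_name].find_one()['data']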