Example #1
def the_callable_that_will_be_used_as_a_task():
    df_json_encoded = standard_read_from_db('the name of the collection used to save the output of the previous step')
    df = pd.read_json(df_json_encoded.decode())

    new_df = some_function_that_does_some_modification_to_a_dataframe(df)

    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db(
        'the name of the collection used to save the output of this step',
        new_df_json_encoded)
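
All of the steps below follow this pattern: read bytes from a shared store, decode, transform, encode, and write for the next step. The standard_read_from_db / standard_write_to_db helpers themselves are never shown in these examples; a minimal sketch of what they could look like, assuming a MongoDB backend reached through pymongo (the connection URL, database name, and one-document-per-collection-name layout are illustrative assumptions, not the project's actual implementation):

import pymongo

# Hypothetical backend: the tasks in these examples only assume that these two
# functions exist and that they round-trip raw bytes keyed by a collection name.
_client = pymongo.MongoClient('mongodb://localhost:27017/')
_db = _client['pipeline_db']

def standard_write_to_db(name, payload):
    # Upsert a single document holding the bytes produced by this step.
    _db[name].replace_one({'_id': name}, {'_id': name, 'payload': payload},
                          upsert=True)

def standard_read_from_db(name):
    # Return the bytes written by the step that produced this collection.
    return _db[name].find_one({'_id': name})['payload']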
def train():
    df_json_encoded = standard_read_from_db('readmission_classifier_tokenized')
    df = pd.read_json(df_json_encoded.decode())

    classifier = train_classifier(df)
    classifier_pickle = pickle.dumps(classifier)

    standard_write_to_db('token_classifier_model', classifier_pickle)
def tokenize_all_notes():
    notes_encoded = standard_read_from_db('all_notes_cleansed')
    notes = notes_encoded.decode()

    tokens = tokenize(notes)

    tokens_string_encoded = str(tokens).encode()
    standard_write_to_db('word2vec_notes_tokenized', tokens_string_encoded)
def create_entity_columns():
    df_json_encoded = standard_read_from_db('ner_labeled_notes')
    df = pd.read_json(df_json_encoded.decode())

    new_df = get_columns_from_notes(df)

    new_df_json_encoded = new_df.to_json().encode()
    standard_write_to_db('entity_columns', new_df_json_encoded)
Example #5
def clean_all_notes():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    all_notes = combine_and_clean(df)

    all_notes_encoded = all_notes.encode()
    standard_write_to_db('all_notes_cleansed', all_notes_encoded)
Example #6
def readmission_classifier_clean_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df = pd.read_json(df_json_encoded.decode())
    
    cleaned_notes = clean_notes(df)
    df['readmission_classifier_tokens'] = cleaned_notes

    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_classifier_tokenized', df_json_encoded)
Example #7
def make_predictions():
    readmission_classifier_df_json_encoded, _ = readmission_classifier_read_from_db()
    readmission_classifier_df = pd.read_json(
        readmission_classifier_df_json_encoded.decode())

    xgb_demo_df_json_encoded, top_n_demo_df_json_encoded, _ = xgb_read_from_db(
        'demo_xgb_readmission')
    xgb_demo_df = pd.read_json(xgb_demo_df_json_encoded.decode())
    top_n_demo_df = pd.read_json(top_n_demo_df_json_encoded.decode())

    xgb_feat_df_json_encoded, top_n_feat_df_json_encoded, _ = xgb_read_from_db(
        'feat_xgb_readmission')
    xgb_feat_df = pd.read_json(xgb_feat_df_json_encoded.decode())
    top_n_feat_df = pd.read_json(top_n_feat_df_json_encoded.decode())

    xgb_neg_feat_df_json_encoded, top_n_neg_feat_df_json_encoded, _ = xgb_read_from_db(
        'neg_feat_xgb_readmission')
    xgb_neg_feat_df = pd.read_json(xgb_neg_feat_df_json_encoded.decode())
    top_n_neg_feat_df = pd.read_json(top_n_neg_feat_df_json_encoded.decode())

    xgb_med_df_json_encoded, top_n_med_df_json_encoded, _ = xgb_read_from_db(
        'med_xgb_readmission')
    xgb_med_df = pd.read_json(xgb_med_df_json_encoded.decode())
    top_n_med_df = pd.read_json(top_n_med_df_json_encoded.decode())

    xgb_neg_med_df_json_encoded, top_n_neg_med_df_json_encoded, _ = xgb_read_from_db(
        'neg_med_xgb_readmission')
    xgb_neg_med_df = pd.read_json(xgb_neg_med_df_json_encoded.decode())
    top_n_neg_med_df = pd.read_json(top_n_neg_med_df_json_encoded.decode())

    prev_probas = pd.DataFrame()
    prev_probas[
        'readmission_classifier_probabilities'] = readmission_classifier_df[
            'readmission_classifier_probabilities']
    prev_probas['xgb_demo_ent_pred'] = xgb_demo_df['xgb_demo_ent_pred']
    prev_probas['xgb_feat_ent_pred'] = xgb_feat_df['xgb_feat_ent_pred']
    prev_probas['xgb_neg_feat_ent_pred'] = xgb_neg_feat_df['xgb_feat_ent_pred']
    prev_probas['xgb_med_ent_pred'] = xgb_med_df['xgb_med_ent_pred']
    prev_probas['xgb_neg_med_ent_pred'] = xgb_neg_med_df['xgb_med_ent_pred']

    tf_input = pd.concat([
        prev_probas, top_n_demo_df, top_n_feat_df, top_n_neg_feat_df,
        top_n_med_df, top_n_neg_med_df
    ],
                         axis=1)

    readmissions = xgb_demo_df['readmissions']

    model = creat_model(prev_probas, readmissions)
    model_predictions = predict_with_model(prev_probas, model)
    tf_input['keras_pred'] = model_predictions
    tf_input['admission_id'] = readmission_classifier_df['admission_id']

    tf_input_json_encoded = tf_input.to_json().encode()
    standard_write_to_db('readmission_tensorflow_predictions',
                         tf_input_json_encoded)
def create_vitals_ngrams():
    df_json_encoded = standard_read_from_db('ngram_prep_tokenize')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = get_vitals_and_generate_ngrams(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('vitals_ngrams', df_json_encoded)
def convert_to_likert():
    df_json_encoded = standard_read_from_db(
        'readmission_tensorflow_predictions')
    df = pd.read_json(df_json_encoded.decode())

    likert_values = make_likert_column(df)
    df['readmission_likert'] = likert_values

    df_json_encoded = df.to_json().encode()
    standard_write_to_db('readmission_likert', df_json_encoded)
Example #10
def add_tokens_column():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = tokenize_by_sentence(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('ngram_prep_tokenize', df_json_encoded)
def create_word2vec_model():
    tokens_string = standard_read_from_db('word2vec_notes_tokenized').decode()
    tokens = ast.literal_eval(tokens_string)
    #model = Word2Vec([tokens], size=100, window=10, min_count=2, workers=3)
    #While testing, 'readmission' appeared among the tokens but was dropped by
    #min_count=2, which caused an error downstream; min_count was reduced to 1
    #to avoid that.

    #word2vec optimization opportunity for LDA
    model = Word2Vec([tokens], size=100, window=10, min_count=1, workers=3)
    model_pickled = pickle.dumps(model)
    standard_write_to_db('word2vec', model_pickled)
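
A later step could load and query the pickled model along these lines (a sketch only, assuming gensim 3.x, where Word2Vec still takes size= rather than vector_size=, and using 'readmission' purely as an example token):

def inspect_word2vec_model():
    # Load the gensim model pickled by create_word2vec_model().
    model_pickled = standard_read_from_db('word2vec')
    model = pickle.loads(model_pickled)

    # Embedding for one token and its nearest neighbours in the notes corpus.
    vector = model.wv['readmission']  # 100-dimensional, per size=100 above
    neighbours = model.wv.most_similar('readmission', topn=5)
    return vector, neighbours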
Example #12
def create_structured_data_features():
    df_json_encoded = standard_read_from_db('first_dataframe')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    df = add_los_age_and_binary_deathtime_columns(df)
    df = add_readmission_column(df)

    df_json = df.to_json()
    df_json_encoded = df_json.encode()
    standard_write_to_db('structured_data_features', df_json_encoded)
def clean_readmission_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    #filter dataframe by readmission==True
    is_readmission = df['readmission'] == True
    df_readmissions = df[is_readmission]

    readmission_notes = combine_and_clean(df_readmissions)

    readmission_notes_encoded = readmission_notes.encode()
    standard_write_to_db('readmission_notes_cleansed',
                         readmission_notes_encoded)
def get_dataframe_from_apis():
    notes = get_all_notes()
    admissions = get_admissions()
    icd_codes = get_icd_codes()
    patients = get_patients()

    admissions_with_notes_and_codes = combine_notes_and_admissions_and_codes(admissions, notes, icd_codes, patients)
    #admissions_with_notes_and_codes = testing_admissions_with_notes()

    df = pd.json_normalize(admissions_with_notes_and_codes)
    # add an 'index' column numbering the rows from 0 to len(df) - 1
    df.reset_index(inplace=True)

    df_json_encoded = df.to_json().encode()
    standard_write_to_db('first_dataframe', df_json_encoded)
Example #15
def clean_ner_notes():
    df_json_encoded = standard_read_from_db('structured_data_features')
    df_json = df_json_encoded.decode()
    df = pd.read_json(df_json)

    #df = pd.read_parquet('short_ner_test.parquet')

    nemo_tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
        tokenizer_name='nemobert',
        pretrained_model_name='bert-base-uncased',
        tokenizer_model=None)

    cleaned_notes = []
    notes_length = len(df['notes'])
    num_cleaned = 0
    for note in df['notes']:
        note = re.sub(r'\[', '', note)
        note = re.sub(r'\]', '', note)
        note = re.sub(r'\*', '', note)
        note = re.sub(',', '', note)
        note = re.sub(';', '', note)

        cleaned_note = ''

        lines = note.split('\n')
        for line in lines:
            if line != '' and '____' not in line:
                sentences = sent_tokenize(line)
                for sentence in sentences:
                    words = nemo_tokenizer.text_to_tokens(sentence)
                    #Chunked according to this approach:
                    # https://stackoverflow.com/questions/9671224/split-a-python-list-into-other-sublists-i-e-smaller-lists
                    max_len_chunks = [
                        words[x:x + MAX_SEQ_LENGTH]
                        for x in range(0, len(words), MAX_SEQ_LENGTH)
                    ]
                    for chunk in max_len_chunks:
                        new_line = ''
                        for word in chunk:
                            new_line += ' ' + word
                        cleaned_note += '\n' + new_line.strip()
        num_cleaned += 1
        cleaned_notes.append(cleaned_note)
        print(str(num_cleaned / notes_length * 100) + "% notes cleaned")

    df['ner_cleaned_notes'] = cleaned_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_cleaned_notes', df_json_encoded)
Example #16
def create_labeled_notes_column():
    lines = get_note_lines_from_file()

    df_json_encoded = standard_read_from_db('ner_cleaned_notes')
    df = pd.read_json(df_json_encoded.decode())
    length_array = get_line_length_array(df)

    begin = 0
    end = 0
    labeled_notes = []
    for length in length_array:
        end += length
        note_lines = lines[begin:end]
        begin += length
        note = ''
        for line in note_lines:
            note += line
        labeled_notes.append(note)

    df['labeled_notes'] = labeled_notes
    df_json_encoded = df.to_json().encode()
    standard_write_to_db('ner_labeled_notes', df_json_encoded)
def combine():
    #infection_one_hot_df_json_encoded, _ = one_hot_read_from_db('infection_one_hot')
    readmission_one_hot_df_json_encoded, _ = one_hot_read_from_db(
        'readmission_one_hot')
    structured_features_df_json_encoded = standard_read_from_db(
        'structured_data_features')
    #vitals_ngrams_df_json_encoded = standard_read_from_db('vitals_ngrams')
    #ner_processed_df_json_encoded = standard_read_from_db('post_ner_inference')

    #infection_one_hot_df = pd.read_json(infection_one_hot_df_json_encoded.decode())
    readmission_one_hot_df = pd.read_json(
        readmission_one_hot_df_json_encoded.decode())
    structured_features_df = pd.read_json(
        structured_features_df_json_encoded.decode())
    #vitals_ngrams_df = pd.read_json(vitals_ngrams_df_json_encoded.decode())
    #ner_processed_df = pd.read_json(ner_processed_df_json_encoded.decode())

    #combined_df = infection_one_hot_df
    #combined_columns = combined_df.columns

    #for column in readmission_one_hot_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = readmission_one_hot_df[column]

    #combined_columns = combined_df.columns

    combined_df = readmission_one_hot_df
    combined_columns = combined_df.columns

    for column in structured_features_df.columns:
        if column not in combined_columns:
            combined_df[column] = structured_features_df[column]

    combined_columns = combined_df.columns

    #for column in vitals_ngrams_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = vitals_ngrams_df[column]

    #combined_columns = combined_df.columns

    #for column in ner_processed_df.columns:
    #    if column not in combined_columns:
    #        combined_df[column] = ner_processed_df[column]

    columns_to_remove = [
        'admission_id',
        'admittime',
        'deathtime',
        'dischtime',
        'patient_id',
        'notes',
        #'note_entities_labeled',
        'index',
        #'tokens_in_record',
        #'vitals',
        #'non-vitals',
        #'vitals_ngrams',
    ]

    combined_df.drop(columns_to_remove, axis=1, inplace=True)
    combined_df = combined_df.dropna()

    combined_df_json_encoded = combined_df.to_json().encode()

    standard_write_to_db('combined_dataframe', combined_df_json_encoded)
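
The column-copy loops in combine() add only the columns that are not already present. The same effect can be expressed as a single index-aligned join; a sketch using the variable names from combine():

    # Append only the structured-feature columns that combined_df lacks;
    # .join aligns on the index, just like the column-by-column assignment.
    new_cols = structured_features_df.columns.difference(combined_df.columns)
    combined_df = combined_df.join(structured_features_df[new_cols])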