import numpy as np
from gensim.models import Word2Vec
from keras.callbacks import TensorBoard
from sklearn import metrics
from sklearn.model_selection import train_test_split

import dtoc  # project data-access module (defined elsewhere in the repo)


def evaluate(start_date, end_date, all_dtoc=False):
    df = dtoc.sample(40000, start_date, end_date)
    if all_dtoc:
        # keep only the records flagged as non-DTOC (is_dtoc == 0)
        df = df[df['is_dtoc'] == 0]
    # columns 4:16 hold the 12 diagnosis codes
    sample_x = vectorization(df.iloc[:, 4:16], Word2Vec.load("diag2vec.model"))
    model = load_model()
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[f1, 'acc'])
    return model.evaluate(sample_x, df['is_dtoc'])
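# The compile() call in evaluate() references an `f1` metric that is defined
# elsewhere in the repo. The sketch below is an assumption of what it looks
# like: a standard batch-wise Keras-backend F1 for a binary classifier; the
# actual implementation may differ.
from keras import backend as K


def f1(y_true, y_pred):
    # binarise the predictions, then compute precision and recall on the batch
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    tp = K.sum(K.round(K.clip(y_true * y_pred_pos, 0, 1)))
    precision = tp / (K.sum(y_pred_pos) + K.epsilon())
    recall = tp / (K.sum(K.round(K.clip(y_true, 0, 1))) + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())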
def train():
    ## load data
    df = dtoc.sample(40000)
    train_df, val_df = train_test_split(df, test_size=0.08, random_state=2018)

    diag_cols = ['diag1', 'diag2', 'diag3', 'diag4', 'diag5', 'diag6',
                 'diag7', 'diag8', 'diag9', 'diag10', 'diag11', 'diag12', 'age']
    train_X = train_df[diag_cols]
    # train_X = train_df['age'].values
    train_y = train_df['is_dtoc'].values
    val_X = val_df[diag_cols]
    # val_X = val_df['age'].values
    val_y = val_df['is_dtoc'].values

    # embed_size = 150      # how big each variable vector is
    # max_features = 10000  # how many unique codes to use (ICD codes + all ages)
    # max_len = 13          # max number of variables in one record
    # embedding_matrix = get_embeddings_matrix(df)

    tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0,
                             write_graph=True, write_images=True)

    # load word2vec model
    EMBEDDING_MODEL_FILE = 'diag2vec.model'
    wv_model = Word2Vec.load(EMBEDDING_MODEL_FILE)

    # model = model_LR()
    model = model_LSTM()
    # model = model_CNN()

    model.fit(vectorization(train_X, wv_model), train_y,
              batch_size=512, epochs=20,
              validation_data=(vectorization(val_X, wv_model), val_y),
              callbacks=[tbCallBack])

    pred_val_y = model.predict(vectorization(val_X, wv_model),
                               batch_size=1024, verbose=1)
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        print("F1 score at threshold {0} is {1}".format(
            thresh, metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))))
    return model
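# train() selects model_LSTM(), which is defined elsewhere in the repo. As an
# assumption based on the commented-out constants above (embed_size=150,
# max_len=13) and the word2vec-vectorised inputs, a minimal sketch could look
# like the following; the real architecture may differ.
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Dropout


def model_LSTM(max_len=13, embed_size=150):
    # each record is a sequence of 13 vectorised variables (12 diagnoses + age)
    model = Sequential()
    model.add(Bidirectional(LSTM(64), input_shape=(max_len, embed_size)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[f1, 'acc'])
    return model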
def main():
    df = dtoc.sample(40000, '2010-01-01', '2018-01-01')
    train_df, val_df = train_test_split(df, test_size=0.1)
    wv_model = Word2Vec.load("diag2vec.model")
    # columns 4:16 hold the 12 diagnosis codes
    validation_data = (vectorization(val_df.iloc[:, 4:16], wv_model),
                       np.array(val_df['is_dtoc']))
    # expects a train() variant that accepts the prepared training frame,
    # the word2vec model and the validation data, and returns the fitted model
    model = train(train_df, wv_model, validation_data)
    ## save weights
    model.save_weights('lstm_weights_before2018.h5')
    ## save structure
    with open('lstm_architecture_before2018.json', 'w') as f:
        f.write(model.to_json())
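# evaluate() and predict_evaluate() rely on a load_model() helper that is not
# shown here. The sketch below assumes it rebuilds the network from the JSON
# architecture and HDF5 weights written by main(); the real helper (and its
# filenames) may differ.
from keras.models import model_from_json


def load_model(architecture_file='lstm_architecture_before2018.json',
               weights_file='lstm_weights_before2018.h5'):
    with open(architecture_file) as f:
        model = model_from_json(f.read())
    model.load_weights(weights_file)
    return model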
def predict_evaluate():
    df = dtoc.sample(40000)
    val_y = df['is_dtoc']
    sample_x = vectorization(df.iloc[:, 4:16], Word2Vec.load("diag2vec.model"))
    model = load_model()
    # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1, 'acc'])
    pred_val_y = model.predict(sample_x)
    thresholds = []
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
        thresholds.append([thresh, res])
        print("F1 score at threshold {0} is {1}".format(thresh, res))
    return thresholds
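# Illustrative entry point, not part of the original module: one plausible way
# to chain the steps above. The evaluation dates are placeholders.
if __name__ == '__main__':
    main()              # fit on admissions sampled from 2010-2018 and save the model
    predict_evaluate()  # sweep decision thresholds on a fresh sample
    print(evaluate('2018-01-01', '2019-01-01'))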