Example 1
def evaluate(start_date, end_date, all_dtoc=False):
    # draw a 40k-record sample for the given date range
    df = dtoc.sample(40000, start_date, end_date)
    if all_dtoc:
        # restrict the sample to records with is_dtoc == 0
        df = df[df['is_dtoc'] == 0]
    # embed the diagnosis-code columns with the trained diag2vec model
    sample_x = vectorization(df.iloc[:, 4:16], Word2Vec.load("diag2vec.model"))
    model = load_model()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[f1, 'acc'])
    return model.evaluate(sample_x, df['is_dtoc'])
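The compile call above references a custom f1 metric that is not defined in this snippet. Below is a minimal sketch of such a metric, assuming the common Keras-backend formulation; the project's own definition may differ.

from keras import backend as K

def f1(y_true, y_pred):
    # round sigmoid outputs to hard 0/1 predictions (assumes binary labels)
    y_pred = K.round(K.clip(y_pred, 0, 1))
    tp = K.sum(y_true * y_pred)
    precision = tp / (K.sum(y_pred) + K.epsilon())
    recall = tp / (K.sum(y_true) + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())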
Example 2
# Library imports assumed by this snippet; dtoc, vectorization and model_LSTM
# are project-level helpers defined elsewhere in the repository.
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from keras.callbacks import TensorBoard


def train():
    ## load data

    df = dtoc.sample(40000)
    train_df, val_df = train_test_split(df, test_size=0.08, random_state=2018)

    train_X = train_df[[
        'diag1', 'diag2', 'diag3', 'diag4', 'diag5', 'diag6', 'diag7', 'diag8',
        'diag9', 'diag10', 'diag11', 'diag12', 'age'
    ]]

    # train_X = train_df['age'].values
    train_y = train_df['is_dtoc'].values

    val_X = val_df[[
        'diag1', 'diag2', 'diag3', 'diag4', 'diag5', 'diag6', 'diag7', 'diag8',
        'diag9', 'diag10', 'diag11', 'diag12', 'age'
    ]]
    # val_X = val_df['age'].values
    val_y = val_df['is_dtoc'].values

    # embed_size = 150 # how big is each variable vector
    # max_features = 10000 # how many unique codes to use (ICD codes + all ages)
    # max_len = 13 # max number of variables in one records
    # embedding_matrix = get_embeddings_matrix(df)
    # TensorBoard callback for training logs
    tbCallBack = TensorBoard(log_dir='./Graph',
                             histogram_freq=0,
                             write_graph=True,
                             write_images=True)

    # load the pretrained word2vec (diag2vec) embedding model
    EMBEDDING_MODEL_FILE = 'diag2vec.model'
    wv_model = Word2Vec.load(EMBEDDING_MODEL_FILE)

    #model = model_LR()
    model = model_LSTM()
    # model = model_CNN()

    model.fit(vectorization(train_X, wv_model),
              train_y,
              batch_size=512,
              epochs=20,
              validation_data=(vectorization(val_X, wv_model), val_y),
              callbacks=[tbCallBack])

    pred_val_y = model.predict(vectorization(val_X, wv_model),
                               batch_size=1024,
                               verbose=1)
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        print("F1 score at threshold {0} is {1}".format(
            thresh, metrics.f1_score(val_y,
                                     (pred_val_y > thresh).astype(int))))
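All the examples rely on a vectorization(df, wv_model) helper that is not shown here. The following is a minimal sketch of one possible implementation, assuming it looks up each diagnosis code (and age value) in the diag2vec embedding and stacks the results into a (samples, columns, embedding_size) array for the LSTM; the actual project code may differ.

import numpy as np

def vectorization(df, wv_model):
    # assumed behaviour: one embedding vector per column value, zeros for unknown codes
    dim = wv_model.vector_size
    out = np.zeros((len(df), df.shape[1], dim))
    for i, (_, row) in enumerate(df.iterrows()):
        for j, code in enumerate(row):
            code = str(code)
            if code in wv_model.wv:
                out[i, j] = wv_model.wv[code]
    return out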
Example 3
def main():
    df = dtoc.sample(40000, '2010-01-01', '2018-01-01')
    train_df, val_df = train_test_split(df, test_size=0.1)
    wv_model = Word2Vec.load("diag2vec.model")
    validation_data = (vectorization(val_df.iloc[:, 4:16],
                                     wv_model), np.array(val_df['is_dtoc']))
    model = train(train_df, wv_model, validation_data)

    ## save weights
    model.save_weights('lstm_weights_before2018.h5')
    ## save architecture
    with open('lstm_architecture_before2018.json', 'w') as f:
        f.write(model.to_json())
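The weights and architecture saved above can later be restored with Keras. A sketch along these lines is presumably what the load_model() helper used in Examples 1 and 4 does; this is an illustration, not the project's actual implementation.

from keras.models import model_from_json

def load_model():
    # rebuild the network from the saved JSON architecture, then attach the weights
    with open('lstm_architecture_before2018.json') as f:
        model = model_from_json(f.read())
    model.load_weights('lstm_weights_before2018.h5')
    return model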
Example 4
def predict_evaluate():
    model = load_model()
    df = dtoc.sample(40000)
    val_y = df['is_dtoc']
    sample_x = vectorization(df.iloc[:, 4:16], Word2Vec.load("diag2vec.model"))
    # model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[f1, 'acc'])
    pred_val_y = model.predict(sample_x)
    thresholds = []
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
        thresholds.append([thresh, res])
        print("F1 score at threshold {0} is {1}".format(thresh, res))