Example #1
from sklearn.linear_model import LogisticRegression

# cfg, metrics, data_dense and grid_search are helpers defined elsewhere
# in the project this example was taken from.

def run_evaluation_dense():
    """Train a classifier on pre-trained (dense) patient representations and report test ROC AUC"""

    x_train, y_train, x_test, y_test = data_dense()

    if cfg.get('data', 'classif_param') == 'search':
        classifier = grid_search(x_train, y_train, 'roc_auc')
    else:
        classifier = LogisticRegression(class_weight='balanced')
        classifier.fit(x_train, y_train)

    probs = classifier.predict_proba(x_test)
    metrics.report_roc_auc(y_test, probs[:, 1])
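
Both this example and the next call a grid_search helper that is defined elsewhere in the project. As a rough sketch of what such a helper could look like, assuming it wraps scikit-learn's GridSearchCV around the same balanced logistic regression (the parameter grid and cv setting are illustrative assumptions, not the original project's values):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def grid_search(x_train, y_train, scoring):
    """Hypothetical version of the project's grid_search helper:
    tune C by cross-validation and return the refit best estimator"""
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}  # assumed grid
    gs = GridSearchCV(LogisticRegression(class_weight='balanced'),
                      param_grid,
                      scoring=scoring,  # 'roc_auc' in the callers above
                      cv=5)
    gs.fit(x_train, y_train)
    return gs.best_estimator_  # refit on all of x_train (refit=True by default)

The callers treat the return value as a fitted classifier exposing predict_proba, which GridSearchCV's best_estimator_ satisfies.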
Example #2
import sklearn.metrics
from sklearn.dummy import DummyClassifier

def run_eval(x_train, y_train, x_test, y_test, search=False):
  """Evaluate on the test set and report ROC and PR AUC with confidence intervals"""

  if search:
    classifier = grid_search(x_train, y_train, 'roc_auc')
  else:
    classifier = DummyClassifier(strategy='stratified')
    classifier.fit(x_train, y_train)

  probs = classifier.predict_proba(x_test)

  metrics.report_roc_auc(y_test, probs[:, 1])
  metrics.report_ci(y_test, probs[:, 1], sklearn.metrics.roc_auc_score)

  metrics.report_pr_auc(y_test, probs[:, 1])
  metrics.report_ci(y_test, probs[:, 1], metrics.pr_auc_score)
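
The metrics module used here is also project-local. A plausible sketch of the two helpers this example would need: pr_auc_score as area under the precision-recall curve, and report_ci as a percentile bootstrap over the test set. Both implementations are assumptions about what the real module does, not its actual code.

import numpy as np
import sklearn.metrics

def pr_auc_score(y_true, probs):
    """Assumed helper: area under the precision-recall curve"""
    precision, recall, _ = sklearn.metrics.precision_recall_curve(y_true, probs)
    return sklearn.metrics.auc(recall, precision)

def report_ci(y_true, probs, metric, n_boot=1000, alpha=0.05, seed=0):
    """Assumed helper: percentile-bootstrap CI for an arbitrary metric"""
    rng = np.random.RandomState(seed)
    y_true, probs = np.asarray(y_true), np.asarray(probs)
    scores = []
    for _ in range(n_boot):
        idx = rng.randint(0, len(y_true), len(y_true))  # resample with replacement
        if len(np.unique(y_true[idx])) < 2:
            continue  # AUC is undefined when a resample contains a single class
        scores.append(metric(y_true[idx], probs[idx]))
    lo, hi = np.percentile(scores, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    print('%s: %.3f (%.3f - %.3f)' % (metric.__name__, metric(y_true, probs), lo, hi))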
Example #3
import os
import shutil

import numpy as np
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

# DatasetProvider, cfg, get_model and metrics are helpers defined elsewhere
# in the project this example was taken from.

def main():
    """Train a linear layer on pre-trained representations, optionally fine-tune, and evaluate"""

    data_root = os.environ['DATA_ROOT']

    if os.path.isdir('./Model/'):
        shutil.rmtree('./Model/')
    os.mkdir('./Model/')

    train_data_provider = DatasetProvider(
        os.path.join(data_root, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'), None)
    x_train, y_train = train_data_provider.load_as_one_hot()
    print('loaded x_train:', x_train.shape)

    # evaluate on a held-out validation split, or directly on the test set?
    if cfg.getfloat('data', 'val_size') != 0:
        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, test_size=cfg.getfloat('data', 'val_size'))
        callbacks = [
            ModelCheckpoint('./Model/model.h5', verbose=1, save_best_only=True)
        ]
        validation_data = (x_val, y_val)
        print('x_train shape:', x_train.shape)
        print('x_val shape:', x_val.shape)

    else:
        test_data_provider = DatasetProvider(
            os.path.join(data_root, cfg.get('data', 'test')),
            cfg.get('data', 'tokenizer_pickle'), None)
        x_test, y_test = test_data_provider.load_as_one_hot()
        print('loaded x_test:', x_test.shape)
        validation_data = None
        callbacks = None

    # train the linear classification layer
    model = get_model(len(train_data_provider.label2int))
    optim = getattr(optimizers, cfg.get('linear', 'optimizer'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optim(lr=cfg.getfloat('linear', 'lr')),
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              validation_data=validation_data,
              epochs=cfg.getint('linear', 'epochs'),
              batch_size=cfg.getint('linear', 'batch'),
              validation_split=0.0,
              callbacks=callbacks)

    # fine-tune the pre-trained layers
    # https://stackoverflow.com/questions/47995324/
    # does-model-compile-initialize-all-the-weights-and-biases-in-keras-tensorflow/47996024

    if cfg.getboolean('base', 'finetune'):

        print()
        for layer in model.layers:
            layer.trainable = True
            print('%s: %s' % (layer.name, layer.trainable))

        optim = getattr(optimizers, cfg.get('base', 'optimizer'))
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optim(lr=cfg.getfloat('base', 'lr')),
                      metrics=['accuracy'])

        model.fit(x_train,
                  y_train,
                  validation_data=validation_data,
                  epochs=cfg.getint('base', 'epochs'),
                  batch_size=cfg.getint('base', 'batch'),
                  validation_split=0.0,
                  callbacks=callbacks)

    if cfg.getfloat('data', 'val_size') != 0:
        # during validation, load last best model
        model = load_model('./Model/model.h5')
        x_test, y_test = x_val, y_val

    # distribution.shape: (test size, num of classes)
    distribution = model.predict(x_test)
    predictions = np.argmax(distribution, axis=1)

    pos_label = train_data_provider.label2int['yes']
    metrics.report_roc_auc(y_test, distribution[:, pos_label])
    metrics.report_pr_auc(y_test, distribution[:, pos_label])
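
This example assumes a get_model factory that returns a pre-trained network topped with a fresh classification layer, with the pre-trained layers initially frozen; that is why the fine-tuning branch has to flip layer.trainable back to True and recompile. A minimal sketch of that pattern, in which the checkpoint path and the choice of representation layer are placeholders rather than the project's actual values:

from keras.models import Model, load_model
from keras.layers import Dense

def get_model(num_classes):
    """Hypothetical factory: frozen pre-trained base plus a new softmax head"""
    base = load_model('Model/pretrained.h5')  # assumed path to the pre-trained model
    for layer in base.layers:
        layer.trainable = False  # phase 1 trains only the new linear layer
    rep = base.layers[-2].output  # assumed representation layer under the old head
    out = Dense(num_classes, activation='softmax')(rep)
    return Model(inputs=base.input, outputs=out)

Note that recompiling does not reinitialize weights (the point of the Stack Overflow link in the comments above), so the head trained in the first phase is preserved going into fine-tuning.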