Example #1
# imports assumed by the snippets on this page (Keras 2.x API; DatasetProvider,
# get_model, print_config and NUM_FOLDS are project-local and not shown)
import configparser
import os
import sys

import numpy as np
from keras.models import Model, load_model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

def run_cross_validation(disease, judgement):
    """Run n-fold CV on training set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)

    base = os.environ['DATA_ROOT']
    data_dir = os.path.join(base, cfg.get('data', 'train_data'))
    annot_xml = os.path.join(base, cfg.get('data', 'train_annot'))
    dataset = DatasetProvider(data_dir,
                              annot_xml,
                              disease,
                              judgement,
                              use_pickled_alphabet=False,
                              min_token_freq=cfg.getint(
                                  'args', 'min_token_freq'))
    x, y = dataset.load()

    classes = len(dataset.label2int)
    maxlen = max(len(seq) for seq in x)
    x = pad_sequences(x, maxlen=maxlen)
    y = to_categorical(y, classes)

    cv_scores = []
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=100)
    for train_indices, test_indices in kf.split(x):

        train_x = x[train_indices]
        train_y = y[train_indices]
        test_x = x[test_indices]
        test_y = y[test_indices]

        model = get_model(cfg, dataset.token2int, maxlen, classes, 'softmax')
        optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        model.fit(train_x,
                  train_y,
                  epochs=cfg.getint('nn', 'epochs'),
                  batch_size=cfg.getint('nn', 'batch'),
                  validation_split=0.0,
                  verbose=0)

        # probability for each class; (test size, num of classes)
        distribution = model.predict(test_x,
                                     batch_size=cfg.getint('nn', 'batch'))
        # class predictions; (test size,)
        predictions = np.argmax(distribution, axis=1)
        # gold labels; (test size,)
        gold = np.argmax(test_y, axis=1)

        # f1 scores
        f1 = f1_score(gold, predictions, average='macro')
        cv_scores.append(f1)

    print('average f1:', np.mean(cv_scores))
    print('standard deviation:', np.std(cv_scores))
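
These snippets read all settings from an INI file passed as sys.argv[1]. A minimal sketch of that file, with section and key names taken from the cfg.get calls in the code and purely illustrative paths and values:

# config.ini (illustrative values only)
[data]
train_data = train/notes/
train_annot = train/annotations.xml
test_data = test/notes/
test_annot = test/annotations.xml

[args]
min_token_freq = 100

[nn]
learnrt = 0.001
epochs = 3
batch = 32

With DATA_ROOT exported in the environment, the script is then invoked as e.g. python run.py config.ini (script name hypothetical).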
Example #2
def get_data(disease, judgement):
    """Sequences of tokens to feed into code prediction model"""

    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # determine whether to treat input tokens as a sequence or set
    if cfg.get('data', 'model_type') == 'dan':
        use_cuis = True
        tokens_as_set = True
    else:
        use_cuis = False
        tokens_as_set = False

    # load training data
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)
    x_train = pad_sequences(x_train, maxlen=get_maxlen())

    # load the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
    x_test = pad_sequences(x_test, maxlen=get_maxlen())

    return x_train, y_train, x_test, y_test
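
get_data reads a module-level cfg and calls a get_maxlen() helper, neither of which appears in the snippet. A minimal sketch of what get_maxlen() might look like, assuming the padding length is stored in the config the way run_evaluation_dense reads it below (the key name is a guess):

def get_maxlen():
    """Hypothetical helper: sequence length to pad inputs to"""
    return cfg.getint('data', 'maxlen')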
Example #3
def run_evaluation_dense(disease, judgement):
  """Use pre-trained patient representations"""

  print('disease:', disease)
  print('judgement:', judgement)

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
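  # truncate the network at its hidden layer 'HL'; the activations of that
  # layer will serve as dense patient representations for the SVM below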
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer('HL').output)

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load()

  classes = len(set(y_train))
  print('unique labels in train:', classes)
  maxlen = cfg.getint('data', 'maxlen')
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load()
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)
  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print('p = %.3f' % p)
  print('r = %.3f' % r)
  print('f1 = %.3f\n' % f1)

  return p, r, f1
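
Since run_evaluation_dense returns its scores, a caller can aggregate them over several targets. A hypothetical driver (the disease and judgement values here are placeholders, not from the source project):

if __name__ == '__main__':
  f1s = []
  for disease in ['Asthma', 'CAD', 'Obesity']:  # placeholder disease names
    p, r, f1 = run_evaluation_dense(disease, 'intuitive')  # placeholder judgement
    f1s.append(f1)
  print('average f1: %.3f' % np.mean(f1s))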
Example #4
def run_evaluation(disease, judgement):
    """Train on train set and evaluate on test set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load training data first
    train_data_provider = DatasetProvider(train_data,
                                          train_annot,
                                          disease,
                                          judgement,
                                          use_pickled_alphabet=False,
                                          min_token_freq=cfg.getint(
                                              'args', 'min_token_freq'))
    x_train, y_train = train_data_provider.load()

    classes = len(train_data_provider.label2int)
    maxlen = max(len(seq) for seq in x_train)
    x_train = pad_sequences(x_train, maxlen=maxlen)
    y_train = to_categorical(y_train, classes)

    # now load the test set
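    # use_pickled_alphabet=True reuses the token alphabet built by the train
    # provider above, so train and test share the same token2int mapping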
    test_data_provider = DatasetProvider(test_data,
                                         test_annot,
                                         disease,
                                         judgement,
                                         use_pickled_alphabet=True,
                                         min_token_freq=cfg.getint(
                                             'args', 'min_token_freq'))
    x_test, y_test = test_data_provider.load()  # TODO: pass maxlen to load()
    x_test = pad_sequences(x_test, maxlen=maxlen)
    y_test = to_categorical(y_test, classes)

    model = get_model(cfg, train_data_provider.token2int, maxlen, classes,
                      'softmax')
    optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              epochs=cfg.getint('nn', 'epochs'),
              batch_size=cfg.getint('nn', 'batch'),
              validation_split=0.0,
              verbose=0)

    # probability for each class; (test size, num of classes)
    distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))
    # class predictions; (test size,)
    predictions = np.argmax(distribution, axis=1)
    # gold labels; (test size,)
    gold = np.argmax(y_test, axis=1)

    # f1 scores
    f1 = f1_score(gold, predictions, average='macro')
    print('%s: f1 = %.3f' % (disease, f1))

    return f1
Example #5
def data_dense(cfg, disease, judgement):
    """Data to feed into code prediction model"""

    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load pre-trained model
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer(
                                   cfg.get('data', 'rep_layer')).output)
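    # read the expected input length off the pre-trained embedding layer 'EL'
    # so that new inputs are padded to the size the model was trained on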
    maxlen = model.get_layer(name='EL').get_config()['input_length']

    # determine whether to treat input tokens as a sequence or set
    if cfg.get('data', 'model_type') == 'dan':
        use_cuis = True
        tokens_as_set = True
    else:
        use_cuis = False
        tokens_as_set = False

    # load training data first
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)

    classes = len(set(y_train))
    print('unique labels in train:', classes)
    x_train = pad_sequences(x_train, maxlen=maxlen)

    # make training vectors for target task
    print('original x_train shape:', x_train.shape)
    x_train = interm_layer_model.predict(x_train)
    print('new x_train shape:', x_train.shape)

    # now load the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
    x_test = pad_sequences(x_test, maxlen=maxlen)

    # make test vectors for target task
    print('original x_test shape:', x_test.shape)
    x_test = interm_layer_model.predict(x_test)
    print('new x_test shape:', x_test.shape)

    return x_train, y_train, x_test, y_test
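
The dense vectors data_dense returns are meant for a conventional classifier, mirroring run_evaluation_dense above; a minimal sketch of that hand-off:

x_train, y_train, x_test, y_test = data_dense(cfg, disease, judgement)
classifier = LinearSVC(class_weight='balanced')
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
print('f1 = %.3f' % f1_score(y_test, predictions, average='macro'))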