import os
import sys
import configparser

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from keras.models import Model, load_model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# project-local names assumed to be defined elsewhere in this package:
# DatasetProvider, get_model, print_config, NUM_FOLDS


def run_cross_validation(disease, judgement):
  """Run n-fold CV on the training set"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  data_dir = os.path.join(base, cfg.get('data', 'train_data'))
  annot_xml = os.path.join(base, cfg.get('data', 'train_annot'))

  dataset = DatasetProvider(
    data_dir,
    annot_xml,
    disease,
    judgement,
    use_pickled_alphabet=False,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x, y = dataset.load()

  classes = len(dataset.label2int)
  maxlen = max(len(seq) for seq in x)
  x = pad_sequences(x, maxlen=maxlen)
  y = to_categorical(y, classes)

  cv_scores = []
  kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=100)
  for train_indices, test_indices in kf.split(x):
    train_x = x[train_indices]
    train_y = y[train_indices]
    test_x = x[test_indices]
    test_y = y[test_indices]

    model = get_model(cfg, dataset.token2int, maxlen, classes, 'softmax')
    optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=cfg.getint('nn', 'epochs'),
              batch_size=cfg.getint('nn', 'batch'),
              validation_split=0.0,
              verbose=0)

    # probability for each class; shape: (test size, num of classes)
    distribution = model.predict(test_x, batch_size=cfg.getint('nn', 'batch'))
    # class predictions; shape: (test size,)
    predictions = np.argmax(distribution, axis=1)
    # gold labels; shape: (test size,)
    gold = np.argmax(test_y, axis=1)

    # macro-averaged f1 for this fold
    f1 = f1_score(gold, predictions, average='macro')
    cv_scores.append(f1)

  print('average f1:', np.mean(cv_scores))
  print('standard deviation:', np.std(cv_scores))
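# Every entry point in this module reads its settings from an INI file passed
# as sys.argv[1]. A sketch of what that file might contain; the section and
# key names below are the ones this module actually reads, but all values are
# hypothetical placeholders:
#
#   [data]
#   train_data = Obesity/Xml/train/
#   train_annot = Obesity/Xml/train.xml
#   test_data = Obesity/Xml/test/
#   test_annot = Obesity/Xml/test.xml
#   alphabet_pickle = Model/alphabet.p
#   model_file = Model/model.h5
#   model_type = dan
#   rep_layer = HL
#   maxlen = 3000
#
#   [args]
#   min_token_freq = 100
#
#   [nn]
#   learnrt = 0.001
#   epochs = 5
#   batch = 50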
def get_data(cfg, disease, judgement):
  """Sequences of tokens to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # determine whether to treat input tokens as a sequence or a set
  if cfg.get('data', 'model_type') == 'dan':
    use_cuis = True
    tokens_as_set = True
  else:
    use_cuis = False
    tokens_as_set = False

  # load training data
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)
  x_train = pad_sequences(x_train, maxlen=get_maxlen())

  # load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
  x_test = pad_sequences(x_test, maxlen=get_maxlen())

  return x_train, y_train, x_test, y_test
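# get_maxlen() is referenced above but not defined in this section. A minimal
# sketch under the assumption that, like data_dense() below, it reads the
# padding length off the pre-trained model's embedding layer (named 'EL'):
def get_maxlen():
  """Padding length; assumed to come from the pre-trained model"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  model = load_model(cfg.get('data', 'model_file'))

  return model.get_layer(name='EL').get_config()['input_length']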
def run_evaluation_dense(disease, judgement):
  """Use pre-trained patient representations"""

  print('disease:', disease)
  print('judgement:', judgement)

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load pre-trained model and expose its hidden layer 'HL'
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer('HL').output)

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load()

  classes = len(set(y_train))
  print('unique labels in train:', classes)
  maxlen = cfg.getint('data', 'maxlen')
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load()
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  # train a linear SVM on the dense representations
  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print('p = %.3f' % p)
  print('r = %.3f' % r)
  print('f1 = %.3f\n' % f1)

  return p, r, f1
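# A sketch of how run_evaluation_dense() might be driven across several target
# tasks and its macro scores averaged. The disease and judgement names are
# hypothetical placeholders; the real ones come from the annotation XML.
def run_evaluation_dense_all(judgement='intuitive'):
  """Average p/r/f1 over a set of diseases (illustrative driver)"""

  ps, rs, f1s = [], [], []
  for disease in ['Asthma', 'CAD', 'Diabetes']:  # placeholder list
    p, r, f1 = run_evaluation_dense(disease, judgement)
    ps.append(p)
    rs.append(r)
    f1s.append(f1)

  print('average p = %.3f' % np.mean(ps))
  print('average r = %.3f' % np.mean(rs))
  print('average f1 = %.3f' % np.mean(f1s))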
def run_evaluation(disease, judgement):
  """Train on the train set and evaluate on the test set"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=False,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load()

  classes = len(train_data_provider.label2int)
  maxlen = max(len(seq) for seq in x_train)
  x_train = pad_sequences(x_train, maxlen=maxlen)
  y_train = to_categorical(y_train, classes)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load()

  # pad the test set to the maxlen computed on the training set
  x_test = pad_sequences(x_test, maxlen=maxlen)
  y_test = to_categorical(y_test, classes)

  model = get_model(cfg, train_data_provider.token2int, maxlen, classes, 'softmax')
  optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(x_train,
            y_train,
            epochs=cfg.getint('nn', 'epochs'),
            batch_size=cfg.getint('nn', 'batch'),
            validation_split=0.0,
            verbose=0)

  # probability for each class; shape: (test size, num of classes)
  distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))
  # class predictions; shape: (test size,)
  predictions = np.argmax(distribution, axis=1)
  # gold labels; shape: (test size,)
  gold = np.argmax(y_test, axis=1)

  # macro-averaged f1
  f1 = f1_score(gold, predictions, average='macro')
  print('%s: f1 = %.3f' % (disease, f1))

  return f1
def data_dense(cfg, disease, judgement):
  """Data to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load pre-trained model and expose its representation layer
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer(cfg.get('data', 'rep_layer')).output)
  maxlen = model.get_layer(name='EL').get_config()['input_length']

  # determine whether to treat input tokens as a sequence or a set
  if cfg.get('data', 'model_type') == 'dan':
    use_cuis = True
    tokens_as_set = True
  else:
    use_cuis = False
    tokens_as_set = False

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)

  classes = len(set(y_train))
  print('unique labels in train:', classes)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  return x_train, y_train, x_test, y_test
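# A sketch of how the dense features returned by data_dense() might be
# consumed, mirroring the linear SVM setup in run_evaluation_dense(). The
# disease and judgement arguments are hypothetical placeholders.
if __name__ == "__main__":

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])

  x_train, y_train, x_test, y_test = data_dense(cfg, 'Asthma', 'intuitive')

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)
  print('f1 = %.3f' % f1_score(y_test, predictions, average='macro'))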