from argparse import Namespace

import tensorflow as tf
import tensorflow_estimator as tfe

from hedgedog.tf.io.dataset import Dataset
from hedgedog.tf.models.multitask_bert_model import MultitaskBertModel
from hedgedog.tf.typing import TensorOrTensorDict
from hedgedog.logging import get_logger

from el.data.dataset import NerDataset
from el.config import model_ing
from el.model.boundary import BoundaryModule
from el.model.normalization import NormalizationModule
from el.model.type import TypingModule, TypeEmbeddingModule
from el.model.cake import CakeModule

log = get_logger("el.model")


class MTBertModel(MultitaskBertModel):
    @model_ing.capture
    def __init__(self, mode: str, hyperparameters: Namespace, bert_model: str, dataset: Dataset = None):
        is_training = mode == "TRAIN"
        prediction_modules = []
        self.module_names = hyperparameters.model.modules
        # Assemble the task-specific prediction heads named in the config.
        if 'boundary' in hyperparameters.model.modules:
            prediction_modules.append(BoundaryModule(hyperparameters, is_training))
        if 'norm' in hyperparameters.model.modules:
            prediction_modules.append(NormalizationModule(hyperparameters, is_training))
        if 'cake' in hyperparameters.model.modules:
            cake_model_cons = {
                'basic': CakeModule
from collections import defaultdict
from pathlib import Path

import tensorflow_estimator as tfe

from hedgedog.logging import get_logger
from hedgedog.tf.estimator import train as hdtrain

from el.eval.evaluation import Evaluation
from el.eval.example import Example
from el.eval.span import Span

log = get_logger("el.eval")


def f1_eval(name, gold_spans, predicted_spans, sentence_texts, params):
    # Strict evaluation: a predicted span must match a gold span exactly.
    exact_eval = Evaluation(set(gold_spans), set(predicted_spans))
    report = "\nExact Boundary Evaluation\n----------" + \
             f"\nP: {exact_eval.precision()}" + \
             f"\nR: {exact_eval.recall()}" + \
             f"\nF1: {exact_eval.f1()}"
    log.info(report)

    # Lenient evaluation: overlapping spans are credited as matches.
    partial_eval = Evaluation(set(gold_spans), set(predicted_spans), partial=True)
    partial_report = "\n----------\nPartial Boundary Evaluation\n----------" + \
                     f"\nP: {partial_eval.precision()}" + \
                     f"\nR: {partial_eval.recall()}" + \
                     f"\nF1: {partial_eval.f1()}"
    log.info(partial_report)
    report += partial_report
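# Hedged illustration, not part of the original file: exact-match P/R/F1 over
# span sets reduces to set intersection. This standalone helper sketches the
# arithmetic that Evaluation presumably implements for the exact case; the
# partial=True variant would additionally credit overlapping spans.
def _prf1_sketch(gold, predicted):
    tp = len(set(gold) & set(predicted))  # true positives: spans in both sets
    p = tp / len(predicted) if predicted else 0.0
    r = tp / len(gold) if gold else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f1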
from sacred import Experiment
from hedgedog.tf.estimator.ingredients import *
from hedgedog.tf.io.dataset_util import inspect_tfrecords, iterate_records
from hedgedog.tf.estimator import train as hdtrain
from hedgedog.tf.sacred import convert_to_namespace

from el.data.dataset import NerDataset
# noinspection PyUnresolvedReferences
from el import config as conf
from el.model.model import MTBertModel
from el.data import clef, medmentions
from el.eval.boundary import boundary_eval
from el.eval.entity import end_to_end_eval

logging.reset_handlers()
log = logging.get_logger('el')

ex = Experiment(ingredients=[
    sampling_ingredient,
    dataset_ingredient,
    estimator_ingredient,
    conf.model_ing,
    training_ingredient
])


# Sacred exposes each @ex.command on the command line with `with` overrides,
# e.g. (illustrative invocation): python run.py train with estimator.batch_size=16
@ex.command
def train(_run):
    params = convert_to_namespace(_run.config)
    hdtrain.train(_run, model_class=MTBertModel, parameters=params)


@ex.command
def evaluate(_run):
from abc import ABC
import os

import numpy as np
import tensorflow as tf

from hedgedog.tf.typing import TensorDict, TensorOrTensorDict
from hedgedog.tf import layers as hdlayers
from hedgedog.tf import metrics as hdmetrics
from hedgedog.logging import get_logger
import hedgedog.tf.models.bert as modeling

from el.model.normalization import NormalizationModule
from el.config import model_ing

log = get_logger("el.model.cake")


class CakeModule(NormalizationModule):
    @model_ing.capture
    def __init__(self, params, is_training, ace_path: str, train_bert: bool, include_cls_sep: bool = True):
        super().__init__(params, is_training)
        # half, just embs, no proj_embs
        self.embedding_size = 50
        self.rnn_num_layers = 1
        self.rnn_hidden_size = 512
        self.train_bert = train_bert
from typing import List
from pathlib import Path
import traceback
import json

from tqdm import tqdm

from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.tf.estimator.ingredients import dataset_ingredient

from el.data.text import Concept, Document, Span

log = get_logger("mm.data.medmentions")


class MedMentionsDocument(Document):
    def __init__(self, lines: List[str], umls, k, mention2idx):
        # PubTator header lines: "did|t|title" then "did|a|abstract". The
        # maxsplit guards against '|' inside the title itself, mirroring the
        # '|'.join(...) used for the abstract line below.
        did, _, title = lines[0].strip().split('|', 2)
        text = '|'.join(lines[1].strip().split('|')[2:])
        doc_string = title + '. ' + text
        entities = []
        for eid, line in enumerate(lines[2:]):
            # Loop-local names avoid clobbering `did` and `text` from the header.
            _, start, end, mention_text, types, cui = line.strip().split('\t')
            cui = cui.replace('UMLS:', '')
            start, end = int(start), int(end)
            # Source offsets assume a single separator character between title
            # and abstract; '. ' adds one extra, so abstract spans shift by 1.
            if start > len(title):
                start += 1
                end += 1
            entities.append(Concept([Span(start, end, mention_text)], types.split(','), cui))
        super().__init__(did, doc_string, entities, umls, k, mention2idx)
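# Hedged usage sketch, not part of the original file: a minimal PubTator-style
# document wired through MedMentionsDocument. The document id, offsets, and the
# `umls`/`k`/`mention2idx` arguments are illustrative stand-ins.
def _medmentions_example(umls, mention2idx, k=10):
    lines = [
        "123456|t|Example title",
        "123456|a|Example abstract mentioning aspirin.",
        # Tab-separated: did, start, end, mention text, semantic types, CUI.
        # Offsets index into title + single separator + abstract, so
        # "aspirin" starts at len("Example title") + 1 + 28 = 42.
        "123456\t42\t49\taspirin\tT109,T121\tUMLS:C0004057",
    ]
    return MedMentionsDocument(lines, umls, k, mention2idx)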
import tensorflow as tf
from tensorflow.contrib import crf

from hedgedog.tf.estimator.multitask import Module
from hedgedog.tf.typing import TensorDict, TensorOrTensorDict
from hedgedog.nlp.seq_tag import get_tags
from hedgedog.tf import layers as hdlayers
from hedgedog.tf import metrics as hdmetrics
from hedgedog.logging import get_logger

from el.config import model_ing

log = get_logger("el.model.boundary")


class BoundaryModule(Module):
    @model_ing.capture
    def __init__(self, params, is_training, verbose_eval, use_bilstm, thiccness):
        super().__init__(params, is_training)
        tags = get_tags(params.dataset.tagset)
        self.boundary2id = tags.tag2id()
        d = len(self.boundary2id)
        # Learned CRF transition scores between the d boundary tags.
        self.crf_params = tf.get_variable("crf_params", shape=[d, d], dtype=tf.float32)
        self.verbose_eval = verbose_eval
        self.use_bilstm = use_bilstm
        self.thiccness = thiccness
        log.info(f"Initialized Boundary Module with tagset: {params.dataset.tagset} {self.boundary2id}")
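# Hedged sketch, not part of the original file: the standard TF 1.x way a
# [d, d] transition matrix like self.crf_params is consumed via
# tensorflow.contrib.crf. The module's actual loss/prediction wiring lives in
# methods not shown here. `logits` is [batch, seq_len, d], `tag_ids` is
# [batch, seq_len], `seq_lens` is [batch].
def _crf_sketch(logits, tag_ids, seq_lens, transition_params):
    # Negative mean log-likelihood as the training loss.
    log_likelihood, _ = crf.crf_log_likelihood(
        logits, tag_ids, seq_lens, transition_params=transition_params)
    loss = -tf.reduce_mean(log_likelihood)
    # Viterbi decoding for the most likely tag sequence at prediction time.
    decoded_tags, _ = crf.crf_decode(logits, transition_params, seq_lens)
    return loss, decoded_tags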
import json
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Callable, Generator

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator

from el.data.dataset import overlap
from el.eval import init_model, f1_eval
from el.eval.span import Span, wpid_sequence_to_string

log = get_logger("el.eval.entity")


def end_to_end_eval(model_class, params):
    gold_spans = _load_gold_spans(params)
    sentence_dicts = _load_sentences(params)
    # num_steps = params.dataset.num_test if params.estimator.eval_test_set else params.dataset.num_dev
    # sentence_dicts: Dict[str, Dict[str, None]] = {s['sentence_id']: s
    #                                               for s in tqdm(predict_boundaries(model_class, params), total=num_steps)}
    log.info("Boundary detection done. Running entity linking...")
    predicted_spans = []
    sentence_texts = {}
    id2wp = {v: k for k, v in model_class.dataset().wptokenizer.vocab.items()}
    for sentence in predict_entities(model_class, params, sentence_dicts):
        # noinspection PyTypeChecker
        predicted_spans.extend(sentence['spans'])
from pathlib import Path
from typing import List
from string import punctuation

from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidate, UmlsCandidateGenerator

log = get_logger("mm.data.brat")
# Treat apostrophes as part of tokens rather than punctuation.
punctuation = punctuation.replace('\'', '')


class LazySpacy:
    """Defers loading the scispaCy model until the first annotate() call."""

    def __init__(self):
        self.spacy = None

    def annotate(self, text):
        if self.spacy is None:
            from hedgedog.nlp.spacy import SpacyAnnotator
            self.spacy = SpacyAnnotator('en_core_sci_sm', 'default', ['parser'])
        return self.spacy.annotate(text)


spacy = LazySpacy()


class Span:
    def __init__(self, start: int, end: int, text: str):
        self.start = start
        self.end = end
        self.text = text
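# Hedged usage note, not part of the original file: the module-level `spacy`
# proxy keeps importing this module cheap, since en_core_sci_sm is only loaded
# on first use, e.g.:
#
#   doc = spacy.annotate("Patient denies chest pain.")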
from pathlib import Path
from typing import Generator
import json
import numpy as np
from collections import defaultdict

from hedgedog.tf.estimator.ingredients import dataset_ingredient
from hedgedog.tf.io.dataset import FeatureDataset, T
from hedgedog.tf.io.Feature import *
from hedgedog.nlp.wordpiece_tokenization import load_wordpiece_tokenizer
from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.nlp.seq_tag import get_tags, ContinuationBoundaryLabeler, IOBESContinuationBoundaryAggregator

log = get_logger("mm.data.dataset")


class NerDataset(FeatureDataset):
    @dataset_ingredient.capture
    def __init__(self, data_dir: str, batch_size: int, bert_model: str, project_dir: str,
                 candidates_per_concept: int, record_dir_name: str, tagset,
                 ignore_sentences_without_concepts, dataset, mention_candidate_path):
        super().__init__(data_dir, batch_size)
        info_dir = Path(project_dir) / 'info'
        self.wptokenizer = load_wordpiece_tokenizer(bert_model)
        self.cui2id = json.load((info_dir / 'cui2id.json').open())
        self.tui2label_id = json.load((info_dir / 'tui2label.json').open())
        self.candidates_per_concept = candidates_per_concept
        self.filter = ignore_sentences_without_concepts
        self.mention2idx = json.load(
import json
import traceback
from pathlib import Path

from tqdm import tqdm

from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.tf.estimator.ingredients import dataset_ingredient

from el.data.text import Concept, Document, Span

log = get_logger("mm.data.clef")


class ClefDocument(Document):
    def __init__(self, data_dir: Path, doc_id: str, umls, k, code2id, id2code, id2types, mention2idx):
        text = (data_dir / f"{doc_id}.text").read_text().replace('\t', ' ')
        concepts = []
        with (data_dir / f"{doc_id}.pipe").open('r') as f:
            for line in f:
                line = line.strip()
                try:
                    # '||'-delimited annotation: the CUI sits in field 2;
                    # fields 3+ are alternating start/end character offsets,
                    # with multiple pairs for discontinuous mentions.
                    fields = line.split('||')
                    _, cui = fields[1], fields[2]
                    spans = []
                    for i in range(3, len(fields), 2):
                        start, end = int(fields[i]), int(fields[i + 1])
                        spans.append(Span(start, end, text[start:end]))
                except ValueError as e:
                    log.error(f"Could not parse line: ##{line}##")
                    log.error(e)
                    traceback.print_exc()
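# Hedged illustration, not part of the original file: a hypothetical .pipe
# line in the shape the parser above expects (field 1 is ignored here, field 2
# is the CUI, and the 10/20 and 25/31 pairs mark a discontinuous mention):
#
#   00098-discharge-summary.txt||Disease_Disorder||C0008031||10||20||25||31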
from typing import List, Dict, Generator
from tqdm import tqdm
import json
from pathlib import Path

from hedgedog.logging import get_logger
from hedgedog.nlp.seq_tag import IOBESContinuationBoundaryAggregator

from el.eval import init_model, f1_eval
from el.eval.span import Span, wpid_sequence_to_string

log = get_logger("el.eval.boundary")


def boundary_eval(model_class, params):
    """Conducts f1/p/r eval given a list of prediction and label dicts."""
    log.info("Beginning evaluation...")
    ds = model_class.dataset()
    id2wp = {v: k for k, v in ds.wptokenizer.vocab.items()}
    gold_spans = []
    predicted_spans = []
    sentence_texts = {}
    num_steps = params.dataset.num_test if params.estimator.eval_test_set else params.dataset.num_dev
    sentence_dicts = {}
    text2id = {}
    for sentence in tqdm(predict_boundaries(model_class, params), total=num_steps):
        sid = sentence['sentence_id']
        sentence_texts[sid] = wpid_sequence_to_string(sentence['tokens'], id2wp)