Example 1
import os

import spacy
from spacy import util

# Project-local helpers; these import paths are assumed from the scispacy
# repository layout (`train` is the function shown in Example 5).
from scispacy.data_util import read_ner_from_tsv
from scispacy.train_utils import evaluate_ner


def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = False,
              model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):

    util.fix_random_seed(util.env_opt("seed", 0))
    train_data = read_ner_from_tsv(train_data_path)
    dev_data = read_ner_from_tsv(dev_data_path)
    test_data = read_ner_from_tsv(test_data_path)
    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp,
                     dev_data,
                     dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp,
                     test_data,
                     dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter,
              meta_overrides)
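
A minimal invocation sketch for the function above; all paths, the output
directory, and the iteration count are placeholder values, and the TSV files
are assumed to be in whatever format read_ner_from_tsv expects.

train_ner(output_dir="ner_output",
          train_data_path="data/train.tsv",
          dev_data_path="data/dev.tsv",
          test_data_path="data/test.tsv",
          run_test=False,
          model=None,
          n_iter=10)
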
Example 2
import os

import spacy

# Project-local helpers; these import paths are assumed from the scispacy
# repository layout.
from scispacy.data_util import read_full_med_mentions
from scispacy.train_utils import evaluate_ner
from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv


def train_ner(output_dir: str,
              data_path: str,
              run_test: bool = False,
              model: str = None,
              n_iter: int = 100,
              label_granularity: int = None):

    # Defaults: keep full labels. `span_only` is only switched on for
    # granularity 0, which collapses every entity type to a single untyped
    # span; initialising both here avoids an unbound-variable error for
    # granularity levels other than 0.
    label_mapping = None
    span_only = False
    if label_granularity is not None:
        umls_tree = construct_umls_tree_from_tsv(
            "data/umls_semantic_type_tree.tsv")
        label_mapping = umls_tree.get_collapsed_type_id_map_at_level(
            label_granularity)
        if label_granularity == 0:
            span_only = True
    train_data, dev_data, test_data = read_full_med_mentions(
        data_path, label_mapping, span_only)
    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp,
                     dev_data,
                     dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp,
                     test_data,
                     dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, output_dir, n_iter)
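
As a usage sketch (placeholder paths): label_granularity collapses the UMLS
semantic types to a coarser level of the type tree, and granularity 0 further
reduces the task to untyped span detection. The specific level used below is
an assumption for illustration.

# Typed NER with labels collapsed to a coarse level of the UMLS type tree.
train_ner("med_mentions_output", "data/med_mentions", label_granularity=3)

# Untyped mention detection: every entity type collapses to one span label.
train_ner("span_only_output", "data/med_mentions", label_granularity=0)
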
Example 3
import importlib.util
from typing import Optional

import spacy

# Project-local helpers; these import paths are assumed from the scispacy
# repository layout.
from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv
from scispacy.train_utils import evaluate_ner


def main(model_path: str, dataset: str, output_path: str, code: Optional[str],
         med_mentions_folder_path: Optional[str]):
    if code is not None:
        # Custom code must be imported before loading a spaCy model so that
        # anything it registers is available; the module name is arbitrary.
        spec = importlib.util.spec_from_file_location("custom_code", str(code))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

    nlp = spacy.load(model_path)
    if dataset.startswith("medmentions"):
        train_data, dev_data, test_data = read_full_med_mentions(
            med_mentions_folder_path, None, False)
        data_split = dataset.split("-")[1]
        if data_split == "train":
            data = train_data
        elif data_split == "dev":
            data = dev_data
        elif data_split == "test":
            data = test_data
        else:
            raise ValueError(f"Unrecognized split {data_split}")
    else:
        data = read_ner_from_tsv(dataset)

    evaluate_ner(nlp, data, dump_path=output_path)
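
The dataset argument doubles as a switch: a name of the form
medmentions-{train,dev,test} selects a MedMentions split, while anything else
is treated as a path to a TSV file. A sketch with placeholder paths:

# Evaluate a trained model on the MedMentions test split.
main(model_path="ner_output/best",
     dataset="medmentions-test",
     output_path="test_metrics.json",
     code=None,
     med_mentions_folder_path="data/med_mentions")

# Evaluate the same model on a custom TSV dataset.
main("ner_output/best", "data/dev.tsv", "dev_metrics.json", None, None)
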
Example 4
import os
import pickle
from pathlib import Path

import spacy
from spacy import util

# Project-local helpers; these import paths are assumed from the scispacy
# repository layout (`train` is the function shown in Example 5).
from scispacy.train_utils import evaluate_ner


def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = False,
              model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):

    # This variant hard-codes the base model, overriding the `model` argument.
    model = "en_core_sci_sm"
    util.fix_random_seed(util.env_opt("seed", 0))

    with Path(train_data_path).open('rb') as file:
        train_data = pickle.load(file)

    with Path(dev_data_path).open('rb') as file:
        dev_data = pickle.load(file)

    with Path(test_data_path).open('rb') as file:
        test_data = pickle.load(file)

    # The TSV readers from Example 1, replaced here by the pickle loads above:
    # train_data = read_ner_from_tsv(train_data_path)
    # dev_data = read_ner_from_tsv(dev_data_path)
    # test_data = read_ner_from_tsv(test_data_path)

    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp,
                     dev_data,
                     dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp,
                     test_data,
                     dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter,
              meta_overrides)
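
This variant expects pre-pickled splits instead of the TSV files of Example 1.
A sketch of producing one such pickle, assuming the data uses the same
(text, annotations) tuples that the spaCy v2 training loop in Example 5
consumes; the example sentence, entity offsets, and label are hypothetical.

import pickle
from pathlib import Path

# Hypothetical training pair in spaCy v2 gold format: character offsets
# plus a label for each entity ("EGFR" spans characters 0-4).
train_data = [
    ("EGFR mutations predicted response to therapy .",
     {"entities": [(0, 4, "GENE")]}),
]

with Path("data/train.pkl").open("wb") as file:
    pickle.dump(train_data, file)
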
Example 5
import json
import os
import random
import shutil
from pathlib import Path

import spacy
import tqdm
from spacy import util
from spacy.util import minibatch

# Project-local helper; this import path is assumed from the scispacy
# repository layout. `WhitespaceTokenizer` is sketched after this example.
from scispacy.train_utils import evaluate_ner


def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)

    # Train with whitespace tokenization (the gold data is assumed to be
    # pre-tokenized on spaces); the original tokenizer is restored before
    # each save so the shipped model tokenizes normally.
    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Dropout decays from `dropout_from` towards `dropout_to` by
    # `dropout_decay` per step (constant at 0.2 with these defaults);
    # batch size compounds from `batch_from` up to `batch_to` by a factor
    # of `batch_compound` per batch.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir) / str(i)
        output_dir_path.mkdir(parents=True, exist_ok=True)

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)
        # Re-install the whitespace tokenizer: the save above switched `nlp`
        # back to its original tokenizer, which would otherwise be used for
        # the remaining training epochs.
        nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir) / "best"
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2,
                 dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2,
                 test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
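
WhitespaceTokenizer is used throughout these examples but never defined. A
minimal sketch matching spaCy v2's custom-tokenizer interface, adapted from
the spaCy documentation; the project's actual implementation may differ.

from spacy.tokens import Doc


class WhitespaceTokenizer:
    """Tokenize on single spaces only, so tokens match pre-tokenized data."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Every token 'owns' a subsequent space character in this tokenizer.
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)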