Code example #1
File: cross_validation.py  Project: peleiden/daLUKE
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(
        args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"])

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict,
                       dataset,
                       metadata,
                       device,
                       entity_embedding_size=ent_embed_size,
                       bert_attention=args["bert_attention"],
                       dropout=args["dropout"])

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate",
        np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))
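For reference, a minimal sketch of how this entry point might be invoked. The key names are taken from the args accesses above; every value (paths, fold count, hyperparameters) is a placeholder assumption, and load_dataset may read further keys that are not visible here.

# Hypothetical invocation; values are placeholders, only the key names come from the code above.
example_args = {
    "model": "path/to/daluke.tar.gz",     # archive read by load_from_archive
    "dataset": "DaNE",                    # dataset name passed to load_dataset
    "location": "local_data/cv-results",  # output directory for the res-cv* result files
    "k": 5,                               # number of cross-validation folds
    "bert_attention": False,
    "dropout": 0.1,
}
run_experiment(example_args)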
Code example #2
def collect_representations(
    modelpath: str, device: torch.device, target_device: torch.device,
    only_positives: bool, fine_tuned: bool
) -> tuple[np.ndarray, np.ndarray, list[dict[str, int | list[tuple[int,
                                                                   int]]]]]:
    entity_vocab, metadata, state_dict, token_map = load_from_archive(modelpath)
    log("Loading dataset")
    # Note: We don't fill out the dict as we don't allow changing max-entities and
    # max-entity-span here. If this results in an error for any dataset, we must change this.
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, FP_SIZE, shuffle=False)
    log("Loading model")
    if not fine_tuned:
        state_dict, ent_embed_size = mutate_for_ner(
            state_dict,
            mask_id=entity_vocab["[MASK]"]["id"],
            pad_id=entity_vocab["[PAD]"]["id"])
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size if not fine_tuned else None)
    model.eval()

    log("Forward passing examples")
    batch_representations, labels, content = list(), list(), list()
    for batch in tqdm(dataloader):
        # Use the superclass forward as we want the representations
        word_representations, entity_representations = super(
            type(model), model).forward(batch)
        start_word_representations, end_word_representations = model.collect_start_and_ends(
            word_representations, batch)
        representations = torch.cat(
            [start_word_representations, end_word_representations, entity_representations],
            dim=2,
        )
        # We don't want padding
        mask = batch.entities.attention_mask.bool()
        if only_positives:
            mask &= (batch.entities.labels != 0)
        batch_representations.append(
            representations[mask].contiguous().to(target_device))
        labels.append(
            batch.entities.labels[mask].contiguous().to(target_device))
        for i, text_num in enumerate(batch.text_nums):
            for j in range(batch.entities.N[i]):
                if mask[i, j]:
                    content.append(
                        dict(
                            text_num=text_num,
                            span=batch.entities.fullword_spans[i][j],
                        ))
    return (
        torch.cat(batch_representations).numpy(),
        torch.cat(labels).numpy(),
        content,
    )
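A possible call to collect_representations following the signature above; the archive path is a placeholder and the device choices are assumptions.

import torch

# Hypothetical call; only the parameter names and types are taken from the signature above.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
representations, labels, content = collect_representations(
    "path/to/daluke_ner.tar.gz",         # modelpath (placeholder)
    device=device,                       # device used for the forward passes
    target_device=torch.device("cpu"),   # where the collected tensors are moved
    only_positives=True,                 # keep only spans with a non-zero label
    fine_tuned=True,                     # skip mutate_for_ner for an already fine-tuned model
)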
Code example #3
def fetch_model(
        model: Models,
        force_download=False
) -> tuple[DaLUKE, dict, dict, Optional[np.ndarray]]:
    # Make sure .tar.gz model file exists
    os.makedirs(_download_dir, exist_ok=True)
    if should_download(model) or force_download:
        # Create status file
        pathlib.Path(_status_files[model]).touch()
        # Download
        wget.download(model.value, out=_model_files[model])
        # Remove status file
        os.remove(_status_files[model])

    # Read model state dict along with metadata and entity vocab
    # This is done in a separate working directory
    cwd = os.getcwd()
    os.chdir(_download_dir)
    entity_vocab, metadata, state_dict, token_map = load_from_archive(
        _model_files[model])
    os.chdir(cwd)

    # Load model
    bert_config = AutoConfig.from_pretrained(metadata["base-model"])
    bert_config.vocab_size = metadata["vocab-size"]
    if model == Models.DaLUKE:
        net = PretrainTaskDaLUKE(bert_config, len(entity_vocab),
                                 get_ent_embed_size(state_dict))
    elif model == Models.DaLUKE_NER:
        net = NERDaLUKE(
            output_shape=5,  # Always use misc in this case
            bert_config=bert_config,
            ent_vocab_size=2,
            ent_embed_size=get_ent_embed_size(state_dict),
            dropout=0,
            words_only=False,
            entities_only=False,
        )

    net.load_state_dict(state_dict)
    net.eval()

    return net.to(_device), metadata, entity_vocab, token_map
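A sketch of how fetch_model might be used downstream; Models.DaLUKE_NER comes from the branch above, everything else is illustrative.

# Hypothetical usage; force_download=True would re-fetch the .tar.gz archive.
net, metadata, entity_vocab, token_map = fetch_model(Models.DaLUKE_NER)
print(metadata["base-model"], len(entity_vocab))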
Code example #4
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
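The sampler here is built from a JSON file of parameter lists read from args["params"]. The exact schema is not shown, so the sketch below is only a guess at its shape (candidate values keyed by parameter name), inferred from json.load feeding SAMPLERS[args["sampler"]](param_lists) above.

import json

# Hypothetical hyperparameter lists; names and values are assumptions, only the
# "dict of candidate lists" shape is inferred from the code above.
param_lists = {
    "lr": [1e-5, 2e-5, 5e-5],
    "dropout": [0.0, 0.1, 0.2],
    "weight_decay": [0.0, 0.01],
}
with open("local_data/params.json", "w") as f:
    json.dump(param_lists, f, indent=4)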
Code example #5
def run_experiment(args: dict[str, Any]):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(dataloader, Split.TEST)
    type_distribution(dataset.data[Split.TEST].annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, dataloader, dataset, device, Split.TEST)

    results.save(args["location"])
    type_distribution(results.preds)
Code example #6
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True dev. set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
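Compared to the cross-validation entry point in example #1, this fine-tuning variant reads considerably more keys from args. A hedged sketch of such a configuration follows; every value is a placeholder assumption, only the key names come from the code above.

# Hypothetical fine-tuning configuration; values (and the type of loss_weight) are guesses.
example_args = {
    "name": "daluke-ner",
    "location": "local_data/fine-tune",
    "quieter": False,
    "seed": 0,
    "model": "path/to/daluke.tar.gz",
    "dataset": "DaNE",
    "batch_size": 16,
    "epochs": 5,
    "lr": 1e-5,
    "warmup_prop": 0.06,
    "weight_decay": 0.01,
    "loss_weight": False,   # value and type here are a guess
    "eval": True,
    "bert_attention": False,
    "dropout": 0.1,
    "words_only": False,
    "entities_only": False,
}
run_experiment(example_args)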