コード例 #1
0
ファイル: cross_validation.py プロジェクト: peleiden/daLUKE
def run_experiment(args: dict[str, Any]):
    """Cross-validate the NER model and save one result file per fold.

    Reads the keys "model", "dataset", "bert_attention", "dropout", "k" and
    "location" from ``args``; the full dict is also handed on to
    ``load_dataset`` and ``cross_validate``.
    """
    set_seeds(seed=0)
    # Clear the subfolder so the save location is controlled directly
    NER_Results.subfolder = ""

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    archive = load_from_archive(args["model"])
    entity_vocab, metadata, state_dict, token_map = archive
    mask_id = entity_vocab["[MASK]"]["id"]
    pad_id = entity_vocab["[PAD]"]["id"]
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict, mask_id=mask_id, pad_id=pad_id)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model_options = dict(
        entity_embedding_size=ent_embed_size,
        bert_attention=args["bert_attention"],
        dropout=args["dropout"],
    )
    model = load_model(state_dict, dataset, metadata, device, **model_options)

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for fold, fold_result in enumerate(cv_results):
        fold_result.save(os.path.join(args["location"], f"res-cv{fold}"))

    # Average the micro-averaged F1 score over the folds
    f1_scores = [
        fold_result.statistics["micro avg"]["f1-score"]
        for fold_result in cv_results
    ]
    log("Micro avg. F1 estimate", np.mean(f1_scores))
コード例 #2
0
def plots_vs_length(location: str):
    """Scatter dimensions of the reduced representations against lengths.

    For each of the PCA, t-SNE and UMAP projections in the loaded geometry
    results, at most the first four dimensions are plotted, once against the
    sequence length and once against the span length of every example. Each
    figure is saved as a PNG in the "geometry-plots" folder under ``location``.
    """
    results = GeometryResults.load()
    only_pos = not (results.labels == 0).any()
    # The training split is hardcoded here
    log.debug("Loading data...")
    dataset = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                           torch.device("cpu"))
    data = dataset.data[Split.TRAIN]
    seq_lengths = np.array(
        [len(data.texts[c["text_num"]]) for c in results.content])
    span_lengths = np.array(
        [c["span"][1] - c["span"][0] for c in results.content])

    max_dims = 4
    projections = {
        "PCA": results.pca_transformed,
        "t-SNE": results.tsne_transformed,
        "UMAP": results.umap_transformed,
    }
    length_kinds = (("sequence", seq_lengths), ("span", span_lengths))
    for name, Z in projections.items():
        for dim in range(min(Z.shape[1], max_dims)):
            for lenname, lengths in length_kinds:
                log.debug(f"Plotting {name}{dim} on {lenname}")
                _, ax = plt.subplots(figsize=figsize_std)
                ax.set_title(
                    f"{name} Representations, Dim. {dim+1} vs. Example {lenname.title()} Length"
                )
                component = Z[:, dim]
                # Lengths and labels are cut to the number of projected points
                count = len(component)
                _scatter_transformed(lengths[:count], component,
                                     results.labels[:count], ax)
                ax.legend(*_get_h_l(only_pos), loc="lower right")
                ax.set_ylabel(f"{name}$_{dim+1}$")
                ax.set_xlabel(f"Entity Example {lenname.title()} Length")

                plt.tight_layout()
                out_path = os.path.join(location, "geometry-plots",
                                        f"{name}{dim}-{lenname}-len.png")
                plt.savefig(out_path)
                plt.close()
コード例 #3
0
ファイル: pred_corr.py プロジェクト: peleiden/daLUKE
def main(daluke_path: str, other_path: str, show: bool):
    """Compare DaLUKE NER predictions with the predictions of another model.

    A log file named after the other model is configured inside
    ``daluke_path``. If ``show`` is set, every DaNE test example where the two
    prediction sequences differ is logged as a table. Finally, the confusion
    matrix between the two sets of predictions and their covariance are logged.
    """
    other_name = os.path.basename(other_path)
    logfile = os.path.join(daluke_path, f"comparison_with_{other_name}.log")
    log.configure(logfile, print_level=Levels.DEBUG)

    daluke_res = NER_Results.load(daluke_path)
    other_res = NER_TestResults.load(other_path)
    if show:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        dataset = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device)
        test_data = dataset.data[Split.TEST]
        examples = zip(
            daluke_res.preds,
            other_res.predictions,
            test_data.annotations,
            test_data.texts,
        )
        for da_preds, ot_preds, truths, text in examples:
            if da_preds != ot_preds:
                # One row per sequence, each prefixed with a header cell
                rows = (
                    ("Text:", text),
                    ("Truth:", truths),
                    ("DaLUKE pred:", da_preds),
                    (f"{other_name} pred:", ot_preds),
                )
                table = Table()
                for header, cells in rows:
                    table.add_row([header] + cells)
                log(str(table).replace("|", ""), with_info=False)

    log(f"Confusion matrix with DaLUKE results ↓ and results from {other_name} →")
    confmat = confusion_matrix(daluke_res.preds, other_res.predictions,
                               ["LOC", "PER", "ORG", "MISC", "O"])
    log(_format_confmat(confmat))
    log(f"Covar. {sequence_covar(daluke_res.preds, other_res.predictions)}")
コード例 #4
0
def collect_representations(
    modelpath: str, device: torch.device, target_device: torch.device,
    only_positives: bool, fine_tuned: bool
) -> tuple[np.ndarray, np.ndarray, list[dict[str, int | list[tuple[int,
                                                                   int]]]]]:
    """Forward pass the DaNE training split and collect span representations.

    Args:
        modelpath: Archive location handed to ``load_from_archive``.
        device: Device the dataset and model are loaded on.
        target_device: Device the collected tensors are moved to before they
            are concatenated and converted to numpy arrays.
        only_positives: If true, spans whose label is 0 are left out.
        fine_tuned: If false, the state dict is first passed through
            ``mutate_for_ner``; if true, it is loaded unchanged.

    Returns:
        A tuple ``(representations, labels, content)`` where ``content`` holds
        one dict per kept span with its ``text_num`` and ``span``.
    """
    # The archive is given by the modelpath parameter; no `args` exists here
    entity_vocab, metadata, state_dict, token_map = load_from_archive(
        modelpath)
    log("Loading dataset")
    # Note: We dont fill out dict as we dont allow changing max-entities and max-entity-span here. If this results in an error for any dataset, we must change this.
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, FP_SIZE, shuffle=False)
    log("Loading model")
    ent_embed_size = None
    if not fine_tuned:
        state_dict, ent_embed_size = mutate_for_ner(
            state_dict,
            mask_id=entity_vocab["[MASK]"]["id"],
            pad_id=entity_vocab["[PAD]"]["id"])
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size)
    model.eval()

    log("Forward passing examples")
    batch_representations, labels, content = list(), list(), list()
    # Inference only: the outputs are converted to numpy below, which is not
    # possible for tensors that track gradients, so autograd is switched off.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Use the super class forward as we want the representations
            word_representations, entity_representations = super(
                type(model), model).forward(batch)
            start_word_representations, end_word_representations = (
                model.collect_start_and_ends(word_representations, batch))
            representations = torch.cat(
                [
                    start_word_representations,
                    end_word_representations,
                    entity_representations,
                ],
                dim=2,
            )
            # We dont want padding
            mask = batch.entities.attention_mask.bool()
            if only_positives:
                mask &= (batch.entities.labels != 0)
            batch_representations.append(
                representations[mask].contiguous().to(target_device))
            labels.append(
                batch.entities.labels[mask].contiguous().to(target_device))
            # Record which text and span every kept representation came from
            for i, text_num in enumerate(batch.text_nums):
                for j in range(batch.entities.N[i]):
                    if mask[i, j]:
                        content.append(
                            dict(
                                text_num=text_num,
                                span=batch.entities.fullword_spans[i][j],
                            ))
    return torch.cat(batch_representations).numpy(), torch.cat(
        labels).numpy(), content
コード例 #5
0
def main(path: str, pred: str, truth: str):
    """Log DaNE test examples with a specific kind of misclassification.

    An example is logged as a table when at least one position has a
    prediction that differs from the annotation, and where ``cla`` of the
    prediction equals ``pred`` and ``cla`` of the annotation equals ``truth``.
    """
    logfile = os.path.join(path, f"prediction-examples-{pred}-{truth}.log")
    log.configure(logfile, print_level=Levels.DEBUG)
    log(f"Looking for examples where model predicted {pred}, but the truth was {truth}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    res = NER_Results.load(path)
    dataset = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device)
    data = dataset.data[Split.TEST]

    def is_sought_error(predicted, annotated) -> bool:
        # A wrong prediction of exactly the requested class pair
        return (predicted != annotated
                and cla(predicted) == pred
                and cla(annotated) == truth)

    for preds, truths, text in zip(res.preds, data.annotations, data.texts):
        if not any(is_sought_error(p, a) for p, a in zip(preds, truths)):
            continue
        table = Table()
        for header, cells in (("Text:", text), ("Truth:", truths),
                              ("Pred:", preds)):
            table.add_row([header] + cells)
        log(str(table).replace("|", ""), with_info=False)
コード例 #6
0
def make_cal_plots(location: str, base_model: str):
    """Build a calibration plot from saved NER results on the DaNE test split.

    The span probabilities stored in the results at ``location`` are paired
    with the true span labels of the test data, and both flat lists are
    passed to ``calibration_plot`` together with ``location``.
    """
    log.configure(os.path.join(location, "calibration-plot.log"))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = NER_Results.load(location)

    log("Loading data")
    metadata = {**DEFAULT_METADATA, "base-model": base_model}
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device)
    dataloader = dataset.build(Split.TEST, 1, shuffle=False)

    log("Fetching probs and labels")
    # One span -> label mapping per text, indexed by text number
    truths = [{} for _ in range(len(results.span_probs))]
    for _, example in dataloader.dataset:
        span_labels = zip(example.entities.fullword_spans,
                          example.entities.labels)
        truths[example.text_num].update(dict(span_labels))

    flat_preds, flat_truths = [], []
    for text_probs, text_truths in zip(results.span_probs, truths):
        for span, probs in text_probs.items():
            flat_preds.append(probs)
            flat_truths.append(text_truths[span])

    log("Calibration plot")
    calibration_plot(flat_preds, flat_truths, location)
コード例 #7
0
def run_experiment(args: dict[str, Any]):
    """Set up the sampler, dataset and model, then run ``optimize``.

    Reads the keys "model", "params", "sampler" and "dataset" from ``args``.
    "params" is the path of a JSON file whose content is given to the sampler
    selected from ``SAMPLERS``. The full dict is also passed to
    ``load_dataset`` and ``optimize``.
    """
    set_seeds(seed=0)
    # Clear the subfolder so the save location is controlled directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    archive = load_from_archive(args["model"])
    entity_vocab, metadata, state_dict, token_map = archive
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"],
    )

    log("Setting up sampler")
    with open(args["params"], "r") as params_file:
        param_lists = json.load(params_file)
    sampler_cls = SAMPLERS[args["sampler"]]
    sampler = sampler_cls(param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size,
    )

    optimize(model, dataset, args, sampler)
コード例 #8
0
def run_experiment(args: dict[str, Any]):
    """Evaluate a NER model on the test split and save the results.

    Reads the keys "model" and "location" from ``args``; the full dict is
    also passed to ``load_dataset``. Results are saved to ``args["location"]``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    archive = load_from_archive(args["model"])
    # The first element of the archive is not needed for evaluation
    _, metadata, state_dict, token_map = archive

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    test_loader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Output the model, the data documentation and the true type distribution
    log.debug(model)
    dataset.document(test_loader, Split.TEST)
    test_annotations = dataset.data[Split.TEST].annotations
    type_distribution(test_annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, test_loader, dataset, device, Split.TEST)

    results.save(args["location"])
    type_distribution(results.preds)
コード例 #9
0
def main(path: str, n: int):
    """Show the ``n`` most extreme examples along each dimension of interest.

    For every (field, axis) pair in ``OF_INTEREST``, the examples are ordered
    by their value along ``axis`` of the matching field of the geometry
    results. The ``n`` highest and the ``n`` lowest are then handed to
    ``_show_examples``.
    """
    log.configure(
        os.path.join(path, "geometry-examples.log"),
        "daLUKE examples",
        print_level=Levels.DEBUG,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The training split is hardcoded here
    dataset = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device)
    data = dataset.data[Split.TRAIN]
    set_seeds()
    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)
    for field, axis in OF_INTEREST.items():
        log.section(field)
        projected = getattr(res, field)
        ascending = projected[:, axis].argsort()

        log(f"Examples where dim. {axis} is high")
        highest = ascending[::-1][:n]
        _show_examples(res, projected, highest, data)
        log(f"Examples where dim. {axis} is low")
        lowest = ascending[:n]
        _show_examples(res, projected, lowest, data)
コード例 #10
0
def run_experiment(args: dict[str, Any]):
    """Fine-tune DaLUKE for NER and save the model and training results.

    Reads these keys from ``args``: "location", "name", "quieter", "seed",
    "words_only", "entities_only", "model", "dataset", "batch_size", "eval",
    "bert_attention", "dropout", "epochs", "lr", "warmup_prop",
    "weight_decay" and "loss_weight". The full dict is also passed to
    ``load_dataset``.

    The trained model is archived under ``args["location"]``. When
    ``args["eval"]`` is set, the true type distributions of the dev and
    training splits are stored on the results and the best model is archived
    as well.

    Raises:
        AssertionError: If both "words_only" and "entities_only" are set.
    """
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    # The dev dataloader is only built when evaluation is requested
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log(f"Saving results and model to {args['location']}")
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        # This message previously repeated the dev. set header by mistake
        log("True training set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])