Example no. 1
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(
        args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"])

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict,
                       dataset,
                       metadata,
                       device,
                       entity_embedding_size=ent_embed_size,
                       bert_attention=args["bert_attention"],
                       dropout=args["dropout"])

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate",
        np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))
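
A minimal, hypothetical invocation of run_experiment, assuming only the
argument keys visible in the snippet above (the paths and values below are
placeholders, not project defaults):

args = {
    "model": "daluke.tar.gz",      # pretrained archive for load_from_archive
    "dataset": "DaNE",             # dataset name passed to load_dataset
    "location": "ner-results",     # directory where cross-validation results are saved
    "bert_attention": False,
    "dropout": 0.1,
    "k": 5,                        # number of cross-validation folds
}
run_experiment(args)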
Example no. 2
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
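    # Look up the chosen sampler class in the SAMPLERS registry and
    # instantiate it with the candidate parameter values from the JSON file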
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
Example no. 3
def main(path: str, n: int):
    log.configure(os.path.join(path, "geometry-examples.log"),
                  "daLUKE examples",
                  print_level=Levels.DEBUG)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hardcoded to train
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                        device).data[Split.TRAIN]
    set_seeds()
    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)
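    # OF_INTEREST maps result fields to the embedding dimension to inspect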
    for field, axis in OF_INTEREST.items():
        log.section(field)
        X = getattr(res, field)
        order = X[:, axis].argsort()

        log(f"Examples where dim. {axis} is high")
        _show_examples(res, X, order[::-1][:n], data)
        log(f"Examples where dim. {axis} is low")
        _show_examples(res, X, order[:n], data)
Example no. 4
def main(path: str, model: str, n_components: int,
         reducer_subsample: Optional[int], tsne_perplexity: float,
         umap_neighbours: int, umap_min_dist: float, only_positives: bool,
         fine_tuned: bool):
    set_seeds()
    log.configure(os.path.join(path, "geometry-analysis.log"),
                  "daLUKE embedding geometry analysis",
                  print_level=Levels.DEBUG)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
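    # Collect embeddings without gradient tracking to save memory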
    with torch.no_grad():
        representations, labels, content = collect_representations(
            model, device, torch.device("cpu"), only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")
    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)
    if reducer_subsample is not None:
        log.debug(
            f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE"
        )
        representations = representations[:reducer_subsample]
    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)
    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    log(
        "Saved analysis results to",
        GeometryResults(
            pca_transformed=pca_transformed,
            umap_transformed=umap_transformed,
            tsne_transformed=tsne_transformed,
            labels=labels,
            principal_components=principal_components,
            content=content,
        ).save(path),
    )
Example no. 5
import numpy as np
import pytest

import torch
import torch.nn as nn

from pelutils import set_seeds
from pelutils.ds import unique, no_grad

set_seeds(sum(ord(c) for c in "GME TO THE MOON! 🚀🚀🚀🚀🚀🚀🚀🚀"))


def test_unique():

    # Simple case: Ordered numbers from 0 to 99
    n = 100
    a = np.arange(n, dtype=np.uint32)
    u, index, inverse, counts = unique(a, return_index=True, return_inverse=True, return_counts=True)
    assert np.all(a == u)
    assert np.all(a == index)
    assert np.all(a == inverse)
    assert np.all(counts == 1)

    # Slightly more complex case with some non-unique values
    a[2:4] = 50
    a[[5, 16, 3]] = 69
    a = a.astype(np.float16)
    u, index, inverse, counts = unique(a, return_index=True, return_inverse=True, return_counts=True)
    argsort = np.argsort(u)
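    # pelutils' unique preserves order of first appearance, while np.unique
    # returns sorted output, so sort before comparing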
    npu, npindex, npcounts = np.unique(a, return_index=True, return_counts=True)
    assert np.all(u[argsort] == npu)
Example no. 6
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
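    # Log the distribution of annotation types in the training split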
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True dev. set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])