Example #1
def main(daluke_path: str, other_path: str, show: bool):
    other_name = os.path.split(other_path)[-1]
    log.configure(os.path.join(daluke_path,
                               f"comparison_with_{other_name}.log"),
                  print_level=Levels.DEBUG)

    daluke_res = NER_Results.load(daluke_path)
    other_res = NER_TestResults.load(other_path)
    if show:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                            device).data[Split.TEST]
        for da_preds, ot_preds, truths, text in zip(daluke_res.preds,
                                                    other_res.predictions,
                                                    data.annotations,
                                                    data.texts):
            if da_preds != ot_preds:
                t = Table()
                t.add_row(["Text:"] + text)
                t.add_row(["Truth:"] + truths)
                t.add_row(["DaLUKE pred:"] + da_preds)
                t.add_row([f"{other_name} pred:"] + ot_preds)
                log(str(t).replace("|", ""), with_info=False)

    log(f"Confusion matrix with DaLUKE results ↓ and results from {other_name} →"
        )
    log(
        _format_confmat(
            confusion_matrix(daluke_res.preds, other_res.predictions,
                             ["LOC", "PER", "ORG", "MISC", "O"])))
    log(f"Covar. {sequence_covar(daluke_res.preds, other_res.predictions)}")
Example #2
def word_preds(datadir: str, ff_size: int):
    log.configure(os.path.join(datadir, "dabert-word-preds.log"), "daBERT word predictions")
    log("Loading metadata")
    with open(os.path.join(datadir, DatasetBuilder.metadata_file)) as f:
        metadata = json.load(f)
    log("Loading model")
    dabert = AutoModelForPreTraining.from_pretrained(daBERT).to(device)
    log("Loading data")
    dataloader = DataLoader(
        datadir,
        metadata,
        dict(),
        device,
    )
    loader = dataloader.get_dataloader(ff_size, None)
    log("Forward passing")
    correct_preds = np.zeros(len(loader))
    for i, batch in tqdm(enumerate(loader), total=len(loader)):
        logits = dabert(batch.words.ids).prediction_logits
        masked_logits = logits[batch.word_mask]
        preds = masked_logits.argmax(dim=1)
        correct_preds[i] = (preds == batch.word_mask_labels).float().mean().cpu()
    log(
        "MLM token prediction accuracy",
        "  Mean: %.4f %%" % (100 * correct_preds.mean()),
        "  Std.: %.4f %%" % (100 * correct_preds.std(ddof=1)),
    )
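
For reference, a minimal sketch of the accuracy computation inside the loop above, using toy tensors rather than the project's data classes (shapes and token ids are made up):

import torch

# Toy batch: 2 sequences of length 6 over a 1000-token vocabulary
logits = torch.randn(2, 6, 1000)                 # prediction logits
word_mask = torch.zeros(2, 6, dtype=torch.bool)  # which positions were masked
word_mask[0, 2] = word_mask[1, 4] = True
labels = torch.tensor([17, 42])                  # true token ids at the masked positions

preds = logits[word_mask].argmax(dim=1)          # (n_masked, vocab) -> (n_masked,)
accuracy = (preds == labels).float().mean()      # fraction of masked tokens predicted correctly
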
Example #3
def main():
    parser = Parser(ARGUMENTS, name="NER_Test", multiple_jobs=False)
    exp = parser.parse()

    log.configure(
        os.path.join(parser.location, "danish-ner.log"), "Benchmark Danish NER models",
    )

    run_experiment(exp)
Example #4
def main(path: str, pred: str, truth: str):
    log.configure(os.path.join(path,
                               f"prediction-examples-{pred}-{truth}.log"),
                  print_level=Levels.DEBUG)
    log(f"Looking for examples where model predicted {pred}, but the truth was {truth}"
        )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    res = NER_Results.load(path)
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                        device).data[Split.TEST]
    for preds, truths, text in zip(res.preds, data.annotations, data.texts):
        if any(p != t and cla(p) == pred and cla(t) == truth
               for p, t in zip(preds, truths)):
            t = Table()
            t.add_row(["Text:"] + text)
            t.add_row(["Truth:"] + truths)
            t.add_row(["Pred:"] + preds)
            log(str(t).replace("|", ""), with_info=False)
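
The `cla` helper is not included in this snippet; judging from its use, it presumably reduces a BIO tag to its entity class. A minimal sketch under that assumption (hypothetical implementation):

def cla(tag: str) -> str:
    """Reduce a BIO tag such as "B-PER" or "I-ORG" to its class ("PER", "ORG", ...); "O" is returned unchanged."""
    return tag.split("-")[-1]
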
Example #5
def main(path: str, n: int):
    log.configure(os.path.join(path, "geometry-examples.log"),
                  "daLUKE examples",
                  print_level=Levels.DEBUG)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hardcoded to train
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                        device).data[Split.TRAIN]
    set_seeds()
    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)
    for field, axis in OF_INTEREST.items():
        log.section(field)
        X = getattr(res, field)
        order = X[:, axis].argsort()

        log(f"Examples where dim. {axis} is high")
        _show_examples(res, X, order[::-1][:n], data)
        log(f"Examples where dim. {axis} is low")
        _show_examples(res, X, order[:n], data)
Example #6
def main(path: str, model: str, n_components: int,
         reducer_subsample: Optional[int], tsne_perplexity: float,
         umap_neighbours: int, umap_min_dist: float, only_positives: bool,
         fine_tuned: bool):
    set_seeds()
    log.configure(os.path.join(path, "geometry-analysis.log"),
                  "daLUKE embedding geometry analysis",
                  print_level=Levels.DEBUG)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(
            model, device, torch.device("cpu"), only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")
    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)
    if reducer_subsample is not None:
        log.debug(
            f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE"
        )
        representations = representations[:reducer_subsample]
    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)
    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    log(
        "Saved analysis results to",
        GeometryResults(
            pca_transformed=pca_transformed,
            umap_transformed=umap_transformed,
            tsne_transformed=tsne_transformed,
            labels=labels,
            principal_components=principal_components,
            content=content,
        ).save(path),
    )
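
`pca`, `umap` and `tsne` are project helpers that are not shown here. As an illustration of what the PCA step plausibly returns (a sketch under that assumption, not the project's code), the principal components can be obtained from an SVD of the centered representations:

import numpy as np

def pca(representations: np.ndarray, n_components: int) -> tuple[np.ndarray, np.ndarray]:
    """Project an (n_samples, dim) matrix onto its first n_components principal components."""
    centered = representations - representations.mean(axis=0)
    # Rows of Vt are the principal directions of the centered data
    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
    components = Vt[:n_components]
    return centered @ components.T, components

# e.g. transformed, components = pca(np.random.randn(500, 768), 10)
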
Example #7
def main():
    parser = ArgumentParser(description=\
        "Standalone convenience script used to collect the results from the pretraining of daLUKE "\
        "performed by the pretraining module")
    parser.add_argument("inpath", type=str,
        help= "Path to the output folder of the pretraining containing the model file. "\
            "Entity vocab. and metadata are assumed to be in parent folder of this."\
            "Can also be path to an exact model file, in which case this will be used instead of the newest."
    )
    parser.add_argument("outpath",
                        type=str,
                        help="File path to the compressed model")
    parser.add_argument("--tmpdir",
                        type=str,
                        help="Where to create temporary folder",
                        default="")
    args = parser.parse_args()
    outdir = args.outpath if os.path.isdir(args.outpath) else os.path.dirname(args.outpath)
    log.configure(os.path.join(outdir, "collect.log"), "Collector", print_level=Levels.DEBUG)

    modelpath = args.inpath if os.path.isdir(args.inpath) else os.path.dirname(args.inpath)
    vocabfile = os.path.join(modelpath, "..", VOCAB_FILE)
    metafile = os.path.join(modelpath, "..", METADATA_FILE)
    modelfile = os.path.join(args.inpath, _get_newest_model(args.inpath)) if os.path.isdir(args.inpath) else args.inpath

    os.makedirs(os.path.split(args.outpath)[0], exist_ok=True)

    ins = [vocabfile, metafile, modelfile]
    outs = [VOCAB_FILE, METADATA_FILE, MODEL_OUT]
    # If reduction is used, also collect the token map
    with open(metafile, "r") as f:
        is_reduced = json.load(f).get("reduced-vocab")
    if is_reduced:
        ins.append(os.path.join(modelpath, "..",
                                DatasetBuilder.token_map_file))
        outs.append(TOKEN_MAP_FILE)
    tmpdir = os.path.join(args.tmpdir, "tmpdir")
    log.debug(f"Using:", *ins)

    # Operate directly on disk, as opposed to serialize.save_to_archive, which requires loading the data into memory
    if shutil.which("tar"):
        log.debug(f"Compressing to {args.outpath} using system tar tool...")
        try:
            os.makedirs(tmpdir, exist_ok=True)
            for f, n in zip(ins, outs):
                shutil.copy2(f, os.path.join(tmpdir, n))
            # subprocess.run avoids the deadlock risk of Popen + wait() with full pipe buffers
            # and raises if tar exits with an error
            subprocess.run(
                ["tar", "-czvf", args.outpath, "-C", tmpdir] + outs,
                check=True,
                capture_output=True,
            )
        finally:
            shutil.rmtree(tmpdir)
    else:
        with tarfile.open(args.outpath, "w:gz") as tar:
            for f, n in zip(ins, outs):
                log.debug(
                    f"Compressing {f} as {n} using build-in tar module (may take a while)..."
                )
                tar.add(f, arcname=n)
    log("Succesfully compressed file saved to", args.outpath)
Example #8
def preprocess(
    dump_db_file: str,
    function: str,
    entity_vocab_file: str | None,
    dagw_sections: str | None,
    min_entity_length: int,
    max_entity_length: int,
    max_articles: int | None,
):
    if not entity_vocab_file:
        raise RuntimeError("entity-vocab-file must be given")

    log.configure(
        os.path.join(os.path.split(dump_db_file)[0], "preprocessing.log"),
        "Preprocessing",
        log_commit=True,
    )

    log.section("Collecting data")
    log(
        "Wikidump path: %s" % dump_db_file,
        "Function:      %s" % function,
    )

    log("Loading entity vocab")
    entity_vocab = {
        _insert_xml_special_characters(e.lower())
        for e in load_entity_vocab(entity_vocab_file)
    }

    dagw_files = list()
    if dagw_sections:
        n_words = 0
        log("Finding gigaword data files and counting words")
        dagw_files = list(_get_dagw_files(dagw_sections))
        for dagw_file in tqdm(dagw_files):
            with open(dagw_file) as f:
                n_words += len(f.read().split())
        log("Found %i dagw files containing %i words" %
            (len(dagw_files), n_words))

    # tempdir is not used, as the temporary files can take up more space than temporary
    # directories usually allow
    tmpdir = os.path.join(os.path.split(dump_db_file)[0], "tmpdir")
    os.makedirs(tmpdir, exist_ok=True)
    log("Saving all articles to temporary directory %s" % tmpdir)
    for dagw_file in tqdm(dagw_files):
        shutil.copy2(
            dagw_file,
            os.path.join(tmpdir, fix_filename(os.path.split(dagw_file)[-1])))
    log("Saving Wikipedia files to temporary directory")
    for is_text, text, title in tqdm(_get_lineblocks(dump_db_file),
                                     unit=" blocks"):
        if is_text and not ignore_title(title):
            text_start = text.index(">") + 1
            text_end = -len("</text>\n")
            outname = os.path.join(tmpdir, fix_filename(title)[:100] + ".wiki")
            with open(outname, "w") as f:
                f.write(text[text_start:text_end])

    files = [
        os.path.join(tmpdir, x) for x in os.listdir(tmpdir)[:max_articles]
    ]
    log("Saved a total of %i articles to %s" % (len(files), tmpdir))

    log.section("Beginning preprocessing on %i threads" % os.cpu_count())
    process_map(
        func,
        [(function, f, entity_vocab, min_entity_length, max_entity_length)
         for f in files],
        max_workers=os.cpu_count(),
        chunksize=1024,
    )

    dump_file = os.path.splitext(dump_db_file)[0] + ".%s.bz2" % function
    log.info("Saving preprocessed files to %s" % dump_file)
    with bz2.BZ2File(dump_file, "w") as dump:
        with bz2.BZ2File(dump_db_file) as old_dump:
            line = b""
            while not line.strip().startswith(b"<page>"):
                dump.write(line)
                line = old_dump.readline()
        for i, fname in tqdm(enumerate(files), total=len(files)):
            with open(fname) as f:
                text = f.read()
            s = """
            <page>
                <title>{title}</title>
                <id>{id}</id>
                <revision>
                    <text bytes="{bytes}" xml:space="preserve">{text}</text>
                </revision>
            </page>""".format(
                title=fname,
                id=i + 1,
                bytes=len(text),
                text=text,
            )
            if i == 0:
                s = s[1:]
            dump.write(s.encode("utf-8"))
        dump.write(b"\n</mediawiki>")

    log.info("Removing temporary files")
    shutil.rmtree(tmpdir)
    log.info("Done preprocessing data")
Example #9
                next(d for d in ALL_DATASETS if d.name == name)
            )
        except StopIteration as si:
            raise ValueError(f"Dataset with given name {name} not found, see --help for options") from si
    for d in datasets:
        log(f"Setting up dataset \"{d.name}\" ...")
        kwargs = dict()
        if isinstance(d, Wikiann):
            kwargs["data_path"] = wikiann_path
        elif isinstance(d, Plank):
            kwargs["data_path"] = plank_path
        d.setup(**kwargs, split=split)
    return datasets

if __name__ == '__main__':
    """ Shows some Data stats """
    localdata = "../../local_data"

    localdata = os.path.join(sys.path[0], localdata)
    wikiann_p, plank_p = os.path.join(localdata, "wikiann"), os.path.join(localdata, "plank")
    log.configure(os.path.join(localdata, "data.log"), "data")

    for split in ("train", "dev", "test"):
        ds = setup_datasets(("DaNE", "Plank", "WikiANN"), wikiann_path=wikiann_p, plank_path=plank_p, split=split)
        for d in ds:
            log(f"{d.name} {split} sentences:", len(d.get_data()[0]))
    # now for better test statistics
    for d in ds:
        for ann in ("ORG", "PER", "LOC", "MISC"):
            log(f"#{ann} in {d.name}", sum(len([w for w in s if ann in w]) for s in d.get_data()[1]))
Example #10
    log("Loading model ...")
    model = load_model(state_dict,
                       dataset,
                       metadata,
                       device,
                       entity_embedding_size=ent_embed_size,
                       bert_attention=args["bert_attention"],
                       dropout=args["dropout"])

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate",
        np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))


if __name__ == '__main__':
    with log.log_errors:
        parser = Parser(ARGUMENTS, name="crossval-eval", multiple_jobs=False)
        exp = parser.parse()
        parser.document_settings()
        log.configure(
            os.path.join(parser.location, "daluke-crossval.log"),
            "Cross-validate NER results",
            print_level=Levels.INFO if exp["quieter"] else Levels.DEBUG,
        )
        run_experiment(exp)
Example #11
    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(dataloader, Split.TEST)
    type_distribution(dataset.data[Split.TEST].annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, dataloader, dataset, device, Split.TEST)

    results.save(args["location"])
    type_distribution(results.preds)


if __name__ == '__main__':
    with log.log_errors:
        parser = Parser(ARGUMENTS, name="daluke-ner-eval", multiple_jobs=False)
        exp = parser.parse()
        parser.document_settings()
        log.configure(
            os.path.join(parser.location, "daluke-eval-ner.log"),
            "Finetuning evaluation of daLUKE for Danish NER",
            print_level=Levels.INFO if exp["quieter"] else Levels.DEBUG,
        )
        run_experiment(exp)
Example #12
def main():
    log.configure(print_level=Levels.DEBUG)
    cli()
Example #13
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True dev. set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
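
`TrainNER`'s `warmup_prop` argument suggests a linear learning-rate warmup over a fraction of the training steps. A toy sketch of how such a schedule is commonly set up with Hugging Face transformers (assumed here, not taken from the project's trainer; all numbers are made up):

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
epochs, steps_per_epoch, warmup_prop = 5, 100, 0.06
num_training_steps = epochs * steps_per_epoch

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_prop * num_training_steps),  # warm up over the first 6 % of steps
    num_training_steps=num_training_steps,
)
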
Example #14
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)

if __name__ == '__main__':
    with log.log_errors:
        parser = Parser(ARGUMENTS, name="hyper-optim.log", multiple_jobs=False)
        exp = parser.parse()
        parser.document_settings()
        log.configure(
            os.path.join(parser.location, "daluke-hyper-optim.log"), "Search for hyper parameters for daLUKE",
            print_level=Levels.INFO if exp["quieter"] else Levels.DEBUG,
        )
        run_experiment(exp)
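
The `SAMPLERS` mapping and its interface are defined elsewhere in the project. As an illustration only (hypothetical class and method names), a random-search sampler over the loaded parameter lists could look like this:

import random

class RandomSampler:
    """Draw one value per hyperparameter from the candidate lists loaded from the params file."""
    def __init__(self, param_lists: dict[str, list]):
        self.param_lists = param_lists

    def sample(self) -> dict:
        return {name: random.choice(values) for name, values in self.param_lists.items()}

# e.g. RandomSampler({"lr": [1e-5, 3e-5, 1e-4], "dropout": [0.1, 0.2]}).sample()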