Example #1
def optimize(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any], sampler: Sampler):
    results, tried_params = list(), list()
    best = None
    i = 0
    while (sampled_params := sampler.sample()) is not None:
        log.section(f"Sampling #{i}: chose", f(sampled_params))
        result = objective_function(deepcopy(model), dataset, {**args, **sampled_params})
        score = result.statistics["micro avg"]["f1-score"]
        if best is None or score > results[best].statistics["micro avg"]["f1-score"]:
            log(f"Found new best at F1 of {score}")
            best = i
        result.save(out := os.path.join(args['location'], f"res-optim{i}"))
        log.debug(f"Saved results to {out}")
        results.append(result)
        tried_params.append(sampled_params)
        i += 1
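The loop above assumes only that sampler.sample() returns a dict of hyperparameters on each call and None once the search space is exhausted. A minimal sketch of a sampler satisfying that contract (hypothetical, not from the repository) could look like:

import random
from typing import Any, Optional

class RandomSearchSampler:
    """ Hypothetical sampler: draws n random configurations, then returns None to stop the loop """
    def __init__(self, space: dict[str, list], n: int):
        self.space, self.n = space, n

    def sample(self) -> Optional[dict[str, Any]]:
        if self.n == 0:
            return None
        self.n -= 1
        return {name: random.choice(choices) for name, choices in self.space.items()}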
Example #2
    def run(self) -> TrainResults:
        res = TrainResults(
            epoch                        = 0,
            losses                       = list(),
            best_epoch                   = None,
            running_train_statistics     = list(),
            running_dev_evaluations      = list(),
            dev_pred_distributions       = list(),
            dev_true_type_distribution   = dict(),
            train_pred_distributions     = list(),
            train_true_type_distribution = dict()
        )
        for i in range(self.epochs):
            res.epoch = i
            self.model.train()
            for j, batch in enumerate(self.dataloader):
                scores = self.model(batch)
                loss = self.criterion(scores.view(-1, self.model.output_shape), batch.entities.labels.view(-1))
                loss.backward()

                self.optimizer.step()
                self.scheduler.step()
                self.model.zero_grad()

                res.losses.append(loss.item())
                log.debug(f"Epoch {i} / {self.epochs-1}, batch: {j} / {len(self.dataloader)-1}. LR: {self.scheduler.get_last_lr()[0]:.2e} Loss: {loss.item():.5f}.")

            # Perform running evaluation
            if self.dev_dataloader is not None:
                log("Evaluating on development set ...")
                dev_results = evaluate_ner(self.model, self.dev_dataloader, self.dataset, self.device, Split.DEV, also_no_misc=False)
                res.running_dev_evaluations.append(dev_results)
                res.dev_pred_distributions.append(type_distribution(dev_results.preds))

                log("Evaluating on training set ...")
                train_results = evaluate_ner(self.model, self.dataloader, self.dataset, self.device, Split.TRAIN, also_no_misc=False)
                res.running_train_statistics.append(train_results.statistics)
                res.train_pred_distributions.append(type_distribution(train_results.preds))
                dev_f1 = dev_results.statistics["micro avg"]["f1-score"]
                if res.best_epoch is None or dev_f1 > res.running_dev_evaluations[res.best_epoch].statistics["micro avg"]["f1-score"]:
                    log(f"Found new best model at epoch {i}")
                    self.best_model = deepcopy(self.model)
                    res.best_epoch = i
        return res
Example #3
def ner(filepath: str, text: str):
    if not filepath and not text:
        raise ValueError("Either filepath or text must be given")
    elif filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    elif filepath:
        with open(filepath) as f:
            text = f.read()

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_ner = AutoNERDaLUKE()
        preds = predict_ner(text, daluke_ner)

    t = Table()
    t.add_header(["Word", "IOB NER Prediction"])
    for word, pred in zip(text.split(), preds):
        t.add_row([word, pred])
    log(t)
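A hypothetical invocation of the function above (the sentence is illustrative, and the tags shown are not actual model output):

ner(filepath="", text="Anders bor i København")
# Prints a two-column table pairing each whitespace-split word with its
# predicted IOB tag, e.g. B-PER for "Anders" and B-LOC for "København"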
Example #4
def run_experiment(args: dict[str, Any]):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(dataloader, Split.TEST)
    type_distribution(dataset.data[Split.TEST].annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, dataloader, dataset, device, Split.TEST)

    results.save(args["location"])
    type_distribution(results.preds)
Example #5
def objective_function(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any]) -> NER_Results:
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, EVAL_BATCH)
    device = next(model.parameters()).device
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["batch_size"]
    )
    res = training.run()

    log.debug("Evaluating")
    best_res = res.running_dev_evaluations[res.best_epoch]
    log(f"Best model achieved {best_res.statistics['micro avg']['f1-score']} in mic-F1")
    return best_res
Example #6
def main(path: str, model: str, n_components: int,
         reducer_subsample: Optional[int], tsne_perplexity: float,
         umap_neighbours: int, umap_min_dist: float, only_positives: bool,
         fine_tuned: bool):
    set_seeds()
    log.configure(os.path.join(path, "geometry-analysis.log"),
                  "daLUKE embedding geometry analysis",
                  print_level=Levels.DEBUG)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(
            model, device, torch.device("cpu"), only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")
    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)
    if reducer_subsample is not None:
        log.debug(f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE")
        representations = representations[:reducer_subsample]
    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)
    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    log(
        "Saved analysis results to",
        GeometryResults(
            pca_transformed=pca_transformed,
            umap_transformed=umap_transformed,
            tsne_transformed=tsne_transformed,
            labels=labels,
            principal_components=principal_components,
            content=content,
        ).save(path),
    )
Example #7
def masked(filepath: str, text: str, entity_spans: list[str]):
    """ Entities are given as 'start1,end1;start2,end2 ...'
    Ends are optional. If not given, they will be set to start+1
    Spans are 1-indexed with inclusive ends """
    if not filepath and not text:
        raise ValueError("Either filepath or text must be given")
    elif filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    elif filepath:
        with open(filepath) as f:
            text = f.read()

    entity_spans = [
        (int(x.split(",")[0]) - 1, int(x.split(",")[1])) if "," in x else (int(x) - 1, int(x))
        for x in entity_spans.split(";") if x
    ]

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_mlm = AutoMLMDaLUKE()
        text, top_preds = predict_mlm(text, entity_spans, daluke_mlm)

    log("The top 5 predictions with likelihoods for each [MASK] were",
        top_preds)
    log("DaLUKE's best predictions were", text)
Example #8
def pca(A: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
    """
    A is (# data points, # dimensions).
    k is number of eigenvalues used for projection
    """
    log.debug("Calculating covariance matrix")
    A_c = A - A.mean(0)
    # As # data points >>> # dimensions (~1M vs. 2k), we do covariance of features
    covar = (A_c.T @ A_c) / (A_c.shape[0] - 1)  # Normalize by (n - 1), where n is the number of data points
    log.debug("Calculating eigenvalues ...")
    lambdas, Q = np.linalg.eigh(covar)
    # Want it in eigenvalue-descending order
    lambdas, Q = lambdas[::-1], np.flip(Q, axis=1)
    log.debug("Transforming to PC space")
    P = Q[:, :k]
    Z = A_c @ P
    return Z, lambdas
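A small sanity check of pca (illustrative only): the projection should have k columns, and after the flip the eigenvalues should come out in descending order.

import numpy as np

A = np.random.randn(1000, 50)
Z, lambdas = pca(A, k=10)
assert Z.shape == (1000, 10)
assert all(lambdas[i] >= lambdas[i + 1] for i in range(len(lambdas) - 1))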
Example #9
def main():
    parser = ArgumentParser(
        description="Standalone convenience script used to collect the results from the pretraining of daLUKE "
                    "performed by the pretraining module")
    parser.add_argument("inpath", type=str,
        help="Path to the output folder of the pretraining containing the model file. "
             "Entity vocab. and metadata are assumed to be in the parent folder of this. "
             "Can also be a path to an exact model file, in which case this will be used instead of the newest."
    )
    parser.add_argument("outpath",
                        type=str,
                        help="File path to the compressed model")
    parser.add_argument("--tmpdir",
                        type=str,
                        help="Where to create temporary folder",
                        default="")
    args = parser.parse_args()
    log.configure(
        os.path.join(args.outpath if os.path.isdir(args.outpath) else os.path.dirname(args.outpath), "collect.log"),
        "Collector",
        print_level=Levels.DEBUG,
    )

    modelpath = args.inpath if os.path.isdir(args.inpath) else os.path.dirname(args.inpath)
    vocabfile = os.path.join(modelpath, "..", VOCAB_FILE)
    metafile = os.path.join(modelpath, "..", METADATA_FILE)
    modelfile = os.path.join(args.inpath, _get_newest_model(args.inpath)) if os.path.isdir(args.inpath) else args.inpath

    os.makedirs(os.path.split(args.outpath)[0], exist_ok=True)

    ins = [vocabfile, metafile, modelfile]
    outs = [VOCAB_FILE, METADATA_FILE, MODEL_OUT]
    # If reduction is used, also collect the token map
    with open(metafile, "r") as f:
        is_reduced = json.load(f).get("reduced-vocab")
    if is_reduced:
        ins.append(os.path.join(modelpath, "..",
                                DatasetBuilder.token_map_file))
        outs.append(TOKEN_MAP_FILE)
    tmpdir = os.path.join(args.tmpdir, "tmpdir")
    log.debug(f"Using:", *ins)

    # Operate directly on disk, as opposed to serialize.save_to_archive, which requires loading the data into memory.
    if shutil.which("tar"):
        log.debug(f"Compressing to {args.outpath} using system tar tool...")
        try:
            os.makedirs(tmpdir, exist_ok=True)
            for f, n in zip(ins, outs):
                shutil.copy2(f, os.path.join(tmpdir, n))
            p = subprocess.Popen(
                ["tar", "-czvf", args.outpath, "-C", tmpdir] + outs,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            p.wait()
        finally:
            shutil.rmtree(tmpdir)
    else:
        with tarfile.open(args.outpath, "w:gz") as tar:
            for f, n in zip(ins, outs):
                log.debug(f"Compressing {f} as {n} using the built-in tarfile module (may take a while) ...")
                tar.add(f, arcname=n)
    log("Succesfully compressed file saved to", args.outpath)
Example #10
    def __init__(
        self,
        dump_db_file: str,         # Location of file built by build-dump-db
        tokenizer_name: str,       # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
        entity_vocab_file: str,    # Built by build-entity-vocab
        out_dir: str,              # Where to put finished dataset. All contents will be removed before saving dataset
        validation_prob: float,    # Chance of each finished document to be marked as part of validation set
        max_entities: int,         # Only up to this many entities are included in each sequence
        max_entity_span: int,      # Maximum number of tokens an entity can span before sequence is discarded
        min_sentence_length: int,  # Minimum number of tokens a sentence must span to be included
        max_articles: int | None,
        max_vocab_size: int,
    ):
        if not wikipedia2vec_available:
            raise ModuleNotFoundError(
                "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
            )
        log("Reading dump database at %s" % dump_db_file)
        self.dump_db = DumpDB(dump_db_file)
        log("Building tokeninizer: %s" % tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        log("Building sentence tokenizer: %s" % self.tokenizer_language)
        self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)
        log("Loading entity vocab at %s" % entity_vocab_file)
        self.entity_vocab = load_entity_vocab(entity_vocab_file)
        # Make sure IDs on non-ignored entities are contiguous
        num = 0
        for entity_info in self.entity_vocab.values():
            entity_info["id"] = num
            num += 1
        log("Entity vocab has size %i" % num)

        self.out_dir = out_dir
        self.data_file = os.path.join(self.out_dir, self.data_file)
        self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
        self.max_seq_length = self.tokenizer.model_max_length
        self.validation_prob = validation_prob
        self.max_entities = max_entities
        self.max_entity_span = max_entity_span
        self.min_sentence_length = min_sentence_length
        # Get maximum number of tokens in a sequence excluding start and end tokens
        self.max_num_tokens = self.max_seq_length - 2
        self.max_articles = max_articles
        self.vocab_size = self.tokenizer.vocab_size if max_vocab_size == -1 else min(max_vocab_size, self.tokenizer.vocab_size)

        # Filter titles so only real articles are included
        self.target_titles = list(self.dump_db.titles())

        # Remove old datafile if it exists
        if os.path.isfile(self.data_file):
            log.debug("Removing old datafile '%s'" % self.data_file)
            os.remove(self.data_file)

        self.examples = list()
Example #11
            log.section("Reducing token number")
            with TT.profile("Reduce token vocab"):
                token_map, metadata["vocab-size"] = self._reduce_tokens()
            with TT.profile("Rewrite dataset with new tokens"):
                self._update_tokens(token_map)

        with open(path := os.path.join(self.out_dir, self.metadata_file), "w") as f:
            log.section("Saving metadata to %s" % path)
            ujson.dump(metadata, f, indent=4)
        with open(self.data_file, "w") as f, TT.profile("Save data"):
            log("Saving data to '%s'" % self.data_file)
            for example in self.examples:
                f.write(ujson.dumps(example) + "\n")

        log.debug("Time distribution", TT)

    def _get_sentence_features(self, page_title: str) -> list[tuple[list[str], 3]]:

        sentences = list()

        # Process by paragraph
        for paragraph in self.dump_db.get_paragraphs(page_title):
            paragraph_links: list[tuple[str, int, int]] = list()
            paragraph_text = paragraph.text

            # Get paragraph links
            # These are represented by three-tuples consisting of their title, start, and end string positions
            TT.profile("Get links")
            for link in paragraph.wiki_links:
Example #12
    def run(self):
        log(f"Evaluating {self.model.name} on {self.dataset.name} ...")
        preds, truths = self._get_results()
        log.debug(f"Calculating statistics for {len(preds)} sentences")
        return self._calculate_stats(preds, truths)
Example #13
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True dev. set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
Example #14
def evaluate_ner(model: nn.Module,
                 dataloader: torch.utils.data.DataLoader,
                 dataset: NERDataset,
                 device: torch.device,
                 split: Split,
                 also_no_misc=True) -> NER_Results:
    model.eval()
    annotations, texts = dataset.data[split].annotations, dataset.data[split].texts
    span_probs: list[dict[tuple[int, int], np.ndarray]] = [dict() for _ in range(len(texts))]
    log.debug(f"Forward passing {len(dataloader)} batches")

    TT.tick()
    for batch in tqdm(dataloader):
        scores = model(batch)
        probs = F.softmax(scores, dim=2)
        # Save the probability distribution for every possible span in the example
        for idx, (i, spans) in zip(batch.text_nums, enumerate(batch.entities.fullword_spans)):
            span_probs[idx].update({
                span: probs[i, j].detach().cpu().numpy()
                for j, span in enumerate(spans) if span
            })
    preds = [span_probs_to_preds(p, len(t), dataset) for p, t in zip(span_probs, texts)]
    log(f"Forward pass completed: Wall time: {TT.tock():.4f} s.")

    stats = _stats_to_py_nums(classification_report(annotations, preds, output_dict=True, zero_division=0))
    log(classification_report(annotations, preds, zero_division=0, digits=4))
    confmat = confusion_matrix(annotations, preds, dataset.all_labels)
    confmat_nomisc = dict()
    log("Prediction distribution", _format_confmat(confmat))

    if also_no_misc:
        # FIXME: Do this manually instead of rerunning everything
        annotations_nomisc = _rm_misc(annotations, dataset.null_label)
        preds_nomisc = _rm_misc(preds, dataset.null_label)
        stats_nomisc = _stats_to_py_nums(classification_report(annotations_nomisc, preds_nomisc, output_dict=True, zero_division=0))
        log(classification_report(annotations_nomisc, preds_nomisc, digits=4))
        confmat_nomisc = confusion_matrix(annotations_nomisc, preds_nomisc, dataset.all_labels)
        log("Prediction distribution", _format_confmat(confmat_nomisc))

    return NER_Results(
        preds=preds,
        span_probs=span_probs,
        statistics=stats,
        statistics_nomisc=stats_nomisc if also_no_misc else {},
        confusion_matrix=confmat,
        confusion_matrix_nomisc=confmat_nomisc,
    )
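_rm_misc is used above but not shown. A plausible sketch (an assumption about the helper, not the repository's actual implementation) would map all MISC tags to the null label before statistics are recomputed:

def _rm_misc(seqs: list[list[str]], null_label: str) -> list[list[str]]:
    # Hypothetical: replace B-MISC/I-MISC tags with the null label (typically "O")
    return [[null_label if "MISC" in tag else tag for tag in seq] for seq in seqs]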