def optimize(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any], sampler: Sampler):
    """
    Runs one fine-tuning trial for every hyperparameter set drawn from the sampler
    Each trial trains a deep copy of the given model with args overridden by the sampled parameters
    The results of trial number i are saved to 'res-optim<i>' under args['location']
    Sampling stops as soon as the sampler returns None
    """
    all_results, all_params = [], []
    best_trial = None
    trial = 0
    while True:
        sampled_params = sampler.sample()
        if sampled_params is None:
            break
        log.section(f"Sampling #{trial}: chose", f(sampled_params))

        # Train on a copy so every trial starts from the same initial weights
        fresh_model = deepcopy(model)
        trial_args = {**args, **sampled_params}
        result = objective_function(fresh_model, dataset, trial_args)

        score = result.statistics["micro avg"]["f1-score"]
        # The first trial is always the best so far; later ones must beat the current best F1
        if best_trial is None or score > all_results[best_trial].statistics["micro avg"]["f1-score"]:
            log(f"Found new best at F1 of {score}")
            best_trial = trial

        out = os.path.join(args['location'], f"res-optim{trial}")
        result.save(out)
        log.debug(f"Saved results to {out}")

        all_results.append(result)
        all_params.append(sampled_params)
        trial += 1
def run(self) -> TrainResults:
    """
    Trains self.model for self.epochs epochs over self.dataloader
    If a development dataloader is set, the model is evaluated on both the development and
    the training data after every epoch, and a deep copy of the model with the highest
    development micro avg. F1 so far is kept in self.best_model
    Returns the collected losses and running evaluations
    """
    res = TrainResults(
        epoch                        = 0,
        losses                       = [],
        best_epoch                   = None,
        running_train_statistics     = [],
        running_dev_evaluations      = [],
        dev_pred_distributions       = [],
        dev_true_type_distribution   = {},
        train_pred_distributions     = [],
        train_true_type_distribution = {},
    )
    for epoch in range(self.epochs):
        res.epoch = epoch
        self.model.train()
        for batch_num, batch in enumerate(self.dataloader):
            scores = self.model(batch)
            # Flatten scores and labels before they are given to the criterion
            flat_scores = scores.view(-1, self.model.output_shape)
            flat_labels = batch.entities.labels.view(-1)
            loss = self.criterion(flat_scores, flat_labels)
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()

            batch_loss = loss.item()
            res.losses.append(batch_loss)
            log.debug(f"Epoch {epoch} / {self.epochs-1}, batch: {batch_num} / {len(self.dataloader)-1}. LR: {self.scheduler.get_last_lr()[0]:.2e} Loss: {batch_loss:.5f}.")

        # Running evaluation is only performed when a development set is available
        if self.dev_dataloader is None:
            continue

        log("Evaluating on development set ...")
        dev_results = evaluate_ner(self.model, self.dev_dataloader, self.dataset, self.device, Split.DEV, also_no_misc=False)
        res.running_dev_evaluations.append(dev_results)
        res.dev_pred_distributions.append(type_distribution(dev_results.preds))

        log("Evaluating on training set ...")
        train_results = evaluate_ner(self.model, self.dataloader, self.dataset, self.device, Split.TRAIN, also_no_misc=False)
        res.running_train_statistics.append(train_results.statistics)
        res.train_pred_distributions.append(type_distribution(train_results.preds))

        # The first evaluated epoch is always the best; later epochs must strictly improve dev. F1
        if res.best_epoch is None:
            is_new_best = True
        else:
            previous_best = res.running_dev_evaluations[res.best_epoch]
            is_new_best = dev_results.statistics["micro avg"]["f1-score"] > previous_best.statistics["micro avg"]["f1-score"]
        if is_new_best:
            log(f"Found new best model at epoch {epoch}")
            self.best_model = deepcopy(self.model)
            res.best_epoch = epoch
    return res
def ner(filepath: str, text: str):
    """
    Predicts named entities in a text and logs a table of words and their IOB predictions
    Exactly one of filepath and text must be given
    If filepath is given, the text is read from that file
    """
    if filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    if not (filepath or text):
        raise ValueError("Either filepath or text must be given")
    if filepath:
        with open(filepath) as f:
            text = f.read()

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_ner = AutoNERDaLUKE()
        preds = predict_ner(text, daluke_ner)

    # One row per whitespace separated word, paired with its prediction
    table = Table()
    table.add_header(["Word", "IOB NER Prediction"])
    words = text.split()
    for token, label in zip(words, preds):
        table.add_row([token, label])
    log(table)
def run_experiment(args: dict[str, Any]):
    """
    Evaluates a fine-tuned NER model on the test split
    The model archive is read from args["model"] and the results are saved to args["location"]
    """
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    # The first element of the archive is not needed for evaluation
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    test_loader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(test_loader, Split.TEST)
    test_annotations = dataset.data[Split.TEST].annotations
    type_distribution(test_annotations)

    log("Starting evaluation of daLUKE for NER")
    test_results = evaluate_ner(model, test_loader, dataset, device, Split.TEST)
    test_results.save(args["location"])
    type_distribution(test_results.preds)
def objective_function(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any]) -> NER_Results:
    """
    Fine-tunes the given model with the hyperparameters in args and returns the
    development set evaluation of the epoch with the best development performance

    args must contain "batch_size", "epochs", "lr", "warmup_prop", "weight_decay" and "loss_weight"
    The model is trained in place on the device that its parameters are already on
    """
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, EVAL_BATCH)
    # Train on whatever device the model was already moved to
    device = next(model.parameters()).device
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        # The loss weight setting must be forwarded here, not the batch size
        loss_weight    = args["loss_weight"],
    )
    res = training.run()

    log.debug("Evaluating")
    # A dev. dataloader is always given, so a best epoch has been chosen if any epoch was run
    best_res = res.running_dev_evaluations[res.best_epoch]
    log(f"Best model achieved {best_res.statistics['micro avg']['f1-score']} in mic-F1")
    return best_res
def main(path: str, model: str, n_components: int, reducer_subsample: Optional[int], tsne_perplexity: float, umap_neighbours: int, umap_min_dist: float, only_positives: bool, fine_tuned: bool):
    """
    Collects representations from the given model, reduces them with PCA, UMAP and t-SNE
    and saves the results, together with labels and content, to path
    If reducer_subsample is given, UMAP and t-SNE only see the first reducer_subsample representations
    PCA is always performed on all representations
    """
    set_seeds()
    log.configure(
        os.path.join(path, "geometry-analysis.log"),
        "daLUKE embedding geometry analysis",
        print_level=Levels.DEBUG,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cpu = torch.device("cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(model, device, cpu, only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")

    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)

    if reducer_subsample is not None:
        log.debug(f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE")
        representations = representations[:reducer_subsample]

    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)

    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    results = GeometryResults(
        pca_transformed      = pca_transformed,
        umap_transformed     = umap_transformed,
        tsne_transformed     = tsne_transformed,
        labels               = labels,
        principal_components = principal_components,
        content              = content,
    )
    # The return value of save is logged as the place the results ended up
    saved_to = results.save(path)
    log("Saved analysis results to", saved_to)
def _parse_entity_spans(entity_spans: str) -> list[tuple[int, int]]:
    """ Parses 'start1,end1;start2 ...' with 1-indexed, inclusive ends into 0-indexed (start, exclusive end) tuples """
    spans = list()
    for span in (entity_spans or "").split(";"):
        if not span.strip():
            # Allows empty entries such as a trailing ';'
            continue
        start, *rest = span.split(",")
        start = int(start)
        # A missing or empty end means that the entity covers only the start position
        end = int(rest[0]) if rest and rest[0].strip() else start
        # Subtracting one from the start makes it 0-indexed; the inclusive end is then already an exclusive end
        spans.append((start - 1, end))
    return spans

def masked(filepath: str, text: str, entity_spans: str):
    """
    Entities are given as 'start1,end1;start2,end2 ...'
    Ends are optional. If not given, they will be set to start+1
    Spans are 1-indexed with inclusive ends

    Exactly one of filepath and text must be given, otherwise a ValueError is raised
    A ValueError is also raised if a start or end is not an integer
    """
    if not filepath and not text:
        raise ValueError("Either filepath or text must be given")
    elif filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    elif filepath:
        with open(filepath) as f:
            text = f.read()

    entity_spans = _parse_entity_spans(entity_spans)

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_mlm = AutoMLMDaLUKE()
        text, top_preds = predict_mlm(text, entity_spans, daluke_mlm)
    log("The top 5 predictions with likelihoods for each [MASK] were", top_preds)
    log("DaLUKE's best predictions were", text)
def pca(A: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
    """
    A is (# data points, # dimensions).
    k is number of eigenvalues used for projection

    Returns the centered data projected onto the k leading principal axes, (# data points, k),
    and all eigenvalues of the feature covariance matrix in descending order, (# dimensions,)
    """
    log.debug("Calculating covariance matrix")
    A_c = A - A.mean(0)
    # As # data points >>> # dimensions (~1M vs. 2k), we do covariance of features
    # The unbiased estimate is normalized by the number of data points (rows) minus one
    covar = (A_c.T @ A_c) / (A_c.shape[0] - 1)

    log.debug("Calculating eigenvalues ...")
    # eigh is used as the covariance matrix is symmetric. It returns eigenvalues in ascending order
    lambdas, Q = np.linalg.eigh(covar)
    # Want it in eigenvalue-descending order
    lambdas, Q = lambdas[::-1], np.flip(Q, axis=1)

    log.debug("Transforming to PC space")
    # Columns of Q are eigenvectors, so the first k columns span the k leading principal axes
    P = Q[:, :k]
    Z = A_c @ P
    return Z, lambdas
def main():
    """
    Collects the files produced by pretraining into a single gzip compressed tar archive
    The archive contains the entity vocab., the metadata, the model file and,
    if the metadata has a truthy "reduced-vocab" field, the token map

    The system tar tool is used if it is available. Otherwise, the built-in tarfile module is used
    Raises a RuntimeError if the system tar tool exits with a non-zero exit code
    """
    parser = ArgumentParser(description=\
        "Standalone convenience script used to collect the results from the pretraining of daLUKE "\
        "performed by the pretraining module")
    parser.add_argument("inpath", type=str, help=
        "Path to the output folder of the pretraining containing the model file. "\
        "Entity vocab. and metadata are assumed to be in parent folder of this. "\
        "Can also be path to an exact model file, in which case this will be used instead of the newest."
    )
    parser.add_argument("outpath", type=str, help="File path to the compressed model")
    parser.add_argument("--tmpdir", type=str, help="Where to create temporary folder", default="")
    args = parser.parse_args()

    # The log is placed in outpath if this is an existing folder, and otherwise next to the output file
    log_dir = args.outpath if os.path.isdir(args.outpath) else os.path.dirname(args.outpath)
    log.configure(os.path.join(log_dir, "collect.log"), "Collector", print_level=Levels.DEBUG)

    inpath_is_dir = os.path.isdir(args.inpath)
    modelpath = args.inpath if inpath_is_dir else os.path.dirname(args.inpath)
    vocabfile = os.path.join(modelpath, "..", VOCAB_FILE)
    metafile = os.path.join(modelpath, "..", METADATA_FILE)
    modelfile = os.path.join(args.inpath, _get_newest_model(args.inpath)) if inpath_is_dir else args.inpath

    # os.makedirs fails on an empty path, which is what a bare file name in outpath gives
    out_dir = os.path.split(args.outpath)[0]
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    ins, outs = [vocabfile, metafile, modelfile], [VOCAB_FILE, METADATA_FILE, MODEL_OUT]

    # If reduction is used, also collect the token map
    with open(metafile, "r") as f:
        is_reduced = json.load(f).get("reduced-vocab")
    if is_reduced:
        ins.append(os.path.join(modelpath, "..", DatasetBuilder.token_map_file))
        outs.append(TOKEN_MAP_FILE)

    tmpdir = os.path.join(args.tmpdir, "tmpdir")
    log.debug("Using:", *ins)
    # Operate directly on disk as opposed to serialize.save_to_archive which requires us to load the data into mem.
    if shutil.which("tar"):
        log.debug(f"Compressing to {args.outpath} using system tar tool...")
        try:
            # The files are copied to a temporary folder so they get the right names in the archive
            os.makedirs(tmpdir, exist_ok=True)
            for f, n in zip(ins, outs):
                shutil.copy2(f, os.path.join(tmpdir, n))
            # run reads the pipes while waiting, so tar cannot block on a full pipe
            proc = subprocess.run(
                ["tar", "-czvf", args.outpath, "-C", tmpdir] + outs,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if proc.returncode != 0:
                # Do not report success if the archive was not created correctly
                raise RuntimeError("tar failed with exit code %i: %s" % (
                    proc.returncode, proc.stderr.decode(errors="replace").strip(),
                ))
        finally:
            shutil.rmtree(tmpdir)
    else:
        with tarfile.open(args.outpath, "w:gz") as tar:
            for f, n in zip(ins, outs):
                log.debug(f"Compressing {f} as {n} using build-in tar module (may take a while)...")
                tar.add(f, arcname=n)
    log("Succesfully compressed file saved to", args.outpath)
def __init__(
    self,
    dump_db_file:        str,   # Location of file build by build-dump-db
    tokenizer_name:      str,   # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
    entity_vocab_file:   str,   # Build by build-entity-vocab
    out_dir:             str,   # Where to put finished dataset. All contents will be removed before saving dataset
    validation_prob:     float, # Chance of each finished document to be marked as part of validation set
    max_entities:        int,   # Only up to this many entities are included in each sequence
    max_entity_span:     int,   # Maximum number tokens an entity can span before sequence is discarded
    min_sentence_length: int,   # Minimum number of tokens a sentence must span to be included
    max_articles:        int | None, # Stored as is. Presumably a cap on the number of processed articles - TODO confirm
    max_vocab_size:      int,   # Upper limit on token vocab. size. -1 to use the full vocab. of the tokenizer
):
    """
    Loads the dump database, the tokenizers and the entity vocab., and stores the build settings
    Entity IDs are reassigned so they are contiguous, and an existing data file in out_dir is removed
    Raises ModuleNotFoundError if the optional requirement wikipedia2vec is not available
    """
    if not wikipedia2vec_available:
        raise ModuleNotFoundError("Pretrain data generation requires installation of the optional requirement `wikipedia2vec`")

    log("Reading dump database at %s" % dump_db_file)
    self.dump_db = DumpDB(dump_db_file)
    log("Building tokeninizer: %s" % tokenizer_name)
    self.tokenizer_name = tokenizer_name
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    log("Building sentence tokenizer: %s" % self.tokenizer_language)
    self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)
    log("Loading entity vocab at %s" % entity_vocab_file)
    self.entity_vocab = load_entity_vocab(entity_vocab_file)
    # Make sure IDs on non-ignored entities are contiguous
    num = 0
    for entity_info in self.entity_vocab.values():
        entity_info["id"] = num
        num += 1
    log("Entity vocab has size %i" % num)

    self.out_dir = out_dir
    # data_file and token_map_file are file names before this, and full paths in out_dir after
    self.data_file = os.path.join(self.out_dir, self.data_file)
    self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
    self.max_seq_length = self.tokenizer.model_max_length
    self.validation_prob = validation_prob
    self.max_entities = max_entities
    self.max_entity_span = max_entity_span
    self.min_sentence_length = min_sentence_length
    # Get maximum number of tokens in a sequence excluding start and end tokens
    self.max_num_tokens = self.max_seq_length - 2
    self.max_articles = max_articles
    # The vocab. can never be larger than that of the tokenizer, so the limit is capped at this
    self.vocab_size = self.tokenizer.vocab_size if max_vocab_size == -1 else min(max_vocab_size, self.tokenizer.vocab_size)

    # All titles returned by the dump database
    self.target_titles = list(self.dump_db.titles())

    # Remove old datafile if it exists
    if os.path.isfile(self.data_file):
        log.debug("Removing old datafile '%s'" % self.data_file)
        os.remove(self.data_file)

    self.examples = list()
log.section("Reducing token number") with TT.profile("Reduce token vocab"): token_map, metadata["vocab-size"] = self._reduce_tokens() with TT.profile("Rewrite dataset with new tokens"): self._update_tokens(token_map) with open(path := os.path.join(self.out_dir, self.metadata_file), "w") as f: log.section("Saving metadata to %s" % path) ujson.dump(metadata, f, indent=4) with open(self.data_file, "w") as f, TT.profile("Save data"): log("Saving data to '%s'" % self.data_file) for example in self.examples: f.write(ujson.dumps(example) + "\n") log.debug("Time distribution", TT) def _get_sentence_features(self, page_title: str) -> list[tuple[list[str], 3]]: sentences = list() # Process by paragraph for paragraph in self.dump_db.get_paragraphs(page_title): paragraph_links: list[tuple[str, int, int]] = list() paragraph_text = paragraph.text # Get paragraph links # These are representated by three-tuples consisting of their title, start and end string positions TT.profile("Get links") for link in paragraph.wiki_links:
def run(self):
    """
    Gets predictions and true labels from self._get_results
    and returns the statistics that self._calculate_stats calculates from them
    """
    log(f"Evaluating {self.model.name} on {self.dataset.name} ...")
    predictions, ground_truths = self._get_results()
    n_sentences = len(predictions)
    log.debug(f"Calculating statistics for {n_sentences} sentences")
    stats = self._calculate_stats(predictions, ground_truths)
    return stats
def run_experiment(args: dict[str, Any]):
    """
    Fine-tunes a pretrained model for NER as specified by args and saves model and results to args["location"]
    If args["eval"] is set, the model is evaluated on the development set after every epoch,
    and the model from the best epoch is saved as well
    """
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    # Add new NER specific fields to metadata
    metadata["NER-words-only"] = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    # Without a dev. dataloader, no running evaluation is performed during training
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention        = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout               = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        # This message was a copy of the dev. set message even though it belongs to the training set
        log("True training set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        # The best model is only tracked when running evaluation is performed
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
def evaluate_ner(model: nn.Module, dataloader: torch.utils.data.DataLoader, dataset: NERDataset, device: torch.device, split: Split, also_no_misc=True) -> NER_Results:
    """
    Forward passes all batches in the dataloader and compares the resulting predictions
    to the annotations of the given split of the dataset
    The classification report and the confusion matrix are logged and returned
    If also_no_misc is set, both are also calculated after _rm_misc has been applied
    to annotations and predictions
    The model is put in evaluation mode and is left in this mode
    """
    model.eval()
    annotations, texts = dataset.data[split].annotations, dataset.data[split].texts
    # One dict for every text, mapping spans to the predicted probability distribution
    span_probs: list[dict[tuple[int, int], np.ndarray]] = [dict() for _ in range(len(texts))]
    log.debug(f"Forward passing {len(dataloader)} batches")
    TT.tick()
    for batch in tqdm(dataloader):
        scores = model(batch)
        probs = F.softmax(scores, dim=2)
        # We save probability distribution, for every possible span in the example
        for idx, (i, spans) in zip(batch.text_nums, enumerate(batch.entities.fullword_spans)):
            span_probs[idx].update({
                span: probs[i, j].detach().cpu().numpy()
                for j, span in enumerate(spans) if span
            })
    preds = [span_probs_to_preds(p, len(t), dataset) for p, t in zip(span_probs, texts)]
    log(f"Forward pass completed: Wall time: {TT.tock():.4f} s.")

    stats = _stats_to_py_nums(
        classification_report(annotations, preds, output_dict=True, zero_division=0)
    )
    log(classification_report(annotations, preds, zero_division=0, digits=4))
    confmat = confusion_matrix(annotations, preds, dataset.all_labels)
    confmat_nomisc = dict()
    log("Prediction distribution", _format_confmat(confmat))

    if also_no_misc:
        #FIXME: Do this manually instead of rerunning everything
        # zero_division is set as in the reports above
        stats_nomisc = _stats_to_py_nums(
            classification_report(
                _rm_misc(annotations, dataset.null_label),
                _rm_misc(preds, dataset.null_label),
                output_dict=True,
                zero_division=0,
            )
        )
        log(classification_report(
            _rm_misc(annotations, dataset.null_label),
            _rm_misc(preds, dataset.null_label),
            zero_division=0,
            digits=4,
        ))
        confmat_nomisc = confusion_matrix(
            _rm_misc(annotations, dataset.null_label),
            _rm_misc(preds, dataset.null_label),
            dataset.all_labels,
        )
        # Log the confusion matrix that was just calculated and not the full one again
        log("Prediction distribution", _format_confmat(confmat_nomisc))

    return NER_Results(
        preds                   = preds,
        span_probs              = span_probs,
        statistics              = stats,
        statistics_nomisc       = stats_nomisc if also_no_misc else {},
        confusion_matrix        = confmat,
        confusion_matrix_nomisc = confmat_nomisc,
    )