def word_preds(datadir: str, ff_size: int):
    """Measure the masked-token prediction accuracy of daBERT on a dataset.

    Loads the dataset metadata found in ``datadir``, forward passes every
    batch through the pretrained model and logs the mean and sample standard
    deviation of the per-batch accuracies on the masked positions.

    Args:
        datadir: Directory holding the dataset and its metadata file. The
            log file is written here as well.
        ff_size: Passed as first argument to ``get_dataloader``.
    """
    log_path = os.path.join(datadir, "dabert-word-preds.log")
    log.configure(log_path, "daBERT word predictions")

    log("Loading metadata")
    metadata_path = os.path.join(datadir, DatasetBuilder.metadata_file)
    with open(metadata_path) as metadata_fp:
        metadata = json.load(metadata_fp)

    log("Loading model")
    dabert = AutoModelForPreTraining.from_pretrained(daBERT).to(device)

    log("Loading data")
    loader = DataLoader(datadir, metadata, dict(), device).get_dataloader(ff_size, None)

    log("Forward passing")
    n_batches = len(loader)
    batch_accuracies = np.zeros(n_batches)
    for batch_idx, batch in tqdm(enumerate(loader), total=n_batches):
        prediction_logits = dabert(batch.words.ids).prediction_logits
        # Only the masked positions are scored
        predicted_ids = prediction_logits[batch.word_mask].argmax(dim=1)
        hits = (predicted_ids == batch.word_mask_labels).float()
        batch_accuracies[batch_idx] = hits.mean().cpu()

    mean_pct = 100 * batch_accuracies.mean()
    std_pct = 100 * batch_accuracies.std(ddof=1)
    log(
        "MLM token prediction accuracy",
        " Mean: %.4f %%" % mean_pct,
        " Std.: %.4f %%" % std_pct,
    )
def type_distribution(seqs: list[list[str]]):
    """Count how often each label type occurs in a collection of sequences.

    A label containing ``-`` is counted under the part after its last ``-``;
    any other label is counted as-is. The resulting distribution is logged
    as JSON and returned.

    Args:
        seqs: Sequences of string labels.

    Returns:
        A ``defaultdict`` mapping label type to its number of occurrences.
    """
    counts = defaultdict(int)
    for labels in seqs:
        for label in labels:
            # Without a "-" the split yields the label itself
            entity_type = label.rsplit("-", 1)[-1]
            counts[entity_type] += 1
    log("Type distribution:", json.dumps(counts, indent=4))
    return counts
def build(self):
    """Write the entity vocabulary as JSON into the output directory."""
    log("Saving tokenizer config and word token config to '%s'" % self.out_dir)
    vocab_path = os.path.join(self.out_dir, self.entity_vocab_file)
    with open(vocab_path, "w", encoding="utf-8") as vocab_fp:
        log("Saving entity vocab to '%s'" % vocab_path)
        ujson.dump(self.entity_vocab, vocab_fp, indent=2)
def _show_examples(res: GeometryResults, X: np.ndarray, I: np.ndarray, data: Sequences):
    """Log the texts behind the selected representations.

    For every index in ``I`` the corresponding text is logged with the span
    wrapped in ``{`` and ``}``, prefixed by the position in ``I``, the row of
    ``X`` and the annotation type at the start of the span.

    Args:
        res: Results whose ``content`` entries hold ``text_num`` and ``span``.
        X: Array indexed by the same indices as ``res.content``.
        I: Indices of the examples to show, in display order.
        data: Source of the ``texts`` and ``annotations`` to look up.
    """
    for rank, idx in enumerate(I):
        entry = res.content[idx]
        text_num = entry["text_num"]
        span = entry["span"]

        words = list(data.texts[text_num])
        words.insert(span[0], "{")
        # The opening brace shifted everything after it one position right
        words.insert(span[1] + 1, "}")
        marked_text = " ".join(words)

        label = data.annotations[text_num][span[0]]
        entity_type = label.split("-")[1] if "-" in label else label
        log(f"{rank} ({X[idx]}) {entity_type}: {marked_text}\n", with_info=False)
def _reduce_tokens(self) -> tuple[np.ndarray, int]:
    """Reduce the token vocabulary to the most frequent tokens in the dataset.

    Counts every token id in ``self.examples``, keeps the ``self.vocab_size``
    most frequent ids plus all special tokens, and builds a map from original
    token ids to new, contiguous ids. Every dropped token is mapped to the
    new id of the unknown token. The map is saved to ``self.token_map_file``.

    Returns:
        The token map (one entry per id of the original vocabulary) and the
        number of kept tokens.
    """
    token_counts = np.zeros(self.tokenizer.vocab_size, dtype=np.int32)
    log("Counting tokens in dataset")
    for example in tqdm(self.examples):
        word_ids = np.array(example["word_ids"])
        word_ids, counts = unique(word_ids, return_counts=True)
        token_counts[word_ids] += counts
    log("%i of %i tokens in the vocab are used" % ((token_counts > 0).sum(), self.tokenizer.vocab_size))

    *ids, unk_id = get_special_ids(self.tokenizer)
    special_ids = [*ids, unk_id]
    unk_count = token_counts[unk_id]
    # Special tokens are kept unconditionally below, so they must not
    # compete for a place among the most frequent tokens
    token_counts[special_ids] = -1

    sort_idx = np.argsort(token_counts)[::-1]
    keep_idx = sort_idx[:self.vocab_size]
    keep = np.zeros_like(token_counts, dtype=bool)
    keep[keep_idx] = True
    keep[special_ids] = True  # Always keep special tokens

    # Kept tokens get contiguous ids in their original order
    token_map = np.arange(self.tokenizer.vocab_size)
    kept_ids = np.where(keep)[0]
    token_map[kept_ids] = np.arange(len(kept_ids))
    # Dropped tokens become the unknown token. Its *new* id must be used:
    # the old id only coincides with it when all ids below unk_id are kept
    token_map[~keep] = token_map[unk_id]

    log(
        "Reduced token vocabulary to %i tokens" % keep.sum(),
        "%.6f %% of word tokens in the dataset are now %s" % (
            100 * (unk_count + 1 + token_counts[~keep].sum()) / (unk_count + 1 + token_counts.sum()),
            self.tokenizer.unk_token,
        ),
    )

    np.save(self.token_map_file, token_map)
    log("Saved token map to '%s'" % self.token_map_file)

    return token_map, int(keep.sum())
def document(self, loader: DataLoader, split: Split) -> dict[str, int]:
    """
    To be run after _build_examples to document the resulting data.

    Logs the number of documents in ``split``, the number of examples in
    ``loader`` and the average share of spans per example (ignoring labels
    equal to -1) whose label is not the null label. Nothing is returned by
    the body, regardless of the return annotation.
    """
    examples = [example for _, example in loader.dataset]

    positive_shares = list()
    for example in examples:
        span_labels = example.entities.labels
        considered = span_labels[span_labels != -1]
        is_positive = considered != self.label_to_idx[self.null_label]
        positive_shares.append(is_positive.float().mean().item())

    n_documents = len(self.data[split].texts)
    log(f"Built dataset of {n_documents} documents divided into {len(examples)} examples to be forward passed")
    log(f"Average proportion of spans in each example that have positive labels: {np.mean(positive_shares)*100:.2f}%")
def main(daluke_path: str, other_path: str, show: bool):
    """Compare saved DaLUKE NER predictions with those of another model.

    Logs a confusion matrix between the two sets of predictions and the
    result of ``sequence_covar``. With ``show`` set, every DaNE test sentence
    on which the two models disagree is logged as a table as well.

    Args:
        daluke_path: Location of the DaLUKE results; the log is written here.
        other_path: Location of the other model's results. Its last path
            component is used as the name of that model.
        show: Whether to print the sentences where the predictions differ.
    """
    other_name = os.path.basename(other_path)
    log_file = os.path.join(daluke_path, f"comparison_with_{other_name}.log")
    log.configure(log_file, print_level=Levels.DEBUG)

    daluke_res = NER_Results.load(daluke_path)
    other_res = NER_TestResults.load(other_path)

    if show:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        test_data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device).data[Split.TEST]
        sentences = zip(daluke_res.preds, other_res.predictions, test_data.annotations, test_data.texts)
        for da_preds, ot_preds, truths, text in sentences:
            if da_preds == ot_preds:
                continue
            table = Table()
            rows = (
                ("Text:", text),
                ("Truth:", truths),
                ("DaLUKE pred:", da_preds),
                (f"{other_name} pred:", ot_preds),
            )
            for header, values in rows:
                table.add_row([header] + values)
            log(str(table).replace("|", ""), with_info=False)

    log(f"Confusion matrix with DaLUKE results ↓ and results from {other_name} →")
    confmat = confusion_matrix(daluke_res.preds, other_res.predictions, ["LOC", "PER", "ORG", "MISC", "O"])
    log(_format_confmat(confmat))
    log(f"Covar. {sequence_covar(daluke_res.preds, other_res.predictions)}")
def run_experiment(args: dict[str, Any]):
    """Run a k-fold cross-validation of NER fine-tuning.

    Loads the pretrained model archive and the dataset named in ``args``,
    cross-validates with ``args["k"]`` folds, saves the result of fold ``i``
    to ``res-cv{i}`` under ``args["location"]`` and logs the mean micro
    average F1 score across folds.

    Args:
        args: Experiment configuration. Keys read here are ``model``,
            ``dataset``, ``bert_attention``, ``dropout``, ``k`` and
            ``location``; the whole dict is also forwarded to
            ``load_dataset`` and ``cross_validate``.
    """
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    mask_id = entity_vocab["[MASK]"]["id"]
    pad_id = entity_vocab["[PAD]"]["id"]
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=mask_id, pad_id=pad_id)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model_options = dict(
        entity_embedding_size=ent_embed_size,
        bert_attention=args["bert_attention"],
        dropout=args["dropout"],
    )
    model = load_model(state_dict, dataset, metadata, device, **model_options)

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for fold, fold_result in enumerate(cv_results):
        fold_result.save(os.path.join(args["location"], f"res-cv{fold}"))

    fold_scores = [fold_result.statistics["micro avg"]["f1-score"] for fold_result in cv_results]
    log("Micro avg. F1 estimate", np.mean(fold_scores))
def optimize(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any], sampler: Sampler):
    """Evaluate hyperparameter samples until the sampler is exhausted.

    Each sample overrides the matching entries of ``args`` and is evaluated
    on a fresh copy of ``model``. Every result is saved to ``res-optim{i}``
    under ``args["location"]`` and a message is logged whenever the micro
    average F1 score beats all previous samples.

    Args:
        model: Model that is copied for every trial.
        dataset: Dataset handed to ``objective_function``.
        args: Base training arguments; ``location`` is read here.
        sampler: Source of parameter dicts; ``sample()`` returning ``None``
            ends the search.
    """
    def micro_f1(res):
        return res.statistics["micro avg"]["f1-score"]

    results, tried_params = [], []
    best = None
    trial = 0
    while True:
        sampled_params = sampler.sample()
        if sampled_params is None:
            break

        # NOTE(review): `f` is not defined in this block; presumably a
        # module-level formatting helper — confirm.
        log.section(f"Sampling #{trial}: chose", f(sampled_params))
        trial_args = {**args, **sampled_params}
        result = objective_function(deepcopy(model), dataset, trial_args)

        score = micro_f1(result)
        if best is None or score > micro_f1(results[best]):
            log(f"Found new best at F1 of {score}")
            best = trial

        out = os.path.join(args['location'], f"res-optim{trial}")
        result.save(out)
        log.debug(f"Saved results to {out}")

        results.append(result)
        tried_params.append(sampled_params)
        trial += 1
def run_experiment(args: dict[str, str]):
    """Evaluate every requested model on every requested dataset.

    The ``models`` and ``datasets`` entries of ``args`` are whitespace
    separated names; the value ``"all"`` is replaced in ``args`` by the full
    list of names. Each result is saved under ``args["location"]`` in a
    folder named ``<model name>-<dataset name>``.

    Args:
        args: Keys read are ``models``, ``datasets``, ``location``,
            ``daner``, ``wikiann`` and ``plank``.
    """
    expansions = (("models", ALL_MODEL_NAMES), ("datasets", ALL_DATASET_NAMES))
    for key, all_names in expansions:
        if args[key] == "all":
            args[key] = all_names

    model_names = args["models"].split()
    models = setup_models(model_names, args["location"], daner_path=args["daner"])
    log(f"Succesfully set up {len(models)} models")

    dataset_names = args["datasets"].split()
    datasets = setup_datasets(dataset_names, wikiann_path=args["wikiann"], plank_path=args["plank"])
    log(f"Sucessfully acquired {len(datasets)} NER datasets")

    for model in models:
        for dataset in datasets:
            result = Evaluator(model, dataset).run()
            folder = "-".join((model.name, dataset.name))
            result.save(os.path.join(args["location"], folder))
def setup_datasets(names_to_setup: list[str], wikiann_path: str="wikiann", plank_path: str="plank", split="test") -> list[TestDataset]:
    """Look up the datasets with the given names and set them up.

    Args:
        names_to_setup: Names matched against ``d.name`` of ``ALL_DATASETS``.
        wikiann_path: Passed as ``data_path`` to ``Wikiann`` datasets.
        plank_path: Passed as ``data_path`` to ``Plank`` datasets.
        split: Passed as ``split`` to the ``setup`` of every dataset.

    Returns:
        The set up datasets, in the order of ``names_to_setup``.

    Raises:
        ValueError: If a name matches no known dataset.
    """
    datasets = []
    for name in names_to_setup:
        try:
            datasets.append(
                next(d for d in ALL_DATASETS if d.name == name)
            )
        # An exhausted generator makes next() raise StopIteration (not
        # IndexError), so that is what signals an unknown name
        except StopIteration as si:
            raise ValueError(f"Dataset with given name {name} not found, see --help for options") from si

    for d in datasets:
        log(f"Setting up dataset \"{d.name}\" ...")
        kwargs = dict()
        if isinstance(d, Wikiann):
            kwargs["data_path"] = wikiann_path
        elif isinstance(d, Plank):
            kwargs["data_path"] = plank_path
        d.setup(**kwargs, split=split)
    return datasets
def main(path: str, pred: str, truth: str):
    """Log DaNE test sentences containing a particular kind of mistake.

    A sentence is shown when at least one position has a prediction that
    differs from the annotation, with ``cla`` of the prediction equal to
    ``pred`` and ``cla`` of the annotation equal to ``truth``.

    Args:
        path: Location of the saved NER results; the log is written here.
        pred: Wanted class of the (wrong) prediction.
        truth: Wanted class of the true annotation.
    """
    log_file = os.path.join(path, f"prediction-examples-{pred}-{truth}.log")
    log.configure(log_file, print_level=Levels.DEBUG)
    log(f"Looking for examples where model predicted {pred}, but the truth was {truth}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    res = NER_Results.load(path)
    test_data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device).data[Split.TEST]

    def is_wanted_error(predicted, annotated):
        return predicted != annotated and cla(predicted) == pred and cla(annotated) == truth

    for preds, truths, text in zip(res.preds, test_data.annotations, test_data.texts):
        if not any(is_wanted_error(p, a) for p, a in zip(preds, truths)):
            continue
        table = Table()
        for header, values in (("Text:", text), ("Truth:", truths), ("Pred:", preds)):
            table.add_row([header] + values)
        log(str(table).replace("|", ""), with_info=False)
def setup_models(names_to_setup: list[str], location: str, daner_path: str = "daner") -> list[NER_TestModel]:
    """Look up the models with the given names and set them up.

    Args:
        names_to_setup: Names matched against ``m.name`` of ``ALL_MODELS``.
        location: Passed as ``data_path`` to ``Daner`` models.
        daner_path: Passed as ``repo_path`` to ``Daner`` models.

    Returns:
        The set up models, in the order of ``names_to_setup``.

    Raises:
        ValueError: If a name matches no known model.
    """
    models = []
    for name in names_to_setup:
        candidates = [m for m in ALL_MODELS if m.name == name]
        try:
            models.append(candidates[0])
        except IndexError as ie:
            raise ValueError(
                f"Model with given name {name} not found, see --help for options"
            ) from ie

    for m in models:
        log(f"Setting up model \"{m.name}\" ... ")
        # Only Daner takes extra setup arguments
        if isinstance(m, Daner):
            kwargs = {"repo_path": daner_path, "data_path": location}
        else:
            kwargs = {}
        m.setup(**kwargs)
    return models
def ner(filepath: str, text: str):
    """Predict named entities in a text and log them as a table.

    Exactly one of ``filepath`` and ``text`` must be given. The text is split
    on whitespace and each word is shown next to its prediction.

    Args:
        filepath: File to read the text from.
        text: The text itself.

    Raises:
        ValueError: If neither or both of the arguments are given.
    """
    has_file, has_text = bool(filepath), bool(text)
    if not (has_file or has_text):
        raise ValueError("Either filepath or text must be given")
    if has_file and has_text:
        raise ValueError("Filepath and text cannot both be given")
    if has_file:
        with open(filepath) as text_file:
            text = text_file.read()

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_ner = AutoNERDaLUKE()
        preds = predict_ner(text, daluke_ner)

    table = Table()
    table.add_header(["Word", "IOB NER Prediction"])
    for row in zip(text.split(), preds):
        table.add_row(list(row))
    log(table)
def main(path: str, n: int):
    """Log the training examples at the extremes of the analysed dimensions.

    For every ``field: axis`` pair in ``OF_INTEREST`` the ``n`` examples with
    the highest and the ``n`` with the lowest value along ``axis`` of that
    result field are shown.

    Args:
        path: Location of the saved geometry results; the log is written here.
        n: Number of examples to show at each extreme.
    """
    log.configure(
        os.path.join(path, "geometry-examples.log"),
        "daLUKE examples",
        print_level=Levels.DEBUG,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hardcoded to train
    train_data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device).data[Split.TRAIN]
    set_seeds()

    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)

    for field, axis in OF_INTEREST.items():
        log.section(field)
        X = getattr(res, field)
        ascending = X[:, axis].argsort()
        highest, lowest = ascending[::-1][:n], ascending[:n]

        log(f"Examples where dim. {axis} is high")
        _show_examples(res, X, highest, train_data)
        log(f"Examples where dim. {axis} is low")
        _show_examples(res, X, lowest, train_data)
def objective_function(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any]) -> NER_Results:
    """Train a model with the given arguments and return its best dev result.

    Args:
        model: Model to train; training runs on the device of its parameters.
        dataset: Dataset providing the train and dev dataloaders.
        args: Training arguments. Keys read are ``batch_size``, ``epochs``,
            ``lr``, ``warmup_prop`` and ``weight_decay``.

    Returns:
        The dev set evaluation of the epoch marked as best by the training.
    """
    train_loader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_loader = dataset.build(Split.DEV, EVAL_BATCH)
    model_device = next(model.parameters()).device

    # NOTE(review): loss_weight is fed from args["batch_size"] here, whereas
    # other callers of TrainNER in this code base use a "loss_weight"
    # argument — confirm this is intended.
    trainer = TrainNER(
        model,
        train_loader,
        dataset,
        device=model_device,
        epochs=args["epochs"],
        lr=args["lr"],
        warmup_prop=args["warmup_prop"],
        weight_decay=args["weight_decay"],
        dev_dataloader=dev_loader,
        loss_weight=args["batch_size"],
    )
    train_res = trainer.run()

    log.debug("Evaluating")
    best_res = train_res.running_dev_evaluations[train_res.best_epoch]
    log(f"Best model achieved {best_res.statistics['micro avg']['f1-score']} in mic-F1")
    return best_res
def _calculate_stats(self, preds: list[list[str]], truth: list[list[str]]) -> NER_TestResults:
    """Compute and log classification reports for a set of predictions.

    A second report, on data passed through ``_rm_misc``, is computed only
    when some true annotation contains "MISC"; otherwise the statistics
    without MISC are the very same object as the full statistics.

    Args:
        preds: Predicted labels, one list per sentence.
        truth: True labels, one list per sentence.

    Returns:
        Test results holding the predictions and both sets of statistics.
    """
    # Convert to python numericals to avoid json serialization problems
    # Set divide by zero cases to 0 to avoid warnings for models that can't see "MISC"
    full_report = classification_report(truth, preds, output_dict=True, zero_division=0)
    stats = self._stats_to_py_nums(full_report)

    # If the dataset includes the MISC category, a version of the result without this is computed
    truth_has_misc = any(any("MISC" in ent for ent in sent) for sent in truth)
    if truth_has_misc:
        nomisc_report = classification_report(self._rm_misc(truth), self._rm_misc(preds), output_dict=True)
        stats_nomisc = self._stats_to_py_nums(nomisc_report)
    else:
        stats_nomisc = stats

    # FIXME: Do this manually instead of rerunning everything
    log(classification_report(truth, preds, zero_division=0, digits=4))
    if stats != stats_nomisc:
        log(classification_report(self._rm_misc(truth), self._rm_misc(preds), digits=4))

    return NER_TestResults(
        modelname=self.model.name,
        dataname=self.dataset.name,
        predictions=preds,
        statistics=stats,
        statistics_nomisc=stats_nomisc,
    )
def masked(filepath: str, text: str, entity_spans: list[str]):
    """
    Fill in the [MASK] tokens of a text and log the predictions.

    Entities are given as 'start1,end1;start2,end2 ...'
    Ends are optional. If not given, they will be set to start+1
    Spans are 1-indexed with inclusive ends

    Exactly one of filepath and text must be given, otherwise a ValueError
    is raised. Although annotated as a list, entity_spans is split on ";"
    here, i.e. it is handled as a single string.
    """
    has_file, has_text = bool(filepath), bool(text)
    if not (has_file or has_text):
        raise ValueError("Either filepath or text must be given")
    if has_file and has_text:
        raise ValueError("Filepath and text cannot both be given")
    if has_file:
        with open(filepath) as text_file:
            text = text_file.read()

    # Convert to 0-indexed starts with exclusive ends
    parsed_spans = list()
    for span in entity_spans.split(";"):
        if not span:
            continue
        if "," in span:
            parts = span.split(",")
            parsed_spans.append((int(parts[0]) - 1, int(parts[1])))
        else:
            parsed_spans.append((int(span) - 1, int(span)))
    entity_spans = parsed_spans

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_mlm = AutoMLMDaLUKE()
        text, top_preds = predict_mlm(text, entity_spans, daluke_mlm)

    log("The top 5 predictions with likelihoods for each [MASK] were", top_preds)
    log("DaLUKE's best predictions were", text)
def collect_representations(
    modelpath: str,
    device: torch.device,
    target_device: torch.device,
    only_positives: bool,
    fine_tuned: bool
) -> tuple[np.ndarray, np.ndarray, list[dict[str, int | list[tuple[int, int]]]]]:
    """Forward pass the DaNE training data and collect span representations.

    The representation of a span is the concatenation, along dimension 2, of
    the start word, end word and entity representations.

    Args:
        modelpath: Path of the model archive to load.
        device: Device on which data and model are placed.
        target_device: Device the collected tensors are moved to.
        only_positives: If set, spans whose label is 0 are left out.
        fine_tuned: If not set, the loaded state dict is first passed through
            ``mutate_for_ner`` and the resulting entity embedding size is
            given to ``load_model``.

    Returns:
        The representations, their labels and, for every representation, a
        dict with the ``text_num`` and ``span`` it stems from.
    """
    # The archive location is the `modelpath` argument; no `args` exists here
    entity_vocab, metadata, state_dict, token_map = load_from_archive(modelpath)

    log("Loading dataset")
    # Note: We dont fill out dict as we dont allow changing max-entities and max-entity-span here. If this results in an error for any dataset, we must change this.
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, FP_SIZE, shuffle=False)

    log("Loading model")
    if not fine_tuned:
        state_dict, ent_embed_size = mutate_for_ner(
            state_dict,
            mask_id=entity_vocab["[MASK]"]["id"],
            pad_id=entity_vocab["[PAD]"]["id"],
        )
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size if not fine_tuned else None,
    )
    model.eval()

    log("Forward passing examples")
    batch_representations, labels, content = list(), list(), list()
    for batch in tqdm(dataloader):
        # Use super class as we want the represenations
        word_representations, entity_representations = super(type(model), model).forward(batch)
        start_word_representations, end_word_representations = model.collect_start_and_ends(
            word_representations, batch)
        representations = torch.cat([
            start_word_representations, end_word_representations, entity_representations
        ], dim=2)

        # We dont want padding
        mask = batch.entities.attention_mask.bool()
        if only_positives:
            mask &= (batch.entities.labels != 0)
        batch_representations.append(representations[mask].contiguous().to(target_device))
        labels.append(batch.entities.labels[mask].contiguous().to(target_device))

        # Remember where every kept representation came from
        for i, text_num in enumerate(batch.text_nums):
            for j in range(batch.entities.N[i]):
                if mask[i, j]:
                    content.append(dict(
                        text_num=text_num,
                        span=batch.entities.fullword_spans[i][j],
                    ))

    return torch.cat(batch_representations).numpy(), torch.cat(labels).numpy(), content
def main(path: str, model: str, n_components: int, reducer_subsample: Optional[int], tsne_perplexity: float, umap_neighbours: int, umap_min_dist: float, only_positives: bool, fine_tuned: bool):
    """Analyse the geometry of the model representations and save the result.

    The representations are collected without gradients and reduced with
    PCA, UMAP and t-SNE. PCA is run on all representations; UMAP and t-SNE
    only see the first ``reducer_subsample`` of them when that is given.

    Args:
        path: Where the log and the results are saved.
        model: Passed to ``collect_representations`` as the model location.
        n_components: Passed to ``pca``.
        reducer_subsample: Optional cap on the examples used for UMAP/t-SNE.
        tsne_perplexity: Passed to ``tsne``.
        umap_neighbours: Passed to ``umap``.
        umap_min_dist: Passed to ``umap``.
        only_positives: Forwarded to ``collect_representations``.
        fine_tuned: Forwarded to ``collect_representations``.
    """
    set_seeds()
    log.configure(
        os.path.join(path, "geometry-analysis.log"),
        "daLUKE embedding geometry analysis",
        print_level=Levels.DEBUG,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cpu = torch.device("cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(model, device, cpu, only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")

    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)

    if reducer_subsample is not None:
        log.debug(f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE")
        representations = representations[:reducer_subsample]

    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)
    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    analysis = GeometryResults(
        pca_transformed=pca_transformed,
        umap_transformed=umap_transformed,
        tsne_transformed=tsne_transformed,
        labels=labels,
        principal_components=principal_components,
        content=content,
    )
    log("Saved analysis results to", analysis.save(path))
def cross_validate(model: NERDaLUKE, dataset: NERDataset, k: int, train_args: dict[str, Any]) -> list[NER_Results]:
    """Train and evaluate the model on k cross-validation splits.

    All data of the dataset is merged and randomly divided into ``k`` parts.
    Each part serves once as test data for a copy of the model that is
    trained on the remaining parts.

    Args:
        model: Model that is copied for every split.
        dataset: Dataset that is copied for every split.
        k: Number of splits.
        train_args: Training arguments. Keys read are ``batch_size``,
            ``epochs``, ``lr``, ``warmup_prop``, ``weight_decay`` and
            ``loss_weight``.

    Returns:
        One evaluation result per split.
    """
    cv_splits = random_divide(merge_data(list(dataset.data.values())), k)
    results = list()
    log(f"Split into {k} subdatasets with lengths {[len(c.texts) for c in cv_splits]}")
    for i, test_data in enumerate(cv_splits):
        log.section(f"Cross-validation split {i}")
        train_data = merge_data([s for j, s in enumerate(cv_splits) if j != i])

        # Create split specific model and data
        split_model = deepcopy(model)
        split_dataset = deepcopy(dataset)
        split_dataset.data[Split.TRAIN] = train_data
        split_dataloader = split_dataset.build(Split.TRAIN, train_args["batch_size"])

        log("Training")
        split_dataset.document(split_dataloader, Split.TRAIN)
        type_distribution(split_dataset.data[Split.TRAIN].annotations)
        trainer = TrainNER(
            split_model,
            split_dataloader,
            split_dataset,
            device=next(split_model.parameters()).device,
            epochs=train_args["epochs"],
            lr=train_args["lr"],
            warmup_prop=train_args["warmup_prop"],
            weight_decay=train_args["weight_decay"],
            dev_dataloader=None,  # Don't eval
            loss_weight=train_args["loss_weight"],
        )
        trainer.run()

        split_dataset.data[Split.TEST] = test_data
        split_test_dataloader = split_dataset.build(Split.TEST, EVAL_BATCH)

        log("Evaluation")
        # The test split must be documented with the test dataloader; the
        # training dataloader would report the training examples instead
        split_dataset.document(split_test_dataloader, Split.TEST)
        type_distribution(split_dataset.data[Split.TEST].annotations)
        results.append(
            evaluate_ner(split_model, split_test_dataloader, split_dataset, trainer.device, Split.TEST, also_no_misc=False)
        )
    return results
def run(self) -> TrainResults:
    """Train the model for ``self.epochs`` epochs.

    The loss of every batch is recorded. When a dev dataloader is set, the
    model is evaluated after each epoch and a copy of the model with the
    highest dev micro average F1 score so far is stored in
    ``self.best_model``.

    Returns:
        The training results: the losses and, if a dev dataloader is set,
        the per-epoch evaluations and the index of the best epoch.
    """
    res = TrainResults(
        epoch = 0,
        losses = list(),
        best_epoch = None,
        running_train_statistics = list(),
        running_dev_evaluations = list(),
        dev_pred_distributions = list(),
        dev_true_type_distribution = dict(),
        train_pred_distributions = list(),
        train_true_type_distribution = dict()
    )
    for i in range(self.epochs):
        res.epoch = i
        self.model.train()
        for j, batch in enumerate(self.dataloader):
            scores = self.model(batch)
            # Flatten scores and labels before handing them to the criterion
            loss = self.criterion(scores.view(-1, self.model.output_shape), batch.entities.labels.view(-1))
            loss.backward()

            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()

            res.losses.append(loss.item())
            log.debug(f"Epoch {i} / {self.epochs-1}, batch: {j} / {len(self.dataloader)-1}. LR: {self.scheduler.get_last_lr()[0]:.2e} Loss: {loss.item():.5f}.")

        # Perform running evaluation
        # NOTE(review): everything below is assumed to run only when a dev
        # dataloader is given, as the best-epoch check needs dev_results — confirm
        if self.dev_dataloader is not None:
            log("Evaluating on development set ...")
            dev_results = evaluate_ner(self.model, self.dev_dataloader, self.dataset, self.device, Split.DEV, also_no_misc=False)
            res.running_dev_evaluations.append(dev_results)
            res.dev_pred_distributions.append(type_distribution(dev_results.preds))

            log("Evaluating on training set ...")
            train_results = evaluate_ner(self.model, self.dataloader, self.dataset, self.device, Split.TRAIN, also_no_misc=False)
            res.running_train_statistics.append(train_results.statistics)
            res.train_pred_distributions.append(type_distribution(train_results.preds))
            # The first evaluated epoch is always the best so far
            if res.best_epoch is None or\
                    (dev_results.statistics["micro avg"]["f1-score"]) > res.running_dev_evaluations[res.best_epoch].statistics["micro avg"]["f1-score"]:
                log(f"Found new best model at epoch {i}")
                self.best_model = deepcopy(self.model)
                res.best_epoch = i
    return res
def run_experiment(args: dict[str, Any]):
    """Run a hyperparameter search for NER fine-tuning.

    Loads the pretrained model archive, builds the sampler named by
    ``args["sampler"]`` from the JSON file at ``args["params"]`` and hands
    model, dataset and sampler to ``optimize``.

    Args:
        args: Experiment configuration. Keys read here are ``model``,
            ``params``, ``sampler`` and ``dataset``; the whole dict is also
            forwarded to ``load_dataset`` and ``optimize``.
    """
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    mask_id = entity_vocab["[MASK]"]["id"]
    pad_id = entity_vocab["[PAD]"]["id"]
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=mask_id, pad_id=pad_id)

    log("Setting up sampler")
    with open(args["params"], "r") as params_file:
        param_lists = json.load(params_file)
    sampler_cls = SAMPLERS[args["sampler"]]
    sampler = sampler_cls(param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
def run_experiment(args: dict[str, Any]):
    """Evaluate a daLUKE NER model on the test split and save the results.

    Args:
        args: Experiment configuration. Keys read here are ``model`` and
            ``location``; the whole dict is also forwarded to
            ``load_dataset``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    test_loader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(test_loader, Split.TEST)
    true_annotations = dataset.data[Split.TEST].annotations
    type_distribution(true_annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, test_loader, dataset, device, Split.TEST)
    results.save(args["location"])
    type_distribution(results.preds)
def __init__(
    self,
    dump_db_file: str,  # Location of file build by build-dump-db
    tokenizer_name: str,  # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
    entity_vocab_file: str,  # Build by build-entity-vocab
    out_dir: str,  # Where to put finished dataset. All contents will be removed before saving dataset
    validation_prob: float,  # Chance of each finished document to be marked as part of validation set
    max_entities: int,  # Only up to this many entities are included in each sequence
    max_entity_span: int,  # Maximum number tokens an entity can span before sequence is discarded
    min_sentence_length: int,  # Minimum number of tokens a sentence must span to be included
    max_articles: int | None,
    max_vocab_size: int,
):
    """Load the resources needed to build a pretraining dataset.

    ``max_articles`` limits how many titles are processed (``None`` for no
    limit). ``max_vocab_size`` is the wanted size of the token vocabulary;
    -1 keeps the full vocabulary of the tokenizer.

    Raises:
        ModuleNotFoundError: If ``wikipedia2vec`` is not available.
    """
    if not wikipedia2vec_available:
        raise ModuleNotFoundError(
            "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
        )

    log("Reading dump database at %s" % dump_db_file)
    self.dump_db = DumpDB(dump_db_file)

    log("Building tokeninizer: %s" % tokenizer_name)
    self.tokenizer_name = tokenizer_name
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    log("Building sentence tokenizer: %s" % self.tokenizer_language)
    self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)

    log("Loading entity vocab at %s" % entity_vocab_file)
    self.entity_vocab = load_entity_vocab(entity_vocab_file)
    # Make sure IDs on non-ignored entities are contiguous
    num = 0
    for entity_info in self.entity_vocab.values():
        entity_info["id"] = num
        num += 1
    log("Entity vocab has size %i" % num)

    self.out_dir = out_dir
    self.data_file = os.path.join(self.out_dir, self.data_file)
    self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
    self.max_seq_length = self.tokenizer.model_max_length
    self.validation_prob = validation_prob
    self.max_entities = max_entities
    self.max_entity_span = max_entity_span
    self.min_sentence_length = min_sentence_length
    # Get maximum number of tokens in a sequence excluding start and end tokens
    self.max_num_tokens = self.max_seq_length - 2
    self.max_articles = max_articles
    # The vocabulary can never be larger than that of the tokenizer
    if max_vocab_size == -1:
        self.vocab_size = self.tokenizer.vocab_size
    else:
        self.vocab_size = min(max_vocab_size, self.tokenizer.vocab_size)

    # Filter titles so only real articles are included
    self.target_titles = list(self.dump_db.titles())

    # Remove old datafile if it exists
    if os.path.isfile(self.data_file):
        log.debug("Removing old datafile '%s'" % self.data_file)
        os.remove(self.data_file)

    self.examples = list()
class DatasetBuilder:
    """Builds a pretraining dataset from a dump database of articles."""

    tokenizer_language = "da"

    # Files saved by the build method
    metadata_file = "metadata.json"
    entity_vocab_file = "entity-vocab.json"
    data_file = "data.jsonl"
    token_map_file = "token-map.npy"

    def __init__(
        self,
        dump_db_file: str,  # Location of file build by build-dump-db
        tokenizer_name: str,  # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
        entity_vocab_file: str,  # Build by build-entity-vocab
        out_dir: str,  # Where to put finished dataset. All contents will be removed before saving dataset
        validation_prob: float,  # Chance of each finished document to be marked as part of validation set
        max_entities: int,  # Only up to this many entities are included in each sequence
        max_entity_span: int,  # Maximum number tokens an entity can span before sequence is discarded
        min_sentence_length: int,  # Minimum number of tokens a sentence must span to be included
        max_articles: int | None,
        max_vocab_size: int,
    ):
        """Load the resources needed to build a pretraining dataset.

        ``max_articles`` limits how many titles are processed (``None`` for
        no limit). ``max_vocab_size`` is the wanted size of the token
        vocabulary; -1 keeps the full vocabulary of the tokenizer.

        Raises:
            ModuleNotFoundError: If ``wikipedia2vec`` is not available.
        """
        if not wikipedia2vec_available:
            raise ModuleNotFoundError(
                "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
            )

        log("Reading dump database at %s" % dump_db_file)
        self.dump_db = DumpDB(dump_db_file)

        log("Building tokeninizer: %s" % tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        log("Building sentence tokenizer: %s" % self.tokenizer_language)
        self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)

        log("Loading entity vocab at %s" % entity_vocab_file)
        self.entity_vocab = load_entity_vocab(entity_vocab_file)
        # Make sure IDs on non-ignored entities are contiguous
        num = 0
        for entity_info in self.entity_vocab.values():
            entity_info["id"] = num
            num += 1
        log("Entity vocab has size %i" % num)

        self.out_dir = out_dir
        self.data_file = os.path.join(self.out_dir, self.data_file)
        self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
        self.max_seq_length = self.tokenizer.model_max_length
        self.validation_prob = validation_prob
        self.max_entities = max_entities
        self.max_entity_span = max_entity_span
        self.min_sentence_length = min_sentence_length
        # Get maximum number of tokens in a sequence excluding start and end tokens
        self.max_num_tokens = self.max_seq_length - 2
        self.max_articles = max_articles
        # The vocabulary can never be larger than that of the tokenizer
        if max_vocab_size == -1:
            self.vocab_size = self.tokenizer.vocab_size
        else:
            self.vocab_size = min(max_vocab_size, self.tokenizer.vocab_size)

        # Filter titles so only real articles are included
        self.target_titles = list(self.dump_db.titles())

        # Remove old datafile if it exists
        if os.path.isfile(self.data_file):
            log.debug("Removing old datafile '%s'" % self.data_file)
            os.remove(self.data_file)

        self.examples = list()

    def _tokenize(self, text: str, paragraph_text: str, idx: int) -> list[str]:
        """Tokenize ``text``, which starts at position ``idx`` of ``paragraph_text``.

        Returns an empty list for empty text and when the tokenizer fails.
        """
        if not text:
            return list()
        try:
            if isinstance(self.tokenizer, RobertaTokenizer):
                tokens = self.tokenizer.tokenize(
                    text,
                    add_prefix_space=idx == 0 or text.startswith(" ") or paragraph_text[idx - 1] == " ",
                )
            else:
                tokens = self.tokenizer.tokenize(text)
        except KeyboardInterrupt:
            # Make sure program can be keyboard interrupted despite needing to catch BaseException
            raise
        except BaseException as e:
            # Catch an exception caused by rust panicking in the tokenizer
            log.warning("Failed to tokenize text with exception '%s'\nText: '%s'" % (e, text))
            return list()

        return tokens

    def build(self):
        """Process the target pages and save entity vocab and metadata.

        The examples are shuffled and the first ``validation_prob`` share of
        them is marked as validation data. If the wanted vocabulary is
        smaller than that of the tokenizer, the token vocabulary is reduced
        and the examples are rewritten with the new token ids.
        """
        log("Saving tokenizer config and word token config to '%s'" % self.out_dir)
        with open(path := os.path.join(self.out_dir, self.entity_vocab_file), "w", encoding="utf-8") as ev:
            log("Saving entity vocab to '%s'" % path)
            ujson.dump(self.entity_vocab, ev, indent=2)

        log.section("Processing %i pages" % len(self.target_titles[:self.max_articles]))
        n_seqs, n_ents, n_word_toks, n_words = 0, 0, 0, 0
        for title in log.tqdm(tqdm(self.target_titles[:self.max_articles])):
            log("Processing %s" % title)
            with TT.profile("Process page"):
                s, e, nt, nw = self._process_page(title)
                n_seqs += s
                n_ents += e
                n_word_toks += nt
                n_words += nw

        log("Shuffling data")
        random.shuffle(self.examples)
        # After shuffling, the first n_vals examples are a random selection
        n_vals = int(self.validation_prob * len(self.examples))
        for i in range(n_vals):
            self.examples[i]["is_validation"] = True

        # Save metadata
        metadata = {
            "number-of-items": n_seqs,
            "number-of-word-tokens": n_word_toks,
            "number-of-words": n_words,
            "number-of-entities": n_ents,
            "number-of-val-items": n_vals,
            "max-seq-length": self.max_seq_length,
            "max-entities": self.max_entities,
            "max-entity-span": self.max_entity_span,
            "min-sentence-length": self.min_sentence_length,
            "base-model": self.tokenizer_name,
            "tokenizer-class": self.tokenizer.__class__.__name__,
            "language": self.dump_db.language,
            "reduced-vocab": self.vocab_size < self.tokenizer.vocab_size,
            "vocab-size": self.vocab_size,
        }

        if self.vocab_size < self.tokenizer.vocab_size:
            log.section("Reducing token number")
            with TT.profile("Reduce token vocab"):
                # The reduction reports the number of tokens actually kept
                token_map, metadata["vocab-size"] = self._reduce_tokens()
            with TT.profile("Rewrite dataset with new tokens"):
                self._update_tokens(token_map)

        with open(path := os.path.join(self.out_dir, self.metadata_file), "w") as f:
            log.section("Saving metadata to %s" % path)
            ujson.dump(metadata, f, indent=4)
def _update_tokens(self, token_map: np.ndarray):
    """Rewrite the word ids of every example through the token map.

    Args:
        token_map: Array that is indexed with the current word ids to get
            the new ones.
    """
    log("Updating dataset with kept tokens")
    for example in tqdm(self.examples):
        old_ids = example["word_ids"]
        new_ids = token_map[old_ids]
        example["word_ids"] = new_ids.tolist()
def main():
    """Collect the results of a pretraining into one compressed archive.

    The entity vocab, the metadata, the model file and, if the metadata says
    the vocabulary was reduced, the token map are packed into a gzipped tar
    file at ``outpath``. The system ``tar`` tool is used when available,
    otherwise the ``tarfile`` module.

    Raises:
        subprocess.CalledProcessError: If the system ``tar`` tool fails.
    """
    parser = ArgumentParser(description=(
        "Standalone convenience script used to collect the results from the pretraining of daLUKE "
        "performed by the pretraining module"
    ))
    parser.add_argument("inpath", type=str, help=(
        "Path to the output folder of the pretraining containing the model file. "
        "Entity vocab. and metadata are assumed to be in parent folder of this."
        "Can also be path to an exact model file, in which case this will be used instead of the newest."
    ))
    parser.add_argument("outpath", type=str, help="File path to the compressed model")
    parser.add_argument("--tmpdir", type=str, help="Where to create temporary folder", default="")
    args = parser.parse_args()
    log.configure(
        os.path.join(
            args.outpath if os.path.isdir(args.outpath) else os.path.dirname(args.outpath),
            "collect.log",
        ),
        "Collector",
        print_level=Levels.DEBUG,
    )

    in_is_dir = os.path.isdir(args.inpath)
    modelpath = args.inpath if in_is_dir else os.path.dirname(args.inpath)
    vocabfile = os.path.join(modelpath, "..", VOCAB_FILE)
    metafile = os.path.join(modelpath, "..", METADATA_FILE)
    modelfile = os.path.join(args.inpath, _get_newest_model(args.inpath)) if in_is_dir else args.inpath

    # An outpath without directory part has nothing to create, and
    # os.makedirs fails on an empty path
    out_dir = os.path.split(args.outpath)[0]
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ins, outs = [vocabfile, metafile, modelfile], [VOCAB_FILE, METADATA_FILE, MODEL_OUT]
    # If reduction is used, also collect the token map
    with open(metafile, "r") as f:
        is_reduced = json.load(f).get("reduced-vocab")
    if is_reduced:
        ins.append(os.path.join(modelpath, "..", DatasetBuilder.token_map_file))
        outs.append(TOKEN_MAP_FILE)

    tmpdir = os.path.join(args.tmpdir, "tmpdir")
    log.debug("Using:", *ins)
    # Operate directly on disk as opposed to serialize.save_to_archive which requires us to load the data into mem.
    if shutil.which("tar"):
        log.debug(f"Compressing to {args.outpath} using system tar tool...")
        try:
            os.makedirs(tmpdir, exist_ok=True)
            for f, n in zip(ins, outs):
                shutil.copy2(f, os.path.join(tmpdir, n))
            # run() drains the pipes while waiting, so tar cannot block on a
            # full pipe, and check=True keeps a failed tar from being
            # reported as a success below
            subprocess.run(
                ["tar", "-czvf", args.outpath, "-C", tmpdir] + outs,
                capture_output=True,
                check=True,
            )
        finally:
            shutil.rmtree(tmpdir)
    else:
        with tarfile.open(args.outpath, "w:gz") as tar:
            for f, n in zip(ins, outs):
                log.debug(f"Compressing {f} as {n} using build-in tar module (may take a while)...")
                tar.add(f, arcname=n)
    log("Succesfully compressed file saved to", args.outpath)
def preprocess(
    dump_db_file: str,
    function: str,
    entity_vocab_file: str | None,
    dagw_sections: str | None,
    min_entity_length: int,
    max_entity_length: int,
    max_articles: int | None,
):
    """Preprocess the articles of a compressed dump and write a new dump.

    All articles (and gigaword files, if ``dagw_sections`` is given) are
    first saved as separate files in a temporary directory next to the dump.
    ``func`` is then mapped over these files, after which they are written
    as pages to a new bz2 dump named after ``function``. The temporary
    directory is removed at the end.

    Raises:
        RuntimeError: If ``entity_vocab_file`` is not given.
    """
    if not entity_vocab_file:
        raise RuntimeError("entity-vocab-file must be given")

    log.configure(
        os.path.join(os.path.split(dump_db_file)[0], "preprocessing.log"),
        "Preprocessing",
        log_commit=True,
    )
    log.section("Collecting data")
    log(
        "Wikidump path: %s" % dump_db_file,
        "Function: %s" % function,
    )

    log("Loading entity vocab")
    entity_vocab = {
        _insert_xml_special_characters(e.lower())
        for e in load_entity_vocab(entity_vocab_file)
    }

    dagw_files = list()
    if dagw_sections:
        n_words = 0
        log("Finding gigaword data files and counting words")
        dagw_files = list(_get_dagw_files(dagw_sections))
        for dagw_file in tqdm(dagw_files):
            with open(dagw_file) as f:
                n_words += len(f.read().split())
        log("Found %i dagw files containing %i words" % (len(dagw_files), n_words))

    # tempdir is not used, as the temporary files can take up more space than what temporary
    # directories usually allow
    tmpdir = os.path.join(os.path.split(dump_db_file)[0], "tmpdir")
    os.makedirs(tmpdir, exist_ok=True)
    log("Saving all articles to temporary directory %s" % tmpdir)
    for dagw_file in tqdm(dagw_files):
        shutil.copy2(
            dagw_file,
            os.path.join(tmpdir, fix_filename(os.path.split(dagw_file)[-1])))

    log("Saving Wikipedia files to temporary directory")
    for is_text, text, title in tqdm(_get_lineblocks(dump_db_file), unit=" blocks"):
        if is_text and not ignore_title(title):
            # Keep what is between the first ">" and the closing text tag
            text_start = text.index(">") + 1
            text_end = -len("</text>\n")
            with open(
                    os.path.join(tmpdir, fix_filename(title)[:100] + ".wiki"),
                    "w") as f:
                f.write(text[text_start:text_end])

    # max_articles of None keeps every file
    files = [
        os.path.join(tmpdir, x) for x in os.listdir(tmpdir)[:max_articles]
    ]
    log("Saved a total of %i articles to %s" % (len(files), tmpdir))

    log.section("Beginning preprocessing on %i threads" % os.cpu_count())
    process_map(
        func,
        [(function, f, entity_vocab, min_entity_length, max_entity_length)
         for f in files],
        max_workers=os.cpu_count(),
        chunksize=1024,
    )

    dump_file = os.path.splitext(dump_db_file)[0] + ".%s.bz2" % function
    log.info("Saving preprocessed files to %s" % dump_file)
    with bz2.BZ2File(dump_file, "w") as dump:
        # Copy everything before the first page of the old dump
        with bz2.BZ2File(dump_db_file) as old_dump:
            line = b""
            while not line.strip().startswith(b"<page>"):
                dump.write(line)
                line = old_dump.readline()
        for i, fname in tqdm(enumerate(files), total=len(files)):
            with open(fname) as f:
                text = f.read()
            s = """ <page> <title>{title}</title> <id>{id}</id> <revision> <text bytes="{bytes}" xml:space="preserve">{text}</text> </revision> </page>""".format(
                title=fname,
                id=i + 1,
                bytes=len(text),
                text=text,
            )
            # The first page is written without the leading character of the template
            if i == 0:
                s = s[1:]
            dump.write(s.encode("utf-8"))
        dump.write(b"\n</mediawiki>")

    log.info("Removing temporary files")
    shutil.rmtree(tmpdir)
    log.info("Done preprocessing data")
next(d for d in ALL_DATASETS if d.name == name) ) except IndexError as ie: raise ValueError(f"Dataset with given name {name} not found, see --help for options") from ie for d in datasets: log(f"Setting up dataset \"{d.name}\" ...") kwargs = dict() if isinstance(d, Wikiann): kwargs["data_path"] = wikiann_path elif isinstance(d, Plank): kwargs["data_path"] = plank_path d.setup(**kwargs, split=split) return datasets if __name__ == '__main__': """ Shows some Data stats """ localdata = "../../local_data" localdata = os.path.join(sys.path[0], localdata) wikiann_p, plank_p = os.path.join(localdata, "wikiann"), os.path.join(localdata, "plank") log.configure(os.path.join(localdata, "data.log"), "data") for split in ("train", "dev", "test"): ds = setup_datasets(("DaNE", "Plank", "WikiANN"), wikiann_path=wikiann_p, plank_path=plank_p, split=split) for d in ds: log(f"{d.name} {split} sentences:", len(d.get_data()[0])) # now for better test statistics for d in ds: for ann in ("ORG", "PER", "LOC", "MISC"): log(f"#{ann} in {d.name}", sum(len([w for w in s if ann in w]) for s in d.get_data()[1]))