def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"],
    )

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size,
        bert_attention=args["bert_attention"],
        dropout=args["dropout"],
    )

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate", np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))
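# Usage sketch (not from the original source): run_experiment reads its
# configuration from a plain dict. Only the keys referenced in the body above
# are known; the values and paths here are hypothetical.
example_args = {
    "model": "path/to/daluke.tar.gz",  # hypothetical model archive path
    "dataset": "DaNE",
    "location": "path/to/cv-results",  # hypothetical output directory
    "k": 5,                            # number of cross-validation folds
    "bert_attention": False,
    "dropout": 0.1,
}
run_experiment(example_args)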
def collect_representations(
    modelpath: str,
    device: torch.device,
    target_device: torch.device,
    only_positives: bool,
    fine_tuned: bool,
) -> tuple[np.ndarray, np.ndarray, list[dict[str, int | list[tuple[int, int]]]]]:
    entity_vocab, metadata, state_dict, token_map = load_from_archive(modelpath)
    log("Loading dataset")
    # Note: We don't fill out the dict, as we don't allow changing max-entities and
    # max-entity-span here. If this results in an error for any dataset, we must change this.
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, FP_SIZE, shuffle=False)
    log("Loading model")
    if not fine_tuned:
        state_dict, ent_embed_size = mutate_for_ner(
            state_dict,
            mask_id=entity_vocab["[MASK]"]["id"],
            pad_id=entity_vocab["[PAD]"]["id"],
        )
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size if not fine_tuned else None,
    )
    model.eval()

    log("Forward passing examples")
    batch_representations, labels, content = list(), list(), list()
    for batch in tqdm(dataloader):
        # Use the superclass forward pass, as we want the representations
        word_representations, entity_representations = super(type(model), model).forward(batch)
        start_word_representations, end_word_representations = model.collect_start_and_ends(word_representations, batch)
        representations = torch.cat(
            [start_word_representations, end_word_representations, entity_representations],
            dim=2,
        )
        # We don't want padding
        mask = batch.entities.attention_mask.bool()
        if only_positives:
            mask &= (batch.entities.labels != 0)
        batch_representations.append(representations[mask].contiguous().to(target_device))
        labels.append(batch.entities.labels[mask].contiguous().to(target_device))
        for i, text_num in enumerate(batch.text_nums):
            for j in range(batch.entities.N[i]):
                if mask[i, j]:
                    content.append(dict(
                        text_num=text_num,
                        span=batch.entities.fullword_spans[i][j],
                    ))
    return torch.cat(batch_representations).numpy(), torch.cat(labels).numpy(), content
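# Minimal self-contained sketch of the masked-selection pattern used above
# (illustrative only; shapes and names here are assumptions, not daLUKE's API).
# A boolean mask over (batch, entity-slot) drops padded slots and, optionally,
# negative examples, then indexes the 3D representation tensor.
import torch

batch_size, max_entities, hidden = 2, 4, 8
representations = torch.randn(batch_size, max_entities, hidden)
attention_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])
labels = torch.tensor([[3, 0, 0, 0], [0, 2, 1, 0]])

mask = attention_mask.bool()   # drop padded entity slots
mask &= (labels != 0)          # keep only positive examples
selected = representations[mask]
assert selected.shape == (3, hidden)  # one row per kept (example, entity) pair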
def fetch_model(model: Models, force_download=False) -> tuple[DaLUKE, dict, dict, Optional[np.ndarray]]:
    # Make sure the .tar.gz model file exists
    os.makedirs(_download_dir, exist_ok=True)
    if should_download(model) or force_download:
        # Create status file
        pathlib.Path(_status_files[model]).touch()
        # Download
        wget.download(model.value, out=_model_files[model])
        # Remove status file
        os.remove(_status_files[model])

    # Read the model state dict along with metadata and entity vocab.
    # This is done in a separate working directory.
    cwd = os.getcwd()
    os.chdir(_download_dir)
    entity_vocab, metadata, state_dict, token_map = load_from_archive(_model_files[model])
    os.chdir(cwd)

    # Load model
    bert_config = AutoConfig.from_pretrained(metadata["base-model"])
    bert_config.vocab_size = metadata["vocab-size"]
    if model == Models.DaLUKE:
        net = PretrainTaskDaLUKE(bert_config, len(entity_vocab), get_ent_embed_size(state_dict))
    elif model == Models.DaLUKE_NER:
        net = NERDaLUKE(
            output_shape=5,  # Always use misc in this case
            bert_config=bert_config,
            ent_vocab_size=2,
            ent_embed_size=get_ent_embed_size(state_dict),
            dropout=0,
            words_only=False,
            entities_only=False,
        )
    net.load_state_dict(state_dict)
    net.eval()
    return net.to(_device), metadata, entity_vocab, token_map
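# Usage sketch (illustrative; assumes the Models enum exposes the DaLUKE_NER
# member referenced above). The returned network is already in eval mode and
# moved to the module-level _device; unpacking follows the return statement.
net, metadata, entity_vocab, token_map = fetch_model(Models.DaLUKE_NER)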
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"],
    )

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
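# Sketch (assumption, not from the source): the file passed as args["params"]
# appears to be JSON mapping hyperparameter names to candidate value lists,
# since it is json.load'ed and handed directly to the chosen sampler. The keys
# below are hypothetical examples of such a search space.
import json

example_param_lists = {
    "lr": [1e-5, 5e-5, 1e-4],
    "dropout": [0.0, 0.1, 0.2],
    "weight_decay": [0.0, 0.01],
}
with open("params.json", "w") as f:  # hypothetical path for args["params"]
    json.dump(example_param_lists, f)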
def run_experiment(args: dict[str, Any]):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(dataloader, Split.TEST)
    type_distribution(dataset.data[Split.TEST].annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, dataloader, dataset, device, Split.TEST)
    results.save(args["location"])
    type_distribution(results.preds)
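# Minimal sketch of what a type_distribution-style count could compute
# (illustrative only; the real implementation is not shown in this file, and
# the IOB tagging assumption and the example_ name are hypothetical).
from collections import Counter

def example_type_distribution(annotations: list[list[str]]) -> Counter:
    # Count each entity type once per "B-" tag in IOB-style label sequences
    return Counter(
        tag.split("-", 1)[1]
        for seq in annotations
        for tag in seq
        if tag.startswith("B-")
    )

print(example_type_distribution([["B-PER", "I-PER", "O", "B-LOC"], ["O", "B-PER"]]))
# Counter({'PER': 2, 'LOC': 1})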
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger=args["name"] + "-fine-tune",
        print_level=Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"],
    )

    # Add new NER-specific fields to metadata
    metadata["NER-words-only"] = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention=args["bert_attention"],
        entity_embedding_size=ent_embed_size,
        dropout=args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device=device,
        epochs=args["epochs"],
        lr=args["lr"],
        warmup_prop=args["warmup_prop"],
        weight_decay=args["weight_decay"],
        dev_dataloader=dev_dataloader,
        loss_weight=args["loss_weight"],
    )
    # Log important information
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)
    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log(f"Saving results and model to {args['location']}")
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True train. set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
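# Sketch of the linear warmup schedule that warmup_prop suggests: a standard
# BERT fine-tuning pattern of linear warmup followed by linear decay. Whether
# TrainNER uses exactly this schedule is an assumption, since its
# implementation is not shown here; the example_ name is hypothetical.
import torch

def example_warmup_schedule(optimizer, total_steps: int, warmup_prop: float):
    warmup_steps = int(total_steps * warmup_prop)
    def lr_lambda(step: int) -> float:
        if step < warmup_steps:
            # Linear warmup from 0 to the base learning rate
            return step / max(1, warmup_steps)
        # Linear decay back to 0 over the remaining steps
        return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# e.g.: sched = example_warmup_schedule(torch.optim.AdamW(model.parameters(), lr=1e-5), 1000, 0.06)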