def main(_params):
    global params
    params = _params

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir = prepare_data()
    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    # ## Create Dataset Objects
    train_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],  # True
        mode=Split.train,
        data_size=params["data_size"])
    eval_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],
        mode=Split.dev,
        data_size=params["data_size"])
    print(len(train_dataset), len(eval_dataset))

    # Train top-model using the Trainer API
    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map)
    gc.collect()
    torch.cuda.empty_cache()

    # ## Prepare test data, run the trainer over it, and print metrics.
    # overwrite_cache is passed as True because we may want fresh predictions
    # after simply replacing test.txt.
    test_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=True,
        mode=Split.test,
        data_size=params["data_size"])

    run_test(trainer, model, train_dataset, train_df, label_map)
    run_test(trainer, model, eval_dataset, dev_df, label_map)
    run_test(trainer, model, test_dataset, test_df, label_map)
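
if __name__ == "__main__":
    # Minimal invocation sketch (hedged): "data_size" is the only key this
    # main() reads directly; any other configuration is consumed inside
    # prepare_data()/run_train() and is not visible in this fragment. The
    # value 1000 is illustrative only.
    main({"data_size": 1000})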
def set_data(self, tok_sents: List[List[str]]):
    """Expects a document given as a list of sentences, where each sentence
    is already tokenized."""
    examples = []
    for guid, sent in enumerate(tok_sents):
        # The downstream reader expects newline-terminated words; dummy "O"
        # labels are used because no gold annotation exists at prediction time.
        words = [x + "\n" for x in sent]
        labels = ["O" for _ in range(len(sent))]
        examples.append(InputExample(guid=f"pred-{guid}", words=words, labels=labels))
    self.data = NerDataset(
        tokenizer=self.tokenizer,
        examples=examples,
        labels=["B", "O"],
        model_type="BertForTokenClassification",
        max_seq_length=256,
        mode=Split.pred,
    )
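
    # Usage sketch (hedged): assumes `predictor` is an instance of the class
    # that owns set_data(), with self.tokenizer already loaded; the sentence
    # content below is illustrative only.
    #
    #     predictor.set_data([
    #         ["Aspirin", "reduces", "fever", "."],
    #         ["Patient", "denies", "chest", "pain", "."],
    #     ])
    #     # predictor.data is now a NerDataset in Split.pred mode whose dummy
    #     # "O" labels exist only to satisfy the InputExample interface.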
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            " Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer.
    #
    # Distributed training: the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    tui_ids = None
    if data_args.umls:
        tui_ids = create_cui_dict(voc_updated=data_args.med_document, tokenizer=tokenizer)

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(predictions: np.ndarray,
                          label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        # Hoisted out of the loop: the ignore index is a constant (-100 by default).
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)
        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                    example_id = 0
                    for line in f:
                        try:
                            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                                writer.write(line)
                                if not preds_list[example_id]:
                                    example_id += 1
                            elif preds_list[example_id]:
                                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                                writer.write(output_line)
                            else:
                                logger.warning(
                                    "Maximum sequence length exceeded: No prediction for '%s'.",
                                    line.split()[0],
                                )
                        except IndexError:
                            # Ran past the last predicted example; stop rather
                            # than crash on trailing lines.
                            break

    return results
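
# For reference, a sketch of the CoNLL-style files the prediction writer above
# consumes and produces (tokens and tags are illustrative, in the CONLL-2003
# scheme referenced elsewhere in this script; one token per line, blank lines
# separating sentences):
#
#   test.txt (token + gold tag):      test_predictions.txt (token + predicted tag):
#   EU      B-ORG                     EU B-ORG
#   rejects O                         rejects O
#   German  B-MISC                    German B-MISC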
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Get datasets
    train_dataset = NerDataset(
        data_dir=args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=args.max_seq_length,
        overwrite_cache=False,
        mode=Split.train,
    )
    split = int(len(train_dataset) * 0.9)

    train_sampler = (SequentialSampler(train_dataset)
                     if args.local_rank == -1 else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.batch_size,
        collate_fn=default_data_collator,
    )

    # The last 10% of the training set is used for validation; note that the
    # training loader above still iterates the full dataset, so the two overlap.
    val_dataset = Subset(train_dataset, list(range(split, len(train_dataset))))
    val_sampler = (SequentialSampler(val_dataset)
                   if args.local_rank == -1 else DistributedSampler(val_dataset))
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        batch_size=args.batch_size,
        collate_fn=default_data_collator,
    )

    eval_dataset = NerDataset(
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            " Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer.
    #
    # Distributed training: the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(predictions: np.ndarray,
                          label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        # Hoisted out of the loop: the ignore index is a constant (-100 by default).
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list
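
    # Worked example (hedged; label ids are assumed, e.g. label_map = {0: "O",
    # 1: "B"}). align_predictions keeps only positions whose gold id is not the
    # CrossEntropyLoss ignore_index (-100), i.e. it skips padding/sub-word slots:
    #
    #   label_ids             = [[0, -100, 1]]
    #   predictions.argmax(2) = [[0,    0, 1]]
    #   => out_label_list = [["O", "B"]]   # the -100 column is dropped
    #      preds_list     = [["O", "B"]]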
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
def main(_params):
    global params
    params = _params
    # Seeds can alternatively be taken from CLI args:
    # params['seed_value'] = args.seed_value
    # params['set_seed'] = args.set_seed

    wb_run = wandb.init(project="NER", name=params['exp_name'] + "_init")

    if params['set_seed']:
        random_seed_set(params['seed_value'])

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir, wt = prepare_data()
    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    if 'add_vocab' in params:
        process_entity(tokenizer, train_df)
        process_entity(tokenizer, dev_df)
        process_entity(tokenizer, test_df)

    # ## Create Dataset Objects
    xargs = {}
    if params.get('xargs'):
        xargs = params['xargs']
    xargs['wt'] = wt
    print('Got class weights')
    xargs["top_model"] = params.get("top_model")

    train_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],  # True
        mode=Split.train,
        data_size=params["data_size"],
        xargs=xargs)
    eval_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],
        mode=Split.dev,
        data_size=100)

    # ## Prepare test data, run the trainer over it, and print metrics.
    # overwrite_cache is passed as True because we may want fresh predictions
    # after simply replacing test.txt.
    test_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=True,
        mode=Split.test,
        data_size=100)
    print(len(train_dataset), len(eval_dataset), len(test_dataset))
    wb_run.finish()

    # Train top-model using the Trainer API
    if params.get("hyp"):
        run_hyperp(train_dataset, eval_dataset, config, model_args, labels,
                   num_labels, label_map, tokenizer, xargs)
        return

    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map, tokenizer, xargs)
    gc.collect()
    torch.cuda.empty_cache()

    wb_run = wandb.init(project="NER", name=params['exp_name'] + "summary")
    report = run_test(trainer, model, train_dataset, train_df, label_map)
    wandb.run.summary["train_report"] = report
    report = run_test(trainer, model, eval_dataset, dev_df, label_map)
    wandb.run.summary["val_report"] = report
    report = run_test(trainer, model, test_dataset, test_df, label_map)
    wandb.run.summary["test_report"] = report

    wandb.run.summary["model"] = repr(model)
    wandb.run.summary["data"] = {
        "train": len(train_dataset),
        "val": len(eval_dataset),
        "test": len(test_dataset),
        "wt": wt,
    }
    params["model_type"] = params["model_type"].name
    wandb.run.summary["params"] = params
    wb_run.finish()
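
def random_seed_set(seed_value: int) -> None:
    """A minimal sketch (assumption: the real helper lives elsewhere in this
    repo) of what a seeding utility for this pipeline typically does."""
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed_value)                  # Python RNG
    np.random.seed(seed_value)               # NumPy RNG
    torch.manual_seed(seed_value)            # CPU RNG
    torch.cuda.manual_seed_all(seed_value)   # all GPU RNGs
    os.environ["PYTHONHASHSEED"] = str(seed_value)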
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            " Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer.
    #
    # Distributed training: the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenMultiLabelClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        multilabeling=True,
    ) if training_args.do_train else None)
    eval_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        multilabeling=True,
    ) if training_args.do_eval else None)

    def get_label_preds_refs(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        """Returns a list of labels for each token in each sequence in the dataset."""
        logit_threshold = 0.0  # Corresponds to a probability of 0.5 if fed through a sigmoid.
        preds = predictions > logit_threshold
        batch_size, seq_len, _ = preds.shape

        refs_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                preds_list[i].append(
                    [label_map[x] for x in np.where(preds[i][j] == 1)[0]])
                refs_list[i].append(
                    [label_map[x] for x in np.where(label_ids[i][j] == 1)[0]])

        return preds_list, refs_list

    def align_predictions(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        logit_threshold = 0.0  # Corresponds to a probability of 0.5 if fed through a sigmoid.
        preds = predictions > logit_threshold
        batch_size, seq_len, _ = preds.shape

        # is_tagged indicates, for each token, whether it has an associated tag
        # (i.e. a label, including the O label) and should be assessed;
        # otherwise it is a padding or special token.
        is_tagged = label_ids.sum(axis=2) > 0

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if is_tagged[i, j]:
                    out_label_list[i].append(
                        [label_map[x] for x in np.where(label_ids[i][j] == 1)[0]])
                    preds_list[i].append(
                        [label_map[x] for x in np.where(preds[i][j] == 1)[0]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        (chunk_prec, chunk_rec, chunk_f1,
         tok_prec, tok_rec, tok_f1) = fsn4nlp.utils.conlleval.evaluate_multilabel(
            out_label_list, preds_list)
        return {
            "chunk_precision": chunk_prec,
            "chunk_recall": chunk_rec,
            "chunk_f1": chunk_f1,
            "tok_precision": tok_prec,
            "tok_recall": tok_rec,
            "tok_f1": tok_f1,
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        labels=labels,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            data_format=data_args.data_format,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
            multilabeling=True,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, refs_list = get_label_preds_refs(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                for i, example in enumerate(test_dataset):
                    for tok_id in example.input_ids:
                        tok = tokenizer.convert_ids_to_tokens(tok_id)
                        if refs_list[i][0] == []:
                            # Untagged (padding/special) position: write the
                            # bare token with no labels.
                            output_line = f"{tok}\n"
                            refs_list[i].pop(0)
                        else:
                            output_line = f"{tok} {refs_list[i].pop(0)} {preds_list[i].pop(0)}\n"
                        writer.write(output_line)

    return results
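
def _demo_multilabel_decoding():
    """Illustrative sketch (not called anywhere): shows how the logit
    threshold of 0.0 used above decodes per-token label sets. The label_map
    values here are assumptions chosen for the example."""
    import numpy as np

    label_map = {0: "O", 1: "B-DRUG", 2: "B-DOSE"}
    # One sequence of two tokens, three labels; a logit > 0 means p > 0.5
    # after a sigmoid, so token 0 decodes to {B-DRUG} and token 1 to
    # {B-DRUG, B-DOSE}.
    logits = np.array([[[-1.2, 0.8, -0.3],
                        [-0.5, 1.1, 0.4]]])
    preds = logits > 0.0
    decoded = [[[label_map[x] for x in np.where(tok)[0]] for tok in seq]
               for seq in preds]
    assert decoded == [[["B-DRUG"], ["B-DRUG", "B-DOSE"]]]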