from typing import NamedTuple

import torch
from datasets import load_dataset
# from datasets import GenerateMode  # only needed if the download_mode line below is re-enabled
from transformers import (
    DataCollatorForLanguageModeling,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    TrainingArguments,
)
# DataCollatorForTargetedMasking, MyTrainer, ShowExample, compute_metrics, config,
# and CACHE are project-local and come from this repo's own modules.


def train(no_cache: bool, dataset_path: str, data_config_name: str, training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"\nLoading datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/biolang',
        data_config_name,
        data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE
    )
    if data_config_name != "MLM":
        # targeted masking: the collator masks only the positions flagged in pos_mask
        data_collator = DataCollatorForTargetedMasking(
            tokenizer=tokenizer,
            max_length=config.max_length
        )
    else:
        # plain masked language modeling with random masking
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True
        )
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")
    if config.from_pretrained:
        model = RobertaForMaskedLM.from_pretrained(config.from_pretrained)
    else:
        model_config = RobertaConfig(
            vocab_size=config.vocab_size,
            max_position_embeddings=config.max_length + 2,  # max_length + 2 for start/end tokens
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=model_config)
    training_args.remove_unused_columns = False  # the collator needs pos_mask and special_tokens_mask
    print("\nTraining arguments:")
    print(training_args)
    trainer = MyTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[ShowExample(tokenizer)]
    )
    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)
    print(f"Testing on {len(test_dataset)} examples.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
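
# Hypothetical usage sketch for the MLM pre-training entry point above; the
# "roberta-base" checkpoint, output_dir, and dataset_path are illustrative
# assumptions, not values taken from this repo.
def run_mlm_example():
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    training_args = TrainingArguments(
        output_dir="./lm_models",
        per_device_train_batch_size=16,
    )
    train(
        no_cache=False,
        dataset_path="./data/biolang",
        data_config_name="MLM",
        training_args=training_args,
        tokenizer=tokenizer,
    )
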
# Token classification fine-tuning (NER / semantic roles); additional imports:
from transformers import (
    DataCollatorForTokenClassification,
    RobertaForTokenClassification,
    Trainer,
)
# DataCollatorForMaskedTokenClassification, MetricsComputer, ShowExample, config,
# CACHE, and LM_MODEL_PATH are project-local and come from this repo's own modules.


def train(no_cache: bool, dataset_path: str, data_config_name: str, training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',  # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE
    )
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")
    if data_config_name in ["NER", "ROLES"]:
        # use our fancy data collator that randomly masks some of the inputs to enforce context learning
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels
        )
    else:
        # normal token classification
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length
        )
    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} labels:")
    print(", ".join(label_list))
    compute_metrics = MetricsComputer(label_list=label_list)
    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH,
        num_labels=num_labels,
        max_position_embeddings=config.max_length + 2
    )
    print("\nTraining arguments:")
    print(training_args)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[ShowExample(tokenizer)]
    )
    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)
    print(f"Testing on {len(test_dataset)} examples.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
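
# Hypothetical usage sketch for the token-classification entry point above. The
# checkpoint name and hyperparameter values are illustrative assumptions; in the
# repo, masking_probability, replacement_probability and select_labels come from
# a custom TrainingArguments subclass, so they are set on the instance here.
def run_ner_example():
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    training_args = TrainingArguments(
        output_dir="./tokcl_models",
        per_device_train_batch_size=32,
    )
    training_args.masking_probability = 1.0       # assumed extra field
    training_args.replacement_probability = 0.0   # assumed extra field
    training_args.select_labels = False           # assumed extra field
    train(
        no_cache=False,
        dataset_path="./data/sd-nlp",
        data_config_name="NER",
        training_args=training_args,
        tokenizer=tokenizer,
    )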