def test_best_model(analysis, model_name, task_name, data_dir):
    """Evaluate the best tuned checkpoint on the held-out back half of the dev set.

    `analysis` is a Ray Tune analysis object; the metric optimized is eval_acc.
    """
    args_data = DataTrainingArguments(task_name=task_name, data_dir=data_dir)
    tok = AutoTokenizer.from_pretrained(model_name)

    # Report the winning hyperparameter configuration.
    winning_config = analysis.get_best_config(metric="eval_acc", mode="max")
    print(winning_config)

    # Restore the checkpoint belonging to the best trial.
    winning_trial = analysis.get_best_trial(metric="eval_acc", mode="max")
    ckpt = recover_checkpoint(winning_trial.checkpoint.value)
    print(ckpt)

    model = AutoModelForSequenceClassification.from_pretrained(ckpt).to("cuda")

    eval_args = TrainingArguments(output_dir="./best_model_results")
    dev = GlueDataset(args_data, tokenizer=tok, mode="dev", cache_dir=data_dir)
    # Only the back half of dev is used here (the front half was seen during tuning).
    dev = dev[len(dev) // 2:]

    evaluator = Trainer(model, eval_args,
                        compute_metrics=build_compute_metrics_fn(task_name))
    print(evaluator.evaluate(dev))
def setUpClass(self):
    """Build the fixtures shared by the test suite: tokenizer, MRPC dev dataset,
    model config, and a small batched dataloader.

    NOTE(review): unittest's setUpClass is conventionally a @classmethod taking
    `cls`; this one takes `self` — confirm the decorator at the (unseen) class
    definition.
    """
    self.MODEL_ID = "albert-base-v2"
    self.data_args = DataTrainingArguments(
        task_name="mrpc",
        data_dir="./tests/fixtures/tests_samples/MRPC",
        # Always rebuild cached features so stale caches can't mask failures.
        overwrite_cache=True,
    )
    self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
    self.dataset = GlueDataset(self.data_args, self.tokenizer, mode="dev")
    # NOTE(review): MRPC is a 2-label task; num_labels=3 may be deliberate for
    # this test — verify against the assertions that use self.config.
    self.config = AutoConfig.from_pretrained(
        self.MODEL_ID, num_labels=3, finetuning_task="mrpc")
    self.dataloader = DataLoader(self.dataset, batch_size=2,
                                 collate_fn=default_data_collator)
def run_pos(task_key: str, cfg: CN, model, model_args, training_args, tokenizer,
            mode="train", extract=False, **kwargs):
    r"""Fine-tune and/or evaluate `model` on the POS tagging task.

    cfg: YACS cfg node
    ckpt_path: Unsupported
    """
    data_args = DataTrainingArguments(task_name="POS",
                                      data_dir=cfg.DATA.DATAPATH)

    # Splits come from the shared feature cache keyed by task.
    features = load_features_dict(tokenizer, cfg)
    splits = features["pos"]

    trainer = FixedTrainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=splits["validation"],
        compute_metrics=get_eval_metrics_func(task_key),
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        config=cfg,
    )

    if mode == "train":
        trainer.train()

    # Evaluate when not training, or right after training when configured to.
    if mode != "train" or cfg.EVAL_ON_COMPLETION:
        extract_path = get_extract_path(cfg, model_args) if extract else None
        cache = osp.join(cfg.TASK.EXTRACT_TOKENS_MASK_CACHE, task_key)
        metrics = trainer.evaluate(extract_path=extract_path, cache_path=cache)
        torch.save(metrics, get_metrics_path(cfg, model_args))
def run_glue(task_key, cfg, model, model_args, training_args, tokenizer,
             mode="train", extract=False, **kwargs):
    r"""Fine-tune and/or evaluate `model` on a GLUE task.

    cfg: YACS cfg node
    ckpt_path: Unsupported
    """
    task_name = TASK_KEY_TO_NAME[task_key]
    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=cfg.DATA.DATAPATH)

    features = load_features_dict(tokenizer, cfg)
    train_split = features[task_key]["train"]

    eval_key = cfg.EVAL.SPLIT
    if task_key == "mnli":
        # MNLI dev comes in matched/mismatched flavors; use the matched one.
        eval_key = f"{eval_key}_matched"
    eval_split = features[task_key][eval_key]

    trainer = FixedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        compute_metrics=get_eval_metrics_func(task_key),
        data_collator=DataCollatorWithPadding(tokenizer),
        config=cfg,
    )

    if mode == "train":
        trainer.train()

    # Evaluate when not training, or right after training when configured to.
    if mode != "train" or cfg.EVAL_ON_COMPLETION:
        extract_path = get_extract_path(cfg, model_args) if extract else None
        cache = osp.join(cfg.TASK.EXTRACT_TOKENS_MASK_CACHE, task_key)
        metrics = trainer.evaluate(extract_path=extract_path, cache_path=cache)
        torch.save(metrics, get_metrics_path(cfg, model_args))
def train_transformer(config, checkpoint_dir=None):
    """One Ray Tune trial: fine-tune a GLUE model with the hyperparameters in `config`."""
    task = config["task_name"]
    data_root = config["data_dir"]
    model_name = config["model_name"]

    args_data = DataTrainingArguments(task_name=task, data_dir=data_root)
    tok = AutoTokenizer.from_pretrained(model_name)
    train_ds = GlueDataset(args_data, tokenizer=tok, mode="train",
                           cache_dir=data_root)
    dev_ds = GlueDataset(args_data, tokenizer=tok, mode="dev",
                         cache_dir=data_root)
    # The front half of dev is used during tuning; the back half is held out.
    dev_ds = dev_ds[:len(dev_ds) // 2]

    # Evaluate once per (approximate) epoch.
    steps_per_epoch = (len(train_ds) // config["per_gpu_train_batch_size"]) + 1
    args_train = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=steps_per_epoch,
        # We explicitly set save to 0, and do saving in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

    # Arguments for W&B.
    wandb_args = {
        "project_name": "transformers_pbt",
        "watch": "false",  # Either set to gradient, false, or all
        "run_name": tune.get_trial_name(),
    }

    tune_trainer = get_trainer(
        recover_checkpoint(checkpoint_dir, model_name),
        train_ds,
        dev_ds,
        task,
        args_train,
        wandb_args=wandb_args,
    )
    tune_trainer.train(recover_checkpoint(checkpoint_dir, model_name))
def test_meta_dataset(self):
    """Each MetaDataset item is a pair of 128-token examples with labels 0 and 1."""
    args_data = DataTrainingArguments(
        task_name="mrpc",
        data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True,
    )
    meta = MetaDataset(GlueDataset(args_data, tokenizer=self.tokenizer))

    expected_shape = torch.Size([128])
    self.assertEqual(len(meta[1000]), 2)
    self.assertEqual(meta[1000][0]["input_ids"].shape, expected_shape)
    self.assertEqual(meta[1000][0]["attention_mask"].shape, expected_shape)
    self.assertEqual(meta[1000][0]["labels"].item(), 0)
    self.assertEqual(meta[1000][1]["labels"].item(), 1)
def main(config):
    """Hydra entry point: configure logging/W&B, build argument objects, run GLUE."""
    # To disable Huggingface logging
    os.environ["WANDB_WATCH"] = "False"

    # Hydra chdirs into an auto-generated run dir; remember it, then hop back
    # to the original working directory so relative data paths resolve.
    run_dir = os.getcwd()
    log.info(f"Work dir: {run_dir}")
    os.chdir(hydra.utils.get_original_cwd())

    wandb_run = init_wandb(run_dir, config)

    args_train = update_config(TrainingArguments(output_dir=run_dir),
                               config.training)
    args_data = update_config(
        DataTrainingArguments(task_name=config.data.task_name,
                              data_dir=config.data.data_dir),
        config.data,
    )

    train_eval_glue_model(config, args_train, args_data, run_dir)
def train(X_train, y_train, y_column_name, model_name=None):
    """Fine-tune DistilBERT on the Newswire data framed as an MNLI-style task.

    `model_name` doubles as the Trainer output directory.
    """
    eval_dataset = y_train[y_column_name]

    model_args = ModelArguments(model_name_or_path="distilbert-base-cased")

    # NOTE(review): data_args is promoted to a module global — presumably read
    # by helpers elsewhere in this file; confirm before refactoring.
    global data_args
    data_args = DataTrainingArguments(task_name="mnli",
                                      data_dir="../../datasets/Newswire")
    num_labels = glue_tasks_num_labels[data_args.task_name]

    training_args = TrainingArguments(
        output_dir=model_name,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=32,
        per_gpu_eval_batch_size=128,
        num_train_epochs=1,
        logging_steps=500,
        logging_first_step=True,
        save_steps=1000,
        evaluate_during_training=True,
    )

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
    )

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer,
                                limit_length=100_000)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
def test_cluster_indices(self):
    """Cluster selection yields >10k indices that correctly subset a GlueDataset."""
    clustering_args = Clustering_Arguments(
        batch_size=32,
        num_clusters_elements=32,
        embedding_path=self.embedding_path,
        num_clusters=8,
        cluster_output_path=self.cluster_output_path,
    )
    indices = self.clustering_proc.get_cluster_indices_by_num(
        clustering_args.num_clusters_elements)
    self.assertTrue(len(indices) > 10000)

    # Testing with Pytorch Dataset
    args_data = DataTrainingArguments(
        task_name="MRPC", data_dir=self.data_dir, overwrite_cache=True)
    tok = AutoTokenizer.from_pretrained("albert-base-v2")
    subset = torch.utils.data.Subset(GlueDataset(args_data, tok), indices)
    self.assertEqual(len(subset[0].input_ids), 128)
def twitter_bert(
        # NOTE(review): all defaults below (device, n_gpu, temp_json timestamp,
        # the f-string paths) are evaluated ONCE at function-definition time,
        # not per call — confirm this is intended.
        ROOTPATH=ROOTPATH,
        model_name_or_path="bert-base-uncased",
        task_name="TWIT",
        do_train=True,
        do_eval=True,
        data_dir=f'{ROOTPATH}/input',
        max_seq_length=128,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        num_train_epochs=3.0,
        cache_dir=None,
        output_dir=f'{ROOTPATH}/output',
        overwrite_cache=True,
        overwrite_output_dir=True,
        local_rank=-1,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        n_gpu=torch.cuda.device_count(),
        fp16=False,
        num_labels=2,
        evaluate_during_training=False,
        weight_decay=0,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        train_dataset=None,
        dev_dataset=None,
        test_dataset=None,
        full_dataset=None,
        labels=None,
        temp_json=f'{ROOTPATH}/temp/run{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.json',
        use_test=False,
        # save_steps=1e200 effectively disables step-based checkpointing.
        save_steps=1e200,
        random_state=1234):
    """End-to-end fine-tuning of a BERT sequence classifier on tweet data.

    Expects pandas DataFrames with "tweet" and "label" columns (either a
    `full_dataset` to be split 80/20, or explicit train/dev/test frames).
    Trains, optionally evaluates on dev (and test when `use_test`), appends
    each eval result as a JSON line to `temp_json`, and returns the last
    eval-result dict.
    """
    set_seed(random_state)
    # An explicit full_dataset overrides any train/dev frames passed in.
    if full_dataset is not None:
        train_dataset, dev_dataset = train_test_split(
            full_dataset, test_size=0.2, random_state=random_state)

    # Setup logging
    logger = logging.getLogger(__name__)
    # train_dataset is assumed to be a pandas DataFrame (has .index).
    logger.info(f"LENGTH OF TRAIN DATASET: {len(train_dataset.index)}")
    # exit(0)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        local_rank,
        device,
        n_gpu,
        bool(local_rank != -1),
        fp16,
    )
    logger.info(
        "Training/evaluation parameters local_rank: %s, device: %s, n_gpu: %s, fp16: %s",
        local_rank, device, n_gpu, fp16)
    logger.info(f"MAX SEQ LEN: {max_seq_length}")
    # Load the wordsegment corpus once; used for hashtag splitting below.
    wordsegment.load()

    ## DEFINE FUNCTIONS
    @dataclass
    class ModelArguments:
        """
        Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
        """
        model_name_or_path: str = field(
            metadata={
                "help":
                "Path to pretrained model or model identifier from huggingface.co/models"
            })
        config_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained config name or path if not the same as model_name"
            })
        tokenizer_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained tokenizer name or path if not the same as model_name"
            })
        cache_dir: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Where do you want to store the pretrained models downloaded from s3"
            })

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        overwrite_output_dir=overwrite_output_dir,
        evaluate_during_training=evaluate_during_training,
        weight_decay=weight_decay,
        adam_epsilon=adam_epsilon,
        max_grad_norm=max_grad_norm,
        save_steps=save_steps)
    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=data_dir,
                                      max_seq_length=max_seq_length,
                                      overwrite_cache=overwrite_cache)
    model_args = ModelArguments(model_name_or_path=model_name_or_path, )

    # Metric helpers; preds/labels are numpy arrays.
    def simple_accuracy(preds, labels):
        return (preds == labels).mean()

    def acc_and_f1(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1 = f1_score(y_true=labels, y_pred=preds)
        return {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
        }

    # Defined but not used by compute_metrics below (classification task).
    def pearson_and_spearman(preds, labels):
        pearson_corr = pearsonr(preds, labels)[0]
        spearman_corr = spearmanr(preds, labels)[0]
        return {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }

    def compute_metrics(preds, labels):
        assert len(preds) == len(labels)
        return acc_and_f1(preds, labels)

    class TwitterProcessor(DataProcessor):
        # Wraps the pandas frames captured from the enclosing function scope.
        def __init__(self):
            super(TwitterProcessor, self).__init__()
            '''
            You need to define three variables here:
            - self.train_dataset -> train dataset
            - self.dev_dataset -> dev dataset
            - self.test_dataset -> test dataset
            - self.labels -> a list of the labels

            Each {train,dev,test}_dataset must have (at least) two columns:
            - "tweet" -> includes the text of the tweet
            - "label" -> includes the label of the tweet
            '''
            self.train_dataset = train_dataset
            self.dev_dataset = dev_dataset
            self.test_dataset = test_dataset
            self.labels = labels

        def get_train_examples(self):
            return self._create_examples(self.train_dataset, "train")

        def get_dev_examples(self):
            # NOTE(review): set_type "train" looks like a copy-paste slip —
            # it only affects the example guid prefix, but "dev" was likely meant.
            return self._create_examples(self.dev_dataset, "train")

        def get_test_examples(self):
            # NOTE(review): same as above — "test" was likely meant.
            return self._create_examples(self.test_dataset, "train")

        def get_labels(self):
            """See base class."""
            return self.labels

        def _preprocess_text(self, text):
            # 1: turn emoji into ":name:" tokens
            text = emoji.demojize(text)
            # 2: split hashtags into their component words
            words = text.split()
            for word in words:
                if word[0] != '#':
                    continue
                hashtag = word[1:]
                replacement_words = wordsegment.segment(hashtag)
                text = text.replace(word, " ".join(replacement_words))
            # 3: normalize the "URL" placeholder
            text = text.replace("URL", "http")
            # 4: collapse 4+ repeated @mentions down to three
            text = re.sub(r'(@[A-Za-z]+)( \1\b){3,}', r'\1 \1 \1', text)
            return text

        def _create_examples(self, data, set_type):
            # `data` is a DataFrame with .tweet and .label columns.
            examples = []
            raw_texts = data.tweet.values.tolist()
            raw_labels = data.label.values.tolist()
            for i in range(0, len(raw_texts)):
                guid = "%s-%s" % (set_type, i)
                raw_text = raw_texts[i]
                raw_label = raw_labels[i]
                label = raw_label
                text = self._preprocess_text(raw_text)
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text,
                                 text_b=None,
                                 label=label))
            return examples

    def convert_examples_to_features(
        examples: List[InputExample],
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        task=None,
        label_list=None,
        output_mode=None,
    ):
        # Tokenize a list of InputExamples into padded InputFeatures.
        if max_length is None:
            max_length = tokenizer.max_len
        processor = TwitterProcessor()
        # NOTE(review): the label_list parameter is unconditionally overwritten
        # here; callers passing label_list have no effect.
        label_list = processor.get_labels()
        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(
                example: InputExample) -> Union[int, float, None]:
            return label_map[example.label]

        labels = [label_from_example(example) for example in examples]
        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            pad_to_max_length=True,
        )
        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)
        # Log the first few examples for a sanity check.
        for i, example in enumerate(examples[:5]):
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("features: %s" % features[i])
        return features

    class TwitterDataset(Dataset):
        """ This will be superseded by a framework-agnostic approach soon. """

        # NOTE(review): the cache_dir/args defaults are bound from the
        # enclosing function at class-definition time.
        def __init__(
            self,
            tokenizer,
            mode="train",
            cache_dir=cache_dir,
            args=data_args,
        ):
            self.args = args
            self.processor = TwitterProcessor()
            self.output_mode = 'Classification'
            label_list = self.processor.get_labels()
            self.label_list = label_list
            if mode == "dev":
                examples = self.processor.get_dev_examples()
            elif mode == "test":
                examples = self.processor.get_test_examples()
            else:
                examples = self.processor.get_train_examples()
            self.features = convert_examples_to_features(
                examples,
                tokenizer,
                max_length=max_seq_length,
                label_list=label_list,
                output_mode=self.output_mode,
            )

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]

        def get_labels(self):
            return self.label_list

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        # Trainer-compatible metrics hook: argmax logits, then acc/F1.
        def compute_metrics_fn(p: EvalPrediction):
            preds = np.argmax(p.predictions, axis=1)
            return compute_metrics(preds, p.label_ids)

        return compute_metrics_fn

    # Create model
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        cache_dir=cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        config=config,
        cache_dir=cache_dir,
    )

    # Get datasets
    train_dataset = (TwitterDataset(tokenizer=tokenizer,
                                    mode="train",
                                    cache_dir=cache_dir))
    eval_dataset = (TwitterDataset(tokenizer=tokenizer,
                                   mode="dev",
                                   cache_dir=cache_dir))
    if use_test:
        test_dataset = (TwitterDataset(tokenizer=tokenizer,
                                       mode="test",
                                       cache_dir=cache_dir))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name),
    )

    # Train the model; resume from model_name_or_path when it is a local dir.
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model(f"{training_args.output_dir}/{task_name}")
        tokenizer.save_pretrained(f"{training_args.output_dir}/{task_name}")

    # Evaluation: dev split always, test split too when use_test is set.
    eval_results = []
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        if use_test:
            step_names = ["dev", "test"]
            eval_datasets = [eval_dataset, test_dataset]
        else:
            step_names = ["dev"]
            eval_datasets = [eval_dataset]
        ct = 0
        for eval_dataset in eval_datasets:
            step_name = step_names[ct]
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            logger.info("***** Eval results {} - {}*****".format(
                eval_dataset.args.task_name, step_name.upper()))
            for key, value in eval_result.items():
                logger.info("  %s = %s", key, value)
            eval_results.append(eval_result)
            # Append each result as one JSON line; create the file on first use.
            write_type = 'a' if os.path.exists(temp_json) else 'w'
            with open(temp_json, write_type) as f:
                f.write(json.dumps(eval_result))
                f.write("\n")
            ct += 1
    # Returns the last evaluation result (test when use_test, else dev).
    return eval_results[-1]
# NOTE(review): the three field declarations below are the tail of a
# ModelArguments-style @dataclass whose header lies outside this chunk;
# they are kept verbatim. The statements after them are module-level
# notebook cells.
config_name: Optional[str] = field(
    default=None,
    metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
    default=None,
    metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
    default=None,
    metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
)

"""### Here are all the training parameters we are going to use:"""

model_args = ModelArguments(
    model_name_or_path="bert-base-cased", )
data_args = DataTrainingArguments(task_name="mnli",
                                  data_dir="./glue_data/MNLI")
training_args = TrainingArguments(
    output_dir="./models/model_name",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=32,
    per_gpu_eval_batch_size=128,
    num_train_epochs=0.1,
    logging_steps=500,
    logging_first_step=True,
    save_steps=1000,
    evaluate_during_training=True,
)

# NOTE(review): IPython/notebook shell magic — not valid plain Python;
# remove or replace with subprocess if this file is to run as a script.
!ls glue_data