def uni_train(self,
              data_dir: str,
              train_dataset: Dataset,
              eval_dataset: Optional[Dataset] = None,
              compute_metrics_fn: Optional[Callable[[EvalPrediction], Dict]] = None,
              seed: Optional[int] = None) -> PredictionOutput:
    r"""Unified training module."""
    if seed is None:
        seed = random.randint(0, 2020)
    set_seed(seed)

    # Build the training arguments
    training_args = self.init_training_args(self.model_path)

    # Initialize our Trainer
    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
    )
    trainer.train(self.model_path)
    trainer.save_model()
    self.tokenizer.save_pretrained(self.model_path)
    self.trainer = trainer

    # Evaluation (run prediction on the eval set and log the metrics)
    logger.info("*** Evaluate ***")
    trainer.compute_metrics = compute_metrics_fn
    eval_result = trainer.predict(test_dataset=eval_dataset)
    metrics = eval_result.metrics

    output_eval_file = os.path.join(self.model_path,
                                    f"eval_results_{self.task_name}.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(self.task_name))
        for key, value in metrics.items():
            logger.info("  %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))
    return eval_result
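# uni_train() above delegates to self.init_training_args(), which is defined elsewhere
# in this class. Below is a minimal, hypothetical sketch of what such a helper might
# return; the field values are illustrative assumptions, not the author's settings.
from transformers import TrainingArguments


def init_training_args_sketch(output_dir: str) -> TrainingArguments:
    # Illustrative defaults only; the real init_training_args() may differ.
    return TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        num_train_epochs=3.0,
        per_device_train_batch_size=8,
        overwrite_output_dir=True,
    )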
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (GlueDataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir) if training_args.do_eval else None) test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir) if training_args.do_predict else None) def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(data_args.task_name), ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)) for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions if output_mode == "classification": predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) writer.write("index\tprediction\n") for index, item in enumerate(predictions): if output_mode == "regression": writer.write("%d\t%3.3f\n" % (index, item)) else: item = test_dataset.get_labels()[item] writer.write("%d\t%s\n" % (index, item)) return eval_results
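# A minimal sketch of driving the main() above with a JSON argument file, which is the
# path handled by parser.parse_json_file(). The file name, script name, task and paths
# below are hypothetical examples.
import json

example_args = {
    "model_name_or_path": "bert-base-uncased",
    "task_name": "mrpc",
    "data_dir": "./glue_data/MRPC",
    "output_dir": "./output/mrpc",
    "do_train": True,
    "do_eval": True,
}
with open("run_args.json", "w") as f:
    json.dump(example_args, f, indent=2)
# The script could then be launched as: python run_glue.py run_args.json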
def twitter_bert( ROOTPATH=ROOTPATH, model_name_or_path="bert-base-uncased", task_name="TWIT", do_train=True, do_eval=True, data_dir=f'{ROOTPATH}/input', max_seq_length=128, per_device_train_batch_size=8, per_device_eval_batch_size=8, learning_rate=2e-5, num_train_epochs=3.0, cache_dir=None, output_dir=f'{ROOTPATH}/output', overwrite_cache=True, overwrite_output_dir=True, local_rank=-1, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), n_gpu=torch.cuda.device_count(), fp16=False, num_labels=2, evaluate_during_training=False, weight_decay=0, adam_epsilon=1e-8, max_grad_norm=1.0, train_dataset=None, dev_dataset=None, test_dataset=None, full_dataset=None, labels=None, temp_json=f'{ROOTPATH}/temp/run{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.json', use_test=False, save_steps=1e200, random_state=1234): set_seed(random_state) if full_dataset is not None: train_dataset, dev_dataset = train_test_split( full_dataset, test_size=0.2, random_state=random_state) # Setup logging logger = logging.getLogger(__name__) logger.info(f"LENGTH OF TRAIN DATASET: {len(train_dataset.index)}") # exit(0) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", local_rank, device, n_gpu, bool(local_rank != -1), fp16, ) logger.info( "Training/evaluation parameters local_rank: %s, device: %s, n_gpu: %s, fp16: %s", local_rank, device, n_gpu, fp16) logger.info(f"MAX SEQ LEN: {max_seq_length}") wordsegment.load() ## DEFINE FUNCTIONS @dataclass class ModelArguments: """ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ model_name_or_path: str = field( metadata={ "help": "Path to pretrained model or model identifier from huggingface.co/models" }) config_name: Optional[str] = field( default=None, metadata={ "help": "Pretrained config name or path if not the same as model_name" }) tokenizer_name: Optional[str] = field( default=None, metadata={ "help": "Pretrained tokenizer name or path if not the same as model_name" }) cache_dir: Optional[str] = field( default=None, metadata={ "help": "Where do you want to store the pretrained models downloaded from s3" }) training_args = TrainingArguments( output_dir=output_dir, do_train=True, do_eval=True, do_predict=True, num_train_epochs=num_train_epochs, per_device_train_batch_size=per_device_train_batch_size, per_device_eval_batch_size=per_device_eval_batch_size, learning_rate=learning_rate, overwrite_output_dir=overwrite_output_dir, evaluate_during_training=evaluate_during_training, weight_decay=weight_decay, adam_epsilon=adam_epsilon, max_grad_norm=max_grad_norm, save_steps=save_steps) data_args = DataTrainingArguments(task_name=task_name, data_dir=data_dir, max_seq_length=max_seq_length, overwrite_cache=overwrite_cache) model_args = ModelArguments(model_name_or_path=model_name_or_path, ) def simple_accuracy(preds, labels): return (preds == labels).mean() def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { "acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2, } def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { "pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2, } def compute_metrics(preds, labels): assert len(preds) == len(labels) return 
acc_and_f1(preds, labels) class TwitterProcessor(DataProcessor): def __init__(self): super(TwitterProcessor, self).__init__() ''' You need to define three variables here: - self.train_dataset -> train dataset - self.dev_dataset -> dev dataset - self.test_dataset -> test dataset - self.labels -> a list of the labels Each {train,dev,test}_dataset must have (at least) two columns: - "tweet" -> includes the text of the tweet - "label" -> includes the label of the tweet ''' self.train_dataset = train_dataset self.dev_dataset = dev_dataset self.test_dataset = test_dataset self.labels = labels def get_train_examples(self): return self._create_examples(self.train_dataset, "train") def get_dev_examples(self): return self._create_examples(self.dev_dataset, "train") def get_test_examples(self): return self._create_examples(self.test_dataset, "train") def get_labels(self): """See base class.""" return self.labels def _preprocess_text(self, text): # 1 text = emoji.demojize(text) # 2 words = text.split() for word in words: if word[0] != '#': continue hashtag = word[1:] replacement_words = wordsegment.segment(hashtag) text = text.replace(word, " ".join(replacement_words)) # 3 text = text.replace("URL", "http") # 4 text = re.sub(r'(@[A-Za-z]+)( \1\b){3,}', r'\1 \1 \1', text) return text def _create_examples(self, data, set_type): examples = [] raw_texts = data.tweet.values.tolist() raw_labels = data.label.values.tolist() for i in range(0, len(raw_texts)): guid = "%s-%s" % (set_type, i) raw_text = raw_texts[i] raw_label = raw_labels[i] label = raw_label text = self._preprocess_text(raw_text) examples.append( InputExample(guid=guid, text_a=text, text_b=None, label=label)) return examples def convert_examples_to_features( examples: List[InputExample], tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, ): if max_length is None: max_length = tokenizer.max_len processor = TwitterProcessor() label_list = processor.get_labels() label_map = {label: i for i, label in enumerate(label_list)} def label_from_example( example: InputExample) -> Union[int, float, None]: return label_map[example.label] labels = [label_from_example(example) for example in examples] batch_encoding = tokenizer.batch_encode_plus( [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, ) features = [] for i in range(len(examples)): inputs = {k: batch_encoding[k][i] for k in batch_encoding} feature = InputFeatures(**inputs, label=labels[i]) features.append(feature) for i, example in enumerate(examples[:5]): logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("features: %s" % features[i]) return features class TwitterDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. 
""" def __init__( self, tokenizer, mode="train", cache_dir=cache_dir, args=data_args, ): self.args = args self.processor = TwitterProcessor() self.output_mode = 'Classification' label_list = self.processor.get_labels() self.label_list = label_list if mode == "dev": examples = self.processor.get_dev_examples() elif mode == "test": examples = self.processor.get_test_examples() else: examples = self.processor.get_train_examples() self.features = convert_examples_to_features( examples, tokenizer, max_length=max_seq_length, label_list=label_list, output_mode=self.output_mode, ) def __len__(self): return len(self.features) def __getitem__(self, i) -> InputFeatures: return self.features[i] def get_labels(self): return self.label_list def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): preds = np.argmax(p.predictions, axis=1) return compute_metrics(preds, p.label_ids) return compute_metrics_fn # Create model config = AutoConfig.from_pretrained( model_name_or_path, num_labels=num_labels, cache_dir=cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_name_or_path, cache_dir=cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_name_or_path, config=config, cache_dir=cache_dir, ) # Get datasets train_dataset = (TwitterDataset(tokenizer=tokenizer, mode="train", cache_dir=cache_dir)) eval_dataset = (TwitterDataset(tokenizer=tokenizer, mode="dev", cache_dir=cache_dir)) if use_test: test_dataset = (TwitterDataset(tokenizer=tokenizer, mode="test", cache_dir=cache_dir)) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(task_name), ) # Train the model if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model(f"{training_args.output_dir}/{task_name}") tokenizer.save_pretrained(f"{training_args.output_dir}/{task_name}") # Evaluation eval_results = [] if training_args.do_eval: logger.info("*** Evaluate ***") if use_test: step_names = ["dev", "test"] eval_datasets = [eval_dataset, test_dataset] else: step_names = ["dev"] eval_datasets = [eval_dataset] ct = 0 for eval_dataset in eval_datasets: step_name = step_names[ct] trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) logger.info("***** Eval results {} - {}*****".format( eval_dataset.args.task_name, step_name.upper())) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) # output_eval_file = os.path.join( # training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}_{step_name}.txt" # ) # if ct == 0: # with open(output_eval_file, "w") as writer: # logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper())) # for key, value in eval_result.items(): # logger.info(" %s = %s", key, value) # else: # with open(output_eval_file, "a") as writer: # logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper())) # for key, value in eval_result.items(): # logger.info(" %s = %s", key, value) eval_results.append(eval_result) write_type = 'a' if os.path.exists(temp_json) else 'w' with open(temp_json, write_type) as f: f.write(json.dumps(eval_result)) f.write("\n") ct += 1 return eval_results[-1]
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    output_mode = "regression"

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # The tokenizer handles tokenization and other data preprocessing.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    train_dataset = RecDataset(data_args,
                               tokenizer=tokenizer,
                               cache_dir=model_args.cache_dir)
    # num_labels = len(train_dataset.get_labels())

    # The config holds the model's basic hyperparameter settings.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    # Load the model
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )  # .cuda()
    model = DualRobertaForDotProduct.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    eval_dataset = (RecDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (RecDataset(data_args,
                               tokenizer=tokenizer,
                               mode="test",
                               cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def mse(preds, labels):
        return ((preds - labels) * (preds - labels)).mean()

    def compute_metrics_fn(p: EvalPrediction):
        preds = p.predictions
        return {"Rec": mse(preds, p.label_ids)}

    # Initialize our Trainer
    # The training loop lives in Trainer and is well worth reading:
    # https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py#L134
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] for eval_dataset in eval_datasets: trainer.compute_metrics = compute_metrics_fn eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions output_test_file = os.path.join( training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) writer.write("index\tprediction\n") for index, item in enumerate(predictions): writer.write("%d\t%3.3f\n" % (index, item)) return eval_results
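# A quick, self-contained illustration of the metric wiring used above: the Trainer
# passes compute_metrics an EvalPrediction of numpy arrays and expects a dict back.
# The arrays below are dummy values for illustration only.
import numpy as np
from transformers import EvalPrediction

_dummy = EvalPrediction(predictions=np.array([0.5, 1.5, 2.0]),
                        label_ids=np.array([1.0, 1.0, 2.0]))
_mse = ((_dummy.predictions - _dummy.label_ids) ** 2).mean()
print({"Rec": _mse})  # mirrors compute_metrics_fn(): {"Rec": mse(preds, p.label_ids)}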
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # model_type = model_args.model_type # log_dir = './results' # if model_type == 'base': # model_args.model_name_or_path = 'bert-base-uncased' # elif model_type == 'base-pubmed': # model_args.model_name_or_path = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12' # elif model_type == 'base-pubmed-mimic': # model_args.model_name_or_path = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12' # else: # raise NotImplementedError # Setup logging logging.basicConfig( format= '[%(asctime)s - %(levelname)s - %(filename)s: %(lineno)d (%(funcName)s)] %(message)s', datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) # Set seed set_seed(training_args.seed) try: num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load tokenizer tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) dataset_name = data_args.data_dir.split('/')[-1] if dataset_name in ['GAD', 'EUADR']: final_split_results = [] original_data_dir = copy.deepcopy(x=data_args.data_dir) data_splits = list(map(str, range(1, 11))) for split in data_splits: data_args.data_dir = os.path.join(original_data_dir, split) # Get datasets train_dataset = (GlueDataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir) if training_args.do_eval else None) test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir) if training_args.do_predict else None) # Load pretrained model # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. # Currently, this code do not support distributed training. 
training_args.warmup_steps = int( model_args.warmup_proportion * (len(train_dataset) / training_args.per_device_train_batch_size) * training_args.num_train_epochs) training_args_weight_decay = 0.01 logger.info("Training/evaluation parameters %s", training_args) config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) try: model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=False, config=config, cache_dir=model_args.cache_dir, ) except: model = AutoModelForSequenceClassification.from_pretrained( os.path.join(model_args.model_name_or_path, "model.ckpt.index"), from_tf=True, config=config, cache_dir=model_args.cache_dir, ) def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(data_args.task_name), ) # Training if training_args.do_train: training_start_time = time.time() trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) training_end_time = time.time() training_total_time = training_end_time - training_start_time trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace( data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)) for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace( data_args, task_name="mnli-mm") test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions labels = np.array([ test_dataset.__getitem__(idx).label for idx in range(len(test_dataset)) ]) assert len(predictions) == len( labels ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}" if 
output_mode == "classification": predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results.txt" #f"test_results_{test_dataset.args.task_name}.txt" ) test_results = glue_compute_metrics(task_name='ddi', preds=predictions, labels=labels) if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) logger.info( f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}" ) writer.write("index\tprediction\n") for index, item in enumerate(predictions): if output_mode == "regression": writer.write("%d\t%3.3f\n" % (index, item)) else: item = test_dataset.get_labels()[item] writer.write("%d\t%s\n" % (index, item)) training_time_formatted = time.strftime( '%H:%M:%S', time.gmtime(training_total_time)) logger.info( f"Total training time: {training_time_formatted}") final_results = copy.deepcopy(x=test_results) final_results['training_time'] = training_time_formatted logger.info( f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}" ) final_split_results.append(final_results) else: # Get datasets train_dataset = (GlueDataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir) if training_args.do_eval else None) test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir) if training_args.do_predict else None) # Load pretrained model # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. # Currently, this code do not support distributed training. training_args.warmup_steps = int( model_args.warmup_proportion * (len(train_dataset) / training_args.per_device_train_batch_size) * training_args.num_train_epochs) training_args_weight_decay = 0.01 logger.info("Training/evaluation parameters %s", training_args) config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) try: model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=False, config=config, cache_dir=model_args.cache_dir, ) except: model = AutoModelForSequenceClassification.from_pretrained( os.path.join(model_args.model_name_or_path, "model.ckpt.index"), from_tf=True, config=config, cache_dir=model_args.cache_dir, ) def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(data_args.task_name), ) # Training if training_args.do_train: training_start_time = time.time() trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) training_end_time = time.time() training_total_time = training_end_time - training_start_time trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)) for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions labels = np.array([ test_dataset.__getitem__(idx).label for idx in range(len(test_dataset)) ]) assert len(predictions) == len( labels ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}" if output_mode == "classification": predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results.txt" #f"test_results_{test_dataset.args.task_name}.txt" ) test_results = glue_compute_metrics(task_name='ddi', preds=predictions, labels=labels) if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) logger.info( f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}" ) writer.write("index\tprediction\n") for index, item in enumerate(predictions): if output_mode == "regression": writer.write("%d\t%3.3f\n" % (index, item)) else: item = test_dataset.get_labels()[item] writer.write("%d\t%s\n" % (index, item)) training_time_formatted = time.strftime( '%H:%M:%S', time.gmtime(training_total_time)) logger.info( f"Total training time: {training_time_formatted}") final_results = copy.deepcopy(x=test_results) final_results['training_time'] = training_time_formatted logger.info( f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}" ) if dataset_name in ['GAD', 'EUADR']: average_f1_scores = np.mean([x['f1'] for x in final_split_results]) average_acc = np.mean([x['acc'] for x in final_split_results]) logger.info( f"Average F1 Scores: {average_f1_scores} | Average Accuracy: {average_acc}" ) return final_split_results else: return final_results
trainer.train( model_path=model_name_or_path if os.path.isdir(model_name_or_path) else None ) # Evaluation eval_results = {} logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( output_dir, f"eval_results_{eval_dataset.args.task_name}.txt" ) if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    parser = HfArgumentParser(dataclass_types=(ModelArguments,
                                               DataTrainingArguments,
                                               TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f'Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome.'
        )

    logger.warning(
        'Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s',
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info('Training/evaluation parameters %s', training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load the task-specific number of labels (== 1 for regression) and output mode
    try:
        num_labels = seq_clf_tasks_num_labels[data_args.task_name]
        logger.info('number of labels: %s', num_labels)
        output_mode = seq_clf_output_modes[data_args.task_name]
        logger.info('task output mode: %s', output_mode)
    except KeyError:
        raise ValueError('Task not found: %s' % (data_args.task_name))

    # Load pretrained model and tokenizer
    if model_args.config_name:
        logger.info('config_name provided as: %s', model_args.config_name)
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        logger.info('model_name_or_path provided as: %s',
                    model_args.model_name_or_path)
        if model_args.continue_from_checkpoint:
            logger.info(
                'checking for the newest checkpoint directory %s/checkpoint-<Trainer.global_step>',
                model_args.model_name_or_path)
            sorted_checkpoints = _sorted_checkpoints(
                model_args.model_name_or_path)
            logger.info('checkpoints found: %s', sorted_checkpoints)
            if len(sorted_checkpoints) == 0:
                raise ValueError(
                    'Used --continue_from_checkpoint but no checkpoint was found in --model_name_or_path.'
) else: model_args.model_name_or_path = sorted_checkpoints[-1] config = AutoConfig.from_pretrained( model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, # use_fast=True, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool('.ckpt' in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (SeqClfDataset( args=data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (SeqClfDataset(args=data_args, tokenizer=tokenizer, mode='dev', cache_dir=model_args.cache_dir) if training_args.do_eval else None) test_dataset = (SeqClfDataset(args=data_args, tokenizer=tokenizer, mode='test', cache_dir=model_args.cache_dir) if training_args.do_predict else None) # Metrics computation for a task def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction) -> Dict: """computes metrics Args: p (EvalPrediction): NamedTuple with predictions and label ids Returns: Dict: a dict with metrics """ if output_mode == 'classification': preds = np.argmax(p.predictions, axis=1) elif output_mode == 'regression': preds = np.squeeze( p.predictions ) # see x = np.array([[[0], [1], [2]]]) x.shape np.squeeze(x).shape # logger.info('DEBUGGING testing: ') # logger.info('preds: ', '\n', preds) # logger.info('p.label_ids: ', '\n', p.label_ids) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn # Initialize Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(data_args.task_name), ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info('*** Evaluate ***') # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == 'mnli': mnli_mm_data_args = dataclasses.replace(data_args, task_name='mnli-mm') eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, evaluate=True)) for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f'eval_results_{eval_dataset.args.task_name}.txt') if trainer.is_world_master(): with open(output_eval_file, 'w') as writer: logger.info('***** Eval results {} *****'.format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(' %s = %s', key, value) writer.write('%s = %s\n' % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info('*** Test ***') test_datasets = [test_dataset] if data_args.task_name == 'mnli': mnli_mm_data_args = dataclasses.replace(data_args, task_name='mnli-mm') test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode='test', cache_dir=model_args.cache_dir)) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions if output_mode == 'classification': predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f'test_results_{test_dataset.args.task_name}.txt') if trainer.is_world_master(): with open(output_test_file, 'w') as writer: logger.info('***** Test results {} *****'.format( test_dataset.args.task_name)) writer.write('index\tprediction\n') for index, item in enumerate(predictions): if output_mode == 'regression': writer.write('%d\t%3.3f\n' % (index, item)) else: item = test_dataset.get_labels()[item] writer.write('%d\t%s\n' % (index, item)) return eval_results
def main(args_dict=None): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if args_dict is not None: model_args, data_args, training_args = parser.parse_dict(args_dict) elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Set project name os.environ["WANDB_PROJECT"] = "multilingual_zeroshot" num_labels = 3 labels = ['entailment', 'neutral', 'contradiction'] # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = MBartConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, dropout=model_args.dropout, attention_dropout=model_args.attention_dropout, finetuning_task="mnli", cache_dir=model_args.cache_dir, ) tokenizer = MBartTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = MBartForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets columns = ['input_ids', 'attention_mask', 'labels'] map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length, tokenizer) train_dataset = nlp.load_dataset("multi_nli", split="train") train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512) train_dataset.set_format(type='torch', columns=columns) eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched") if training_args.do_eval else None) eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512) eval_dataset.set_format(type='torch', columns=columns) def compute_metrics_fn(p: EvalPrediction): preds = np.argmax(p.predictions, axis=1) return glue_compute_metrics("classification", preds, p.label_ids) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics_fn, data_collator=DataCollator(tokenizer), ) # disable wandb console logs logging.getLogger('wandb.run_manager').setLevel(logging.WARNING) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) mis_matched_dataset = nlp.load_dataset("multi_nli", split="validation_mismatched") mis_matched_dataset = mis_matched_dataset.map(map_fn, batched=True, batch_size=512) mis_matched_dataset.set_format(type='torch', columns=columns) eval_datasets = [eval_dataset, mis_matched_dataset] for eval_dataset in eval_datasets: trainer.compute_metrics = compute_metrics_fn eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join(training_args.output_dir, f"eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result)
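# main() above also accepts an in-process args_dict (handled via parser.parse_dict),
# which is convenient from a notebook. A minimal, hypothetical example follows; the
# exact field names must match the ModelArguments / DataTrainingArguments /
# TrainingArguments dataclasses used by this script.
example_args = {
    "model_name_or_path": "facebook/mbart-large-cc25",
    "output_dir": "./mbart_mnli",
    "do_train": True,
    "do_eval": True,
    "per_device_train_batch_size": 8,
    "num_train_epochs": 1.0,
}
# main(args_dict=example_args)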
def train(model_args, data_args, training_args): if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logs logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if 'roberta' in model_args.model_type: tokenizer = RobertaTokenizer.from_pretrained( model_args.model_name_or_path) config = RobertaConfig.from_pretrained(model_args.model_name_or_path) config.num_labels = data_args.num_labels model = RobertaForSequenceClassification.from_pretrained( model_args.model_name_or_path, config=config) elif 'electra' in model_args.model_type: tokenizer = ElectraTokenizer.from_pretrained( model_args.model_name_or_path) config = ElectraConfig.from_pretrained(model_args.model_name_or_path) config.num_labels = data_args.num_labels model = ElectraForSequenceClassification.from_pretrained( model_args.model_name_or_path, config=config) else: # default -> bert tokenizer = BertTokenizer.from_pretrained( model_args.model_name_or_path) config = BertConfig.from_pretrained(model_args.model_name_or_path) config.num_labels = data_args.num_labels model = BertForSequenceClassification.from_pretrained( model_args.model_name_or_path, config=config) tokenizer.add_special_tokens() if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_df = pd.read_csv(data_args.train_data_file, sep='\t') if data_args.add_train_data_file1 is not None: tmp = pd.read_csv(data_args.add_train_data_file1, sep='\t') train_df = pd.concat([train_df, tmp]) if data_args.add_train_data_file2 is not None: tmp = pd.read_csv(data_args.add_train_data_file2, sep='\t') train_df = pd.concat([train_df, tmp]) train_df = train_df.fillna('no_q') train_dataset = get_dataset( data_args, tokenizer, train_df, model_args.model_type) if training_args.do_train else None dev_df = pd.read_csv(data_args.eval_data_file, sep='\t') dev_df = dev_df.fillna('no_q') eval_dataset = get_dataset( data_args, tokenizer, dev_df, model_args.model_type) if training_args.do_eval else None data_collator = MyDataCollator() # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(), ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if 
model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") trainer.compute_metrics = build_compute_metrics_fn() result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
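# train() above relies on build_compute_metrics_fn(), which is defined elsewhere in
# this script. A plausible minimal sketch, assuming a classification head and sklearn
# metrics; the real implementation may compute a different metric set.
from typing import Callable, Dict

import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import EvalPrediction


def build_compute_metrics_fn_sketch() -> Callable[[EvalPrediction], Dict]:
    def compute_metrics_fn(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {
            "acc": accuracy_score(p.label_ids, preds),
            "f1": f1_score(p.label_ids, preds, average="macro"),
        }

    return compute_metrics_fn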
def main():
    # See all possible arguments in src/transformers/training_args.py,
    # or by passing the --help flag to this script.
    # We keep distinct sets of args for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it is the path to a json file,
        # parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        # Otherwise, parse the arguments from the command line.
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Check whether we would be overwriting a previous run.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Set up the logging format.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    # Log the current setup.
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Log the parameters.
    logger.info("Training/evaluation parameters %s", training_args)

    # Random seed.
    set_seed(training_args.seed)

    # num_labels is the number of classes; output_mode is the task type, e.g. 'classification'.
    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer.
    # Distributed training: the .from_pretrained methods guarantee that only one local
    # process can concurrently download model & vocab.

    # Pass the task-specific options finetuning_task and num_labels to the config.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # A .ckpt path allows loading a TensorFlow checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
        if training_args.do_train else None
    )
    eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
        if training_args.do_eval else None
    )
    test_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
        if training_args.do_predict else None
    )

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize a Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        logger.info("*** Train ***")
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # For MNLI, loop to handle the double evaluation (matched, mis-matched).
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
            )

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Predict ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
            )

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, TrainingArguments))
    model_args, training_args = parser.parse_args_into_dataclasses()
    print(model_args)
    print(training_args)

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # The label set is fixed for this task, so num_labels comes from LABEL_NAMES
    # rather than from a GLUE task lookup.
    num_labels = len(LABEL_NAMES)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # A RoBERTa-specific variant is kept commented out for reference:
    # config = RobertaConfig.from_pretrained(
    #     model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    #     num_labels=num_labels,
    #     # hidden_dropout_prob=0.00,
    #     cache_dir=model_args.cache_dir,
    # )
    # tokenizer = RobertaTokenizerFast.from_pretrained(
    #     model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    #     cache_dir=model_args.cache_dir,
    #     do_lower_case=False,
    # )
    # model = HeadlessRobertaForSequenceClassification.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )

    def df_to_dataset(df):
        # Turn a pandas DataFrame with a `message` text column and a `bug` label column
        # into a dataset of pre-tokenized tensors.
        print("Loading dataset...")
        df = df[df.bug != -1]
        df = df[~df.message.isnull()]
        text_values = df.message.values
        label_ids = df.bug.values

        text_values_list = text_values.tolist()
        for elm in text_values_list:
            if not isinstance(elm, str):
                print(elm)

        encoding = tokenizer(
            text_values_list,
            add_special_tokens=True,
            return_attention_mask=True,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"]
        label_ids_dtype = torch.float32 if num_labels == 1 else torch.int64
        label_ids_t = torch.tensor(label_ids, dtype=label_ids_dtype)

        print(tokenizer.decode(input_ids[0, :].tolist()))
        print("DF shape: ", df.shape)
        print(input_ids.shape)
        print(label_ids_t.shape)

        dataset = SimpleDataset(input_ids, encoding["attention_mask"], label_ids_t)
        print("Done")
        return dataset

    if model_args.eval_test:
        print("**** TEST EVAL *****")
        test_df = pd.read_csv(model_args.data_file)
        eval_dataset = df_to_dataset(test_df)
        train_dataset = None
    else:
        print("**** TRAINING ******")
        train_valid_df = pd.read_csv(model_args.data_file)
        train_df, valid_df = train_test_split(train_valid_df, test_size=0.1, shuffle=False)
        train_dataset = df_to_dataset(train_df)
        eval_dataset = df_to_dataset(valid_df)

    output_mode = model_args.output_mode

    def compute_metrics_fn(p: EvalPrediction):
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        else:
            # Regression is not supported by this script.
            raise ValueError("Unsupported output_mode: %s" % output_mode)
        print(preds)
        print(p.label_ids)
        print(classification_report(p.label_ids, preds, target_names=LABEL_NAMES, digits=3))
        acc = accuracy_score(p.label_ids, preds)
        f1 = f1_score(p.label_ids, preds, average="macro")
        return {
            "acc": acc,
            "f1": f1,
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Unlike the GLUE script, there is only a single evaluation dataset here.
        eval_datasets = [eval_dataset]
        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")

        eval_datasets = [eval_dataset]
        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            predictions = trainer.predict(test_dataset=eval_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(training_args.output_dir, "assigned_labels.csv")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results *****")
                    writer.write("index,prediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d,%3.3f\n" % (index, item))
                        else:
                            # item = LABEL_NAMES[item]
                            writer.write("%d,%s\n" % (index, item))

    return eval_results
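This script depends on a SimpleDataset class that is not reproduced in this excerpt. A minimal sketch of what such a wrapper could look like, assuming it only has to hand input_ids, attention_mask, and labels to the Trainer as a dict per example, is shown below; the real class may differ.

import torch
from torch.utils.data import Dataset


class SimpleDataset(Dataset):
    # Hypothetical reconstruction: wraps pre-tokenized tensors so the HF Trainer
    # (with its default data collator) can batch them.
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Keys must match the keyword arguments of the model's forward() method.
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }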