def get_model():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )
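# Note: get_model() above relies on module-level `model_name` and `config`
# being defined elsewhere in the original script. A minimal sketch of the
# assumed setup (the checkpoint name and label count here are illustrative,
# not taken from the source):
#
#     from transformers import AutoConfig, AutoModelForSequenceClassification
#
#     model_name = "bert-base-uncased"  # assumed checkpoint
#     config = AutoConfig.from_pretrained(model_name, num_labels=2)
#     model = get_model()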
    save_path=args.output_dir,
    sequence_max_len=args.seq_len,
    batch_size=args.batch_size,
    epochs=args.epochs,
    device=torch.device(args.device),
    tokenizer=tokenizer,
)

valid_data_loader = SmartParaphraseDataloader.build_batches(
    valid_dataset, 16, mode="sequence", config=configuration)

autoconfig = AutoConfig.from_pretrained(
    args.pretrained_model_path,
    output_attentions=True,
)
autoconfig.num_labels = len(LABELS_TO_ID)

model = AutoModelForSequenceClassification.from_pretrained(
    args.pretrained_model_path, config=autoconfig)

# Alternative model construction, disabled in the original snippet:
"""
model = TransformerWrapper.load_pretrained(
    args.pretrained_model_path,
    params=configuration,
    pooler=BertPoolingStrategy(configuration),
    loss=SoftmaxLoss(configuration))

model_config = config.ModelParameters(
    model_name=args.config_name,
    hidden_size=args.embed_dim,
    num_classes=3,
    freeze_weights=False,
    context_layers=(-1,)
)
"""
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset("glue", args.task_name) else: # Loading the dataset from local csv or json file. data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = (args.train_file if args.train_file is not None else args.valid_file).split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if args.task_name is not None: is_regression = args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = raw_datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) # Preprocessing the datasets if args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [ name for name in raw_datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and args.task_name is not None and not is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): logger.info( f"The configuration of the model provided the following label correspondence: {label_name_to_id}. " "Using it!") label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} if label_to_id is not None: model.config.label2id = label_to_id model.config.id2label = { id: label for label, id in config.label2id.items() } padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): # Tokenize the texts texts = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True) if "label" in examples: if label_to_id is not None: # Map labels to IDs (not necessary for GLUE tasks) result["labels"] = [label_to_id[l] for l in examples["label"]] else: # In all cases, rename the column to labels because the model will expect that. result["labels"] = examples["label"] return result processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. 
data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Get the metric function if args.task_name is not None: metric = load_metric("glue", args.task_name) else: metric = load_metric("accuracy") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax( dim=-1) if not is_regression else outputs.logits.squeeze() metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if args.task_name == "mnli": # Final evaluation on mismatched validation set eval_dataset = processed_datasets["validation_mismatched"] eval_dataloader = DataLoader( eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) eval_dataloader = accelerator.prepare(eval_dataloader) model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"mnli-mm: {eval_metric}")
# compute metrics function for binary classification
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


# download model from model hub
model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

# define training args
training_args = TrainingArguments(
    output_dir=args.model_dir,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.train_batch_size,
    per_device_eval_batch_size=args.eval_batch_size,
    warmup_steps=args.warmup_steps,
    evaluation_strategy="epoch",
    logging_dir=f"{args.output_data_dir}/logs",
    learning_rate=float(args.learning_rate),
)

# create Trainer instance
trainer = Trainer(
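# The Trainer call above is cut off in this snippet. A plausible completion,
# based on the standard Hugging Face Trainer signature (the dataset variable
# names below are assumptions, not taken from the original source):
#
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         compute_metrics=compute_metrics,
#         train_dataset=train_dataset,  # assumed to be defined earlier
#         eval_dataset=test_dataset,    # assumed to be defined earlier
#     )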
def tune_transformer(num_samples=8,
                     gpus_per_trial=0,
                     smoke_test=False,
                     ray_address=None):
    ray.init(ray_address, log_to_driver=False)
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    # Change these as needed.
    model_name = "bert-base-uncased" if not smoke_test \
        else "sshleifer/tiny-distilroberta-base"
    task_name = "rte"

    task_data_dir = os.path.join(data_dir, task_name.upper())

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    # Triggers tokenizer download to cache
    AutoTokenizer.from_pretrained(model_name)
    print("Downloading and caching pre-trained model")
    # Triggers model download to cache
    AutoModelForSequenceClassification.from_pretrained(model_name)

    # Download data.
    download_data(task_name, data_dir)

    config = {
        "model_name": model_name,
        "task_name": task_name,
        "data_dir": task_data_dir,
        "per_gpu_val_batch_size": 32,
        "per_gpu_train_batch_size": tune.choice([16, 32, 64]),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "weight_decay": tune.uniform(0.0, 0.3),
        "num_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": lambda: tune.uniform(0.0, 0.3).func(None),
            "learning_rate": lambda: tune.uniform(1e-5, 5e-5).func(None),
            "per_gpu_train_batch_size": [16, 32, 64],
        })

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_gpu_train_batch_size": "train_bs/gpu",
            "num_epochs": "num_epochs"
        },
        metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"])

    analysis = tune.run(
        train_transformer,
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        keep_checkpoints_num=3,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt")

    if not smoke_test:
        test_best_model(analysis, config["model_name"], config["task_name"],
                        config["data_dir"])
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. 
data_files = {"train": data_args.train_file, "validation": data_args.validation_file} # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir, delimiter=data_args.delimiter) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in datasets and "validation_matched" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: if "test" not in datasets and "test_matched" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) # Log a few random samples from the training set: if training_args.do_train: for index 
in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids) ** 2).mean().item()} else: return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): # Check the config from that potential checkpoint has the right number of labels before using it as a # checkpoint. 
if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels: checkpoint = model_args.model_name_or_path train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) for eval_dataset, task in zip(eval_datasets, tasks): metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) ) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] predict_datasets = [predict_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") predict_datasets.append(datasets["test_mismatched"]) for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset.remove_columns_("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: logger.info(f"***** Predict results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") if training_args.push_to_hub: trainer.push_to_hub()
from torch.nn import CrossEntropyLoss

os.environ["HF_HOME"] = "/scratch/huggingface_cache/"
os.makedirs(f'/scratch/devanshg27/{EXPERIMENT_ID}')

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # , use_fast=True)
config = AutoConfig.from_pretrained(model_checkpoint)
config.num_labels = 2

# hack to change num_labels of pretrained model (save without classification
# head, and then add new classification head while loading)
model = AutoModel.from_pretrained(model_checkpoint)
model.save_pretrained(f'/scratch/devanshg27_temp_{EXPERIMENT_ID}')
model = AutoModelForSequenceClassification.from_pretrained(
    f'/scratch/devanshg27_temp_{EXPERIMENT_ID}', config=config)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = torch.nn.DataParallel(model)

model = model.to(device)


class GLUECoSNLIProcessor(processors['xnli']):
    def get_labels(self):
        return ["contradiction", "entailment"]

    def get_valid_examples(self, data_dir):
        lg = self.language if self.train_language is None else self.train_language
        lines = self._read_tsv(
def train_fever_hesm(model_name="albert-base-v2"): seed = 12 torch.manual_seed(seed) num_epoch = 4 batch_size = 64 # parameters for annealed sampling keep_neg_sample_prob = 0.06 sample_prob_decay = 0.01 min_keep_neg_sample_prob = 0.02 experiment_name = "simple_nn_startkp_{}_de_{}".format( keep_neg_sample_prob, sample_prob_decay) resume_model = None dev_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc_exec/2019_10_07_10:14:16_r/nn_doc_retr_1_shared_task_dev.jsonl" train_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc/2019_10_27_16:48:33_r/nn_doc_retr_1_train.jsonl" complete_upstream_dev_data = get_first_evidence_list( config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True) print("Dev size:", len(complete_upstream_dev_data)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=2, output_attentions=False, output_hidden_states=False, ) if torch.cuda.device_count() > 1: print("More than 1 gpu device found...") model = nn.DataParallel(model) model.to(device) start_lr = 2e-5 optimizer = AdamW(model.parameters(), lr=start_lr, eps=1e-8) if resume_model is not None: print("Resume From:", resume_model) load_model(resume_model, model, optimizer) # Create Log File file_path_prefix, _ = save_tool.gen_file_prefix(f"{experiment_name}") # Save the source code. script_name = os.path.basename(__file__) with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() # Save source code end. best_dev = -1 iteration = 0 criterion = nn.CrossEntropyLoss() hesm_model = HESMUtil(model, model_name=model_name) display(model) for i_epoch in range(num_epoch): print("Get first evidence for training...") complete_upstream_train_data = get_first_evidence_list( config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False) print("Resampling...") print("Sample Prob.:", keep_neg_sample_prob) filtered_train_data = post_filter(complete_upstream_train_data, keep_prob=keep_neg_sample_prob, seed=12 + i_epoch) keep_neg_sample_prob -= sample_prob_decay if keep_neg_sample_prob <= min_keep_neg_sample_prob: keep_neg_sample_prob = min_keep_neg_sample_prob print("Sampled length:", len(filtered_train_data)) sent_list, label_list, pid_list = hesm_model.read(filtered_train_data) train_dataset = HESMDataset({ 'text': sent_list, 'labels': label_list, 'pid': pid_list }) train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size) if i_epoch == 0: accumulation_steps = 2 # accumulate gradients for increasing `batch_size` by a factor of `accumulation_steps` steps_per_epoch = len(train_dataloader) total_steps = steps_per_epoch * num_epoch scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) save_epoch = 0.5 # evaluate and save every `save_epoch` epochs optimizer.zero_grad() for i, batch in tqdm(enumerate(train_dataloader)): model.train() out = hesm_model.step(batch) y = batch['labels'].cuda() loss = criterion(out, y) loss = loss / accumulation_steps loss.backward() nn.utils.clip_grad_norm_(model.parameters(), 1.0) if (i + 1 ) % accumulation_steps == 0: # Wait for several backward steps optimizer.step() # Now we can do an optimizer step scheduler.step() optimizer.zero_grad() iteration += 1 mod = steps_per_epoch * save_epoch if iteration % mod == 0: sent_list, label_list, pid_list = hesm_model.read( complete_upstream_dev_data) eval_dataset = HESMDataset({ 
'text': sent_list, 'labels': label_list, 'pid': pid_list }) eval_dataloader = DataLoader( eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=batch_size) complete_upstream_dev_data = hidden_eval_hesm( hesm_model, model, eval_dataloader, complete_upstream_dev_data) dev_results_list = score_converter(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data) eval_mode = {'check_sent_id_correct': True, 'standard': True} strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( dev_results_list, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode, verbose=False) total = len(dev_results_list) hit = eval_mode['check_sent_id_correct_hits'] tracking_score = hit / total print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/") print("Strict score:", strict_score) print(f"Eval Tracking score:", f"{tracking_score}") need_save = False if tracking_score > best_dev: best_dev = tracking_score need_save = True if need_save: save_path = os.path.join( file_path_prefix, f'i({iteration})_epoch({i_epoch})_' f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})' ) save_model(save_path, model, optimizer)
from flask import Flask, request, render_template, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import re
from numpy import argmax

app = Flask(__name__)

model = AutoModelForSequenceClassification.from_pretrained(
    "model/bart-large-mnli/")
tokenizer = AutoTokenizer.from_pretrained("model/bart-large-mnli/")
zero_shot_classifier = pipeline("zero-shot-classification",
                                model=model,
                                tokenizer=tokenizer)


@app.route('/')
def home():
    return render_template('Home.html')


@app.route('/join', methods=['GET', 'POST'])
def my_form_post():
    text = request.form['text1']
    labels = request.form['text2']
    labels = labels.split(",")
    results = zero_shot_classifier(text, labels, multi_class=True)
    SCORES = results["scores"]
    CLASSES = results["labels"]
    result = ""
    for scr, cls in zip(SCORES, CLASSES):
        result = result + str(cls) + " (" + str(round(scr, 2)) + "), "
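# The route above is truncated in this snippet. A plausible way to finish it,
# consistent with the imports already present (render_template / jsonify); the
# template name and response shape are assumptions, not from the source:
#
#     return render_template('Home.html', result=result)
#     # or, for a JSON API:
#     # return jsonify({"labels": CLASSES, "scores": SCORES})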
def run_finetuning(args): torch.manual_seed(args.seed) device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu') # Configure tokenizer tokenizer = AutoTokenizer.from_pretrained(args.pretrained, do_lower_case=True if 'uncased' in args.pretrained else False) if args.add_token != '': add_token = {'additional_special_tokens': args.add_token.split(',')} added = tokenizer.add_special_tokens(add_token) # Get text columns t_columns = args.text_columns.split(',') num_texts = len(t_columns) if num_texts == 1: t_columns = t_columns[0] # Get label columns l_columns = args.label_columns.split(',') num_labels = len(l_columns) if num_labels == 1: l_columns = l_columns[0] if args.do_train: print('\n' + '=' * 50, '\nCONFIGURE FINETUNING SETUP', '\n' + '=' * 50) if args.add_token != '': print("Addded {} special tokens:".format(added), args.add_token) # Produce hash code for cache f_string = args.train_data + args.valid_data + str(args.msl) + str(args.seed) + args.pretrained + str(args.data_pct) hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt' # Produce the dataset if cache doesn't exist if hashed not in os.listdir() or args.retokenize_data: print("Producing dataset cache. This will take a while.") s = time.time() df = pd.read_csv(args.train_data).sample(frac=args.data_pct, random_state=args.seed) text, labels = df[t_columns].values, df[l_columns].values train_dataset = process_data(text, labels, tokenizer, msl=args.msl) df = pd.read_csv(args.valid_data) text, labels = df[t_columns].values, df[l_columns].values valid_dataset = process_data(text, labels, tokenizer, msl=args.msl) if args.save_cache: print('Saving data cache') with open(hashed, 'wb') as f: torch.save([train_dataset, valid_dataset], f) print("Preprocessing finished. Time elapsed: {:.2f}s".format(time.time() - s)) # Load the dataset if the cache exists else: print('Cache found. 
Loading training and validation data.') with open(hashed, 'rb') as f: train_dataset, valid_dataset = torch.load(f) # Produce dataloaders train_sampler = data.RandomSampler(train_dataset) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler) valid_loader = data.DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False) # Configure model config = AutoConfig.from_pretrained(args.pretrained, num_labels=2 if num_labels == 1 else num_labels) if args.random_init: print("Initializing new randomly-initialized model from configuration") model = AutoModelForSequenceClassification.from_config(config) else: print("Loading from pretrained checkpoint") model = AutoModelForSequenceClassification.from_pretrained(args.pretrained, config=config) _ = model.resize_token_embeddings(len(tokenizer)) model = model.to(device) print("Model has {:,} trainable parameters".format(sum(p.numel() for p in model.parameters() if p.requires_grad))) # Configure loss function criterion = torch.nn.CrossEntropyLoss() if num_labels == 1 else torch.nn.BCEWithLogitsLoss() # Configure optimizer if args.optimizer == 'adam': no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay}, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) optimizer.zero_grad() elif args.optimizer == 'lamb': from pytorch_lamb import Lamb optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, betas=(args.adam_b1, args.adam_b2)) # Configure scheduler if args.use_scheduler: steps = len(train_loader) * args.epochs // args.accumulation scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(steps * args.warmup_pct), num_training_steps=steps) else: scheduler = None print("Using learning rate {:.4E} and weight decay {:.4E}".format(args.learning_rate, args.weight_decay), end='') print(" with scheduler using warmup pct {}".format(args.warmup_pct)) if args.use_scheduler else print("") # Training proper print('\n' + '=' * 50, '\nTRAINING', '\n' + '=' * 50) print("Training batches: {} | Validation batches: {}".format(len(train_loader), len(valid_loader))) for e in range(1, args.epochs + 1): train_loss, train_acc = train(model, criterion, optimizer, train_loader, scheduler=scheduler, accumulation=args.accumulation, device=device) valid_loss, valid_acc = evaluate(model, criterion, valid_loader, device=device) print("Epoch {:3} | Train Loss {:.4f} | Train Acc {:.4f} | Valid Loss {:.4f} | Valid Acc {:.4f}".format(e, train_loss, train_acc, valid_loss, valid_acc)) # Save the model with open(args.checkpoint, 'wb') as f: torch.save(model.state_dict(), f) if args.do_eval: print('\n' + '=' * 50, '\nBEGIN EVALUATION PROPER', '\n' + '=' * 50) # Produce hash code for test cache f_string = args.test_data + str(args.msl) + str(args.seed) + args.pretrained hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt' # Produce the dataset if cache doesn't exist if hashed not in os.listdir() or args.retokenize_data: print("Producing test data cache. 
This will take a while.") s = time.time() df = pd.read_csv(args.test_data) text, labels = df[t_columns].values, df[l_columns].values test_dataset = process_data(text, labels, tokenizer, msl=args.msl) if args.save_cache: print('Saving data cache') with open(hashed, 'wb') as f: torch.save(test_dataset, f) print("Preprocessing finished. Time elapsed: {:.2f}s".format(time.time() - s)) # Load the dataset if the cache exists else: print('Cache found. Loading test data.') with open(hashed, 'rb') as f: test_dataset = torch.load(f) # Dataloaders test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) # Produce the model config = AutoConfig.from_pretrained(args.pretrained, num_labels=2 if num_labels == 1 else num_labels) model = AutoModelForSequenceClassification.from_config(config) _ = model.resize_token_embeddings(len(tokenizer)) model = model.to(device) # Load checkpoing and configure loss function print("Loading finetuned checkpoint") with open(args.checkpoint, 'rb') as f: model.load_state_dict(torch.load(f)) criterion = torch.nn.CrossEntropyLoss() if num_labels == 1 else torch.nn.BCEWithLogitsLoss() # Testing proper print('\n' + '=' * 50, '\nTESTING', '\n' + '=' * 50) test_loss, test_acc = evaluate(model, criterion, test_loader, device=device) print("Test Loss {:.4f} | Test Accuracy {:.4f}".format(test_loss, test_acc)) # Logging if not args.do_train: train_loss, train_acc, valid_loss, valid_acc = None, None, None, None if not args.do_eval: test_loss, test_acc = None, None return train_loss, train_acc, valid_loss, valid_acc, test_loss, test_acc
def initialize(self, ctx):
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    serialized_file = self.manifest['model']['serializedFile']
    model_pt_path = os.path.join(model_dir, serialized_file)
    self.device = torch.device(
        "cuda:" + str(properties.get("gpu_id"))
        if torch.cuda.is_available() else "cpu")

    # read configs for the mode, model_name, etc. from setup_config.json
    setup_config_path = os.path.join(model_dir, "setup_config.json")
    if os.path.isfile(setup_config_path):
        with open(setup_config_path) as setup_config_file:
            self.setup_config = json.load(setup_config_file)
    else:
        logger.warning('Missing the setup_config.json file.')

    # Loading the model and tokenizer from checkpoint and config files based on
    # the user's choice of mode; further setup config can be added.
    if self.setup_config["save_mode"] == "torchscript":
        self.model = torch.jit.load(model_pt_path)
    elif self.setup_config["save_mode"] == "pretrained":
        if self.setup_config["mode"] == "sequence_classification":
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_dir)
        elif self.setup_config["mode"] == "question_answering":
            self.model = AutoModelForQuestionAnswering.from_pretrained(
                model_dir)
        elif self.setup_config["mode"] == "token_classification":
            self.model = AutoModelForTokenClassification.from_pretrained(
                model_dir)
        else:
            logger.warning('Missing the operation mode.')
    else:
        logger.warning('Missing the checkpoint or state_dict.')

    # Note: os.path.isfile() does not expand the "vocab.*" wildcard, so this
    # check is effectively always False and the tokenizer is loaded from
    # setup_config["model_name"] in practice.
    if not os.path.isfile(os.path.join(model_dir, "vocab.*")):
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.setup_config["model_name"],
            do_lower_case=self.setup_config["do_lower_case"])
    else:
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir, do_lower_case=self.setup_config["do_lower_case"])

    self.model.to(self.device)
    self.model.eval()

    logger.debug(
        'Transformer model from path {0} loaded successfully'.format(
            model_dir))

    # Read the mapping file, index to object name
    mapping_file_path = os.path.join(model_dir, "index_to_name.json")
    # Question answering does not need the index_to_name.json file.
    if not self.setup_config["mode"] == "question_answering":
        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            logger.warning('Missing the index_to_name.json file.')

    self.initialized = True
"""### Configuring training parameters [texte du lien](https://)You can find the explanations of the training parameters in the class docsctrings. """ # Clean the cl_path try: shutil.rmtree(cl_path) except: pass lm_path = project_dir / "models" / "language_model" / "finbertTRC2" # bertmodel = AutoModelForSequenceClassification.from_pretrained(lm_path,cache_dir=None, num_labels=3) bertmodel = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", cache_dir=None, num_labels=3 ) hvd.init() config = Config( data_dir=cl_data_path, bert_model=bertmodel, num_train_epochs=2, model_dir=cl_path, max_seq_length=48, train_batch_size=16, learning_rate=2e-5, output_mode="classification", warm_up_proportion=0.2, local_rank=hvd.local_rank(), no_cuda=True,
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, num_labels=2, id2label={ 0: "0", 1: "1" }, label2id={ "0": 0, "1": 1 }, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=2, id2label={ 0: "0", 1: "1" }, label2id={ "0": 0, "1": 1 }, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) if model_args.cls_model_name_or_path: cls_config = AutoConfig.from_pretrained( model_args.cls_model_name_or_path, num_labels=2, finetuning_task="cola", cache_dir=model_args.cache_dir, ) cls_model = AutoModelForSequenceClassification.from_pretrained( model_args.cls_model_name_or_path, from_tf=bool(".ckpt" in model_args.cls_model_name_or_path), config=cls_config, cache_dir=model_args.cache_dir, ) cls_model.resize_token_embeddings(len(tokenizer)) # mask_selector = MaskSelector(cls_model,training_args) model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. 
They must be run using the --mlm " "flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = get_dataset( data_args, tokenizer=tokenizer, model_args=model_args, cache_dir=model_args.cache_dir) if training_args.do_train else None eval_dataset = get_dataset( data_args, model_args=None, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None data_collator = DataCollatorForGAN( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = GANTrainer(generator=model, discriminator=cls_model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, prediction_loss_only=True, mask_token_id=tokenizer.mask_token_id) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) result = {"perplexity": perplexity} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
def main(): if training_args.do_train: if model_args.train_language is None: train_dataset = load_dataset("xnli", model_args.language, split="train", cache_dir=model_args.cache_dir) else: train_dataset = load_dataset("xnli", model_args.train_language, split="train", cache_dir=model_args.cache_dir) label_list = train_dataset.features["label"].names if training_args.do_eval: eval_dataset = load_dataset("xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir) label_list = eval_dataset.features["label"].names if training_args.do_test: predict_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir) label_list = predict_dataset.features["label"].names # Labels n_labels = len(label_list) # Load pretrained model and tokenizer # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name, n_labels=n_labels, finetune="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name, lower_case=model_args.lower_case, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name, from_tf=bool(".ckpt" in model_args.model_name), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets # Padding strategy if data_args.pad_to_max_length: padding = "max_len" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False def preprocess_function(examples): # Tokenize the texts return tokenizer( examples["premise"], examples["hypothesis"], padding=padding, max_len=data_args.max_seq_length, truncation=True, ) if training_args.do_train: if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") if training_args.do_eval: if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_test: if data_args.max_test_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_test_samples)) with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, 
desc="Running tokenizer on prediction dataset", ) # Get the metric function metric = load_metric("xnli") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_test: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict( predict_dataset, metric_key_prefix="predict") max_test_samples = (data_args.max_test_samples if data_args.max_test_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_test_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) predictions = np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.out_dir, "predictions.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n")
print(" Validation Accuracy: {0:.4f}".format(val_accuracy)) print('Time:', time.time() - t0) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Fine Tuning') parser.add_argument('--iter', type=str, required=True, help='Enter Iteration number') parser.add_argument('--n_epochs', type=int, required=True, help='Enter number of epochs') args = parser.parse_args() print('Entered', args.iter) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if args.iter == '1': tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment") model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment") model = model.to(device) else: load_model_location = 'iteration' + str(int(args.iter)-1) + '/saved_model' tokenizer = AutoTokenizer.from_pretrained(load_model_location) model = AutoModelForSequenceClassification.from_pretrained(load_model_location) model = model.to(device) print('Loaded model and shifted to device', device) #load data for fine tuning current_dir = 'iteration' + args.iter + '/' fine_tune_file = current_dir + 'fine_tune_' + args.iter + '.pkl' print('Fine Tune File:', fine_tune_file) label_dict = {'positive': 2, 'neutral': 1, 'negative': 0} sentences, labels = read_data(fine_tune_file, label_dict)
parser.add_argument('--output', type=str, required=True) args = parser.parse_args() print(args.mode) corpus = {doc['doc_id']: doc for doc in jsonlines.open(args.corpus)} dataset = jsonlines.open(args.dataset) rationale_selection = jsonlines.open(args.rationale_selection) output = jsonlines.open(args.output, 'w') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'Using device "{device}"') tokenizer = AutoTokenizer.from_pretrained(args.model) config = AutoConfig.from_pretrained(args.model, num_labels=3) model = AutoModelForSequenceClassification.from_pretrained( args.model, config=config).eval().to(device) LABELS = ['CONTRADICT', 'NOT_ENOUGH_INFO', 'SUPPORT'] def encode(sentences, claims): text = { "claim_and_rationale": list(zip(sentences, claims)), "only_claim": claims, "only_rationale": sentences }[args.mode] encoded_dict = tokenizer.batch_encode_plus(text, pad_to_max_length=True, return_tensors='pt') if encoded_dict['input_ids'].size(1) > 512: encoded_dict = tokenizer.batch_encode_plus(
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = { "train": data_args.train_file, "validation": data_args.validation_file } # Get the test dataset: you can provide your own CSV/JSON test file (see below) when you use `do_predict` if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError("Need a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files, cache_dir=data_args.dataset_cache_dir) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=data_args.dataset_cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
# Labels # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # if specified, add special tokens for user mentions, emojis and urls if model_args.use_special_tokens: special_tokens_dict = { 'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]'] } tokenizer.add_special_tokens(special_tokens_dict) model.resize_token_embeddings(len(tokenizer)) logger.info( "Resize token embeddings to fit tokenizer dimension (necessary for special tokens)" ) # Preprocessing the datasets --> selecting label and text column non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # convert label to id. label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs if label_to_id is not None and "label" in examples: result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) train_dataset = datasets["train"] eval_dataset = datasets["validation"] if data_args.test_file is not None: test_dataset = datasets["test"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # You can define your custom compute_metrics function. 
It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: train_result = trainer.train() metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") eval_result = trainer.evaluate(eval_dataset=eval_dataset) trainer.log_metrics("eval", eval_result) trainer.save_metrics("eval", eval_result) if training_args.do_predict: logger.info("*** Test ***") test_dataset.remove_columns_("label") predictions = trainer.predict(test_dataset=test_dataset).predictions predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, f"test_results.txt") if trainer.is_world_process_zero(): with open(output_test_file, "w") as writer: logger.info(f"***** Test results *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n") return 'completed finetuning'
def get_model(pretrained_model_name_or_path: str = 'castorini/monobert-large-msmarco', *args, device: str = None, **kwargs) -> AutoModelForSequenceClassification: device = device or ('cuda' if torch.cuda.is_available() else 'cpu') device = torch.device(device) return AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *args, **kwargs).to(device).eval()
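# A minimal usage sketch for the helper above (not part of the original snippet). It assumes the
# checkpoint ships a matching tokenizer and that query/passage pairs are scored the way monoBERT
# re-rankers usually are; treat the input strings as illustrative placeholders.
from transformers import AutoTokenizer
import torch

reranker = get_model()  # defaults to 'castorini/monobert-large-msmarco', on GPU if available
reranker_tokenizer = AutoTokenizer.from_pretrained('castorini/monobert-large-msmarco')
inputs = reranker_tokenizer("what is a transformer?",
                            "Transformers are attention-based neural models.",
                            return_tensors='pt', truncation=True).to(reranker.device)
with torch.no_grad():
    # softmax over the two classes gives a relevance-style score for the pair
    relevance = torch.softmax(reranker(**inputs).logits, dim=-1)
print(relevance)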
def get_predictions(model_path, dataset_path, max_length, name, pos_label): #max_length = 1024 #val_path = "/scratch/gpfs/cmcwhite/chloro_loc_model/chloro_labeledsetVal.csv" #n_labels = 2 model_config = AutoConfig.from_pretrained(model_path) seq_tokenizer = BertTokenizerFast.from_pretrained(model_path) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = AutoModelForSequenceClassification.from_pretrained( model_path, config=model_config) #model.to(device) seqs, labels, ids = load_dataset(dataset_path, max_length) seqs_encodings = seq_tokenizer(seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True) _ = seqs_encodings.pop("offset_mapping") unique_tags = set(labels) unique_tags = sorted( list(unique_tags)) # make the order of the labels unchanged tag2id = {tag: id for id, tag in enumerate(unique_tags)} id2tag = {id: tag for tag, id in tag2id.items()} print(tag2id) print(id2tag) labels_encodings = encode_tags(labels, tag2id) dataset = SS3Dataset(seqs_encodings, labels_encodings) #valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) batch_size = 10 dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False) pos_index = tag2id[pos_label] true_labels, predictions_labels, probs, true_probs = validation( dataloader, model, device, pos_index) print(classification_report(true_labels, predictions_labels)) text_true = [id2tag[x] for x in true_labels] text_pred = [id2tag[x] for x in predictions_labels] conf = confusion_matrix(text_true, text_pred) print(conf) outconf = model_path + "/output_confusion_" + name + ".csv" np.savetxt(outconf, conf, delimiter=",") print(ids) print(text_true) print(text_pred) print(probs) print(true_probs) print(len(ids)) print(len(text_true)) print(len(text_pred)) print(len(probs)) print(len(true_probs)) outdict = { "id": ids, "true_labels": text_true, "predicted_labels": text_pred, "prob": probs, "true_probs": true_probs } outdf = pd.DataFrame(outdict) outdf = outdf.sort_values(by=['true_probs'], ascending=False) print(outdf) outdf_path = model_path + "/output_predictions_" + name + ".csv" outdf.to_csv(outdf_path) precision, recall, thresholds = precision_recall_curve( true_labels, true_probs) print(precision) print(recall) thresholds = np.concatenate(([0], thresholds)) print(thresholds) prdict = { "precision": precision, "recall": recall, "threshold": thresholds } prdf = pd.DataFrame(prdict) print(prdf) prdf_path = model_path + "/output_prcurve_" + name + ".csv" prdf.to_csv(prdf_path)
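# Hypothetical invocation of get_predictions(); the paths, run name and positive label below are
# illustrative placeholders, not values from the original code.
if __name__ == "__main__":
    get_predictions(model_path="./finetuned_model",
                    dataset_path="./labeled_set_val.csv",
                    max_length=1024,
                    name="val",
                    pos_label="positive_class")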
def eval_emotion( model_name, output_path, lang="es", eval_batch_size=16, warmup_proportion=0.1, limit=None, ): """ """ print("=" * 80 + '\n', "=" * 80 + '\n') print(f"Evaluating {model_name} in language {lang}", "\n" * 2) print("Loading dataset") if lang not in ["es", "en"]: print("lang must be one of ", ["es", "en"]) sys.exit(1) model = AutoModelForSequenceClassification.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer.model_max_length = 128 model.eval() tokenizer_class_name = model.config.tokenizer_class load_extra_args = extra_args[ tokenizer_class_name] if tokenizer_class_name in extra_args else {} _, _, test_dataset = load_datasets(lang=lang, **load_extra_args) device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) print("Tokenizing and formatting \n\n") def tokenize(batch): return tokenizer(batch['text'], padding='max_length', truncation=True) def format_dataset(dataset): dataset = dataset.map(lambda examples: {'labels': examples['label']}) columns = ['input_ids', 'attention_mask', 'labels'] if 'token_type_ids' in dataset.features: columns.append('token_type_ids') dataset.set_format(type='torch', columns=columns) print(columns) return dataset test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size) test_dataset = format_dataset(test_dataset) print("Sanity check\n\n") print(tokenizer.decode(test_dataset[0]["input_ids"]), "\n\n") print("\n\nEvaluating\n") training_args = TrainingArguments( output_dir='.', per_device_eval_batch_size=eval_batch_size, ) trainer = Trainer( model=model, args=training_args, compute_metrics=lambda x: compute_metrics(x, id2label=id2label), ) preds = trainer.predict(test_dataset) serialized = { "model": model_name, "lang": lang, "predictions": preds.predictions.tolist(), "labels": preds.label_ids.tolist(), "metrics": preds.metrics } print(f"Saving at {output_path}") with open(output_path, "w+") as f: json.dump(serialized, f, indent=4)
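# Example call of eval_emotion(); the checkpoint name and output path are placeholders, and the
# function assumes compute_metrics, id2label, extra_args and load_datasets are defined elsewhere.
eval_emotion(
    model_name="your-org/your-emotion-model",
    output_path="emotion_eval_es.json",
    lang="es",
    eval_batch_size=32,
)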
from transformers import AutoModelForSequenceClassification, AutoTokenizer # Connect to AWS RDS Postgresql DB HOST = db_config.host PORT = db_config.port USERNAME = db_config.username PASSWORD = db_config.password DB = db_config.db conn_string = f'postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DB}' engine = create_engine(conn_string, echo=False) print('Connection to DB established') #Initialize BERT Model model = AutoModelForSequenceClassification.from_pretrained( 'oliverguhr/german-sentiment-bert') tokenizer = AutoTokenizer.from_pretrained('oliverguhr/german-sentiment-bert') print('BERT model loaded') def clean_data(data): """Processes raw tweets to clean text""" #Removing URLs with a regular expression url_pattern = re.compile(r'https?://\S+|www\.\S+') data = url_pattern.sub(r'', data) # Remove mentionings data = re.sub(r'@\w*', '', data)
def train(arg): # load model and tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name_dict[arg.m]) # load dataset if arg.train_data == 'val': train_dataset = load_data("/opt/ml/input/data/train/new_train.tsv") elif arg.train_data == 'ner': train_dataset = pd.read_csv("/opt/ml/input/data/train/new_train_ner.tsv", sep='\t') elif arg.train_data == 'train': train_dataset = load_data("/opt/ml/input/data/train/train.tsv") # load validation set if arg.train_data == 'ner': dev_dataset = pd.read_csv("/opt/ml/input/data/train/val_train_ner.tsv", sep='\t') else: dev_dataset = load_data("/opt/ml/input/data/train/val_train.tsv") train_label = train_dataset['label'].values dev_label = dev_dataset['label'].values # tokenizing dataset tokenized_train = tokenized_dataset(train_dataset, tokenizer) tokenized_dev = tokenized_dataset(dev_dataset, tokenizer) # make dataset for pytorch. RE_train_dataset = RE_Dataset(tokenized_train, train_label) RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # setting model hyperparameter config = AutoConfig.from_pretrained(model_name_dict[arg.m]) config.num_labels = 42 model = AutoModelForSequenceClassification.from_pretrained(model_name_dict[arg.m], config=config) model.to(device) # There are many other options available besides the ones used here. # Please refer to https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments. training_args = TrainingArguments( output_dir=arg.a, # output directory save_total_limit=3, # maximum number of saved checkpoints # save_steps=500, # model saving step. save_strategy='epoch', num_train_epochs=arg.e, # total number of training epochs learning_rate=arg.lr, # learning_rate per_device_train_batch_size=arg.b, # batch size per device during training per_device_eval_batch_size=40, # batch size for evaluation warmup_steps=500, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=100, # log saving step. evaluation_strategy='epoch', # evaluation strategy to adopt during training # `no`: No evaluation during training. # `steps`: Evaluate every `eval_steps`. # `epoch`: Evaluate every end of epoch. # load_best_model_at_end=True, # metric_for_best_model=compute_metrics, # greater_is_better=True, eval_steps=500, # evaluation step. ) trainer = Trainer( model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_dev_dataset, # evaluation dataset compute_metrics=compute_metrics # define metrics function ) # train model trainer.train() trainer.save_model(arg.o) trainer.save_state()
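# A possible command-line entry point for train(); the flag names mirror the attributes accessed
# on `arg` above (m, train_data, e, lr, b, a, o), but the defaults and help texts are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--m', type=str, required=True, help='key into model_name_dict')
    parser.add_argument('--train_data', type=str, default='train', choices=['train', 'val', 'ner'])
    parser.add_argument('--e', type=int, default=4, help='number of training epochs')
    parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
    parser.add_argument('--b', type=int, default=16, help='train batch size per device')
    parser.add_argument('--a', type=str, default='./results', help='output_dir for checkpoints')
    parser.add_argument('--o', type=str, default='./best_model', help='directory for the final model')
    train(parser.parse_args())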
'''Train a basic transformer model''' from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer import datasets import numpy as np batch_size = 8 num_labels = 11 # Load data ds = datasets.load_dataset("./italki", data_dir="../italki_data") # Init model and trainer model_name = "bert-base-cased" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=num_labels) max_input_length = tokenizer.max_model_input_sizes[model_name] # Tokenize ds = ds.map( lambda batch: tokenizer(batch["document"], padding="max_length", truncation=True, max_length=max_input_length), batched=True, remove_columns=["document", "author_id", "proficiency", "document_id"]) ds = ds.rename_column("native_language", "labels") ds.set_format(type="torch") # Train
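# The snippet stops at "# Train". A minimal continuation under the same setup could look like the
# sketch below; the output directory, epoch count and the "train" split name are assumptions.
training_args = TrainingArguments(output_dir="./italki_out",
                                  per_device_train_batch_size=batch_size,
                                  num_train_epochs=3)
trainer = Trainer(model=model, args=training_args, train_dataset=ds["train"])
trainer.train()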
def get_sentiment_from_text(text, model_name="ProsusAI/finbert"): tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained(model_name) return predict(text, model, tokenizer)
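# Example usage; predict() is defined elsewhere in this codebase. Note that reloading the FinBERT
# checkpoint on every call is convenient for one-off scoring but slow for batch work.
sentiment = get_sentiment_from_text("Shares rallied after earnings beat expectations.")
print(sentiment)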
from sklearn.metrics import precision_recall_fscore_support, accuracy_score import pandas as pd from datasets import load_dataset #train = pd.read_csv('pre_learn/train.tsv',sep='\t') #dev = pd.read_csv('pre_learn/dev.tsv', sep='\t') # #train_conc = train.concept.values #train_label = train.label.values # #dev_conc = dev.concept.values #dev_label = dev.label.values bert_base_small_data = "dbmdz/bert-base-italian-uncased" #13G #bert_base_large_data = "dbmdz/bert-base-italian-xxl-uncased" #81G tokenizer = AutoTokenizer.from_pretrained(bert_base_small_data) model = AutoModelForSequenceClassification.from_pretrained( bert_base_small_data) # AutoModel #for param in model.base_model.parameters(): # param.requires_grad = False def encode(examples): #return tokenizer(examples['concept'], examples['preconcept'], truncation=True, padding='max_length') #return tokenizer(examples['text'], examples['text (#1)'], truncation=True, padding='max_length') return tokenizer(examples['concept_text'], examples['preconcept_text'], truncation=True, padding='max_length') def compute_metrics(pred):
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification model = AutoModelForSequenceClassification.from_pretrained(r"I:\BTU\results") tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
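# Quick check of the assembled pipeline; the example sentence is illustrative only, and the
# returned label set depends on the fine-tuned checkpoint stored in I:\BTU\results.
print(classifier("The lecture was surprisingly helpful."))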
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train, selected from the list: " + ", ".join(glue_processors.keys()), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help= "Pretrained config name or path if not the same as model_name_or_path", ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "Pretrained tokenizer name or path if not the same as model_name_or_path", ) parser.add_argument( "--cache_dir", default=None, type=str, help= "Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") parser.add_argument("--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers") parser.add_argument( "--dont_normalize_global_importance", action="store_true", help="Don't normalize all importance scores between 0 and 1", ) parser.add_argument( "--try_masking", action="store_true", help="Whether to try masking heads until a threshold of accuracy is reached.") parser.add_argument( "--masking_threshold", default=0.9, type=float, help= "Masking threshold in terms of metrics (stop masking when metric < threshold * original metric value).", ) parser.add_argument( "--masking_amount", default=0.1, type=float, help="Fraction of heads to mask at each masking step.") parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, sequences shorter padded.", ) parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 torch.distributed.init_process_group( backend="nccl") # Initializes the distributed backend # Setup logging logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed: {}".format( args.device, args.n_gpu, bool(args.local_rank != -1))) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Set seeds set_seed(args.seed) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in glue_processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = glue_processors[args.task_name]() args.output_mode = glue_output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, output_attentions=True, cache_dir=args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir, ) input_vec = torch.rand(8, 128, 768) model.prune_heads({0: [0]}) output = model(inputs_embeds=input_vec) # Distributed and parallel training model.to(args.device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # Print/save training arguments os.makedirs(args.output_dir, exist_ok=True) torch.save(args, os.path.join(args.output_dir, "run_args.bin")) logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev") if args.data_subset > 0: eval_dataset = Subset( eval_dataset, list(range(min(args.data_subset, len(eval_dataset))))) eval_sampler = SequentialSampler( eval_dataset) if args.local_rank == -1 else DistributedSampler( eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=default_data_collator) # Compute head entropy and importance score # compute_heads_importance(args, model, eval_dataloader) # Try head masking (set heads to zero until the score goes under a threshold) # and head pruning (remove masked heads and see the effect on the network) if args.try_masking and 0.0 < args.masking_threshold < 1.0: head_mask = mask_heads(args, model, eval_dataloader) prune_heads(args, model, eval_dataloader, head_mask)
################################################# ################################################## ################Setup for Neural Networks######### ################################################## # Use a GPU if you have one available (Runtime -> Change runtime type -> GPU) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Set seeds for reproducibility random.seed(26) np.random.seed(26) torch.manual_seed(26) tokenizer = AutoTokenizer.from_pretrained("roberta-base") model = AutoModelForSequenceClassification.from_pretrained("roberta-base") model.to(device) # Send the model to the GPU if we have one learning_rate = 1e-5 optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8) def encode_data(tokenizer, questions, passages, max_length): """Encode the question/passage pairs into features than can be fed to the model.""" input_ids = [] attention_masks = [] for question, passage in zip(questions, passages): encoded_data = tokenizer.encode_plus( question, passage,
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (GlueDataset( data_args, tokenizer=tokenizer, local_rank=training_args.local_rank) if training_args.do_train else None) eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True) if training_args.do_eval else None) def compute_metrics(p: EvalPrediction) -> Dict: if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(data_args.task_name, preds, p.label_ids) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval and training_args.local_rank in [-1, 0]: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)) for eval_dataset in eval_datasets: result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) return results
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False): data_dir_name = "./data" if not smoke_test else "./test_data" data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name)) if not os.path.exists(data_dir): os.mkdir(data_dir, 0o755) # Change these as needed. model_name = ( "bert-base-uncased" if not smoke_test else "sshleifer/tiny-distilroberta-base" ) task_name = "rte" task_data_dir = os.path.join(data_dir, task_name.upper()) num_labels = glue_tasks_num_labels[task_name] config = AutoConfig.from_pretrained( model_name, num_labels=num_labels, finetuning_task=task_name ) # Download and cache tokenizer, model, and features print("Downloading and caching Tokenizer") tokenizer = AutoTokenizer.from_pretrained(model_name) # Triggers tokenizer download to cache print("Downloading and caching pre-trained model") AutoModelForSequenceClassification.from_pretrained( model_name, config=config, ) def get_model(): return AutoModelForSequenceClassification.from_pretrained( model_name, config=config, ) # Download data. download_data(task_name, data_dir) data_args = GlueDataTrainingArguments(task_name=task_name, data_dir=task_data_dir) train_dataset = GlueDataset( data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir ) eval_dataset = GlueDataset( data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir ) training_args = TrainingArguments( output_dir=".", learning_rate=1e-5, # config do_train=True, do_eval=True, no_cuda=gpus_per_trial <= 0, evaluation_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, num_train_epochs=2, # config max_steps=-1, per_device_train_batch_size=16, # config per_device_eval_batch_size=16, # config warmup_steps=0, weight_decay=0.1, # config logging_dir="./logs", skip_memory_metrics=True, report_to="none", ) trainer = Trainer( model_init=get_model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(task_name), ) tune_config = { "per_device_train_batch_size": 32, "per_device_eval_batch_size": 32, "num_train_epochs": tune.choice([2, 3, 4, 5]), "max_steps": 1 if smoke_test else -1, # Used for smoke test. } scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="eval_acc", mode="max", perturbation_interval=1, hyperparam_mutations={ "weight_decay": tune.uniform(0.0, 0.3), "learning_rate": tune.uniform(1e-5, 5e-5), "per_device_train_batch_size": [16, 32, 64], }, ) reporter = CLIReporter( parameter_columns={ "weight_decay": "w_decay", "learning_rate": "lr", "per_device_train_batch_size": "train_bs/gpu", "num_train_epochs": "num_epochs", }, metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"], ) trainer.hyperparameter_search( hp_space=lambda _: tune_config, backend="ray", n_trials=num_samples, resources_per_trial={"cpu": 1, "gpu": gpus_per_trial}, scheduler=scheduler, keep_checkpoints_num=1, checkpoint_score_attr="training_iteration", stop={"training_iteration": 1} if smoke_test else None, progress_reporter=reporter, local_dir="~/ray_results/", name="tune_transformer_pbt", log_to_file=True, )
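# Typical invocation of the Ray Tune PBT search above; the argument values are illustrative.
if __name__ == "__main__":
    # run a quick smoke test on CPU first, then the full search with one GPU per trial
    tune_transformer(num_samples=1, gpus_per_trial=0, smoke_test=True)
    # tune_transformer(num_samples=8, gpus_per_trial=1)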