def main(config):
    # Get pretrained tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_name)

    # Get dataloaders using tokenizer from untokenized corpus.
    train_loader, valid_loader, index_to_label = get_loaders(
        config.train_fn, tokenizer)

    print(
        '|train| =', len(train_loader) * config.batch_size,
        '|valid| =', len(valid_loader) * config.batch_size,
    )

    # Get pretrained model with a freshly initialized classification head
    # sized to the label set.
    model = AutoModelForSequenceClassification.from_pretrained(
        config.pretrained_model_name, num_labels=len(index_to_label))

    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01,
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
        },
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.lr,
                            eps=config.adam_epsilon)

    # The model outputs raw logits (no log-softmax layer), so we use
    # CrossEntropyLoss, which applies log-softmax internally, rather than
    # NLLLoss.
    crit = nn.CrossEntropyLoss()

    n_total_iterations = len(train_loader) * config.n_epochs
    n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps,
                                                n_total_iterations)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        crit.cuda(config.gpu_id)

    # Start train.
    trainer = Trainer(config)
    model = trainer.train(
        model,
        crit,
        optimizer,
        scheduler,
        train_loader,
        valid_loader,
    )

    torch.save(
        {
            'rnn': None,
            'cnn': None,
            'bert': model.state_dict(),
            'config': config,
            'vocab': None,
            'classes': index_to_label,
            'tokenizer': tokenizer,
        }, config.model_fn)
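
# A minimal sketch of loading the checkpoint saved above for inference. The
# key names ('bert', 'config', 'classes', 'tokenizer') follow the torch.save()
# call in main(); the helper name, device handling, and argmax decoding are
# illustrative assumptions, not part of the original script.
def load_for_inference(model_fn, device='cpu'):
    saved = torch.load(model_fn, map_location=device)
    tokenizer = saved['tokenizer']
    index_to_label = saved['classes']

    model = AutoModelForSequenceClassification.from_pretrained(
        saved['config'].pretrained_model_name,
        num_labels=len(index_to_label),
    )
    model.load_state_dict(saved['bert'])
    model.eval()

    def predict(text):
        x = tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            logits = model(**x).logits
        return index_to_label[int(logits.argmax(dim=-1))]

    return predict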
def main(args):
    device = torch.device(args.device)
    if args.seed is not None:
        random_seed(args.seed)

    tokenizer = AutoTokenizer.from_pretrained(args.transformers_path)
    non_modified_data = json.load(Path(args.non_modified_data).open('r'))
    result = dict()
    paths = Path(args.data_folder).glob('*.json')

    for path in paths:
        print('=' * 50)
        method, bpt = str(path).split('/')[2].split('.')[0].split('_')
        bpt = int(bpt)
        print(f"method: {method}, beats_per_token: {bpt}")

        if bpt not in result.keys():
            result[bpt] = dict()
        result[bpt][method] = {
            'acc': list(),
            'roc_auc': list(),
            'f1score': list(),
        }

        for j in range(args.n_splits):
            print(f"{j + 1} SPLIT OUT OF {args.n_splits}")
            # random.randint() requires integer bounds.
            seed = random.randint(0, int(10e6))
            modified_data = json.load(Path(path).open('r'))
            train_data, test_data = get_train_test_data(
                random.sample(non_modified_data,
                              args.non_modified_data_sample_size),
                modified_data,
                test_size=args.test_size,
                random_state=seed)

            transform = CustomTransform(tokenizer, max_len=100)
            train_dataset = CustomDataset(train_data[0],
                                          train_data[1],
                                          transform=transform)
            test_dataset = CustomDataset(test_data[0],
                                         test_data[1],
                                         transform=transform)
            batcher = {
                'train': DataLoader(train_dataset,
                                    batch_size=args.batch_size,
                                    shuffle=True),
                'dev': DataLoader(test_dataset, batch_size=args.batch_size),
            }

            config = AutoConfig.from_pretrained(args.transformers_path,
                                                num_labels=2)
            model = AutoModelForSequenceClassification.from_pretrained(
                args.transformers_path, config=config).to(device)

            train(model, batcher, args)
            checkpoint = torch.load(args.checkpoint_path)
            model.load_state_dict(checkpoint['model_state_dict'])

            current_res = evaluate(model, batcher, args)
            result[bpt][method]['acc'].append(current_res['acc'])
            result[bpt][method]['roc_auc'].append(current_res['roc_auc'])
            result[bpt][method]['f1score'].append(current_res['f1score'])

            json.dump(result, Path(args.result_path).open('w'))
            del model, checkpoint, train_data, test_data, \
                train_dataset, test_dataset, batcher
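
# A hedged sketch of summarizing the nested `result` dict produced above
# (result[bpt][method][metric] -> list of per-split scores) into mean/std.
# The helper name and the use of the statistics module are illustrative
# assumptions.
import statistics

def summarize(result):
    for bpt, methods in sorted(result.items()):
        for method, metrics in methods.items():
            for name, scores in metrics.items():
                mean = statistics.mean(scores)
                std = statistics.stdev(scores) if len(scores) > 1 else 0.0
                print(f"bpt={bpt} {method} {name}: {mean:.4f} ± {std:.4f}")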
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and
    # evaluation files (see below) or specify a GLUE benchmark task (the
    # dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called
    # 'label' and as pair of sentences the sentences in columns called
    # 'sentence1' and 'sentence2' if such columns exist, or the first two
    # columns not named 'label' if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does
    # single sentence classification on this single column. You can easily
    # tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only
    # one local process can concurrently download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    elif data_args.train_file.endswith(".csv"):
        # Loading a dataset from local csv files
        datasets = load_dataset("csv",
                                data_files={
                                    "train": data_args.train_file,
                                    "validation": data_args.validation_file
                                })
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json",
                                data_files={
                                    "train": data_args.train_file,
                                    "validation": data_args.validation_file
                                })
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your
        # needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that
    # only one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to
        # tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
        max_length = data_args.max_seq_length
    else:
        # We will pad later, dynamically at batch creation, to the max
        # sequence length in each batch.
        padding = False
        max_length = None

    # Some models have set the order of the labels to use, so let's make sure
    # we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, "
                f"dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.")
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_length,
                           truncation=True)
        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here
    # and remove the special branch from compute_metrics.

    # You can define your custom compute_metrics function. It takes an
    # `EvalPrediction` object (a namedtuple with a predictions and label_ids
    # field) and has to return a dictionary mapping string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds,
                                    references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # Data collator defaults to DataCollatorWithPadding, so we change it
        # if we already did the padding.
        data_collator=default_data_collator
        if data_args.pad_to_max_length else None,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        metrics = train_result.metrics

        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the
            # tokenizer with the model.
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in sorted(eval_result.items()):
                        logger.info(f"  {key} = {value}")
                        writer.write(f"{key} = {value}\n")

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` column because it contains -1 and Trainer
            # won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
    return eval_results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup distant debugging if needed
    if data_args.server_ip and data_args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(data_args.server_ip,
                                     data_args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(
        logging.INFO if training_args.should_log else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train \
            and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(
                os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # In distributed training, the load_dataset function guarantees that only
    # one local process can concurrently download the dataset.

    # Downloading and loading the xnli dataset from the hub.
    if training_args.do_train:
        if model_args.train_language is None:
            train_dataset = load_dataset("xnli",
                                         model_args.language,
                                         split="train",
                                         cache_dir=model_args.cache_dir)
        else:
            train_dataset = load_dataset("xnli",
                                         model_args.train_language,
                                         split="train",
                                         cache_dir=model_args.cache_dir)
        label_list = train_dataset.features["label"].names

    if training_args.do_eval:
        eval_dataset = load_dataset("xnli",
                                    model_args.language,
                                    split="validation",
                                    cache_dir=model_args.cache_dir)
        label_list = eval_dataset.features["label"].names

    if training_args.do_predict:
        predict_dataset = load_dataset("xnli",
                                       model_args.language,
                                       split="test",
                                       cache_dir=model_args.cache_dir)
        label_list = predict_dataset.features["label"].names

    # Labels
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that
    # only one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task="xnli",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max
        # sequence length in each batch.
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        return tokenizer(
            examples["premise"],
            examples["hypothesis"],
            padding=padding,
            max_length=data_args.max_seq_length,
            truncation=True,
        )

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(
                f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(
                range(data_args.max_predict_samples))
        predict_dataset = predict_dataset.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on prediction dataset",
        )

    # Get the metric function
    metric = load_metric("xnli")

    # You can define your custom compute_metrics function. It takes an
    # `EvalPrediction` object (a namedtuple with a predictions and label_ids
    # field) and has to return a dictionary mapping string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=p.label_ids)

    # Data collator defaults to DataCollatorWithPadding, so we change it if
    # we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = (data_args.max_eval_samples
                            if data_args.max_eval_samples is not None else
                            len(eval_dataset))
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(
            predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (data_args.max_predict_samples
                               if data_args.max_predict_samples is not None
                               else len(predict_dataset))
        metrics["predict_samples"] = min(max_predict_samples,
                                         len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        predictions = np.argmax(predictions, axis=1)
        output_predict_file = os.path.join(training_args.output_dir,
                                           "predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predictions):
                    item = label_list[item]
                    writer.write(f"{index}\t{item}\n")
def main():
    try:
        from gpiozero import LED
        led = LED(12)
    except ImportError:
        print('GPIO Not Found')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-l', '--local',
        action='store_true',
        help='Start in local mode given you have a tweet server')
    parser.add_argument('--host',
                        type=str,
                        help='Hostname of the tweet server',
                        default='localhost')
    parser.add_argument('-p', '--port',
                        type=str,
                        help='Port of the tweet server',
                        default='5000')
    args = parser.parse_args()
    local = args.local
    host = args.host
    port = args.port

    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        'finetuned_model')
    tokenizer = AutoTokenizer.from_pretrained('finetuned_model')
    print(model)
    print('Model Loaded!')

    if local:
        while True:
            screen_clear()
            r = requests.get(f'http://{host}:{port}').json()
            pred, text, masked, time_elapsed = classify_text(
                model, tokenizer, r.get('text'))
            print_centre(text)
            if pred == 1:
                task = threading.Thread(target=alert)
                task.start()
            time.sleep(10)
    else:
        headers = initialize_stream_header()
        with requests.get(
                "https://api.twitter.com/2/tweets/search/stream",
                headers=headers,
                stream=True,
        ) as response:
            sys.stdout.flush()
            if response.status_code != 200:
                raise Exception("Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text))
            for response_line in response.iter_lines():
                if response_line:
                    screen_clear()
                    json_response = json.loads(response_line)
                    pred, text, masked, time_elapsed = classify_text(
                        model, tokenizer, json_response['data']['text'])
                    print_centre(text)
                    if pred == 1:
                        task = threading.Thread(target=alert)
                        task.start()
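
# classify_text() is referenced above but not defined in this snippet. A
# hedged sketch of what it might look like, assuming a binary classifier and
# matching the (pred, text, masked, time_elapsed) return signature at the
# call sites; the masking step is an illustrative placeholder.
import time
import torch

def classify_text(model, tokenizer, text):
    start = time.time()
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = int(logits.argmax(dim=-1))
    masked = text  # placeholder: the real code may redact sensitive spans
    return pred, text, masked, time.time() - start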
def train_discriminator(
    run_name: str,
    model_path: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    dataset_type: str,
    output_model_path: str,
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [
            r for r in tqdm.tqdm(tg_reader(train_file, agency_list))
        ]
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria':
        lenta_records = [
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file,
                                 'lenta/lenta-ru-news.train.csv')))
        ]
        lenta_records.extend([
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))
        ])

        ria_records = [
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.train.json')))
        ]
        ria_records.extend([
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.val.json')))
        ])

        records = [
            r for r in reader(
                '/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')
        ]
        filter_lenta = [{
            'text': r['lenta_text'],
            'title': r['lenta_title'],
            'agency': 'lenta.ru',
            'date': r['lenta_date'],
        } for r in records]
        filter_ria = [{
            'text': r['ria_text'],
            'title': r['ria_title'],
            'agency': 'РИА Новости',
            'date': r['lenta_date'],
        } for r in records]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [
            r for r in lenta_records if r['title'] not in lenta_filter_titles
        ]
        ria_records = [
            r for r in ria_records if r['title'] not in ria_filter_titles
        ]

        random.shuffle(ria_records)
        lenta_records = [
            r for r in lenta_records
            if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']
        ]

        all_records = lenta_records + ria_records[:len(lenta_records)]
        random.shuffle(all_records)
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria-clusters':
        full_dataset = LentaRiaDatasetClassification(train_file, tokenizer,
                                                     agency_list,
                                                     max_tokens_text,
                                                     max_tokens_title)

    print("Building datasets...")
    train_size = int(train_fraq * len(full_dataset))
    test_size = int((1 - train_fraq) * 0.5 * len(full_dataset))
    train_dataset, test_dataset, eval_dataset = torch.utils.data.random_split(
        full_dataset,
        [train_size, test_size,
         len(full_dataset) - train_size - test_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(eval_dataset),
        'Test dataset size': len(test_dataset),
    })

    print("Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=len(agency_list))

    print("Training model...")
    batch_size = config["batch_size"]
    logging_steps = config["logging_steps"]
    save_steps = config["save_steps"]
    eval_steps = config["eval_steps"]
    warmup_steps = config["num_warmup_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]
    max_steps = config["max_steps"]
    lr = config["learning_rate"]

    training_args = TrainingArguments(
        output_dir=output_model_path,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        learning_rate=lr,
        warmup_steps=warmup_steps,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        save_total_limit=1,
        weight_decay=0.01,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    wandb.summary.update(
        {'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)})
    model.save_pretrained(output_model_path)
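
# compute_metrics is passed to the Trainer above but not defined in this
# snippet. A hedged sketch, assuming plain accuracy over the agency labels;
# the metric actually used in the original run is unknown.
import numpy as np

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'accuracy': float((preds == p.label_ids).mean())}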
def load_from_config(self):
    setattr(self.config, 'num_labels', self.num_labels)
    # from_config() builds a freshly initialized model from the config alone;
    # unlike from_pretrained(), it does not load pretrained weights.
    self.transformer = AutoModelForSequenceClassification.from_config(
        self.config)
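
# A hedged usage sketch for load_from_config(). The host class below is an
# illustrative assumption; it only demonstrates that the method expects a
# `config` and `num_labels` attribute on `self` and leaves the resulting
# model randomly initialized.
from transformers import AutoConfig

class Wrapper:
    def __init__(self, model_name, num_labels):
        self.config = AutoConfig.from_pretrained(model_name)
        self.num_labels = num_labels

    load_from_config = load_from_config  # reuse the function defined above

w = Wrapper('bert-base-uncased', num_labels=4)
w.load_from_config()
assert w.transformer.config.num_labels == 4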
def main():
    parser = HfArgumentParser(
        (DataTrainingArguments, TeacherModelArguments, StudentModelArguments,
         DistillTrainingArguments),
        description=DESCRIPTION,
    )

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        data_args, teacher_args, student_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        data_args, teacher_args, student_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train \
            and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(
                os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank)
                    else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        utils.logging.set_verbosity_info()
        utils.logging.enable_default_handler()
        utils.logging.enable_explicit_format()

    if training_args.local_rank != -1:
        raise ValueError("Distributed training is not currently supported.")
    if training_args.tpu_num_cores is not None:
        raise ValueError("TPU acceleration is not currently supported.")

    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 1. read in data
    examples = read_lines(data_args.data_file)
    class_names = read_lines(data_args.class_names_file)

    # 2. get teacher predictions and load into dataset
    logger.info("Generating predictions from zero-shot teacher model")
    teacher_soft_preds = get_teacher_predictions(
        teacher_args.teacher_name_or_path,
        examples,
        class_names,
        teacher_args.hypothesis_template,
        teacher_args.teacher_batch_size,
        teacher_args.temperature,
        teacher_args.multi_class,
        data_args.use_fast_tokenizer,
        training_args.no_cuda,
        training_args.fp16,
    )
    dataset = Dataset.from_dict({
        "text": examples,
        "labels": teacher_soft_preds,
    })

    # 3. create student
    logger.info("Initializing student model")
    model = AutoModelForSequenceClassification.from_pretrained(
        student_args.student_name_or_path, num_labels=len(class_names))
    tokenizer = AutoTokenizer.from_pretrained(
        student_args.student_name_or_path,
        use_fast=data_args.use_fast_tokenizer)
    model.config.id2label = {i: label for i, label in enumerate(class_names)}
    model.config.label2id = {label: i for i, label in enumerate(class_names)}

    # 4. train student on teacher predictions
    dataset = dataset.map(tokenizer, input_columns="text")
    dataset.set_format("torch")

    def compute_metrics(p, return_outputs=False):
        preds = p.predictions.argmax(-1)
        # "label_ids" are actually soft teacher distributions here.
        proxy_labels = p.label_ids.argmax(-1)
        return {"agreement": (preds == proxy_labels).mean().item()}

    trainer = DistillationTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=dataset,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training student model on teacher predictions")
        trainer.train()

    if training_args.do_eval:
        agreement = trainer.evaluate(eval_dataset=dataset)["eval_agreement"]
        logger.info(
            f"Agreement of student and teacher predictions: {agreement * 100:0.2f}%"
        )

    trainer.save_model()
        # (fragment: tail of a ConcatDataset-style wrapper class; a sketch of
        # the full class follows this snippet)
        return max(len(d) for d in self.datasets)


trainset = FeverLabelPredictionDataset(args.train)
devset = FeverLabelPredictionDataset(args.dev)
if args.batch_size_unsup_ratio:
    unsupset = FeverLabelPredictionDataset_UDA(args.data_uda, is_aug=False)  # [int(len(trainset)*.7):]
    augset = FeverLabelPredictionDataset_UDA(args.data_uda, is_aug=True)  # [int(len(trainset)*.7):]
    assert len(unsupset) == len(augset)
    concatset = ConcatDataset(unsupset, augset)
    batch_size_unsup = int(args.batch_size_gpu * args.batch_size_unsup_ratio)

tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model, num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model, config=config).to(device)
optimizer = torch.optim.Adam([
    # If you are using non-roberta based models, change this to point to the
    # right base.
    {'params': model.roberta.parameters(), 'lr': args.lr_base},
    {'params': model.classifier.parameters(), 'lr': args.lr_linear},
])
scheduler = get_cosine_schedule_with_warmup(optimizer, 0, 20)


def encode(claims: List[str], rationale: List[str]):
    # batch_encode_plus expects a list of text pairs, so materialize the zip.
    encoded_dict = tokenizer.batch_encode_plus(
        list(zip(rationale, claims)),
        pad_to_max_length=True,
        return_tensors='pt')
    if encoded_dict['input_ids'].size(1) > 512:
        # Too long for the model. Truncate it
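
# The `return max(...)` fragment above is the tail of a dataset wrapper. A
# minimal sketch of what that ConcatDataset might look like, assuming it
# pairs the unsupervised and augmented sets sample-by-sample; the
# __getitem__ wrap-around behavior is an illustrative assumption.
import torch

class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        # Wrap around shorter datasets so every index is valid.
        return tuple(d[i % len(d)] for d in self.datasets)

    def __len__(self):
        return max(len(d) for d in self.datasets)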
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import nlpaug.augmenter.char as nac
import json
import pandas as pd

# Load model
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english")
inference_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english")
model = pipeline("sentiment-analysis",
                 model=inference_model,
                 tokenizer=tokenizer)

# Define text perturbation
aug = nac.KeyboardAug(aug_word_max=1)  # Insert realistic keystroke errors


def typo(input):
    output = aug.augment(input)
    return output


def eval_perturb(input_a, input_b):
    output_a, output_b = model([input_a, input_b])
    sq_error = (output_a["score"] - output_b["score"])**2
    acc = output_a["label"] == output_b["label"]
    return (sq_error, acc, output_b["score"])


# Read in our test dataset
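
# A hedged usage example for typo() and eval_perturb(): perturb one sentence
# and compare the pipeline's verdict on the original vs. the corrupted copy.
# The sample sentence is illustrative; newer nlpaug versions return a list
# from augment(), so unwrap defensively.
sentence = "The movie was surprisingly good."
corrupted = typo(sentence)
if isinstance(corrupted, list):
    corrupted = corrupted[0]
sq_error, same_label, perturbed_score = eval_perturb(sentence, corrupted)
print(f"perturbed: {corrupted!r}")
print(f"squared score gap: {sq_error:.4f}, label preserved: {same_label}")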
def train_func(config: Dict[str, Any]):
    # Accelerator reads from these environment variables for GPU placement.
    os.environ["LOCAL_RANK"] = str(ray.train.local_rank())
    os.environ["WORLD_SIZE"] = str(ray.train.world_size())

    args = config["args"]

    # Initialize the accelerator. We will let the accelerator handle device
    # placement for us in this example.
    accelerator = Accelerator(cpu=not args.use_gpu)

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging: we only want one process per machine to log things on
    # the screen. accelerator.is_local_main_process is only True for one
    # process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and
    # evaluation files (see below) or specify a GLUE benchmark task (the
    # dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use as labels the column called
    # 'label' and as pair of sentences the sentences in columns called
    # 'sentence1' and 'sentence2' if such columns exist, or the first two
    # columns not named 'label' if at least two columns are provided.
    # If the CSVs/JSONs contain only one non-label column, the script does
    # single sentence classification on this single column. You can easily
    # tweak this behavior (see below).
    # In distributed training, the load_dataset function guarantees that only
    # one local process can concurrently download the dataset.
    if args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset("glue", args.task_name)
    else:
        # Loading the dataset from local csv or json file.
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        # Infer the loader type from the file extension.
        extension = (args.train_file if args.train_file is not None else
                     args.validation_file).split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if args.task_name is not None:
        is_regression = args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your
        # needs.
        is_regression = raw_datasets["train"].features["label"].dtype in [
            "float32",
            "float64",
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique  # noqa:E501
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that
    # only one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )

    # Preprocessing the datasets
    if args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to
        # tweak to your use case.
        non_label_column_names = [
            name for name in raw_datasets["train"].column_names
            if name != "label"
        ]
        if ("sentence1" in non_label_column_names
                and "sentence2" in non_label_column_names):
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Some models have set the order of the labels to use, so let's make sure
    # we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(  # noqa:C413
                sorted(label_list)):  # noqa:C413
            logger.info(
                f"The configuration of the model provided the following label "
                f"correspondence: {label_name_to_id}. Using it!")
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, "
                "but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, "  # noqa:C413,E501
                f"dataset labels: {list(sorted(label_list))}."  # noqa:C413
                "\nIgnoring the model labels as a result.")
    elif args.task_name is None:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {
            id: label
            for label, id in config.label2id.items()
        }

    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Tokenize the texts
        texts = ((examples[sentence1_key], ) if sentence2_key is None else
                 (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*texts,
                           padding=padding,
                           max_length=args.max_length,
                           truncation=True)

        if "label" in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks)
                result["labels"] = [
                    label_to_id[l] for l in examples["label"]  # noqa:E741
                ]
            else:
                # In all cases, rename the column to labels because the model
                # will expect that.
                result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name
                                      == "mnli" else "validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data
        # collator that will just convert everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for
        # us (by padding to the maximum length of the samples passed). When
        # using mixed precision, we add `pad_to_multiple_of=8` to pad all
        # tensors to multiples of 8, which enables the use of Tensor Cores on
        # NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=data_collator,
        batch_size=args.per_device_train_batch_size,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=data_collator,
        batch_size=args.per_device_eval_batch_size,
    )

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its
    # length below (because its length will be shorter in multiprocess).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Get the metric function
    if args.task_name is not None:
        metric = load_metric("glue", args.task_name)
    else:
        metric = load_metric("accuracy")

    # Train!
    total_batch_size = (args.per_device_train_batch_size *
                        accelerator.num_processes *
                        args.gradient_accumulation_steps)

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device ="
                f" {args.per_device_train_batch_size}")
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) "
        f"= {total_batch_size}")
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if (step % args.gradient_accumulation_steps == 0
                    or step == len(train_dataloader) - 1):
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = (outputs.logits.argmax(dim=-1)
                           if not is_regression else outputs.logits.squeeze())
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)

    if args.task_name == "mnli":
        # Final evaluation on mismatched validation set
        eval_dataset = processed_datasets["validation_mismatched"]
        eval_dataloader = DataLoader(
            eval_dataset,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size,
        )
        eval_dataloader = accelerator.prepare(eval_dataloader)

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"mnli-mm: {eval_metric}")
def main():
    parser = argparse.ArgumentParser()
    # For evaluating on a paper. Specify the split (e.g. train/valid/test).
    parser.add_argument("--infer-paper", type=str, default=None)
    # For evaluating data (to generate contrib_indices).
    parser.add_argument("--infer-jsonl", type=str, default=None)
    # For evaluating system outputs.
    parser.add_argument("--decode-type", type=str, default="beam")
    parser.add_argument(
        "--decode-results",
        type=Path,
        nargs="+",
        help="Paths to evaluation experiment directories.",
        default=None,
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["contrib", "other"],
        help="Side to check purity scores.",
    )

    # Required parameters
    parser.add_argument("--logdir", type=str, required=True)
    parser.add_argument(
        "--data_dir",
        default=None,
        type=Path,
        required=True,
    )
    parser.add_argument(
        "--model_name_or_path",
        default="allenai/scibert_scivocab_cased",
        type=str,
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--tokenizer_name",
        default="allenai/scibert_scivocab_cased",
        type=str,
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=5.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X update steps.")
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X update steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()

    config = AutoConfig.from_pretrained(args.model_name_or_path)
    args.model_type = config.model_type
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        config=config,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use default names for the model, you can
    # reload it using from_pretrained().
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using
        # `save_pretrained()`. They can then be reloaded using
        # `from_pretrained()`.
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the
        # trained model.
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForSequenceClassification.from_pretrained(
            args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.decode_results is not None:
            checkpoint = args.output_dir
            prefix = (checkpoint.split("/")[-1]
                      if checkpoint.find("checkpoint") != -1 else "")
            model = AutoModelForSequenceClassification.from_pretrained(
                checkpoint)
            model.to(args.device)
            inference_on_summary_outputs(args, model, tokenizer,
                                         prefix=prefix)
        elif args.infer_jsonl is not None:
            # No checkpoint-specific prefix applies in this branch.
            inference(args, model, tokenizer, prefix="")
        elif args.infer_paper is not None:
            inference_on_paper_text(args, model, tokenizer, prefix="")
        else:
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c) for c in sorted(
                        glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                                  recursive=True)))
                logging.getLogger("transformers.modeling_utils").setLevel(
                    logging.WARN)  # Reduce logging
            logger.info("Evaluate the following checkpoints: %s", checkpoints)
            for checkpoint in checkpoints:
                global_step = checkpoint.split(
                    "-")[-1] if len(checkpoints) > 1 else ""
                prefix = (checkpoint.split("/")[-1]
                          if checkpoint.find("checkpoint") != -1 else "")

                model = AutoModelForSequenceClassification.from_pretrained(
                    checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + "_{}".format(global_step), v)
                              for k, v in result.items())
                results.update(result)

    return results
def load_model():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2)
    # `use_fast` (not `fast`) is the keyword that selects the Rust-backed tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    # With no TrainingArguments passed, Trainer falls back to its defaults.
    return Trainer(model=model), tokenizer
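# If explicit hyperparameters are wanted instead of Trainer's defaults, a
# TrainingArguments object can be passed alongside the model. A minimal
# sketch; the output directory and hyperparameter values are illustrative
# assumptions, not part of the original script.
from transformers import TrainingArguments

def load_model_with_args():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    training_args = TrainingArguments(
        output_dir="out",                # assumed path
        per_device_train_batch_size=16,  # assumed value
        num_train_epochs=3,              # assumed value
    )
    return Trainer(model=model, args=training_args), tokenizer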
warmup_steps = 1000 // batch_size # Load k-fold Data labels, commits, _ = load_cross_validation_split(use_filtered) # Cross Validation acc_list, prec_list, recall_list = [], [], [] th_accs_list, th_precs_list, th_recalls_list = [], [], [] prob_dict = {} for fold_idx in range(n_fold): print("[Fold {}]".format(fold_idx + 1), end=" ") prob_dict["fold_{}".format(fold_idx + 1)] = {} # Init Tokenizer & Model tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=n_class) num_params = 0 for param in model.parameters(): num_params += param.numel() print("model size", num_params, end=" ") model.to(device) # Get Train/Eval Split label_eval, commit_eval = labels[fold_idx], commits[fold_idx] label_train, commit_train = [], [] for idx in range(n_fold): if idx != fold_idx: label_train += labels[idx] commit_train += commits[idx] assert len(label_eval) == len(commit_eval) assert len(label_train) == len(commit_train)
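# A minimal sketch of how a fold's train/eval split might be tokenized before
# fine-tuning; the truncation length is an illustrative assumption, and
# `commit_train`/`commit_eval` are assumed to be lists of strings.
train_encodings = tokenizer(commit_train, truncation=True, padding=True,
                            max_length=128)
eval_encodings = tokenizer(commit_eval, truncation=True, padding=True,
                           max_length=128)
# The encodings and `label_train`/`label_eval` can then be wrapped in a
# torch Dataset and fed to this fold's training loop.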
    return tokenizer(examples[sentence1_key], examples[sentence2_key],
                     truncation=True)

# %%
preprocess_function(dataset['train'][:5])

# %%
encoded_dataset = dataset.map(preprocess_function, batched=True)

# %%
# finetuning the model
from transformers import AutoModelForSequenceClassification, \
    TrainingArguments, Trainer

num_labels = 3 if task.startswith("mnli") else 1 if task == "stsb" else 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=num_labels)

# %%
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

args = TrainingArguments(
    "test-glue",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)
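# %%
# The arguments above are typically handed to a Trainer together with the
# encoded splits. A minimal sketch, assuming the validation split for this
# task is named "validation" and that `load_metric` supplies the task metric;
# both are assumptions about this notebook's setup.
import numpy as np
from datasets import load_metric

metric = load_metric('glue', task)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:  # stsb is a regression task
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],  # split name varies by task (assumption)
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()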
def evaluate_style_gen_title( existing_run_name: str, existing_run_id: str, config_file: str, gen_model_file: str, discr_model_file: str, test_file: str, test_sample_rate: float, ): logging.set_verbosity_info() init_wandb(existing_run_name, None, existing_run_id) config = json.loads(jsonnet_evaluate_file(config_file)) tokenizer_model_path = config["tokenizer_model_path"] tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False) max_tokens_text = config["max_tokens_text"] max_tokens_title = config["max_tokens_title"] setattr(tokenizer, 'max_tokens_text', max_tokens_text) batch_size = config["batch_size"] print("Loading model...") model = EncoderDecoderModel.from_pretrained(gen_model_file) model.eval() model.cuda() agency_list = config['agency_list'] discriminator = AutoModelForSequenceClassification.from_pretrained(discr_model_file, num_labels=len(agency_list)).cuda() print("Fetching TG data...") test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) if random.random() <= test_sample_rate] print("Building datasets...") agency_to_special_token_id = { a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list) } agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))} test_dataset = AgencyTitleDatasetGeneration( test_records, tokenizer, filter_agencies=list(agency_to_special_token_id.keys()), agency_to_special_token_id=agency_to_special_token_id, max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title ) print('Dataset size:', len(test_dataset)) y_pred = [] y_true = [] for i in tqdm.trange(0, len(test_dataset), batch_size): data = test_dataset[i] for k in tuple(data.keys()): if k not in ('input_ids', 'attention_mask'): del data[k] else: data[k] = data[k].unsqueeze(0) for j in range(i + 1, min(i + batch_size, len(test_dataset))): for k in data.keys(): data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0) y_true.extend([ agency_to_target[test_dataset.get_strings(j)['agency']] for j in range(i, min(i + batch_size, len(test_dataset)))]) data['input_ids'] = data['input_ids'].cuda() data['attention_mask'] = data['attention_mask'].cuda() output_ids = model.generate( **data, decoder_start_token_id=model.config.decoder.pad_token_id, min_length=7, max_length=20, num_beams=6 ) preds = [ tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True) for x in output_ids ] for title in preds: inp = tokenizer(title, add_special_tokens=True, max_length=max_tokens_title, padding='max_length', truncation=True ) logits = discriminator(input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0), attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0] y_pred.append(torch.argmax(logits).item()) wandb.summary.update({ 'D-Style': classification_report(y_true, y_pred, output_dict=True) })
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels)
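# `model_init` exists so Trainer can rebuild a fresh, identically initialized
# model for every run, which is what hyperparameter search needs. A minimal
# sketch of that usage; the Trainer arguments and trial count are illustrative
# assumptions, and a search backend (optuna or Ray Tune) must be installed.
trainer = Trainer(
    model_init=model_init,        # called once per trial
    args=training_args,           # assumed TrainingArguments from the setup above
    train_dataset=train_dataset,  # assumed encoded splits
    eval_dataset=eval_dataset,
)
best_run = trainer.hyperparameter_search(direction="maximize", n_trials=10)
print(best_run.hyperparameters)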
nlp = pipeline("sentiment-analysis")
result = nlp("I love you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

## Sequence classification: paraphrases of each other?
# 1. instantiate a tokenizer and a model from the checkpoint name
# 2. build a sequence from the two sentences
# 3. pass this sequence through the model (0: not paraphrase, 1: is a paraphrase)
# 4. compute the softmax and get probabilities over the classes
# 5. print the result
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased-finetuned-mrpc")

classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace headquarters are situated in Manhattan"

# The tokenizer takes a sentence *pair*, not three positional sequences.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Steps 4-5: softmax over the logits, then print per-class probabilities.
paraphrase_probs = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_probs = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
for i, cls in enumerate(classes):
    print(f"{cls}: {round(paraphrase_probs[i] * 100)}% (sequence_0 vs sequence_2)")
    print(f"{cls}: {round(not_paraphrase_probs[i] * 100)}% (sequence_0 vs sequence_1)")
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help= "Pretrained config name or path if not the same as model_name_or_path", ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "Pretrained tokenizer name or path if not the same as model_name_or_path", ) parser.add_argument( "--cache_dir", default=None, type=str, help= "Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") parser.add_argument("--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers") parser.add_argument( "--dont_normalize_global_importance", action="store_true", help="Don't normalize all importance scores between 0 and 1", ) parser.add_argument( "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy.") parser.add_argument( "--masking_threshold", default=0.9, type=float, help= "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).", ) parser.add_argument( "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.") parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, sequences shorter padded.", ) parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 torch.distributed.init_process_group( backend="nccl") # Initializes the distributed backend # Setup logging logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed: {}".format( args.device, args.n_gpu, bool(args.local_rank != -1))) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Set seeds set_seed(args.seed) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in glue_processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = glue_processors[args.task_name]() args.output_mode = glue_output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
        cache_dir=args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir,
    )

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = nn.DataParallel(model)

    # Print/save training arguments
    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
    eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev")
    if args.data_subset > 0:
        eval_dataset = Subset(
            eval_dataset,
            list(range(min(args.data_subset, len(eval_dataset)))))
    eval_sampler = SequentialSampler(
        eval_dataset) if args.local_rank == -1 else DistributedSampler(
            eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size,
                                 collate_fn=default_data_collator)

    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)

    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
        head_mask = mask_heads(args, model, eval_dataloader)
        prune_heads(args, model, eval_dataloader, head_mask)
"dbpedia_14": {"keys": ("text", None), "num_classes": 14, "task_type": "topic"}, "yahoo_answers_topics": {"keys": ("text", None), "num_classes": 10, "task_type": "topic"}, "imdb": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"}, "amazon_polarity": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"}, "yelp_polarity": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"} } sentence1_key, sentence2_key = task_to_keys[task]["keys"] num_classes = task_to_keys[task]["num_classes"] task_type = task_to_keys[task]["task_type"] ############################################################# ## Model + Tokenizer ######################################## ############################################################# checkpoint = save_dir + MODEL_NAME + '-' + task + '-' + t + '-' + str(num_train_per_class) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes).to(device) ############################################################# ## Dataset Preparation ###################################### ############################################################# if "Ada-" in t: train_data_path = os.path.join(data_dir, task, 'ORIG', task + '_train_' + str(num_train_per_class)) else: train_data_path = os.path.join(data_dir, task, t, task + '_train_' + str(num_train_per_class)) valid_data_path = os.path.join(data_dir, task, 'ORIG', task + '_valid_' + str(num_valid_per_class)) train_dataset = load_from_disk(train_data_path).shuffle() eval_dataset = load_from_disk(valid_data_path) test_dataset = load_dataset(task, split='test')
classifier = pipeline('sentiment-analysis')
results = classifier(["We are very happy to show you the 🤗 Transformers library.",
                      "We hope you don't hate it."])
for result in results:
    print(f"label:{result['label']},with score:{round(result['score'], 4)}")
'''
The output is:
label:POSITIVE,with score:0.9998
label:NEGATIVE,with score:0.5309
'''

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
# When a model instance (rather than a name) is passed, the matching
# tokenizer must be passed as well.
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
results = classifier(["We are very happy to show you the 🤗 Transformers library.",
                      "We hope you don't hate it."])
for result in results:
    print(f"label:{result['label']},with score:{round(result['score'], 4)}")
'''
With the model specified, the output is:
'''
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = GlueDataset( data_args, tokenizer=tokenizer) if training_args.do_train else None eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") if training_args.do_eval else None test_dataset = GlueDataset( data_args, tokenizer=tokenizer, mode="test") if training_args.do_predict else None def compute_metrics(p: EvalPrediction) -> Dict: if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(data_args.task_name, preds, p.label_ids) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev")) for eval_dataset in eval_datasets: eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test")) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions if output_mode == "classification": predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) writer.write("index\tprediction\n") for index, item in enumerate(predictions): if output_mode == "regression": writer.write("%d\t%3.3f\n" % (index, item)) else: item = test_dataset.get_labels()[item] writer.write("%d\t%s\n" % (index, item)) return eval_results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. os.environ['CUDA_VISIBLE_DEVICES'] = '0' parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) # elif len(sys.argv)==1: # parse from local dict # model_args, data_args, training_args = parser.parse_dict(args_dict) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # easy way to modify args, could insert for loop here to do hyperparam search (trainer does it too) # training_args.model_name_or_path = "bert-base-uncased" if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) from transformers import AutoModel print('\nsequence classification model params') # print(model.named_parameters()) for name, param in model.named_parameters(): if "classifier" in name: print(name) model_testing = AutoModel.from_pretrained("google/mobilebert-uncased") print('\nmlm model params') for name, param in model_testing.named_parameters(): if "classifier" in name: print(name) # Get datasets train_dataset = (GlueDataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir) if training_args.do_eval else None) test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir) if training_args.do_predict else None) def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions if output_mode == "classification": preds = np.argmax(preds, axis=1) else: # regression preds = np.squeeze(preds) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn # logdir = training_args.output_dir +'/'+ datetime.now().strftime("%Y%m%d-%H%M%S") # writer = SummaryWriter(log_dir=logdir) # Initialize our Trainer trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn( data_args.task_name) # tb_writer=writer ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") eval_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)) for eval_dataset in eval_datasets: trainer.compute_metrics = build_compute_metrics_fn( eval_dataset.args.task_name) then = time.time() eval_result = trainer.evaluate(eval_dataset=eval_dataset) elapsed = time.time() - then print("Eval took {} seconds".format(elapsed)) print("throughput: {} inf/sec".format(len(eval_dataset) / elapsed)) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format( eval_dataset.args.task_name)) for key, value in eval_result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) eval_results.update(eval_result) if training_args.do_predict: logging.info("*** Test ***") test_datasets = [test_dataset] if data_args.task_name == "mnli": mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") test_datasets.append( GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)) for test_dataset in test_datasets: predictions = trainer.predict( test_dataset=test_dataset).predictions if output_mode == "classification": predictions = np.argmax(predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt") if trainer.is_world_master(): with open(output_test_file, "w") as writer: logger.info("***** Test results {} *****".format( test_dataset.args.task_name)) writer.write("index\tprediction\n") for index, item in enumerate(predictions): if output_mode == "regression": writer.write("%d\t%3.3f\n" % (index, item)) else: item = test_dataset.get_labels()[item] writer.write("%d\t%s\n" % (index, item)) return eval_results
def test(sarc_percentage):
    # BERT-base and RoBERTa-base models fine-tuned on sentiment analysis datasets (SST-2 and IMDB).
    models = [
        'textattack/bert-base-uncased-SST-2',
        'textattack/roberta-base-SST-2',
        'textattack/bert-base-uncased-imdb',
        'textattack/roberta-base-imdb',
    ]
    # Datasets - GEN, HYP, RQ, SemEval (sarcasm only) separately.
    gen_sarc = pd.read_csv('/scratch/ec2684/GEN-sarc-notsarc.csv')
    hyp_sarc = pd.read_csv('/scratch/ec2684/HYP-sarc-notsarc.csv')
    rg_sarc = pd.read_csv('/scratch/ec2684/RQ-sarc-notsarc.csv')
    sem_eval = pd.read_csv('/scratch/ec2684/SemEval2018-T3-train-taskA.csv')
    # A csv file that contains the prediction results of testing the above
    # four models on the sarcastic data.
    sarc_model_pred_report = pd.read_csv('/scratch/ec2684/report.csv')

    ## Generating a dataset with only sarcastic examples from the above four datasets.
    gen_sarc_data = evaluate.extrac_sarc_only(gen_sarc, False)
    hyp_sarc_data = evaluate.extrac_sarc_only(hyp_sarc, False)
    rg_sarc_data = evaluate.extrac_sarc_only(rg_sarc, False)
    sem_sarc_data = evaluate.extrac_sarc_only(sem_eval, True)
    dataset_name = [
        'GEN-sarc-notsarc.csv', 'HYP-sarc-notsarc.csv', 'RQ-sarc-notsarc.csv',
        'SemEval2018-T3-train-taskA.csv'
    ]
    sarc_datasets = [gen_sarc_data, hyp_sarc_data, rg_sarc_data, sem_sarc_data]
    sarc_labels = []  # Labels for the sarcastic-examples-only datasets.
    for dataset in sarc_datasets:
        sarc_labels.append(np.zeros(len(dataset), dtype='int'))

    ## IMDB dataset instantiation.
    imdb_test = load_dataset('imdb', split='test')
    imdb_test_positive = imdb_test.filter(
        lambda example: example['label'] == 1)
    imdb_test_negative = imdb_test.filter(
        lambda example: example['label'] == 0)
    imdb_test_positive = imdb_test_positive.sort(column='text')
    imdb_test_negative = imdb_test_negative.sort(column='text')
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    imdb_test_positive = imdb_test_positive.filter(lambda example: len(
        tokenizer(example['text'])['attention_mask']) < 512)
    imdb_test_negative = imdb_test_negative.filter(lambda example: len(
        tokenizer(example['text'])['attention_mask']) < 512)
    # Extracting the samples from the sarcastic dataset that the models have predicted as being negative.
    # Dictionary of such data
    neg_pred_sarc_dataset = {}
    for i in range(6):
        k = len(dataset_name) * i
        dict_temp = {}
        for j in range(len(dataset_name)):
            labels = sarc_model_pred_report['Correct index'][k + j]
            neg_pred_index = list(map(int, labels[1:-1].split(',')))
            truncated = []
            for index in neg_pred_index:
                truncated.append(sarc_datasets[j][index])
            dict_temp[sarc_model_pred_report['Dataset'][k + j]] = truncated
        neg_pred_sarc_dataset[sarc_model_pred_report['Model'][k]] = dict_temp

    comprehensive_results = []
    index = 1
    clean_comprehensive_results = []
    perturbation_first_results = []
    perturbation_last_results = []

    var = 50
    imdb_token_length = 400
    pert_token_length = 100
    imdb_test_positive = imdb_test_positive.filter(
        lambda example: len(tokenizer(example['text'])['attention_mask']
                            ) < imdb_token_length + var)
    imdb_test_positive = imdb_test_positive.filter(
        lambda example: imdb_token_length - var < len(
            tokenizer(example['text'])['attention_mask']))
    imdb_test_negative = imdb_test_negative.filter(
        lambda example: len(tokenizer(example['text'])['attention_mask']
                            ) < imdb_token_length + var)
    imdb_test_negative = imdb_test_negative.filter(
        lambda example: imdb_token_length - var < len(
            tokenizer(example['text'])['attention_mask']))
    print(f'{sarc_percentage} keeping {imdb_token_length}/{pert_token_length}')

    # Iterating through the models and evaluating the results.
    for model in models:
        # Initializing the model.
        tokenizer = AutoTokenizer.from_pretrained(model)
        model_init = AutoModelForSequenceClassification.from_pretrained(model)
        nlp_pipeline = pipeline("sentiment-analysis",
                                model=model_init,
                                tokenizer=tokenizer,
                                framework="pt",
                                device=0)
        print(f'\n\n({index}) Report for {model}.')
        subindex = 0
        for dataset in sarc_datasets:
            print(
                f'''\n({index}-{subindex+1}) Testing on {dataset_name[subindex]} dataset on only sarcastic data.\n'''
            )
            passage_raw = neg_pred_sarc_dataset[model][dataset_name[subindex]]
            output = evaluate.length_threshold(passage_raw,
                                               pert_token_length - var,
                                               pert_token_length + var,
                                               tokenizer)
            print(
                f'''\n{np.around(output['proportion'],4)*100}% of {dataset_name[subindex]} sarcastic data passed threshold test.\n'''
            )
            # Shuffling the order of random (sarcastic) perturbations.
passage = output['dataset'] random.shuffle(passage) imdb_length = len(imdb_test_positive['text']) print( f'\nTesting IMDB positive dataset:\n IMDB len: {imdb_length}\n Passage len: {len(passage)}' ) n = imdb_length imdb_passage = imdb_test_positive['text'] label = np.ones(n, dtype=int) print(f'\nWithout perturbations:') predictions, prediction_scores = evaluate.evaluate( imdb_passage, nlp_pipeline) binary_predictions_no_perturb, binary_labels, binary_prediction_scores, binary_original_index = evaluate.report_binary_metrics( predictions, prediction_scores, label) acc = evaluate.report_acc(binary_labels, binary_predictions_no_perturb) clean_summary = evaluate.summary(label, predictions, prediction_scores, model, dataset_name[subindex], acc) print(f'\nWith perturbations: perturbation + IMDB order.') perturbed_passage, ratio = evaluate.merge(imdb_passage, passage, False) predictions, prediction_scores = evaluate.evaluate( perturbed_passage, nlp_pipeline) binary_predictions_perturb, binary_labels, binary_prediction_scores, binary_original_index = evaluate.report_binary_metrics( predictions, prediction_scores, label) acc = evaluate.report_acc(binary_labels, binary_predictions_perturb) no_change = np.count_nonzero( np.asarray(binary_predictions_no_perturb) == np.asarray( binary_predictions_perturb)) differences = len(binary_predictions_no_perturb) - no_change print( f'''\nFrom {len(binary_predictions_no_perturb)} predictions, perturbation + IMDB changed:\n {differences} labels\n {no_change} labels remained the same''' ) perturb_first_summary = evaluate.perturb_summary( label, predictions, prediction_scores, model, dataset_name[subindex], acc, ratio, differences, no_change) print(f'\nWith perturbations: IMDB + perturbation order.') perturbed_passage, ratio = evaluate.merge(imdb_passage, passage, True) predictions, prediction_scores = evaluate.evaluate( perturbed_passage, nlp_pipeline) binary_predictions_perturb, binary_labels, binary_prediction_scores, binary_original_index = evaluate.report_binary_metrics( predictions, prediction_scores, label) acc = evaluate.report_acc(binary_labels, binary_predictions_perturb) no_change = np.count_nonzero( np.asarray(binary_predictions_no_perturb) == np.asarray( binary_predictions_perturb)) differences = len(binary_predictions_no_perturb) - no_change print( f'''\nFrom {len(binary_predictions_no_perturb)} predictions, IMDB + perturbation changed:\n {differences} labels\n {no_change} labels remained the same''' ) perturb_last_summary = evaluate.perturb_summary( label, predictions, prediction_scores, model, dataset_name[subindex], acc, ratio, differences, no_change) clean_comprehensive_results.append(clean_summary) perturbation_first_results.append(perturb_first_summary) perturbation_last_results.append(perturb_last_summary) subindex = subindex + 1 index = index + 1 return clean_comprehensive_results, perturbation_first_results, perturbation_last_results
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_TYPES), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if ( os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir ): raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( args.output_dir ) ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForSequenceClassification.from_pretrained(args.output_dir) tokenizer = AutoTokenizer.from_pretrained(args.output_dir) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = AutoModelForSequenceClassification.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False, ray_address=None): ray.init(ray_address, log_to_driver=True) data_dir_name = "./data" if not smoke_test else "./test_data" data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name)) if not os.path.exists(data_dir): os.mkdir(data_dir, 0o755) # Change these as needed. model_name = "bert-base-uncased" if not smoke_test \ else "sshleifer/tiny-distilroberta-base" task_name = "rte" task_data_dir = os.path.join(data_dir, task_name.upper()) num_labels = glue_tasks_num_labels[task_name] config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, finetuning_task=task_name) # Download and cache tokenizer, model, and features print("Downloading and caching Tokenizer") tokenizer = AutoTokenizer.from_pretrained(model_name) # Triggers tokenizer download to cache print("Downloading and caching pre-trained model") AutoModelForSequenceClassification.from_pretrained( model_name, config=config, ) def get_model(): return AutoModelForSequenceClassification.from_pretrained( model_name, config=config, ) # Download data. download_data(task_name, data_dir) data_args = GlueDataTrainingArguments(task_name=task_name, data_dir=task_data_dir) train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir) eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir) training_args = TrainingArguments( output_dir=".", learning_rate=1e-5, # config do_train=True, do_eval=True, evaluate_during_training=True, eval_steps=(len(train_dataset) // 16) + 1 if not smoke_test else 1, # config save_steps=(len(train_dataset) // 16) + 1 if not smoke_test else 1, # config, num_train_epochs=2, # config max_steps=-1, per_device_train_batch_size=16, # config per_device_eval_batch_size=16, # config warmup_steps=0, weight_decay=0.1, # config logging_dir="./logs", ) trainer = Trainer(model_init=get_model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=build_compute_metrics_fn(task_name)) tune_config = { "per_device_eval_batch_size": 32, "eval_steps": tune.sample_from(lambda spec: len(train_dataset) // spec.config[ "per_device_train_batch_size"] + 1 # noqa: E501 ) if not smoke_test else 1, "save_steps": tune.sample_from(lambda spec: spec.config["eval_steps"]), "num_train_epochs": tune.choice([2, 3, 4, 5]), "max_steps": 1 if smoke_test else -1, # Used for smoke test. } scheduler = PopulationBasedTraining(time_attr="training_iteration", metric="eval_acc", mode="max", perturbation_interval=1, hyperparam_mutations={ "weight_decay": tune.uniform(0.0, 0.3), "learning_rate": tune.uniform(1e-5, 5e-5), "per_device_train_batch_size": [16, 32, 64], }) reporter = CLIReporter(parameter_columns={ "weight_decay": "w_decay", "learning_rate": "lr", "per_device_train_batch_size": "train_bs/gpu", "num_epochs": "num_epochs" }, metric_columns=[ "eval_acc", "eval_loss", "epoch", "training_iteration" ]) trainer.hyperparameter_search( hp_space=lambda _: tune_config, backend="ray", n_trials=num_samples, resources_per_trial={ "cpu": 1, "gpu": gpus_per_trial }, scheduler=scheduler, keep_checkpoints_num=3, checkpoint_score_attr="training_iteration", stop={"training_iteration": 1} if smoke_test else None, progress_reporter=reporter, local_dir="~/ray_results/", name="tune_transformer_pbt", log_to_file=True)
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# The models are fairly heavy (around 500 MB).
#
# Once downloaded, it is important to reuse the files cached on disk.

# In[162]:

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
model = AutoModelForSequenceClassification.from_pretrained(
    '/mnt/pretrained_models/' + MODEL)
tokenizer = AutoTokenizer.from_pretrained('/mnt/pretrained_models/' + MODEL)
config = AutoConfig.from_pretrained('/mnt/pretrained_models/' + MODEL)

# In[163]:

# download label mapping
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# In[164]:
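# A sketch of the scoring cell that typically follows: clean a tweet, run the
# classifier, and rank the downloaded labels by softmax score. It assumes the
# cleaning helper defined above is named `preprocess`; the example text is an
# illustrative assumption.
import numpy as np
from scipy.special import softmax

text = preprocess("Good night 😊")
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = softmax(output.logits[0].detach().numpy())
ranking = np.argsort(scores)[::-1]
for i in ranking:
    print(f"{labels[i]}: {np.round(float(scores[i]), 4)}")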
def get_model():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )
def __init__(self, device, args): """ Initializes a MAML few shot learning system :param device: The device to use to use the model on. :param args: A namedtuple of arguments specifying various hyperparameters. """ super(MAMLFewShotClassifier, self).__init__(device, args) config = AutoConfig.from_pretrained(args.pretrained_weights) config.num_labels = args.num_classes_per_set model_initialization = AutoModelForSequenceClassification.from_pretrained( args.pretrained_weights, config=config ) slow_model = MetaBERT # Init fast model state_dict = model_initialization.state_dict() config = model_initialization.config del model_initialization # Slow model self.classifier = slow_model.init_from_pretrained( state_dict, config, num_labels=args.num_classes_per_set, is_distil=self.is_distil, is_xlm=self.is_xlm, per_step_layer_norm_weights=args.per_step_layer_norm_weights, num_inner_loop_steps=args.number_of_training_steps_per_iter, device=device, ) self.classifier.to("cpu") self.classifier.train() self.inner_loop_optimizer = LSLRGradientDescentLearningRule( device=torch.device("cpu"), init_learning_rate=self.task_learning_rate, total_num_inner_loop_steps=self.args.number_of_training_steps_per_iter, use_learnable_learning_rates=self.args.learnable_per_layer_per_step_inner_loop_learning_rate, init_class_head_lr_multiplier=self.args.init_class_head_lr_multiplier, ) self.inner_loop_optimizer.initialise( names_weights_dict=self.get_inner_loop_parameter_dict( params=self.classifier.named_parameters() ) ) print("Inner Loop parameters") for key, value in self.inner_loop_optimizer.named_parameters(): print(key, value.shape) print("Outer Loop parameters") for name, param in self.named_parameters(): if param.requires_grad: print(name, param.shape, param.device, param.requires_grad) self.optimizer = Ranger( [ {"params": self.classifier.parameters(), "lr": args.meta_learning_rate}, { "params": self.inner_loop_optimizer.parameters(), "lr": args.meta_inner_optimizer_learning_rate, }, ], lr=args.meta_learning_rate, ) self.scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer=self.optimizer, T_max=self.args.total_epochs * self.args.total_iter_per_epoch, eta_min=self.args.min_learning_rate, ) self.inner_loop_optimizer.to(self.device) self.clip_value = 1.0 # gradient clipping for p in self.classifier.parameters(): if p.requires_grad: p.register_hook( lambda grad: torch.clamp(grad, -self.clip_value, self.clip_value) ) self.num_freeze_epochs = args.num_freeze_epochs if self.num_freeze_epochs > 0: self.classifier.freeze()
def main(args): set_seed(args.seed) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') config = AutoConfig.from_pretrained(args.model_name, num_labels=args.num_labels) tokenizer = AutoTokenizer.from_pretrained(args.model_name) model = AutoModelForSequenceClassification.from_pretrained(args.model_name, config=config) model.to(device) collator = utils.Collator(pad_token_id=tokenizer.pad_token_id) train_dataset, label_map = utils.load_classification_dataset( args.train, tokenizer, args.field_a, args.field_b, args.label_field, limit=args.limit ) train_loader = DataLoader(train_dataset, batch_size=args.bsz, shuffle=True, collate_fn=collator) dev_dataset, _ = utils.load_classification_dataset( args.dev, tokenizer, args.field_a, args.field_b, args.label_field, label_map ) dev_loader = DataLoader(dev_dataset, batch_size=args.bsz, shuffle=False, collate_fn=collator) test_dataset, _ = utils.load_classification_dataset( args.test, tokenizer, args.field_a, args.field_b, args.label_field, label_map ) test_loader = DataLoader(test_dataset, batch_size=args.bsz, shuffle=False, collate_fn=collator) if args.bias_correction: betas = (0.9, 0.999) else: betas = (0.0, 0.000) optimizer = AdamW( model.parameters(), lr=args.lr, weight_decay=1e-2, betas=betas ) # Use suggested learning rate scheduler num_training_steps = len(train_dataset) * args.epochs // args.bsz num_warmup_steps = num_training_steps // 10 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps) if not args.ckpt_dir.exists(): logger.info(f'Making checkpoint directory: {args.ckpt_dir}') args.ckpt_dir.mkdir(parents=True) elif not args.force_overwrite: raise RuntimeError('Checkpoint directory already exists.') try: best_accuracy = 0 for epoch in range(args.epochs): logger.info('Training...') model.train() avg_loss = utils.ExponentialMovingAverage() pbar = tqdm(train_loader) for model_inputs, labels in pbar: model_inputs = {k: v.to(device) for k, v in model_inputs.items()} labels = labels.to(device) optimizer.zero_grad() logits, *_ = model(**model_inputs) loss = F.cross_entropy(logits, labels.squeeze(-1)) loss.backward() optimizer.step() scheduler.step() avg_loss.update(loss.item()) pbar.set_description(f'loss: {avg_loss.get_metric(): 0.4f}, ' f'lr: {optimizer.param_groups[0]["lr"]: .3e}') logger.info('Evaluating...') model.eval() correct = 0 total = 0 with torch.no_grad(): for model_inputs, labels in dev_loader: model_inputs = {k: v.to(device) for k, v in model_inputs.items()} labels = labels.to(device) logits, *_ = model(**model_inputs) _, preds = logits.max(dim=-1) correct += (preds == labels.squeeze(-1)).sum().item() total += labels.size(0) accuracy = correct / (total + 1e-13) logger.info(f'Accuracy: {accuracy : 0.4f}') if accuracy > best_accuracy: logger.info('Best performance so far.') model.save_pretrained(args.ckpt_dir) tokenizer.save_pretrained(args.ckpt_dir) best_accuracy = accuracy except KeyboardInterrupt: logger.info('Interrupted...') logger.info('Testing...') model.eval() correct = 0 total = 0 with torch.no_grad(): for model_inputs, labels in test_loader: model_inputs = {k: v.to(device) for k, v in model_inputs.items()} labels = labels.to(device) logits, *_ = model(**model_inputs) _, preds = logits.max(dim=-1) correct += (preds == labels.squeeze(-1)).sum().item() total += labels.size(0) accuracy = correct / (total + 1e-13) logger.info(f'Accuracy: {accuracy : 0.4f}')