def load(self, fname=None):
    if fname is not None:
        self.load_path = fname

    if self.pretrained_bert:
        log.info(f"From pretrained {self.pretrained_bert}.")
        config = AutoConfig.from_pretrained(self.pretrained_bert,
                                            num_labels=self.n_classes,
                                            output_attentions=False,
                                            output_hidden_states=False)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.pretrained_bert, config=config)
    elif self.bert_config_file and Path(self.bert_config_file).is_file():
        self.bert_config = AutoConfig.from_json_file(
            str(expand_path(self.bert_config_file)))
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
        self.model = AutoModelForSequenceClassification.from_config(
            config=self.bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    self.model.to(self.device)

    self.optimizer = getattr(torch.optim, self.optimizer_name)(
        self.model.parameters(), **self.optimizer_parameters)
    if self.lr_scheduler_name is not None:
        self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
            self.optimizer, **self.lr_scheduler_parameters)

    if self.load_path:
        log.info(f"Load path {self.load_path} is given.")

        if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
            raise ConfigError("Provided load path is incorrect!")

        weights_path = Path(self.load_path.resolve())
        weights_path = weights_path.with_suffix(".pth.tar")
        if weights_path.exists():
            log.info(f"Load path {weights_path} exists.")
            log.info(f"Initializing `{self.__class__.__name__}` from saved.")

            # now load the weights, optimizer from saved
            log.info(f"Loading weights from {weights_path}.")
            checkpoint = torch.load(weights_path, map_location=self.device)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            self.epochs_done = checkpoint.get("epochs_done", 0)
        else:
            log.info(f"Init from scratch. Load path {weights_path} does not exist.")
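# Minimal sketch (not part of the original class) of a `save` counterpart that
# produces the checkpoint format the `load` above expects: a dict with
# "model_state_dict", "optimizer_state_dict" and "epochs_done" written to a
# ".pth.tar" file. The names `model`, `optimizer`, `epochs_done` and
# `save_path` are illustrative assumptions.
import torch

def save_checkpoint(model, optimizer, epochs_done, save_path):
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epochs_done": epochs_done,
    }
    torch.save(checkpoint, save_path)  # e.g. "model.pth.tar"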
def _convert_to_transformers_classification_regression(adaptive_model, prediction_head):
    if adaptive_model.language_model.model.base_model_prefix == "roberta":
        # Classification heads in transformers have different architectures across language model variants.
        # The RobertaClassificationHead has the components: input2dense, dropout, tanh, dense2output.
        # The tanh function cannot be mapped to the current FARM-style linear feed-forward ClassificationHeads,
        # so conversion for this type cannot work. We would need a compatible FARM RobertaClassificationHead.
        logger.error(
            "Conversion for Text Classification and Regression with Roberta or XLMRoberta "
            "not possible at the moment.")

    # add more info to config
    adaptive_model.language_model.model.config.num_labels = prediction_head.num_labels
    adaptive_model.language_model.model.config.id2label = {
        id: label for id, label in enumerate(prediction_head.label_list)
    }
    adaptive_model.language_model.model.config.label2id = {
        label: id for id, label in enumerate(prediction_head.label_list)
    }
    adaptive_model.language_model.model.config.finetuning_task = prediction_head.model_type
    adaptive_model.language_model.model.config.language = adaptive_model.language_model.language

    # init model
    transformers_model = AutoModelForSequenceClassification.from_config(
        adaptive_model.language_model.model.config)
    # transfer weights for language model + prediction head
    setattr(transformers_model, transformers_model.base_model_prefix,
            adaptive_model.language_model.model)
    transformers_model.classifier.load_state_dict(
        prediction_head.feed_forward.feed_forward[0].state_dict())

    return transformers_model
def load_model_full_api():
    from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
    global loaded_model_full
    global tokenizer_full
    # build the model from config, then pull in the fine-tuned weights
    config = AutoConfig.from_pretrained(model_name, num_labels=27)
    model_tmp = AutoModelForSequenceClassification.from_config(config=config)
    loaded_model_full = model_tmp.from_pretrained("nlp_model_full.pt")
    tokenizer_full = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    return {"message": "Full model loaded"}
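# Hedged alternative sketch for the loader above: if "nlp_model_full.pt" is a
# plain state_dict saved with torch.save(model.state_dict(), ...), it can be
# loaded directly into the config-built model. The file name and label count
# come from the snippet above; everything else is illustrative.
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification

def load_model_full_from_state_dict(model_name: str):
    config = AutoConfig.from_pretrained(model_name, num_labels=27)
    model = AutoModelForSequenceClassification.from_config(config)
    state_dict = torch.load("nlp_model_full.pt", map_location="cpu")
    model.load_state_dict(state_dict)
    return model.eval()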
def __init__(self):
    self.device = "cpu"
    default_model_config = AutoConfig.from_pretrained(config["DEFAULT_MODEL_NAME"],
                                                      num_labels=27)
    default_model = AutoModelForSequenceClassification.from_config(config=default_model_config)
    pos_neg_model = AutoModelForSequenceClassification.from_config(config=default_model_config)
    self.default_tokenizer = AutoTokenizer.from_pretrained(config["DEFAULT_MODEL_NAME"],
                                                           use_fast=True)

    default_model.load_state_dict(
        torch.load(config["DEFAULT_MODEL"], map_location=self.device))
    default_model = default_model.eval()
    self.default_model = default_model.to(self.device)

    pos_neg_model.load_state_dict(
        torch.load(config["POS_NEG_MODEL"], map_location=self.device))
    pos_neg_model = pos_neg_model.eval()
    self.pos_neg_model = pos_neg_model.to(self.device)
def test_auto_model_for_sequence_classification(self):
    model_seq_cls = AutoModelForSequenceClassification.from_config(self.config)
    self.assertIsInstance(model_seq_cls,
                          StaticSparseEncoderBertForSequenceClassification)

    # There are two layers with six linear layers each and only one head,
    # so there should be a total of 12 sparse linear layers.
    sparse_layers = []
    for module in model_seq_cls.modules():
        if isinstance(module, SparseWeights):
            sparse_layers.append(module)
    self.assertEqual(12, len(sparse_layers))
def test_reload_static_to_flex_head(self):
    if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"):
        self.skipTest("No classification head available")
    static_head_model = AutoModelForSequenceClassification.from_config(self.config())
    flex_head_model = AutoModelWithHeads.from_pretrained(
        None, config=self.config(), state_dict=static_head_model.state_dict())
    static_head_model.eval()
    flex_head_model.eval()

    static_head_model.add_adapter("test")

    with tempfile.TemporaryDirectory() as temp_dir:
        static_head_model.save_adapter(temp_dir, "test")

        loading_info = {}
        flex_head_model.load_adapter(temp_dir, loading_info=loading_info)
        # Load the adapter a second time to make sure our conversion script doesn't break anything
        flex_head_model.load_adapter(temp_dir, loading_info=loading_info)

    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # adapter and head were loaded
    self.assertIn("test", flex_head_model.config.adapters)
    self.assertIn("test", flex_head_model.heads)

    # check equal output
    in_data = self.get_input_samples((1, 128), config=flex_head_model.config)
    output1 = static_head_model(**in_data, adapter_names=["test"])
    output2 = flex_head_model(**in_data, adapter_names=["test"], head="test")
    self.assertTrue(torch.all(torch.isclose(output1.logits, output2.logits)))
def classify(model_dir, samples, device, tokenizer):
    config = AutoConfig.from_pretrained(model_dir)
    config.num_labels = 3
    config.token_vocab_size = 2
    model = AutoModelForSequenceClassification.from_config(config)
    model.classifier = nn.Linear(768, 3)
    # model.bert.embeddings.token_type_embeddings = nn.Linear(768, 2, bias=False)
    model.num_labels = 3
    state_dict = torch.load(model_dir + '/pytorch_model.bin')
    model.load_state_dict(state_dict)
    # model.bert.embeddings.token_type_embeddings = nn.Linear(2, 768, bias=False)
    model.to(device)
    model.eval()

    inputs = tokenizer.batch_encode_plus(samples,
                                         add_special_tokens=True,
                                         max_length=512)
    data = [inputs['input_ids'], range(len(inputs['input_ids']))]
    dataset = SeqTextDataset(data)

    def collate(examples: List[Dict]):
        inputs, indexes = [], []
        for sample in examples:
            inputs.append(sample['inputs'])
            indexes.append(sample['indexes'])
        indexes = torch.LongTensor(indexes)
        if tokenizer._pad_token is None:
            return {
                'inputs': pad_sequence(inputs, batch_first=True),
                'indexes': indexes
            }
        return {
            'inputs': pad_sequence(inputs, batch_first=True,
                                   padding_value=tokenizer.pad_token_id),
            'indexes': indexes
        }

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=32,
                                 collate_fn=collate)

    nb_eval_steps = 0
    scores, labels, indexes = [], [], []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, index = batch['inputs'], batch['indexes']
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            logits = outputs[0].contiguous()
            score, prediction = torch.softmax(logits, dim=0).max(dim=-1)
            scores.append(score)
            labels.append(prediction)
            indexes.append(index)
        batch_size = inputs.size(0)
        nb_eval_steps += batch_size

    scores, labels = torch.cat(scores, dim=0).tolist(), torch.cat(labels, dim=0).tolist()
    indexes = torch.cat(indexes, dim=0).tolist()
    return scores, labels, indexes
def load(self, fname=None):
    if fname is not None:
        self.load_path = fname

    if self.pretrained_bert:
        log.info(f"From pretrained {self.pretrained_bert}.")
        config = AutoConfig.from_pretrained(
            self.pretrained_bert,
            # num_labels=self.n_classes,
            output_attentions=False,
            output_hidden_states=False)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.pretrained_bert, config=config)

        # If the pretrained head size does not match n_classes,
        # re-initialize the output projection with the required shape.
        try:
            hidden_size = self.model.classifier.out_proj.in_features
            if self.n_classes != self.model.num_labels:
                self.model.classifier.out_proj.weight = torch.nn.Parameter(
                    torch.randn(self.n_classes, hidden_size))
                self.model.classifier.out_proj.bias = torch.nn.Parameter(
                    torch.randn(self.n_classes))
                self.model.classifier.out_proj.out_features = self.n_classes
                self.model.num_labels = self.n_classes
        except torch.nn.modules.module.ModuleAttributeError:
            hidden_size = self.model.classifier.in_features
            if self.n_classes != self.model.num_labels:
                self.model.classifier.weight = torch.nn.Parameter(
                    torch.randn(self.n_classes, hidden_size))
                self.model.classifier.bias = torch.nn.Parameter(
                    torch.randn(self.n_classes))
                self.model.classifier.out_features = self.n_classes
                self.model.num_labels = self.n_classes

    elif self.bert_config_file and Path(self.bert_config_file).is_file():
        self.bert_config = AutoConfig.from_json_file(
            str(expand_path(self.bert_config_file)))
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
        self.model = AutoModelForSequenceClassification.from_config(
            config=self.bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    self.model.to(self.device)

    self.optimizer = getattr(torch.optim, self.optimizer_name)(
        self.model.parameters(), **self.optimizer_parameters)
    if self.lr_scheduler_name is not None:
        self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
            self.optimizer, **self.lr_scheduler_parameters)

    if self.load_path:
        log.info(f"Load path {self.load_path} is given.")

        if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
            raise ConfigError("Provided load path is incorrect!")

        weights_path = Path(self.load_path.resolve())
        weights_path = weights_path.with_suffix(".pth.tar")
        if weights_path.exists():
            log.info(f"Load path {weights_path} exists.")
            log.info(f"Initializing `{self.__class__.__name__}` from saved.")

            # now load the weights, optimizer from saved
            log.info(f"Loading weights from {weights_path}.")
            checkpoint = torch.load(weights_path, map_location=self.device)
            # Load non-strictly when position_ids are missing from the checkpoint;
            # this is needed for models trained on older versions of the
            # transformers library.
            strict_load_flag = bool([
                key for key in checkpoint["model_state_dict"].keys()
                if key.endswith("embeddings.position_ids")
            ])
            self.model.load_state_dict(checkpoint["model_state_dict"],
                                       strict=strict_load_flag)
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            self.epochs_done = checkpoint.get("epochs_done", 0)
        else:
            log.info(f"Init from scratch. Load path {weights_path} does not exist.")
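# Alternative sketch, assuming a recent transformers version: instead of manually
# replacing the classifier weights as above, `from_pretrained` can re-initialize a
# head of a different size when `ignore_mismatched_sizes=True` is passed. The model
# name and label count below are placeholders.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=7,                  # desired number of classes
    ignore_mismatched_sizes=True,  # re-init the classification head if shapes differ
)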
def run_finetuning(args):
    torch.manual_seed(args.seed)
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    # Get text columns
    t_columns = args.text_columns.split(',')
    num_texts = len(t_columns)
    if num_texts == 1:
        t_columns = t_columns[0]

    # Get label columns
    l_columns = args.label_columns.split(',')
    num_labels = len(l_columns)
    if num_labels == 1:
        l_columns = l_columns[0]

    if args.fp16 and not APEX_AVAILABLE:
        print("FP16 toggle is on but Apex is not available. Using FP32 training.")

    if args.do_train:
        # Configure tokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
        if args.add_token != '':
            add_token = {'additional_special_tokens': args.add_token.split(',')}
            added = tokenizer.add_special_tokens(add_token)

        print('\n' + '=' * 50, '\nCONFIGURE FINETUNING SETUP', '\n' + '=' * 50)
        if args.add_token != '':
            print("Added {} special tokens:".format(added), args.add_token)

        # Produce hash code for cache
        f_string = args.train_data + args.valid_data + str(args.msl) + str(
            args.seed) + args.pretrained + str(args.data_pct)
        hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt'

        # Produce the dataset if cache doesn't exist
        if hashed not in os.listdir() or args.retokenize_data:
            print("Producing dataset cache. This will take a while.")
            s = time.time()

            df = pd.read_csv(args.train_data, lineterminator='\n').sample(
                frac=args.data_pct, random_state=args.seed)
            text, labels = df[t_columns].values, df[l_columns].values
            train_dataset = process_data(text, labels, tokenizer, msl=args.msl)

            df = pd.read_csv(args.valid_data, lineterminator='\n')
            text, labels = df[t_columns].values, df[l_columns].values
            valid_dataset = process_data(text, labels, tokenizer, msl=args.msl)

            if args.save_cache:
                print('Saving data cache')
                with open(hashed, 'wb') as f:
                    torch.save([train_dataset, valid_dataset], f)
            print("Preprocessing finished. Time elapsed: {:.2f}s".format(
                time.time() - s))

        # Load the dataset if the cache exists
        else:
            print('Cache found. Loading training and validation data.')
            with open(hashed, 'rb') as f:
                train_dataset, valid_dataset = torch.load(f)

        # Produce dataloaders
        train_sampler = data.RandomSampler(train_dataset)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       sampler=train_sampler)
        valid_loader = data.DataLoader(valid_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=False)

        # Configure model
        config = AutoConfig.from_pretrained(
            args.pretrained, num_labels=2 if num_labels == 1 else num_labels)
        if args.random_init:
            print("Initializing new randomly-initialized model from configuration")
            model = AutoModelForSequenceClassification.from_config(config)
        else:
            print("Loading from pretrained checkpoint")
            model = AutoModelForSequenceClassification.from_pretrained(
                args.pretrained, config=config)
        _ = model.resize_token_embeddings(len(tokenizer))
        model = model.to(device)
        print("Model has {:,} trainable parameters".format(
            sum(p.numel() for p in model.parameters() if p.requires_grad)))

        # Configure loss function
        criterion = torch.nn.CrossEntropyLoss(
        ) if num_labels == 1 else torch.nn.BCEWithLogitsLoss()

        # Configure optimizer
        if args.optimizer == 'adam':
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [{
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": args.weight_decay
            }, {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0
            }]
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
            optimizer.zero_grad()
        elif args.optimizer == 'lamb':
            from pytorch_lamb import Lamb
            optimizer = Lamb(model.parameters(),
                             lr=args.learning_rate,
                             weight_decay=args.weight_decay,
                             betas=(args.adam_b1, args.adam_b2))

        # Configure FP16
        if args.fp16 and APEX_AVAILABLE:
            print("Using FP16 training.")
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.opt_level)

        # Configure scheduler
        if args.use_scheduler:
            steps = len(train_loader) * args.epochs // args.accumulation
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=int(steps * args.warmup_pct),
                num_training_steps=steps)
        else:
            scheduler = None

        print("Using learning rate {:.4E} and weight decay {:.4E}".format(
            args.learning_rate, args.weight_decay), end='')
        print(" with scheduler using warmup pct {}".format(
            args.warmup_pct)) if args.use_scheduler else print("")

        # Training proper
        print('\n' + '=' * 50, '\nTRAINING', '\n' + '=' * 50)
        print("Training batches: {} | Validation batches: {}".format(
            len(train_loader), len(valid_loader)))
        for e in range(1, args.epochs + 1):
            train_loss, train_acc = train(model,
                                          criterion,
                                          optimizer,
                                          train_loader,
                                          scheduler=scheduler,
                                          accumulation=args.accumulation,
                                          device=device,
                                          fp16=args.fp16)
            valid_loss, valid_acc = evaluate(model,
                                             criterion,
                                             valid_loader,
                                             device=device)
            print("Epoch {:3} | Train Loss {:.4f} | Train Acc {:.4f} | Valid Loss {:.4f} | Valid Acc {:.4f}"
                  .format(e, train_loss, train_acc, valid_loss, valid_acc))

        # Save the model
        model.save_pretrained(args.checkpoint)
        tokenizer.save_pretrained(args.checkpoint)
        #with open(args.checkpoint, 'wb') as f:
        #    torch.save(model.state_dict(), f)

    if args.do_eval:
        print('\n' + '=' * 50, '\nBEGIN EVALUATION PROPER', '\n' + '=' * 50)

        # Load saved tokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)

        # Produce hash code for test cache
        f_string = args.test_data + str(args.msl) + str(args.seed) + args.pretrained
        hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt'

        # Produce the dataset if cache doesn't exist
        if hashed not in os.listdir() or args.retokenize_data:
            print("Producing test data cache. This will take a while.")
            s = time.time()

            df = pd.read_csv(args.test_data, lineterminator='\n')
            text, labels = df[t_columns].values, df[l_columns].values
            test_dataset = process_data(text, labels, tokenizer, msl=args.msl)

            if args.save_cache:
                print('Saving data cache')
                with open(hashed, 'wb') as f:
                    torch.save(test_dataset, f)
            print("Preprocessing finished. Time elapsed: {:.2f}s".format(
                time.time() - s))

        # Load the dataset if the cache exists
        else:
            print('Cache found. Loading test data.')
            with open(hashed, 'rb') as f:
                test_dataset = torch.load(f)

        # Dataloaders
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.batch_size,
                                                  shuffle=False)

        # Produce the model
        print("Loading finetuned checkpoint")
        model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint)
        model = model.to(device)

        criterion = torch.nn.CrossEntropyLoss(
        ) if num_labels == 1 else torch.nn.BCEWithLogitsLoss()

        # Testing proper
        print('\n' + '=' * 50, '\nTESTING', '\n' + '=' * 50)
        test_loss, test_acc = evaluate(model, criterion, test_loader, device=device)
        print("Test Loss {:.4f} | Test Accuracy {:.4f}".format(test_loss, test_acc))

    # Logging
    if not args.do_train:
        train_loss, train_acc, valid_loss, valid_acc = None, None, None, None
    if not args.do_eval:
        test_loss, test_acc = None, None

    return train_loss, train_acc, valid_loss, valid_acc, test_loss, test_acc
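# Minimal hedged sketch of how `run_finetuning` might be driven from argparse;
# only a subset of the attributes it reads is shown, and all defaults are
# illustrative assumptions, not the original script's values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--pretrained', default='bert-base-uncased')
parser.add_argument('--train_data', default='train.csv')
parser.add_argument('--valid_data', default='valid.csv')
parser.add_argument('--text_columns', default='text')
parser.add_argument('--label_columns', default='label')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--epochs', type=int, default=3)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--do_train', action='store_true')
parser.add_argument('--do_eval', action='store_true')
# remaining flags (msl, data_pct, fp16, optimizer, checkpoint, ...) omitted here
args = parser.parse_args()
# run_finetuning(args)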
def __init__(self, pretrain_model_name='roberta-large', num_labels=5):
    super(TransformerMultiClassifier, self).__init__()
    model_config = AutoConfig.from_pretrained(pretrain_model_name,
                                              num_labels=num_labels)
    self.model = AutoModelForSequenceClassification.from_config(model_config)
def load_from_config(self):
    setattr(self.config, 'num_labels', self.num_labels)
    self.transformer = AutoModelForSequenceClassification.from_config(self.config)
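# Usage note as a sketch: `from_config` builds a randomly initialized model from
# the configuration only, while `from_pretrained` also loads pretrained weights.
# The model name below is a placeholder.
from transformers import AutoConfig, AutoModelForSequenceClassification

config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=5)
random_init_model = AutoModelForSequenceClassification.from_config(config)  # weights are random
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", config=config)                                      # weights are loaded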
def test_train_adapter_fusion(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_config(self.config())

    # add the adapters to be fused
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")

    self.assertIn("a", model.config.adapters.adapters)
    self.assertIn("b", model.config.adapters.adapters)
    self.assertIn("c", model.config.adapters.adapters)

    # setup fusion
    adapter_setup = Fuse("a", "b", "c")
    model.add_fusion(adapter_setup)
    model.train_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)

    # all weights of the adapters should be frozen (test for one)
    for k, v in filter_parameters(model, "adapters.a.").items():
        self.assertFalse(v.requires_grad, k)
    # all weights of the fusion layer should be activated
    for k, v in filter_parameters(model, "adapter_fusion_layer").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    # setup dataset
    data_args = GlueDataTrainingArguments(
        task_name="mrpc",
        data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True)
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    training_args = TrainingArguments(output_dir="./examples",
                                      do_train=True,
                                      learning_rate=0.1,
                                      max_steps=7,
                                      no_cuda=True)

    # evaluate
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if ("adapter_fusion_layer" in k1 or "classifier" in k1
                or "classification_head" in k1 or "score" in k1):
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2), k1)