def __init__(self, config):
    super(Model, self).__init__()
    model_config = XLNetConfig.from_pretrained(config.bert_path, num_labels=config.num_classes)
    self.xlnet = XLNetForSequenceClassification.from_pretrained(config.bert_path, config=model_config)
    for param in self.xlnet.parameters():  # the model attribute is self.xlnet, not self.bert
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)
def __init__(self, config, dataset):
    super(XLNet, self).__init__(config, dataset)
    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = XLNetTokenizer.from_pretrained(
        self.pretrained_model_path,
        bos_token=dataset.sos_token,
        eos_token=dataset.eos_token,
        pad_token=dataset.padding_token)
    self.sos_token = self.tokenizer.bos_token
    self.eos_token = self.tokenizer.eos_token
    self.sos_token_idx = self.tokenizer.bos_token_id
    self.eos_token_idx = self.tokenizer.eos_token_id
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.configuration = XLNetConfig.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        pad_token_id=self.padding_token_idx)
    self.decoder = XLNetLMHeadModel.from_pretrained(
        self.pretrained_model_path, config=self.configuration)
    self.decoder.resize_token_embeddings(len(self.tokenizer))
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
def __init__(self, config, dataset):
    super(XLNet, self).__init__(config, dataset)
    self.eval_generate_num = config['eval_generate_num']
    self.tokenizer = XLNetTokenizer.from_pretrained(
        'xlnet-base-cased',
        bos_token=dataset.sos_token,
        eos_token=dataset.eos_token,
        pad_token=dataset.padding_token,
        unk_token=dataset.eos_token)
    self.configuration = XLNetConfig.from_pretrained('xlnet-base-cased')
    self.decoder = XLNetLMHeadModel.from_pretrained(
        'xlnet-base-cased', config=self.configuration)
    self.decoder.resize_token_embeddings(len(self.tokenizer))
    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.mask_token = '<mask>'
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_seq_length = config['max_seq_length']
    self.device = config["device"]
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
def __init__(self, model_name, model_type):
    """
    Hyper-parameters found with the validation set:
        xlnet-large-cased  : epochs = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2  : epochs = 3, learning_rate = 5e-5, batch_size = 8,  epsilon = 1e-6
    To be improved...
    """
    self.model_name = model_name
    self.model_type = model_type

    # Per the transformers library, a batch size of 16 or 32 is advised for training. For memory
    # reasons we take 16. Gradient accumulation did not lead to a great improvement and is not used here.
    if model_type == 'albert':
        self.batch_size = 8
    else:
        self.batch_size = 16

    # "albert-xxlarge-v2" matches the docstring and the hyper-parameter branch below
    available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
    available_model_type = ["bert", "xlnet", "albert"]
    if self.model_name not in available_model_name:
        raise Exception("Error: model_name should be in", available_model_name)
    if self.model_type not in available_model_type:
        raise Exception("Error: model_type should be in", available_model_type)

    # Load the pretrained model with a single linear regression layer on top of the pooled output.
    # To load a fine-tuned model instead: e.g. BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
    if self.model_type == 'bert':
        self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for a regression task
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'xlnet':
        self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'albert':
        self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    self.model.cuda()

    if self.model_name == 'xlnet-large-cased':
        self.epochs = 4
        self.lr = 1e-5
        self.eps = 1e-6
    elif self.model_name == 'bert-large-uncased':
        self.epochs = 4
        self.lr = 3e-5
        self.eps = 1e-8
    elif self.model_name == 'albert-xxlarge-v2':
        self.epochs = 3
        self.lr = 5e-5
        self.eps = 1e-6

    self.max_grad_norm = 1.0  # Gradient threshold; gradient norms that exceed it are scaled down to match the norm.
    self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
def __init__( self, language=Language.ENGLISHCASED, num_labels=5, cache_dir=".", num_gpus=None, num_epochs=1, batch_size=8, lr=5e-5, adam_eps=1e-8, warmup_steps=0, weight_decay=0.0, max_grad_norm=1.0, ): """Initializes the classifier and the underlying pretrained model. Args: language (Language, optional): The pretrained model's language. Defaults to 'xlnet-base-cased'. num_labels (int, optional): The number of unique labels in the training data. Defaults to 5. cache_dir (str, optional): Location of XLNet's cache directory. Defaults to ".". num_gpus (int, optional): The number of gpus to use. If None is specified, all available GPUs will be used. Defaults to None. num_epochs (int, optional): Number of training epochs. Defaults to 1. batch_size (int, optional): Training batch size. Defaults to 8. lr (float): Learning rate of the Adam optimizer. Defaults to 5e-5. adam_eps (float, optional): term added to the denominator to improve numerical stability. Defaults to 1e-8. warmup_steps (int, optional): Number of steps in which to increase learning rate linearly from 0 to 1. Defaults to 0. weight_decay (float, optional): Weight decay. Defaults to 0. max_grad_norm (float, optional): Maximum norm for the gradients. Defaults to 1.0 """ if num_labels < 2: raise ValueError("Number of labels should be at least 2.") self.language = language self.num_labels = num_labels self.cache_dir = cache_dir self.num_gpus = num_gpus self.num_epochs = num_epochs self.batch_size = batch_size self.lr = lr self.adam_eps = adam_eps self.warmup_steps = warmup_steps self.weight_decay = weight_decay self.max_grad_norm = max_grad_norm # create classifier self.config = XLNetConfig.from_pretrained( self.language.value, num_labels=num_labels, cache_dir=cache_dir ) self.model = XLNetForSequenceClassification(self.config)
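# The constructor above only wires the config and classification head together. A minimal
# standalone sketch of the equivalent Hugging Face calls follows; 'xlnet-base-cased' and
# num_labels=5 are placeholders standing in for language.value / num_labels above.
from transformers import XLNetConfig, XLNetForSequenceClassification

config = XLNetConfig.from_pretrained("xlnet-base-cased", num_labels=5)
# XLNetForSequenceClassification(config), as used above, builds a randomly initialized model;
# from_pretrained("xlnet-base-cased", config=config) would start from the pretrained checkpoint instead.
model = XLNetForSequenceClassification(config)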
def get_bert_model(model_name=config.MODEL_NAME):
    # a. Load the tokenizer from the vocabulary
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    # b. Load the configuration file
    model_config = XLNetConfig.from_pretrained(model_name)
    # Modify the configuration here if needed
    return tokenizer, model_config
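# A possible way to use get_bert_model, assuming config.MODEL_NAME points at a valid XLNet
# checkpoint such as 'xlnet-base-cased'; the input sentence is a placeholder.
from transformers import XLNetModel

tokenizer, model_config = get_bert_model()
model = XLNetModel.from_pretrained(config.MODEL_NAME, config=model_config)
inputs = tokenizer.encode_plus("an example sentence", return_tensors="pt")
outputs = model(**inputs)  # first element: last hidden state of shape (batch, seq_len, d_model)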
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path, config_path, pretrain_model_path, output_record_path, model_save_path): seed_everything(997) num_train_epochs = train_epoch pretrain_batch_size = batch_size seq_length = seq_length lr = lr corpus_path = corpus_path vocab_path = vocab_path config_path = config_path pretrain_model_path = pretrain_model_path output_record_path = output_record_path model_save_path = model_save_path tokenizer = BertTokenizer.from_pretrained(vocab_path) # train_dataset = LineByLineTextDataset(block_size=128, file_path=corpus_path, tokenizer=tokenizer) # data = read_data(corpus_path, tokenizer) train_dataset = OppoDataset(train_file_path=corpus_path, tokenizer=tokenizer, maxlen=128) data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer) config = XLNetConfig.from_pretrained( pretrained_model_name_or_path=config_path) # model = XLNetForMaskedLM(config=config,name='./xlnet_model/pytorch_model.bin') if os.path.exists(pretrain_model_path): model = XLNetLMHeadModel.from_pretrained(pretrain_model_path, config=config) else: model = XLNetLMHeadModel(config=config) # data_collator = Collator(max_seq_len=seq_length, tokenizer=tokenizer, mlm_probability=0.15) training_args = TrainingArguments( output_dir=output_record_path, overwrite_output_dir=True, num_train_epochs=num_train_epochs, learning_rate=lr, dataloader_num_workers=8, prediction_loss_only=True, fp16=True, fp16_backend='amp', per_device_train_batch_size=pretrain_batch_size, save_strategy='no', seed=997) trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset) trainer.train() trainer.save_model(model_save_path)
def main():
    # Get the command-line arguments (auto-generated from the docstring at the top of this file)
    args = docopt(__doc__)
    pprint(args)

    # Read the parameters
    lr = float(args['--lr'])
    seq_len = int(args['--seq_len'])
    max_epoch = int(args['--max_epoch'])
    batch_size = int(args['--batch_size'])
    num_train = int(args['--num_train'])
    num_valid = int(args['--num_valid'])

    # Select the model
    pretrained_weights = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(pretrained_weights)
    config = XLNetConfig.from_pretrained(pretrained_weights, num_labels=4)
    # Pass the 4-label config explicitly; otherwise the default 2-label head is built
    model = XLNetForSequenceClassification.from_pretrained(pretrained_weights, config=config)
    print(model.config.num_labels)

    # Get the device to use
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load the data and build the datasets
    encoder = TwinPhraseEncoder(tokenizer, seq_len)
    train_dataset = WordnetDataset(mode='train', num_data=num_train, transform=encoder)
    valid_dataset = WordnetDataset(mode='valid', num_data=num_valid, transform=encoder)
    train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=True)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(1, max_epoch + 1):
        print('=' * 27 + f' Epoch {epoch:0>2} ' + '=' * 27)
        # Training
        loss, accu = train_model(model, optimizer, train_loader, device)
        print(f'| Training   | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')
        # Validation
        loss, accu = valid_model(model, optimizer, valid_loader, device)
        print(f'| Validation | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')

    # Save the model weights
    torch.save(model.state_dict(), f'../result/{pretrained_weights}.pkl')
def init_model(self): basic_encoder = None if self.config['use_bert']: bert_config = BertConfig.from_pretrained(self.config['bert_model_name'], cache_dir=self.config['bert_dir']) if self.config['num_bert_layer'] is not None: bert_config.num_hidden_layers = self.config['num_bert_layer'] bert = BertModel.from_pretrained(self.config['bert_model_name'], cache_dir=self.config['bert_dir'], config=bert_config) basic_encoder = bert elif self.config['use_xlnet']: xlnet_config = XLNetConfig.from_pretrained('hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir']) xlnet_config.n_layer = self.config['num_xlnet_layer'] xlnet_config.mem_len = self.config['xlnet_mem_len'] xlnet = XLNetModel.from_pretrained('hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir'], config=xlnet_config) basic_encoder = xlnet elif self.config['use_transformer']: bert_config = BertConfig.from_pretrained('bert-base-chinese', cache_dir=self.config['bert_dir']) if self.config['num_transformer_layer'] is not None: bert_config.num_hidden_layers = self.config['num_transformer_layer'] transf = BertModel(bert_config) basic_encoder = transf elif self.config['use_rnn_basic_encoder']: pass else: raise Exception('Not support other basic encoder') self.model = DocEE(self.config, basic_encoder, self.tokenizer) if self.config['cuda']: self.model.cuda() self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['learning_rate']) if self.config['resume_model']: OUTPUT_DIR = self.config['output_dir'] MODEL_SAVE_DIR = os.path.join(OUTPUT_DIR, self.config['model_save_dir']) if os.path.exists(MODEL_SAVE_DIR): cpt_file_names = os.listdir(MODEL_SAVE_DIR) if len(cpt_file_names) > 0: epoch_record = [] for cpt_file_name in cpt_file_names: epoch_record.append(int(cpt_file_name.split('-')[-1].split('.')[0])) epoch_record.sort() latest_epoch = epoch_record[-1] self.latest_epoch = latest_epoch + 1 latest_model_file_name = os.path.join(MODEL_SAVE_DIR, self.config['model_file'] % (self.config['ee_method'], latest_epoch)) if self.config['cuda']: store_dict = torch.load(latest_model_file_name, map_location=torch.device('cuda')) else: store_dict = torch.load(latest_model_file_name, map_location='cpu') self.model.load_state_dict(store_dict['model_state']) self.optimizer.load_state_dict(store_dict['optimizer_state']) print('resume train from %s' % latest_model_file_name) print('model init finish')
def __init__(self, config, x_embed):
    super().__init__()
    # pretrained_weights = "xlnet-base-cased"
    self.output_attentions = config.output_attentions
    self.model = XLNetModel.from_pretrained(
        config.pretrained_weights, output_attentions=self.output_attentions)
    self.pretrained_config = XLNetConfig.from_pretrained(config.pretrained_weights)
    self.encoder_out_size = self.model.config.d_model
    return
def make_model(args, device): if args.model == "roberta": config = RobertaConfig.from_pretrained("roberta-base") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = RobertaForSequenceClassification.from_pretrained( "roberta-base", config=config) return scl_model_Roberta(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary) if args.model == "bert": config = BertConfig.from_pretrained("bert-base-uncased") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", config=config) return scl_model_Bert(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary) if args.model == "xlnet": config = XLNetConfig.from_pretrained("xlnet-base-cased") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = XLNetForSequenceClassification.from_pretrained( "xlnet-base-cased", config=config) return scl_model_Xlnet(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary)
def get_xlnet(): ids = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='ids') att = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='att') tok_type_ids = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='tti') config = XLNetConfig.from_pretrained(Config.XLNet.config) xlnet_model = TFXLNetModel.from_pretrained(Config.XLNet.model, config=config) x = xlnet_model(ids, attention_mask=att, token_type_ids=tok_type_ids) x1 = keras.layers.Dropout(0.15)(x[0]) x1 = keras.layers.Conv1D(768, 2, padding='same')(x1) x1 = keras.layers.LeakyReLU()(x1) x1 = keras.layers.LayerNormalization()(x1) x1 = keras.layers.Conv1D(64, 2, padding='same')(x1) x1 = keras.layers.LeakyReLU()(x1) x1 = keras.layers.LayerNormalization()(x1) x1 = keras.layers.Conv1D(32, 2, padding='same')(x1) x1 = keras.layers.Conv1D(1, 1)(x1) x1 = keras.layers.Flatten()(x1) x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1) x2 = keras.layers.Dropout(0.15)(x[0]) x2 = keras.layers.Conv1D(768, 2, padding='same')(x2) x2 = keras.layers.LeakyReLU()(x2) x2 = keras.layers.LayerNormalization()(x2) x2 = keras.layers.Conv1D(64, 2, padding='same')(x2) x2 = keras.layers.LeakyReLU()(x2) x2 = keras.layers.LayerNormalization()(x2) x2 = keras.layers.Conv1D(32, 2, padding='same')(x2) x2 = keras.layers.Conv1D(1, 1)(x2) x2 = keras.layers.Flatten()(x2) x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2) model = keras.models.Model(inputs=[ids, att, tok_type_ids], outputs=[x1, x2]) optimizer = keras.optimizers.Adam(learning_rate=6e-5) if Config.Train.use_amp: optimizer = keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, 'dynamic') loss = keras.losses.CategoricalCrossentropy( label_smoothing=Config.Train.label_smoothing) model.compile(loss=loss, optimizer=optimizer) return model
def get_model(model_dir, saved_dir, n_labels, task, model_type='xlnet-base-cased'):
    config = XLNetConfig.from_pretrained(
        model_type,
        num_labels=n_labels,
        finetuning_task=task,
    )
    model = XLNetModel(config=config)
    model.load_state_dict(torch.load(saved_dir + '/pytorch_model.bin'))
    model.eval()  # set dropout and batch normalisation layers to evaluation mode
    tokeniser = XLNetTokenizer.from_pretrained(
        model_dir,
        config=config,
        from_tf=True,
    )
    return model, tokeniser
def demo5():
    from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
    import torch
    # Define the path and initialize the tokenizer
    XLN_PATH = r"D:\transformr_files\XLNetLMHeadModel"
    tokenizer = XLNetTokenizer.from_pretrained(XLN_PATH)
    # Load the configuration
    model_config = XLNetConfig.from_pretrained(XLN_PATH)
    # Set the number of classes to 3
    model_config.num_labels = 3
    # Build XLNetForSequenceClassification directly from the XLNet config (equivalent to the approach in the previous section)
    cls_model = XLNetForSequenceClassification.from_pretrained(XLN_PATH, config=model_config)
    # Switch to evaluation mode (the classifier variable is cls_model, not model)
    cls_model.eval()
    token_codes = tokenizer.encode_plus("i like you, what about you")
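# A possible continuation of demo5 that feeds the encoded sentence through the classifier
# (a sketch; the exact return type depends on the transformers version).
import torch

inputs = tokenizer.encode_plus("i like you, what about you", return_tensors="pt")
with torch.no_grad():
    outputs = cls_model(**inputs)
logits = outputs[0]                   # shape (1, 3) because num_labels was set to 3
pred = torch.argmax(logits, dim=-1)
print(pred)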
def __init__(self, config, x_embed):
    super().__init__()
    # pretrained_weights = "xlnet-base-cased"
    self.model = XLNetModel.from_pretrained(config.pretrained_weights)
    self.pretrained_config = XLNetConfig.from_pretrained(config.pretrained_weights)
    # if config.use_gpu:
    #     self.model = self.model.to(device=torch.device("cuda"))
    # if config.use_parallel:
    #     self.model = torch.nn.DataParallel(self.model)
    # self.encoder_out_size = 768
    self.encoder_out_size = self.model.config.d_model
    return
def get_bert(bert_name):
    if 'roberta' in bert_name:
        print('load roberta-base')
        model_config = RobertaConfig.from_pretrained('roberta-base')
        model_config.output_hidden_states = True
        bert = RobertaModel.from_pretrained('roberta-base', config=model_config)
    elif 'xlnet' in bert_name:
        print('load xlnet-base-cased')
        model_config = XLNetConfig.from_pretrained('xlnet-base-cased')
        model_config.output_hidden_states = True
        bert = XLNetModel.from_pretrained('xlnet-base-cased', config=model_config)
    else:
        print('load bert-base-uncased')
        model_config = BertConfig.from_pretrained('bert-base-uncased')
        model_config.output_hidden_states = True
        bert = BertModel.from_pretrained('bert-base-uncased', config=model_config)
    return bert
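# Example usage of get_bert for the XLNet branch, a sketch assuming a recent transformers
# version (older releases return the hidden states as the last element of a tuple instead
# of an attribute).
import torch
from transformers import XLNetTokenizer

bert = get_bert('xlnet-base-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
inputs = tokenizer("an example sentence", return_tensors="pt")
outputs = bert(**inputs)
# hidden_states is populated because output_hidden_states=True was set on the config
all_layers = torch.stack(outputs.hidden_states)   # (n_layers + 1, batch, seq_len, d_model)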
def get_bert_config(bert_model_type, output_hidden_states=False): if bert_model_type in [ 'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased', 'bert-large-uncased', 'tune_bert-base-uncased_nsp', 'bert-large-uncased-whole-word-masking', 'bert-large-uncased-whole-word-masking-finetuned-squad' ]: bert_config = BertConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in [ 'roberta-base', 'prod-roberta-base-cased', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base' ]: bert_config = RobertaConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in ['xlnet-base-cased']: bert_config = XLNetConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in [ 'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', 'albert-xxlarge-v1' ]: bert_config = AlbertConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in ['gpt2', 'gpt2-medium']: bert_config = GPT2Config.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in ['transfo-xl']: bert_config = TransfoXLConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) elif bert_model_type in [ 'distilbert-base-uncased', 'distilbert-base-uncased-distilled-squad' ]: bert_config = DistilBertConfig.from_pretrained( BERT_CONFIG_FILE[bert_model_type]) else: raise ValueError( f'`bert_model_type` not understood: {bert_model_type}') bert_config.output_hidden_states = output_hidden_states return bert_config
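# Hypothetical usage of get_bert_config; BERT_CONFIG_FILE is assumed to map each supported
# model name to a local config path (the mapping itself is not shown here).
xlnet_config = get_bert_config('xlnet-base-cased', output_hidden_states=True)
print(xlnet_config.d_model, xlnet_config.n_layer, xlnet_config.output_hidden_states)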
def __init__(self, model_name='xlnet'):
    self._model_name = model_name
    self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # default parameters
    self._max_seq_length = 384
    self._doc_stride = 128
    self._max_query_length = 64
    MODEL_CLASSES = {
        "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)
    }
    # self._config, self._model_class, self._tokenizer = MODEL_CLASSES[model_name]
    self._tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
    self._config = XLNetConfig.from_pretrained('xlnet-base-cased', do_lower_case=True)
    # this class wraps the settings required for transfer learning
    self._model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', config=self._config)
def __init__(self, model_name, drop_prob=dropout_prob):
    super(transformer_model, self).__init__()
    # Instantiate an XLNet model according to the specified arguments, defining the model architecture
    configuration = XLNetConfig.from_pretrained(model_name, output_hidden_states=True)
    self.xlnet = XLNetModel.from_pretrained(model_name, config=configuration)
    # Freeze the first `freeze_layers` children of the model if requested
    if to_freeze:
        cnt = 0
        for child in self.xlnet.children():  # iterate over this module's XLNet encoder
            cnt = cnt + 1
            if cnt <= freeze_layers:
                for param in child.parameters():
                    param.requires_grad = False
    # Classification head (xlnet_dim, hidden_dim1/2, final_size are module-level settings)
    self.fc1 = nn.Linear(xlnet_dim, hidden_dim1)
    self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
    self.fc3 = nn.Linear(hidden_dim2, final_size)
    self.dropout = nn.Dropout(p=drop_prob)
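# The class above defines its layers but not the forward pass. One possible forward follows,
# assuming mean pooling over the XLNet token states; the original pooling strategy is not
# shown and may differ.
import torch.nn.functional as F

def forward(self, input_ids, attention_mask=None):
    # Hypothetical forward for transformer_model; mean pooling is an assumption.
    outputs = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
    pooled = outputs[0].mean(dim=1)                 # (batch, xlnet_dim)
    x = self.dropout(F.relu(self.fc1(pooled)))
    x = self.dropout(F.relu(self.fc2(x)))
    return self.fc3(x)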
def save_model(model_dir, saved_dir, n_labels, task, model_type='xlnet-base-cased'): """ save checkpoints as pytorch model: model.ckpt.index model.ckpt.data-00000-of-00001 """ config = XLNetConfig.from_pretrained( model_type, num_labels=n_labels, finetuning_task=task, ) model = XLNetModel.from_pretrained( model_dir, config=config, from_tf=True, ) tokeniser = XLNetTokenizer.from_pretrained( model_dir, config=config, from_tf=True, ) model.save_pretrained(saved_dir) # print model params for param in model.state_dict(): print(param, "\t", model.state_dict()[param].size())
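# Once save_model has converted the TensorFlow checkpoint and written it with
# save_pretrained(saved_dir), the weights can be reloaded directly (a sketch; the paths
# are placeholders matching the arguments above).
from transformers import XLNetModel, XLNetTokenizer

model = XLNetModel.from_pretrained(saved_dir)          # reads saved_dir/config.json + pytorch_model.bin
tokeniser = XLNetTokenizer.from_pretrained(model_dir)  # the tokenizer files still live in model_dir
model.eval()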
# -*- coding: utf-8 -*-
"""
@Time : 2020/11/13 15:04
@Auth : xiaolu
@File : test.py
@IDE  : PyCharm
@Email: [email protected]
"""
import torch
from pdb import set_trace
from transformers import XLNetConfig, XLNetModel
from transformers import XLNetTokenizer

if __name__ == '__main__':
    tokenizer = XLNetTokenizer.from_pretrained('./xlnet_pretrain/spiece.model')
    config = XLNetConfig.from_pretrained('./xlnet_pretrain/config.json')
    model = XLNetModel.from_pretrained('./xlnet_pretrain/pytorch_model.bin', config=config)

    text = '你是我患得患失的梦' * 500  # a Chinese test sentence, repeated to stress long inputs
    text = tokenizer.tokenize(text)
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(text)).view(1, -1)
    print(input_ids.size())
    output = model(input_ids)
    set_trace()
def main(): # Set device for PyTorch if torch.cuda.is_available(): # might need to update when using more than 1 GPU rank = 0 torch.cuda.set_device(rank) device = torch.device("cuda", rank) #torch.distributed.init_process_group(backend='nccl') n_gpu = torch.cuda.device_count() else: device = torch.device("cpu") n_gpu = 0 print("N GPU: ", n_gpu) # Parse arguments parser = argparse.ArgumentParser() parser.add_argument( "--feature_save_dir", type=str, help= "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. " ) parser.add_argument("--set_type", type=str, help="Specify train/test file.") args = parser.parse_args() # Load training data feature_save_path = os.path.join( '/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir) logger.info("Loading {} dataset".format(args.set_type)) dataloader = load_featurized_examples(batch_size=32, set_type=args.set_type, feature_save_path=feature_save_path) # Load saved model config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=2292) model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config) model.to(device) model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu))) summaries = torch.empty(0, config.d_model).to(device) labels = torch.empty(0, config.num_labels).to(device) for i, batch in enumerate(dataloader): model.eval() with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids = batch input_ids = input_ids.to(device).long() input_mask = input_mask.to(device).long() segment_ids = segment_ids.to(device).long() label_ids = label_ids.to(device).float() transformer_outputs = model.module.transformer( input_ids=input_ids, token_type_ids=segment_ids, input_mask=input_mask) output = transformer_outputs[0] # extracting the CLS token summary = output[:, 0] summary = summary.to(device) summaries = torch.cat([summaries, summary], dim=0) labels = torch.cat([labels, label_ids]) if i % 1000 == 0 and i > 0: logger.info("Embedded and summarized batch {} of {}".format( i, len(dataloader))) # Save the embedded representations of the document every 50,000 batches to save memory if i % 12000 == 0 and i > 0: logger.info("Saving summaries...") torch.save( summaries, os.path.join( feature_save_path, args.set_type + '_summaries_{}.pt'.format(int(i / 12000)))) torch.save( labels, os.path.join( feature_save_path, args.set_type + '_label_ids_{}.pt'.format(int(i / 12000)))) summaries = torch.empty(0, config.d_model).to(device) labels = torch.empty(0, config.num_labels).to(device) # Save any remaining embedded representations if i % 12000 != 0: logger.info("Saving summaries...") torch.save( summaries, os.path.join( feature_save_path, args.set_type + '_summaries_{}.pt'.format(int(math.ceil(i / 12000))))) torch.save( labels, os.path.join( feature_save_path, args.set_type + '_label_ids_{}.pt'.format(int(math.ceil(i / 12000))))) return
def __init__(self):
    super(XlnetModelTest, self).__init__()
    config = XLNetConfig.from_pretrained('Saier/models/config.json')
    self.xlnet = XLNetForSequenceClassification(config)  # /bert_pretrain/
    self.device = torch.device("cuda")
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = XLNetConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = XLNetTokenizerFast.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = XLNetForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names question_column_name = "question" if "question" in column_names else column_names[0] context_column_name = "context" if "context" in column_names else column_names[1] answer_column_name = "answers" if "answers" in column_names else column_names[2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). 
tokenized_examples["p_mask"].append( [ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ] ) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append(token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append(token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if training_args.do_train: train_dataset = datasets["train"].map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 
special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append( [ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ] ) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append(examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: validation_dataset = datasets["validation"].map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer) # Post-processing: def post_processing_function(examples, features, predictions): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=training_args.output_dir, is_world_process_zero=trainer.is_world_process_zero(), ) # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [ {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]} for k, v in predictions.items() ] else: formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]] return EvalPrediction(predictions=formatted_predictions, label_ids=references) # TODO: Once the fix lands in a Datasets release, remove the _local here and the squad_v2_local folder. 
current_dir = os.path.sep.join(os.path.join(__file__).split(os.path.sep)[:-1]) metric = load_metric(os.path.join(current_dir, "squad_v2_local") if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=validation_dataset if training_args.do_eval else None, eval_examples=datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file if args.test_file is not None: data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained(args.model_name_or_path) tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path) model = XLNetForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. 
column_names = raw_datasets["train"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). 
tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select(range(args.max_train_samples)) # Create train feature from dataset with accelerator.main_process_first(): train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select(range(args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. 
tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. 
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(args.max_eval_samples)) # Validation Feature Creation with accelerator.main_process_first(): eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(args.max_eval_samples)) if args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(args.max_predict_samples)) # Predict Feature Creation with accelerator.main_process_first(): predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(args.max_predict_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataset_for_model = eval_dataset.remove_columns( ["example_id", "offset_mapping"]) eval_dataloader = DataLoader(eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) if args.do_predict: predict_dataset_for_model = predict_dataset.remove_columns( ["example_id", "offset_mapping"]) predict_dataloader = DataLoader( predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. 
predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=args.version_2_with_negative, n_best_size=args.n_best_size, max_answer_length=args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. if args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric( "squad_v2" if args.version_2_with_negative else "squad") def create_and_fill_np_array(start_or_end_logits, dataset, max_len): """ Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor Args: start_or_end_logits(:obj:`tensor`): This is the output predictions of the model. We can only enter either start or end logits. eval_dataset: Evaluation dataset max_len(:obj:`int`): The maximum length of the output tensor. ( See the model.eval() part for more details ) """ step = 0 # create a numpy array and fill it with -100. logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float32) # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather for i, output_logit in enumerate( start_or_end_logits): # populate columns # We have to fill it such that we have to take the whole tensor and replace it on the newly created array # And after every iteration we have to change the step batch_size = output_logit.shape[0] cols = output_logit.shape[1] if step + batch_size < len(dataset): logits_concat[step:step + batch_size, :cols] = output_logit else: logits_concat[step:, :cols] = output_logit[:len(dataset) - step] step += batch_size return logits_concat # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Train! 
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f"  Num examples = {len(train_dataset)}") logger.info(f"  Num Epochs = {args.num_train_epochs}") logger.info( f"  Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f"  Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) # initialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, eval_dataset, max_len) start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, eval_dataset, max_len) end_top_index_concat =
create_and_fill_np_array(all_end_top_index, eval_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Evaluation metrics: {eval_metric}") if args.do_predict: # initialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(predict_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes( cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append( accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, predict_dataset, max_len) start_top_index_concat = create_and_fill_np_array( all_start_top_index, predict_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, predict_dataset, max_len) end_top_index_concat = create_and_fill_np_array( all_end_top_index, predict_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
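A minimal, self-contained sketch of what create_and_fill_np_array does in the script above: variable-width logit batches gathered from the dataloaders are copied into one -100-padded array. The shapes and values below are invented for illustration only.
import numpy as np

def _demo_fill(batches, n_rows, max_len):
    # Pre-fill with -100 so positions beyond each batch's width stay masked.
    out = np.full((n_rows, max_len), -100, dtype=np.float32)
    step = 0
    for logits in batches:  # each batch: (batch_size, cols) with cols <= max_len
        bsz, cols = logits.shape
        out[step:step + bsz, :cols] = logits[: n_rows - step]
        step += bsz
    return out

# Two gathered batches with different widths land in a single (3, 5) array.
demo = _demo_fill([np.ones((2, 4)), np.ones((1, 5))], n_rows=3, max_len=5)
assert demo.shape == (3, 5) and demo[0, 4] == -100.0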
'batch_size': 64, 'tenacity': 5, 'epoch_size': 4 } # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--model', default='xlnet-base-cased', help='model name or path') args = parser.parse_args() config = XLNetConfig.from_pretrained(args.model) model = XLNetModel.from_pretrained(args.model, config=config) tokenizer = XLNetTokenizer.from_pretrained(args.model) params_senteval['model'] = model.cuda().eval() params_senteval['tokenizer'] = tokenizer se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = [ 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI' ] results = se.eval(transfer_tasks)
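The SentEval engine above is constructed with batcher and prepare callbacks that are not shown in this snippet. A plausible batcher, mean-pooling XLNet hidden states into sentence embeddings, might look like the hypothetical sketch below (function bodies are assumptions, not the original implementation).
import numpy as np
import torch

def batcher(params, batch):
    # SentEval passes a list of tokenized sentences; re-join them into strings.
    sentences = [" ".join(tokens) if tokens else "." for tokens in batch]
    tokenizer, model = params["tokenizer"], params["model"]
    device = next(model.parameters()).device
    embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt").to(device)
            hidden = model(**inputs).last_hidden_state  # (1, seq_len, hidden_size)
            embeddings.append(hidden.mean(dim=1).squeeze(0).cpu().numpy())
    return np.vstack(embeddings)

def prepare(params, samples):
    # Nothing to precompute for a pretrained encoder.
    return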
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa_beam_search", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub.
raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = XLNetTokenizerFast.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = XLNetForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets. # Preprocessing is slightly different for training and evaluation. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: column_names = raw_datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lot of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride.
This results # in one example possibly giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non-special tokens and context get 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: # Select samples from the dataset; this will help to decrease processing time max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Create Training Features with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Select samples from dataset again since Feature Creation might increase the number of features max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possibly giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids.
cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non-special tokens and context get 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if data_args.max_eval_samples is not None: # Selecting Eval Samples from Dataset max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) eval_examples = eval_examples.select(range(max_eval_samples)) # Create Features from Eval Dataset with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase the sample size max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if data_args.max_predict_samples is not None: # We will select samples from the whole data predict_examples = predict_examples.select( range(data_args.max_predict_samples)) # Test Feature Creation with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator.
data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=training_args.output_dir, log_level=log_level, prefix=stage, ) # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load( "squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "question-answering" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if 
data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
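For reference, the post_processing_function above produces predictions and references in the format the squad / squad_v2 metric expects. The id, answer text, and character offset below are made-up values, shown only to illustrate the shape of the data.
# Hypothetical example inputs for metric.compute(...) with version_2_with_negative=True.
example_predictions = [{
    "id": "example-0",                       # invented id
    "prediction_text": "Denver Broncos",     # invented answer text
    "no_answer_probability": 0.02,
}]
example_references = [{
    "id": "example-0",
    "answers": {"text": ["Denver Broncos"], "answer_start": [177]},
}]
# metric.compute(predictions=example_predictions, references=example_references)
# would then return exact-match / F1 style scores for this single example.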
def main(): # Set device for PyTorch if torch.cuda.is_available(): # might need to update when using more than 1 GPU rank = 0 torch.cuda.set_device(rank) device = torch.device("cuda", rank) #torch.distributed.init_process_group(backend='nccl') n_gpu = torch.cuda.device_count() else: device = torch.device("cpu") n_gpu = 0 print("N GPU: ", n_gpu) # Parse arguments parser = argparse.ArgumentParser() parser.add_argument( "--model_id", type=str, help= "Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. " ) parser.add_argument( "--checkpoint", type=str, help= "Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. " ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( "--feature_save_dir", type=str, help= "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. " ) parser.add_argument("--set_type", type=str, help="Specify train/test file.") args = parser.parse_args() # Load the dataset split specified by --set_type feature_save_path = os.path.join( '/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir) logger.info("Loading test dataset") test_dataloader = load_featurized_examples( batch_size=32, set_type=args.set_type, feature_save_path=feature_save_path) # Load saved model model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/', args.model_id, 'model_checkpoint_' + args.checkpoint) logger.info("Loading saved model from {}".format(model_path)) config = XLNetConfig.from_pretrained( os.path.join(model_path, 'config.json'), num_labels=2292) # TODO: check if we need this model = XLNetForSequenceClassification.from_pretrained(model_path, config=config) model.to(device) model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu))) summaries = torch.empty(0, config.d_model).to(device) all_doc_ids = torch.empty(0).to(device) all_label_ids = torch.empty(0, 2292).to(device) for i, batch in enumerate(test_dataloader): model.eval() with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, doc_ids = batch input_ids = input_ids.to(device).long() input_mask = input_mask.to(device).long() segment_ids = segment_ids.to(device).long() doc_ids = doc_ids.to(device).float() label_ids = label_ids.to(device).float() transformer_outputs = model.module.transformer( input_ids=input_ids, token_type_ids=segment_ids, input_mask=input_mask) output = transformer_outputs[0] # extracting the CLS token summary = output[:, 0] summary = summary.to(device) summaries = torch.cat([summaries, summary], dim=0) all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0) all_label_ids = torch.cat([all_label_ids, label_ids], dim=0) # Average the representation of the CLS token for all examples from the same document mask = torch.zeros(int(all_doc_ids.max().item()) + 1, len(summaries)) mask[all_doc_ids.long(), torch.arange(len(summaries))] = 1 averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1).to(device) mean_summaries = torch.mm(averaging_matrix, summaries) # Create an object storing one copy of the labels per document last_doc_id = -1 label_ids = torch.empty(0, all_label_ids.size()[1]).to(device) for (i, doc_id) in enumerate(all_doc_ids): if doc_id.item() != last_doc_id: label_ids = torch.cat([label_ids, all_label_ids[i].unsqueeze(0)]) last_doc_id = doc_id.item() # Save the embedded representations of the
document, along with the labels torch.save( mean_summaries, os.path.join(feature_save_path, args.set_type + '_summaries.pt')) torch.save( label_ids, os.path.join(feature_save_path, args.set_type + '_doc_label_ids.pt') ) # label_ids.pt has one record per window (and thus multiple records per document) return
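A small numeric sketch (invented values, not part of the script) of the per-document averaging used above: a row-normalized membership mask, multiplied by the stacked window summaries, yields one mean vector per document id.
import torch

window_summaries = torch.tensor([[1.0, 1.0], [3.0, 3.0], [5.0, 7.0]])  # 3 windows, hidden size 2
window_doc_ids = torch.tensor([0, 0, 1])                               # first two windows share doc 0
membership = torch.zeros(int(window_doc_ids.max()) + 1, len(window_summaries))
membership[window_doc_ids.long(), torch.arange(len(window_summaries))] = 1
doc_means = torch.nn.functional.normalize(membership, p=1, dim=1) @ window_summaries
assert torch.allclose(doc_means, torch.tensor([[2.0, 2.0], [5.0, 7.0]]))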
def train(): # Load the pretrained XLNet model config = XLNetConfig.from_pretrained('xlnet_config.json') model = XLNetForQuestionAnswering.from_pretrained('xlnet_model.ckpt.index', from_tf=True, config=config) device = args.device model.to(device) # Prepare the optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = adabound.AdaBound(optimizer_grouped_parameters, lr=1e-3, final_lr=0.1) # Prepare the data data = Dureader() train_dataloader, dev_dataloader = data.train_iter, data.dev_iter best_loss = 100000.0 model.train() for i in range(args.num_train_epochs): for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")): input_ids, input_mask, segment_ids, start_positions, end_positions = \ batch.input_ids, batch.input_mask, batch.segment_ids, batch.start_position, batch.end_position input_ids, input_mask, segment_ids, start_positions, end_positions = \ input_ids.to(device), input_mask.to(device), segment_ids.to(device), start_positions.to(device), end_positions.to(device) # Compute the loss outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] loss = loss / args.gradient_accumulation_steps loss.backward() # Update the parameters once gradients have accumulated if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() # Validate if step % args.log_step == 4: eval_loss = evaluate.evaluate(model, dev_dataloader) if eval_loss < best_loss: best_loss = eval_loss torch.save(model.state_dict(), './model_dir/' + "best_model") model.train()
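A self-contained check (toy tensors, not part of the script above) of why the loss is divided by gradient_accumulation_steps: scaled micro-batch losses accumulated with backward() reproduce the gradient of a single full-batch step.
import torch

torch.manual_seed(0)
w = torch.randn(3, requires_grad=True)
data = torch.randn(4, 3)

# One full-batch step.
(data @ w).pow(2).mean().backward()
full_grad = w.grad.clone()
w.grad = None

# Two micro-batches of size 2 with the loss divided by the accumulation count (2).
for micro in data.split(2):
    ((micro @ w).pow(2).mean() / 2).backward()

torch.testing.assert_close(w.grad, full_grad)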
def main(): # Section: Set device for PyTorch if torch.cuda.is_available(): # might need to update when using more than 1 GPU rank = 0 torch.cuda.set_device(rank) device = torch.device("cuda", rank) #torch.distributed.init_process_group(backend='nccl') n_gpu = torch.cuda.device_count() else: device = torch.device("cpu") n_gpu = 0 print("N GPU: ", n_gpu) # Parse arguments parser = argparse.ArgumentParser() parser.add_argument("--batch_size", default=32, type=int, help="Indicate batch size") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--val_logging_step", default=100000, type=int, help="Number of steps in between logs of performance on validation set" ) parser.add_argument( "--train_logging_step", default=1000, type=int, help="Number of steps in between logs of performance on training set") parser.add_argument("--save_step", default=100000, type=int, help="Number of steps in between saves of model parameters") parser.add_argument( "--model_id", type=str, help= "Model and optimizer will be saved at '/gpfs/data/razavianlab/capstone19/models/model_id'. " ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( "--feature_save_dir", type=str, help= "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/feature_save_dir'. " ) parser.add_argument( "--model_type", default="base", type=str, help="Whether to use the xlnet base model or the xlnet large model") parser.add_argument("--learning_rate", default=4e-5, type=float, help="Learning rate for optimizer") args = parser.parse_args() # Set random seed set_seeds(seed=args.seed, n_gpu=n_gpu) # Load data feature_save_path = os.path.join( '/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir) logger.info("Loading train dataset") train_dataloader = load_featurized_examples( args.batch_size, set_type="train", feature_save_path=feature_save_path) logger.info("Loading validation dataset") val_dataloader = load_featurized_examples( args.batch_size, set_type="val", feature_save_path=feature_save_path) # Load pretrained model num_train_optimization_steps = args.num_train_epochs * len( train_dataloader) if args.model_type == "large": config = XLNetConfig.from_pretrained('xlnet-large-cased', num_labels=2292) model = XLNetForSequenceClassification.from_pretrained( 'xlnet-large-cased', config=config) else: config = XLNetConfig.from_pretrained( 'xlnet-base-cased', num_labels=2292) # TODO: check if we need this model = XLNetForSequenceClassification.from_pretrained( 'xlnet-base-cased', config=config) model.to(device) optimizer, scheduler, model = initialize_optimizer(model, train_dataloader, args) logger.info("***** Running training *****") logger.info(" Num batches = %d", len(train_dataloader)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total train batch size = %d", args.batch_size) logger.info(" Total optimization steps = %d", len(train_dataloader) * args.num_train_epochs) model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu))) train(train_dataloader=train_dataloader, val_dataloader=val_dataloader, model=model, optimizer=optimizer, scheduler=scheduler, num_train_epochs=args.num_train_epochs, n_gpu=n_gpu, device=device, model_id=args.model_id, save_step=args.save_step,
train_logging_step=args.train_logging_step, val_logging_step=args.val_logging_step)
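The set_seeds helper called above is not defined in this snippet; a plausible implementation, shown only as a hypothetical sketch, seeds Python, NumPy, and PyTorch (including all GPUs when available).
import random
import numpy as np
import torch

def set_seeds(seed, n_gpu):
    # Hypothetical reconstruction of the helper used above.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)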